liger-kernel-nightly 0.6.1.dev20250730201330__py3-none-any.whl → 0.6.1.dev20250805235740__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -25,6 +25,7 @@ def fused_linear_cross_entropy_forward(
25
25
  reduction="mean",
26
26
  softcap=None,
27
27
  return_z_loss=False,
28
+ accum_dtype=None,
28
29
  ):
29
30
  assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
30
31
  device = _input.device
@@ -44,10 +45,16 @@ def fused_linear_cross_entropy_forward(
44
45
  chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
45
46
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size
46
47
 
47
- grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
48
48
  grad_input = torch.zeros_like(_input, device=device)
49
- grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
50
- # we use fp32 for loss accumulator
49
+
50
+ # the loss accumulator is fp32; weight/bias gradient accumulators use `accum_dtype` when given, otherwise the parameter's own dtype
51
+ if accum_dtype is None:
52
+ grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
53
+ grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
54
+ else:
55
+ grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
56
+ grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
57
+
51
58
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
52
59
  z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
53
60
 
@@ -124,16 +131,7 @@ def fused_linear_cross_entropy_forward(
124
131
  grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
125
132
 
126
133
  if grad_weight is not None:
127
- torch.addmm(
128
- input=grad_weight,
129
- mat1=logits_chunk.t().to(
130
- _input_chunk.dtype
131
- ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
132
- mat2=_input_chunk,
133
- out=grad_weight,
134
- alpha=1.0,
135
- beta=1.0,
136
- )
134
+ grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
137
135
 
138
136
  if bias is not None:
139
137
  torch.add(
@@ -151,6 +149,11 @@ def fused_linear_cross_entropy_forward(
151
149
  else:
152
150
  loss = torch.sum(loss_1d)
153
151
  z_loss = torch.sum(z_loss_1d) if return_z_loss else None
152
+
153
+ # Cast back to original dtype
154
+ grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
155
+ grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
156
+
154
157
  return loss, z_loss, grad_input, grad_weight, grad_bias
155
158
 
156
159
 
@@ -217,6 +220,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
217
220
  reduction="mean",
218
221
  softcap=None,
219
222
  return_z_loss: bool = False,
223
+ accum_dtype=None,
220
224
  ):
221
225
  """
222
226
  Fusing the last linear layer with cross-entropy loss
@@ -235,6 +239,8 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
235
239
  ignore_index: the index to ignore in the target
236
240
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
237
241
  reduction: reduction to apply
242
+ accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
243
+ It is recommended to set `accum_dtype` to a higher precision, e.g. `torch.float32`, if training is unstable in the original dtype. Default: `None`, which performs the accumulation in the original dtype.
238
244
  """
239
245
 
240
246
  loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
@@ -249,6 +255,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
249
255
  reduction=reduction,
250
256
  softcap=softcap,
251
257
  return_z_loss=return_z_loss,
258
+ accum_dtype=accum_dtype,
252
259
  )
253
260
  # downcast to dtype and store for backward
254
261
  ctx.save_for_backward(
@@ -280,4 +287,5 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
280
287
  None,
281
288
  None,
282
289
  None,
290
+ None,
283
291
  )
@@ -64,6 +64,7 @@ def liger_fused_linear_cross_entropy(
64
64
  reduction: str = "mean",
65
65
  softcap: Optional[float] = None,
66
66
  return_z_loss: bool = False,
67
+ accum_dtype=None,
67
68
  ):
68
69
  loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
69
70
  input,
@@ -77,6 +78,7 @@ def liger_fused_linear_cross_entropy(
77
78
  reduction,
78
79
  softcap,
79
80
  return_z_loss,
81
+ accum_dtype,
80
82
  )
81
83
  if not return_z_loss:
82
84
  return loss
@@ -15,6 +15,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
15
15
  reduction: str = "mean",
16
16
  softcap: Optional[float] = None,
17
17
  return_z_loss: bool = False,
18
+ accum_dtype: Optional[torch.dtype] = None,
18
19
  ):
19
20
  super().__init__()
20
21
  assert (label_smoothing >= 0) and (label_smoothing <= 1), (
@@ -32,6 +33,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
32
33
  self.reduction = reduction
33
34
  self.softcap = softcap
34
35
  self.return_z_loss = return_z_loss
36
+ self.accum_dtype = accum_dtype
35
37
 
36
38
  def forward(self, lin_weight, _input, target, bias=None):
37
39
  loss, z_loss = LigerFusedLinearCrossEntropyFunction.apply(
@@ -46,6 +48,7 @@ class LigerFusedLinearCrossEntropyLoss(torch.nn.Module):
46
48
  self.reduction,
47
49
  self.softcap,
48
50
  self.return_z_loss,
51
+ self.accum_dtype,
49
52
  )
50
53
  if not self.return_z_loss:
51
54
  return loss
@@ -180,20 +180,6 @@ def lce_forward(
180
180
  'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
181
181
  ```"""
182
182
 
183
- from transformers.models.phi3.modeling_phi3 import logging
184
-
185
- logger = logging.get_logger(__name__)
186
-
187
- if (
188
- use_cache
189
- and self.config.rope_scaling
190
- and cache_position is not None
191
- and cache_position[0] == self.config.original_max_position_embeddings
192
- ):
193
- logger.warning(
194
- f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed."
195
- )
196
-
197
183
  output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
198
184
  output_hidden_states = (
199
185
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.1.dev20250730201330
3
+ Version: 0.6.1.dev20250805235740
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -20,7 +20,7 @@ liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,
20
20
  liger_kernel/ops/cross_entropy.py,sha256=e8THGnhOcy_0SbOLABx67HEM7-B8a8pG7nDKbCRpQKM,19123
21
21
  liger_kernel/ops/dyt.py,sha256=gCLz4S8aul8SY9nvIGaoK67aGb7U9MJRQdo3ONqmQYs,5417
22
22
  liger_kernel/ops/fused_add_rms_norm.py,sha256=UBqmlqFCmhSAIpkNKd8rrfXatX7Z4J9bp2dX9A0lrJQ,14017
23
- liger_kernel/ops/fused_linear_cross_entropy.py,sha256=5fbGhN85n3zf0uIdJ7PYHWIRzTf0VTFiS0ARtOmqIP0,11020
23
+ liger_kernel/ops/fused_linear_cross_entropy.py,sha256=YFPXUOIZpM_4r7AlfjkwOgDhAE_0H2mFjdKtx8cv-T4,11594
24
24
  liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
25
25
  liger_kernel/ops/fused_neighborhood_attention.py,sha256=vPi5xbnh6wxyZehaqo6Tuilqo2fN5SGDiONjnNmIKqs,35556
26
26
  liger_kernel/ops/geglu.py,sha256=r0WSq9E93zzynL44Wh8femzOWK07_SseBM_pJUyxT3s,4144
@@ -45,9 +45,9 @@ liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawX
45
45
  liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
46
46
  liger_kernel/transformers/dyt.py,sha256=i-4GPaMrl-jab9TVI5qN0-H9qycn_mCbV82ozU4nbmU,723
47
47
  liger_kernel/transformers/fsdp.py,sha256=CUiyjTmjkjY7pLXQv8ly9rnzgXw6529csd9pvtJNMYc,3096
48
- liger_kernel/transformers/functional.py,sha256=PXnACWD7kzgge50RdOUuvtmOTS7DVkkrL7mm0cX5bOc,7734
48
+ liger_kernel/transformers/functional.py,sha256=XkYk_zb8xsRMtZtouYmlX_Tyyr-QA3WigSPF36DECYk,7777
49
49
  liger_kernel/transformers/fused_add_rms_norm.py,sha256=7_Bzg-x6lLe6W1qG2DtjDALhEpNZlC6N5GppEs9cTYY,1199
50
- liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=O8Sg5BT81nTaY9fSGoOY9dOD9ekibwwiuXhdUHaxntQ,1742
50
+ liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=_5AaQT2mcUEO2T7JGJYQafz6A1Efn9d3-Z3xFO_Xe0o,1862
51
51
  liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
52
52
  liger_kernel/transformers/fused_neighborhood_attention.py,sha256=TxYDUAt9B6WSP14aJP66C_2Mbds2sSIPGnamhUSTrC8,7957
53
53
  liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
@@ -81,7 +81,7 @@ liger_kernel/transformers/model/mixtral.py,sha256=VY-y73IyjcCyWyI7ahxXLw0fJrhgjY
81
81
  liger_kernel/transformers/model/mllama.py,sha256=my29NXk-p6ckQaP8qDIN8e318yI_9mQZHt38MV3SqLY,11280
82
82
  liger_kernel/transformers/model/olmo2.py,sha256=6L_bo-ZUgO1lYppdJneOtYxNIylQKS6BiGp13g7Uq9E,5259
83
83
  liger_kernel/transformers/model/paligemma.py,sha256=xuIx3oOwTgftU3jqLfWOxUxgCLBNJh0yNC21an9qDjo,18773
84
- liger_kernel/transformers/model/phi3.py,sha256=zAzBVNOA16B16yy2HWsEgOMHhLoYkpWOWPgBT4z95WI,10655
84
+ liger_kernel/transformers/model/phi3.py,sha256=aOl1Pz2rp5jSahRKUHKFPgkdkgG28fnHPpOW2ZVnMPg,10124
85
85
  liger_kernel/transformers/model/qwen2.py,sha256=3fpOTEOkniQmkCfN1KUa3KhseHJVzhj2Ht9FdYPUy-E,9962
86
86
  liger_kernel/transformers/model/qwen2_5_vl.py,sha256=zEVVwotCXnAm3RRc8-1Nc8uitSWrwW4B9dYY2uOZDwg,6331
87
87
  liger_kernel/transformers/model/qwen2_vl.py,sha256=5vK-vtCDpKZ2w33xYp2BS8kQYWUbKMqaiKvQcI27Mss,5884
@@ -92,9 +92,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
92
92
  liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
93
93
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
94
94
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
95
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
96
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/METADATA,sha256=hsqE3iGoX7WtGGruvTTrjB4G4sfkTi9UYThz_vOdwos,24502
97
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
98
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
99
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
100
- liger_kernel_nightly-0.6.1.dev20250730201330.dist-info/RECORD,,
95
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
96
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/METADATA,sha256=UCCH0NvB4SAmW8dFreS-BH1CVfCygPz7s-ZPqEaPZ3s,24502
97
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
98
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
99
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
100
+ liger_kernel_nightly-0.6.1.dev20250805235740.dist-info/RECORD,,