liger-kernel-nightly 0.5.10.dev20250611191801__py3-none-any.whl → 0.6.4.dev20260112233432__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic. Click here for more details.

Files changed (107) hide show
  1. liger_kernel/chunked_loss/__init__.py +1 -0
  2. liger_kernel/chunked_loss/cosine_similarity_loss.py +142 -0
  3. liger_kernel/chunked_loss/dpo_loss.py +54 -3
  4. liger_kernel/chunked_loss/functional.py +2 -0
  5. liger_kernel/chunked_loss/fused_linear_distillation.py +23 -5
  6. liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
  7. liger_kernel/chunked_loss/grpo_loss.py +46 -9
  8. liger_kernel/chunked_loss/jsd_loss.py +44 -13
  9. liger_kernel/ops/__init__.py +141 -0
  10. liger_kernel/ops/backends/README.md +151 -0
  11. liger_kernel/ops/backends/__init__.py +13 -0
  12. liger_kernel/ops/backends/_ascend/__init__.py +5 -0
  13. liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +485 -0
  14. liger_kernel/ops/backends/_ascend/ops/__init__.py +49 -0
  15. liger_kernel/ops/backends/_ascend/ops/geglu.py +266 -0
  16. liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +285 -0
  17. liger_kernel/ops/backends/_ascend/ops/rope.py +290 -0
  18. liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
  19. liger_kernel/ops/backends/_ascend/ops/tvd.py +221 -0
  20. liger_kernel/ops/backends/_ascend/ub_manager.py +349 -0
  21. liger_kernel/ops/backends/registry.py +61 -0
  22. liger_kernel/ops/cross_entropy.py +130 -64
  23. liger_kernel/ops/dyt.py +5 -4
  24. liger_kernel/ops/fused_add_rms_norm.py +416 -0
  25. liger_kernel/ops/fused_linear_cross_entropy.py +115 -22
  26. liger_kernel/ops/geglu.py +6 -4
  27. liger_kernel/ops/group_norm.py +7 -7
  28. liger_kernel/ops/grpo_loss.py +3 -1
  29. liger_kernel/ops/kl_div.py +8 -11
  30. liger_kernel/ops/layer_norm.py +135 -80
  31. liger_kernel/ops/llama4_rope.py +225 -0
  32. liger_kernel/ops/poly_norm.py +390 -0
  33. liger_kernel/ops/rms_norm.py +148 -71
  34. liger_kernel/ops/rope.py +1 -1
  35. liger_kernel/ops/swiglu.py +1 -1
  36. liger_kernel/ops/tiled_mlp.py +136 -0
  37. liger_kernel/ops/utils.py +14 -0
  38. liger_kernel/transformers/__init__.py +65 -0
  39. liger_kernel/transformers/auto_model.py +21 -0
  40. liger_kernel/transformers/cross_entropy.py +9 -4
  41. liger_kernel/transformers/dyt.py +1 -1
  42. liger_kernel/transformers/experimental/__init__.py +5 -0
  43. liger_kernel/transformers/experimental/embedding.py +1 -1
  44. liger_kernel/transformers/functional.py +56 -24
  45. liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  46. liger_kernel/transformers/fused_linear_cross_entropy.py +17 -5
  47. liger_kernel/transformers/fused_linear_jsd.py +1 -1
  48. liger_kernel/transformers/fused_neighborhood_attention.py +1 -1
  49. liger_kernel/transformers/geglu.py +1 -1
  50. liger_kernel/transformers/group_norm.py +1 -1
  51. liger_kernel/transformers/grpo_loss.py +57 -2
  52. liger_kernel/transformers/jsd.py +1 -1
  53. liger_kernel/transformers/kl_div.py +1 -1
  54. liger_kernel/transformers/layer_norm.py +1 -1
  55. liger_kernel/transformers/llama4_rope.py +93 -0
  56. liger_kernel/transformers/model/exaone4.py +136 -0
  57. liger_kernel/transformers/model/falcon_h1.py +122 -0
  58. liger_kernel/transformers/model/gemma.py +28 -8
  59. liger_kernel/transformers/model/gemma2.py +34 -11
  60. liger_kernel/transformers/model/gemma3.py +102 -112
  61. liger_kernel/transformers/model/glm4.py +18 -5
  62. liger_kernel/transformers/model/glm4v.py +163 -0
  63. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  64. liger_kernel/transformers/model/gpt_oss.py +211 -0
  65. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  66. liger_kernel/transformers/model/internvl.py +157 -0
  67. liger_kernel/transformers/model/llama.py +26 -7
  68. liger_kernel/transformers/model/llama4.py +121 -0
  69. liger_kernel/transformers/model/llava.py +18 -6
  70. liger_kernel/transformers/model/loss_utils.py +34 -3
  71. liger_kernel/transformers/model/mistral.py +17 -10
  72. liger_kernel/transformers/model/mixtral.py +24 -9
  73. liger_kernel/transformers/model/mllama.py +18 -7
  74. liger_kernel/transformers/model/olmo2.py +18 -5
  75. liger_kernel/transformers/model/olmo3.py +142 -0
  76. liger_kernel/transformers/model/output_classes.py +147 -0
  77. liger_kernel/transformers/model/paligemma.py +42 -5
  78. liger_kernel/transformers/model/phi3.py +24 -159
  79. liger_kernel/transformers/model/qwen2.py +26 -4
  80. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  81. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  82. liger_kernel/transformers/model/qwen3.py +22 -6
  83. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  84. liger_kernel/transformers/model/qwen3_next.py +146 -0
  85. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  86. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  87. liger_kernel/transformers/model/smollm3.py +199 -0
  88. liger_kernel/transformers/model/smolvlm.py +158 -0
  89. liger_kernel/transformers/monkey_patch.py +1423 -100
  90. liger_kernel/transformers/multi_token_attention.py +2 -2
  91. liger_kernel/transformers/poly_norm.py +42 -0
  92. liger_kernel/transformers/qwen2vl_mrope.py +1 -1
  93. liger_kernel/transformers/rms_norm.py +15 -5
  94. liger_kernel/transformers/rope.py +45 -1
  95. liger_kernel/transformers/softmax.py +1 -1
  96. liger_kernel/transformers/sparsemax.py +1 -1
  97. liger_kernel/transformers/swiglu.py +18 -1
  98. liger_kernel/transformers/tiled_mlp.py +125 -0
  99. liger_kernel/transformers/tvd.py +1 -1
  100. liger_kernel/utils.py +52 -0
  101. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/METADATA +37 -25
  102. liger_kernel_nightly-0.6.4.dev20260112233432.dist-info/RECORD +132 -0
  103. liger_kernel_nightly-0.5.10.dev20250611191801.dist-info/RECORD +0 -95
  104. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/LICENSE +0 -0
  105. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/NOTICE +0 -0
  106. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/WHEEL +0 -0
  107. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,416 @@
1
+ import math
2
+ import operator
3
+
4
+ import torch
5
+ import triton
6
+ import triton.language as tl
7
+
8
+ from liger_kernel.ops.utils import calculate_settings
9
+ from liger_kernel.ops.utils import compare_version
10
+ from liger_kernel.ops.utils import ensure_contiguous
11
+ from liger_kernel.ops.utils import torch_to_triton_dtype
12
+ from liger_kernel.utils import get_npu_multi_processor_count
13
+ from liger_kernel.utils import is_npu_available
14
+
15
# Select the `rsqrt` implementation to use inside the kernels below.
# When Triton is >= 3.0.0 and no NPU is available, `rsqrt` is taken from the
# libdevice namespace; in every other case it is taken from `triton.language.math`.
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
    try:
        # typical import path with dispatch available
        from triton.language.extra.libdevice import rsqrt
    except ModuleNotFoundError:
        # for working with NGC containers
        from triton.language.extra.cuda.libdevice import rsqrt
else:
    from triton.language.math import rsqrt


# Integer codes for the supported casting modes, wrapped in `tl.constexpr`
# so the kernels can compare their `casting_mode: tl.constexpr` argument against them.
_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)  # no casting
_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)  # Llama-style casting
_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)  # Gemma-style casting
29
+
30
+
31
@triton.jit
def _fused_add_rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    S_ptr,  # output residual
    S_row_stride,
    X_ptr,
    X_row_stride,
    R_ptr,  # input residual
    R_row_stride,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    offset,
    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
    BLOCK_SIZE: tl.constexpr,
):
    """
    This kernel computes the following:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    This is a commonly used pattern in the decoder layers of LLMs.
    Some examples:
    1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271
    2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393

    This kernel is inspired by the rms_norm forward kernel, and is adapted to support the residual addition in the forward pass.
    The backward pass is also adapted to support the residual addition in the backward pass.

    Each program handles the single row selected by `tl.program_id(0)`:
    - reads one row of X and R, writes their sum to S,
    - writes one scalar (the reciprocal RMS of that sum) to RSTD,
    - writes the normalized row scaled by `(offset + W)` to Y.
    Only the first `n_cols` of the `BLOCK_SIZE` lanes are loaded/stored (masked access).
    `W_row_stride` is accepted but not used in the body; W is indexed by column only.
    """

    # Row handled by this program; widened to int64 before being multiplied by row strides.
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Advance every row-indexed pointer to the start of this program's row.
    Y_ptr += row_idx * Y_row_stride
    S_ptr += row_idx * S_row_stride
    X_ptr += row_idx * X_row_stride
    R_ptr += row_idx * R_row_stride
    RSTD_ptr += row_idx * RSTD_row_stride

    # Masked-out lanes load 0, so they contribute nothing to the sum of squares below.
    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
    R_row = tl.load(R_ptr + col_offsets, mask=mask, other=0)
    # The residual sum is formed and stored before any casting is applied.
    S_row = X_row + R_row
    tl.store(S_ptr + col_offsets, S_row, mask=mask)
    S_row_dtype = S_row.dtype
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)

    # On Llama, only rstd is computed on fp32
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(tl.float32)

    # Gemma computes everything on fp32, and then casts back the output to the original dtype
    if casting_mode == _CASTING_MODE_GEMMA:
        W_row = W_row.to(tl.float32)
        S_row = S_row.to(tl.float32)

    # With no casting, eps and offset are brought to the dtype of the summed row instead.
    if casting_mode == _CASTING_MODE_NONE:
        eps = eps.to(S_row_dtype)
        offset = offset.to(S_row_dtype)

    # The divisor is n_cols (the real row width), not BLOCK_SIZE.
    mean_square = tl.sum(S_row * S_row, axis=0) / n_cols
    rstd = rsqrt(mean_square + eps)

    # We can save time by caching rms with minimal memory overhead
    # because rms is much smaller compared to X_row, as rms is for each row.
    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
    tl.store(RSTD_ptr, rstd)

    S_row = S_row * rstd

    # On Llama, the multiplication with the weight is done on the original dtype
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(S_row_dtype)

    Y_row = S_row * (offset + W_row)

    # Gemma casts the final result back to the dtype of the summed row.
    if casting_mode == _CASTING_MODE_GEMMA:
        Y_row = Y_row.to(S_row_dtype)

    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
116
+
117
+
118
@triton.jit
def _fused_add_rms_norm_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dS_out_ptr,
    dS_out_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    offset,
    rows_per_program: tl.constexpr,
    casting_mode: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    has_dS_out: tl.constexpr,
):
    """
    This kernel is adapted from the rms_norm backward kernel, and is adapted to support the residual
    addition in the backward pass. For the following code pattern:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    The gradients of hidden_states and residual come out to be exactly the same. The value of this
    gradient is the sum of the gradient of the hidden_states in step 3 and the gradient of the
    residual in step 2.

    The backward pass computation logic is the same as the rms_norm backward kernel, except that the
    gradient of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.

    Each program processes the rows `[row_start, row_end)` derived from `tl.program_id(0)` and
    `rows_per_program`, writes one dX row per processed row, and writes its partial weight
    gradient (accumulated in fp32) to row `row_block_id` of the buffer behind `dW_ptr`.
    `dS_out_ptr` is only dereferenced when `has_dS_out` is true. `W_row_stride` is accepted but
    not used in the body; W is indexed by column only.
    """

    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    # Clamp so the last program does not run past the final row.
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Per-program partial sum of the weight gradient, kept in fp32.
    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    dY_ptr += row_start * dY_row_stride
    dX_ptr += row_start * dX_row_stride
    if has_dS_out:
        dS_out_ptr += row_start * dS_out_row_stride

    X_ptr += row_start * X_row_stride
    # Scale by the row stride so the starting offset agrees with the per-row
    # `RSTD_ptr += RSTD_row_stride` advance at the end of the loop body
    # (previously this assumed a stride of exactly 1).
    RSTD_ptr += row_start * RSTD_row_stride

    # The weight (plus offset) is the same for every row, so load it once.
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
    W_row = W_row + offset

    for _ in range(row_start, row_end):
        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)

        # Get cached rms
        rstd_row = tl.load(RSTD_ptr)

        X_row = X_row.to(tl.float32)

        # Different backward graphs for different casting modes
        if casting_mode == _CASTING_MODE_LLAMA:
            m = (dY_row * W_row).to(tl.float32)

        elif casting_mode == _CASTING_MODE_GEMMA:
            dY_row = dY_row.to(tl.float32)
            m = dY_row * W_row
        else:
            m = dY_row * W_row

        dX_row = rstd_row * m

        # The expression shape is kept exactly as written (including where dS_out_row is added)
        # so the floating-point evaluation order is unchanged.
        if has_dS_out:
            dS_out_row = tl.load(dS_out_ptr + col_offsets, mask=mask, other=0.0)
            dX_row += (rstd_row) * (
                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
            ) + dS_out_row
            dS_out_ptr += dS_out_row_stride
        else:
            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

        # calculate the gradient of W
        if casting_mode == _CASTING_MODE_LLAMA:
            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
        else:
            # here X_row is already in fp32 (see previous if block)
            dW_row += dY_row * (X_row * rstd_row)

        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)

        # Step every row-indexed pointer to the next row.
        dY_ptr += dY_row_stride
        dX_ptr += dX_row_stride
        X_ptr += X_row_stride
        RSTD_ptr += RSTD_row_stride

    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
221
+
222
+
223
# Lookup from the user-facing casting-mode name to the plain value held by
# the corresponding `tl.constexpr` constant.
_str_to_casting_mode = dict(
    llama=_CASTING_MODE_LLAMA.value,
    gemma=_CASTING_MODE_GEMMA.value,
    none=_CASTING_MODE_NONE.value,
)
228
+
229
+
230
def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode):
    """
    Launch the fused "residual add + RMSNorm" forward kernel.

    Args:
        X: hidden states; flattened to 2-D as (-1, X.shape[-1]) before the launch.
        R: residual to add to X; flattened the same way and must then match X's 2-D shape.
        W: weight whose first dimension must equal X's last dimension.
        eps: epsilon added to the mean square before the reciprocal square root.
        offset: constant added to W inside the kernel.
        casting_mode: either one of the keys of `_str_to_casting_mode`
            ("llama", "gemma", "none") or one of its integer values.

    Returns:
        Tuple `(Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode)` where Y (normalized output)
        and S (X + R) are viewed back to X's original shape, RSTD holds one value per row,
        and `casting_mode` is the resolved integer code.

    Raises:
        AssertionError: on an unknown casting mode or incompatible R / W shapes.
    """
    if not isinstance(casting_mode, int):
        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
        casting_mode = _str_to_casting_mode[casting_mode]
    else:
        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"

    shape = X.shape
    dim = shape[-1]
    X = X.view(-1, dim)
    R = R.view(-1, dim)
    n_rows, n_cols = X.shape

    # Check constraints before allocating anything or launching the kernel.
    # The kernel indexes R with the same row/column indices as X, so a residual with a
    # different flattened shape would be read out of bounds.
    assert R.shape == X.shape, (
        f"Incompatible shapes between hidden states {tuple(X.shape)} and residual {tuple(R.shape)}"
    )
    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"

    BLOCK_SIZE, num_warps = calculate_settings(n_cols)

    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    # RSTD is to cache rstd for each row
    # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)

    # XPU-specific optimization
    kernel_args = {}
    if X.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_forward_kernel
    # One program per row.
    _fused_add_rms_norm_forward_kernel[(n_rows,)](
        Y,
        Y.stride(0),
        S,
        S.stride(0),
        X,
        X.stride(0),
        R,
        R.stride(0),
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        eps,
        offset,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        **kernel_args,  # XPU-specific optimization
    )

    return Y.view(*shape), S.view(*shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
283
+
284
+
285
def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
    """
    Launch the fused "residual add + RMSNorm" backward kernel.

    Args:
        dY: gradient w.r.t. the normalized output; flattened to (-1, dY.shape[-1]).
        dS_out: gradient w.r.t. the returned residual sum S, or None when there is none.
            When None, the kernel is launched with `has_dS_out=False` and skips that term.
        S: the residual sum saved by the forward pass (the input of the normalization).
        W: weight of the normalization.
        RSTD: per-row reciprocal RMS cached by the forward pass.
        offset: constant added to W inside the kernel.
        casting_mode: integer casting-mode code as returned by the forward function.
        BLOCK_SIZE, num_warps: launch settings chosen in the forward pass.
        in_place: when True, dX is written into dY's storage instead of a new tensor.

    Returns:
        Tuple `(dX, dR, dW)`; dX and dR are the same tensor (viewed to dY's original shape)
        and dW is cast to W's dtype.

    Raises:
        RuntimeError: if the feature dimension exceeds BLOCK_SIZE.
    """
    shape = dY.shape
    dim = shape[-1]
    dY = dY.view(-1, dim)
    # dS_out is optional: only reshape it (and later read its stride) when it is present.
    has_dS_out = dS_out is not None
    if has_dS_out:
        dS_out = dS_out.view(-1, dim)
    S = S.view(-1, dim)
    n_rows, n_cols = dY.shape

    # Validate before querying the device or allocating buffers.
    if n_cols > BLOCK_SIZE:
        raise RuntimeError("This fused add RMS norm doesn't support feature dim >= 64KB.")

    # Number of programs to launch; falls back to 1 for device types not listed here.
    sm_count = 1
    if S.device.type == "cuda":
        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
    elif S.device.type == "xpu":
        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
    elif S.device.type == "npu":
        sm_count = get_npu_multi_processor_count()

    # fp32 for numerical stability especially.
    # One partial-dW row per program; reduced over dim 0 after the launch.
    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

    rows_per_program = math.ceil(n_rows / sm_count)
    grid = (sm_count,)

    if in_place is True:
        dX = dY
    else:
        dX = torch.empty_like(dY)

    # XPU-specific optimization
    kernel_args = {}
    if S.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_backward_kernel
    _fused_add_rms_norm_backward_kernel[grid](
        dY,
        dY.stride(0),
        dS_out,
        # The stride is never used by the kernel when has_dS_out is False.
        dS_out.stride(0) if has_dS_out else 0,
        dX,
        dX.stride(0),
        S,
        S.stride(0),
        torch_to_triton_dtype[S.dtype],
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        offset,
        rows_per_program,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        has_dS_out=has_dS_out,
        **kernel_args,  # XPU-specific optimization
    )

    dX = dX.view(*shape)
    dW = _dW.sum(dim=0).to(W.dtype)

    return dX, dX, dW  # dR is equal to dX
351
+
352
+
353
class LigerFusedAddRMSNormFunction(torch.autograd.Function):
    """
    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.

    This class implements the following sequence, commonly used in transformer decoder layers:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states (after addition)
    3. hidden_states = rmsnorm(hidden_states)

    Both the normalized hidden_states and the updated residual are returned as outputs.

    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.

    In addition, different models cast their inputs at different places during RMSNorm computation. For
    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
    support the following casting modes (they match HuggingFace Transformers' implementations):
    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.

    The `in_place` option determines whether to modify dY in-place to store dX during the backward pass.
    This defaults to `False`; pass `True` to save memory.
    """

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False):
        """
        X: (B, T, H) or (BxT, H)
        R: residual to add to X; must hold the same number of rows and columns as X
        W: (H,)

        Returns `(Y, S)`: the normalized output and the residual sum `X + R`.
        """
        # TODO: add row_mode
        Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode)
        # Non-tensor state needed by backward; `casting_mode` is the resolved integer code here.
        ctx.offset = offset
        ctx.casting_mode = casting_mode
        ctx.in_place = in_place
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        # S (not X) is saved: it is the tensor the normalization was actually applied to.
        ctx.save_for_backward(S, W, RSTD)
        return Y, S

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY, dS_out):
        """
        dY: (B, T, H) or (BxT, H), gradient w.r.t. the normalized output Y
        dS_out: gradient w.r.t. the returned residual sum S

        Returns one gradient per forward input: (dX, dR, dW) followed by None for
        eps, offset, casting_mode and in_place.
        """
        S, W, RSTD = ctx.saved_tensors
        dX, dR, dW = fused_add_rms_norm_backward(
            dY,
            dS_out,
            S,
            W,
            RSTD,
            ctx.offset,
            ctx.casting_mode,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
            ctx.in_place,
        )

        # forward takes 7 inputs (X, R, W, eps, offset, casting_mode, in_place),
        # so exactly 7 gradients are returned.
        return dX, dR, dW, None, None, None, None
@@ -6,11 +6,12 @@ from liger_kernel.ops.utils import amp_custom_bwd
6
6
  from liger_kernel.ops.utils import amp_custom_fwd
7
7
  from liger_kernel.ops.utils import element_mul_kernel
8
8
  from liger_kernel.ops.utils import is_hip
9
+ from liger_kernel.utils import infer_device
9
10
 
10
11
  # The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
11
12
  # However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
12
13
  # The optimal maximum block size depends on your hardware, your kernel, and your dtype
13
- MAX_FUSED_SIZE = 65536 // 2
14
+ MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2
14
15
 
15
16
 
16
17
  def fused_linear_cross_entropy_forward(
@@ -25,10 +26,18 @@ def fused_linear_cross_entropy_forward(
25
26
  reduction="mean",
26
27
  softcap=None,
27
28
  return_z_loss=False,
29
+ accum_dtype=None,
30
+ use_token_scaling=False,
31
+ return_token_accuracy=False,
28
32
  ):
29
33
  assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
34
+ assert isinstance(return_token_accuracy, bool), (
35
+ f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
36
+ )
30
37
  device = _input.device
31
38
 
39
+ input_requires_grad = _input.requires_grad
40
+
32
41
  # inputs have shape: BT x H
33
42
  # materialized activations will have shape: BT x V
34
43
  # the increase in memory = BT x V
@@ -44,12 +53,23 @@ def fused_linear_cross_entropy_forward(
44
53
  chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
45
54
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size
46
55
 
47
- grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
48
56
  grad_input = torch.zeros_like(_input, device=device)
49
- grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
50
- # we use fp32 for loss accumulator
57
+
58
+ # we use fp32 for loss and gradients accumulator
59
+ if input_requires_grad:
60
+ if accum_dtype is None:
61
+ grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
62
+ grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
63
+ else:
64
+ grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
65
+ grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
66
+ else:
67
+ grad_weight = None
68
+ grad_bias = None
69
+
51
70
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
52
71
  z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
72
+ token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
53
73
 
54
74
  # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
55
75
  target_mask = target != ignore_index
@@ -82,9 +102,40 @@ def fused_linear_cross_entropy_forward(
82
102
 
83
103
  n_rows = logits_chunk.shape[0]
84
104
 
105
+ # Compute predicted probabilities for token scaling if needed
106
+ if use_token_scaling:
107
+ # Compute softmax probabilities for scaling
108
+ # We need to compute this before the cross entropy kernel modifies logits_chunk
109
+ logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
110
+ if softcap is not None:
111
+ logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
112
+
113
+ # Compute softmax to get predicted probabilities
114
+ probs = torch.softmax(logits_for_softmax, dim=-1)
115
+
116
+ # Get predicted probabilities for token scaling, handling ignored targets
117
+ valid_target_mask = target_chunk != ignore_index
118
+ valid_targets = target_chunk[valid_target_mask]
119
+
120
+ if len(valid_targets) > 0:
121
+ # Gather probabilities only for valid targets
122
+ valid_probs = probs[valid_target_mask]
123
+ pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
124
+
125
+ # Create full tensor with zeros for ignored targets
126
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
127
+ pred_probs[valid_target_mask] = pred_probs_valid
128
+ else:
129
+ # All targets are ignored
130
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
131
+
132
+ # Store the scaling factors
133
+ scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
134
+
85
135
  # unreduced loss
86
136
  loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
87
137
  z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
138
+ token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None
88
139
 
89
140
  # ensure _input and target are contiguous
90
141
  logits_chunk = logits_chunk.contiguous()
@@ -100,6 +151,10 @@ def fused_linear_cross_entropy_forward(
100
151
  loss_ptr=loss_1d_slice,
101
152
  z_loss_ptr=z_loss_1d_slice,
102
153
  loss_stride=loss_1d_slice.stride(-1), # always 1
154
+ token_accuracy_ptr=token_accuracy_1d_slice,
155
+ token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
156
+ if return_token_accuracy
157
+ else 0, # always 1 if accuracy is enabled
103
158
  n_cols=V,
104
159
  n_non_ignore=total_n_non_ignore,
105
160
  sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
@@ -110,35 +165,43 @@ def fused_linear_cross_entropy_forward(
110
165
  reduction=reduction,
111
166
  softcap=softcap,
112
167
  RETURN_Z_LOSS=return_z_loss,
168
+ RETURN_TOKEN_ACCURACY=return_token_accuracy,
113
169
  HAS_WEIGHT=True if ce_weight is not None else False,
114
170
  HAS_SOFTCAPPING=True if softcap is not None else False,
171
+ HAS_GRADIENTS=input_requires_grad,
115
172
  BLOCK_SIZE=BLOCK_SIZE,
116
173
  num_warps=32 if not is_hip() else 16,
117
174
  )
118
175
 
176
+ # Apply token scaling if requested
177
+ if use_token_scaling:
178
+ loss_1d_slice = loss_1d_slice * scaling_factors
179
+ if return_z_loss:
180
+ z_loss_1d_slice = z_loss_1d_slice * scaling_factors
181
+
119
182
  loss_1d[start_idx:end_idx] = loss_1d_slice
120
183
  if return_z_loss:
121
184
  z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
185
+ if return_token_accuracy:
186
+ token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
122
187
  grad_logits_chunk = logits_chunk # chunk_size x V
123
188
 
124
- grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
189
+ # Apply token scaling to gradients if requested
190
+ if use_token_scaling:
191
+ # Expand scaling factors to match gradient dimensions
192
+ scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
193
+ grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
125
194
 
126
- if grad_weight is not None:
127
- torch.addmm(
128
- input=grad_weight,
129
- mat1=logits_chunk.t().to(
130
- _input_chunk.dtype
131
- ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
132
- mat2=_input_chunk,
133
- out=grad_weight,
134
- alpha=1.0,
135
- beta=1.0,
136
- )
195
+ if input_requires_grad:
196
+ grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
137
197
 
138
- if bias is not None:
198
+ if grad_weight is not None and input_requires_grad:
199
+ grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
200
+
201
+ if bias is not None and input_requires_grad:
139
202
  torch.add(
140
203
  input=grad_bias,
141
- other=logits_chunk.sum(dim=0),
204
+ other=grad_logits_chunk.sum(dim=0),
142
205
  out=grad_bias,
143
206
  alpha=1.0,
144
207
  )
@@ -148,10 +211,22 @@ def fused_linear_cross_entropy_forward(
148
211
  # loss = loss_1d
149
212
  # z_loss = z_loss_1d if return_z_loss else None
150
213
 
214
+ if reduction == "none":
215
+ # Return per-token losses
216
+ loss = loss_1d
217
+ z_loss = z_loss_1d if return_z_loss else None
218
+ token_accuracy = token_accuracy_1d if return_token_accuracy else None
151
219
  else:
152
220
  loss = torch.sum(loss_1d)
153
221
  z_loss = torch.sum(z_loss_1d) if return_z_loss else None
154
- return loss, z_loss, grad_input, grad_weight, grad_bias
222
+ # For accuracy, we compute the mean across all non-ignored tokens
223
+ token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
224
+
225
+ # Cast back to original dtype
226
+ grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
227
+ grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
228
+
229
+ return loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias
155
230
 
156
231
 
157
232
  def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
@@ -217,6 +292,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
217
292
  reduction="mean",
218
293
  softcap=None,
219
294
  return_z_loss: bool = False,
295
+ accum_dtype=None,
296
+ use_token_scaling: bool = False,
297
+ return_token_accuracy: bool = False,
220
298
  ):
221
299
  """
222
300
  Fusing the last linear layer with cross-entropy loss
@@ -235,9 +313,15 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
235
313
  ignore_index: the index to ignore in the target
236
314
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
237
315
  reduction: reduction to apply
316
+ accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
317
+ Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
318
+ use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
319
+ When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
320
+ Default: False.
321
+ return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
238
322
  """
239
323
 
240
- loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
324
+ loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
241
325
  _input=_input,
242
326
  weight=weight,
243
327
  target=target,
@@ -249,6 +333,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
249
333
  reduction=reduction,
250
334
  softcap=softcap,
251
335
  return_z_loss=return_z_loss,
336
+ accum_dtype=accum_dtype,
337
+ use_token_scaling=use_token_scaling,
338
+ return_token_accuracy=return_token_accuracy,
252
339
  )
253
340
  # downcast to dtype and store for backward
254
341
  ctx.save_for_backward(
@@ -257,13 +344,16 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
257
344
  grad_bias.detach() if bias is not None else None,
258
345
  )
259
346
  ctx.return_z_loss = return_z_loss
260
- return loss, z_loss
347
+ ctx.return_token_accuracy = return_token_accuracy
348
+ return loss, z_loss, token_accuracy
261
349
 
262
350
  @staticmethod
263
351
  @amp_custom_bwd
264
- def backward(ctx, grad_output, grad_output2):
352
+ def backward(ctx, grad_output, grad_output2, grad_output3):
265
353
  if ctx.return_z_loss:
266
354
  del grad_output2 # z_loss is only for logging
355
+ if ctx.return_token_accuracy:
356
+ del grad_output3 # token_accuracy is only for metrics
267
357
  (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
268
358
  grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
269
359
  grad_output, grad_input, grad_weight, grad_bias
@@ -280,4 +370,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
280
370
  None,
281
371
  None,
282
372
  None,
373
+ None,
374
+ None, # use_token_scaling
375
+ None, # return_token_accuracy
283
376
  )