liger-kernel-nightly 0.5.10.dev20250624183504__py3-none-any.whl → 0.6.4.dev20251121224847__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic. Click here for more details.

Files changed (73)
  1. liger_kernel/chunked_loss/__init__.py +1 -0
  2. liger_kernel/chunked_loss/cosine_similarity_loss.py +136 -0
  3. liger_kernel/chunked_loss/dpo_loss.py +54 -3
  4. liger_kernel/chunked_loss/functional.py +2 -0
  5. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  6. liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
  7. liger_kernel/chunked_loss/grpo_loss.py +46 -9
  8. liger_kernel/chunked_loss/jsd_loss.py +23 -7
  9. liger_kernel/ops/cross_entropy.py +118 -62
  10. liger_kernel/ops/fused_add_rms_norm.py +412 -0
  11. liger_kernel/ops/fused_linear_cross_entropy.py +113 -21
  12. liger_kernel/ops/geglu.py +1 -1
  13. liger_kernel/ops/grpo_loss.py +3 -1
  14. liger_kernel/ops/layer_norm.py +133 -79
  15. liger_kernel/ops/llama4_rope.py +225 -0
  16. liger_kernel/ops/poly_norm.py +386 -0
  17. liger_kernel/ops/rms_norm.py +2 -2
  18. liger_kernel/ops/rope.py +1 -1
  19. liger_kernel/ops/swiglu.py +1 -1
  20. liger_kernel/ops/tiled_mlp.py +136 -0
  21. liger_kernel/transformers/__init__.py +59 -0
  22. liger_kernel/transformers/cross_entropy.py +8 -3
  23. liger_kernel/transformers/experimental/__init__.py +5 -0
  24. liger_kernel/transformers/functional.py +38 -6
  25. liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  26. liger_kernel/transformers/fused_linear_cross_entropy.py +16 -4
  27. liger_kernel/transformers/grpo_loss.py +56 -1
  28. liger_kernel/transformers/llama4_rope.py +93 -0
  29. liger_kernel/transformers/model/falcon_h1.py +122 -0
  30. liger_kernel/transformers/model/gemma.py +28 -8
  31. liger_kernel/transformers/model/gemma2.py +31 -8
  32. liger_kernel/transformers/model/gemma3.py +100 -110
  33. liger_kernel/transformers/model/glm4.py +18 -5
  34. liger_kernel/transformers/model/glm4v.py +163 -0
  35. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  36. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  37. liger_kernel/transformers/model/internvl.py +157 -0
  38. liger_kernel/transformers/model/llama.py +26 -7
  39. liger_kernel/transformers/model/llama4.py +121 -0
  40. liger_kernel/transformers/model/llava.py +18 -6
  41. liger_kernel/transformers/model/loss_utils.py +34 -3
  42. liger_kernel/transformers/model/mistral.py +17 -10
  43. liger_kernel/transformers/model/mixtral.py +24 -9
  44. liger_kernel/transformers/model/mllama.py +18 -7
  45. liger_kernel/transformers/model/olmo2.py +18 -5
  46. liger_kernel/transformers/model/olmo3.py +142 -0
  47. liger_kernel/transformers/model/output_classes.py +147 -0
  48. liger_kernel/transformers/model/paligemma.py +41 -5
  49. liger_kernel/transformers/model/phi3.py +24 -159
  50. liger_kernel/transformers/model/qwen2.py +26 -4
  51. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  52. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  53. liger_kernel/transformers/model/qwen3.py +22 -6
  54. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  55. liger_kernel/transformers/model/qwen3_next.py +146 -0
  56. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  57. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  58. liger_kernel/transformers/model/smollm3.py +199 -0
  59. liger_kernel/transformers/model/smolvlm.py +158 -0
  60. liger_kernel/transformers/monkey_patch.py +1278 -116
  61. liger_kernel/transformers/multi_token_attention.py +1 -1
  62. liger_kernel/transformers/poly_norm.py +42 -0
  63. liger_kernel/transformers/rms_norm.py +7 -0
  64. liger_kernel/transformers/rope.py +43 -0
  65. liger_kernel/transformers/swiglu.py +17 -0
  66. liger_kernel/transformers/tiled_mlp.py +133 -0
  67. {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/METADATA +29 -24
  68. liger_kernel_nightly-0.6.4.dev20251121224847.dist-info/RECORD +118 -0
  69. liger_kernel_nightly-0.5.10.dev20250624183504.dist-info/RECORD +0 -95
  70. {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/LICENSE +0 -0
  71. {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/NOTICE +0 -0
  72. {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/WHEEL +0 -0
  73. {liger_kernel_nightly-0.5.10.dev20250624183504.dist-info → liger_kernel_nightly-0.6.4.dev20251121224847.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,412 @@
1
+ import math
2
+ import operator
3
+
4
+ import torch
5
+ import triton
6
+ import triton.language as tl
7
+
8
+ from liger_kernel.ops.utils import calculate_settings
9
+ from liger_kernel.ops.utils import compare_version
10
+ from liger_kernel.ops.utils import ensure_contiguous
11
+ from liger_kernel.ops.utils import torch_to_triton_dtype
12
+
13
+ if compare_version("triton", operator.ge, "3.0.0"):
14
+ try:
15
+ # typical import path with dispatch available
16
+ from triton.language.extra.libdevice import rsqrt
17
+ except ModuleNotFoundError:
18
+ # for working with NGC containers
19
+ from triton.language.extra.cuda.libdevice import rsqrt
20
+ else:
21
+ from triton.language.math import rsqrt
22
+
23
+
24
# Integer tags that select where the fused add + RMSNorm kernels cast to fp32.
# They are wrapped in tl.constexpr so the kernels can compare `casting_mode`
# against them at compile time and drop the branches that do not apply.
#   NONE  (-1): no casting, everything stays in the input dtype
#   LLAMA  (0): only the inverse RMS is computed in fp32
#   GEMMA  (1): everything is computed in fp32, output is cast back at the end
_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)
_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)
_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)
27
+
28
+
29
@triton.jit
def _fused_add_rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    S_ptr,  # output residual
    S_row_stride,
    X_ptr,
    X_row_stride,
    R_ptr,  # input residual
    R_row_stride,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    offset,
    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
    BLOCK_SIZE: tl.constexpr,
):
    """
    Fused residual-add + RMSNorm forward pass; each program instance handles one row.

    This kernel computes the following:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    This is a commonly used pattern in the decoder layers of LLMs.
    Some examples:
    1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271
    2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393

    This kernel is inspired by the rms_norm forward kernel, and is adapted to support the residual addition in the forward pass.
    The backward pass is also adapted to support the residual addition in the backward pass.

    Args:
        Y_ptr, Y_row_stride: output buffer for the normalized row and its row stride.
        S_ptr, S_row_stride: output buffer for the sum `X + R` (the new residual) and its row stride.
        X_ptr, X_row_stride: input hidden states and row stride.
        R_ptr, R_row_stride: input residual and row stride.
        W_ptr: weight vector, indexed by column only.
        W_row_stride: accepted for signature symmetry; not read in this kernel body.
        RSTD_ptr, RSTD_row_stride: per-row cache for the inverse RMS, one scalar per row.
        n_cols: number of valid columns in a row.
        eps: value added to the mean square before the reciprocal square root.
        offset: constant added to the weight before scaling.
        casting_mode: one of the `_CASTING_MODE_*` tags.
        BLOCK_SIZE: number of lanes used to cover a row; lanes at or beyond `n_cols` are masked off.
    """

    # program_id(0) selects the row; widened to int64 before multiplying by the strides
    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Advance every per-row pointer to the start of this program's row
    Y_ptr += row_idx * Y_row_stride
    S_ptr += row_idx * S_row_stride
    X_ptr += row_idx * X_row_stride
    R_ptr += row_idx * R_row_stride
    RSTD_ptr += row_idx * RSTD_row_stride

    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
    R_row = tl.load(R_ptr + col_offsets, mask=mask, other=0)
    # The sum is written out before any casting below, so S keeps the un-normalized value
    S_row = X_row + R_row
    tl.store(S_ptr + col_offsets, S_row, mask=mask)
    S_row_dtype = S_row.dtype
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)

    # On Llama, only rstd is computed on fp32
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(tl.float32)

    # Gemma computes everything on fp32, and then casts back the output to the original dtype
    if casting_mode == _CASTING_MODE_GEMMA:
        W_row = W_row.to(tl.float32)
        S_row = S_row.to(tl.float32)

    # With no casting, bring the scalars down to the row dtype instead
    if casting_mode == _CASTING_MODE_NONE:
        eps = eps.to(S_row_dtype)
        offset = offset.to(S_row_dtype)

    # Masked lanes were loaded as 0, so they add nothing to the sum; divide by the true width
    mean_square = tl.sum(S_row * S_row, axis=0) / n_cols
    rstd = rsqrt(mean_square + eps)

    # We can save time by caching rms with minimal memory overhead
    # because rms is much smaller compared to X_row, as rms is for each row.
    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
    tl.store(RSTD_ptr, rstd)

    S_row = S_row * rstd

    # On Llama, the multiplication with the weight is done on the original dtype
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(S_row_dtype)

    Y_row = S_row * (offset + W_row)

    # Gemma casts back to the original dtype only after the weight multiplication
    if casting_mode == _CASTING_MODE_GEMMA:
        Y_row = Y_row.to(S_row_dtype)

    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
114
+
115
+
116
@triton.jit
def _fused_add_rms_norm_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dS_out_ptr,
    dS_out_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    offset,
    rows_per_program: tl.constexpr,
    casting_mode: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    has_dS_out: tl.constexpr,
):
    """
    Fused residual-add + RMSNorm backward pass; each program handles a contiguous range of rows.

    This kernel is adapted from the rms_norm backward kernel, and is adapted to support the residual
    addition in the backward pass. For the following code pattern:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    The gradients of hidden_states and residual come out to be exactly the same. The value of this gradient is
    the sum of the gradient of the hidden_states in step 3 and the gradient of the residual in step 2.

    The backward pass computation logic is the same as the rms_norm backward kernel, except that the gradient
    of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.

    Args:
        dY_ptr, dY_row_stride: gradient w.r.t. the normalized output and its row stride.
        dS_out_ptr, dS_out_row_stride: gradient w.r.t. the residual output; only read when `has_dS_out`.
        dX_ptr, dX_row_stride: output buffer for the input gradient and its row stride.
        X_ptr, X_row_stride: the tensor that was normalized in the forward pass (the host wrapper
            passes the saved post-addition sum here) and its row stride.
        X_dtype: dtype the input gradient is cast to before being stored.
        W_ptr: weight vector, indexed by column only.
        W_row_stride: not read in this kernel body.
        RSTD_ptr, RSTD_row_stride: cached per-row inverse RMS from the forward pass.
        dW_ptr, dW_row_stride: buffer of partial weight gradients, one row per program.
        n_rows, n_cols: logical size of the 2-D problem.
        offset: constant added to the weight.
        rows_per_program: number of consecutive rows assigned to each program.
        casting_mode: one of the `_CASTING_MODE_*` tags.
        BLOCK_SIZE: number of lanes used to cover a row; lanes at or beyond `n_cols` are masked off.
        has_dS_out: whether `dS_out_ptr` should be read and added into the input gradient.
    """

    # Rows [row_start, row_end) belong to this program; the last program may get fewer rows
    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Per-program fp32 accumulator for the weight gradient
    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    dY_ptr += row_start * dY_row_stride
    dX_ptr += row_start * dX_row_stride
    if has_dS_out:
        dS_out_ptr += row_start * dS_out_row_stride

    X_ptr += row_start * X_row_stride
    # NOTE(review): this initial offset is not scaled by RSTD_row_stride although the loop below
    # advances by RSTD_row_stride; the two agree only when that stride is 1 — confirm with the caller.
    RSTD_ptr += row_start

    # The weight (plus offset) is the same for every row, so load it once outside the loop
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
    W_row = W_row + offset

    for _ in range(row_start, row_end):
        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)

        # Get cached rms
        rstd_row = tl.load(RSTD_ptr)

        X_row = X_row.to(tl.float32)

        # Different backward graphs for different casting modes
        if casting_mode == _CASTING_MODE_LLAMA:
            m = (dY_row * W_row).to(tl.float32)

        elif casting_mode == _CASTING_MODE_GEMMA:
            dY_row = dY_row.to(tl.float32)
            m = dY_row * W_row
        else:
            m = dY_row * W_row

        # First term of the input gradient: rstd * (dY * W)
        dX_row = rstd_row * m

        # Second term: -(1/n) * rstd^3 * sum(m * X) * X, plus the residual-path gradient if present
        if has_dS_out:
            dS_out_row = tl.load(dS_out_ptr + col_offsets, mask=mask, other=0.0)
            dX_row += (rstd_row) * (
                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
            ) + dS_out_row
            dS_out_ptr += dS_out_row_stride
        else:
            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

        # calculate the gradient of W
        if casting_mode == _CASTING_MODE_LLAMA:
            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
        else:
            # here X_row is already in fp32 (see previous if block)
            dW_row += dY_row * (X_row * rstd_row)

        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)

        # Step every per-row pointer to the next row
        dY_ptr += dY_row_stride
        dX_ptr += dX_row_stride
        X_ptr += X_row_stride
        RSTD_ptr += RSTD_row_stride

    # Each program writes its partial weight gradient to its own row of the dW buffer
    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
219
+
220
+
221
# Maps the user-facing casting-mode names to the plain integer values of the
# `_CASTING_MODE_*` constexpr tags that the kernels compare against.
_str_to_casting_mode = {
    "llama": _CASTING_MODE_LLAMA.value,
    "gemma": _CASTING_MODE_GEMMA.value,
    "none": _CASTING_MODE_NONE.value,
}
226
+
227
+
228
def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode):
    """
    Launch the fused residual-add + RMSNorm forward kernel.

    Args:
        X: hidden states; flattened to 2-D over its last dimension.
        R: residual to add to `X`; must flatten to the same 2-D shape as `X`.
        W: 1-D weight whose length equals the last dimension of `X`.
        eps: value added to the mean square before the reciprocal square root.
        offset: constant added to `W` before scaling.
        casting_mode: either a key of `_str_to_casting_mode` ("llama", "gemma", "none")
            or one of its integer values.

    Returns:
        A tuple `(Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode)` where `Y` is the
        normalized output and `S` the sum `X + R`, both reshaped to `X`'s original shape;
        `RSTD` holds one cached inverse RMS per row; `BLOCK_SIZE` and `num_warps` are the
        launch settings; `casting_mode` is the resolved integer tag.

    Raises:
        AssertionError: if the shapes of `X`, `R` and `W` are incompatible or the casting
            mode is unknown.
    """
    shape = X.shape
    dim = shape[-1]
    X = X.view(-1, dim)
    R = R.view(-1, dim)

    # Check constraints before allocating or launching anything. The kernel indexes R with
    # X's row layout, so a residual with fewer rows would be read out of bounds instead of
    # raising.
    assert X.shape == R.shape, (
        f"Residual shape {tuple(R.shape)} must match hidden_states shape {tuple(X.shape)} after flattening"
    )
    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"

    if not isinstance(casting_mode, int):
        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
        casting_mode = _str_to_casting_mode[casting_mode]
    else:
        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"

    n_rows, n_cols = X.shape
    BLOCK_SIZE, num_warps = calculate_settings(n_cols)

    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    # RSTD is to cache rstd for each row
    # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)

    # XPU-specific optimization
    kernel_args = {}
    if X.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # One program per row.
    # TODO: add _block_fused_add_rms_norm_forward_kernel
    _fused_add_rms_norm_forward_kernel[(n_rows,)](
        Y,
        Y.stride(0),
        S,
        S.stride(0),
        X,
        X.stride(0),
        R,
        R.stride(0),
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        eps,
        offset,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        **kernel_args,  # XPU-specific optimization
    )

    return Y.view(*shape), S.view(*shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
281
+
282
+
283
def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
    """
    Launch the fused residual-add + RMSNorm backward kernel.

    Args:
        dY: gradient w.r.t. the normalized output; flattened to 2-D over its last dimension.
        dS_out: gradient w.r.t. the residual output, or `None` when there is none to add.
        S: the post-addition tensor saved by the forward pass.
        W: the weight vector used in the forward pass.
        RSTD: per-row inverse RMS cached by the forward pass.
        offset: constant added to `W`.
        casting_mode: integer casting-mode tag resolved by the forward pass.
        BLOCK_SIZE: block size chosen by the forward pass.
        num_warps: warp count chosen by the forward pass.
        in_place: when `True`, the input gradient is written into `dY`'s storage.

    Returns:
        `(dX, dR, dW)`. `dX` and `dR` are the same tensor because both inputs receive an
        identical gradient; `dW` is cast to `W`'s dtype.

    Raises:
        RuntimeError: if the feature dimension is larger than `BLOCK_SIZE`.
    """
    shape = dY.shape
    dim = shape[-1]
    dY = dY.view(-1, dim)
    S = S.view(-1, dim)
    n_rows, n_cols = dY.shape

    # The residual-path gradient is optional: only reshape it and read its stride when it
    # exists, so the kernel's `has_dS_out=False` branch is actually reachable.
    has_dS_out = dS_out is not None
    if has_dS_out:
        dS_out = dS_out.view(-1, dim)
        dS_out_row_stride = dS_out.stride(0)
    else:
        dS_out_row_stride = 0

    # Reject unsupported sizes before querying the device or allocating buffers.
    if n_cols > BLOCK_SIZE:
        raise RuntimeError("This fused add RMS norm doesn't support feature dim >= 64KB.")

    # One program per compute unit; fall back to a single program on other devices.
    sm_count = 1
    if S.device.type == "cuda":
        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
    elif S.device.type == "xpu":
        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count

    # fp32 for numerical stability especially.
    # Every program stores its partial weight gradient into its own row, so the buffer is
    # fully written before it is reduced below.
    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

    rows_per_program = math.ceil(n_rows / sm_count)
    grid = (sm_count,)

    if in_place is True:
        dX = dY
    else:
        dX = torch.empty_like(dY)

    # XPU-specific optimization
    kernel_args = {}
    if S.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_backward_kernel
    _fused_add_rms_norm_backward_kernel[grid](
        dY,
        dY.stride(0),
        dS_out,
        dS_out_row_stride,
        dX,
        dX.stride(0),
        S,
        S.stride(0),
        torch_to_triton_dtype[S.dtype],
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        offset,
        rows_per_program,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        has_dS_out=has_dS_out,
        **kernel_args,  # XPU-specific optimization
    )

    dX = dX.view(*shape)
    # Reduce the per-program partial sums into the final weight gradient.
    dW = _dW.sum(dim=0).to(W.dtype)

    return dX, dX, dW  # dR is equal to dX
347
+
348
+
349
class LigerFusedAddRMSNormFunction(torch.autograd.Function):
    """
    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.

    This class implements the following sequence, commonly used in transformer decoder layers:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states (after addition)
    3. hidden_states = rmsnorm(hidden_states)

    Both the normalized hidden_states and the updated residual are returned as outputs.

    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.

    In addition, different models cast their inputs at different places during RMSNorm computation. For
    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
    support the following casting modes (they match HuggingFace Transformers' implementations):
    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.

    The `in_place` option determines whether to modify dY in-place to store dX. This defaults to `False`;
    pass `True` to save memory.
    """

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False):
        """
        X: (B, T, H) or (BxT, H)
        R: residual to add to X
        W: (H,)

        Returns the normalized output Y and the updated residual S = X + R.
        """
        # TODO: add row_mode
        Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode)
        # Non-tensor settings are kept on ctx; tensors needed by backward go through save_for_backward
        ctx.offset = offset
        ctx.casting_mode = casting_mode
        ctx.in_place = in_place
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.save_for_backward(S, W, RSTD)
        return Y, S

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY, dS_out):
        """
        dY: gradient w.r.t. Y, (B, T, H) or (BxT, H)
        dS_out: gradient w.r.t. the returned residual S
        """
        S, W, RSTD = ctx.saved_tensors
        dX, dR, dW = fused_add_rms_norm_backward(
            dY,
            dS_out,
            S,
            W,
            RSTD,
            ctx.offset,
            ctx.casting_mode,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
            ctx.in_place,
        )

        # One gradient per forward input: X, R, W, eps, offset, casting_mode, in_place
        return dX, dR, dW, None, None, None, None
@@ -25,10 +25,18 @@ def fused_linear_cross_entropy_forward(
25
25
  reduction="mean",
26
26
  softcap=None,
27
27
  return_z_loss=False,
28
+ accum_dtype=None,
29
+ use_token_scaling=False,
30
+ return_token_accuracy=False,
28
31
  ):
29
32
  assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
33
+ assert isinstance(return_token_accuracy, bool), (
34
+ f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
35
+ )
30
36
  device = _input.device
31
37
 
38
+ input_requires_grad = _input.requires_grad
39
+
32
40
  # inputs have shape: BT x H
33
41
  # materialized activations will have shape: BT x V
34
42
  # the increase in memory = BT x V
@@ -44,12 +52,23 @@ def fused_linear_cross_entropy_forward(
44
52
  chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
45
53
  num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size
46
54
 
47
- grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
48
55
  grad_input = torch.zeros_like(_input, device=device)
49
- grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
50
- # we use fp32 for loss accumulator
56
+
57
+ # we use fp32 for loss and gradients accumulator
58
+ if input_requires_grad:
59
+ if accum_dtype is None:
60
+ grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
61
+ grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
62
+ else:
63
+ grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
64
+ grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
65
+ else:
66
+ grad_weight = None
67
+ grad_bias = None
68
+
51
69
  loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
52
70
  z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
71
+ token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
53
72
 
54
73
  # TODO: evaluate how CUDA synchronization caused by .item() affects the speed
55
74
  target_mask = target != ignore_index
@@ -82,9 +101,40 @@ def fused_linear_cross_entropy_forward(
82
101
 
83
102
  n_rows = logits_chunk.shape[0]
84
103
 
104
+ # Compute predicted probabilities for token scaling if needed
105
+ if use_token_scaling:
106
+ # Compute softmax probabilities for scaling
107
+ # We need to compute this before the cross entropy kernel modifies logits_chunk
108
+ logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
109
+ if softcap is not None:
110
+ logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
111
+
112
+ # Compute softmax to get predicted probabilities
113
+ probs = torch.softmax(logits_for_softmax, dim=-1)
114
+
115
+ # Get predicted probabilities for token scaling, handling ignored targets
116
+ valid_target_mask = target_chunk != ignore_index
117
+ valid_targets = target_chunk[valid_target_mask]
118
+
119
+ if len(valid_targets) > 0:
120
+ # Gather probabilities only for valid targets
121
+ valid_probs = probs[valid_target_mask]
122
+ pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
123
+
124
+ # Create full tensor with zeros for ignored targets
125
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
126
+ pred_probs[valid_target_mask] = pred_probs_valid
127
+ else:
128
+ # All targets are ignored
129
+ pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
130
+
131
+ # Store the scaling factors
132
+ scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
133
+
85
134
  # unreduced loss
86
135
  loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
87
136
  z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
137
+ token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None
88
138
 
89
139
  # ensure _input and target are contiguous
90
140
  logits_chunk = logits_chunk.contiguous()
@@ -100,6 +150,10 @@ def fused_linear_cross_entropy_forward(
100
150
  loss_ptr=loss_1d_slice,
101
151
  z_loss_ptr=z_loss_1d_slice,
102
152
  loss_stride=loss_1d_slice.stride(-1), # always 1
153
+ token_accuracy_ptr=token_accuracy_1d_slice,
154
+ token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
155
+ if return_token_accuracy
156
+ else 0, # always 1 if accuracy is enabled
103
157
  n_cols=V,
104
158
  n_non_ignore=total_n_non_ignore,
105
159
  sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
@@ -110,35 +164,43 @@ def fused_linear_cross_entropy_forward(
110
164
  reduction=reduction,
111
165
  softcap=softcap,
112
166
  RETURN_Z_LOSS=return_z_loss,
167
+ RETURN_TOKEN_ACCURACY=return_token_accuracy,
113
168
  HAS_WEIGHT=True if ce_weight is not None else False,
114
169
  HAS_SOFTCAPPING=True if softcap is not None else False,
170
+ HAS_GRADIENTS=input_requires_grad,
115
171
  BLOCK_SIZE=BLOCK_SIZE,
116
172
  num_warps=32 if not is_hip() else 16,
117
173
  )
118
174
 
175
+ # Apply token scaling if requested
176
+ if use_token_scaling:
177
+ loss_1d_slice = loss_1d_slice * scaling_factors
178
+ if return_z_loss:
179
+ z_loss_1d_slice = z_loss_1d_slice * scaling_factors
180
+
119
181
  loss_1d[start_idx:end_idx] = loss_1d_slice
120
182
  if return_z_loss:
121
183
  z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
184
+ if return_token_accuracy:
185
+ token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
122
186
  grad_logits_chunk = logits_chunk # chunk_size x V
123
187
 
124
- grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
188
+ # Apply token scaling to gradients if requested
189
+ if use_token_scaling:
190
+ # Expand scaling factors to match gradient dimensions
191
+ scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
192
+ grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
125
193
 
126
- if grad_weight is not None:
127
- torch.addmm(
128
- input=grad_weight,
129
- mat1=logits_chunk.t().to(
130
- _input_chunk.dtype
131
- ), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
132
- mat2=_input_chunk,
133
- out=grad_weight,
134
- alpha=1.0,
135
- beta=1.0,
136
- )
194
+ if input_requires_grad:
195
+ grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
137
196
 
138
- if bias is not None:
197
+ if grad_weight is not None and input_requires_grad:
198
+ grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
199
+
200
+ if bias is not None and input_requires_grad:
139
201
  torch.add(
140
202
  input=grad_bias,
141
- other=logits_chunk.sum(dim=0),
203
+ other=grad_logits_chunk.sum(dim=0),
142
204
  out=grad_bias,
143
205
  alpha=1.0,
144
206
  )
@@ -148,10 +210,22 @@ def fused_linear_cross_entropy_forward(
148
210
  # loss = loss_1d
149
211
  # z_loss = z_loss_1d if return_z_loss else None
150
212
 
213
+ if reduction == "none":
214
+ # Return per-token losses
215
+ loss = loss_1d
216
+ z_loss = z_loss_1d if return_z_loss else None
217
+ token_accuracy = token_accuracy_1d if return_token_accuracy else None
151
218
  else:
152
219
  loss = torch.sum(loss_1d)
153
220
  z_loss = torch.sum(z_loss_1d) if return_z_loss else None
154
- return loss, z_loss, grad_input, grad_weight, grad_bias
221
+ # For accuracy, we compute the mean across all non-ignored tokens
222
+ token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
223
+
224
+ # Cast back to original dtype
225
+ grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
226
+ grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
227
+
228
+ return loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias
155
229
 
156
230
 
157
231
  def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
@@ -217,6 +291,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
217
291
  reduction="mean",
218
292
  softcap=None,
219
293
  return_z_loss: bool = False,
294
+ accum_dtype=None,
295
+ use_token_scaling: bool = False,
296
+ return_token_accuracy: bool = False,
220
297
  ):
221
298
  """
222
299
  Fusing the last linear layer with cross-entropy loss
@@ -235,9 +312,15 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
235
312
  ignore_index: the index to ignore in the target
236
313
  label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
237
314
  reduction: reduction to apply
315
+ accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
316
+ Recommended to set `accum_dtype` to higher precision, e.g. `torch.float32`, if the training is unstable with original dtype. Default: `None`, performing accumulations in original dtype
317
+ use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
318
+ When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
319
+ Default: False.
320
+ return_token_accuracy (bool): When `return_token_accuracy` is `True`, computes and returns per-token accuracy without materializing logits. Default: `False`
238
321
  """
239
322
 
240
- loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
323
+ loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
241
324
  _input=_input,
242
325
  weight=weight,
243
326
  target=target,
@@ -249,6 +332,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
249
332
  reduction=reduction,
250
333
  softcap=softcap,
251
334
  return_z_loss=return_z_loss,
335
+ accum_dtype=accum_dtype,
336
+ use_token_scaling=use_token_scaling,
337
+ return_token_accuracy=return_token_accuracy,
252
338
  )
253
339
  # downcast to dtype and store for backward
254
340
  ctx.save_for_backward(
@@ -257,13 +343,16 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
257
343
  grad_bias.detach() if bias is not None else None,
258
344
  )
259
345
  ctx.return_z_loss = return_z_loss
260
- return loss, z_loss
346
+ ctx.return_token_accuracy = return_token_accuracy
347
+ return loss, z_loss, token_accuracy
261
348
 
262
349
  @staticmethod
263
350
  @amp_custom_bwd
264
- def backward(ctx, grad_output, grad_output2):
351
+ def backward(ctx, grad_output, grad_output2, grad_output3):
265
352
  if ctx.return_z_loss:
266
353
  del grad_output2 # z_loss is only for logging
354
+ if ctx.return_token_accuracy:
355
+ del grad_output3 # token_accuracy is only for metrics
267
356
  (grad_input, grad_weight, grad_bias) = ctx.saved_tensors
268
357
  grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
269
358
  grad_output, grad_input, grad_weight, grad_bias
@@ -280,4 +369,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
280
369
  None,
281
370
  None,
282
371
  None,
372
+ None,
373
+ None, # use_token_scaling
374
+ None, # return_token_accuracy
283
375
  )
liger_kernel/ops/geglu.py CHANGED
@@ -40,7 +40,7 @@ def _geglu_tanh_forward_kernel(a, b, c, stride, n_cols: tl.constexpr, BLOCK_SIZE
40
40
  tanh_arg = sqrt_2_over_pi * (a_row + 0.044715 * a_cubed)
41
41
  tanh_result = tanh(tanh_arg)
42
42
  geglu_a = 0.5 * a_row * (1 + tanh_result)
43
- c_row = geglu_a * b_row
43
+ c_row = geglu_a.cast(b_row.dtype) * b_row
44
44
  tl.store(c + col_offsets, c_row, mask=mask)
45
45
 
46
46
 
@@ -128,7 +128,9 @@ def _grpo_loss_fwd_kernel(
128
128
  per_token_loss1 = coef_1 * advantage
129
129
  per_token_loss2 = coef_2 * advantage
130
130
  per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
131
- is_clipped = per_token_loss1 < per_token_loss2
131
+ is_low_clipped = (coef_1 < 1 - EPS_LOW) & (advantage < 0)
132
+ is_high_clipped = (coef_1 > 1 + EPS_HIGH) & (advantage > 0)
133
+ is_clipped = is_low_clipped | is_high_clipped
132
134
 
133
135
  if BETA != 0.0:
134
136
  REF_LOGP += off_b * L + off_l