liger-kernel-nightly 0.6.0.dev20250718050347__py3-none-any.whl → 0.6.0.dev20250718080702__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,412 @@
1
+ import math
2
+ import operator
3
+
4
+ import torch
5
+ import triton
6
+ import triton.language as tl
7
+
8
+ from liger_kernel.ops.utils import calculate_settings
9
+ from liger_kernel.ops.utils import compare_version
10
+ from liger_kernel.ops.utils import ensure_contiguous
11
+ from liger_kernel.ops.utils import torch_to_triton_dtype
12
+
13
+ if compare_version("triton", operator.ge, "3.0.0"):
14
+ try:
15
+ # typical import path with dispatch available
16
+ from triton.language.extra.libdevice import rsqrt
17
+ except ModuleNotFoundError:
18
+ # for working with NGC containers
19
+ from triton.language.extra.cuda.libdevice import rsqrt
20
+ else:
21
+ from triton.language.math import rsqrt
22
+
23
+
24
# Integer tags selecting where the kernels below cast to fp32. They are wrapped
# in tl.constexpr so the `casting_mode == ...` comparisons inside the
# @triton.jit kernels can be resolved at compile time.
_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)  # compute in the input dtype
_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)  # only the rstd computation is in fp32
_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)  # everything in fp32, output cast back
27
+
28
+
29
@triton.jit
def _fused_add_rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    S_ptr,  # output residual
    S_row_stride,
    X_ptr,
    X_row_stride,
    R_ptr,  # input residual
    R_row_stride,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    offset,
    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
    BLOCK_SIZE: tl.constexpr,
):
    """
    This kernel computes the following:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    This is a commonly used pattern in the decoder layers of LLMs.
    Some examples:
    1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271
    2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393

    This kernel is inspired by the rms_norm forward kernel, and is adapted to support the residual addition in the forward pass.
    The backward pass is also adapted to support the residual addition in the backward pass.

    Each program handles the single row selected by `tl.program_id(0)`.

    Args:
        Y_ptr / Y_row_stride: output of step 3 (normalized row) and its row stride.
        S_ptr / S_row_stride: output of step 1/2 (X + R) and its row stride.
        X_ptr / X_row_stride: input hidden_states and its row stride.
        R_ptr / R_row_stride: input residual and its row stride.
        W_ptr: weight vector; indexed by column only.
        W_row_stride: accepted for signature symmetry; not read in this kernel.
        RSTD_ptr / RSTD_row_stride: per-row cache of rsqrt(mean(S^2) + eps).
        n_cols: number of valid columns; lanes >= n_cols are masked out.
        eps: value added to the mean square before rsqrt.
        offset: constant added to W before the final multiply.
        casting_mode: one of the _CASTING_MODE_* tags.
        BLOCK_SIZE: number of lanes per row; the mask assumes BLOCK_SIZE >= n_cols.
    """

    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Advance every row-indexed pointer to this program's row.
    Y_ptr += row_idx * Y_row_stride
    S_ptr += row_idx * S_row_stride
    X_ptr += row_idx * X_row_stride
    R_ptr += row_idx * R_row_stride
    RSTD_ptr += row_idx * RSTD_row_stride

    # Masked lanes load 0, so they contribute nothing to the sum of squares below.
    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
    R_row = tl.load(R_ptr + col_offsets, mask=mask, other=0)
    S_row = X_row + R_row
    # The updated residual is written out before any casting is applied to S_row.
    tl.store(S_ptr + col_offsets, S_row, mask=mask)
    S_row_dtype = S_row.dtype
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)

    # On Llama, only rstd is computed on fp32
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(tl.float32)

    # Gemma computes everything on fp32, and then casts back the output to the original dtype
    if casting_mode == _CASTING_MODE_GEMMA:
        W_row = W_row.to(tl.float32)
        S_row = S_row.to(tl.float32)

    # With no casting, bring the scalars down to the row dtype instead of
    # bringing the row up.
    if casting_mode == _CASTING_MODE_NONE:
        eps = eps.to(S_row_dtype)
        offset = offset.to(S_row_dtype)

    mean_square = tl.sum(S_row * S_row, axis=0) / n_cols
    rstd = rsqrt(mean_square + eps)

    # We can save time by caching rms with minimal memory overhead
    # because rms is much smaller compared to X_row, as rms is for each row.
    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
    tl.store(RSTD_ptr, rstd)

    S_row = S_row * rstd

    # On Llama, the multiplication with the weight is done on the original dtype
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(S_row_dtype)

    Y_row = S_row * (offset + W_row)

    if casting_mode == _CASTING_MODE_GEMMA:
        Y_row = Y_row.to(S_row_dtype)

    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
114
+
115
+
116
@triton.jit
def _fused_add_rms_norm_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dS_out_ptr,
    dS_out_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    offset,
    rows_per_program: tl.constexpr,
    casting_mode: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    has_dS_out: tl.constexpr,
):
    """
    This kernel is adapted from the rms_norm backward kernel to support the residual
    addition in the backward pass. For the following code pattern:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    The gradients of hidden_states and residual come out to be exactly the same. The value of this
    gradient is the sum of the gradient of the hidden_states in step 3 and the gradient of the
    residual in step 2.

    The backward pass computation logic is the same as in the rms_norm backward kernel, except that
    the gradient of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.

    Each program walks rows [row_start, row_end) and accumulates a partial dW in fp32,
    which it writes to row `program_id(0)` of the dW buffer.

    Args:
        dY_ptr / dY_row_stride: gradient w.r.t. the normalized output.
        dS_out_ptr / dS_out_row_stride: gradient w.r.t. the residual output; only
            read when `has_dS_out` is true.
        dX_ptr / dX_row_stride: destination for the input gradient.
        X_ptr / X_row_stride: the tensor that was normalized in the forward pass
            (the launcher in this file passes the summed residual S).
        X_dtype: element type that dX is cast to before being stored.
        W_ptr: weight vector; W_row_stride is not read in this kernel.
        RSTD_ptr / RSTD_row_stride: per-row rstd cached by the forward kernel.
        dW_ptr / dW_row_stride: per-program partial weight gradients.
        n_rows, n_cols: logical extent of the 2D inputs.
        offset: constant added to W.
        rows_per_program: number of consecutive rows assigned to each program.
        casting_mode: one of the _CASTING_MODE_* tags.
        BLOCK_SIZE: number of lanes per row.
        has_dS_out: whether dS_out_ptr is valid and must be added to dX.
    """

    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    dY_ptr += row_start * dY_row_stride
    dX_ptr += row_start * dX_row_stride
    if has_dS_out:
        dS_out_ptr += row_start * dS_out_row_stride

    X_ptr += row_start * X_row_stride
    # Scale by the stride so the starting position agrees with the per-row
    # `RSTD_ptr += RSTD_row_stride` advance at the bottom of the loop.
    RSTD_ptr += row_start * RSTD_row_stride

    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
    W_row = W_row + offset

    for _ in range(row_start, row_end):
        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)

        # Get cached rms
        rstd_row = tl.load(RSTD_ptr)

        X_row = X_row.to(tl.float32)

        # Different backward graphs for different casting modes
        if casting_mode == _CASTING_MODE_LLAMA:
            m = (dY_row * W_row).to(tl.float32)

        elif casting_mode == _CASTING_MODE_GEMMA:
            dY_row = dY_row.to(tl.float32)
            m = dY_row * W_row
        else:
            m = dY_row * W_row

        dX_row = rstd_row * m

        if has_dS_out:
            # The residual output's gradient flows straight through the addition.
            dS_out_row = tl.load(dS_out_ptr + col_offsets, mask=mask, other=0.0)
            dX_row += (rstd_row) * (
                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
            ) + dS_out_row
            dS_out_ptr += dS_out_row_stride
        else:
            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

        # calculate the gradient of W
        if casting_mode == _CASTING_MODE_LLAMA:
            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
        else:
            # here X_row is already in fp32 (see previous if block)
            dW_row += dY_row * (X_row * rstd_row)

        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)

        dY_ptr += dY_row_stride
        dX_ptr += dX_row_stride
        X_ptr += X_row_stride
        RSTD_ptr += RSTD_row_stride

    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
219
+
220
+
221
# Maps the user-facing casting-mode names to the plain integer values of the
# tl.constexpr tags above; `.value` unwraps the constexpr so the launcher can
# compare against and pass ordinary ints.
_str_to_casting_mode = {
    "llama": _CASTING_MODE_LLAMA.value,
    "gemma": _CASTING_MODE_GEMMA.value,
    "none": _CASTING_MODE_NONE.value,
}
226
+
227
+
228
def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode):
    """Launch the fused residual-add + RMSNorm forward kernel.

    Args:
        X: hidden states; flattened to 2D over its last dimension.
        R: residual; must have the same last dimension and element count as X.
        W: 1D weight whose length equals X's last dimension.
        eps: value added to the mean square before the reciprocal square root.
        offset: constant added to W before scaling.
        casting_mode: "llama", "gemma", "none", or the matching integer tag.

    Returns:
        (Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode) where Y is the
        normalized output and S is X + R, both reshaped like X; RSTD is the
        per-row cache written by the kernel; casting_mode is the integer tag.

    Raises:
        AssertionError: on an unknown casting mode or incompatible shapes.
    """
    if not isinstance(casting_mode, int):
        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
        casting_mode = _str_to_casting_mode[casting_mode]
    else:
        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"

    shape = X.shape
    dim = shape[-1]

    # Check constraints before flattening or allocating anything. The kernel
    # indexes R with X's row count and column mask, so a mismatched R would be
    # read out of bounds (or silently misaligned after the view below).
    assert R.shape[-1] == dim and R.numel() == X.numel(), (
        f"Residual shape {tuple(R.shape)} is incompatible with hidden_states shape {tuple(shape)}"
    )
    assert dim == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"

    X = X.view(-1, dim)
    R = R.view(-1, dim)
    n_rows, n_cols = X.shape
    BLOCK_SIZE, num_warps = calculate_settings(n_cols)

    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    # RSTD is to cache rstd for each row
    # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)

    # XPU-specific optimization
    kernel_args = {}
    if X.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_forward_kernel
    # One program per row.
    _fused_add_rms_norm_forward_kernel[(n_rows,)](
        Y,
        Y.stride(0),
        S,
        S.stride(0),
        X,
        X.stride(0),
        R,
        R.stride(0),
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        eps,
        offset,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        **kernel_args,  # XPU-specific optimization
    )

    return Y.view(*shape), S.view(*shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
281
+
282
+
283
def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
    """Launch the fused residual-add + RMSNorm backward kernel.

    Args:
        dY: gradient w.r.t. the normalized output.
        dS_out: gradient w.r.t. the residual output, or None when there is none.
        S: the summed residual saved by the forward pass.
        W: weight vector.
        RSTD: per-row rstd cached by the forward pass.
        offset: constant added to W.
        casting_mode: integer casting-mode tag returned by the forward launcher.
        BLOCK_SIZE, num_warps: launch settings chosen in the forward pass.
        in_place: when True, dX is written into dY's storage.

    Returns:
        (dX, dR, dW). dX and dR are the same tensor because the gradient of the
        addition is shared by both inputs.

    Raises:
        RuntimeError: if the feature dimension exceeds BLOCK_SIZE.
    """
    shape = dY.shape
    dim = shape[-1]
    dY = dY.view(-1, dim)
    S = S.view(-1, dim)
    n_rows, n_cols = dY.shape

    # Resolve the optional residual gradient *before* touching it: previously
    # `.view()` / `.stride()` were called unconditionally, so a None dS_out
    # crashed long before the `has_dS_out` flag could take effect.
    has_dS_out = dS_out is not None
    if has_dS_out:
        dS_out = dS_out.view(-1, dim)
    else:
        # Placeholder pointer only; the kernel never dereferences it when
        # has_dS_out is False.
        dS_out = dY

    # Validate before allocating the dW scratch buffer.
    if n_cols > BLOCK_SIZE:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")

    sm_count = 1
    if S.device.type == "cuda":
        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
    elif S.device.type == "xpu":
        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count

    # fp32 for numerical stability especially.
    # One partial-dW row per program; every program stores its row, so
    # torch.empty is fully overwritten before the reduction below.
    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

    rows_per_program = math.ceil(n_rows / sm_count)
    grid = (sm_count,)

    if in_place is True:
        dX = dY
    else:
        dX = torch.empty_like(dY)

    # XPU-specific optimization
    kernel_args = {}
    if S.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_backward_kernel
    _fused_add_rms_norm_backward_kernel[grid](
        dY,
        dY.stride(0),
        dS_out,
        dS_out.stride(0),
        dX,
        dX.stride(0),
        S,
        S.stride(0),
        torch_to_triton_dtype[S.dtype],
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        offset,
        rows_per_program,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        has_dS_out=has_dS_out,
        **kernel_args,  # XPU-specific optimization
    )

    dX = dX.view(*shape)
    dW = _dW.sum(dim=0).to(W.dtype)

    return dX, dX, dW  # dR is equal to dX
347
+
348
+
349
class LigerFusedAddRMSNormFunction(torch.autograd.Function):
    """
    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.

    This class implements the following sequence, commonly used in transformer decoder layers:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states (after addition)
    3. hidden_states = rmsnorm(hidden_states)

    Both the normalized hidden_states and the updated residual are returned as outputs.

    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.

    In addition, different models cast their inputs at different places during RMSNorm computation. For
    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
    support the following casting modes (they match HuggingFace Transformers' implementations):
    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.

    The `in_place` option determines whether to modify dY in-place to store dX. In this function's
    `forward` it defaults to `False`; pass `True` to save memory.
    """

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False):
        """
        X: (B, T, H) or (BxT, H)
        R: residual to add to X
        W: (H,)

        Returns (Y, S): the normalized output and the updated residual X + R.
        """
        # TODO: add row_mode
        Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode)
        # Non-tensor state goes on ctx directly; casting_mode is stored as the
        # integer tag returned by the launcher.
        ctx.offset = offset
        ctx.casting_mode = casting_mode
        ctx.in_place = in_place
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.save_for_backward(S, W, RSTD)
        return Y, S

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY, dS_out):
        """
        dY: (B, T, H) or (BxT, H), gradient w.r.t. the normalized output Y
        dS_out: gradient w.r.t. the residual output S
        """
        S, W, RSTD = ctx.saved_tensors
        dX, dR, dW = fused_add_rms_norm_backward(
            dY,
            dS_out,
            S,
            W,
            RSTD,
            ctx.offset,
            ctx.casting_mode,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
            ctx.in_place,
        )

        # One gradient per forward input: X, R, W, eps, offset, casting_mode, in_place.
        return dX, dR, dW, None, None, None, None
@@ -5,6 +5,7 @@ from typing import TYPE_CHECKING
5
5
  # Always-safe imports (independent of 'transformers')
6
6
  from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss # noqa: F401
7
7
  from liger_kernel.transformers.dyt import LigerDyT # noqa: F401
8
+ from liger_kernel.transformers.fused_add_rms_norm import LigerFusedAddRMSNorm # noqa: F401
8
9
  from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss # noqa: F401
9
10
  from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD # noqa: F401
10
11
  from liger_kernel.transformers.geglu import LigerGEGLUMLP # noqa: F401
@@ -43,6 +44,7 @@ if TYPE_CHECKING:
43
44
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen2_vl # noqa: F401
44
45
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3 # noqa: F401
45
46
  from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_moe # noqa: F401
47
+ from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smollm3 # noqa: F401
46
48
 
47
49
 
48
50
  # Check if 'transformers' is installed
@@ -100,6 +102,7 @@ def __getattr__(name: str):
100
102
  "apply_liger_kernel_to_qwen2_vl",
101
103
  "apply_liger_kernel_to_qwen3",
102
104
  "apply_liger_kernel_to_qwen3_moe",
105
+ "apply_liger_kernel_to_smollm3",
103
106
  }
104
107
 
105
108
  if name in monkey_patch_symbols:
@@ -119,6 +122,7 @@ __all__ = [
119
122
  "LigerGEGLUMLP",
120
123
  "LigerJSD",
121
124
  "LigerLayerNorm",
125
+ "LigerFusedAddRMSNorm",
122
126
  "LigerRMSNorm",
123
127
  "liger_rotary_pos_emb",
124
128
  "LigerBlockSparseTop2MLP",
@@ -155,5 +159,6 @@ if _TRANSFORMERS_AVAILABLE:
155
159
  "apply_liger_kernel_to_qwen2_vl",
156
160
  "apply_liger_kernel_to_qwen3",
157
161
  "apply_liger_kernel_to_qwen3_moe",
162
+ "apply_liger_kernel_to_smollm3",
158
163
  ]
159
164
  )
@@ -2,6 +2,7 @@ from typing import Optional
2
2
 
3
3
  from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
4
4
  from liger_kernel.ops.dyt import LigerDyTFunction
5
+ from liger_kernel.ops.fused_add_rms_norm import LigerFusedAddRMSNormFunction
5
6
  from liger_kernel.ops.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyFunction
6
7
  from liger_kernel.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
7
8
  from liger_kernel.ops.fused_neighborhood_attention import LigerFusedNeighborhoodAttentionFunction
@@ -253,6 +254,10 @@ def liger_rms_norm(X, W, eps, offset: float = 0.0, casting_mode: str = "llama",
253
254
  return LigerRMSNormFunction.apply(X, W, eps, offset, casting_mode, in_place)
254
255
 
255
256
 
257
def liger_fused_add_rms_norm(X, R, W, eps, offset: float = 0.0, casting_mode: str = "llama", in_place: bool = True):
    """Functional entry point for the fused residual-add + RMSNorm op.

    Forwards every argument, in order, to `LigerFusedAddRMSNormFunction.apply`
    and returns whatever it returns.
    """
    fused_args = (X, R, W, eps, offset, casting_mode, in_place)
    return LigerFusedAddRMSNormFunction.apply(*fused_args)
259
+
260
+
256
261
  def liger_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
257
262
  return LigerRopeFunction.apply(q, k, cos, sin, position_ids, unsqueeze_dim)
258
263
 
@@ -0,0 +1,39 @@
1
+ import torch
2
+ import torch.nn as nn
3
+
4
+ from liger_kernel.ops.fused_add_rms_norm import LigerFusedAddRMSNormFunction
5
+
6
+
7
class LigerFusedAddRMSNorm(nn.Module):
    """Module wrapper around `LigerFusedAddRMSNormFunction`.

    Holds a learnable weight of length `hidden_size` plus the epsilon, offset,
    casting mode and in-place flag that are forwarded on every call.
    """

    def __init__(
        self,
        hidden_size,
        eps=1e-6,
        offset=0.0,
        casting_mode="llama",
        init_fn="ones",
        in_place=False,
    ):
        super().__init__()
        assert init_fn in ("ones", "zeros"), f"init_fn must be either 'ones' or 'zeros', got {init_fn}"
        # Pick the tensor factory matching the requested initialisation.
        make_weight = torch.ones if init_fn == "ones" else torch.zeros
        self.weight = nn.Parameter(make_weight(hidden_size))
        self.variance_epsilon = eps
        self.offset = offset
        self.casting_mode = casting_mode
        self.in_place = in_place

    def forward(self, hidden_states, residual):
        """Apply the fused op to `hidden_states` and `residual` and return its result."""
        call_args = (
            hidden_states,
            residual,
            self.weight,
            self.variance_epsilon,
            self.offset,
            self.casting_mode,
            self.in_place,
        )
        return LigerFusedAddRMSNormFunction.apply(*call_args)

    def extra_repr(self):
        """Summarise weight shape, eps, offset and in_place for `repr(module)`."""
        weight_shape = tuple(self.weight.shape)
        return f"{weight_shape}, eps={self.variance_epsilon}, offset={self.offset}, in_place={self.in_place}"
@@ -0,0 +1,189 @@
1
+ from typing import TYPE_CHECKING
2
+ from typing import List
3
+ from typing import Optional
4
+ from typing import Tuple
5
+ from typing import Union
6
+
7
+ import torch
8
+
9
+ from torch.distributed.fsdp import FullyShardedDataParallel
10
+ from transformers.modeling_outputs import CausalLMOutputWithPast
11
+ from transformers.utils.deprecation import deprecate_kwarg
12
+
13
+ from liger_kernel.transformers.fsdp import _FSDPForwardRedirection
14
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
15
+ from liger_kernel.utils import PEFT_AVAILABLE
16
+
17
+ if TYPE_CHECKING:
18
+ from transformers.cache_utils import Cache
19
+
20
+ if PEFT_AVAILABLE:
21
+ from peft.utils.other import ModulesToSaveWrapper
22
+
23
+
24
@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Union["Cache", List[torch.FloatTensor]]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    skip_logits: Optional[bool] = None,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

        skip_logits (`bool`, *optional*):
            If `True`, the loss is computed without materializing logits and the returned logits are `None`;
            this requires `labels` or `shift_labels`. If `None`, it defaults to `True` only when the model is
            in training mode and `labels` or `shift_labels` is provided.

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, SmolLM3ForCausalLM

    >>> model = SmolLM3ForCausalLM.from_pretrained("HuggingFaceTB/SmolLM3-3B")
    >>> tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM3-3B")

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""

    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
    output_hidden_states = (
        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
    )
    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    kept_hidden_states = hidden_states[:, slice_indices, :]

    shift_labels = kwargs.pop("shift_labels", None)
    logits = None
    loss = None

    # An explicit skip_logits=True needs something to compute the loss from.
    if skip_logits and labels is None and shift_labels is None:
        raise ValueError("skip_logits is True, but labels and shift_labels are None")

    if skip_logits is None:
        # By default, if in training mode, don't materialize logits
        skip_logits = self.training and (labels is not None or shift_labels is not None)

    if skip_logits:
        loss = lce_maybe_trainable_lm_head(
            self,
            hidden_states=kept_hidden_states,
            hidden_size=self.config.hidden_size,
            labels=labels,
            shift_labels=shift_labels,
            **kwargs,
        )

    else:
        logits = self.lm_head(kept_hidden_states)
        if labels is not None or shift_labels is not None:
            # `shift_labels` was popped from kwargs above. Hand it back to the
            # loss when the caller supplied it, so it is honoured on this path
            # just as on the skip_logits path; the labels-only call is unchanged.
            if shift_labels is not None:
                kwargs["shift_labels"] = shift_labels
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )
143
+
144
+
145
def lce_maybe_trainable_lm_head(self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
    """Compute the fused linear cross-entropy loss, routing through FSDP when needed.

    Returns the value produced by `_liger_for_causal_lm_loss`, called either
    directly or via `_FSDPForwardRedirection` when the (unwrapped) lm_head is a
    `FullyShardedDataParallel` instance.
    """
    head = self.lm_head

    # When lm_head is listed in `modules_to_save` of a PEFT LoRA config it is wrapped;
    # look through the wrapper to the trainable copy.
    # See https://huggingface.co/docs/peft/package_reference/lora for reference.
    if PEFT_AVAILABLE and isinstance(head, ModulesToSaveWrapper):
        head = head.modules_to_save.default

    if not isinstance(head, FullyShardedDataParallel):
        # No FSDP: the lm_head weights can be read and the kernel called directly.
        # NOTE(review): this path passes `self.lm_head`, not the unwrapped `head` —
        # presumably relying on the wrapper exposing `.weight`; confirm intent.
        return _liger_for_causal_lm_loss(
            lm_head=self.lm_head,
            hidden_states=hidden_states,
            hidden_size=hidden_size,
            labels=labels,
            shift_labels=shift_labels,
            **loss_kwargs,
        )

    # With FSDP and a trainable lm_head (full fine-tuning or LoRA), the weight read and the
    # kernel call must happen inside the FSDP forward pass so the module's full parameters
    # are summoned and stay in memory while the kernel runs.
    redirect = _FSDPForwardRedirection()
    return redirect(
        head,
        _liger_for_causal_lm_loss,
        head.module,
        hidden_states,
        hidden_size,
        labels,
        shift_labels,
        **loss_kwargs,
    )
179
+
180
+
181
def _liger_for_causal_lm_loss(lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
    """Call `LigerForCausalLMLoss` using the weight read from `lm_head`."""
    head_weight = lm_head.weight
    return LigerForCausalLMLoss(
        hidden_states=hidden_states,
        lm_head_weight=head_weight,
        labels=labels,
        hidden_size=hidden_size,
        shift_labels=shift_labels,
        **loss_kwargs,
    )
@@ -29,6 +29,7 @@ from liger_kernel.transformers.model.phi3 import lce_forward as phi3_lce_forward
29
29
  from liger_kernel.transformers.model.phi3 import lce_forward_deprecated as phi3_lce_forward_deprecated
30
30
  from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forward
31
31
  from liger_kernel.transformers.model.qwen2 import lce_forward_deprecated as qwen2_lce_forward_deprecated
32
+ from liger_kernel.transformers.model.smollm3 import lce_forward as smollm3_lce_forward
32
33
  from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb
33
34
  from liger_kernel.transformers.rms_norm import LigerRMSNorm
34
35
  from liger_kernel.transformers.rope import liger_rotary_pos_emb
@@ -290,6 +291,77 @@ def apply_liger_kernel_to_llama(
290
291
  _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
291
292
 
292
293
 
294
def apply_liger_kernel_to_smollm3(
    rope: bool = True,
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = True,
    rms_norm: bool = True,
    swiglu: bool = True,
    model: PreTrainedModel = None,
) -> None:
    """
    Swap HuggingFace SmolLM3 building blocks for their Liger kernel counterparts.

    Args:
        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is True.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
        swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
            loaded. Default is None.
    """

    assert not cross_entropy or not fused_linear_cross_entropy, (
        "cross_entropy and fused_linear_cross_entropy cannot both be True."
    )

    from transformers.models.smollm3 import modeling_smollm3
    from transformers.models.smollm3.modeling_smollm3 import SmolLM3Model

    has_instance = model is not None

    # --- module-level patches: affect models constructed after this call ---
    if rope:
        modeling_smollm3.apply_rotary_pos_emb = liger_rotary_pos_emb
    if rms_norm:
        modeling_smollm3.SmolLM3RMSNorm = LigerRMSNorm
    if swiglu:
        modeling_smollm3.SmolLM3MLP = LigerSwiGLUMLP

    if cross_entropy:
        if transformer_version < version.parse(SUPPORTED_TRANSFORMER_VERSION):
            logger.warning(TRANSFORMER_DEPRECATION_WARNING)
            modeling_smollm3.CrossEntropyLoss = LigerCrossEntropyLoss
        else:
            from transformers.loss.loss_utils import nn

            nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        if has_instance:
            model.forward = MethodType(smollm3_lce_forward, model)
        else:
            modeling_smollm3.SmolLM3ForCausalLM.forward = smollm3_lce_forward

    if not has_instance:
        return

    # --- instance-level patches ---
    # The model already exists, so modules that were instantiated earlier
    # (e.g. SmolLM3RMSNorm or SmolLM3MLP) have to be patched on the instance too.

    # Resolve the base model; fall back to `model` itself when the prefix attribute is absent.
    base_model: SmolLM3Model = getattr(model, model.base_model_prefix, model)

    if rms_norm:
        _patch_rms_norm_module(base_model.norm)

    for layer in base_model.layers:
        if swiglu:
            _patch_swiglu_module(layer.mlp, LigerSwiGLUMLP)
        if rms_norm:
            _patch_rms_norm_module(layer.input_layernorm)
            _patch_rms_norm_module(layer.post_attention_layernorm)
+
364
+
293
365
  def apply_liger_kernel_to_llava(
294
366
  cross_entropy: bool = False,
295
367
  fused_linear_cross_entropy: bool = True,
@@ -1801,6 +1873,7 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
1801
1873
  "qwen2_vl_text": apply_liger_kernel_to_qwen2_vl,
1802
1874
  "qwen2_5_vl": apply_liger_kernel_to_qwen2_5_vl,
1803
1875
  "qwen2_5_vl_text": apply_liger_kernel_to_qwen2_5_vl,
1876
+ "smollm3": apply_liger_kernel_to_smollm3,
1804
1877
  "phi3": apply_liger_kernel_to_phi3,
1805
1878
  "paligemma": apply_liger_kernel_to_paligemma,
1806
1879
  }
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.6.0.dev20250718050347
3
+ Version: 0.6.0.dev20250718080702
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -19,6 +19,7 @@ liger_kernel/chunked_loss/simpo_loss.py,sha256=fy2w8KbhMrBv7b1jdIeH3bBFxY52bPQPZ
19
19
  liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
20
  liger_kernel/ops/cross_entropy.py,sha256=e8THGnhOcy_0SbOLABx67HEM7-B8a8pG7nDKbCRpQKM,19123
21
21
  liger_kernel/ops/dyt.py,sha256=gCLz4S8aul8SY9nvIGaoK67aGb7U9MJRQdo3ONqmQYs,5417
22
+ liger_kernel/ops/fused_add_rms_norm.py,sha256=UBqmlqFCmhSAIpkNKd8rrfXatX7Z4J9bp2dX9A0lrJQ,14017
22
23
  liger_kernel/ops/fused_linear_cross_entropy.py,sha256=5fbGhN85n3zf0uIdJ7PYHWIRzTf0VTFiS0ARtOmqIP0,11020
23
24
  liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
24
25
  liger_kernel/ops/fused_neighborhood_attention.py,sha256=vPi5xbnh6wxyZehaqo6Tuilqo2fN5SGDiONjnNmIKqs,35556
@@ -39,12 +40,13 @@ liger_kernel/ops/tvd.py,sha256=FHJtLQI95ijqgg9UtaHpMAjSCiPxB6CduPwPMcGxelc,6405
39
40
  liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
40
41
  liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
41
42
  liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
42
- liger_kernel/transformers/__init__.py,sha256=mWMEhOabqUkPimMOmkg9DawnO-vL9u_u-N4iIqfNZeg,7259
43
+ liger_kernel/transformers/__init__.py,sha256=VoHQp5emsAJAouql37RuvtGFeZCoMIHgoIxfsyYMTc8,7564
43
44
  liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
44
45
  liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
45
46
  liger_kernel/transformers/dyt.py,sha256=i-4GPaMrl-jab9TVI5qN0-H9qycn_mCbV82ozU4nbmU,723
46
47
  liger_kernel/transformers/fsdp.py,sha256=CUiyjTmjkjY7pLXQv8ly9rnzgXw6529csd9pvtJNMYc,3096
47
- liger_kernel/transformers/functional.py,sha256=7Emw7D6VPMg8hfasC33NiolvKmQVF1gV6VayKQCEWJM,7446
48
+ liger_kernel/transformers/functional.py,sha256=PXnACWD7kzgge50RdOUuvtmOTS7DVkkrL7mm0cX5bOc,7734
49
+ liger_kernel/transformers/fused_add_rms_norm.py,sha256=7_Bzg-x6lLe6W1qG2DtjDALhEpNZlC6N5GppEs9cTYY,1199
48
50
  liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=O8Sg5BT81nTaY9fSGoOY9dOD9ekibwwiuXhdUHaxntQ,1742
49
51
  liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
50
52
  liger_kernel/transformers/fused_neighborhood_attention.py,sha256=TxYDUAt9B6WSP14aJP66C_2Mbds2sSIPGnamhUSTrC8,7957
@@ -54,7 +56,7 @@ liger_kernel/transformers/grpo_loss.py,sha256=uAkUNKSnUGEOqa82L9w2e6AI1kcmG8K45-
54
56
  liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
55
57
  liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
56
58
  liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
57
- liger_kernel/transformers/monkey_patch.py,sha256=W7KgJN-rrLZS3pRZ5debO_dSN7zddPegKjqOIP39wR0,85856
59
+ liger_kernel/transformers/monkey_patch.py,sha256=VsN839y5QVEC6BD_-hCiShWLerQM2QDLDoKf2rq02I4,88990
58
60
  liger_kernel/transformers/multi_token_attention.py,sha256=l9VDICK0dfmifUDW668hGscP8AHq2rYcM2oGUa3baRQ,1751
59
61
  liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
60
62
  liger_kernel/transformers/rms_norm.py,sha256=vkekcvTeWY8vL4H6hg3t0XeY0Ew_3OFMPHuzqlxPPVw,2719
@@ -85,13 +87,14 @@ liger_kernel/transformers/model/qwen2_5_vl.py,sha256=zEVVwotCXnAm3RRc8-1Nc8uitSW
85
87
  liger_kernel/transformers/model/qwen2_vl.py,sha256=5vK-vtCDpKZ2w33xYp2BS8kQYWUbKMqaiKvQcI27Mss,5884
86
88
  liger_kernel/transformers/model/qwen3.py,sha256=w2jBHuK9kK9EmOr5dnEIXNQXUgUSV_sJUkXSEwxLPHs,4885
87
89
  liger_kernel/transformers/model/qwen3_moe.py,sha256=BkpfFH3fOH0yRfA7LF-AoHTLut2GV0Y4MOlkiIYewfU,5511
90
+ liger_kernel/transformers/model/smollm3.py,sha256=mqayvpwpMbp2yd_Ue7IPzy-dA4KHSDi_ROZW5vHCHfQ,7596
88
91
  liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7HHWHwku25A-GYL0WU,193
89
92
  liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
90
93
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
91
94
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
92
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
93
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/METADATA,sha256=piNBLykxYNNlxW-gKCkLrzHXuBvhbYMV03RZgFImk2Q,24672
94
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
95
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
96
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
97
- liger_kernel_nightly-0.6.0.dev20250718050347.dist-info/RECORD,,
95
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
96
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/METADATA,sha256=mNkIMGPTMPdmmjDsW54kDe0WhPi8Ep0Cpt4koWQQuaE,24672
97
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
98
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
99
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
100
+ liger_kernel_nightly-0.6.0.dev20250718080702.dist-info/RECORD,,