liger-kernel-nightly 0.4.0.dev20241107052928__py3-none-any.whl → 0.6.3.dev20251121010306__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic. Click here for more details.

Files changed (114) hide show
  1. liger_kernel/__init__.py +0 -0
  2. liger_kernel/chunked_loss/README.md +25 -0
  3. liger_kernel/chunked_loss/__init__.py +8 -0
  4. liger_kernel/chunked_loss/cosine_similarity_loss.py +136 -0
  5. liger_kernel/chunked_loss/cpo_loss.py +157 -0
  6. liger_kernel/chunked_loss/dpo_loss.py +229 -0
  7. liger_kernel/chunked_loss/functional.py +17 -0
  8. liger_kernel/chunked_loss/fused_linear_distillation.py +292 -0
  9. liger_kernel/chunked_loss/fused_linear_ppo.py +350 -0
  10. liger_kernel/chunked_loss/fused_linear_preference.py +433 -0
  11. liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +341 -0
  12. liger_kernel/chunked_loss/grpo_loss.py +304 -0
  13. liger_kernel/chunked_loss/jsd_loss.py +200 -0
  14. liger_kernel/chunked_loss/kto_loss.py +210 -0
  15. liger_kernel/chunked_loss/orpo_loss.py +144 -0
  16. liger_kernel/chunked_loss/simpo_loss.py +165 -0
  17. liger_kernel/env_report.py +21 -4
  18. liger_kernel/ops/cross_entropy.py +235 -84
  19. liger_kernel/ops/dyt.py +157 -0
  20. liger_kernel/ops/experimental/embedding.py +1 -3
  21. liger_kernel/ops/experimental/mm_int8int2.py +3 -9
  22. liger_kernel/ops/fused_add_rms_norm.py +412 -0
  23. liger_kernel/ops/fused_linear_cross_entropy.py +197 -75
  24. liger_kernel/ops/fused_linear_jsd.py +17 -34
  25. liger_kernel/ops/fused_neighborhood_attention.py +1022 -0
  26. liger_kernel/ops/geglu.py +7 -18
  27. liger_kernel/ops/group_norm.py +305 -0
  28. liger_kernel/ops/grpo_loss.py +310 -0
  29. liger_kernel/ops/jsd.py +46 -21
  30. liger_kernel/ops/kl_div.py +23 -19
  31. liger_kernel/ops/layer_norm.py +150 -86
  32. liger_kernel/ops/llama4_rope.py +225 -0
  33. liger_kernel/ops/multi_token_attention.py +207 -0
  34. liger_kernel/ops/poly_norm.py +386 -0
  35. liger_kernel/ops/qwen2vl_mrope.py +222 -0
  36. liger_kernel/ops/rms_norm.py +314 -84
  37. liger_kernel/ops/rope.py +32 -34
  38. liger_kernel/ops/softmax.py +201 -0
  39. liger_kernel/ops/sparsemax.py +179 -0
  40. liger_kernel/ops/swiglu.py +5 -9
  41. liger_kernel/ops/tiled_mlp.py +136 -0
  42. liger_kernel/ops/tvd.py +207 -0
  43. liger_kernel/ops/utils.py +8 -4
  44. liger_kernel/transformers/__init__.py +199 -24
  45. liger_kernel/transformers/auto_model.py +6 -13
  46. liger_kernel/transformers/cross_entropy.py +33 -20
  47. liger_kernel/transformers/dyt.py +22 -0
  48. liger_kernel/transformers/experimental/__init__.py +5 -0
  49. liger_kernel/transformers/experimental/embedding.py +1 -3
  50. liger_kernel/transformers/fsdp.py +55 -0
  51. liger_kernel/transformers/functional.py +291 -13
  52. liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  53. liger_kernel/transformers/fused_linear_cross_entropy.py +43 -14
  54. liger_kernel/transformers/fused_linear_jsd.py +1 -4
  55. liger_kernel/transformers/fused_neighborhood_attention.py +234 -0
  56. liger_kernel/transformers/geglu.py +1 -4
  57. liger_kernel/transformers/group_norm.py +50 -0
  58. liger_kernel/transformers/grpo_loss.py +98 -0
  59. liger_kernel/transformers/jsd.py +2 -7
  60. liger_kernel/transformers/kl_div.py +1 -3
  61. liger_kernel/transformers/layer_norm.py +3 -9
  62. liger_kernel/transformers/llama4_rope.py +93 -0
  63. liger_kernel/transformers/model/falcon_h1.py +122 -0
  64. liger_kernel/transformers/model/gemma.py +77 -77
  65. liger_kernel/transformers/model/gemma2.py +283 -0
  66. liger_kernel/transformers/model/gemma3.py +331 -0
  67. liger_kernel/transformers/model/glm4.py +141 -0
  68. liger_kernel/transformers/model/glm4v.py +163 -0
  69. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  70. liger_kernel/transformers/model/internvl.py +157 -0
  71. liger_kernel/transformers/model/llama.py +128 -79
  72. liger_kernel/transformers/model/llama4.py +121 -0
  73. liger_kernel/transformers/model/llava.py +344 -0
  74. liger_kernel/transformers/model/loss_utils.py +95 -0
  75. liger_kernel/transformers/model/mistral.py +68 -64
  76. liger_kernel/transformers/model/mixtral.py +75 -91
  77. liger_kernel/transformers/model/mllama.py +63 -68
  78. liger_kernel/transformers/model/olmo2.py +141 -0
  79. liger_kernel/transformers/model/output_classes.py +147 -0
  80. liger_kernel/transformers/model/paligemma.py +432 -0
  81. liger_kernel/transformers/model/phi3.py +59 -213
  82. liger_kernel/transformers/model/qwen2.py +75 -72
  83. liger_kernel/transformers/model/qwen2_5_vl.py +163 -0
  84. liger_kernel/transformers/model/qwen2_vl.py +78 -98
  85. liger_kernel/transformers/model/qwen3.py +136 -0
  86. liger_kernel/transformers/model/qwen3_moe.py +152 -0
  87. liger_kernel/transformers/model/qwen3_next.py +146 -0
  88. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  89. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  90. liger_kernel/transformers/model/smollm3.py +199 -0
  91. liger_kernel/transformers/model/smolvlm.py +158 -0
  92. liger_kernel/transformers/monkey_patch.py +2106 -289
  93. liger_kernel/transformers/multi_token_attention.py +64 -0
  94. liger_kernel/transformers/poly_norm.py +42 -0
  95. liger_kernel/transformers/qwen2vl_mrope.py +20 -0
  96. liger_kernel/transformers/rms_norm.py +57 -6
  97. liger_kernel/transformers/rope.py +45 -2
  98. liger_kernel/transformers/softmax.py +12 -0
  99. liger_kernel/transformers/sparsemax.py +16 -0
  100. liger_kernel/transformers/swiglu.py +23 -8
  101. liger_kernel/transformers/tiled_mlp.py +133 -0
  102. liger_kernel/transformers/trainer/__init__.py +4 -0
  103. liger_kernel/transformers/trainer/orpo_trainer.py +130 -0
  104. liger_kernel/transformers/tvd.py +13 -0
  105. liger_kernel/triton/__init__.py +1 -3
  106. liger_kernel/triton/monkey_patch.py +1 -3
  107. liger_kernel/utils.py +71 -0
  108. {liger_kernel_nightly-0.4.0.dev20241107052928.dist-info → liger_kernel_nightly-0.6.3.dev20251121010306.dist-info}/METADATA +150 -137
  109. liger_kernel_nightly-0.6.3.dev20251121010306.dist-info/RECORD +116 -0
  110. {liger_kernel_nightly-0.4.0.dev20241107052928.dist-info → liger_kernel_nightly-0.6.3.dev20251121010306.dist-info}/WHEEL +1 -1
  111. liger_kernel_nightly-0.4.0.dev20241107052928.dist-info/RECORD +0 -48
  112. {liger_kernel_nightly-0.4.0.dev20241107052928.dist-info → liger_kernel_nightly-0.6.3.dev20251121010306.dist-info}/LICENSE +0 -0
  113. {liger_kernel_nightly-0.4.0.dev20241107052928.dist-info → liger_kernel_nightly-0.6.3.dev20251121010306.dist-info}/NOTICE +0 -0
  114. {liger_kernel_nightly-0.4.0.dev20241107052928.dist-info → liger_kernel_nightly-0.6.3.dev20251121010306.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,157 @@
1
+ import operator
2
+
3
+ import torch
4
+ import triton
5
+ import triton.language as tl
6
+
7
+ from liger_kernel.ops.utils import compare_version
8
+ from liger_kernel.ops.utils import ensure_contiguous
9
+ from liger_kernel.ops.utils import infer_device
10
+
11
+ if compare_version("triton", operator.ge, "3.0.0"):
12
+ try:
13
+ # typical import path with dispatch available
14
+ from triton.language.extra.libdevice import tanh
15
+ except ModuleNotFoundError:
16
+ # for working with NGC containers
17
+ from triton.language.extra.cuda.libdevice import tanh
18
+ else:
19
+ from triton.language.math import tanh
20
+
21
+
22
+ # @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
23
+ # for bn in [1024, 2048, 4096]
24
+ # for ns in [1,2,4]
25
+ # for nw in [4, 8, 16, 32]
26
+ # ],
27
+ # key=['N'])
28
@triton.jit
def _dyt_fwd_kernel(X, Y, Alpha, Gamma, Beta, HAVE_BETA: tl.constexpr, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024):
    """
    Forward pass: y = tanh(alpha * x) * gamma (+ beta when HAVE_BETA).

    Grid axis 0 selects a BLOCK_N-wide tile of columns, grid axis 1 selects the row.
    Consecutive rows of X and Y are addressed N elements apart. All arithmetic is
    carried out in fp32.
    """
    # Column indices covered by this program (int64 to avoid overflow in pointer math).
    offs = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
    in_bounds = offs < N
    row_base = tl.cast(tl.program_id(1), tl.int64) * N

    # Alpha is read as a single scalar; Gamma/Beta are indexed by column.
    scale = tl.load(Alpha).to(tl.float32)
    weight = tl.load(Gamma + offs, mask=in_bounds, other=0.0).to(tl.float32)
    inp = tl.load(X + row_base + offs, mask=in_bounds, other=0.0).to(tl.float32)

    out = tanh(scale * inp) * weight
    if HAVE_BETA:
        out += tl.load(Beta + offs, mask=in_bounds, other=0.0).to(tl.float32)
    tl.store(Y + row_base + offs, out, mask=in_bounds)
48
+
49
+
50
+ # @triton.autotune([triton.Config({"BLOCK_N":bn}, num_stages=ns, num_warps=nw)
51
+ # for bn in [1024, 2048, 4096]
52
+ # for ns in [1,2,4]
53
+ # for nw in [4, 8, 16]
54
+ # ],
55
+ # key=['N'])
56
@triton.jit
def _dyt_bwd_kernel(
    DY, DX, DA, DG, DB, X, Alpha, Gamma, HAVE_BETA: tl.constexpr, M, N: tl.constexpr, BLOCK_N: tl.constexpr = 1024
):
    """
    Backward pass of y = tanh(alpha * x) * gamma (+ beta).

    Grid axis 0 selects a BLOCK_N-wide column tile. Each program along grid axis 1
    starts at row `program_id(1)` and steps through the M rows with a stride of
    `num_programs(1)`, writing dx per row and accumulating partial sums for
    dgamma, dbeta and dalpha. The partial sums are written to this program's own
    slot of DG / DB / DA; the final reduction over those slots is left to the caller.
    """
    offs = tl.cast(tl.program_id(0), tl.int64) * BLOCK_N + tl.arange(0, BLOCK_N)
    in_bounds = offs < N
    first_row = tl.cast(tl.program_id(1), tl.int64)

    scale = tl.load(Alpha).to(tl.float32)
    weight = tl.load(Gamma + offs, mask=in_bounds, other=0.0).to(tl.float32)

    # Per-program accumulators (fp32).
    alpha_acc = 0.0
    gamma_acc = tl.zeros((BLOCK_N,), dtype=tl.float32)
    if HAVE_BETA:
        beta_acc = tl.zeros((BLOCK_N,), dtype=tl.float32)

    for row in range(first_row, M, tl.num_programs(1)):
        row_offs = row * N + offs
        inp = tl.load(X + row_offs, mask=in_bounds, other=0.0).to(tl.float32)
        grad_out = tl.load(DY + row_offs, mask=in_bounds, other=0.0).to(tl.float32)
        t = tanh(scale * inp)
        if HAVE_BETA:
            beta_acc += grad_out
        gamma_acc += grad_out * t
        # d tanh(u)/du = 1 - tanh(u)^2, chained with the upstream gradient and gamma.
        chain = (1 - t * t) * grad_out * weight
        alpha_acc += tl.sum(inp * chain, 0)
        tl.store(DX + row_offs, scale * chain, mask=in_bounds)

    tl.store(DG + first_row * N + offs, gamma_acc, mask=in_bounds)
    if HAVE_BETA:
        tl.store(DB + first_row * N + offs, beta_acc, mask=in_bounds)
    # DA is addressed as rows of cdiv(N, 512) entries, one entry per column tile.
    tl.store(DA + first_row * tl.cdiv(N, 512) + tl.program_id(0), alpha_acc)
86
+
87
+
88
def liger_dyt_fwd(x, alpha, gamma, beta):
    """
    Launch the DyT forward kernel and return the result in the shape of `x`.

    `x` must be contiguous; it is flattened to 2-D over its last dimension.
    `beta` may be None, in which case the bias term is skipped.
    """
    assert x.is_contiguous()
    orig_shape = x.shape
    x = x.view(-1, orig_shape[-1])
    n_rows, n_cols = x.shape

    out = torch.empty_like(x)

    # Wide rows get a larger column tile; the other launch options are the same.
    block_cap = 2048 if n_cols >= 4096 else 1024
    launch_opts = {
        "BLOCK_N": min(triton.next_power_of_2(n_cols), block_cap),
        "num_warps": 4,
        "num_stages": 1,
    }

    def grid(meta):
        # (column tiles, rows)
        return (triton.cdiv(n_cols, meta["BLOCK_N"]), n_rows)

    _dyt_fwd_kernel[grid](x, out, alpha, gamma, beta, beta is not None, n_cols, **launch_opts)
    return out.view(orig_shape)
114
+
115
+
116
def liger_dyt_bwd(dy, x, alpha, gamma, beta):
    """
    Launch the DyT backward kernel and reduce its per-program partial sums.

    Args:
        dy: upstream gradient; must be contiguous.
        x: forward input; flattened to 2-D over its last dimension.
        alpha, gamma: forward parameters, passed through to the kernel.
        beta: optional bias; when None no bias gradient is computed.

    Returns:
        (dx, dalpha, dgamma, dbeta) where dx has the shape of `x`, dalpha has
        one element, and dbeta is None when `beta` is None.
    """
    assert dy.is_contiguous()
    input_shape = x.shape
    x = x.view(-1, input_shape[-1])
    M, N = x.shape
    HAVE_BETA = beta is not None

    # Number of row-programs to launch. Default to 1 so that a device other than
    # cuda/xpu does not hit an unbound NUM_SMS below.
    NUM_SMS = 1
    device = infer_device()
    if device == "cuda":
        NUM_SMS = torch.cuda.get_device_properties(x.device).multi_processor_count
    elif device == "xpu":
        NUM_SMS = torch.xpu.get_device_properties(x.device).gpu_subslice_count

    # One row of partial results per row-program. `da` is zero-initialised because
    # the kernel only writes one entry per launched column tile, and the row width
    # cdiv(N, 512) can exceed the number of tiles.
    da = torch.zeros(NUM_SMS, triton.cdiv(N, 512), dtype=torch.float32, device=x.device)
    dg = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device)
    db = torch.empty(NUM_SMS, N, dtype=torch.float32, device=x.device) if HAVE_BETA else None
    dx = torch.empty_like(dy)

    kwargs = {"BLOCK_N": min(triton.next_power_of_2(N), 1024), "num_warps": 8, "num_stages": 2}
    grid = lambda meta: (triton.cdiv(N, meta["BLOCK_N"]), NUM_SMS)
    _dyt_bwd_kernel[grid](dy, dx, da, dg, db, x, alpha, gamma, HAVE_BETA, M, N, **kwargs)

    # Reduce the per-program partials.
    if HAVE_BETA:
        db = db.sum(0).to(x.dtype)
    dg = dg.sum(0).to(gamma.dtype)
    da = da.sum().to(x.dtype).unsqueeze(0)
    return dx.view(input_shape), da, dg, db
142
+
143
+
144
class LigerDyTFunction(torch.autograd.Function):
    """Autograd wrapper around `liger_dyt_fwd` / `liger_dyt_bwd`."""

    @staticmethod
    @ensure_contiguous
    def forward(ctx, x, alpha, gamma, beta):
        """Run the forward kernel and stash all four inputs for backward."""
        out = liger_dyt_fwd(x, alpha, gamma, beta)
        ctx.save_for_backward(x, alpha, gamma, beta)
        return out

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dy):
        """Return gradients for (x, alpha, gamma, beta), in that order."""
        saved_x, saved_alpha, saved_gamma, saved_beta = ctx.saved_tensors
        grad_x, grad_alpha, grad_gamma, grad_beta = liger_dyt_bwd(dy, saved_x, saved_alpha, saved_gamma, saved_beta)
        return grad_x, grad_alpha, grad_gamma, grad_beta
@@ -34,9 +34,7 @@ def embedding_forward_kernel(
34
34
  )
35
35
 
36
36
  output_offsets = offsets_m[:, None] * embedding_dim + offsets_n[None, :]
37
- tl.store(
38
- output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :]
39
- )
37
+ tl.store(output_ptr + output_offsets, embeddings, mask=mask_m[:, None] & mask_n[None, :])
40
38
 
41
39
 
42
40
  @triton.jit
@@ -37,9 +37,7 @@ def pack_weights(intweights: torch.Tensor, bits: int = 2) -> torch.Tensor:
37
37
  else:
38
38
  packed_tensor_shape = (row_dim, *original_shape[1:])
39
39
 
40
- packed = torch.zeros(
41
- packed_tensor_shape, device=intweights.device, dtype=torch.uint8
42
- )
40
+ packed = torch.zeros(packed_tensor_shape, device=intweights.device, dtype=torch.uint8)
43
41
  unpacked = intweights.to(torch.uint8)
44
42
 
45
43
  def lshift(t: torch.Tensor, bits: int):
@@ -327,17 +325,13 @@ def matmul_kernel(
327
325
 
328
326
 
329
327
  def matmul(a, b):
330
- assert (
331
- a.shape[1] == b.shape[0] * 4
332
- ), "Incompatible dimensions, the weight matrix need to be packed"
328
+ assert a.shape[1] == b.shape[0] * 4, "Incompatible dimensions, the weight matrix need to be packed"
333
329
  assert a.is_contiguous(), "Matrix A must be contiguous"
334
330
  M, K = a.shape
335
331
  _, N = b.shape
336
332
  # c is in int32 to avoid any overflows or underflows
337
333
  c = torch.empty((M, N), device=a.device, dtype=torch.int32)
338
- grid = lambda META: (
339
- triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),
340
- )
334
+ grid = lambda META: (triton.cdiv(M, META["BLOCK_SIZE_M"]) * triton.cdiv(N, META["BLOCK_SIZE_N"]),)
341
335
  matmul_kernel[grid](
342
336
  a,
343
337
  b,
@@ -0,0 +1,412 @@
1
+ import math
2
+ import operator
3
+
4
+ import torch
5
+ import triton
6
+ import triton.language as tl
7
+
8
+ from liger_kernel.ops.utils import calculate_settings
9
+ from liger_kernel.ops.utils import compare_version
10
+ from liger_kernel.ops.utils import ensure_contiguous
11
+ from liger_kernel.ops.utils import torch_to_triton_dtype
12
+
13
+ if compare_version("triton", operator.ge, "3.0.0"):
14
+ try:
15
+ # typical import path with dispatch available
16
+ from triton.language.extra.libdevice import rsqrt
17
+ except ModuleNotFoundError:
18
+ # for working with NGC containers
19
+ from triton.language.extra.cuda.libdevice import rsqrt
20
+ else:
21
+ from triton.language.math import rsqrt
22
+
23
+
24
+ _CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)
25
+ _CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)
26
+ _CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)
27
+
28
+
29
@triton.jit
def _fused_add_rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    S_ptr,  # output residual
    S_row_stride,
    X_ptr,
    X_row_stride,
    R_ptr,  # input residual
    R_row_stride,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    offset,
    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
    BLOCK_SIZE: tl.constexpr,
):
    """
    Fused residual-add + RMSNorm forward, one program per row:

        1. hidden_states = residual + hidden_states   (stored to S)
        2. residual = hidden_states
        3. hidden_states = rmsnorm(hidden_states)     (stored to Y)

    This pattern is common in the decoder layers of LLMs, for example:
        1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271
        2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393

    The kernel follows the rms_norm forward kernel, extended with the residual
    addition. The reciprocal RMS of each row is also stored to RSTD.
    """

    row = tl.program_id(0).to(tl.int64)
    cols = tl.arange(0, BLOCK_SIZE)
    valid = cols < n_cols

    # Row base pointers.
    y_row_ptr = Y_ptr + row * Y_row_stride
    s_row_ptr = S_ptr + row * S_row_stride
    x_row_ptr = X_ptr + row * X_row_stride
    r_row_ptr = R_ptr + row * R_row_stride
    rstd_row_ptr = RSTD_ptr + row * RSTD_row_stride

    # Step 1/2: add the residual and write the sum back before any casting.
    hidden = tl.load(x_row_ptr + cols, mask=valid, other=0)
    residual = tl.load(r_row_ptr + cols, mask=valid, other=0)
    summed = hidden + residual
    tl.store(s_row_ptr + cols, summed, mask=valid)
    in_dtype = summed.dtype
    weight = tl.load(W_ptr + cols, mask=valid, other=0)

    # Llama: only the rstd computation happens in fp32.
    if casting_mode == _CASTING_MODE_LLAMA:
        summed = summed.to(tl.float32)

    # Gemma: everything is computed in fp32 and the output is cast back at the end.
    if casting_mode == _CASTING_MODE_GEMMA:
        weight = weight.to(tl.float32)
        summed = summed.to(tl.float32)

    # No casting: bring the scalars to the input dtype instead.
    if casting_mode == _CASTING_MODE_NONE:
        eps = eps.to(in_dtype)
        offset = offset.to(in_dtype)

    mean_sq = tl.sum(summed * summed, axis=0) / n_cols
    inv_rms = rsqrt(mean_sq + eps)

    # Cache the per-row reciprocal RMS: one scalar per row, which lets the
    # backward pass skip recomputing it.
    tl.store(rstd_row_ptr, inv_rms)

    summed = summed * inv_rms

    # Llama multiplies by the weight in the original dtype.
    if casting_mode == _CASTING_MODE_LLAMA:
        summed = summed.to(in_dtype)

    normed = summed * (offset + weight)

    if casting_mode == _CASTING_MODE_GEMMA:
        normed = normed.to(in_dtype)

    tl.store(y_row_ptr + cols, normed, mask=valid)
114
+
115
+
116
@triton.jit
def _fused_add_rms_norm_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dS_out_ptr,
    dS_out_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    offset,
    rows_per_program: tl.constexpr,
    casting_mode: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    has_dS_out: tl.constexpr,
):
    """
    This kernel is adapted from the rms_norm backward kernel to support the residual
    addition in the backward pass. For the following code pattern:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    The gradients of hidden_states and residual come out exactly the same. Their value is
    the sum of the gradient of the hidden_states in step 3 and the gradient of the residual in step 2.

    The backward computation is the same as in the rms_norm backward kernel, except that the gradient
    of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.

    Each program handles the contiguous row range
    [program_id * rows_per_program, min((program_id + 1) * rows_per_program, n_rows))
    and writes its partial weight gradient to row `program_id` of dW. `dS_out_ptr`
    is only dereferenced when `has_dS_out` is true.
    """

    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Partial dW for this program's rows, accumulated in fp32.
    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    # Move every row pointer to the first row owned by this program.
    dY_ptr += row_start * dY_row_stride
    dX_ptr += row_start * dX_row_stride
    if has_dS_out:
        dS_out_ptr += row_start * dS_out_row_stride

    X_ptr += row_start * X_row_stride
    # Scale by the stride so the initial offset agrees with the per-row advance below.
    RSTD_ptr += row_start * RSTD_row_stride

    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
    W_row = W_row + offset

    for _ in range(row_start, row_end):
        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)

        # Get cached rstd
        rstd_row = tl.load(RSTD_ptr)

        X_row = X_row.to(tl.float32)

        # Different backward graphs for different casting modes
        if casting_mode == _CASTING_MODE_LLAMA:
            m = (dY_row * W_row).to(tl.float32)

        elif casting_mode == _CASTING_MODE_GEMMA:
            dY_row = dY_row.to(tl.float32)
            m = dY_row * W_row
        else:
            m = dY_row * W_row

        dX_row = rstd_row * m

        if has_dS_out:
            # Add the gradient flowing in through the residual output.
            dS_out_row = tl.load(dS_out_ptr + col_offsets, mask=mask, other=0.0)
            dX_row += (rstd_row) * (
                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
            ) + dS_out_row
            dS_out_ptr += dS_out_row_stride
        else:
            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

        # calculate the gradient of W
        if casting_mode == _CASTING_MODE_LLAMA:
            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
        else:
            # here X_row is already in fp32 (see the cast above)
            dW_row += dY_row * (X_row * rstd_row)

        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)

        dY_ptr += dY_row_stride
        dX_ptr += dX_row_stride
        X_ptr += X_row_stride
        RSTD_ptr += RSTD_row_stride

    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
219
+
220
+
221
# Maps the user-facing casting-mode names to the integer codes the kernels compare against.
_str_to_casting_mode = dict(
    llama=_CASTING_MODE_LLAMA.value,
    gemma=_CASTING_MODE_GEMMA.value,
    none=_CASTING_MODE_NONE.value,
)
226
+
227
+
228
def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode):
    """
    Launch the fused residual-add + RMSNorm forward kernel.

    `casting_mode` may be one of the string keys of `_str_to_casting_mode` or the
    corresponding integer code. `X` and `R` are flattened to 2-D over the last
    dimension; `W` must have that dimension as its length.

    Returns (Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode) where Y and S have
    the original shape of `X`, RSTD holds one value per row, and `casting_mode`
    is the resolved integer code.
    """
    # Resolve the casting mode to its integer code.
    if isinstance(casting_mode, int):
        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"
    else:
        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
        casting_mode = _str_to_casting_mode[casting_mode]

    orig_shape = X.shape
    hidden = orig_shape[-1]
    X = X.view(-1, hidden)
    R = R.view(-1, hidden)
    n_rows, n_cols = X.shape
    BLOCK_SIZE, num_warps = calculate_settings(n_cols)

    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    # RSTD caches one rstd value per row. It is kept in fp32 for the Llama and
    # Gemma casting modes and in the input dtype otherwise.
    fp32_modes = (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value)
    RSTD = torch.empty(
        n_rows,
        dtype=torch.float32 if casting_mode in fp32_modes else X.dtype,
        device=X.device,
    )

    # Check constraints.
    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"

    # XPU-specific optimization
    kernel_args = {"grf_mode": "large"} if X.device.type == "xpu" else {}

    # TODO: add _block_fused_add_rms_norm_forward_kernel
    launch = _fused_add_rms_norm_forward_kernel[(n_rows,)]
    launch(
        Y,
        Y.stride(0),
        S,
        S.stride(0),
        X,
        X.stride(0),
        R,
        R.stride(0),
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        eps,
        offset,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        **kernel_args,
    )

    return Y.view(*orig_shape), S.view(*orig_shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
281
+
282
+
283
def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
    """
    Launch the fused residual-add + RMSNorm backward kernel.

    Args:
        dY: gradient w.r.t. the normalized output; flattened to 2-D over its last dim.
        dS_out: gradient w.r.t. the residual output, or None when there is none.
        S: the summed hidden states saved by the forward pass.
        W: the norm weight.
        RSTD: per-row reciprocal RMS cached by the forward pass.
        offset, casting_mode, BLOCK_SIZE, num_warps: forwarded to the kernel.
        in_place: when exactly `True`, dX is written into dY's storage.

    Returns:
        (dX, dR, dW). dX and dR are the same tensor, shaped like the incoming dY.

    Raises:
        RuntimeError: if the feature dimension exceeds BLOCK_SIZE.
    """
    shape = dY.shape
    dim = shape[-1]
    dY = dY.view(-1, dim)
    S = S.view(-1, dim)
    n_rows, n_cols = dY.shape

    if n_cols > BLOCK_SIZE:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")

    # The kernel already has a compile-time `has_dS_out` switch, so a missing
    # residual gradient must not be dereferenced here. When it is absent, pass dY
    # as a placeholder pointer; the kernel never reads it in that case.
    has_dS_out = dS_out is not None
    if has_dS_out:
        dS_out = dS_out.view(-1, dim)
    else:
        dS_out = dY

    # One program per SM / EU on cuda / xpu; a single program otherwise.
    sm_count = 1
    if S.device.type == "cuda":
        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
    elif S.device.type == "xpu":
        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count

    # Per-program partial dW, kept in fp32 for numerical stability.
    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

    rows_per_program = math.ceil(n_rows / sm_count)
    grid = (sm_count,)

    if in_place is True:
        dX = dY
    else:
        dX = torch.empty_like(dY)

    # XPU-specific optimization
    kernel_args = {}
    if S.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_backward_kernel
    _fused_add_rms_norm_backward_kernel[grid](
        dY,
        dY.stride(0),
        dS_out,
        dS_out.stride(0),
        dX,
        dX.stride(0),
        S,
        S.stride(0),
        torch_to_triton_dtype[S.dtype],
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        offset,
        rows_per_program,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        has_dS_out=has_dS_out,
        **kernel_args,  # XPU-specific optimization
    )

    dX = dX.view(*shape)
    # Reduce the per-program partials into the final weight gradient.
    dW = _dW.sum(dim=0).to(W.dtype)

    return dX, dX, dW  # dR is equal to dX
347
+
348
+
349
class LigerFusedAddRMSNormFunction(torch.autograd.Function):
    """
    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.

    This class implements the following sequence, commonly used in transformer decoder layers:
        1. hidden_states = residual + hidden_states
        2. residual = hidden_states (after addition)
        3. hidden_states = rmsnorm(hidden_states)

    Both the normalized hidden_states and the updated residual are returned as outputs.

    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.

    In addition, different models cast their inputs at different places during RMSNorm computation. For
    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
    support the following casting modes (they match HuggingFace Transformers' implementations):
    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.

    The `in_place` option determines whether to modify dY in-place to store dX. This defaults to `False`;
    pass `True` to save memory.
    """

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False):
        """
        X: (B, T, H) or (BxT, H)
        R: residual, same shape as X
        W: (H,)

        Returns (Y, S): the normalized output and the updated residual X + R.
        """
        # TODO: add row_mode
        Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode)
        ctx.offset = offset
        ctx.casting_mode = casting_mode
        ctx.in_place = in_place
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        ctx.save_for_backward(S, W, RSTD)
        return Y, S

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY, dS_out):
        """
        dY: gradient of Y, (B, T, H) or (BxT, H)
        dS_out: gradient of the residual output S

        Returns one gradient per forward input
        (X, R, W, eps, offset, casting_mode, in_place); the last four are None.
        """
        S, W, RSTD = ctx.saved_tensors
        dX, dR, dW = fused_add_rms_norm_backward(
            dY,
            dS_out,
            S,
            W,
            RSTD,
            ctx.offset,
            ctx.casting_mode,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
            ctx.in_place,
        )

        # forward takes 7 inputs, so exactly 7 gradients are returned.
        return dX, dR, dW, None, None, None, None