liger-kernel-nightly 0.5.5.dev20250402185702__py3-none-any.whl → 0.6.4.dev20260112233432__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic. Click here for more details.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/cosine_similarity_loss.py +142 -0
- liger_kernel/chunked_loss/dpo_loss.py +61 -3
- liger_kernel/chunked_loss/functional.py +2 -0
- liger_kernel/chunked_loss/fused_linear_distillation.py +23 -5
- liger_kernel/chunked_loss/fused_linear_ppo.py +36 -0
- liger_kernel/chunked_loss/fused_linear_preference.py +0 -1
- liger_kernel/chunked_loss/grpo_loss.py +76 -5
- liger_kernel/chunked_loss/jsd_loss.py +46 -15
- liger_kernel/ops/__init__.py +141 -0
- liger_kernel/ops/backends/README.md +151 -0
- liger_kernel/ops/backends/__init__.py +13 -0
- liger_kernel/ops/backends/_ascend/__init__.py +5 -0
- liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +485 -0
- liger_kernel/ops/backends/_ascend/ops/__init__.py +49 -0
- liger_kernel/ops/backends/_ascend/ops/geglu.py +266 -0
- liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +285 -0
- liger_kernel/ops/backends/_ascend/ops/rope.py +290 -0
- liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
- liger_kernel/ops/backends/_ascend/ops/tvd.py +221 -0
- liger_kernel/ops/backends/_ascend/ub_manager.py +349 -0
- liger_kernel/ops/backends/registry.py +61 -0
- liger_kernel/ops/cross_entropy.py +134 -65
- liger_kernel/ops/dyt.py +115 -180
- liger_kernel/ops/fused_add_rms_norm.py +416 -0
- liger_kernel/ops/fused_linear_cross_entropy.py +117 -23
- liger_kernel/ops/fused_neighborhood_attention.py +1022 -0
- liger_kernel/ops/geglu.py +6 -4
- liger_kernel/ops/group_norm.py +7 -7
- liger_kernel/ops/grpo_loss.py +312 -0
- liger_kernel/ops/jsd.py +2 -1
- liger_kernel/ops/kl_div.py +9 -5
- liger_kernel/ops/layer_norm.py +146 -78
- liger_kernel/ops/llama4_rope.py +225 -0
- liger_kernel/ops/multi_token_attention.py +207 -0
- liger_kernel/ops/poly_norm.py +390 -0
- liger_kernel/ops/rms_norm.py +398 -99
- liger_kernel/ops/rope.py +1 -1
- liger_kernel/ops/softmax.py +201 -0
- liger_kernel/ops/sparsemax.py +179 -0
- liger_kernel/ops/swiglu.py +1 -1
- liger_kernel/ops/tiled_mlp.py +136 -0
- liger_kernel/ops/utils.py +14 -0
- liger_kernel/transformers/__init__.py +208 -17
- liger_kernel/transformers/auto_model.py +21 -0
- liger_kernel/transformers/cross_entropy.py +9 -4
- liger_kernel/transformers/dyt.py +6 -4
- liger_kernel/transformers/experimental/__init__.py +5 -0
- liger_kernel/transformers/experimental/embedding.py +1 -1
- liger_kernel/transformers/fsdp.py +55 -0
- liger_kernel/transformers/functional.py +122 -20
- liger_kernel/transformers/fused_add_rms_norm.py +39 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +16 -5
- liger_kernel/transformers/fused_linear_jsd.py +1 -1
- liger_kernel/transformers/fused_neighborhood_attention.py +234 -0
- liger_kernel/transformers/geglu.py +1 -1
- liger_kernel/transformers/group_norm.py +1 -1
- liger_kernel/transformers/grpo_loss.py +153 -0
- liger_kernel/transformers/jsd.py +1 -1
- liger_kernel/transformers/kl_div.py +1 -1
- liger_kernel/transformers/layer_norm.py +1 -1
- liger_kernel/transformers/llama4_rope.py +93 -0
- liger_kernel/transformers/model/exaone4.py +136 -0
- liger_kernel/transformers/model/falcon_h1.py +122 -0
- liger_kernel/transformers/model/gemma.py +57 -27
- liger_kernel/transformers/model/gemma2.py +65 -28
- liger_kernel/transformers/model/gemma3.py +331 -0
- liger_kernel/transformers/model/glm4.py +141 -0
- liger_kernel/transformers/model/glm4v.py +163 -0
- liger_kernel/transformers/model/glm4v_moe.py +172 -0
- liger_kernel/transformers/model/gpt_oss.py +211 -0
- liger_kernel/transformers/model/hunyuan_v1.py +134 -0
- liger_kernel/transformers/model/internvl.py +157 -0
- liger_kernel/transformers/model/llama.py +109 -27
- liger_kernel/transformers/model/llama4.py +121 -0
- liger_kernel/transformers/model/llava.py +111 -136
- liger_kernel/transformers/model/loss_utils.py +50 -12
- liger_kernel/transformers/model/mistral.py +51 -34
- liger_kernel/transformers/model/mixtral.py +50 -29
- liger_kernel/transformers/model/mllama.py +46 -24
- liger_kernel/transformers/model/olmo2.py +47 -22
- liger_kernel/transformers/model/olmo3.py +142 -0
- liger_kernel/transformers/model/output_classes.py +147 -0
- liger_kernel/transformers/model/paligemma.py +50 -14
- liger_kernel/transformers/model/phi3.py +47 -172
- liger_kernel/transformers/model/qwen2.py +55 -23
- liger_kernel/transformers/model/qwen2_5_vl.py +62 -103
- liger_kernel/transformers/model/qwen2_vl.py +59 -108
- liger_kernel/transformers/model/qwen3.py +136 -0
- liger_kernel/transformers/model/qwen3_moe.py +152 -0
- liger_kernel/transformers/model/qwen3_next.py +146 -0
- liger_kernel/transformers/model/qwen3_vl.py +150 -0
- liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
- liger_kernel/transformers/model/smollm3.py +199 -0
- liger_kernel/transformers/model/smolvlm.py +158 -0
- liger_kernel/transformers/monkey_patch.py +2018 -244
- liger_kernel/transformers/multi_token_attention.py +64 -0
- liger_kernel/transformers/poly_norm.py +42 -0
- liger_kernel/transformers/qwen2vl_mrope.py +1 -1
- liger_kernel/transformers/rms_norm.py +54 -6
- liger_kernel/transformers/rope.py +45 -1
- liger_kernel/transformers/softmax.py +12 -0
- liger_kernel/transformers/sparsemax.py +16 -0
- liger_kernel/transformers/swiglu.py +39 -1
- liger_kernel/transformers/tiled_mlp.py +125 -0
- liger_kernel/transformers/trainer/orpo_trainer.py +1 -53
- liger_kernel/transformers/tvd.py +1 -1
- liger_kernel/utils.py +63 -0
- {liger_kernel_nightly-0.5.5.dev20250402185702.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/METADATA +73 -39
- liger_kernel_nightly-0.6.4.dev20260112233432.dist-info/RECORD +132 -0
- liger_kernel_nightly-0.5.5.dev20250402185702.dist-info/RECORD +0 -80
- {liger_kernel_nightly-0.5.5.dev20250402185702.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.5.dev20250402185702.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.5.dev20250402185702.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/WHEEL +0 -0
- {liger_kernel_nightly-0.5.5.dev20250402185702.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
import math
|
|
2
|
+
import operator
|
|
3
|
+
|
|
4
|
+
import torch
|
|
5
|
+
import triton
|
|
6
|
+
import triton.language as tl
|
|
7
|
+
|
|
8
|
+
from liger_kernel.ops.utils import calculate_settings
|
|
9
|
+
from liger_kernel.ops.utils import compare_version
|
|
10
|
+
from liger_kernel.ops.utils import ensure_contiguous
|
|
11
|
+
from liger_kernel.ops.utils import torch_to_triton_dtype
|
|
12
|
+
from liger_kernel.utils import get_npu_multi_processor_count
|
|
13
|
+
from liger_kernel.utils import is_npu_available
|
|
14
|
+
|
|
15
|
+
if compare_version("triton", operator.ge, "3.0.0") and not is_npu_available():
|
|
16
|
+
try:
|
|
17
|
+
# typical import path with dispatch available
|
|
18
|
+
from triton.language.extra.libdevice import rsqrt
|
|
19
|
+
except ModuleNotFoundError:
|
|
20
|
+
# for working with NGC containers
|
|
21
|
+
from triton.language.extra.cuda.libdevice import rsqrt
|
|
22
|
+
else:
|
|
23
|
+
from triton.language.math import rsqrt
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
# Integer tags selecting where dtype casts happen inside the kernels below.
# They are wrapped in tl.constexpr so the kernels can compare `casting_mode`
# against them and have the untaken `if` branches optimized out.
_CASTING_MODE_NONE: tl.constexpr = tl.constexpr(-1)
_CASTING_MODE_LLAMA: tl.constexpr = tl.constexpr(0)
_CASTING_MODE_GEMMA: tl.constexpr = tl.constexpr(1)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
@triton.jit
def _fused_add_rms_norm_forward_kernel(
    Y_ptr,
    Y_row_stride,
    S_ptr,  # output residual
    S_row_stride,
    X_ptr,
    X_row_stride,
    R_ptr,  # input residual
    R_row_stride,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    n_cols,
    eps,
    offset,
    casting_mode: tl.constexpr,  # constexpr so the `if` blocks can be optimized out
    BLOCK_SIZE: tl.constexpr,
):
    """
    This kernel computes the following:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    This is a commonly used pattern in the decoder layers of LLMs.
    Some examples:
    1. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/qwen3/modeling_qwen3.py#L271
    2. https://github.com/huggingface/transformers/blob/0dc2df5ddafe3cb5824ad24e85beba13e0aa6726/src/transformers/models/llama4/modeling_llama4.py#L393

    This kernel is inspired by the rms_norm forward kernel, and is adapted to support the residual addition in the forward pass.
    The backward pass is also adapted to support the residual addition in the backward pass.

    Each program instance handles the single row selected by its program id along axis 0.
    For that row it writes the sum X + R to S_ptr, the reciprocal RMS to RSTD_ptr and the
    normalized, weighted row to Y_ptr.
    NOTE: W_row_stride is accepted but never read in this kernel.
    """

    row_idx = tl.program_id(0).to(tl.int64)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    # Lanes at or beyond n_cols are masked out of every load and store below.
    mask = col_offsets < n_cols

    # Advance every base pointer to the start of this program's row.
    Y_ptr += row_idx * Y_row_stride
    S_ptr += row_idx * S_row_stride
    X_ptr += row_idx * X_row_stride
    R_ptr += row_idx * R_row_stride
    RSTD_ptr += row_idx * RSTD_row_stride

    # Masked lanes load as 0, so they add nothing to the sum of squares below.
    X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0)
    R_row = tl.load(R_ptr + col_offsets, mask=mask, other=0)
    S_row = X_row + R_row
    # The residual output is stored before any casting is applied.
    tl.store(S_ptr + col_offsets, S_row, mask=mask)
    # Remember the dtype of the sum so results can be cast back to it later.
    S_row_dtype = S_row.dtype
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0)

    # On Llama, only rstd is computed on fp32
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(tl.float32)

    # Gemma computes everything on fp32, and then casts back the output to the original dtype
    if casting_mode == _CASTING_MODE_GEMMA:
        W_row = W_row.to(tl.float32)
        S_row = S_row.to(tl.float32)

    # With no casting, eps and offset are brought to the row dtype instead.
    if casting_mode == _CASTING_MODE_NONE:
        eps = eps.to(S_row_dtype)
        offset = offset.to(S_row_dtype)

    # Divide by n_cols (not BLOCK_SIZE): only the real columns count towards the mean.
    mean_square = tl.sum(S_row * S_row, axis=0) / n_cols
    rstd = rsqrt(mean_square + eps)

    # We can save time by caching rms with minimal memory overhead
    # because rms is much smaller compared to X_row, as rms is for each row.
    # However, on the computation side, it can save 4 operations (*, sum, /, sqrt).
    tl.store(RSTD_ptr, rstd)

    S_row = S_row * rstd

    # On Llama, the multiplication with the weight is done on the original dtype
    if casting_mode == _CASTING_MODE_LLAMA:
        S_row = S_row.to(S_row_dtype)

    Y_row = S_row * (offset + W_row)

    # On Gemma, the fp32 result is cast back to the original dtype before the store.
    if casting_mode == _CASTING_MODE_GEMMA:
        Y_row = Y_row.to(S_row_dtype)

    tl.store(Y_ptr + col_offsets, Y_row, mask=mask)
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
@triton.jit
def _fused_add_rms_norm_backward_kernel(
    dY_ptr,
    dY_row_stride,
    dS_out_ptr,
    dS_out_row_stride,
    dX_ptr,
    dX_row_stride,
    X_ptr,
    X_row_stride,
    X_dtype: tl.constexpr,
    W_ptr,
    W_row_stride,
    RSTD_ptr,
    RSTD_row_stride,
    dW_ptr,
    dW_row_stride,
    n_rows,
    n_cols,
    offset,
    rows_per_program: tl.constexpr,
    casting_mode: tl.constexpr,
    BLOCK_SIZE: tl.constexpr,
    has_dS_out: tl.constexpr,
):
    """
    This kernel is adapted from the rms_norm backward kernel to support the residual
    addition in the backward pass. For the following code pattern:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states
    3. hidden_states = rmsnorm(hidden_states)

    The gradients of hidden_states and residual come out to be exactly the same. Their value is
    the sum of the gradient of the hidden_states in step 3 and the gradient of the residual in step 2.

    The backward pass computation logic is the same as the rms_norm backward kernel, except that the
    gradient of the hidden_states in step 3 and the gradient of the residual in step 2 are summed up.

    Each program instance walks the rows [row_start, row_end) assigned to it, writes one dX row per
    input row, and accumulates a partial weight gradient in fp32 that it stores to row
    `row_block_id` of the buffer at dW_ptr. When `has_dS_out` is False, dS_out_ptr is never
    dereferenced.
    NOTE: W_row_stride is accepted but never read in this kernel.
    """

    row_block_id = tl.program_id(0).to(tl.int64)
    row_start = row_block_id * rows_per_program
    row_end = min((row_block_id + 1) * rows_per_program, n_rows)
    col_offsets = tl.arange(0, BLOCK_SIZE)
    mask = col_offsets < n_cols

    # Partial dW for the rows handled by this program, accumulated in fp32.
    dW_row = tl.zeros((BLOCK_SIZE,), dtype=tl.float32)

    # Move every row-wise pointer to the first row of this program's range.
    dY_ptr += row_start * dY_row_stride
    dX_ptr += row_start * dX_row_stride
    if has_dS_out:
        dS_out_ptr += row_start * dS_out_row_stride

    X_ptr += row_start * X_row_stride
    # Scale by the stride so the starting offset agrees with the per-row advance
    # (`RSTD_ptr += RSTD_row_stride`) at the bottom of the loop.
    RSTD_ptr += row_start * RSTD_row_stride

    # The weight row is shared by all rows; the offset is folded in once up front.
    W_row = tl.load(W_ptr + col_offsets, mask=mask, other=0.0)
    W_row = W_row + offset

    for _ in range(row_start, row_end):
        dY_row = tl.load(dY_ptr + col_offsets, mask=mask, other=0.0)
        X_row = tl.load(X_ptr + col_offsets, mask=mask, other=0.0)

        # Get cached rms
        rstd_row = tl.load(RSTD_ptr)

        X_row = X_row.to(tl.float32)

        # Different backward graphs for different casting modes
        if casting_mode == _CASTING_MODE_LLAMA:
            m = (dY_row * W_row).to(tl.float32)

        elif casting_mode == _CASTING_MODE_GEMMA:
            dY_row = dY_row.to(tl.float32)
            m = dY_row * W_row
        else:
            m = dY_row * W_row

        dX_row = rstd_row * m

        if has_dS_out:
            # Add the gradient flowing in through the residual output on top of the norm gradient.
            dS_out_row = tl.load(dS_out_ptr + col_offsets, mask=mask, other=0.0)
            dX_row += (rstd_row) * (
                -(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row
            ) + dS_out_row
            dS_out_ptr += dS_out_row_stride
        else:
            dX_row += (rstd_row) * (-(1 / n_cols) * rstd_row * rstd_row * tl.sum(m * X_row, axis=0) * X_row)

        # calculate the gradient of W
        if casting_mode == _CASTING_MODE_LLAMA:
            dW_row += dY_row * (X_row * rstd_row).to(X_dtype)
        else:
            # X_row was cast to fp32 at the top of the loop body
            dW_row += dY_row * (X_row * rstd_row)

        tl.store(dX_ptr + col_offsets, dX_row.to(X_dtype), mask=mask)

        # Step all row-wise pointers to the next row.
        dY_ptr += dY_row_stride
        dX_ptr += dX_row_stride
        X_ptr += X_row_stride
        RSTD_ptr += RSTD_row_stride

    tl.store(dW_ptr + row_block_id * dW_row_stride + col_offsets, dW_row, mask=mask)
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
# Maps the user-facing casting mode names to the plain integer values of the
# constexpr tags above; `.value` unwraps each tl.constexpr.
_str_to_casting_mode = {
    "llama": _CASTING_MODE_LLAMA.value,
    "gemma": _CASTING_MODE_GEMMA.value,
    "none": _CASTING_MODE_NONE.value,
}
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode):
    """
    Launch the fused residual-add + RMSNorm forward kernel.

    Args:
        X: hidden states; flattened to 2D over its last dimension before the launch.
        R: input residual; flattened the same way and required to match X's flattened shape.
        W: weight vector whose length must equal the last dimension of X.
        eps: value added to the mean square before the reciprocal square root.
        offset: constant added to W inside the kernel.
        casting_mode: either a key of `_str_to_casting_mode` ("llama", "gemma", "none")
            or one of its integer values.

    Returns:
        A tuple `(Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode)` where Y (normalized
        output) and S (X + R) are viewed back to X's original shape, RSTD holds one
        value per row, BLOCK_SIZE/num_warps are the settings from `calculate_settings`,
        and casting_mode is the resolved integer mode.

    Raises:
        AssertionError: on an unknown casting mode or incompatible X/R/W shapes.
    """
    # Resolve string modes to their integer tag; validate integer modes as given.
    if not isinstance(casting_mode, int):
        assert casting_mode in _str_to_casting_mode, f"Invalid casting mode: {casting_mode}"
        casting_mode = _str_to_casting_mode[casting_mode]
    else:
        assert casting_mode in _str_to_casting_mode.values(), f"Invalid casting mode: {casting_mode}"

    shape = X.shape
    dim = shape[-1]
    X = X.view(-1, dim)
    R = R.view(-1, dim)
    n_rows, n_cols = X.shape
    BLOCK_SIZE, num_warps = calculate_settings(n_cols)

    # Check constraints before allocating any output buffer.
    assert X.shape[1] == W.shape[0], "Incompatible hidden size dimension between tensor1.shape[1] and tensor2.shape[0]"
    # The kernel indexes R with X's row count and column mask, so a mismatched
    # residual would be read out of bounds rather than rejected.
    assert R.shape == X.shape, f"Residual shape {tuple(R.shape)} does not match input shape {tuple(X.shape)}"

    Y = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    S = torch.empty((n_rows, n_cols), dtype=X.dtype, device=X.device)
    # RSTD is to cache rstd for each row
    # RSTD is always computed/stored in fp32 if we are using Llama or Gemma casting mode
    rstd_dtype = torch.float32 if casting_mode in (_CASTING_MODE_LLAMA.value, _CASTING_MODE_GEMMA.value) else X.dtype
    RSTD = torch.empty(n_rows, dtype=rstd_dtype, device=X.device)

    # XPU-specific optimization
    kernel_args = {}
    if X.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # One program per row.
    # TODO: add _block_fused_add_rms_norm_forward_kernel
    _fused_add_rms_norm_forward_kernel[(n_rows,)](
        Y,
        Y.stride(0),
        S,
        S.stride(0),
        X,
        X.stride(0),
        R,
        R.stride(0),
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        n_cols,
        eps,
        offset,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        **kernel_args,  # XPU-specific optimization
    )

    return Y.view(*shape), S.view(*shape), RSTD, BLOCK_SIZE, num_warps, casting_mode
|
|
283
|
+
|
|
284
|
+
|
|
285
|
+
def fused_add_rms_norm_backward(dY, dS_out, S, W, RSTD, offset, casting_mode, BLOCK_SIZE, num_warps, in_place):
    """
    Launch the fused residual-add + RMSNorm backward kernel.

    Args:
        dY: gradient w.r.t. the normalized output; flattened to 2D over its last dimension.
        dS_out: gradient w.r.t. the residual output, or None when there is none.
        S: the saved sum (hidden_states + residual) from the forward pass.
        W: weight vector.
        RSTD: per-row reciprocal RMS cached by the forward pass.
        offset: constant added to W inside the kernel.
        casting_mode: integer casting mode resolved by the forward pass.
        BLOCK_SIZE: block size used by the forward launch.
        num_warps: warp count used by the forward launch.
        in_place: when exactly `True`, dX is written into dY's storage instead of a new tensor.

    Returns:
        A tuple `(dX, dR, dW)`. dX and dR are the same tensor, viewed to dY's original
        shape; dW is the per-program partial sums reduced over programs and cast to W's dtype.

    Raises:
        RuntimeError: if the feature dimension exceeds BLOCK_SIZE.
    """
    shape = dY.shape
    dim = shape[-1]
    dY = dY.view(-1, dim)
    has_dS_out = dS_out is not None
    if has_dS_out:
        dS_out = dS_out.view(-1, dim)
        dS_out_row_stride = dS_out.stride(0)
    else:
        # The kernel never dereferences dS_out_ptr when has_dS_out is False, so any
        # valid tensor serves as a placeholder argument.
        dS_out = dY
        dS_out_row_stride = 0
    S = S.view(-1, dim)
    n_rows, n_cols = dY.shape

    # Number of programs to launch; stays at 1 on device types not listed here.
    sm_count = 1
    if S.device.type == "cuda":
        sm_count = torch.cuda.get_device_properties(S.device).multi_processor_count
    elif S.device.type == "xpu":
        sm_count = torch.xpu.get_device_properties(S.device).gpu_eu_count
    elif S.device.type == "npu":
        sm_count = get_npu_multi_processor_count()

    # fp32 for numerical stability especially.
    # One row of partial dW per program; reduced over programs after the launch.
    _dW = torch.empty((sm_count, n_cols), dtype=torch.float32, device=W.device)

    # NOTE(review): the message below mentions "layer norm" although this is the
    # RMSNorm path; it is kept as-is so existing callers/tests see the same text.
    if n_cols > BLOCK_SIZE:
        raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
    rows_per_program = math.ceil(n_rows / sm_count)
    grid = (sm_count,)

    if in_place is True:
        dX = dY
    else:
        dX = torch.empty_like(dY)

    # XPU-specific optimization
    kernel_args = {}
    if S.device.type == "xpu":
        kernel_args["grf_mode"] = "large"

    # TODO: add _block_fused_add_rms_norm_backward_kernel
    _fused_add_rms_norm_backward_kernel[grid](
        dY,
        dY.stride(0),
        dS_out,
        dS_out_row_stride,
        dX,
        dX.stride(0),
        S,
        S.stride(0),
        torch_to_triton_dtype[S.dtype],
        W,
        W.stride(0),
        RSTD,
        RSTD.stride(0),
        _dW,
        _dW.stride(0),
        n_rows,
        n_cols,
        offset,
        rows_per_program,
        casting_mode,
        BLOCK_SIZE=BLOCK_SIZE,
        num_warps=num_warps,
        has_dS_out=has_dS_out,
        **kernel_args,  # XPU-specific optimization
    )

    dX = dX.view(*shape)
    dW = _dW.sum(dim=0).to(W.dtype)

    return dX, dX, dW  # dR is equal to dX
|
|
351
|
+
|
|
352
|
+
|
|
353
|
+
class LigerFusedAddRMSNormFunction(torch.autograd.Function):
    """
    Performs a fused operation that first adds a residual tensor to the hidden_states tensor (`X`), then applies RMSNorm (Root Mean Square Normalization) to the result using the weight tensor `W`, with optional offset and casting mode.

    This class implements the following sequence, commonly used in transformer decoder layers:
    1. hidden_states = residual + hidden_states
    2. residual = hidden_states (after addition)
    3. hidden_states = rmsnorm(hidden_states)

    Both the normalized hidden_states and the updated residual are returned as outputs.

    Some models use an 'offset' to shift the weight tensor `W` by a constant value. For example, Gemma
    uses an offset of 1.0, so the computation becomes `(X / RMS(X)) * (W + 1.0)` instead of the usual
    `(X / RMS(X)) * W`. You can pass the offset value as an argument to the forward function.

    In addition, different models cast their inputs at different places during RMSNorm computation. For
    example, Gemma casts everything to fp32 before starting the computation, while Llama casts only the
    inverse RMS to fp32. You can specify the casting mode using the `casting_mode` argument. We currently
    support the following casting modes (they match HuggingFace Transformers' implementations):
    - 'llama': matches the Llama implementation, where only the inverse RMS is computed on fp32.
    - 'gemma': matches the Gemma implementation, where everything is cast to fp32, then computed, then cast back to the original dtype.
    - 'none': no casting is done. The computation is done in the original dtype. This saves memory and is slightly faster, but has more error w.r.t. the original implementation.

    The `in_place` option determines whether to modify dY in-place to store dX. It defaults to `False`;
    pass `True` to reuse dY's storage and save memory.
    """

    @staticmethod
    @ensure_contiguous
    def forward(ctx, X, R, W, eps, offset=0.0, casting_mode="llama", in_place=False):
        """
        X: (B, T, H) or (BxT, H)
        R: residual added to X
        W: (H,)

        Returns the normalized output Y and the updated residual S = X + R.
        """
        # TODO: add row_mode
        Y, S, RSTD, BLOCK_SIZE, num_warps, casting_mode = fused_add_rms_norm_forward(X, R, W, eps, offset, casting_mode)
        # Non-tensor state needed to replay the same launch configuration in backward.
        ctx.offset = offset
        ctx.casting_mode = casting_mode
        ctx.in_place = in_place
        ctx.BLOCK_SIZE = BLOCK_SIZE
        ctx.num_warps = num_warps
        # The sum S (not X or R individually) is what the backward kernel reads.
        ctx.save_for_backward(S, W, RSTD)
        return Y, S

    @staticmethod
    @ensure_contiguous
    def backward(ctx, dY, dS_out):
        """
        dY: (B, T, H) or (BxT, H), gradient w.r.t. the normalized output Y
        dS_out: gradient w.r.t. the residual output S
        """
        S, W, RSTD = ctx.saved_tensors
        dX, dR, dW = fused_add_rms_norm_backward(
            dY,
            dS_out,
            S,
            W,
            RSTD,
            ctx.offset,
            ctx.casting_mode,
            ctx.BLOCK_SIZE,
            ctx.num_warps,
            ctx.in_place,
        )

        # One gradient per forward input: X, R, W, eps, offset, casting_mode, in_place.
        return dX, dR, dW, None, None, None, None
|
|
@@ -6,11 +6,12 @@ from liger_kernel.ops.utils import amp_custom_bwd
|
|
|
6
6
|
from liger_kernel.ops.utils import amp_custom_fwd
|
|
7
7
|
from liger_kernel.ops.utils import element_mul_kernel
|
|
8
8
|
from liger_kernel.ops.utils import is_hip
|
|
9
|
+
from liger_kernel.utils import infer_device
|
|
9
10
|
|
|
10
11
|
# The hard limit of TRITON_MAX_TENSOR_NUMEL is 1048576 https://github.com/triton-lang/triton/blob/ba42a5c68fd0505f8c42f4202d53be0f8d9a5fe0/python/triton/language/core.py#L19
|
|
11
12
|
# However, setting limit as 65536 as in LayerNorm tutorial is faster because of less register spilling
|
|
12
13
|
# The optimal maximum block size depends on your hardware, your kernel, and your dtype
|
|
13
|
-
MAX_FUSED_SIZE = 65536 // 2
|
|
14
|
+
MAX_FUSED_SIZE = 2048 if infer_device() == "npu" else 65536 // 2
|
|
14
15
|
|
|
15
16
|
|
|
16
17
|
def fused_linear_cross_entropy_forward(
|
|
@@ -25,10 +26,18 @@ def fused_linear_cross_entropy_forward(
|
|
|
25
26
|
reduction="mean",
|
|
26
27
|
softcap=None,
|
|
27
28
|
return_z_loss=False,
|
|
29
|
+
accum_dtype=None,
|
|
30
|
+
use_token_scaling=False,
|
|
31
|
+
return_token_accuracy=False,
|
|
28
32
|
):
|
|
29
33
|
assert isinstance(return_z_loss, bool), f"return_z_loss must be True or False. Got: {return_z_loss}"
|
|
34
|
+
assert isinstance(return_token_accuracy, bool), (
|
|
35
|
+
f"return_token_accuracy must be True or False. Got: {return_token_accuracy}"
|
|
36
|
+
)
|
|
30
37
|
device = _input.device
|
|
31
38
|
|
|
39
|
+
input_requires_grad = _input.requires_grad
|
|
40
|
+
|
|
32
41
|
# inputs have shape: BT x H
|
|
33
42
|
# materialized activations will have shape: BT x V
|
|
34
43
|
# the increase in memory = BT x V
|
|
@@ -44,12 +53,23 @@ def fused_linear_cross_entropy_forward(
|
|
|
44
53
|
chunk_size = triton.next_power_of_2(triton.cdiv(BT, inc_factor)) # (BT + inc_factor - 1) // inc_factor
|
|
45
54
|
num_chunks = triton.cdiv(BT, chunk_size) # (BT + chunk_size - 1) // chunk_size
|
|
46
55
|
|
|
47
|
-
grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
|
|
48
56
|
grad_input = torch.zeros_like(_input, device=device)
|
|
49
|
-
|
|
50
|
-
# we use fp32 for loss accumulator
|
|
57
|
+
|
|
58
|
+
# we use fp32 for loss and gradients accumulator
|
|
59
|
+
if input_requires_grad:
|
|
60
|
+
if accum_dtype is None:
|
|
61
|
+
grad_weight = torch.zeros_like(weight, device=device) if weight.requires_grad else None
|
|
62
|
+
grad_bias = torch.zeros_like(bias, device=device) if bias is not None else None
|
|
63
|
+
else:
|
|
64
|
+
grad_weight = torch.zeros_like(weight, dtype=accum_dtype, device=device) if weight.requires_grad else None
|
|
65
|
+
grad_bias = torch.zeros_like(bias, dtype=accum_dtype, device=device) if bias is not None else None
|
|
66
|
+
else:
|
|
67
|
+
grad_weight = None
|
|
68
|
+
grad_bias = None
|
|
69
|
+
|
|
51
70
|
loss_1d = torch.zeros(BT, dtype=torch.float32, device=device)
|
|
52
71
|
z_loss_1d = torch.zeros(BT, dtype=_input.dtype, device=_input.device) if return_z_loss else None
|
|
72
|
+
token_accuracy_1d = torch.zeros(BT, dtype=torch.float32, device=device) if return_token_accuracy else None
|
|
53
73
|
|
|
54
74
|
# TODO: evaluate how CUDA synchronization caused by .item() affects the speed
|
|
55
75
|
target_mask = target != ignore_index
|
|
@@ -82,9 +102,40 @@ def fused_linear_cross_entropy_forward(
|
|
|
82
102
|
|
|
83
103
|
n_rows = logits_chunk.shape[0]
|
|
84
104
|
|
|
105
|
+
# Compute predicted probabilities for token scaling if needed
|
|
106
|
+
if use_token_scaling:
|
|
107
|
+
# Compute softmax probabilities for scaling
|
|
108
|
+
# We need to compute this before the cross entropy kernel modifies logits_chunk
|
|
109
|
+
logits_for_softmax = logits_chunk.detach().clone() # Detach to avoid gradient flow
|
|
110
|
+
if softcap is not None:
|
|
111
|
+
logits_for_softmax = softcap * torch.tanh(logits_for_softmax / softcap)
|
|
112
|
+
|
|
113
|
+
# Compute softmax to get predicted probabilities
|
|
114
|
+
probs = torch.softmax(logits_for_softmax, dim=-1)
|
|
115
|
+
|
|
116
|
+
# Get predicted probabilities for token scaling, handling ignored targets
|
|
117
|
+
valid_target_mask = target_chunk != ignore_index
|
|
118
|
+
valid_targets = target_chunk[valid_target_mask]
|
|
119
|
+
|
|
120
|
+
if len(valid_targets) > 0:
|
|
121
|
+
# Gather probabilities only for valid targets
|
|
122
|
+
valid_probs = probs[valid_target_mask]
|
|
123
|
+
pred_probs_valid = torch.gather(valid_probs, -1, valid_targets.unsqueeze(-1)).squeeze(-1)
|
|
124
|
+
|
|
125
|
+
# Create full tensor with zeros for ignored targets
|
|
126
|
+
pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
|
|
127
|
+
pred_probs[valid_target_mask] = pred_probs_valid
|
|
128
|
+
else:
|
|
129
|
+
# All targets are ignored
|
|
130
|
+
pred_probs = torch.zeros_like(target_chunk, dtype=probs.dtype, device=probs.device)
|
|
131
|
+
|
|
132
|
+
# Store the scaling factors
|
|
133
|
+
scaling_factors = pred_probs.detach() # Detach to ensure no gradient flow
|
|
134
|
+
|
|
85
135
|
# unreduced loss
|
|
86
136
|
loss_1d_slice = loss_1d[start_idx:end_idx] # chunk_size,
|
|
87
137
|
z_loss_1d_slice = z_loss_1d[start_idx:end_idx] if return_z_loss else None
|
|
138
|
+
token_accuracy_1d_slice = token_accuracy_1d[start_idx:end_idx] if return_token_accuracy else None
|
|
88
139
|
|
|
89
140
|
# ensure _input and target are contiguous
|
|
90
141
|
logits_chunk = logits_chunk.contiguous()
|
|
@@ -100,6 +151,10 @@ def fused_linear_cross_entropy_forward(
|
|
|
100
151
|
loss_ptr=loss_1d_slice,
|
|
101
152
|
z_loss_ptr=z_loss_1d_slice,
|
|
102
153
|
loss_stride=loss_1d_slice.stride(-1), # always 1
|
|
154
|
+
token_accuracy_ptr=token_accuracy_1d_slice,
|
|
155
|
+
token_accuracy_stride=token_accuracy_1d_slice.stride(-1)
|
|
156
|
+
if return_token_accuracy
|
|
157
|
+
else 0, # always 1 if accuracy is enabled
|
|
103
158
|
n_cols=V,
|
|
104
159
|
n_non_ignore=total_n_non_ignore,
|
|
105
160
|
sum_non_ignore_weight=total_sum_non_ignore_ce_weight,
|
|
@@ -110,47 +165,68 @@ def fused_linear_cross_entropy_forward(
|
|
|
110
165
|
reduction=reduction,
|
|
111
166
|
softcap=softcap,
|
|
112
167
|
RETURN_Z_LOSS=return_z_loss,
|
|
168
|
+
RETURN_TOKEN_ACCURACY=return_token_accuracy,
|
|
113
169
|
HAS_WEIGHT=True if ce_weight is not None else False,
|
|
114
170
|
HAS_SOFTCAPPING=True if softcap is not None else False,
|
|
171
|
+
HAS_GRADIENTS=input_requires_grad,
|
|
115
172
|
BLOCK_SIZE=BLOCK_SIZE,
|
|
116
173
|
num_warps=32 if not is_hip() else 16,
|
|
117
174
|
)
|
|
118
175
|
|
|
176
|
+
# Apply token scaling if requested
|
|
177
|
+
if use_token_scaling:
|
|
178
|
+
loss_1d_slice = loss_1d_slice * scaling_factors
|
|
179
|
+
if return_z_loss:
|
|
180
|
+
z_loss_1d_slice = z_loss_1d_slice * scaling_factors
|
|
181
|
+
|
|
119
182
|
loss_1d[start_idx:end_idx] = loss_1d_slice
|
|
120
183
|
if return_z_loss:
|
|
121
184
|
z_loss_1d[start_idx:end_idx] = z_loss_1d_slice
|
|
185
|
+
if return_token_accuracy:
|
|
186
|
+
token_accuracy_1d[start_idx:end_idx] = token_accuracy_1d_slice
|
|
122
187
|
grad_logits_chunk = logits_chunk # chunk_size x V
|
|
123
188
|
|
|
124
|
-
|
|
189
|
+
# Apply token scaling to gradients if requested
|
|
190
|
+
if use_token_scaling:
|
|
191
|
+
# Expand scaling factors to match gradient dimensions
|
|
192
|
+
scaling_factors_expanded = scaling_factors.unsqueeze(-1) # chunk_size x 1
|
|
193
|
+
grad_logits_chunk = grad_logits_chunk * scaling_factors_expanded
|
|
125
194
|
|
|
126
|
-
if
|
|
127
|
-
|
|
128
|
-
input=grad_weight,
|
|
129
|
-
mat1=logits_chunk.t().to(
|
|
130
|
-
_input_chunk.dtype
|
|
131
|
-
), # In an autocast scenario without bias, differing logits_chunk data types will cause an addmm operation error.
|
|
132
|
-
mat2=_input_chunk,
|
|
133
|
-
out=grad_weight,
|
|
134
|
-
alpha=1.0,
|
|
135
|
-
beta=1.0,
|
|
136
|
-
)
|
|
195
|
+
if input_requires_grad:
|
|
196
|
+
grad_input[start_idx:end_idx] = grad_logits_chunk @ weight
|
|
137
197
|
|
|
138
|
-
if
|
|
198
|
+
if grad_weight is not None and input_requires_grad:
|
|
199
|
+
grad_weight += torch.mm(grad_logits_chunk.t(), _input_chunk).float()
|
|
200
|
+
|
|
201
|
+
if bias is not None and input_requires_grad:
|
|
139
202
|
torch.add(
|
|
140
203
|
input=grad_bias,
|
|
141
|
-
other=
|
|
204
|
+
other=grad_logits_chunk.sum(dim=0),
|
|
142
205
|
out=grad_bias,
|
|
143
206
|
alpha=1.0,
|
|
144
207
|
)
|
|
145
208
|
|
|
209
|
+
# Need extra calculations for backward if reduction=='none'. Not supporting reduction='none' now.
|
|
210
|
+
# if reduction == "none":
|
|
211
|
+
# loss = loss_1d
|
|
212
|
+
# z_loss = z_loss_1d if return_z_loss else None
|
|
213
|
+
|
|
146
214
|
if reduction == "none":
|
|
215
|
+
# Return per-token losses
|
|
147
216
|
loss = loss_1d
|
|
148
217
|
z_loss = z_loss_1d if return_z_loss else None
|
|
149
|
-
|
|
218
|
+
token_accuracy = token_accuracy_1d if return_token_accuracy else None
|
|
150
219
|
else:
|
|
151
220
|
loss = torch.sum(loss_1d)
|
|
152
221
|
z_loss = torch.sum(z_loss_1d) if return_z_loss else None
|
|
153
|
-
|
|
222
|
+
# For accuracy, we compute the mean across all non-ignored tokens
|
|
223
|
+
token_accuracy = torch.sum(token_accuracy_1d) / total_n_non_ignore if return_token_accuracy else None
|
|
224
|
+
|
|
225
|
+
# Cast back to original dtype
|
|
226
|
+
grad_weight = grad_weight.to(weight.dtype) if grad_weight is not None else None
|
|
227
|
+
grad_bias = grad_bias.to(bias.dtype) if grad_bias is not None else None
|
|
228
|
+
|
|
229
|
+
return loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias
|
|
154
230
|
|
|
155
231
|
|
|
156
232
|
def fused_linear_cross_entropy_backward(grad_output, grad_input, grad_weight, grad_bias):
|
|
@@ -216,6 +292,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
|
216
292
|
reduction="mean",
|
|
217
293
|
softcap=None,
|
|
218
294
|
return_z_loss: bool = False,
|
|
295
|
+
accum_dtype=None,
|
|
296
|
+
use_token_scaling: bool = False,
|
|
297
|
+
return_token_accuracy: bool = False,
|
|
219
298
|
):
|
|
220
299
|
"""
|
|
221
300
|
Fusing the last linear layer with cross-entropy loss
|
|
@@ -234,9 +313,15 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
|
234
313
|
ignore_index: the index to ignore in the target
|
|
235
314
|
label_smoothing (float): The amount of smoothing when computing the loss, where 0.0 means no smoothing.
|
|
236
315
|
reduction: reduction to apply
|
|
316
|
+
accum_dtype (torch.dtype): the dtype of intermediate result buffers for weight and bias gradient accumulations.
|
|
317
|
+
Recommended to set `accum_dtype` to a higher precision, e.g. `torch.float32`, if training is unstable with the original dtype. Default: `None`, which performs accumulations in the original dtype.
|
|
318
|
+
use_token_scaling (bool): whether to scale each token's loss by its predicted probability (detached).
|
|
319
|
+
When True, each token's loss is multiplied by the model's predicted probability for that token's true class.
|
|
320
|
+
Default: False.
|
|
321
|
+
return_token_accuracy (bool): When `True`, computes and returns token accuracy without materializing logits: per-token values if `reduction` is "none", otherwise the mean over all non-ignored tokens. Default: `False`.
|
|
237
322
|
"""
|
|
238
323
|
|
|
239
|
-
loss, z_loss, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
|
|
324
|
+
loss, z_loss, token_accuracy, grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_forward(
|
|
240
325
|
_input=_input,
|
|
241
326
|
weight=weight,
|
|
242
327
|
target=target,
|
|
@@ -248,6 +333,9 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
|
248
333
|
reduction=reduction,
|
|
249
334
|
softcap=softcap,
|
|
250
335
|
return_z_loss=return_z_loss,
|
|
336
|
+
accum_dtype=accum_dtype,
|
|
337
|
+
use_token_scaling=use_token_scaling,
|
|
338
|
+
return_token_accuracy=return_token_accuracy,
|
|
251
339
|
)
|
|
252
340
|
# downcast to dtype and store for backward
|
|
253
341
|
ctx.save_for_backward(
|
|
@@ -256,13 +344,16 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
|
256
344
|
grad_bias.detach() if bias is not None else None,
|
|
257
345
|
)
|
|
258
346
|
ctx.return_z_loss = return_z_loss
|
|
259
|
-
|
|
347
|
+
ctx.return_token_accuracy = return_token_accuracy
|
|
348
|
+
return loss, z_loss, token_accuracy
|
|
260
349
|
|
|
261
350
|
@staticmethod
|
|
262
351
|
@amp_custom_bwd
|
|
263
|
-
def backward(ctx, grad_output, grad_output2):
|
|
352
|
+
def backward(ctx, grad_output, grad_output2, grad_output3):
|
|
264
353
|
if ctx.return_z_loss:
|
|
265
354
|
del grad_output2 # z_loss is only for logging
|
|
355
|
+
if ctx.return_token_accuracy:
|
|
356
|
+
del grad_output3 # token_accuracy is only for metrics
|
|
266
357
|
(grad_input, grad_weight, grad_bias) = ctx.saved_tensors
|
|
267
358
|
grad_input, grad_weight, grad_bias = fused_linear_cross_entropy_backward(
|
|
268
359
|
grad_output, grad_input, grad_weight, grad_bias
|
|
@@ -279,4 +370,7 @@ class LigerFusedLinearCrossEntropyFunction(torch.autograd.Function):
|
|
|
279
370
|
None,
|
|
280
371
|
None,
|
|
281
372
|
None,
|
|
373
|
+
None,
|
|
374
|
+
None, # use_token_scaling
|
|
375
|
+
None, # return_token_accuracy
|
|
282
376
|
)
|