liger-kernel-nightly 0.6.2.dev20251011154427__py3-none-any.whl → 0.6.4.dev20251202054858__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to their public registry. It is provided for informational purposes only.

This version of liger-kernel-nightly has been flagged as potentially problematic.

Files changed (67)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +8 -5
  5. liger_kernel/chunked_loss/jsd_loss.py +18 -5
  6. liger_kernel/ops/cross_entropy.py +65 -11
  7. liger_kernel/ops/dyt.py +5 -2
  8. liger_kernel/ops/fused_add_rms_norm.py +5 -1
  9. liger_kernel/ops/fused_linear_cross_entropy.py +43 -13
  10. liger_kernel/ops/geglu.py +2 -1
  11. liger_kernel/ops/group_norm.py +2 -1
  12. liger_kernel/ops/grpo_loss.py +3 -1
  13. liger_kernel/ops/layer_norm.py +86 -66
  14. liger_kernel/ops/poly_norm.py +390 -0
  15. liger_kernel/ops/rms_norm.py +7 -2
  16. liger_kernel/ops/tiled_mlp.py +136 -0
  17. liger_kernel/ops/utils.py +2 -0
  18. liger_kernel/transformers/__init__.py +27 -0
  19. liger_kernel/transformers/cross_entropy.py +8 -3
  20. liger_kernel/transformers/functional.py +29 -6
  21. liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
  22. liger_kernel/transformers/grpo_loss.py +56 -1
  23. liger_kernel/transformers/model/falcon_h1.py +19 -5
  24. liger_kernel/transformers/model/gemma.py +17 -6
  25. liger_kernel/transformers/model/gemma2.py +14 -5
  26. liger_kernel/transformers/model/gemma3.py +25 -12
  27. liger_kernel/transformers/model/glm4.py +16 -4
  28. liger_kernel/transformers/model/glm4v.py +16 -4
  29. liger_kernel/transformers/model/glm4v_moe.py +23 -4
  30. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  31. liger_kernel/transformers/model/internvl.py +12 -5
  32. liger_kernel/transformers/model/llama.py +14 -5
  33. liger_kernel/transformers/model/llama4.py +16 -4
  34. liger_kernel/transformers/model/llava.py +12 -4
  35. liger_kernel/transformers/model/loss_utils.py +31 -3
  36. liger_kernel/transformers/model/mistral.py +15 -6
  37. liger_kernel/transformers/model/mixtral.py +16 -7
  38. liger_kernel/transformers/model/mllama.py +12 -4
  39. liger_kernel/transformers/model/olmo2.py +16 -4
  40. liger_kernel/transformers/model/olmo3.py +142 -0
  41. liger_kernel/transformers/model/output_classes.py +147 -0
  42. liger_kernel/transformers/model/paligemma.py +22 -5
  43. liger_kernel/transformers/model/phi3.py +14 -7
  44. liger_kernel/transformers/model/qwen2.py +16 -3
  45. liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
  46. liger_kernel/transformers/model/qwen2_vl.py +16 -4
  47. liger_kernel/transformers/model/qwen3.py +20 -5
  48. liger_kernel/transformers/model/qwen3_moe.py +19 -5
  49. liger_kernel/transformers/model/qwen3_next.py +146 -0
  50. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  51. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  52. liger_kernel/transformers/model/smollm3.py +15 -6
  53. liger_kernel/transformers/model/smolvlm.py +158 -0
  54. liger_kernel/transformers/monkey_patch.py +594 -19
  55. liger_kernel/transformers/poly_norm.py +42 -0
  56. liger_kernel/transformers/rms_norm.py +7 -0
  57. liger_kernel/transformers/rope.py +43 -0
  58. liger_kernel/transformers/swiglu.py +17 -0
  59. liger_kernel/transformers/tiled_mlp.py +133 -0
  60. liger_kernel/utils.py +25 -0
  61. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/METADATA +4 -1
  62. liger_kernel_nightly-0.6.4.dev20251202054858.dist-info/RECORD +118 -0
  63. liger_kernel_nightly-0.6.2.dev20251011154427.dist-info/RECORD +0 -107
  64. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/LICENSE +0 -0
  65. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/NOTICE +0 -0
  66. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/WHEEL +0 -0
  67. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/top_level.txt +0 -0
@@ -6,9 +6,10 @@ from typing import Union
  import torch
 
  from transformers.modeling_outputs import BaseModelOutputWithPast
- from transformers.modeling_outputs import CausalLMOutputWithPast
 
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
  def lce_forward(
@@ -27,7 +28,7 @@ def lce_forward(
  logits_to_keep: Union[int, torch.Tensor] = 0,
  skip_logits: Optional[bool] = None,
  **kwargs,
- ) -> Union[Tuple, CausalLMOutputWithPast]:
+ ) -> Union[Tuple, LigerCausalLMOutputWithPast]:
  r"""
  Example:
 
@@ -71,6 +72,7 @@ def lce_forward(
  shift_labels = kwargs.pop("shift_labels", None)
  logits = None
  loss = None
+ token_accuracy = None
 
  if skip_logits and labels is None and shift_labels is None:
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -79,8 +81,9 @@
  # By default, if in training mode, don't materialize logits
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=kept_hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -88,7 +91,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
-
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
  else:
  logits = self.lm_head(kept_hidden_states)
  if labels is not None or shift_labels is not None:
@@ -101,13 +104,17 @@
  )
 
  if not return_dict:
- output = (logits,) + outputs[1:]
- return (loss,) + output if loss is not None else output
+ output_tuple = (logits,) + outputs[1:]
+ output = (loss,) + output_tuple if loss is not None else output_tuple
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
 
- return CausalLMOutputWithPast(
+ # Return custom output class with token_accuracy field
+ return LigerCausalLMOutputWithPast(
  loss=loss,
  logits=logits,
  past_key_values=outputs.past_key_values,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
+ token_accuracy=token_accuracy,
  )
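The hunks above (and every per-model hunk that follows) lean on two new helpers: `unpack_cross_entropy_result` from `loss_utils.py` and the output classes from the new `output_classes.py` (+147 lines), neither of which is shown in this excerpt. Inferred purely from the call sites, a minimal sketch of their shape might look like the following; the middle element of the unpacked triple (discarded as `_` above) is assumed here to be an auxiliary z-loss term, and the field defaults are likewise assumptions rather than the package's actual definitions.

```python
from dataclasses import dataclass
from typing import Optional, Tuple, Union

import torch
from transformers.modeling_outputs import CausalLMOutputWithPast


@dataclass
class LigerCausalLMOutputWithPast(CausalLMOutputWithPast):
    # Assumed shape: the upstream output plus the token_accuracy field populated above.
    token_accuracy: Optional[torch.Tensor] = None


def unpack_cross_entropy_result(
    result: Union[torch.Tensor, Tuple[torch.Tensor, ...]],
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
    # Normalize whatever the fused loss returns into (loss, aux_term, token_accuracy),
    # so call sites can always unpack three values regardless of configuration.
    if isinstance(result, tuple):
        padded = tuple(result) + (None,) * max(0, 3 - len(result))
        return padded[0], padded[1], padded[2]
    return result, None, None
```

Whatever the real implementation does, the patched forwards only rely on the first and third positions, which is why the second value is discarded at every call site.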
@@ -11,6 +11,8 @@ from transformers.utils.deprecation import deprecate_kwarg
 
  from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
  def lce_forward_deprecated(
@@ -145,7 +147,7 @@ def lce_forward(
  logits_to_keep: Union[int, torch.Tensor] = 0,
  skip_logits: Optional[bool] = None,
  **kwargs,
- ) -> Union[Tuple, CausalLMOutputWithPast]:
+ ) -> Union[Tuple, LigerCausalLMOutputWithPast]:
  r"""
  Args:
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -208,6 +210,7 @@ def lce_forward(
  shift_labels = kwargs.pop("shift_labels", None)
  logits = None
  loss = None
+ token_accuracy = None
 
  if skip_logits and labels is None and shift_labels is None:
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -216,8 +219,9 @@
  # By default, if in training mode, don't materialize logits
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=kept_hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -225,6 +229,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
 
  else:
  logits = self.lm_head(kept_hidden_states)
@@ -237,10 +242,18 @@
  **kwargs,
  )
 
- return CausalLMOutputWithPast(
+ if not return_dict:
+ output_tuple = (logits,) + outputs[1:]
+ output = (loss,) + output_tuple if loss is not None else output_tuple
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
+
+ # Return custom output class with token accuracy field
+ return LigerCausalLMOutputWithPast(
  loss=loss,
  logits=logits,
  past_key_values=outputs.past_key_values,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
+ token_accuracy=token_accuracy,
  )
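From a user's perspective all of these per-model patches behave the same way: apply the Liger patch, run a training step with labels, and read the loss (and now, optionally, the token accuracy) from the returned output. A hedged usage sketch follows; the checkpoint name and the `apply_liger_kernel_to_llama` entry point are chosen for illustration, and whether `token_accuracy` is actually populated depends on how the fused loss is configured.

```python
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

from liger_kernel.transformers import apply_liger_kernel_to_llama

apply_liger_kernel_to_llama()  # patch the HF modeling code before instantiating the model
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")

batch = tokenizer("Liger fuses the LM head projection into the loss.", return_tensors="pt")
labels = batch["input_ids"].clone()

model.train()  # in training mode with labels, skip_logits defaults to True
out = model(**batch, labels=labels)
print(out.loss)                              # fused linear cross-entropy loss
print(getattr(out, "token_accuracy", None))  # new field; may be None if accuracy is not computed
```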
@@ -5,10 +5,11 @@ from typing import Union
 
  import torch
 
- from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast
  from transformers.utils import can_return_tuple
 
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast
 
 
  @can_return_tuple
@@ -33,7 +34,7 @@
  second_per_grid_ts: Optional[torch.Tensor] = None,
  skip_logits: Optional[bool] = None,
  **kwargs,
- ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
+ ) -> Union[Tuple, LigerQwen2_5_VLCausalLMOutputWithPast]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -113,6 +114,7 @@
  shift_labels = kwargs.pop("shift_labels", None)
  loss = None
  logits = None
+ token_accuracy = None
 
  if skip_logits and labels is None and shift_labels is None:
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -120,8 +122,9 @@
  if skip_logits is None:
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -129,6 +132,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
  else:
  logits = self.lm_head(hidden_states)
 
@@ -142,14 +146,18 @@
  )
 
  if not return_dict:
- output = (logits,) + outputs[1:]
- return (loss,) + output if loss is not None else output
+ output_tuple = (logits,) + outputs[1:]
+ output = (loss,) + output_tuple if loss is not None else output_tuple
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
 
- return Qwen2_5_VLCausalLMOutputWithPast(
+ # Return Qwen2.5-VL output with token accuracy
+ return LigerQwen2_5_VLCausalLMOutputWithPast(
  loss=loss,
  logits=logits,
  past_key_values=outputs.past_key_values,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
  rope_deltas=outputs.rope_deltas,
+ token_accuracy=token_accuracy,
  )
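One consequence of the tuple branch above is easy to miss: when `return_dict=False`, `token_accuracy` is appended after the usual `(loss, logits, past_key_values, hidden_states, attentions, rope_deltas)` entries, so positional unpacking written against the old layout still works but silently ignores the new value. A small, hedged sketch of a caller that copes with both return styles (only the attribute name comes from the diff; the rest is illustrative):

```python
def read_loss_and_accuracy(outputs, accuracy_enabled: bool):
    # Dataclass-style outputs (return_dict=True): read fields by name.
    if hasattr(outputs, "loss"):
        return outputs.loss, getattr(outputs, "token_accuracy", None)
    # Legacy tuple: token_accuracy, when it was computed, is the trailing element.
    # There is no structural way to tell from the tuple alone, hence the explicit flag.
    loss = outputs[0]
    token_accuracy = outputs[-1] if accuracy_enabled else None
    return loss, token_accuracy
```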
@@ -5,10 +5,11 @@ from typing import Union
 
  import torch
 
- from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
  from transformers.utils import can_return_tuple
 
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast
 
 
  @can_return_tuple
@@ -32,7 +33,7 @@
  cache_position: Optional[torch.LongTensor] = None,
  skip_logits: Optional[bool] = None,
  **kwargs,
- ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
+ ) -> Union[Tuple, LigerQwen2VLCausalLMOutputWithPast]:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -109,6 +110,7 @@
  shift_labels = kwargs.pop("shift_labels", None)
  loss = None
  logits = None
+ token_accuracy = None
 
  if skip_logits and labels is None and shift_labels is None:
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -116,8 +118,9 @@
  if skip_logits is None:
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -125,6 +128,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
  else:
  logits = self.lm_head(hidden_states)
 
@@ -137,11 +141,19 @@
  vocab_size=self.config.vocab_size,
  )
 
- return Qwen2VLCausalLMOutputWithPast(
+ if not return_dict:
+ output_tuple = (logits,) + outputs[1:]
+ output = (loss,) + output_tuple if loss is not None else output_tuple
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
+
+ # Return Qwen2VL output with token accuracy
+ return LigerQwen2VLCausalLMOutputWithPast(
  loss=loss,
  logits=logits,
  past_key_values=outputs.past_key_values,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
  rope_deltas=outputs.rope_deltas,
+ token_accuracy=token_accuracy,
  )
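The Qwen2-VL patch mirrors the Qwen2.5-VL one above; both return model-specific Liger output classes defined in the new `output_classes.py`. Judging from the fields populated in these hunks, each of those classes is presumably just the corresponding upstream output with one extra field, along the lines of the sketch below (an assumption, not the file's actual contents):

```python
from dataclasses import dataclass
from typing import Optional

import torch
from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast


@dataclass
class LigerQwen2VLCausalLMOutputWithPast(Qwen2VLCausalLMOutputWithPast):
    # loss, logits, past_key_values, hidden_states, attentions and rope_deltas are
    # inherited from the upstream class; only the accuracy field is new.
    token_accuracy: Optional[torch.Tensor] = None
```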
@@ -4,9 +4,9 @@ from typing import Union
 
  import torch
 
- from transformers.modeling_outputs import CausalLMOutputWithPast
-
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
  def lce_forward(
@@ -23,8 +23,9 @@ def lce_forward(
  cache_position: Optional[torch.LongTensor] = None,
  logits_to_keep: Union[int, torch.Tensor] = 0,
  skip_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
  **kwargs,
- ) -> CausalLMOutputWithPast:
+ ) -> LigerCausalLMOutputWithPast:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -60,6 +61,7 @@
  output_hidden_states = (
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
  outputs = self.model(
@@ -81,8 +83,11 @@
  kept_hidden_states = hidden_states[:, slice_indices, :]
 
  shift_labels = kwargs.pop("shift_labels", None)
+ # Remove output-control parameters that shouldn't be passed to loss functions
+ kwargs.pop("return_dict", None)
  logits = None
  loss = None
+ token_accuracy = None
 
  if skip_logits and labels is None and shift_labels is None:
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -91,8 +96,9 @@
  # By default, if in training mode, don't materialize logits
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=kept_hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -100,6 +106,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
 
  else:
  logits = self.lm_head(kept_hidden_states)
@@ -112,10 +119,18 @@
  **kwargs,
  )
 
- return CausalLMOutputWithPast(
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ output = ((loss,) + output) if loss is not None else output
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
+
+ # Return custom output class with accuracy field
+ return LigerCausalLMOutputWithPast(
  loss=loss,
  logits=logits,
  past_key_values=outputs.past_key_values,
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
+ token_accuracy=token_accuracy,
  )
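The new `kwargs.pop("return_dict", None)` line above is defensive: whatever remains in `**kwargs` is forwarded into the loss path, and an output-control flag like `return_dict` would surface there as an unexpected keyword. A toy illustration of the failure mode this guards against (generic code, not Liger's actual loss signature):

```python
import torch


def toy_fused_loss(hidden_states: torch.Tensor, labels, **kwargs):
    # Stand-in for a fused loss path that only understands loss-related keywords.
    allowed = {"shift_labels", "ignore_index"}
    unexpected = set(kwargs) - allowed
    if unexpected:
        raise TypeError(f"unexpected keyword arguments: {sorted(unexpected)}")
    return hidden_states.float().mean()


forward_kwargs = {"return_dict": True, "shift_labels": None}
forward_kwargs.pop("return_dict", None)  # mirrors the patched forward's kwargs hygiene
loss = toy_fused_loss(torch.zeros(2, 4), labels=None, **forward_kwargs)
```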
@@ -4,11 +4,12 @@ from typing import Union
 
  import torch
 
- from transformers.modeling_outputs import MoeCausalLMOutputWithPast
  from transformers.modeling_outputs import MoeModelOutputWithPast
  from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func
 
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast
 
 
  def lce_forward(
@@ -26,8 +27,9 @@ def lce_forward(
  cache_position: Optional[torch.LongTensor] = None,
  logits_to_keep: Union[int, torch.Tensor] = 0,
  skip_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
  **kwargs,
- ) -> MoeCausalLMOutputWithPast:
+ ) -> LigerMoeCausalLMOutputWithPast:
  r"""
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -64,10 +66,10 @@
  output_router_logits = (
  output_router_logits if output_router_logits is not None else self.config.output_router_logits
  )
-
  output_hidden_states = (
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
  )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
  outputs: MoeModelOutputWithPast = self.model(
@@ -92,12 +94,14 @@
  shift_labels = kwargs.pop("shift_labels", None)
  logits = None
  loss = None
+ token_accuracy = None
 
  if skip_logits is None:
  skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+ # Compute loss
  if skip_logits:
- loss = LigerForCausalLMLoss(
+ result = LigerForCausalLMLoss(
  hidden_states=kept_hidden_states,
  lm_head_weight=self.lm_head.weight,
  labels=labels,
@@ -105,6 +109,7 @@
  hidden_size=self.config.hidden_size,
  **kwargs,
  )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
  else: # if in inference model materialize logits
  logits = self.lm_head(kept_hidden_states)
  if labels is not None or shift_labels is not None:
@@ -127,7 +132,15 @@
  if labels is not None:
  loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
 
- return MoeCausalLMOutputWithPast(
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ output = ((aux_loss,) + output) if aux_loss is not None else output
+ output = ((loss,) + output) if loss is not None else output
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
+
+ # Return custom output class with accuracy field
+ return LigerMoeCausalLMOutputWithPast(
  loss=loss,
  aux_loss=aux_loss,
  logits=logits,
@@ -135,4 +148,5 @@ def lce_forward(
  hidden_states=outputs.hidden_states,
  attentions=outputs.attentions,
  router_logits=outputs.router_logits,
+ token_accuracy=token_accuracy,
  )
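For the MoE models the final training loss is composed exactly as in the upstream Mixtral forward: the (now fused) causal LM loss plus the scaled load-balancing loss, with the auxiliary term moved onto the LM loss's device first. Note also that the tuple branch preserves the legacy ordering `(loss, aux_loss, logits, ...)` and again appends `token_accuracy` at the end. Restated as a standalone snippet with illustrative values:

```python
import torch

lm_loss = torch.tensor(2.31)   # from LigerForCausalLMLoss via unpack_cross_entropy_result
aux_loss = torch.tensor(0.05)  # from load_balancing_loss_func over the router logits
router_aux_loss_coef = 0.02    # self.router_aux_loss_coef in the forward above

# total = lm_loss + coef * aux_loss, computed on lm_loss's device so the addition is device-safe
total_loss = lm_loss + router_aux_loss_coef * aux_loss.to(lm_loss.device)
```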
@@ -0,0 +1,146 @@
+ from typing import TYPE_CHECKING
+ from typing import List
+ from typing import Optional
+ from typing import Union
+
+ import torch
+
+ from transformers.modeling_outputs import MoeModelOutputWithPast
+
+ if TYPE_CHECKING:
+ from transformers.models.qwen3_next.modeling_qwen3_next import load_balancing_loss_func
+
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+ from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast
+
+
+ def lce_forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ cache_position: Optional[torch.LongTensor] = None,
+ logits_to_keep: Union[int, torch.Tensor] = 0,
+ skip_logits: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> LigerMoeCausalLMOutputWithPast:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ logits_to_keep (`int` or `torch.Tensor`, *optional*):
+ If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+ `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+ token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+ If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+ This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+ >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
+ >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
+
+ >>> prompt = "Give me a short introduction to large language model."
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_router_logits = (
+ output_router_logits if output_router_logits is not None else self.config.output_router_logits
+ )
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs: MoeModelOutputWithPast = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ output_router_logits=output_router_logits,
+ cache_position=cache_position,
+ **kwargs,
+ )
+
+ hidden_states = outputs.last_hidden_state
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+ kept_hidden_states = hidden_states[:, slice_indices, :]
+
+ shift_labels = kwargs.pop("shift_labels", None)
+ logits = None
+ loss = None
+ token_accuracy = None
+
+ if skip_logits is None:
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+ if skip_logits:
+ result = LigerForCausalLMLoss(
+ hidden_states=kept_hidden_states,
+ lm_head_weight=self.lm_head.weight,
+ labels=labels,
+ shift_labels=shift_labels,
+ hidden_size=self.config.hidden_size,
+ **kwargs,
+ )
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
+ else: # if in inference model materialize logits
+ logits = self.lm_head(kept_hidden_states)
+ if labels is not None or shift_labels is not None:
+ loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+ aux_loss = None
+ if output_router_logits:
+ aux_loss = load_balancing_loss_func(
+ outputs.router_logits,
+ self.num_experts,
+ self.num_experts_per_tok,
+ attention_mask,
+ )
+ if labels is not None:
+ loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ output = ((aux_loss,) + output) if aux_loss is not None else output
+ output = ((loss,) + output) if loss is not None else output
+ output = output + (token_accuracy,) if token_accuracy is not None else output
+ return output
+
+ return LigerMoeCausalLMOutputWithPast(
+ loss=loss,
+ aux_loss=aux_loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_logits,
+ token_accuracy=token_accuracy,
+ )
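This brand-new `lce_forward` for Qwen3-Next only takes effect once it is bound onto the Hugging Face model class, which is the job of `liger_kernel/transformers/monkey_patch.py` (also changed in this release, +594 -19) together with the new exports in `transformers/__init__.py`. The exact entry-point name for Qwen3-Next is not visible in this excerpt; the sketch below shows the general shape such a patch usually takes, with the function name and the upstream class name treated as assumptions.

```python
# Hypothetical sketch only; the real patch lives in liger_kernel/transformers/monkey_patch.py.
from liger_kernel.transformers.model.qwen3_next import lce_forward as qwen3_next_lce_forward


def apply_liger_kernel_to_qwen3_next_sketch(fused_linear_cross_entropy: bool = True) -> None:
    # Requires a transformers release that ships the qwen3_next architecture.
    from transformers.models.qwen3_next import modeling_qwen3_next

    if fused_linear_cross_entropy:
        # Rebind the class-level forward so every instance picks up the fused loss path
        # (and, with it, the optional token_accuracy reporting shown above).
        modeling_qwen3_next.Qwen3NextForCausalLM.forward = qwen3_next_lce_forward
```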