liger-kernel 0.6.2__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff compares two publicly released versions of the package as published to their public registry and is provided for informational purposes only.
Files changed (61)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +46 -9
  5. liger_kernel/chunked_loss/jsd_loss.py +23 -7
  6. liger_kernel/ops/cross_entropy.py +118 -62
  7. liger_kernel/ops/fused_linear_cross_entropy.py +97 -13
  8. liger_kernel/ops/grpo_loss.py +3 -1
  9. liger_kernel/ops/layer_norm.py +86 -69
  10. liger_kernel/ops/poly_norm.py +386 -0
  11. liger_kernel/ops/tiled_mlp.py +136 -0
  12. liger_kernel/transformers/__init__.py +36 -0
  13. liger_kernel/transformers/cross_entropy.py +8 -3
  14. liger_kernel/transformers/functional.py +31 -6
  15. liger_kernel/transformers/fused_linear_cross_entropy.py +13 -4
  16. liger_kernel/transformers/grpo_loss.py +56 -1
  17. liger_kernel/transformers/model/falcon_h1.py +122 -0
  18. liger_kernel/transformers/model/gemma.py +19 -7
  19. liger_kernel/transformers/model/gemma2.py +22 -7
  20. liger_kernel/transformers/model/gemma3.py +52 -14
  21. liger_kernel/transformers/model/glm4.py +18 -5
  22. liger_kernel/transformers/model/glm4v.py +19 -6
  23. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  24. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  25. liger_kernel/transformers/model/internvl.py +157 -0
  26. liger_kernel/transformers/model/llama.py +16 -6
  27. liger_kernel/transformers/model/llama4.py +18 -5
  28. liger_kernel/transformers/model/llava.py +18 -6
  29. liger_kernel/transformers/model/loss_utils.py +32 -3
  30. liger_kernel/transformers/model/mistral.py +17 -7
  31. liger_kernel/transformers/model/mixtral.py +24 -9
  32. liger_kernel/transformers/model/mllama.py +14 -5
  33. liger_kernel/transformers/model/olmo2.py +18 -5
  34. liger_kernel/transformers/model/olmo3.py +142 -0
  35. liger_kernel/transformers/model/output_classes.py +147 -0
  36. liger_kernel/transformers/model/paligemma.py +41 -5
  37. liger_kernel/transformers/model/phi3.py +16 -8
  38. liger_kernel/transformers/model/qwen2.py +18 -4
  39. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  40. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  41. liger_kernel/transformers/model/qwen3.py +22 -6
  42. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  43. liger_kernel/transformers/model/qwen3_next.py +146 -0
  44. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  45. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  46. liger_kernel/transformers/model/smollm3.py +17 -7
  47. liger_kernel/transformers/model/smolvlm.py +158 -0
  48. liger_kernel/transformers/monkey_patch.py +830 -3
  49. liger_kernel/transformers/multi_token_attention.py +1 -1
  50. liger_kernel/transformers/poly_norm.py +42 -0
  51. liger_kernel/transformers/rms_norm.py +7 -0
  52. liger_kernel/transformers/rope.py +43 -0
  53. liger_kernel/transformers/swiglu.py +17 -0
  54. liger_kernel/transformers/tiled_mlp.py +133 -0
  55. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/METADATA +16 -10
  56. liger_kernel-0.6.4.dist-info/RECORD +118 -0
  57. liger_kernel-0.6.2.dist-info/RECORD +0 -104
  58. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/WHEEL +0 -0
  59. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/licenses/LICENSE +0 -0
  60. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/licenses/NOTICE +0 -0
  61. {liger_kernel-0.6.2.dist-info → liger_kernel-0.6.4.dist-info}/top_level.txt +0 -0
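Besides the per-model patches shown below, this release ships several standalone modules (the poly_norm and tiled_mlp ops plus their transformers wrappers, and the new output_classes module). A quick post-upgrade sanity check; the module paths are taken from the file list above, while the environment and installed extras are assumptions:

```python
# Sanity check after upgrading: report the installed version and confirm the
# modules added in this diff are importable.
import importlib
from importlib.metadata import version

print("liger-kernel", version("liger-kernel"))  # expect 0.6.4

new_modules = [
    "liger_kernel.ops.poly_norm",
    "liger_kernel.ops.tiled_mlp",
    "liger_kernel.transformers.poly_norm",
    "liger_kernel.transformers.tiled_mlp",
    "liger_kernel.transformers.model.output_classes",
]
for name in new_modules:
    try:
        importlib.import_module(name)
        print("ok  ", name)
    except ImportError as exc:  # e.g. still on 0.6.2, or missing optional deps
        print("fail", name, exc)
```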
@@ -11,6 +11,8 @@ from transformers.utils.deprecation import deprecate_kwarg

 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast


 def lce_forward_deprecated(
@@ -145,7 +147,7 @@ def lce_forward(
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
     r"""
     Args:
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -208,6 +210,7 @@ def lce_forward(
     shift_labels = kwargs.pop("shift_labels", None)
     logits = None
     loss = None
+    token_accuracy = None

     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -216,8 +219,9 @@ def lce_forward(
         # By default, if in training mode, don't materialize logits
         skip_logits = self.training and (labels is not None or shift_labels is not None)

+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -225,21 +229,31 @@ def lce_forward(
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)

     else:
         logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
+        if labels is not None or shift_labels is not None:
             loss = self.loss_function(
                 logits=logits,
                 labels=labels,
+                shift_labels=shift_labels,
                 vocab_size=self.config.vocab_size,
                 **kwargs,
             )

-    return CausalLMOutputWithPast(
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return custom output class with token accuracy field
+    return LigerCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
     )
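The hunks above show the pattern repeated across the model patches in this release: `LigerForCausalLMLoss` now returns a result that `unpack_cross_entropy_result` splits into the loss and an optional `token_accuracy`, and the forward returns a `LigerCausalLMOutputWithPast` carrying that field (or appends it to the legacy tuple when `return_dict` is false). A minimal consumer-side sketch, assuming a model whose forward has already been replaced by this patched `lce_forward`; whether `token_accuracy` is actually populated depends on how the fused loss is configured:

```python
# Minimal sketch: read the new token_accuracy field during training.
# `model` and `batch` are placeholders; skip_logits=True takes the fused path
# that computes the loss without materializing logits.
def training_step(model, batch):
    model.train()
    outputs = model(
        input_ids=batch["input_ids"],
        attention_mask=batch["attention_mask"],
        labels=batch["labels"],
        skip_logits=True,
    )
    # With return_dict=True the patched forward returns LigerCausalLMOutputWithPast.
    if outputs.token_accuracy is not None:
        print(f"token accuracy: {float(outputs.token_accuracy):.4f}")
    return outputs.loss
```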
@@ -5,10 +5,11 @@ from typing import Union

 import torch

-from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast
 from transformers.utils import can_return_tuple

 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast


 @can_return_tuple
@@ -33,7 +34,7 @@ def lce_forward(
     second_per_grid_ts: Optional[torch.Tensor] = None,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
+) -> Union[Tuple, LigerQwen2_5_VLCausalLMOutputWithPast]:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -113,6 +114,7 @@ def lce_forward(
     shift_labels = kwargs.pop("shift_labels", None)
     loss = None
     logits = None
+    token_accuracy = None

     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -120,8 +122,9 @@ def lce_forward(
     if skip_logits is None:
         skip_logits = self.training and (labels is not None or shift_labels is not None)

+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -129,22 +132,32 @@ def lce_forward(
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:
         logits = self.lm_head(hidden_states)

         loss = None
-        if labels is not None:
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+            )

     if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output

-    return Qwen2_5_VLCausalLMOutputWithPast(
+    # Return Qwen2.5-VL output with token accuracy
+    return LigerQwen2_5_VLCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
         rope_deltas=outputs.rope_deltas,
+        token_accuracy=token_accuracy,
     )
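Note that the non-fused branch above now forwards `shift_labels` to `self.loss_function` and accepts it as an alternative to `labels`. A hedged illustration of supplying pre-shifted targets from the caller side, assuming the usual transformers convention that `shift_labels` is already aligned to next-token prediction (`model` and `batch` are placeholders):

```python
# Build next-token targets on the caller side: shift labels left by one and
# ignore the final position (-100). This convention is assumed, not spelled
# out in the diff itself.
labels = batch["labels"]                           # (batch, seq_len)
shift_labels = labels.new_full(labels.shape, -100)
shift_labels[:, :-1] = labels[:, 1:]

# The patched forward pops `shift_labels` from **kwargs, so it can be passed
# straight to the model; `labels` may then be omitted.
outputs = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    shift_labels=shift_labels,
)
```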
@@ -5,10 +5,11 @@ from typing import Union

 import torch

-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
 from transformers.utils import can_return_tuple

 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast


 @can_return_tuple
@@ -32,7 +33,7 @@ def lce_forward(
     cache_position: Optional[torch.LongTensor] = None,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
+) -> Union[Tuple, LigerQwen2VLCausalLMOutputWithPast]:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -109,6 +110,7 @@ def lce_forward(
     shift_labels = kwargs.pop("shift_labels", None)
     loss = None
     logits = None
+    token_accuracy = None

     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -116,8 +118,9 @@ def lce_forward(
     if skip_logits is None:
         skip_logits = self.training and (labels is not None or shift_labels is not None)

+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -125,18 +128,32 @@ def lce_forward(
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:
         logits = self.lm_head(hidden_states)

         loss = None
-        if labels is not None:
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
-
-    return Qwen2VLCausalLMOutputWithPast(
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+            )
+
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return Qwen2VL output with token accuracy
+    return LigerQwen2VLCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
         rope_deltas=outputs.rope_deltas,
+        token_accuracy=token_accuracy,
     )
@@ -4,9 +4,9 @@ from typing import Union

 import torch

-from transformers.modeling_outputs import CausalLMOutputWithPast
-
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast


 def lce_forward(
@@ -23,8 +23,9 @@ def lce_forward(
     cache_position: Optional[torch.LongTensor] = None,
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
     **kwargs,
-) -> CausalLMOutputWithPast:
+) -> LigerCausalLMOutputWithPast:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -60,6 +61,7 @@ def lce_forward(
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

     # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
     outputs = self.model(
@@ -81,8 +83,11 @@ def lce_forward(
     kept_hidden_states = hidden_states[:, slice_indices, :]

     shift_labels = kwargs.pop("shift_labels", None)
+    # Remove output-control parameters that shouldn't be passed to loss functions
+    kwargs.pop("return_dict", None)
     logits = None
     loss = None
+    token_accuracy = None

     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -91,8 +96,9 @@ def lce_forward(
         # By default, if in training mode, don't materialize logits
         skip_logits = self.training and (labels is not None or shift_labels is not None)

+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -100,21 +106,31 @@ def lce_forward(
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)

     else:
         logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
+        if labels is not None or shift_labels is not None:
             loss = self.loss_function(
                 logits=logits,
                 labels=labels,
+                shift_labels=shift_labels,
                 vocab_size=self.config.vocab_size,
                 **kwargs,
             )

-    return CausalLMOutputWithPast(
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return custom output class with accuracy field
+    return LigerCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
     )
@@ -4,11 +4,12 @@ from typing import Union

 import torch

-from transformers.modeling_outputs import MoeCausalLMOutputWithPast
 from transformers.modeling_outputs import MoeModelOutputWithPast
 from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func

 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast


 def lce_forward(
@@ -26,8 +27,9 @@ def lce_forward(
     cache_position: Optional[torch.LongTensor] = None,
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
     **kwargs,
-) -> MoeCausalLMOutputWithPast:
+) -> LigerMoeCausalLMOutputWithPast:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -64,10 +66,10 @@ def lce_forward(
     output_router_logits = (
         output_router_logits if output_router_logits is not None else self.config.output_router_logits
     )
-
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict

     # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
     outputs: MoeModelOutputWithPast = self.model(
@@ -92,12 +94,14 @@ def lce_forward(
     shift_labels = kwargs.pop("shift_labels", None)
     logits = None
     loss = None
+    token_accuracy = None

     if skip_logits is None:
         skip_logits = self.training and (labels is not None or shift_labels is not None)

+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -105,10 +109,17 @@ def lce_forward(
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:  # if in inference model materialize logits
         logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
-            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.vocab_size,
+                **kwargs,
+            )

     aux_loss = None
     if output_router_logits:
@@ -121,7 +132,15 @@ def lce_forward(
         if labels is not None:
             loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device

-    return MoeCausalLMOutputWithPast(
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((aux_loss,) + output) if aux_loss is not None else output
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return custom output class with accuracy field
+    return LigerMoeCausalLMOutputWithPast(
         loss=loss,
         aux_loss=aux_loss,
         logits=logits,
@@ -129,4 +148,5 @@ def lce_forward(
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
         router_logits=outputs.router_logits,
+        token_accuracy=token_accuracy,
     )
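For the MoE forwards, the change most likely to affect existing callers is the legacy tuple path: `aux_loss` is inserted right after `loss`, and the new `token_accuracy` element is appended at the end when it is not `None`. A short sketch of the resulting layout, with `model` and `batch` as placeholders:

```python
# Tuple layout for the Mixtral forward above when return_dict=False:
#   (loss,)? + (aux_loss,)? + (logits,) + outputs[1:] + (token_accuracy,)?
# loss leads when labels/shift_labels were given, aux_loss follows only when
# output_router_logits is enabled, and token_accuracy (new in 0.6.4) trails
# when computed, so index-based callers that assumed a fixed tuple length
# should account for the extra element.
out = model(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
    labels=batch["labels"],
    return_dict=False,
)
loss = out[0]  # present here because labels were supplied
```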
@@ -0,0 +1,146 @@
+from typing import TYPE_CHECKING
+from typing import List
+from typing import Optional
+from typing import Union
+
+import torch
+
+from transformers.modeling_outputs import MoeModelOutputWithPast
+
+if TYPE_CHECKING:
+    from transformers.models.qwen3_next.modeling_qwen3_next import load_balancing_loss_func
+
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast
+
+
+def lce_forward(
+    self,
+    input_ids: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional[List[torch.FloatTensor]] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    output_router_logits: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    skip_logits: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    **kwargs,
+) -> LigerMoeCausalLMOutputWithPast:
+    r"""
+    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+        config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+        (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    logits_to_keep (`int` or `torch.Tensor`, *optional*):
+        If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
+        `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
+        token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+        If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+        This is useful when using packed tensor format (single dimension for batch and sequence length).
+
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoModelForCausalLM, AutoTokenizer
+
+    >>> model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
+    >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-Next-80B-A3B-Instruct")
+
+    >>> prompt = "Give me a short introduction to large language model."
+    >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+    >>> # Generate
+    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_router_logits = (
+        output_router_logits if output_router_logits is not None else self.config.output_router_logits
+    )
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs: MoeModelOutputWithPast = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        output_router_logits=output_router_logits,
+        cache_position=cache_position,
+        **kwargs,
+    )
+
+    hidden_states = outputs.last_hidden_state
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+    token_accuracy = None
+
+    if skip_logits is None:
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    if skip_logits:
+        result = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.hidden_size,
+            **kwargs,
+        )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
+    else:  # if in inference model materialize logits
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+    aux_loss = None
+    if output_router_logits:
+        aux_loss = load_balancing_loss_func(
+            outputs.router_logits,
+            self.num_experts,
+            self.num_experts_per_tok,
+            attention_mask,
+        )
+        if labels is not None:
+            loss += self.router_aux_loss_coef * aux_loss.to(loss.device)  # make sure to reside in the same device
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((aux_loss,) + output) if aux_loss is not None else output
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    return LigerMoeCausalLMOutputWithPast(
+        loss=loss,
+        aux_loss=aux_loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        router_logits=outputs.router_logits,
+        token_accuracy=token_accuracy,
+    )
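The new `liger_kernel/transformers/model/qwen3_next.py` provides this `lce_forward` only; wiring it onto a model is normally done through the monkey-patch helpers that also changed in this release. As a rough illustration only, not the library's documented entry point, and assuming your transformers build exposes `Qwen3NextForCausalLM`:

```python
# Manual binding sketch: replace the stock forward with the fused-loss version
# defined above. The transformers class name is assumed to exist in your
# install; prefer liger-kernel's own monkey_patch helpers where available.
from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM

from liger_kernel.transformers.model.qwen3_next import lce_forward

Qwen3NextForCausalLM.forward = lce_forward
```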