liger-kernel 0.6.3__py3-none-any.whl → 0.6.4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (55)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +8 -5
  5. liger_kernel/chunked_loss/jsd_loss.py +18 -5
  6. liger_kernel/ops/cross_entropy.py +59 -9
  7. liger_kernel/ops/fused_linear_cross_entropy.py +30 -4
  8. liger_kernel/ops/grpo_loss.py +3 -1
  9. liger_kernel/ops/layer_norm.py +84 -65
  10. liger_kernel/ops/tiled_mlp.py +136 -0
  11. liger_kernel/transformers/__init__.py +19 -0
  12. liger_kernel/transformers/cross_entropy.py +8 -3
  13. liger_kernel/transformers/functional.py +24 -6
  14. liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
  15. liger_kernel/transformers/grpo_loss.py +56 -1
  16. liger_kernel/transformers/model/falcon_h1.py +19 -5
  17. liger_kernel/transformers/model/gemma.py +17 -6
  18. liger_kernel/transformers/model/gemma2.py +14 -5
  19. liger_kernel/transformers/model/gemma3.py +25 -12
  20. liger_kernel/transformers/model/glm4.py +16 -4
  21. liger_kernel/transformers/model/glm4v.py +16 -4
  22. liger_kernel/transformers/model/glm4v_moe.py +23 -4
  23. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  24. liger_kernel/transformers/model/internvl.py +12 -5
  25. liger_kernel/transformers/model/llama.py +14 -5
  26. liger_kernel/transformers/model/llama4.py +16 -4
  27. liger_kernel/transformers/model/llava.py +12 -4
  28. liger_kernel/transformers/model/loss_utils.py +31 -3
  29. liger_kernel/transformers/model/mistral.py +15 -6
  30. liger_kernel/transformers/model/mixtral.py +16 -7
  31. liger_kernel/transformers/model/mllama.py +12 -4
  32. liger_kernel/transformers/model/olmo2.py +16 -4
  33. liger_kernel/transformers/model/olmo3.py +142 -0
  34. liger_kernel/transformers/model/output_classes.py +147 -0
  35. liger_kernel/transformers/model/paligemma.py +22 -5
  36. liger_kernel/transformers/model/phi3.py +14 -7
  37. liger_kernel/transformers/model/qwen2.py +16 -3
  38. liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
  39. liger_kernel/transformers/model/qwen2_vl.py +16 -4
  40. liger_kernel/transformers/model/qwen3.py +20 -5
  41. liger_kernel/transformers/model/qwen3_moe.py +19 -5
  42. liger_kernel/transformers/model/qwen3_next.py +17 -5
  43. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  44. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  45. liger_kernel/transformers/model/smollm3.py +15 -6
  46. liger_kernel/transformers/monkey_patch.py +398 -20
  47. liger_kernel/transformers/rope.py +43 -0
  48. liger_kernel/transformers/swiglu.py +17 -0
  49. liger_kernel/transformers/tiled_mlp.py +133 -0
  50. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/METADATA +4 -1
  51. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/RECORD +55 -48
  52. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/WHEEL +0 -0
  53. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/licenses/LICENSE +0 -0
  54. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/licenses/NOTICE +0 -0
  55. {liger_kernel-0.6.3.dist-info → liger_kernel-0.6.4.dist-info}/top_level.txt +0 -0
@@ -5,10 +5,11 @@ from typing import Union
5
5
 
6
6
  import torch
7
7
 
8
- from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast
9
8
  from transformers.utils import can_return_tuple
10
9
 
11
10
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
11
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
12
+ from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast
12
13
 
13
14
 
14
15
  @can_return_tuple
@@ -33,7 +34,7 @@ def lce_forward(
33
34
  second_per_grid_ts: Optional[torch.Tensor] = None,
34
35
  skip_logits: Optional[bool] = None,
35
36
  **kwargs,
36
- ) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
37
+ ) -> Union[Tuple, LigerQwen2_5_VLCausalLMOutputWithPast]:
37
38
  r"""
38
39
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
39
40
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -113,6 +114,7 @@ def lce_forward(
113
114
  shift_labels = kwargs.pop("shift_labels", None)
114
115
  loss = None
115
116
  logits = None
117
+ token_accuracy = None
116
118
 
117
119
  if skip_logits and labels is None and shift_labels is None:
118
120
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -120,8 +122,9 @@ def lce_forward(
120
122
  if skip_logits is None:
121
123
  skip_logits = self.training and (labels is not None or shift_labels is not None)
122
124
 
125
+ # Compute loss
123
126
  if skip_logits:
124
- loss = LigerForCausalLMLoss(
127
+ result = LigerForCausalLMLoss(
125
128
  hidden_states=hidden_states,
126
129
  lm_head_weight=self.lm_head.weight,
127
130
  labels=labels,
@@ -129,6 +132,7 @@ def lce_forward(
129
132
  hidden_size=self.config.hidden_size,
130
133
  **kwargs,
131
134
  )
135
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
132
136
  else:
133
137
  logits = self.lm_head(hidden_states)
134
138
 
@@ -142,14 +146,18 @@ def lce_forward(
142
146
  )
143
147
 
144
148
  if not return_dict:
145
- output = (logits,) + outputs[1:]
146
- return (loss,) + output if loss is not None else output
149
+ output_tuple = (logits,) + outputs[1:]
150
+ output = (loss,) + output_tuple if loss is not None else output_tuple
151
+ output = output + (token_accuracy,) if token_accuracy is not None else output
152
+ return output
147
153
 
148
- return Qwen2_5_VLCausalLMOutputWithPast(
154
+ # Return Qwen2.5-VL output with token accuracy
155
+ return LigerQwen2_5_VLCausalLMOutputWithPast(
149
156
  loss=loss,
150
157
  logits=logits,
151
158
  past_key_values=outputs.past_key_values,
152
159
  hidden_states=outputs.hidden_states,
153
160
  attentions=outputs.attentions,
154
161
  rope_deltas=outputs.rope_deltas,
162
+ token_accuracy=token_accuracy,
155
163
  )
@@ -5,10 +5,11 @@ from typing import Union
5
5
 
6
6
  import torch
7
7
 
8
- from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
9
8
  from transformers.utils import can_return_tuple
10
9
 
11
10
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
11
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
12
+ from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast
12
13
 
13
14
 
14
15
  @can_return_tuple
@@ -32,7 +33,7 @@ def lce_forward(
32
33
  cache_position: Optional[torch.LongTensor] = None,
33
34
  skip_logits: Optional[bool] = None,
34
35
  **kwargs,
35
- ) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
36
+ ) -> Union[Tuple, LigerQwen2VLCausalLMOutputWithPast]:
36
37
  r"""
37
38
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
38
39
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -109,6 +110,7 @@ def lce_forward(
109
110
  shift_labels = kwargs.pop("shift_labels", None)
110
111
  loss = None
111
112
  logits = None
113
+ token_accuracy = None
112
114
 
113
115
  if skip_logits and labels is None and shift_labels is None:
114
116
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -116,8 +118,9 @@ def lce_forward(
116
118
  if skip_logits is None:
117
119
  skip_logits = self.training and (labels is not None or shift_labels is not None)
118
120
 
121
+ # Compute loss
119
122
  if skip_logits:
120
- loss = LigerForCausalLMLoss(
123
+ result = LigerForCausalLMLoss(
121
124
  hidden_states=hidden_states,
122
125
  lm_head_weight=self.lm_head.weight,
123
126
  labels=labels,
@@ -125,6 +128,7 @@ def lce_forward(
125
128
  hidden_size=self.config.hidden_size,
126
129
  **kwargs,
127
130
  )
131
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
128
132
  else:
129
133
  logits = self.lm_head(hidden_states)
130
134
 
@@ -137,11 +141,19 @@ def lce_forward(
137
141
  vocab_size=self.config.vocab_size,
138
142
  )
139
143
 
140
- return Qwen2VLCausalLMOutputWithPast(
144
+ if not return_dict:
145
+ output_tuple = (logits,) + outputs[1:]
146
+ output = (loss,) + output_tuple if loss is not None else output_tuple
147
+ output = output + (token_accuracy,) if token_accuracy is not None else output
148
+ return output
149
+
150
+ # Return Qwen2VL output with token accuracy
151
+ return LigerQwen2VLCausalLMOutputWithPast(
141
152
  loss=loss,
142
153
  logits=logits,
143
154
  past_key_values=outputs.past_key_values,
144
155
  hidden_states=outputs.hidden_states,
145
156
  attentions=outputs.attentions,
146
157
  rope_deltas=outputs.rope_deltas,
158
+ token_accuracy=token_accuracy,
147
159
  )
@@ -4,9 +4,9 @@ from typing import Union
4
4
 
5
5
  import torch
6
6
 
7
- from transformers.modeling_outputs import CausalLMOutputWithPast
8
-
9
7
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
8
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
9
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
10
10
 
11
11
 
12
12
  def lce_forward(
@@ -23,8 +23,9 @@ def lce_forward(
23
23
  cache_position: Optional[torch.LongTensor] = None,
24
24
  logits_to_keep: Union[int, torch.Tensor] = 0,
25
25
  skip_logits: Optional[bool] = None,
26
+ return_dict: Optional[bool] = None,
26
27
  **kwargs,
27
- ) -> CausalLMOutputWithPast:
28
+ ) -> LigerCausalLMOutputWithPast:
28
29
  r"""
29
30
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
30
31
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -60,6 +61,7 @@ def lce_forward(
60
61
  output_hidden_states = (
61
62
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
62
63
  )
64
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
63
65
 
64
66
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
65
67
  outputs = self.model(
@@ -81,8 +83,11 @@ def lce_forward(
81
83
  kept_hidden_states = hidden_states[:, slice_indices, :]
82
84
 
83
85
  shift_labels = kwargs.pop("shift_labels", None)
86
+ # Remove output-control parameters that shouldn't be passed to loss functions
87
+ kwargs.pop("return_dict", None)
84
88
  logits = None
85
89
  loss = None
90
+ token_accuracy = None
86
91
 
87
92
  if skip_logits and labels is None and shift_labels is None:
88
93
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -91,8 +96,9 @@ def lce_forward(
91
96
  # By default, if in training mode, don't materialize logits
92
97
  skip_logits = self.training and (labels is not None or shift_labels is not None)
93
98
 
99
+ # Compute loss
94
100
  if skip_logits:
95
- loss = LigerForCausalLMLoss(
101
+ result = LigerForCausalLMLoss(
96
102
  hidden_states=kept_hidden_states,
97
103
  lm_head_weight=self.lm_head.weight,
98
104
  labels=labels,
@@ -100,6 +106,7 @@ def lce_forward(
100
106
  hidden_size=self.config.hidden_size,
101
107
  **kwargs,
102
108
  )
109
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
103
110
 
104
111
  else:
105
112
  logits = self.lm_head(kept_hidden_states)
@@ -112,10 +119,18 @@ def lce_forward(
112
119
  **kwargs,
113
120
  )
114
121
 
115
- return CausalLMOutputWithPast(
122
+ if not return_dict:
123
+ output = (logits,) + outputs[1:]
124
+ output = ((loss,) + output) if loss is not None else output
125
+ output = output + (token_accuracy,) if token_accuracy is not None else output
126
+ return output
127
+
128
+ # Return custom output class with accuracy field
129
+ return LigerCausalLMOutputWithPast(
116
130
  loss=loss,
117
131
  logits=logits,
118
132
  past_key_values=outputs.past_key_values,
119
133
  hidden_states=outputs.hidden_states,
120
134
  attentions=outputs.attentions,
135
+ token_accuracy=token_accuracy,
121
136
  )
@@ -4,11 +4,12 @@ from typing import Union
4
4
 
5
5
  import torch
6
6
 
7
- from transformers.modeling_outputs import MoeCausalLMOutputWithPast
8
7
  from transformers.modeling_outputs import MoeModelOutputWithPast
9
8
  from transformers.models.mixtral.modeling_mixtral import load_balancing_loss_func
10
9
 
11
10
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
11
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
12
+ from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast
12
13
 
13
14
 
14
15
  def lce_forward(
@@ -26,8 +27,9 @@ def lce_forward(
26
27
  cache_position: Optional[torch.LongTensor] = None,
27
28
  logits_to_keep: Union[int, torch.Tensor] = 0,
28
29
  skip_logits: Optional[bool] = None,
30
+ return_dict: Optional[bool] = None,
29
31
  **kwargs,
30
- ) -> MoeCausalLMOutputWithPast:
32
+ ) -> LigerMoeCausalLMOutputWithPast:
31
33
  r"""
32
34
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
33
35
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -64,10 +66,10 @@ def lce_forward(
64
66
  output_router_logits = (
65
67
  output_router_logits if output_router_logits is not None else self.config.output_router_logits
66
68
  )
67
-
68
69
  output_hidden_states = (
69
70
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
70
71
  )
72
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
71
73
 
72
74
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
73
75
  outputs: MoeModelOutputWithPast = self.model(
@@ -92,12 +94,14 @@ def lce_forward(
92
94
  shift_labels = kwargs.pop("shift_labels", None)
93
95
  logits = None
94
96
  loss = None
97
+ token_accuracy = None
95
98
 
96
99
  if skip_logits is None:
97
100
  skip_logits = self.training and (labels is not None or shift_labels is not None)
98
101
 
102
+ # Compute loss
99
103
  if skip_logits:
100
- loss = LigerForCausalLMLoss(
104
+ result = LigerForCausalLMLoss(
101
105
  hidden_states=kept_hidden_states,
102
106
  lm_head_weight=self.lm_head.weight,
103
107
  labels=labels,
@@ -105,6 +109,7 @@ def lce_forward(
105
109
  hidden_size=self.config.hidden_size,
106
110
  **kwargs,
107
111
  )
112
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
108
113
  else: # if in inference model materialize logits
109
114
  logits = self.lm_head(kept_hidden_states)
110
115
  if labels is not None or shift_labels is not None:
@@ -127,7 +132,15 @@ def lce_forward(
127
132
  if labels is not None:
128
133
  loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
129
134
 
130
- return MoeCausalLMOutputWithPast(
135
+ if not return_dict:
136
+ output = (logits,) + outputs[1:]
137
+ output = ((aux_loss,) + output) if aux_loss is not None else output
138
+ output = ((loss,) + output) if loss is not None else output
139
+ output = output + (token_accuracy,) if token_accuracy is not None else output
140
+ return output
141
+
142
+ # Return custom output class with accuracy field
143
+ return LigerMoeCausalLMOutputWithPast(
131
144
  loss=loss,
132
145
  aux_loss=aux_loss,
133
146
  logits=logits,
@@ -135,4 +148,5 @@ def lce_forward(
135
148
  hidden_states=outputs.hidden_states,
136
149
  attentions=outputs.attentions,
137
150
  router_logits=outputs.router_logits,
151
+ token_accuracy=token_accuracy,
138
152
  )
@@ -5,13 +5,14 @@ from typing import Union
5
5
 
6
6
  import torch
7
7
 
8
- from transformers.modeling_outputs import MoeCausalLMOutputWithPast
9
8
  from transformers.modeling_outputs import MoeModelOutputWithPast
10
9
 
11
10
  if TYPE_CHECKING:
12
11
  from transformers.models.qwen3_next.modeling_qwen3_next import load_balancing_loss_func
13
12
 
14
13
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
14
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
15
+ from liger_kernel.transformers.model.output_classes import LigerMoeCausalLMOutputWithPast
15
16
 
16
17
 
17
18
  def lce_forward(
@@ -29,8 +30,9 @@ def lce_forward(
29
30
  cache_position: Optional[torch.LongTensor] = None,
30
31
  logits_to_keep: Union[int, torch.Tensor] = 0,
31
32
  skip_logits: Optional[bool] = None,
33
+ return_dict: Optional[bool] = None,
32
34
  **kwargs,
33
- ) -> MoeCausalLMOutputWithPast:
35
+ ) -> LigerMoeCausalLMOutputWithPast:
34
36
  r"""
35
37
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
36
38
  Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -66,10 +68,10 @@ def lce_forward(
66
68
  output_router_logits = (
67
69
  output_router_logits if output_router_logits is not None else self.config.output_router_logits
68
70
  )
69
-
70
71
  output_hidden_states = (
71
72
  output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
72
73
  )
74
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
73
75
 
74
76
  # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
75
77
  outputs: MoeModelOutputWithPast = self.model(
@@ -94,12 +96,13 @@ def lce_forward(
94
96
  shift_labels = kwargs.pop("shift_labels", None)
95
97
  logits = None
96
98
  loss = None
99
+ token_accuracy = None
97
100
 
98
101
  if skip_logits is None:
99
102
  skip_logits = self.training and (labels is not None or shift_labels is not None)
100
103
 
101
104
  if skip_logits:
102
- loss = LigerForCausalLMLoss(
105
+ result = LigerForCausalLMLoss(
103
106
  hidden_states=kept_hidden_states,
104
107
  lm_head_weight=self.lm_head.weight,
105
108
  labels=labels,
@@ -107,6 +110,7 @@ def lce_forward(
107
110
  hidden_size=self.config.hidden_size,
108
111
  **kwargs,
109
112
  )
113
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
110
114
  else: # if in inference model materialize logits
111
115
  logits = self.lm_head(kept_hidden_states)
112
116
  if labels is not None or shift_labels is not None:
@@ -123,7 +127,14 @@ def lce_forward(
123
127
  if labels is not None:
124
128
  loss += self.router_aux_loss_coef * aux_loss.to(loss.device) # make sure to reside in the same device
125
129
 
126
- return MoeCausalLMOutputWithPast(
130
+ if not return_dict:
131
+ output = (logits,) + outputs[1:]
132
+ output = ((aux_loss,) + output) if aux_loss is not None else output
133
+ output = ((loss,) + output) if loss is not None else output
134
+ output = output + (token_accuracy,) if token_accuracy is not None else output
135
+ return output
136
+
137
+ return LigerMoeCausalLMOutputWithPast(
127
138
  loss=loss,
128
139
  aux_loss=aux_loss,
129
140
  logits=logits,
@@ -131,4 +142,5 @@ def lce_forward(
131
142
  hidden_states=outputs.hidden_states,
132
143
  attentions=outputs.attentions,
133
144
  router_logits=outputs.router_logits,
145
+ token_accuracy=token_accuracy,
134
146
  )
@@ -0,0 +1,150 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from typing import Tuple
4
+ from typing import Union
5
+
6
+ import torch
7
+
8
+ from transformers.utils import can_return_tuple
9
+
10
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
11
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
12
+ from liger_kernel.transformers.model.output_classes import LigerQwen3VLCausalLMOutputWithPast
13
+
14
+
15
+ @can_return_tuple
16
+ def lce_forward(
17
+ self,
18
+ input_ids: torch.LongTensor = None,
19
+ attention_mask: Optional[torch.Tensor] = None,
20
+ position_ids: Optional[torch.LongTensor] = None,
21
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
22
+ inputs_embeds: Optional[torch.FloatTensor] = None,
23
+ labels: Optional[torch.LongTensor] = None,
24
+ use_cache: Optional[bool] = None,
25
+ output_attentions: Optional[bool] = None,
26
+ output_hidden_states: Optional[bool] = None,
27
+ return_dict: Optional[bool] = None,
28
+ pixel_values: Optional[torch.Tensor] = None,
29
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
30
+ image_grid_thw: Optional[torch.LongTensor] = None,
31
+ video_grid_thw: Optional[torch.LongTensor] = None,
32
+ rope_deltas: Optional[torch.LongTensor] = None,
33
+ cache_position: Optional[torch.LongTensor] = None,
34
+ second_per_grid_ts: Optional[torch.Tensor] = None,
35
+ skip_logits: Optional[bool] = None,
36
+ **kwargs,
37
+ ) -> Union[Tuple, LigerQwen3VLCausalLMOutputWithPast]:
38
+ """
39
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
40
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
41
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
42
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
43
+ pixel_values_videos (`torch.FloatTensor` of shape `(seq_length, num_channels * temporal_size * image_size * image_size)):
44
+ The tensors corresponding to the input videos. Pixel values can be obtained using
45
+ [`AutoImageProcessor`]. See [`Qwen2_5_VLImageProcessor.__call__`] for details. [`Qwen2_5_VLProcessor`] uses
46
+ [`Qwen2_5_VLImageProcessor`] for processing videos.
47
+ image_grid_thw (`torch.LongTensor` of shape `(num_images, 3)`, *optional*):
48
+ The temporal, height and width of feature shape of each image in LLM.
49
+ video_grid_thw (`torch.LongTensor` of shape `(num_videos, 3)`, *optional*):
50
+ The temporal, height and width of feature shape of each video in LLM.
51
+ rope_deltas (`torch.LongTensor` of shape `(batch_size, )`, *optional*):
52
+ The rope index difference between sequence length and multimodal rope.
53
+ second_per_grid_ts (`torch.Tensor` of shape `(num_videos)`, *optional*):
54
+ The time interval (in seconds) for each grid along the temporal dimension in the 3D position IDs.
55
+ Example:
56
+ ```python
57
+ >>> from PIL import Image
58
+ >>> import requests
59
+ >>> from transformers import AutoProcessor, Qwen3VLForConditionalGeneration
60
+ >>> model = Qwen3VLForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL")
61
+ >>> processor = AutoProcessor.from_pretrained("Qwen/Qwen3-VL")
62
+ >>> messages = [
63
+ {
64
+ "role": "user",
65
+ "content": [
66
+ {"type": "image"},
67
+ {"type": "text", "text": "What is shown in this image?"},
68
+ ],
69
+ },
70
+ ]
71
+ >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
72
+ >>> image = Image.open(requests.get(url, stream=True).raw)
73
+ >>> text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
74
+ >>> inputs = processor(text=[text], images=[image], vision_infos=[vision_infos])
75
+ >>> # Generate
76
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
77
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
78
+ "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
79
+ ```"""
80
+
81
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
82
+ output_hidden_states = (
83
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
84
+ )
85
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
86
+
87
+ outputs = self.model(
88
+ input_ids=input_ids,
89
+ pixel_values=pixel_values,
90
+ pixel_values_videos=pixel_values_videos,
91
+ image_grid_thw=image_grid_thw,
92
+ video_grid_thw=video_grid_thw,
93
+ second_per_grid_ts=second_per_grid_ts,
94
+ position_ids=position_ids,
95
+ attention_mask=attention_mask,
96
+ past_key_values=past_key_values,
97
+ inputs_embeds=inputs_embeds,
98
+ use_cache=use_cache,
99
+ output_attentions=output_attentions,
100
+ output_hidden_states=output_hidden_states,
101
+ return_dict=return_dict,
102
+ cache_position=cache_position,
103
+ **kwargs,
104
+ )
105
+
106
+ hidden_states = outputs[0]
107
+
108
+ shift_labels = kwargs.pop("shift_labels", None)
109
+ loss = None
110
+ logits = None
111
+ token_accuracy = None
112
+
113
+ if skip_logits and labels is None and shift_labels is None:
114
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
115
+
116
+ if skip_logits is None:
117
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
118
+
119
+ if skip_logits:
120
+ result = LigerForCausalLMLoss(
121
+ hidden_states=hidden_states,
122
+ lm_head_weight=self.lm_head.weight,
123
+ labels=labels,
124
+ shift_labels=shift_labels,
125
+ hidden_size=self.config.text_config.hidden_size,
126
+ **kwargs,
127
+ )
128
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
129
+ else:
130
+ logits = self.lm_head(hidden_states)
131
+
132
+ loss = None
133
+ if labels is not None:
134
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
135
+
136
+ if not return_dict:
137
+ output = (logits,) + outputs[1:]
138
+ output = (loss,) + output if loss is not None else output
139
+ output = output + (token_accuracy,) if token_accuracy is not None else output
140
+ return output
141
+
142
+ return LigerQwen3VLCausalLMOutputWithPast(
143
+ loss=loss,
144
+ logits=logits,
145
+ past_key_values=outputs.past_key_values,
146
+ hidden_states=outputs.hidden_states,
147
+ attentions=outputs.attentions,
148
+ rope_deltas=outputs.rope_deltas,
149
+ token_accuracy=token_accuracy,
150
+ )
@@ -0,0 +1,126 @@
1
+ from typing import List
2
+ from typing import Optional
3
+ from typing import Tuple
4
+ from typing import Union
5
+
6
+ import torch
7
+
8
+ from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import load_balancing_loss_func
9
+ from transformers.utils import can_return_tuple
10
+
11
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
12
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
13
+ from liger_kernel.transformers.model.output_classes import LigerQwen3VLMoeCausalLMOutputWithPast
14
+
15
+
16
+ @can_return_tuple
17
+ def lce_forward(
18
+ self,
19
+ input_ids: torch.LongTensor = None,
20
+ attention_mask: Optional[torch.Tensor] = None,
21
+ position_ids: Optional[torch.LongTensor] = None,
22
+ past_key_values: Optional[List[torch.FloatTensor]] = None,
23
+ inputs_embeds: Optional[torch.FloatTensor] = None,
24
+ labels: Optional[torch.LongTensor] = None,
25
+ use_cache: Optional[bool] = None,
26
+ output_attentions: Optional[bool] = None,
27
+ output_hidden_states: Optional[bool] = None,
28
+ return_dict: Optional[bool] = None,
29
+ pixel_values: Optional[torch.Tensor] = None,
30
+ pixel_values_videos: Optional[torch.FloatTensor] = None,
31
+ image_grid_thw: Optional[torch.LongTensor] = None,
32
+ video_grid_thw: Optional[torch.LongTensor] = None,
33
+ rope_deltas: Optional[torch.LongTensor] = None,
34
+ cache_position: Optional[torch.LongTensor] = None,
35
+ second_per_grid_ts: Optional[torch.Tensor] = None,
36
+ skip_logits: Optional[bool] = None,
37
+ **kwargs,
38
+ ) -> Union[Tuple, LigerQwen3VLMoeCausalLMOutputWithPast]:
39
+ """
40
+ Qwen3-VL-MoE forward with fused linear cross entropy support mirroring Qwen3-VL behaviour.
41
+ """
42
+
43
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
44
+ output_hidden_states = (
45
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
46
+ )
47
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
48
+
49
+ outputs = self.model(
50
+ input_ids=input_ids,
51
+ pixel_values=pixel_values,
52
+ pixel_values_videos=pixel_values_videos,
53
+ image_grid_thw=image_grid_thw,
54
+ video_grid_thw=video_grid_thw,
55
+ second_per_grid_ts=second_per_grid_ts,
56
+ position_ids=position_ids,
57
+ attention_mask=attention_mask,
58
+ past_key_values=past_key_values,
59
+ inputs_embeds=inputs_embeds,
60
+ use_cache=use_cache,
61
+ output_attentions=output_attentions,
62
+ output_hidden_states=output_hidden_states,
63
+ return_dict=return_dict,
64
+ cache_position=cache_position,
65
+ **kwargs,
66
+ )
67
+
68
+ hidden_states = outputs[0]
69
+
70
+ shift_labels = kwargs.pop("shift_labels", None)
71
+ loss = None
72
+ logits = None
73
+ token_accuracy = None
74
+
75
+ if skip_logits and labels is None and shift_labels is None:
76
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
77
+
78
+ if skip_logits is None:
79
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
80
+
81
+ if skip_logits:
82
+ result = LigerForCausalLMLoss(
83
+ hidden_states=hidden_states,
84
+ lm_head_weight=self.lm_head.weight,
85
+ labels=labels,
86
+ shift_labels=shift_labels,
87
+ hidden_size=self.config.text_config.hidden_size,
88
+ **kwargs,
89
+ )
90
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
91
+ else:
92
+ logits = self.lm_head(hidden_states)
93
+
94
+ if labels is not None:
95
+ loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size)
96
+
97
+ # Compute auxiliary load-balancing loss for MoE when requested
98
+ aux_loss = None
99
+ if kwargs.get("output_router_logits", False):
100
+ aux_loss = load_balancing_loss_func(
101
+ outputs.router_logits,
102
+ self.config.text_config.num_experts,
103
+ self.config.text_config.num_experts_per_tok,
104
+ attention_mask,
105
+ )
106
+ # If we computed training loss, add the scaled aux loss to it
107
+ if loss is not None and aux_loss is not None:
108
+ loss = loss + self.config.text_config.router_aux_loss_coef * aux_loss.to(loss.device)
109
+
110
+ if not return_dict:
111
+ output = (logits,) + outputs[1:]
112
+ output = (loss,) + output if loss is not None else output
113
+ output = output + (aux_loss,) if aux_loss is not None else output
114
+ output = output + (token_accuracy,) if token_accuracy is not None else output
115
+ return output
116
+
117
+ return LigerQwen3VLMoeCausalLMOutputWithPast(
118
+ loss=loss,
119
+ logits=logits,
120
+ past_key_values=outputs.past_key_values,
121
+ hidden_states=outputs.hidden_states,
122
+ attentions=outputs.attentions,
123
+ rope_deltas=outputs.rope_deltas,
124
+ aux_loss=aux_loss,
125
+ token_accuracy=token_accuracy,
126
+ )