liger-kernel-nightly 0.5.10.dev20250611191801__py3-none-any.whl → 0.6.4.dev20260112233432__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of liger-kernel-nightly might be problematic.

Files changed (107):
  1. liger_kernel/chunked_loss/__init__.py +1 -0
  2. liger_kernel/chunked_loss/cosine_similarity_loss.py +142 -0
  3. liger_kernel/chunked_loss/dpo_loss.py +54 -3
  4. liger_kernel/chunked_loss/functional.py +2 -0
  5. liger_kernel/chunked_loss/fused_linear_distillation.py +23 -5
  6. liger_kernel/chunked_loss/fused_linear_ppo.py +25 -5
  7. liger_kernel/chunked_loss/grpo_loss.py +46 -9
  8. liger_kernel/chunked_loss/jsd_loss.py +44 -13
  9. liger_kernel/ops/__init__.py +141 -0
  10. liger_kernel/ops/backends/README.md +151 -0
  11. liger_kernel/ops/backends/__init__.py +13 -0
  12. liger_kernel/ops/backends/_ascend/__init__.py +5 -0
  13. liger_kernel/ops/backends/_ascend/ascend-ub-manager-design.md +485 -0
  14. liger_kernel/ops/backends/_ascend/ops/__init__.py +49 -0
  15. liger_kernel/ops/backends/_ascend/ops/geglu.py +266 -0
  16. liger_kernel/ops/backends/_ascend/ops/qwen2vl_mrope.py +285 -0
  17. liger_kernel/ops/backends/_ascend/ops/rope.py +290 -0
  18. liger_kernel/ops/backends/_ascend/ops/swiglu.py +142 -0
  19. liger_kernel/ops/backends/_ascend/ops/tvd.py +221 -0
  20. liger_kernel/ops/backends/_ascend/ub_manager.py +349 -0
  21. liger_kernel/ops/backends/registry.py +61 -0
  22. liger_kernel/ops/cross_entropy.py +130 -64
  23. liger_kernel/ops/dyt.py +5 -4
  24. liger_kernel/ops/fused_add_rms_norm.py +416 -0
  25. liger_kernel/ops/fused_linear_cross_entropy.py +115 -22
  26. liger_kernel/ops/geglu.py +6 -4
  27. liger_kernel/ops/group_norm.py +7 -7
  28. liger_kernel/ops/grpo_loss.py +3 -1
  29. liger_kernel/ops/kl_div.py +8 -11
  30. liger_kernel/ops/layer_norm.py +135 -80
  31. liger_kernel/ops/llama4_rope.py +225 -0
  32. liger_kernel/ops/poly_norm.py +390 -0
  33. liger_kernel/ops/rms_norm.py +148 -71
  34. liger_kernel/ops/rope.py +1 -1
  35. liger_kernel/ops/swiglu.py +1 -1
  36. liger_kernel/ops/tiled_mlp.py +136 -0
  37. liger_kernel/ops/utils.py +14 -0
  38. liger_kernel/transformers/__init__.py +65 -0
  39. liger_kernel/transformers/auto_model.py +21 -0
  40. liger_kernel/transformers/cross_entropy.py +9 -4
  41. liger_kernel/transformers/dyt.py +1 -1
  42. liger_kernel/transformers/experimental/__init__.py +5 -0
  43. liger_kernel/transformers/experimental/embedding.py +1 -1
  44. liger_kernel/transformers/functional.py +56 -24
  45. liger_kernel/transformers/fused_add_rms_norm.py +39 -0
  46. liger_kernel/transformers/fused_linear_cross_entropy.py +17 -5
  47. liger_kernel/transformers/fused_linear_jsd.py +1 -1
  48. liger_kernel/transformers/fused_neighborhood_attention.py +1 -1
  49. liger_kernel/transformers/geglu.py +1 -1
  50. liger_kernel/transformers/group_norm.py +1 -1
  51. liger_kernel/transformers/grpo_loss.py +57 -2
  52. liger_kernel/transformers/jsd.py +1 -1
  53. liger_kernel/transformers/kl_div.py +1 -1
  54. liger_kernel/transformers/layer_norm.py +1 -1
  55. liger_kernel/transformers/llama4_rope.py +93 -0
  56. liger_kernel/transformers/model/exaone4.py +136 -0
  57. liger_kernel/transformers/model/falcon_h1.py +122 -0
  58. liger_kernel/transformers/model/gemma.py +28 -8
  59. liger_kernel/transformers/model/gemma2.py +34 -11
  60. liger_kernel/transformers/model/gemma3.py +102 -112
  61. liger_kernel/transformers/model/glm4.py +18 -5
  62. liger_kernel/transformers/model/glm4v.py +163 -0
  63. liger_kernel/transformers/model/glm4v_moe.py +172 -0
  64. liger_kernel/transformers/model/gpt_oss.py +211 -0
  65. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  66. liger_kernel/transformers/model/internvl.py +157 -0
  67. liger_kernel/transformers/model/llama.py +26 -7
  68. liger_kernel/transformers/model/llama4.py +121 -0
  69. liger_kernel/transformers/model/llava.py +18 -6
  70. liger_kernel/transformers/model/loss_utils.py +34 -3
  71. liger_kernel/transformers/model/mistral.py +17 -10
  72. liger_kernel/transformers/model/mixtral.py +24 -9
  73. liger_kernel/transformers/model/mllama.py +18 -7
  74. liger_kernel/transformers/model/olmo2.py +18 -5
  75. liger_kernel/transformers/model/olmo3.py +142 -0
  76. liger_kernel/transformers/model/output_classes.py +147 -0
  77. liger_kernel/transformers/model/paligemma.py +42 -5
  78. liger_kernel/transformers/model/phi3.py +24 -159
  79. liger_kernel/transformers/model/qwen2.py +26 -4
  80. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  81. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  82. liger_kernel/transformers/model/qwen3.py +22 -6
  83. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  84. liger_kernel/transformers/model/qwen3_next.py +146 -0
  85. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  86. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  87. liger_kernel/transformers/model/smollm3.py +199 -0
  88. liger_kernel/transformers/model/smolvlm.py +158 -0
  89. liger_kernel/transformers/monkey_patch.py +1423 -100
  90. liger_kernel/transformers/multi_token_attention.py +2 -2
  91. liger_kernel/transformers/poly_norm.py +42 -0
  92. liger_kernel/transformers/qwen2vl_mrope.py +1 -1
  93. liger_kernel/transformers/rms_norm.py +15 -5
  94. liger_kernel/transformers/rope.py +45 -1
  95. liger_kernel/transformers/softmax.py +1 -1
  96. liger_kernel/transformers/sparsemax.py +1 -1
  97. liger_kernel/transformers/swiglu.py +18 -1
  98. liger_kernel/transformers/tiled_mlp.py +125 -0
  99. liger_kernel/transformers/tvd.py +1 -1
  100. liger_kernel/utils.py +52 -0
  101. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/METADATA +37 -25
  102. liger_kernel_nightly-0.6.4.dev20260112233432.dist-info/RECORD +132 -0
  103. liger_kernel_nightly-0.5.10.dev20250611191801.dist-info/RECORD +0 -95
  104. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/LICENSE +0 -0
  105. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/NOTICE +0 -0
  106. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/WHEEL +0 -0
  107. {liger_kernel_nightly-0.5.10.dev20250611191801.dist-info → liger_kernel_nightly-0.6.4.dev20260112233432.dist-info}/top_level.txt +0 -0
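
The hunks below reproduce only a small slice of the full diff: the shared refactor of the causal-LM `lce_forward` implementations (phi3, qwen2, qwen2_5_vl, qwen2_vl, and one further model file). Each forward now returns a Liger-specific output class carrying a `token_accuracy` field, and the fused-loss path unpacks a `(loss, _, token_accuracy)` result via `unpack_cross_entropy_result`. The sketch below condenses that control flow; it is not the per-model code, which also threads through a pre-shifted `shift_labels` kwarg, model-specific arguments, and the model's own `loss_function` on the non-fused path.

```python
# Condensed sketch of the loss path that the updated lce_forward implementations share.
# Names (LigerForCausalLMLoss, unpack_cross_entropy_result, skip_logits, token_accuracy)
# come from the hunks in this diff; the helper function itself is illustrative only.
from typing import Optional, Tuple

import torch

from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result


def fused_causal_lm_loss(
    hidden_states: torch.Tensor,  # (batch, seq, hidden)
    lm_head: torch.nn.Linear,
    labels: Optional[torch.Tensor],
    hidden_size: int,
    training: bool,
    skip_logits: Optional[bool] = None,
) -> Tuple[Optional[torch.Tensor], Optional[torch.Tensor], Optional[torch.Tensor]]:
    """Return (loss, logits, token_accuracy) following the shared control flow."""
    loss = None
    logits = None
    token_accuracy = None

    if skip_logits and labels is None:
        raise ValueError("skip_logits is True, but labels is None")
    if skip_logits is None:
        # By default, only skip logit materialization when training with labels
        skip_logits = training and labels is not None

    if skip_logits:
        # Fused linear + cross entropy: the full logits tensor is never materialized
        result = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=lm_head.weight,
            labels=labels,
            hidden_size=hidden_size,
        )
        loss, _, token_accuracy = unpack_cross_entropy_result(result)
    else:
        # Evaluation / generation path: materialize logits; the patched forwards
        # then compute the loss with the model's regular loss_function.
        logits = lm_head(hidden_states)

    return loss, logits, token_accuracy
```

The per-model diffs follow.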
@@ -5,123 +5,13 @@ from typing import Union
 
 import torch
 
-from torch.nn import CrossEntropyLoss
-from transformers.modeling_outputs import CausalLMOutputWithPast
-from transformers.utils.deprecation import deprecate_kwarg
+from transformers.modeling_outputs import BaseModelOutputWithPast
 
-from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
-def lce_forward_deprecated(
-    self,
-    input_ids: torch.LongTensor = None,
-    attention_mask: Optional[torch.Tensor] = None,
-    position_ids: Optional[torch.LongTensor] = None,
-    past_key_values: Optional[List[torch.FloatTensor]] = None,
-    inputs_embeds: Optional[torch.FloatTensor] = None,
-    labels: Optional[torch.LongTensor] = None,
-    use_cache: Optional[bool] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-) -> Union[Tuple, CausalLMOutputWithPast]:
-    r"""
-    Copy paste phi3 forward from transfomers v4.44.2 but replace torch cross entropy with liger fused linear cross entropy
-
-
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-    Returns:
-
-    Example:
-
-    ```python
-    >>> from transformers import AutoTokenizer, Phi3ForCausalLM
-
-    >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
-    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
-
-    >>> prompt = "This is an example script ."
-    >>> inputs = tokenizer(prompt, return_tensors="pt")
-
-    >>> # Generate
-    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
-    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
-    ```"""
-
-    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-    )
-    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
-        input_ids=input_ids,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        past_key_values=past_key_values,
-        inputs_embeds=inputs_embeds,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-    )
-
-    hidden_states = outputs[0]
-
-    loss = None
-    logits = None
-
-    if self.training and labels is not None:
-        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
-        shift_labels = labels[..., 1:].contiguous()
-
-        # flatten tokens
-        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
-        shift_labels = shift_labels.view(-1)
-
-        lce = LigerFusedLinearCrossEntropyLoss()
-        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
-    else:
-        logits = self.lm_head(hidden_states)
-
-        loss = None
-        if labels is not None:
-            # Upcast to float if we need to compute the loss to avoid potential precision issues
-            logits = logits.float()
-            # Shift so that tokens < n predict n
-            shift_logits = logits[..., :-1, :].contiguous()
-            shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = CrossEntropyLoss()
-            shift_logits = shift_logits.view(-1, self.config.vocab_size)
-            shift_labels = shift_labels.view(-1)
-            # Enable model parallelism
-            shift_labels = shift_labels.to(shift_logits.device)
-            loss = loss_fct(shift_logits, shift_labels)
-
-    if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
-
-    return CausalLMOutputWithPast(
-        loss=loss,
-        logits=logits,
-        past_key_values=outputs.past_key_values,
-        hidden_states=outputs.hidden_states,
-        attentions=outputs.attentions,
-    )
-
-
-@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 def lce_forward(
     self,
     input_ids: torch.LongTensor = None,
@@ -138,75 +28,43 @@ def lce_forward(
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
     r"""
-    Args:
-        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        logits_to_keep (`int` or `torch.Tensor`, *optional*):
-            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
-            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
-            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
-            This is useful when using packed tensor format (single dimension for batch and sequence length).
-
-    Returns:
-
     Example:
 
     ```python
     >>> from transformers import AutoTokenizer, Phi3ForCausalLM
 
-    >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
-    >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+    >>> model = Phi3ForCausalLM.from_pretrained("meta-phi3/Phi3-2-7b-hf")
+    >>> tokenizer = AutoTokenizer.from_pretrained("meta-phi3/Phi3-2-7b-hf")
 
-    >>> prompt = "This is an example script ."
+    >>> prompt = "Hey, are you conscious? Can you talk to me?"
     >>> inputs = tokenizer(prompt, return_tensors="pt")
 
     >>> # Generate
     >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
     >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-    'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
+    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
     ```"""
 
-    from transformers.models.phi3.modeling_phi3 import logging
-
-    logger = logging.get_logger(__name__)
-
-    if (
-        use_cache
-        and self.config.rope_scaling
-        and cache_position is not None
-        and cache_position[0] == self.config.original_max_position_embeddings
-    ):
-        logger.warning(
-            f"If you are not using the generate method, you may encounter nonsensical outputs after the {self.config.original_max_position_embeddings}th token, as the KV cache needs to be recomputed."
-        )
-
     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
-    outputs = self.model(
+    outputs: BaseModelOutputWithPast = self.model(
         input_ids=input_ids,
         attention_mask=attention_mask,
         position_ids=position_ids,
         past_key_values=past_key_values,
         inputs_embeds=inputs_embeds,
         use_cache=use_cache,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
+        cache_position=cache_position,
         **kwargs,
     )
 
-    hidden_states = outputs[0]
+    hidden_states = outputs.last_hidden_state
     # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
     slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
     kept_hidden_states = hidden_states[:, slice_indices, :]
@@ -214,6 +72,7 @@ def lce_forward(
     shift_labels = kwargs.pop("shift_labels", None)
     logits = None
     loss = None
+    token_accuracy = None
 
     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -222,8 +81,9 @@
         # By default, if in training mode, don't materialize logits
         skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -231,25 +91,30 @@
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
-
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:
        logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
+        if labels is not None or shift_labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
+                shift_labels=shift_labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )
 
     if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
 
-    return CausalLMOutputWithPast(
+    # Return custom output class with token_accuracy field
+    return LigerCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
     )
@@ -11,6 +11,8 @@ from transformers.utils.deprecation import deprecate_kwarg
 
 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
 def lce_forward_deprecated(
@@ -26,6 +28,7 @@ def lce_forward_deprecated(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
+    skip_logits: Optional[bool] = None,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
     Copy paste Qwen2's forward but replace torch cross entropy with liger fused linear cross entropy
@@ -80,6 +83,13 @@
     loss = None
     logits = None
 
+    if skip_logits and labels is None:
+        raise ValueError("skip_logits is True, but labels is None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and labels is not None
+
     if self.training and (labels is not None):
         shift_hidden_states = hidden_states[..., :-1, :].contiguous()
         shift_labels = labels[..., 1:].contiguous()
@@ -137,7 +147,7 @@ def lce_forward(
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, CausalLMOutputWithPast]:
+) -> Union[Tuple, LigerCausalLMOutputWithPast]:
     r"""
     Args:
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -200,6 +210,7 @@
     shift_labels = kwargs.pop("shift_labels", None)
     logits = None
     loss = None
+    token_accuracy = None
 
     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -208,8 +219,9 @@
         # By default, if in training mode, don't materialize logits
         skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -217,21 +229,31 @@
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
 
     else:
         logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
+        if labels is not None or shift_labels is not None:
             loss = self.loss_function(
                 logits=logits,
                 labels=labels,
+                shift_labels=shift_labels,
                 vocab_size=self.config.vocab_size,
                 **kwargs,
             )
 
-    return CausalLMOutputWithPast(
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return custom output class with token accuracy field
+    return LigerCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
     )
@@ -5,10 +5,11 @@ from typing import Union
 
 import torch
 
-from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLCausalLMOutputWithPast
 from transformers.utils import can_return_tuple
 
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerQwen2_5_VLCausalLMOutputWithPast
 
 
 @can_return_tuple
@@ -33,7 +34,7 @@
     second_per_grid_ts: Optional[torch.Tensor] = None,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, Qwen2_5_VLCausalLMOutputWithPast]:
+) -> Union[Tuple, LigerQwen2_5_VLCausalLMOutputWithPast]:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -113,6 +114,7 @@
     shift_labels = kwargs.pop("shift_labels", None)
     loss = None
     logits = None
+    token_accuracy = None
 
     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -120,8 +122,9 @@
     if skip_logits is None:
         skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -129,22 +132,32 @@
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:
         logits = self.lm_head(hidden_states)
 
         loss = None
-        if labels is not None:
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+            )
 
     if not return_dict:
-        output = (logits,) + outputs[1:]
-        return (loss,) + output if loss is not None else output
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
 
-    return Qwen2_5_VLCausalLMOutputWithPast(
+    # Return Qwen2.5-VL output with token accuracy
+    return LigerQwen2_5_VLCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
         rope_deltas=outputs.rope_deltas,
+        token_accuracy=token_accuracy,
     )
@@ -5,10 +5,11 @@ from typing import Union
 
 import torch
 
-from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLCausalLMOutputWithPast
 from transformers.utils import can_return_tuple
 
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerQwen2VLCausalLMOutputWithPast
 
 
 @can_return_tuple
@@ -32,7 +33,7 @@ def lce_forward(
     cache_position: Optional[torch.LongTensor] = None,
     skip_logits: Optional[bool] = None,
     **kwargs,
-) -> Union[Tuple, Qwen2VLCausalLMOutputWithPast]:
+) -> Union[Tuple, LigerQwen2VLCausalLMOutputWithPast]:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -109,6 +110,7 @@
     shift_labels = kwargs.pop("shift_labels", None)
     loss = None
     logits = None
+    token_accuracy = None
 
     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -116,8 +118,9 @@
     if skip_logits is None:
         skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -125,18 +128,32 @@
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
     else:
         logits = self.lm_head(hidden_states)
 
         loss = None
-        if labels is not None:
-            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size)
-
-    return Qwen2VLCausalLMOutputWithPast(
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                shift_labels=shift_labels,
+                vocab_size=self.config.vocab_size,
+            )
+
+    if not return_dict:
+        output_tuple = (logits,) + outputs[1:]
+        output = (loss,) + output_tuple if loss is not None else output_tuple
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return Qwen2VL output with token accuracy
+    return LigerQwen2VLCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
         rope_deltas=outputs.rope_deltas,
+        token_accuracy=token_accuracy,
     )
@@ -4,9 +4,9 @@ from typing import Union
 
 import torch
 
-from transformers.modeling_outputs import CausalLMOutputWithPast
-
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
+from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
 
 
 def lce_forward(
@@ -23,8 +23,9 @@ def lce_forward(
     cache_position: Optional[torch.LongTensor] = None,
     logits_to_keep: Union[int, torch.Tensor] = 0,
     skip_logits: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
     **kwargs,
-) -> CausalLMOutputWithPast:
+) -> LigerCausalLMOutputWithPast:
     r"""
     labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
@@ -60,6 +61,7 @@
     output_hidden_states = (
         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
     )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
     # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
     outputs = self.model(
@@ -81,8 +83,11 @@
     kept_hidden_states = hidden_states[:, slice_indices, :]
 
     shift_labels = kwargs.pop("shift_labels", None)
+    # Remove output-control parameters that shouldn't be passed to loss functions
+    kwargs.pop("return_dict", None)
     logits = None
     loss = None
+    token_accuracy = None
 
     if skip_logits and labels is None and shift_labels is None:
         raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -91,8 +96,9 @@
         # By default, if in training mode, don't materialize logits
         skip_logits = self.training and (labels is not None or shift_labels is not None)
 
+    # Compute loss
     if skip_logits:
-        loss = LigerForCausalLMLoss(
+        result = LigerForCausalLMLoss(
             hidden_states=kept_hidden_states,
             lm_head_weight=self.lm_head.weight,
             labels=labels,
@@ -100,21 +106,31 @@
             hidden_size=self.config.hidden_size,
             **kwargs,
         )
+        loss, _, token_accuracy = unpack_cross_entropy_result(result)
 
     else:
         logits = self.lm_head(kept_hidden_states)
-        if labels is not None:
+        if labels is not None or shift_labels is not None:
             loss = self.loss_function(
                 logits=logits,
                 labels=labels,
+                shift_labels=shift_labels,
                 vocab_size=self.config.vocab_size,
                 **kwargs,
             )
 
-    return CausalLMOutputWithPast(
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        output = ((loss,) + output) if loss is not None else output
+        output = output + (token_accuracy,) if token_accuracy is not None else output
+        return output
+
+    # Return custom output class with accuracy field
+    return LigerCausalLMOutputWithPast(
         loss=loss,
         logits=logits,
         past_key_values=outputs.past_key_values,
         hidden_states=outputs.hidden_states,
         attentions=outputs.attentions,
+        token_accuracy=token_accuracy,
    )
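
Taken together, these hunks mean a patched forward can surface per-token accuracy alongside the loss. A minimal usage sketch follows, assuming liger-kernel's existing `apply_liger_kernel_to_qwen2` patch entry point (not shown in the hunks above); the model id is illustrative, and `token_accuracy` may be `None` when accuracy is not computed.

```python
# Minimal sketch: reading the new token_accuracy field after patching Qwen2.
# apply_liger_kernel_to_qwen2 is the library's existing patch entry point and is
# not part of the hunks above; the checkpoint name is for illustration only.
from transformers import AutoModelForCausalLM, AutoTokenizer

from liger_kernel.transformers import apply_liger_kernel_to_qwen2

apply_liger_kernel_to_qwen2()  # swaps in the patched lce_forward shown above

model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2-0.5B")
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B")

batch = tokenizer("hello world", return_tensors="pt")
labels = batch["input_ids"].clone()

model.train()  # training mode + labels -> fused path, logits are not materialized
outputs = model(**batch, labels=labels)

print(outputs.loss)            # scalar loss from the fused linear cross entropy
print(outputs.logits)          # None on the fused path
print(outputs.token_accuracy)  # new field; may be None if accuracy is not computed
```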