liger-kernel-nightly 0.6.2.dev20250919191028__py3-none-any.whl → 0.6.4.dev20251202054858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of liger-kernel-nightly might be problematic; see the advisory details accompanying this release for more information.

Files changed (67):
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +8 -5
  5. liger_kernel/chunked_loss/jsd_loss.py +18 -5
  6. liger_kernel/ops/cross_entropy.py +120 -63
  7. liger_kernel/ops/dyt.py +5 -2
  8. liger_kernel/ops/fused_add_rms_norm.py +5 -1
  9. liger_kernel/ops/fused_linear_cross_entropy.py +43 -12
  10. liger_kernel/ops/geglu.py +2 -1
  11. liger_kernel/ops/group_norm.py +2 -1
  12. liger_kernel/ops/grpo_loss.py +3 -1
  13. liger_kernel/ops/layer_norm.py +88 -70
  14. liger_kernel/ops/poly_norm.py +390 -0
  15. liger_kernel/ops/rms_norm.py +7 -2
  16. liger_kernel/ops/tiled_mlp.py +136 -0
  17. liger_kernel/ops/utils.py +2 -0
  18. liger_kernel/transformers/__init__.py +33 -0
  19. liger_kernel/transformers/cross_entropy.py +8 -3
  20. liger_kernel/transformers/functional.py +29 -6
  21. liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
  22. liger_kernel/transformers/grpo_loss.py +56 -1
  23. liger_kernel/transformers/model/falcon_h1.py +122 -0
  24. liger_kernel/transformers/model/gemma.py +19 -7
  25. liger_kernel/transformers/model/gemma2.py +22 -7
  26. liger_kernel/transformers/model/gemma3.py +52 -14
  27. liger_kernel/transformers/model/glm4.py +18 -5
  28. liger_kernel/transformers/model/glm4v.py +18 -5
  29. liger_kernel/transformers/model/glm4v_moe.py +25 -5
  30. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  31. liger_kernel/transformers/model/internvl.py +157 -0
  32. liger_kernel/transformers/model/llama.py +16 -6
  33. liger_kernel/transformers/model/llama4.py +18 -5
  34. liger_kernel/transformers/model/llava.py +18 -6
  35. liger_kernel/transformers/model/loss_utils.py +31 -3
  36. liger_kernel/transformers/model/mistral.py +17 -7
  37. liger_kernel/transformers/model/mixtral.py +24 -9
  38. liger_kernel/transformers/model/mllama.py +14 -5
  39. liger_kernel/transformers/model/olmo2.py +18 -5
  40. liger_kernel/transformers/model/olmo3.py +142 -0
  41. liger_kernel/transformers/model/output_classes.py +147 -0
  42. liger_kernel/transformers/model/paligemma.py +41 -5
  43. liger_kernel/transformers/model/phi3.py +16 -8
  44. liger_kernel/transformers/model/qwen2.py +18 -4
  45. liger_kernel/transformers/model/qwen2_5_vl.py +21 -8
  46. liger_kernel/transformers/model/qwen2_vl.py +24 -7
  47. liger_kernel/transformers/model/qwen3.py +22 -6
  48. liger_kernel/transformers/model/qwen3_moe.py +27 -7
  49. liger_kernel/transformers/model/qwen3_next.py +146 -0
  50. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  51. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  52. liger_kernel/transformers/model/smollm3.py +17 -7
  53. liger_kernel/transformers/model/smolvlm.py +158 -0
  54. liger_kernel/transformers/monkey_patch.py +729 -4
  55. liger_kernel/transformers/poly_norm.py +42 -0
  56. liger_kernel/transformers/rms_norm.py +7 -0
  57. liger_kernel/transformers/rope.py +43 -0
  58. liger_kernel/transformers/swiglu.py +17 -0
  59. liger_kernel/transformers/tiled_mlp.py +133 -0
  60. liger_kernel/utils.py +25 -0
  61. {liger_kernel_nightly-0.6.2.dev20250919191028.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/METADATA +13 -6
  62. liger_kernel_nightly-0.6.4.dev20251202054858.dist-info/RECORD +118 -0
  63. liger_kernel_nightly-0.6.2.dev20250919191028.dist-info/RECORD +0 -105
  64. {liger_kernel_nightly-0.6.2.dev20250919191028.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/LICENSE +0 -0
  65. {liger_kernel_nightly-0.6.2.dev20250919191028.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/NOTICE +0 -0
  66. {liger_kernel_nightly-0.6.2.dev20250919191028.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/WHEEL +0 -0
  67. {liger_kernel_nightly-0.6.2.dev20250919191028.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/top_level.txt +0 -0
@@ -7,11 +7,12 @@ from typing import Union
7
7
  import torch
8
8
 
9
9
  from torch.distributed.fsdp import FullyShardedDataParallel
10
- from transformers.modeling_outputs import CausalLMOutputWithPast
11
10
  from transformers.utils.deprecation import deprecate_kwarg
12
11
 
13
12
  from liger_kernel.transformers.fsdp import _FSDPForwardRedirection
14
13
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
14
+ from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result
15
+ from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast
15
16
  from liger_kernel.utils import PEFT_AVAILABLE
16
17
 
17
18
  if TYPE_CHECKING:
@@ -38,7 +39,7 @@ def lce_forward(
38
39
  logits_to_keep: Union[int, torch.Tensor] = 0,
39
40
  skip_logits: Optional[bool] = None,
40
41
  **kwargs,
41
- ) -> Union[Tuple, CausalLMOutputWithPast]:
42
+ ) -> Union[Tuple, LigerCausalLMOutputWithPast]:
42
43
  r"""
43
44
  Args:
44
45
  labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -101,6 +102,8 @@ def lce_forward(
101
102
  shift_labels = kwargs.pop("shift_labels", None)
102
103
  logits = None
103
104
  loss = None
105
+ token_accuracy = None
106
+
104
107
  # if in training mode, don't materialize logits
105
108
  if skip_logits and labels is None and shift_labels is None:
106
109
  raise ValueError("skip_logits is True, but labels and shift_labels are None")
@@ -109,8 +112,9 @@ def lce_forward(
109
112
  # By default, if in training mode, don't materialize logits
110
113
  skip_logits = self.training and (labels is not None or shift_labels is not None)
111
114
 
115
+ # Compute loss
112
116
  if skip_logits:
113
- loss = lce_maybe_trainable_lm_head(
117
+ result = lce_maybe_trainable_lm_head(
114
118
  self,
115
119
  hidden_states=kept_hidden_states,
116
120
  hidden_size=self.config.hidden_size,
@@ -118,27 +122,33 @@ def lce_forward(
118
122
  shift_labels=shift_labels,
119
123
  **kwargs,
120
124
  )
125
+ loss, _, token_accuracy = unpack_cross_entropy_result(result)
121
126
 
122
127
  else:
123
128
  logits = self.lm_head(kept_hidden_states)
124
- if labels is not None:
129
+ if labels is not None or shift_labels is not None:
125
130
  loss = self.loss_function(
126
131
  logits=logits,
127
132
  labels=labels,
133
+ shift_labels=shift_labels,
128
134
  vocab_size=self.config.vocab_size,
129
135
  **kwargs,
130
136
  )
131
137
 
132
138
  if not return_dict:
133
- output = (logits,) + outputs[1:]
134
- return (loss,) + output if loss is not None else output
139
+ output_tuple = (logits,) + outputs[1:]
140
+ output = (loss,) + output_tuple if loss is not None else output_tuple
141
+ output = output + (token_accuracy,) if token_accuracy is not None else output
142
+ return output
135
143
 
136
- return CausalLMOutputWithPast(
144
+ # Return custom output class with token_accuracy field
145
+ return LigerCausalLMOutputWithPast(
137
146
  loss=loss,
138
147
  logits=logits,
139
148
  past_key_values=outputs.past_key_values,
140
149
  hidden_states=outputs.hidden_states,
141
150
  attentions=outputs.attentions,
151
+ token_accuracy=token_accuracy,
142
152
  )
143
153
 
144
154
 
@@ -0,0 +1,158 @@
1
+ from typing import TYPE_CHECKING
2
+ from typing import Optional
3
+ from typing import Union
4
+
5
+ import torch
6
+
7
+ from transformers.models.smolvlm.modeling_smolvlm import SmolVLMCausalLMOutputWithPast
8
+ from transformers.processing_utils import Unpack
9
+ from transformers.utils.generic import can_return_tuple
10
+
11
+ from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
12
+
13
+ if TYPE_CHECKING:
14
+ from transformers.cache_utils import Cache
15
+ from transformers.utils.generic import TransformersKwargs
16
+
17
+
18
+ # Forward adapted to enable fused Linear + CE without materializing logits.
19
+ # Mirrors the pattern used for other multimodal models (e.g., InternVL, LLaVA).
20
+ @can_return_tuple
21
+ def lce_forward(
22
+ self,
23
+ input_ids: Optional[torch.LongTensor] = None,
24
+ attention_mask: Optional[torch.Tensor] = None,
25
+ position_ids: Optional[torch.LongTensor] = None,
26
+ past_key_values: Optional["Cache"] = None,
27
+ inputs_embeds: Optional[torch.FloatTensor] = None,
28
+ pixel_values: Optional[torch.FloatTensor] = None,
29
+ pixel_attention_mask: Optional[torch.BoolTensor] = None,
30
+ image_hidden_states: Optional[torch.FloatTensor] = None,
31
+ labels: Optional[torch.LongTensor] = None,
32
+ use_cache: Optional[bool] = None,
33
+ output_attentions: Optional[bool] = None,
34
+ output_hidden_states: Optional[bool] = None,
35
+ cache_position: Optional[torch.LongTensor] = None,
36
+ return_dict: Optional[bool] = None,
37
+ logits_to_keep: Union[int, torch.Tensor] = 0,
38
+ skip_logits: Optional[bool] = None, # Added argument for liger-kernel
39
+ **lm_kwargs: Unpack["TransformersKwargs"], # renamed from kwargs
40
+ ) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
41
+ r"""
42
+ pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
43
+ Mask to avoid performing attention on padding pixel indices.
44
+ image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
45
+ The hidden states of the image encoder after modality projection.
46
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
47
+ Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
48
+ config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
49
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
50
+
51
+ Example:
52
+
53
+ ```python
54
+ >>> import requests
55
+ >>> import torch
56
+ >>> from PIL import Image
57
+ >>> from io import BytesIO
58
+
59
+ >>> from transformers import AutoProcessor, AutoModelForImageTextToText
60
+ >>> from transformers.image_utils import load_image
61
+
62
+ >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
63
+ >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
64
+ >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
65
+ >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
66
+
67
+ >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
68
+ >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto")
69
+
70
+ >>> # Create inputs
71
+ >>> messages = [
72
+ ... {
73
+ ... "role": "user",
74
+ ... "content": [
75
+ ... {"type": "video", "path": path/to/video},
76
+ ... {"type": "text", "text": "What is happening in this video?"},
77
+ ... ]
78
+ ... }
79
+ ... ]
80
+
81
+ >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True)
82
+
83
+ >>> # Generate
84
+ >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
85
+ >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
86
+
87
+ >>> print(generated_texts)
88
+ ```"""
89
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
90
+ output_hidden_states = (
91
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
92
+ )
93
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
94
+
95
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
96
+ outputs = self.model(
97
+ input_ids=input_ids,
98
+ attention_mask=attention_mask,
99
+ position_ids=position_ids,
100
+ past_key_values=past_key_values,
101
+ inputs_embeds=inputs_embeds,
102
+ pixel_values=pixel_values,
103
+ pixel_attention_mask=pixel_attention_mask,
104
+ image_hidden_states=image_hidden_states,
105
+ use_cache=use_cache,
106
+ output_attentions=output_attentions,
107
+ output_hidden_states=output_hidden_states,
108
+ cache_position=cache_position,
109
+ return_dict=True,
110
+ **lm_kwargs,
111
+ )
112
+
113
+ # Copied from llava.py
114
+ hidden_states = outputs[0]
115
+ # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
116
+ slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
117
+ kept_hidden_states = hidden_states[:, slice_indices, :]
118
+
119
+ shift_labels = lm_kwargs.pop("shift_labels", None)
120
+ logits = None
121
+ loss = None
122
+
123
+ if skip_logits and labels is None and shift_labels is None:
124
+ raise ValueError("skip_logits is True, but labels and shift_labels are None")
125
+
126
+ if skip_logits is None:
127
+ # By default, if in training mode, don't materialize logits
128
+ skip_logits = self.training and (labels is not None or shift_labels is not None)
129
+
130
+ if skip_logits:
131
+ loss = LigerForCausalLMLoss(
132
+ hidden_states=kept_hidden_states,
133
+ lm_head_weight=self.lm_head.weight,
134
+ labels=labels,
135
+ shift_labels=shift_labels,
136
+ hidden_size=self.config.text_config.hidden_size,
137
+ **lm_kwargs,
138
+ )
139
+
140
+ else:
141
+ logits = self.lm_head(kept_hidden_states)
142
+ if labels is not None or shift_labels is not None:
143
+ loss = self.loss_function(
144
+ logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs
145
+ )
146
+
147
+ if not return_dict:
148
+ output = (logits,) + outputs[1:]
149
+ return (loss,) + output if loss is not None else output
150
+
151
+ return SmolVLMCausalLMOutputWithPast(
152
+ loss=loss,
153
+ logits=logits,
154
+ past_key_values=outputs.past_key_values,
155
+ hidden_states=outputs.hidden_states,
156
+ attentions=outputs.attentions,
157
+ image_hidden_states=outputs.image_hidden_states,
158
+ )