liger-kernel-nightly 0.6.2.dev20251024130145__py3-none-any.whl → 0.6.2.dev20251027145804__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
--- a/liger_kernel/transformers/__init__.py
+++ b/liger_kernel/transformers/__init__.py
@@ -57,6 +57,7 @@ if TYPE_CHECKING:
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_moe  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_next  # noqa: F401
     from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smollm3  # noqa: F401
+    from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_smolvlm  # noqa: F401


 # Check if 'transformers' is installed
@@ -120,6 +121,7 @@ def __getattr__(name: str):
         "apply_liger_kernel_to_qwen3_moe",
         "apply_liger_kernel_to_qwen3_next",
         "apply_liger_kernel_to_smollm3",
+        "apply_liger_kernel_to_smolvlm",
     }

     if name in monkey_patch_symbols:
@@ -189,5 +191,6 @@ if _TRANSFORMERS_AVAILABLE:
             "apply_liger_kernel_to_qwen3_moe",
             "apply_liger_kernel_to_qwen3_next",
             "apply_liger_kernel_to_smollm3",
+            "apply_liger_kernel_to_smolvlm",
         ]
     )
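With the lazy-export plumbing above, the new symbol is importable from the package root exactly like its siblings: `__getattr__` resolves it from `liger_kernel.transformers.monkey_patch` only when `transformers` is installed. A minimal sanity check, assuming this nightly build is installed:

```python
# Resolved lazily via liger_kernel.transformers.__getattr__, so the import
# only touches monkey_patch once transformers is confirmed available.
from liger_kernel.transformers import apply_liger_kernel_to_smolvlm

print(apply_liger_kernel_to_smolvlm.__module__)  # liger_kernel.transformers.monkey_patch
```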
--- /dev/null
+++ b/liger_kernel/transformers/model/smolvlm.py
@@ -0,0 +1,158 @@
+from typing import TYPE_CHECKING
+from typing import Optional
+from typing import Union
+
+import torch
+
+from transformers.models.smolvlm.modeling_smolvlm import SmolVLMCausalLMOutputWithPast
+from transformers.processing_utils import Unpack
+from transformers.utils.generic import can_return_tuple
+
+from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
+
+if TYPE_CHECKING:
+    from transformers.cache_utils import Cache
+    from transformers.utils.generic import TransformersKwargs
+
+
+# Forward adapted to enable fused Linear + CE without materializing logits.
+# Mirrors the pattern used for other multimodal models (e.g., InternVL, LLaVA).
+@can_return_tuple
+def lce_forward(
+    self,
+    input_ids: Optional[torch.LongTensor] = None,
+    attention_mask: Optional[torch.Tensor] = None,
+    position_ids: Optional[torch.LongTensor] = None,
+    past_key_values: Optional["Cache"] = None,
+    inputs_embeds: Optional[torch.FloatTensor] = None,
+    pixel_values: Optional[torch.FloatTensor] = None,
+    pixel_attention_mask: Optional[torch.BoolTensor] = None,
+    image_hidden_states: Optional[torch.FloatTensor] = None,
+    labels: Optional[torch.LongTensor] = None,
+    use_cache: Optional[bool] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    cache_position: Optional[torch.LongTensor] = None,
+    return_dict: Optional[bool] = None,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
+    skip_logits: Optional[bool] = None,  # Added argument for liger-kernel
+    **lm_kwargs: Unpack["TransformersKwargs"],  # renamed from kwargs
+) -> Union[tuple, SmolVLMCausalLMOutputWithPast]:
+    r"""
+    pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
+        Mask to avoid performing attention on padding pixel indices.
+    image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
+        The hidden states of the image encoder after modality projection.
+    labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+        config.vocab_size]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
+        ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+    Example:
+
+    ```python
+    >>> import requests
+    >>> import torch
+    >>> from PIL import Image
+    >>> from io import BytesIO
+
+    >>> from transformers import AutoProcessor, AutoModelForImageTextToText
+    >>> from transformers.image_utils import load_image
+
+    >>> # Note that passing the image urls (instead of the actual pil images) to the processor is also possible
+    >>> image1 = load_image("https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg")
+    >>> image2 = load_image("https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg")
+    >>> image3 = load_image("https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg")
+
+    >>> processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")
+    >>> model = AutoModelForImageTextToText.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto")
+
+    >>> # Create inputs
+    >>> messages = [
+    ...     {
+    ...         "role": "user",
+    ...         "content": [
+    ...             {"type": "video", "path": "path/to/video"},
+    ...             {"type": "text", "text": "What is happening in this video?"},
+    ...         ]
+    ...     }
+    ... ]
+
+    >>> inputs = processor.apply_chat_template([messages], add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
+
+    >>> # Generate
+    >>> generated_ids = model.generate(**inputs, max_new_tokens=256)
+    >>> generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
+
+    >>> print(generated_texts)
+    ```"""
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    # decoder outputs consist of (dec_features, layer_state, dec_hidden, dec_attn)
+    outputs = self.model(
+        input_ids=input_ids,
+        attention_mask=attention_mask,
+        position_ids=position_ids,
+        past_key_values=past_key_values,
+        inputs_embeds=inputs_embeds,
+        pixel_values=pixel_values,
+        pixel_attention_mask=pixel_attention_mask,
+        image_hidden_states=image_hidden_states,
+        use_cache=use_cache,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        cache_position=cache_position,
+        return_dict=True,
+        **lm_kwargs,
+    )
+
+    # Copied from llava.py
+    hidden_states = outputs[0]
+    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+    kept_hidden_states = hidden_states[:, slice_indices, :]
+
+    shift_labels = lm_kwargs.pop("shift_labels", None)
+    logits = None
+    loss = None
+
+    if skip_logits and labels is None and shift_labels is None:
+        raise ValueError("skip_logits is True, but labels and shift_labels are None")
+
+    if skip_logits is None:
+        # By default, if in training mode, don't materialize logits
+        skip_logits = self.training and (labels is not None or shift_labels is not None)
+
+    if skip_logits:
+        loss = LigerForCausalLMLoss(
+            hidden_states=kept_hidden_states,
+            lm_head_weight=self.lm_head.weight,
+            labels=labels,
+            shift_labels=shift_labels,
+            hidden_size=self.config.text_config.hidden_size,
+            **lm_kwargs,
+        )
+
+    else:
+        logits = self.lm_head(kept_hidden_states)
+        if labels is not None or shift_labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **lm_kwargs
+            )
+
+    if not return_dict:
+        output = (logits,) + outputs[1:]
+        return (loss,) + output if loss is not None else output
+
+    return SmolVLMCausalLMOutputWithPast(
+        loss=loss,
+        logits=logits,
+        past_key_values=outputs.past_key_values,
+        hidden_states=outputs.hidden_states,
+        attentions=outputs.attentions,
+        image_hidden_states=outputs.image_hidden_states,
+    )
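The `skip_logits` branch above is the point of the new file: during training, `LigerForCausalLMLoss` fuses the `lm_head` projection with the cross-entropy reduction, so the full `[batch, seq_len, vocab_size]` logits tensor is never materialized. A rough plain-PyTorch sketch of the same memory idea, chunked over rows. This is illustrative only: `chunked_linear_cross_entropy` is a made-up name, labels are assumed pre-shifted (cf. `shift_labels`), and Liger's actual Triton kernel also fuses the backward pass rather than looping in Python.

```python
import torch
import torch.nn.functional as F


def chunked_linear_cross_entropy(
    hidden: torch.Tensor,          # [batch, seq_len, hidden_size]
    lm_head_weight: torch.Tensor,  # [vocab_size, hidden_size]
    labels: torch.Tensor,          # [batch, seq_len], pre-shifted, -100 = ignore
    chunk_size: int = 1024,
) -> torch.Tensor:
    # Flatten to rows so a bounded number of logits is live at a time.
    h = hidden.reshape(-1, hidden.size(-1))
    y = labels.reshape(-1)
    n_valid = (y != -100).sum().clamp(min=1)
    total = hidden.new_zeros(())
    for start in range(0, h.size(0), chunk_size):
        # Only [chunk_size, vocab_size] logits ever exist at once,
        # instead of [batch * seq_len, vocab_size].
        logits = h[start : start + chunk_size] @ lm_head_weight.T
        total = total + F.cross_entropy(
            logits, y[start : start + chunk_size], ignore_index=-100, reduction="sum"
        )
    return total / n_valid
```

That bounded-logits behavior is what the `fused_linear_cross_entropy=True` default buys during training; at inference, where `skip_logits` resolves to False, the forward materializes logits as usual.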
--- a/liger_kernel/transformers/monkey_patch.py
+++ b/liger_kernel/transformers/monkey_patch.py
@@ -2112,6 +2112,106 @@ def apply_liger_kernel_to_internvl(
         logger.warning(f"{vision_model_name} is not supported by Liger kernel.")


+def apply_liger_kernel_to_smolvlm(
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    layer_norm: bool = True,
+    model: Optional[PreTrainedModel] = None,
+    **kwargs,
+) -> None:
+    """
+    Apply Liger kernels to replace the original implementation in HuggingFace SmolVLM models.
+    Because SmolVLM wraps separate text and vision models, the loaded model instance must be passed for Liger-Kernel's patches to reach the models connected to SmolVLM.
+    However, if a language model not supported by Liger-Kernel is connected to SmolVLM, unexpected side effects may occur.
+    NOTE: SmolVLM is not available in transformers<4.50.0
+
+    Args:
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized, which is more memory-efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+            loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.smolvlm import modeling_smolvlm
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMEncoderLayer
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMModel
+    from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
+
+    from liger_kernel.transformers.model.smolvlm import lce_forward as smolvlm_lce_forward
+
+    # Patch LayerNorm for the vision model if model is not provided (pre-initialization)
+    if layer_norm and model is None:
+        modeling_smolvlm.nn.LayerNorm = LigerLayerNorm
+
+    if cross_entropy:
+        logger.info("Apply liger cross entropy")
+
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(smolvlm_lce_forward, model)
+        else:
+            modeling_smolvlm.SmolVLMForConditionalGeneration.forward = smolvlm_lce_forward
+    if rms_norm:
+        modeling_smolvlm.SmolVLMRMSNorm = LigerRMSNorm
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+        if isinstance(model, SmolVLMForConditionalGeneration):
+            text_model = model.model.text_model
+            vision_model: SmolVLMVisionTransformer = model.model.vision_model
+        elif isinstance(model, SmolVLMModel):
+            text_model = model.text_model
+            vision_model: SmolVLMVisionTransformer = model.vision_model
+        else:
+            raise TypeError(
+                f"Unsupported smolvlm model type. `model` must be `SmolVLMForConditionalGeneration` or `SmolVLMModel`. Got: {type(model)}"
+            )
+
+        text_model_name = model.config.text_config.model_type
+        text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
+
+        kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
+        if text_liger_fn:
+            accept_params = inspect.signature(text_liger_fn).parameters
+            remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
+            text_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+
+            if remain_params:
+                logger.warning(
+                    f"These parameters are not supported by {text_model_name}: {list(remain_params)}. Applying only the supported parameters: {list(text_kwargs.keys())}.\n"
+                    f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
+                )
+            text_kwargs["model"] = text_model
+            text_liger_fn(**text_kwargs)
+        elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+            logger.warning(f"{text_model_name} is not supported by Liger kernel.")
+
+        # Patch vision model LayerNorm layers
+        if layer_norm:
+            # Patch post_layernorm
+            _patch_layer_norm_module(vision_model.post_layernorm)
+
+            # Patch encoder layers
+            for encoder_layer in vision_model.encoder.layers:
+                encoder_layer: SmolVLMEncoderLayer
+                _patch_layer_norm_module(encoder_layer.layer_norm1)
+                _patch_layer_norm_module(encoder_layer.layer_norm2)
+
+
 def apply_liger_kernel_to_falcon_h1(
     rope: bool = True,
     cross_entropy: bool = False,
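For context, a hedged usage sketch of the new entry point, mirroring the other multimodal patches such as InternVL; the checkpoint name is taken from the docstring example above, not prescribed by this diff:

```python
import torch
from transformers import AutoModelForImageTextToText
from liger_kernel.transformers import apply_liger_kernel_to_smolvlm

# Option A: patch before loading. Module-level classes (LayerNorm, RMSNorm,
# SmolVLMForConditionalGeneration.forward) are replaced, so models constructed
# afterwards pick up the Liger kernels automatically.
apply_liger_kernel_to_smolvlm()
model = AutoModelForImageTextToText.from_pretrained(
    "HuggingFaceTB/SmolVLM2-2.2B-Instruct", dtype=torch.bfloat16, device_map="auto"
)

# Option B (alternative): patch an already-loaded instance instead. This also
# dispatches to the connected text model via MODEL_TYPE_TO_APPLY_LIGER_FN and
# patches the vision encoder's LayerNorm modules in place.
# apply_liger_kernel_to_smolvlm(model=model)
```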
@@ -2304,6 +2404,7 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
     "phi3": apply_liger_kernel_to_phi3,
     "paligemma": apply_liger_kernel_to_paligemma,
     "falcon_h1": apply_liger_kernel_to_falcon_h1,
+    "smolvlm": apply_liger_kernel_to_smolvlm,
 }


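Registering `"smolvlm"` in `MODEL_TYPE_TO_APPLY_LIGER_FN` is what lets `model_type`-keyed dispatch, including the text-model hand-off inside the multimodal patches above, find the new function. A minimal check of the lookup:

```python
from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN

# As of this nightly, SmolVLM is resolvable by its config.model_type.
patch_fn = MODEL_TYPE_TO_APPLY_LIGER_FN["smolvlm"]
assert patch_fn.__name__ == "apply_liger_kernel_to_smolvlm"
```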
--- a/liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/METADATA
+++ b/liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.6.2.dev20251024130145
+Version: 0.6.2.dev20251027145804
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
--- a/liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/RECORD
+++ b/liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/RECORD
@@ -42,7 +42,7 @@ liger_kernel/ops/tvd.py,sha256=FHJtLQI95ijqgg9UtaHpMAjSCiPxB6CduPwPMcGxelc,6405
 liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
 liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
-liger_kernel/transformers/__init__.py,sha256=JovUTGIMKlQGiuoHIICmJqwBWUc9lkdZFNHBToR8bpY,9301
+liger_kernel/transformers/__init__.py,sha256=MAAd-YqPdG-j_sbrIE43nrICpA4xTg-dx6M06KWLMFU,9486
 liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
 liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
 liger_kernel/transformers/dyt.py,sha256=i-4GPaMrl-jab9TVI5qN0-H9qycn_mCbV82ozU4nbmU,723
@@ -59,7 +59,7 @@ liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCc
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
 liger_kernel/transformers/llama4_rope.py,sha256=kS6PSHEwf3dS7hD7C7p8S0geugx2EMCiP0h0F7LsUoY,3639
-liger_kernel/transformers/monkey_patch.py,sha256=z_AQGc82vvjh0qHC0bwHOuEEeh9sVm2QaWeuySfGZHE,110766
+liger_kernel/transformers/monkey_patch.py,sha256=NWinrSt9_h4aF2Uax8jZ3of_z1LGmJY_yW9fW6EDieU,115774
 liger_kernel/transformers/multi_token_attention.py,sha256=K3NIY9_5TPgZ4_Rahn0xnkMXxD_fmlJHK4CWGYvGQp0,1752
 liger_kernel/transformers/poly_norm.py,sha256=g5tC75i3qy1_N26ZUP-jfpct7ivQAEdJfIfx8IXzeyE,1377
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
@@ -98,13 +98,14 @@ liger_kernel/transformers/model/qwen3.py,sha256=Q2aOg5erPrgVgRcqJm8sefLSDtvU1AD5
 liger_kernel/transformers/model/qwen3_moe.py,sha256=1CwTMCNFDYsjGoa_aHFBagtC5HuJTV-s0__5UvcjD3A,5686
 liger_kernel/transformers/model/qwen3_next.py,sha256=7To7azriAogxeE7oEvByKztH9154dnDiDVNHHm7PZK4,5632
 liger_kernel/transformers/model/smollm3.py,sha256=0KWVkDtXbjsBKhJnaquV6vUUYyLtfmNwYH0sxJt-qTk,7667
+liger_kernel/transformers/model/smolvlm.py,sha256=yFpPKawLVo3zXzLjM7Y_T8FyRrPxVyp-YPFMM8m3k0c,6734
 liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7HHWHwku25A-GYL0WU,193
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/METADATA,sha256=Ul0Uqgh8q-LE7R-llbkcKqrHrvoWPk2nm6MimD0Yzsg,24777
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.6.2.dev20251024130145.dist-info/RECORD,,
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/METADATA,sha256=fTlOtX-hdTLxvZv3VJoN6zxxhJHCx_UV61frAGslfx8,24777
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.6.2.dev20251027145804.dist-info/RECORD,,