liger-kernel 0.5.6__py3-none-any.whl → 0.5.8__py3-none-any.whl

This diff shows the content changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
liger_kernel/transformers/model/olmo2.py

@@ -10,10 +10,12 @@ from transformers.models.olmo2.modeling_olmo2 import _CONFIG_FOR_DOC
 from transformers.models.olmo2.modeling_olmo2 import OLMO2_INPUTS_DOCSTRING
 from transformers.utils import add_start_docstrings_to_model_forward
 from transformers.utils import replace_return_docstrings
+from transformers.utils.deprecation import deprecate_kwarg

 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss


+@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 @add_start_docstrings_to_model_forward(OLMO2_INPUTS_DOCSTRING)
 @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 def lce_forward(
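The `deprecate_kwarg` decorator is what keeps old call sites working across the rename. A minimal sketch of its effect, using an illustrative stub rather than the real `lce_forward`:

```python
# Illustrative stub, not the real lce_forward: the deprecated keyword is remapped
# to the new name and a deprecation warning is emitted, so existing callers that
# still pass num_logits_to_keep keep working.
from transformers.utils.deprecation import deprecate_kwarg


@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
def forward_stub(logits_to_keep=0):
    return logits_to_keep


print(forward_stub(num_logits_to_keep=5))  # prints 5 and warns about the rename
```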
@@ -29,7 +31,7 @@ def lce_forward(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    num_logits_to_keep: int = 0,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
     **loss_kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
@@ -39,10 +41,12 @@ def lce_forward(
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

-        num_logits_to_keep (`int`, *optional*):
-            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
             token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).

     Returns:

@@ -98,7 +102,8 @@ def lce_forward(
         )

     else:  # if in inference mode materialize logits
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
         if labels is not None:
             loss = self.loss_function(
                 logits=logits,
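The new slicing logic can be exercised on its own. A standalone sketch with hypothetical tensors (no model involved), showing that an `int` keeps the trailing positions while a 1D index tensor keeps arbitrary positions, which is what packed-sequence callers rely on:

```python
import torch

hidden_states = torch.randn(2, 6, 8)  # (batch, seq_len, hidden) -- made-up shapes

# An int keeps the last N positions (0 keeps everything, matching the docstring);
# a 1D tensor keeps explicit sequence indices.
for logits_to_keep in (0, 1, torch.tensor([0, 3, 5])):
    slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
    kept = hidden_states[:, slice_indices, :]
    print(kept.shape)  # [2, 6, 8], then [2, 1, 8], then [2, 3, 8]
```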
liger_kernel/transformers/model/phi3.py

@@ -11,6 +11,7 @@ from transformers.models.phi3.modeling_phi3 import _CONFIG_FOR_DOC
 from transformers.models.phi3.modeling_phi3 import PHI3_INPUTS_DOCSTRING
 from transformers.utils import add_start_docstrings_to_model_forward
 from transformers.utils import replace_return_docstrings
+from transformers.utils.deprecation import deprecate_kwarg

 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
@@ -126,6 +127,7 @@ def lce_forward_deprecated(
     )


+@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
 @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 def lce_forward(
@@ -141,7 +143,7 @@ def lce_forward(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    num_logits_to_keep: int = 0,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
     **loss_kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
@@ -151,10 +153,12 @@ def lce_forward(
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

-        num_logits_to_keep (`int`, *optional*):
-            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
             token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).

     Returns:

@@ -223,7 +227,8 @@ def lce_forward(
         )

     else:  # if in inference mode materialize logits
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
         if labels is not None:
             loss = self.loss_function(
                 logits=logits,

liger_kernel/transformers/model/qwen2.py

@@ -11,6 +11,7 @@ from transformers.models.qwen2.modeling_qwen2 import _CONFIG_FOR_DOC
 from transformers.models.qwen2.modeling_qwen2 import QWEN2_INPUTS_DOCSTRING
 from transformers.utils import add_start_docstrings_to_model_forward
 from transformers.utils import replace_return_docstrings
+from transformers.utils.deprecation import deprecate_kwarg

 from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
@@ -125,6 +126,7 @@ def lce_forward_deprecated(
     )


+@deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
 @add_start_docstrings_to_model_forward(QWEN2_INPUTS_DOCSTRING)
 @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
 def lce_forward(
@@ -140,7 +142,7 @@ def lce_forward(
     output_hidden_states: Optional[bool] = None,
     return_dict: Optional[bool] = None,
     cache_position: Optional[torch.LongTensor] = None,
-    num_logits_to_keep: int = 0,
+    logits_to_keep: Union[int, torch.Tensor] = 0,
     **loss_kwargs,
 ) -> Union[Tuple, CausalLMOutputWithPast]:
     r"""
@@ -150,10 +152,12 @@ def lce_forward(
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

-        num_logits_to_keep (`int`, *optional*):
-            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
+        logits_to_keep (`int` or `torch.Tensor`, *optional*):
+            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
             token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
+            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
+            This is useful when using packed tensor format (single dimension for batch and sequence length).

     Returns:

@@ -209,7 +213,8 @@ def lce_forward(
         )

     else:  # if in inference mode materialize logits
-        logits = self.lm_head(hidden_states[:, -num_logits_to_keep:, :])
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
         if labels is not None:
             loss = self.loss_function(
                 logits=logits,
liger_kernel/transformers/monkey_patch.py

@@ -694,6 +694,177 @@ def apply_liger_kernel_to_gemma2(
                 _patch_rms_norm_module_for_gemma2(decoder_layer.post_feedforward_layernorm)


+def apply_liger_kernel_to_gemma3_text(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    geglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Gemma3
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.gemma3 import modeling_gemma3
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3DecoderLayer
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3ForCausalLM
+
+    from liger_kernel.transformers.gema3_rms import LigerRMSNormForGemma3
+    from liger_kernel.transformers.model.gemma3 import causal_forward
+
+    _patch_rms_norm_module_for_gemma3 = partial(
+        _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+    )
+
+    if rope:
+        modeling_gemma3.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_gemma3.Gemma3RMSNorm = LigerRMSNormForGemma3
+
+    if geglu:
+        modeling_gemma3.Gemma3MLP = LigerGEGLUMLP
+
+    # Handle loss function
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        modeling_gemma3.Gemma3ForCausalLM.forward = causal_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        if isinstance(model, Gemma3ForCausalLM):
+            # get the base model from the model instance
+            base_model = model.model
+
+            if rms_norm:
+                _patch_rms_norm_module_for_gemma3(base_model.norm)
+
+            for decoder_layer in base_model.layers:
+                decoder_layer: Gemma3DecoderLayer
+                if geglu:
+                    _bind_method_to_module(decoder_layer.mlp, "forward", LigerGEGLUMLP.forward)
+                if rms_norm:
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.input_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.post_attention_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.pre_feedforward_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.post_feedforward_layernorm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.self_attn.q_norm)
+                    _patch_rms_norm_module_for_gemma3(decoder_layer.self_attn.k_norm)
+
+        else:
+            raise TypeError("The model must be Gemma3ForCausalLM.")
+
+
+def apply_liger_kernel_to_gemma3(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    layer_norm: bool = True,
+    rms_norm: bool = True,
+    geglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Gemma3
+
+    Args:
+        rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+        fused_linear_cross_entropy (bool):
+            Whether to apply Liger's fused linear cross entropy loss. Default is True.
+            `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+        geglu (bool): Whether to apply Liger's GeGLU MLP. Default is True.
+        model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+        loaded. Default is None.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+    )
+
+    from transformers.models.gemma3 import modeling_gemma3
+    from transformers.models.gemma3.modeling_gemma3 import Gemma3ForConditionalGeneration
+    from transformers.models.siglip import modeling_siglip
+    from transformers.models.siglip.modeling_siglip import SiglipEncoderLayer
+    from transformers.models.siglip.modeling_siglip import SiglipVisionModel
+
+    from liger_kernel.transformers.model.gemma3 import multimodal_forward
+
+    _patch_rms_norm_module_for_gemma3 = partial(
+        _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
+    )
+
+    if layer_norm:
+        modeling_siglip.nn.LayerNorm = LigerLayerNorm
+
+    apply_liger_kernel_to_gemma3_text(
+        rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+    )
+
+    if cross_entropy:
+        modeling_gemma3.nn.CrossEntropyLoss = LigerCrossEntropyLoss
+
+    if fused_linear_cross_entropy:
+        modeling_gemma3.Gemma3ForConditionalGeneration.forward = multimodal_forward
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        if isinstance(model, Gemma3ForConditionalGeneration):
+            if isinstance(model.vision_tower, SiglipVisionModel):
+                vision_tower = model.vision_tower
+
+                _patch_layer_norm_module(vision_tower.vision_model.post_layernorm)
+
+                for layer in vision_tower.vision_model.encoder.layers:
+                    layer: SiglipEncoderLayer
+                    if layer_norm:
+                        _patch_layer_norm_module(layer.layer_norm1)
+                        _patch_layer_norm_module(layer.layer_norm2)
+            else:
+                raise TypeError("The vision tower must be SiglipVisionModel")
+
+            if rms_norm:
+                _patch_rms_norm_module_for_gemma3(model.multi_modal_projector.mm_soft_emb_norm)
+
+            apply_liger_kernel_to_gemma3_text(
+                rope=rope,
+                cross_entropy=False,
+                fused_linear_cross_entropy=False,
+                rms_norm=rms_norm,
+                geglu=geglu,
+                model=model.language_model,
+            )
+
+        else:
+            raise TypeError("The model must be Gemma3ForConditionalGeneration.")
+
+
 def apply_liger_kernel_to_paligemma(
     rope: bool = True,
     cross_entropy: bool = False,
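For a model that has already been instantiated, the new patcher can also be applied to the live instance via its `model=` argument. A hedged usage sketch (the checkpoint id is illustrative, and `apply_liger_kernel_to_gemma3_text` is assumed to be re-exported from `liger_kernel.transformers`, as the README rows later in this diff indicate):

```python
import transformers

from liger_kernel.transformers import apply_liger_kernel_to_gemma3_text

# Load a text-only Gemma3 checkpoint (illustrative id), then patch the instance:
# the already-instantiated RMSNorm and MLP modules are swapped in place, and the
# causal-LM forward is replaced with the fused-linear-cross-entropy variant.
model = transformers.AutoModelForCausalLM.from_pretrained("google/gemma-3-1b-it")
apply_liger_kernel_to_gemma3_text(rope=True, rms_norm=True, geglu=True, model=model)
```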
@@ -1152,6 +1323,8 @@ def apply_liger_kernel_to_olmo2(
 MODEL_TYPE_TO_APPLY_LIGER_FN = {
     "gemma": apply_liger_kernel_to_gemma,
     "gemma2": apply_liger_kernel_to_gemma2,
+    "gemma3_text": apply_liger_kernel_to_gemma3_text,
+    "gemma3": apply_liger_kernel_to_gemma3,
     "llama": apply_liger_kernel_to_llama,
     "llava": apply_liger_kernel_to_llava,
     "granite": apply_liger_kernel_to_granite,
liger_kernel-0.5.6.dist-info/METADATA → liger_kernel-0.5.8.dist-info/METADATA

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: liger_kernel
-Version: 0.5.6
+Version: 0.5.8
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -314,6 +314,8 @@ loss.backward()
 | Mixtral | `liger_kernel.transformers.apply_liger_kernel_to_mixtral` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma1 | `liger_kernel.transformers.apply_liger_kernel_to_gemma` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Gemma2 | `liger_kernel.transformers.apply_liger_kernel_to_gemma2` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| Gemma3 (Text) | `liger_kernel.transformers.apply_liger_kernel_to_gemma3_text` | RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| Gemma3 (Multimodal) | `liger_kernel.transformers.apply_liger_kernel_to_gemma3` | LayerNorm, RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Paligemma, Paligemma2, & Paligemma2 Mix | `liger_kernel.transformers.apply_liger_kernel_to_paligemma` | LayerNorm, RoPE, RMSNorm, GeGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Qwen2, Qwen2.5, & QwQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Qwen2-VL, & QVQ | `liger_kernel.transformers.apply_liger_kernel_to_qwen2_vl` | RMSNorm, LayerNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
liger_kernel-0.5.6.dist-info/RECORD → liger_kernel-0.5.8.dist-info/RECORD

@@ -7,10 +7,10 @@ liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNic
 liger_kernel/chunked_loss/dpo_loss.py,sha256=xZwGqS04si9zXyob95SAdalC-hajZg8fWINqiqffN8k,5855
 liger_kernel/chunked_loss/functional.py,sha256=9G3nKm-Bi7uoZRFkL8wwGMl6juDl4bSzDvTa5GHZPzg,955
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=ooR-qnZCyWJN935oHCSWLaKKKyaYERyhNczRGi1VOiw,11935
-liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=-E4AuWY-y2bMo_kAmEQBgQ92UJh3L5IiCRGVcfMJOCE,12731
+liger_kernel/chunked_loss/fused_linear_ppo.py,sha256=AA19cpv6D8mo5RbSK5GRCcZoOSnpxV_Z1eJlAsC5eic,13434
 liger_kernel/chunked_loss/fused_linear_preference.py,sha256=ojB42jYPu0c4ki96Ft-hy7Sf6fh_WikG-aWNrlZzSio,18362
 liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=RiuK3UtRwH9T6jZ36sA8Urj-TVuOLOO2syLg_JOQapY,13437
-liger_kernel/chunked_loss/grpo_loss.py,sha256=6Mb4ZT6MfnOr4Xo681rMR0LKkhzJhInvQp8wp2YVMK0,8913
+liger_kernel/chunked_loss/grpo_loss.py,sha256=kuqHkYV383sUxqJN-DMsfADHi2hxHVyKx5S24TNc8bQ,10866
 liger_kernel/chunked_loss/jsd_loss.py,sha256=u2ahkuHsbhpNaKcpBCz5gCMDk9ou-P04DHji592dIBo,7067
 liger_kernel/chunked_loss/kto_loss.py,sha256=llVCe6DkcpCo57seGWoMikaQVFApx764jsmSbQyqwQY,7529
 liger_kernel/chunked_loss/orpo_loss.py,sha256=nu9UYG16dcMw93lvHi4_hYs3Q0FK1KnlmMRj7OpYU8s,4872
@@ -22,18 +22,18 @@ liger_kernel/ops/fused_linear_cross_entropy.py,sha256=1Y3Uk_TCSjqKgoG2eot1ptnWXJ
 liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
 liger_kernel/ops/geglu.py,sha256=axGvCIvlBzuluoAIrWTsp2iZM4BFKNInkPov8YVvH9E,4126
 liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
-liger_kernel/ops/jsd.py,sha256=rkloGA7nDfVaa5nKY6-EYBw0E1p_MSsl4fr2xZGTp04,6961
-liger_kernel/ops/kl_div.py,sha256=NkG7D6_DnPBzr-ohhYiQbRBnq_fbGmpn5UU7y0UBKQo,8420
-liger_kernel/ops/layer_norm.py,sha256=6roQjioyg-9O2qLPV8nL4U0-5UH80tdzOMTWwjvDnn8,7961
+liger_kernel/ops/jsd.py,sha256=onHp5T3MbvJaVz5Vup7Ww6EQp_HTaZeayTjJk6FgQMY,7042
+liger_kernel/ops/kl_div.py,sha256=ZjGdDLKWksHT9dZ0xF_TDgAkj5cuMTwwT5tr9E-_24o,8734
+liger_kernel/ops/layer_norm.py,sha256=vWCyOm-F2GMAilB-ozJcFeUQQLCJoTE_uiXq-_0uYuI,8356
 liger_kernel/ops/qwen2vl_mrope.py,sha256=3GExhYpLgB4VUtyZyjRk8XjEur3W4EWF6HQ67ML5vBU,8481
-liger_kernel/ops/rms_norm.py,sha256=PWLJcdIKU5e-8BuYFHd9Cqlq6wmr6fUXKi9zQD4LetU,11727
+liger_kernel/ops/rms_norm.py,sha256=PP27OIBmV9By63i13jot9ylDowW0nuxY_JFIkaPLgL4,12078
 liger_kernel/ops/rope.py,sha256=ofmBOkUpZZO-Q8Z5B_LOFYYLD-YT-8WnJ4vGOrDYouI,8943
 liger_kernel/ops/swiglu.py,sha256=KmgMjaJQnbLLgZn2nEpbwHU_xpnYRweCyrLQSVvM1vA,3015
 liger_kernel/ops/tvd.py,sha256=FHJtLQI95ijqgg9UtaHpMAjSCiPxB6CduPwPMcGxelc,6405
 liger_kernel/ops/utils.py,sha256=uoFKQqo-34N2TWQNvXMFywqGiOMMXNEVBxVojzlUAa0,3836
 liger_kernel/ops/experimental/embedding.py,sha256=tolj3tItkzpSb30zWqDN2_yX4ectflaQ8HMyKyFIQc8,4172
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=TrS9lpwekrik_w5qE7AhMJD1bcq-OidjtbsW80oZ6IM,13314
-liger_kernel/transformers/__init__.py,sha256=t70gqygxH63iz-B0MOdZx4AEgA8MfqU1G7N6dvIneCY,2618
+liger_kernel/transformers/__init__.py,sha256=SH30Pt2ZqyQY-mmWQldg_r-5koowuymTIoU4F4e1KHk,6419
 liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
 liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
 liger_kernel/transformers/dyt.py,sha256=QMqqc14pkE0WhpRZvapfnNAun-6C0C_tHExL2ZJuCUA,648
@@ -41,11 +41,12 @@ liger_kernel/transformers/functional.py,sha256=4h9Pdx_iINBqfv2Zod_c27qOpYXDDwbdV
 liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=09Rt7FZzLH42VOcIbQ4dlQd0o3Rlb4vk6fqiOQ7WTD8,1778
 liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
 liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
+liger_kernel/transformers/gema3_rms.py,sha256=LTmZOXe6WEnv6ZroW-kU1TE2B36-z5v8OLmKr3XEVFo,353
 liger_kernel/transformers/group_norm.py,sha256=6qMAWOprr4SzP0YhNVNGQIBpM5aUHplUD2VuGJrMBz0,2173
 liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
 liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
 liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
-liger_kernel/transformers/monkey_patch.py,sha256=95afvIrZA9xSWLNIJspBLbz8lxv2Y5gfZke7MyqoOX8,56965
+liger_kernel/transformers/monkey_patch.py,sha256=QpfNU7MmVDGlBWIZ2RLTSyh0vuZ-si7H37SL-qOliUs,64393
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
 liger_kernel/transformers/rms_norm.py,sha256=GqCEJuGt0YdqqlMcToE0Wp4A8YFquDa4UUSyH2uFW2A,1191
 liger_kernel/transformers/rope.py,sha256=ZTrTORSAyfcFIKjk6XEeYmk4ROH7xXED9L4g2NFntlE,999
@@ -54,27 +55,28 @@ liger_kernel/transformers/trainer_integration.py,sha256=W3ON51O5GkyzNJsItz0y5rKx
 liger_kernel/transformers/tvd.py,sha256=XrRfyJIqN6HFxXk8MYyFVZM1OLz3mtSbRZvWfZ_JerQ,450
 liger_kernel/transformers/experimental/embedding.py,sha256=2P0QYdlFyFrG5OqTzTa1wcRgDSyjBMv5i1a7BrDPDQw,881
 liger_kernel/transformers/model/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-liger_kernel/transformers/model/gemma.py,sha256=7cBTljzh-8_ACBhYl6NUfj5_ux92YRlmnAU5gfDAQAI,9312
-liger_kernel/transformers/model/gemma2.py,sha256=X0FOIhvFlTrmWI7Ws06wUkutgHW3lWtLOnnHp1NgZ3A,10403
-liger_kernel/transformers/model/llama.py,sha256=d9rBaK8e8RSMCFHdgom9ZHuXOlnh6U_o-GkAFGRNGOY,9989
+liger_kernel/transformers/model/gemma.py,sha256=-JoHKWjtYPpxHQa6QbCwnzX_cctRZG2ZTsaUv-dmOt4,9816
+liger_kernel/transformers/model/gemma2.py,sha256=n4MZupFGDMvtnvkvkNhRrxXS3ZF341BVfyLjrOXp10g,10923
+liger_kernel/transformers/model/gemma3.py,sha256=ge3JYchiKvX1G1Zp00jX2zmQK2K7ymJoZAxbb2ggslw,16102
+liger_kernel/transformers/model/llama.py,sha256=UVXQLRW7rCU5vPab54dLNS3ER37eM446peHX00Yz6eA,10493
 liger_kernel/transformers/model/llava.py,sha256=b0pEagjUbu2-eS9xegjyfl1DwIXLwZcNpff55ibaMbA,17601
-liger_kernel/transformers/model/loss_utils.py,sha256=Z-fUrf-cUDUjUIH7Tl9OL2hT8nmtx7ES3kg8syuWKy4,1476
-liger_kernel/transformers/model/mistral.py,sha256=o7tyl1sPWPfZwwrBLRlryHlSI8I55viuJoMI5Bh5Nww,5014
-liger_kernel/transformers/model/mixtral.py,sha256=T0ITv2-PkR8VErVOVUizoS4EzjmARyR7GFh0tXDB_i4,11089
-liger_kernel/transformers/model/mllama.py,sha256=RCKtwnGOMFYIbtt1zUQ15Cyv4eNpHkTWcgkmG2EEs2I,10804
-liger_kernel/transformers/model/olmo2.py,sha256=5M8kczp4D-jvbjcV7cKATIJGF34xd-Rs-PPdKZWSIlY,4685
+liger_kernel/transformers/model/loss_utils.py,sha256=WWAMdiONPaXpIvxyOim_0igLrYh0yyOok5Q9_L9xvZw,1787
+liger_kernel/transformers/model/mistral.py,sha256=RacuKcckuDK6oSraCGD0R0bm-fE0K3q-lkYaAC56C2E,5481
+liger_kernel/transformers/model/mixtral.py,sha256=gLcqGabdv1XnuciS9b-TpkTDnGL8K32Hoq9j2vZMBRY,11502
+liger_kernel/transformers/model/mllama.py,sha256=75mxtmMsNd_q8KlKeawj2uMP6v2KjDuUi4nsUKM5jqA,11308
+liger_kernel/transformers/model/olmo2.py,sha256=rSzSALikEGkk0w3PLNQPrqg-ioN8TpWCXkAlg3LtCdI,5189
 liger_kernel/transformers/model/paligemma.py,sha256=GNReT6tVZt3ON6aaa9ovg8mnu1hYocSx9OhgC7b-_28,19191
-liger_kernel/transformers/model/phi3.py,sha256=NmU2DuU1Huwha6K7YSsJCnvQfUovTTGlsfBZhbx0UoI,9951
-liger_kernel/transformers/model/qwen2.py,sha256=t7NotBHoebsPqNSxwaf9DXTg8jxgB5BdunSGqYOE0hQ,9240
+liger_kernel/transformers/model/phi3.py,sha256=ebITCrmwmb4z66CbSrZl1kD6BsP52IcSAR8uwUTp9nc,10455
+liger_kernel/transformers/model/qwen2.py,sha256=QaoTDrJv2wIuAM8QMoeWVvgNl0N5gHzIrew9QGG7kXc,9744
 liger_kernel/transformers/model/qwen2_5_vl.py,sha256=70BnHZjx6eQWTwi3zc5SMwxTeOOA4Tbdkfy6IYRcTaM,9289
 liger_kernel/transformers/model/qwen2_vl.py,sha256=zo4O9fShNHYqSLrzLGqQYWSMtJI6UHaSY7zvMCYWyD8,9685
 liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7HHWHwku25A-GYL0WU,193
 liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
 liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
 liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
-liger_kernel-0.5.6.dist-info/licenses/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel-0.5.6.dist-info/licenses/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel-0.5.6.dist-info/METADATA,sha256=yam1-5oz74ok_T_rVfn3RLvCDXPxDfXZpChC1PVTFoY,23002
-liger_kernel-0.5.6.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-liger_kernel-0.5.6.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel-0.5.6.dist-info/RECORD,,
+liger_kernel-0.5.8.dist-info/licenses/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel-0.5.8.dist-info/licenses/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel-0.5.8.dist-info/METADATA,sha256=FAr_rRImlE1GETlKdEpEmRKA2Y9UzWbLKDmLWidJqeg,23340
+liger_kernel-0.5.8.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+liger_kernel-0.5.8.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel-0.5.8.dist-info/RECORD,,