liger-kernel-nightly 0.6.2.dev20251011154427__py3-none-any.whl → 0.6.4.dev20251202054858__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of liger-kernel-nightly might be problematic.

Files changed (67)
  1. liger_kernel/chunked_loss/cosine_similarity_loss.py +13 -4
  2. liger_kernel/chunked_loss/fused_linear_distillation.py +13 -2
  3. liger_kernel/chunked_loss/fused_linear_ppo.py +21 -5
  4. liger_kernel/chunked_loss/grpo_loss.py +8 -5
  5. liger_kernel/chunked_loss/jsd_loss.py +18 -5
  6. liger_kernel/ops/cross_entropy.py +65 -11
  7. liger_kernel/ops/dyt.py +5 -2
  8. liger_kernel/ops/fused_add_rms_norm.py +5 -1
  9. liger_kernel/ops/fused_linear_cross_entropy.py +43 -13
  10. liger_kernel/ops/geglu.py +2 -1
  11. liger_kernel/ops/group_norm.py +2 -1
  12. liger_kernel/ops/grpo_loss.py +3 -1
  13. liger_kernel/ops/layer_norm.py +86 -66
  14. liger_kernel/ops/poly_norm.py +390 -0
  15. liger_kernel/ops/rms_norm.py +7 -2
  16. liger_kernel/ops/tiled_mlp.py +136 -0
  17. liger_kernel/ops/utils.py +2 -0
  18. liger_kernel/transformers/__init__.py +27 -0
  19. liger_kernel/transformers/cross_entropy.py +8 -3
  20. liger_kernel/transformers/functional.py +29 -6
  21. liger_kernel/transformers/fused_linear_cross_entropy.py +8 -3
  22. liger_kernel/transformers/grpo_loss.py +56 -1
  23. liger_kernel/transformers/model/falcon_h1.py +19 -5
  24. liger_kernel/transformers/model/gemma.py +17 -6
  25. liger_kernel/transformers/model/gemma2.py +14 -5
  26. liger_kernel/transformers/model/gemma3.py +25 -12
  27. liger_kernel/transformers/model/glm4.py +16 -4
  28. liger_kernel/transformers/model/glm4v.py +16 -4
  29. liger_kernel/transformers/model/glm4v_moe.py +23 -4
  30. liger_kernel/transformers/model/hunyuan_v1.py +134 -0
  31. liger_kernel/transformers/model/internvl.py +12 -5
  32. liger_kernel/transformers/model/llama.py +14 -5
  33. liger_kernel/transformers/model/llama4.py +16 -4
  34. liger_kernel/transformers/model/llava.py +12 -4
  35. liger_kernel/transformers/model/loss_utils.py +31 -3
  36. liger_kernel/transformers/model/mistral.py +15 -6
  37. liger_kernel/transformers/model/mixtral.py +16 -7
  38. liger_kernel/transformers/model/mllama.py +12 -4
  39. liger_kernel/transformers/model/olmo2.py +16 -4
  40. liger_kernel/transformers/model/olmo3.py +142 -0
  41. liger_kernel/transformers/model/output_classes.py +147 -0
  42. liger_kernel/transformers/model/paligemma.py +22 -5
  43. liger_kernel/transformers/model/phi3.py +14 -7
  44. liger_kernel/transformers/model/qwen2.py +16 -3
  45. liger_kernel/transformers/model/qwen2_5_vl.py +14 -6
  46. liger_kernel/transformers/model/qwen2_vl.py +16 -4
  47. liger_kernel/transformers/model/qwen3.py +20 -5
  48. liger_kernel/transformers/model/qwen3_moe.py +19 -5
  49. liger_kernel/transformers/model/qwen3_next.py +146 -0
  50. liger_kernel/transformers/model/qwen3_vl.py +150 -0
  51. liger_kernel/transformers/model/qwen3_vl_moe.py +126 -0
  52. liger_kernel/transformers/model/smollm3.py +15 -6
  53. liger_kernel/transformers/model/smolvlm.py +158 -0
  54. liger_kernel/transformers/monkey_patch.py +594 -19
  55. liger_kernel/transformers/poly_norm.py +42 -0
  56. liger_kernel/transformers/rms_norm.py +7 -0
  57. liger_kernel/transformers/rope.py +43 -0
  58. liger_kernel/transformers/swiglu.py +17 -0
  59. liger_kernel/transformers/tiled_mlp.py +133 -0
  60. liger_kernel/utils.py +25 -0
  61. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/METADATA +4 -1
  62. liger_kernel_nightly-0.6.4.dev20251202054858.dist-info/RECORD +118 -0
  63. liger_kernel_nightly-0.6.2.dev20251011154427.dist-info/RECORD +0 -107
  64. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/LICENSE +0 -0
  65. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/NOTICE +0 -0
  66. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/WHEEL +0 -0
  67. {liger_kernel_nightly-0.6.2.dev20251011154427.dist-info → liger_kernel_nightly-0.6.4.dev20251202054858.dist-info}/top_level.txt +0 -0
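
Most of the churn is in liger_kernel/transformers/monkey_patch.py, which gains patching entry points for Qwen3-VL, Qwen3-VL-MoE, Qwen3-Next, Olmo3, SmolVLM, and Hunyuan v1 (dense and MoE). As a rough orientation before the diff below, a minimal usage sketch for the post-load path; the checkpoint id is illustrative, and the import path assumes the entry points are used directly from the monkey_patch module where the diff defines them:

from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_qwen3_vl

# Illustrative checkpoint id; any Qwen3-VL checkpoint supported by transformers would do.
model = Qwen3VLForConditionalGeneration.from_pretrained("Qwen/Qwen3-VL-8B-Instruct")

# Rebinds forward to the fused linear cross entropy variant and patches the text model's
# RMSNorm modules in place (see apply_liger_kernel_to_qwen3_vl in the diff below).
apply_liger_kernel_to_qwen3_vl(model=model)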
@@ -34,6 +34,8 @@ from liger_kernel.transformers.model.smollm3 import lce_forward as smollm3_lce_f
  from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb
  from liger_kernel.transformers.rms_norm import LigerRMSNorm
  from liger_kernel.transformers.rope import liger_rotary_pos_emb
+ from liger_kernel.transformers.rope import liger_rotary_pos_emb_with_cast
+ from liger_kernel.transformers.rope import liger_rotary_pos_emb_with_cast_and_leading_batch
  from liger_kernel.transformers.swiglu import LigerBlockSparseTop2MLP
  from liger_kernel.transformers.swiglu import LigerPhi3SwiGLUMLP
  from liger_kernel.transformers.swiglu import LigerSwiGLUMLP
@@ -469,7 +471,7 @@ def apply_liger_kernel_to_llama4(
  `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
  If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
  rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
- swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+ swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
  model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
  loaded. Default is None.
  """
@@ -522,7 +524,10 @@ def apply_liger_kernel_to_llama4(
  _patch_rms_norm_module(text_model.norm)
  for decoder_layer in text_model.layers:
  if swiglu:
- _patch_swiglu_module(decoder_layer.feed_forward, LigerSwiGLUMLP)
+ if decoder_layer.is_moe_layer:
+ _patch_swiglu_module(decoder_layer.feed_forward.shared_expert, LigerSwiGLUMLP)
+ else:
+ _patch_swiglu_module(decoder_layer.feed_forward, LigerSwiGLUMLP)
  if rms_norm:
  _patch_rms_norm_module(decoder_layer.input_layernorm)
  _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
@@ -1640,6 +1645,158 @@ def apply_liger_kernel_to_qwen2_5_vl(
  _patch_rms_norm_module(decoder_layer.post_attention_layernorm)


+ def apply_liger_kernel_to_qwen3_vl(
+ rope: bool = True,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = False,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Qwen3-VL models.
+
+ Args:
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+ fused_linear_cross_entropy (bool):
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+ If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+ swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+ loaded. Default is None.
+ """
+
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.qwen3_vl import modeling_qwen3_vl
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLModel
+ from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextModel
+
+ from liger_kernel.transformers.model.qwen3_vl import lce_forward as qwen3_vl_lce_forward
+
+ if rope:
+ modeling_qwen3_vl.apply_rotary_pos_emb = liger_rotary_pos_emb_with_cast
+ modeling_qwen3_vl.apply_rotary_pos_emb_vision = liger_rotary_pos_emb_with_cast_and_leading_batch
+
+ if rms_norm:
+ modeling_qwen3_vl.Qwen3VLTextRMSNorm = LigerRMSNorm
+
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(qwen3_vl_lce_forward, model)
+ else:
+ modeling_qwen3_vl.Qwen3VLForConditionalGeneration.forward = qwen3_vl_lce_forward
+
+ if model is not None and rms_norm:
+ if isinstance(model, (Qwen3VLForConditionalGeneration, Qwen3VLModel)):
+ text_model: Qwen3VLTextModel = model.language_model
+ elif isinstance(model, Qwen3VLTextModel):
+ text_model = model
+ else:
+ raise TypeError(
+ f"Unsupported Qwen3VL model type. `model` must be `Qwen3VLForConditionalGeneration`, `Qwen3VLModel` or `Qwen3VLTextModel`. Got: {type(model)}"
+ )
+
+ _patch_qwen3_vl_rms_norm = partial(_patch_rms_norm_module, offset=0.0, casting_mode="llama")
+
+ if text_model is not None:
+ _patch_qwen3_vl_rms_norm(text_model.norm)
+ for decoder_layer in text_model.layers:
+ _patch_qwen3_vl_rms_norm(decoder_layer.input_layernorm)
+ _patch_qwen3_vl_rms_norm(decoder_layer.post_attention_layernorm)
+ self_attn = getattr(decoder_layer, "self_attn", None)
+ if self_attn is not None:
+ if hasattr(self_attn, "q_norm") and self_attn.q_norm is not None:
+ _patch_qwen3_vl_rms_norm(self_attn.q_norm)
+ if hasattr(self_attn, "k_norm") and self_attn.k_norm is not None:
+ _patch_qwen3_vl_rms_norm(self_attn.k_norm)
+
+
+ def apply_liger_kernel_to_qwen3_vl_moe(
+ rope: bool = True,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = False,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Qwen3-VL MoE models.
+
+ Args:
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+ fused_linear_cross_entropy (bool):
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+ swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+ loaded. Default is None.
+ """
+
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.qwen3_vl_moe import modeling_qwen3_vl_moe
+ from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeForConditionalGeneration
+ from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeModel
+ from transformers.models.qwen3_vl_moe.modeling_qwen3_vl_moe import Qwen3VLMoeTextModel
+
+ from liger_kernel.transformers.model.qwen3_vl_moe import lce_forward as qwen3_vl_moe_lce_forward
+
+ if rope:
+ modeling_qwen3_vl_moe.apply_rotary_pos_emb = liger_rotary_pos_emb_with_cast
+ modeling_qwen3_vl_moe.apply_rotary_pos_emb_vision = liger_rotary_pos_emb_with_cast_and_leading_batch
+
+ if rms_norm:
+ modeling_qwen3_vl_moe.Qwen3VLMoeTextRMSNorm = LigerRMSNorm
+
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(qwen3_vl_moe_lce_forward, model)
+ else:
+ modeling_qwen3_vl_moe.Qwen3VLMoeForConditionalGeneration.forward = qwen3_vl_moe_lce_forward
+
+ if model is not None and rms_norm:
+ if isinstance(model, (Qwen3VLMoeForConditionalGeneration, Qwen3VLMoeModel)):
+ text_model: Qwen3VLMoeTextModel = model.language_model
+ elif isinstance(model, Qwen3VLMoeTextModel):
+ text_model = model
+ else:
+ raise TypeError(
+ f"Unsupported Qwen3VLMoe model type. `model` must be `Qwen3VLMoeForConditionalGeneration`, `Qwen3VLMoeModel` or `Qwen3VLMoeTextModel`. Got: {type(model)}"
+ )
+
+ _patch_qwen3_vl_moe_rms_norm = partial(_patch_rms_norm_module, offset=0.0, casting_mode="llama")
+
+ if text_model is not None:
+ _patch_qwen3_vl_moe_rms_norm(text_model.norm)
+ for decoder_layer in text_model.layers:
+ _patch_qwen3_vl_moe_rms_norm(decoder_layer.input_layernorm)
+ _patch_qwen3_vl_moe_rms_norm(decoder_layer.post_attention_layernorm)
+ self_attn = getattr(decoder_layer, "self_attn", None)
+ if self_attn is not None:
+ if hasattr(self_attn, "q_norm") and self_attn.q_norm is not None:
+ _patch_qwen3_vl_moe_rms_norm(self_attn.q_norm)
+ if hasattr(self_attn, "k_norm") and self_attn.k_norm is not None:
+ _patch_qwen3_vl_moe_rms_norm(self_attn.k_norm)
+
+
  def apply_liger_kernel_to_phi3(
  rope: bool = True,
  cross_entropy: bool = False,
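
All of the new entry points above share the same instance-patching pattern: when a model instance is passed, its forward is rebound with types.MethodType rather than replacing the class attribute, so only that instance picks up the Liger forward. A minimal standalone sketch of that mechanism (toy class, not Liger code):

from types import MethodType


class Toy:
    def forward(self, x):
        return x + 1


def patched_forward(self, x):
    # Stands in for the lce_forward functions added in this release.
    return x * 2


a, b = Toy(), Toy()
# Rebinding on the instance: only `a` sees the patched forward.
a.forward = MethodType(patched_forward, a)

assert a.forward(3) == 6
assert b.forward(3) == 4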
@@ -1771,6 +1928,74 @@ def apply_liger_kernel_to_olmo2(
  _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False)


+ def apply_liger_kernel_to_olmo3(
+ rope: bool = True,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = True,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Olmo3 models.
+
+ Args:
+ rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+ fused_linear_cross_entropy (bool):
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+ If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+ swiglu (bool): Whether to apply Liger's SwiGLU to Olmo3MLP. Default is True.
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+ loaded. Default is None.
+ """
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.olmo3 import modeling_olmo3
+ from transformers.models.olmo3.modeling_olmo3 import Olmo3Model
+
+ from liger_kernel.transformers.model.olmo3 import lce_forward as olmo3_lce_forward
+ from liger_kernel.transformers.rms_norm import LigerRMSNormForOlmo2
+
+ # Olmo3 arch is very similar to Olmo2, so we can reuse all these components in the same way.
+ if rope:
+ modeling_olmo3.apply_rotary_pos_emb = liger_rotary_pos_emb
+ if rms_norm:
+ modeling_olmo3.Olmo3RMSNorm = LigerRMSNormForOlmo2 # same as olmo2
+ if swiglu:
+ modeling_olmo3.Olmo3MLP = LigerSwiGLUMLP
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(olmo3_lce_forward, model)
+ else:
+ modeling_olmo3.Olmo3ForCausalLM.forward = olmo3_lce_forward
+
+ if model is not None:
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+
+ # get the base model from the model instance
+ base_model: Olmo3Model = getattr(model, model.base_model_prefix, model)
+
+ if rms_norm:
+ _patch_rms_norm_module(base_model.norm)
+
+ for decoder_layer in base_model.layers:
+ if swiglu:
+ _patch_swiglu_module(decoder_layer.mlp, LigerSwiGLUMLP)
+ if rms_norm:
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm, in_place=False)
+ _patch_rms_norm_module(decoder_layer.post_feedforward_layernorm, in_place=False)
+
+
  def apply_liger_kernel_to_glm4(
  rope: bool = False,
  cross_entropy: bool = False,
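
When no model is passed, apply_liger_kernel_to_olmo3 instead swaps the classes on the modeling_olmo3 module, so the patch has to run before the model is constructed. A hedged sketch of that ordering; the checkpoint id is illustrative:

from transformers import AutoModelForCausalLM

from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_olmo3

# Swap Olmo3RMSNorm / Olmo3MLP and the CausalLM forward at the module level
# before instantiation, so from_pretrained builds the model with Liger modules.
apply_liger_kernel_to_olmo3()

model = AutoModelForCausalLM.from_pretrained("allenai/Olmo-3-7B")  # illustrative checkpoint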
@@ -1968,7 +2193,8 @@ def apply_liger_kernel_to_glm4v_moe(
  if rope:
  raise NotImplementedError("liger_rotary_pos_emb is not available for Glm4 models.")
  if rms_norm:
- modeling_glm4v_moe.Glm4vRMSNorm = LigerRMSNormForGlm4
+ modeling_glm4v_moe.Glm4vMoeRMSNorm = LigerRMSNormForGlm4
+ modeling_glm4v_moe.Glm4vMoeTextRMSNorm = LigerRMSNormForGlm4
  if cross_entropy:
  from transformers.loss.loss_utils import nn

@@ -2034,6 +2260,7 @@ def apply_liger_kernel_to_internvl(
  cross_entropy: bool = False,
  fused_linear_cross_entropy: bool = True,
  rms_norm: bool = True,
+ layer_norm: bool = True,
  model: Optional[PreTrainedModel] = None,
  **kwargs,
  ) -> None:
@@ -2044,37 +2271,60 @@ def apply_liger_kernel_to_internvl(
  NOTE: InternVL is not available in transformers<4.52.1

  Args:
- rope (bool): Whether to apply Liger's rotary position embedding. Default is True.
  cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
  fused_linear_cross_entropy (bool):
  Whether to apply Liger's fused linear cross entropy loss. Default is True.
  `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
  If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
  rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
- swiglu (bool): Whether to apply Liger's SwiGLU MLP. Default is True.
+ layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
  model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
  loaded. Default is None.
  """
  assert not (cross_entropy and fused_linear_cross_entropy), (
  "cross_entropy and fused_linear_cross_entropy cannot both be True."
  )
+ import torch.nn as torch_nn

  from transformers.models.internvl import modeling_internvl
+ from transformers.models.internvl.modeling_internvl import InternVLForConditionalGeneration
+ from transformers.models.internvl.modeling_internvl import InternVLModel
+ from transformers.models.internvl.modeling_internvl import InternVLVisionLayer
+ from transformers.models.internvl.modeling_internvl import InternVLVisionModel
+ from transformers.models.internvl.modeling_internvl import InternVLVisionRMSNorm

+ from liger_kernel.transformers.layer_norm import LigerLayerNorm
  from liger_kernel.transformers.model.internvl import lce_forward as internvl_lce_forward
+ from liger_kernel.transformers.rms_norm import LigerRMSNorm
+
+ if layer_norm and model is None:
+ modeling_internvl.nn.LayerNorm = LigerLayerNorm

  if cross_entropy:
- logger.warning(TRANSFORMER_DEPRECATION_WARNING)
- modeling_internvl.nn.CrossEntropyLoss = LigerCrossEntropyLoss
+ logger.info("Apply liger cross entropy")
+
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
  if fused_linear_cross_entropy:
  modeling_internvl.InternVLForConditionalGeneration.forward = internvl_lce_forward
  if rms_norm:
  modeling_internvl.InternVLVisionRMSNorm = LigerRMSNorm

  if model is not None:
- text_model_name, vision_model_name = model.config.text_config.model_type, model.config.vision_config.model_type
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+ if isinstance(model, (InternVLForConditionalGeneration, InternVLModel)):
+ # NOTE: language_model and visual properties can be accessed through the conditional class.
+ text_model = model.language_model
+ vision_model: InternVLVisionModel = model.vision_tower
+ else:
+ raise TypeError(
+ f"Unsupported internvl model type. `model` must be `InternVLForConditionalGeneration`, `InternVLModel`. Got: {type(model)}"
+ )
+
+ text_model_name = model.config.text_config.model_type
  text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
- vision_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(vision_model_name, None)

  kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
  if text_liger_fn:
@@ -2087,25 +2337,133 @@ def apply_liger_kernel_to_internvl(
  f"These parameters are not supported by {text_model_name}. Enter the remaining {list(text_kwargs.keys())} except for {list(remain_params)}\n"
  f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
  )
- text_kwargs["model"] = model.language_model
+ text_kwargs["model"] = text_model
  text_liger_fn(**text_kwargs)
  elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
  logger.warning(f"{text_model_name} is not supported by Liger kernel.")

- if vision_liger_fn:
- accept_params = inspect.signature(vision_liger_fn).parameters
+ # Patch vision model RMSNorm layers
+ if rms_norm:
+ for encoder_layer in vision_model.encoder.layer:
+ encoder_layer: InternVLVisionLayer
+ if isinstance(encoder_layer.attention.q_norm, InternVLVisionRMSNorm):
+ _patch_rms_norm_module(encoder_layer.attention.q_norm)
+ if isinstance(encoder_layer.attention.k_norm, InternVLVisionRMSNorm):
+ _patch_rms_norm_module(encoder_layer.attention.k_norm)
+
+ # Patch vision model LayerNorm layers
+ if layer_norm:
+ # Patch layernorm
+ if isinstance(vision_model.layernorm, torch_nn.LayerNorm):
+ _patch_layer_norm_module(vision_model.layernorm)
+
+ # Patch encoder layers
+ for encoder_layer in vision_model.encoder.layer:
+ encoder_layer: InternVLVisionLayer
+ if isinstance(encoder_layer.layernorm_before, torch_nn.LayerNorm):
+ _patch_layer_norm_module(encoder_layer.layernorm_before)
+ if isinstance(encoder_layer.layernorm_after, torch_nn.LayerNorm):
+ _patch_layer_norm_module(encoder_layer.layernorm_after)
+
+
+ def apply_liger_kernel_to_smolvlm(
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ layer_norm: bool = True,
+ model: Optional[PreTrainedModel] = None,
+ **kwargs,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace SmolVLM models.
+ Due to the characteristics of SmolVLM, the model must be passed to apply Liger-Kernel's patch to other models connected to SmolVLM.
+ However, if an LM not supported by Liger-Kernel is connected to SmolVLM, unexpected side effects may occur.
+ NOTE: SmolVLM is not available in transformers<4.50.0
+
+ Args:
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+ fused_linear_cross_entropy (bool):
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+ If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+ layer_norm (bool): Whether to apply Liger's LayerNorm. Default is True.
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+ loaded. Default is None.
+ """
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.smolvlm import modeling_smolvlm
+ from transformers.models.smolvlm.modeling_smolvlm import SmolVLMEncoderLayer
+ from transformers.models.smolvlm.modeling_smolvlm import SmolVLMForConditionalGeneration
+ from transformers.models.smolvlm.modeling_smolvlm import SmolVLMModel
+ from transformers.models.smolvlm.modeling_smolvlm import SmolVLMVisionTransformer
+
+ from liger_kernel.transformers.model.smolvlm import lce_forward as smolvlm_lce_forward
+
+ # Patch LayerNorm for vision model if model is not provided (pre-initialization)
+ if layer_norm and model is None:
+ modeling_smolvlm.nn.LayerNorm = LigerLayerNorm
+
+ if cross_entropy:
+ logger.info("Apply liger cross entropy")
+
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(smolvlm_lce_forward, model)
+ else:
+ modeling_smolvlm.SmolVLMForConditionalGeneration.forward = smolvlm_lce_forward
+ if rms_norm:
+ modeling_smolvlm.SmolVLMRMSNorm = LigerRMSNorm
+
+ if model is not None:
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+ if isinstance(model, SmolVLMForConditionalGeneration):
+ text_model = model.model.text_model
+ vision_model: SmolVLMVisionTransformer = model.model.vision_model
+ elif isinstance(model, SmolVLMModel):
+ text_model = model.text_model
+ vision_model: SmolVLMVisionTransformer = model.vision_model
+ else:
+ raise TypeError(
+ f"Unsupported smolvlm model type. `model` must be `SmolVLMForConditionalGeneration`, `SmolVLMModel`. Got: {type(model)}"
+ )
+
+ text_model_name = model.config.text_config.model_type
+ text_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(text_model_name, None)
+
+ kwargs = {"cross_entropy": False, "fused_linear_cross_entropy": False, **kwargs} | {"rms_norm": rms_norm}
+ if text_liger_fn:
+ accept_params = inspect.signature(text_liger_fn).parameters
  remain_params = set(kwargs) - (set(accept_params) & set(kwargs))
- vision_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}
+ text_kwargs = {k: v for k, v in kwargs.items() if k not in remain_params}

  if remain_params:
  logger.warning(
- f"These parameters are not supported by {vision_model_name}. Enter the remaining {list(vision_kwargs.keys())} except for {list(remain_params)}\n"
- f"Parameters accepted by {vision_model_name}: {list(accept_params.keys())}"
+ f"These parameters are not supported by {text_model_name}. Enter the remaining {list(text_kwargs.keys())} except for {list(remain_params)}\n"
+ f"Parameters accepted by {text_model_name}: {list(accept_params.keys())}"
  )
- vision_kwargs["model"] = model.vision_tower
- vision_liger_fn(**vision_kwargs)
- elif vision_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
- logger.warning(f"{vision_model_name} is not supported by Liger kernel.")
+ text_kwargs["model"] = text_model
+ text_liger_fn(**text_kwargs)
+ elif text_model_name not in MODEL_TYPE_TO_APPLY_LIGER_FN:
+ logger.warning(f"{text_model_name} is not supported by Liger kernel.")
+
+ # Patch vision model LayerNorm layers
+ if layer_norm:
+ # Patch post_layernorm
+ _patch_layer_norm_module(vision_model.post_layernorm)
+
+ # Patch encoder layers
+ for encoder_layer in vision_model.encoder.layers:
+ encoder_layer: SmolVLMEncoderLayer
+ _patch_layer_norm_module(encoder_layer.layer_norm1)
+ _patch_layer_norm_module(encoder_layer.layer_norm2)


  def apply_liger_kernel_to_falcon_h1(
@@ -2177,6 +2535,214 @@ def apply_liger_kernel_to_falcon_h1(
  _patch_rms_norm_module(decoder_layer.pre_ff_layernorm)


+ def apply_liger_kernel_to_qwen3_next(
+ rope: bool = False,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = True,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Qwen3-Next models.
+
+ Args:
+ rope (bool): Whether to apply Liger's rotary position embedding. Default is False.
+ cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
+ fused_linear_cross_entropy (bool):
+ Whether to apply Liger's fused linear cross entropy loss. Default is True.
+ `cross_entropy` and `fused_linear_cross_entropy` cannot both be True.
+ If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
+ rms_norm (bool): Whether to apply Liger's RMSNorm. Default is True.
+ swiglu (bool): Whether to apply Liger's SwiGLUMLP. Default is True.
+ model (PreTrainedModel): The model instance to apply Liger kernels to, if the model has already been
+ loaded. Default is None.
+ """
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.qwen3_next import modeling_qwen3_next
+ from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextForCausalLM
+ from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextMLP
+ from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextModel
+ from transformers.models.qwen3_next.modeling_qwen3_next import Qwen3NextSparseMoeBlock
+
+ from liger_kernel.transformers.model.qwen3_next import lce_forward as qwen3_next_lce_forward
+ from liger_kernel.transformers.rms_norm import LigerRMSNormForQwen3Next
+ from liger_kernel.transformers.swiglu import LigerQwen3MoeSwiGLUMLP
+
+ if rope:
+ # It might encounter a NaN issue
+ # modeling_qwen3_next.apply_rotary_pos_emb = liger_rotary_pos_emb
+ raise NotImplementedError("liger_rotary_pos_emb is not available for Qwen3Next models.")
+ if rms_norm:
+ modeling_qwen3_next.Qwen3NextRMSNorm = LigerRMSNormForQwen3Next
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+ if fused_linear_cross_entropy:
+ if model is not None:
+ if isinstance(model, Qwen3NextForCausalLM):
+ model.forward = MethodType(qwen3_next_lce_forward, model)
+ else:
+ raise TypeError(
+ f"fused_linear_cross_entropy is only applicable on Qwen3NextForCausalLM. Got: {type(model)}"
+ )
+ else:
+ modeling_qwen3_next.Qwen3NextForCausalLM.forward = qwen3_next_lce_forward
+ if swiglu:
+ # Qwen3MoeMLP and Qwen3NextMLP are identical, hence we reuse LigerQwen3MoeSwiGLUMLP
+ modeling_qwen3_next.Qwen3NextMLP = LigerQwen3MoeSwiGLUMLP
+
+ if model is not None:
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+ if isinstance(model, (Qwen3NextForCausalLM, Qwen3NextModel)):
+ base_model: Qwen3NextForCausalLM = getattr(model, model.base_model_prefix, model)
+ else:
+ raise TypeError(
+ f"Unsupported qwen3_next model type. `model` must be `Qwen3NextForCausalLM`, `Qwen3NextModel`. Got: {type(model)}"
+ )
+
+ if rms_norm:
+ _patch_rms_norm_module(base_model.norm)
+
+ for decoder_layer in base_model.layers:
+ if rms_norm:
+ _patch_rms_norm_module(decoder_layer.input_layernorm)
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+ # Qwen3MoeMLP and Qwen3NextMLP are identical, hence we reuse LigerQwen3MoeSwiGLUMLP
+ if swiglu:
+ if isinstance(decoder_layer.mlp, Qwen3NextMLP):
+ _patch_swiglu_module(decoder_layer.mlp, LigerQwen3MoeSwiGLUMLP)
+ if isinstance(decoder_layer.mlp, Qwen3NextSparseMoeBlock):
+ _patch_swiglu_module(decoder_layer.mlp.shared_expert, LigerQwen3MoeSwiGLUMLP)
+ experts = getattr(decoder_layer.mlp, "experts", None)
+ if experts is not None:
+ for expert in experts:
+ _patch_swiglu_module(expert, LigerQwen3MoeSwiGLUMLP)
+
+
+ def apply_liger_kernel_to_hunyuan_v1_dense(
+ rope: bool = True,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = True,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 dense models.
+ """
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.hunyuan_v1_dense import modeling_hunyuan_v1_dense
+ from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1Model
+
+ from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_lce_forward
+ from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP
+
+ if rope:
+ modeling_hunyuan_v1_dense.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+ if rms_norm:
+ modeling_hunyuan_v1_dense.HunYuanDenseV1RMSNorm = LigerRMSNorm
+
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(hunyuan_v1_lce_forward, model)
+ else:
+ modeling_hunyuan_v1_dense.HunYuanDenseV1ForCausalLM.forward = hunyuan_v1_lce_forward
+
+ if swiglu:
+ modeling_hunyuan_v1_dense.HunYuanDenseV1MLP = LigerHunyuanV1SwiGLUMLP
+
+ if model is not None:
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+
+ # get the base model from the model instance
+ base_model: HunYuanDenseV1Model = getattr(model, model.base_model_prefix, model)
+
+ if rms_norm:
+ _patch_rms_norm_module(base_model.norm)
+ for decoder_layer in base_model.layers:
+ if swiglu:
+ _patch_swiglu_module(decoder_layer.mlp, LigerHunyuanV1SwiGLUMLP)
+ if rms_norm:
+ _patch_rms_norm_module(decoder_layer.input_layernorm)
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+ def apply_liger_kernel_to_hunyuan_v1_moe(
+ rope: bool = True,
+ cross_entropy: bool = False,
+ fused_linear_cross_entropy: bool = True,
+ rms_norm: bool = True,
+ swiglu: bool = True,
+ model: PreTrainedModel = None,
+ ) -> None:
+ """
+ Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 MoE models.
+ """
+ assert not (cross_entropy and fused_linear_cross_entropy), (
+ "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ )
+
+ from transformers.models.hunyuan_v1_moe import modeling_hunyuan_v1_moe
+ from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1Model
+
+ from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_moe_lce_forward
+ from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP
+
+ if rope:
+ modeling_hunyuan_v1_moe.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+ if rms_norm:
+ modeling_hunyuan_v1_moe.HunYuanMoEV1RMSNorm = LigerRMSNorm
+
+ if cross_entropy:
+ from transformers.loss.loss_utils import nn
+
+ nn.functional.cross_entropy = liger_cross_entropy
+
+ if fused_linear_cross_entropy:
+ if model is not None:
+ model.forward = MethodType(hunyuan_v1_moe_lce_forward, model)
+ else:
+ modeling_hunyuan_v1_moe.HunYuanMoEV1ForCausalLM.forward = hunyuan_v1_moe_lce_forward
+
+ if swiglu:
+ modeling_hunyuan_v1_moe.HunYuanMoEV1MLP = LigerHunyuanV1SwiGLUMLP
+
+ if model is not None:
+ # The model instance already exists, so we need to additionally patch the
+ # instance variables that reference already-instantiated modules
+
+ # get the base model from the model instance
+ base_model: HunYuanMoEV1Model = getattr(model, model.base_model_prefix, model)
+
+ if rms_norm:
+ _patch_rms_norm_module(base_model.norm)
+ for decoder_layer in base_model.layers:
+ if swiglu:
+ for mlp_expert in decoder_layer.mlp.experts:
+ _patch_swiglu_module(mlp_expert, LigerHunyuanV1SwiGLUMLP)
+ if rms_norm:
+ _patch_rms_norm_module(decoder_layer.input_layernorm)
+ _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
  # Model type corresponds to the keys defined in transformers/models/auto/modeling_auto.py
  MODEL_TYPE_TO_APPLY_LIGER_FN = {
  "gemma": apply_liger_kernel_to_gemma,
@@ -2197,6 +2763,7 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
  "mistral": apply_liger_kernel_to_mistral,
  "mixtral": apply_liger_kernel_to_mixtral,
  "olmo2": apply_liger_kernel_to_olmo2,
+ "olmo3": apply_liger_kernel_to_olmo3,
  "qwen2": apply_liger_kernel_to_qwen2,
  "qwen3": apply_liger_kernel_to_qwen3,
  "qwen3_moe": apply_liger_kernel_to_qwen3_moe,
@@ -2204,10 +2771,18 @@ MODEL_TYPE_TO_APPLY_LIGER_FN = {
  "qwen2_vl_text": apply_liger_kernel_to_qwen2_vl,
  "qwen2_5_vl": apply_liger_kernel_to_qwen2_5_vl,
  "qwen2_5_vl_text": apply_liger_kernel_to_qwen2_5_vl,
+ "qwen3_next": apply_liger_kernel_to_qwen3_next,
+ "qwen3_vl": apply_liger_kernel_to_qwen3_vl,
+ "qwen3_vl_text": apply_liger_kernel_to_qwen3_vl,
+ "qwen3_vl_moe": apply_liger_kernel_to_qwen3_vl_moe,
+ "qwen3_vl_moe_text": apply_liger_kernel_to_qwen3_vl_moe,
  "smollm3": apply_liger_kernel_to_smollm3,
  "phi3": apply_liger_kernel_to_phi3,
  "paligemma": apply_liger_kernel_to_paligemma,
  "falcon_h1": apply_liger_kernel_to_falcon_h1,
+ "smolvlm": apply_liger_kernel_to_smolvlm,
+ "hunyuan_v1_dense": apply_liger_kernel_to_hunyuan_v1_dense,
+ "hunyuan_v1_moe": apply_liger_kernel_to_hunyuan_v1_moe,
  }

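
The MODEL_TYPE_TO_APPLY_LIGER_FN mapping extended above is keyed by config.model_type, and composite patchers such as the InternVL and SmolVLM ones look up the connected language model's patcher there and forward only the kwargs its signature accepts. A minimal sketch of that dispatch; apply_by_model_type is a hypothetical helper, reusing the inspect.signature filtering shown in the diff:

import inspect

from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN


def apply_by_model_type(model, **kwargs):
    """Dispatch to the per-architecture patcher registered for model.config.model_type."""
    apply_fn = MODEL_TYPE_TO_APPLY_LIGER_FN.get(model.config.model_type)
    if apply_fn is None:
        raise ValueError(f"{model.config.model_type} is not supported by Liger kernel.")
    # Keep only kwargs this patcher actually accepts, as the InternVL/SmolVLM code does.
    accepted = inspect.signature(apply_fn).parameters
    filtered = {k: v for k, v in kwargs.items() if k in accepted}
    apply_fn(model=model, **filtered)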