liger-kernel-nightly 0.5.3.dev20250221162633__tar.gz → 0.5.3.dev20250221230243__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of liger-kernel-nightly might be problematic.
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/PKG-INFO +1 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_tvd.py +8 -11
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/dev/modal/tests.py +1 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/training.py +2 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/training_multimodal.py +67 -23
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/pyproject.toml +1 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/tvd.py +6 -7
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/functional.py +4 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/tvd.py +1 -3
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/utils.py +2 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel_nightly.egg-info/PKG-INFO +1 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/bf16/test_mini_models.py +52 -37
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/bf16/test_mini_models_with_logits.py +51 -37
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/fp32/test_mini_models.py +61 -37
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/fp32/test_mini_models_with_logits.py +60 -37
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_flex_attention.py +25 -17
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_tvd.py +13 -20
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/ISSUE_TEMPLATE/bug_report.yaml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/ISSUE_TEMPLATE/feature_request.yaml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/pull_request_template.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/amd-ci.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/docs.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/intel-ci.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/nvi-ci.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/publish-nightly.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.github/workflows/publish-release.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/.gitignore +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/LICENSE +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/Makefile +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/NOTICE +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/benchmarks_visualizer.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/data/all_benchmark_data.csv +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_distill_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_embedding.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_geglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_rope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/benchmark_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/benchmark/scripts/utils.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/dev/fmt-requirements.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/dev/modal/tests_bwd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/Examples.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/Getting-Started.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/High-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/Low-Level-APIs.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/acknowledgement.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/contributing.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/banner.GIF +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/compose.gif +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/e2e-memory.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/e2e-tps.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/logo-banner.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/patch.gif +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/images/post-training.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/index.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/docs/license.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/alignment/accelerate_config.yaml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/alignment/run_orpo.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/callback.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/config/fsdp_config.json +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/gemma_7b_mem.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/gemma_7b_tp.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/llama_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/llama_tps.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/qwen_mem_alloc.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/img/qwen_tps.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/launch_on_modal.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/run_benchmarks.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/run_gemma.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/run_llama.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/run_qwen.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/huggingface/run_qwen2_vl.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/lightning/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/lightning/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/lightning/training.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/callback.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Memory_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Memory_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Memory_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Memory_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Throughput_Stage1_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Throughput_Stage1_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Throughput_Stage2_num_head_3.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/docs/images/Throughput_Stage2_num_head_5.png +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/fsdp/acc-fsdp.conf +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/medusa_util.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/requirements.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/scripts/llama3_8b_medusa.sh +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/examples/medusa/train.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/licenses/LICENSE-Apache-2.0 +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/licenses/LICENSE-MIT-AutoAWQ +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/licenses/LICENSE-MIT-Efficient-Cross-Entropy +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/licenses/LICENSE-MIT-llmc +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/licenses/LICENSE-MIT-triton +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/mkdocs.yml +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/setup.cfg +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/setup.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/README.md +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/fused_linear_distillation.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/fused_linear_preference.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/__init__.py +1 -1
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/trainer/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/trainer/orpo_trainer.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_cpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_dpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_grpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_jsd_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_kto_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_orpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/chunked_loss/test_simpo_loss.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/conftest.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/bf16/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/bf16/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/fp32/__init__.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/convergence/fp32/test_mini_models_multimodal.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/fake_configs/Qwen/Qwen2-VL-7B-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/fake_configs/meta-llama/Llama-3.2-11B-Vision-Instruct/tokenizer_config.json +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/scripts/generate_tokenized_dataset.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/tiny_shakespeare.txt +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/tiny_shakespeare_tokenized/data-00000-of-00001.arrow +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/tiny_shakespeare_tokenized/dataset_info.json +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/resources/tiny_shakespeare_tokenized/state.json +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_auto_model.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_embedding.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_geglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_group_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_jsd.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_kl_div.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_layer_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_rms_norm.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_rope.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_swiglu.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_trainer_integration.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/transformers/test_transformers.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/triton/test_triton_monkey_patch.py +0 -0
- {liger_kernel_nightly-0.5.3.dev20250221162633 → liger_kernel_nightly-0.5.3.dev20250221230243}/test/utils.py +0 -0
benchmark/scripts/benchmark_tvd.py
@@ -1,13 +1,12 @@
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 
 from liger_kernel.transformers.tvd import LigerTVDLoss
 
@@ -67,9 +66,7 @@ def bench_speed_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     y = fwd()
     y.backward(retain_graph=True)
 
-    ms_50, ms_20, ms_80 = triton.testing.do_bench(
-        full, quantiles=QUANTILES, rep=100
-    )
+    ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100)
     return SingleBenchmarkRunOutput(
         y_20=ms_20,
         y_50=ms_50,
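The reformatted `do_bench` call above collapses the timing quantiles onto one line. For readers unfamiliar with the API, a minimal hedged sketch of how the unpacking works; `QUANTILES` is assumed to be `[0.5, 0.2, 0.8]` (to match the `ms_50, ms_20, ms_80` names) and a CUDA device is required:

```python
# Sketch only: the benchmarked closure is a stand-in for the script's real
# forward/backward closure, and QUANTILES is an assumed value.
import torch
import triton

def full():
    a = torch.randn(1024, 1024, device="cuda")
    return a @ a

# With quantiles set, do_bench returns one timing (in ms) per requested quantile.
ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=[0.5, 0.2, 0.8], rep=100)
```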
dev/modal/tests.py
@@ -14,7 +14,7 @@ app = modal.App("liger_tests", image=image)
 repo = modal.Mount.from_local_dir(ROOT_PATH, remote_path=REMOTE_ROOT_PATH)
 
 
-@app.function(gpu="A10G", mounts=[repo], timeout=60 *
+@app.function(gpu="A10G", mounts=[repo], timeout=60 * 20)
 def liger_tests():
     import subprocess
 
examples/huggingface/training.py
@@ -15,6 +15,7 @@ from liger_kernel.transformers import AutoLigerKernelForCausalLM
 class CustomArguments:
     model_name: str = "meta-llama/Meta-Llama-3-8B"
     dataset: str = "tatsu-lab/alpaca"
+    max_seq_length: int = 512
     use_liger: bool = False
 
 
@@ -65,6 +66,7 @@ def train():
         model=model,
         args=training_args,
         data_collator=collator,
+        max_seq_length=custom_args.max_seq_length,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
         formatting_func=formatting_prompts_func,
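Together, the two `training.py` hunks thread a configurable sequence length from the command line into the trainer: `HfArgumentParser` exposes each dataclass field as a CLI flag, and the parsed value is forwarded to `SFTTrainer` as `max_seq_length`. A minimal sketch (flag values are illustrative):

```python
from dataclasses import dataclass

import transformers

@dataclass
class CustomArguments:
    max_seq_length: int = 512
    use_liger: bool = False

parser = transformers.HfArgumentParser((transformers.TrainingArguments, CustomArguments))
# Equivalent to: python training.py --output_dir ./out --max_seq_length 1024
training_args, custom_args = parser.parse_args_into_dataclasses(["--output_dir", "./out", "--max_seq_length", "1024"])
assert custom_args.max_seq_length == 1024
```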
examples/huggingface/training_multimodal.py
@@ -1,11 +1,15 @@
 import os
+
+from dataclasses import dataclass
+
+import datasets
 import torch
 import transformers
-
-from
-from trl import SFTTrainer, SFTConfig
-from trl.trainer import ConstantLengthDataset
+
+from callback import EfficiencyCallback
 from datasets import Image as ImageFeature
+from trl import SFTTrainer
+
 from liger_kernel.transformers import monkey_patch
 
 
@@ -15,6 +19,8 @@ class CustomArguments:
     dataset: str = "HuggingFaceM4/the_cauldron"
     dataset_subset: str = "ai2d"
     dataset_split: str = "train"
+    max_seq_length: int = 512
+    dataset_text_field: str = "texts"
     use_liger: bool = False
 
 
@@ -89,37 +95,75 @@ def _format_for_convo(example, tokenizer):
 def train():
     parser = transformers.HfArgumentParser((transformers.TrainingArguments, CustomArguments))
     training_args, custom_args = parser.parse_args_into_dataclasses()
+    training_args.remove_unused_columns = False  # required to not drop the image column
+    training_args.dataset_kwargs = {"skip_prepare_dataset": True}
 
-    model, processor, image_token_id = construct_model_and_processor(
-        custom_args.model_name, custom_args.use_liger
-    )
+    model, processor, image_token_id = construct_model_and_processor(custom_args.model_name, custom_args.use_liger)
 
-    dataset =
-
-
-
+    dataset = (
+        datasets.load_dataset(
+            custom_args.dataset,
+            custom_args.dataset_subset,
+            split=custom_args.dataset_split,
+        )
+        .map(
+            _validate_and_extract_the_cauldron,
+            batched=True,
+            num_proc=min(os.cpu_count(), 16),
+            desc="Extracting text and images",
+        )
+        .map(
+            _format_for_convo,
+            fn_kwargs={"tokenizer": processor.tokenizer},
+            desc="Formatting for convo",
+        )
+        .cast_column("images", ImageFeature())
+        .train_test_split(test_size=0.1)
     )
 
-    train_dataset
+    train_dataset = dataset["train"]
+    eval_dataset = dataset["test"]
 
-
-
-
-
-
-
-
-
+    def collate_fn(examples):
+        """
+        Taken directly from the TRL documentation with minor modifications:
+        https://huggingface.co/docs/trl/en/sft_trainer#a-custom-collator-for-processing-multi-modal-data
+
+        Modifications:
+        1. `apply_chat_template` is used to preprocess the texts before training begins (see above)
+        2. `example["messages"]` -> `example["texts"]` to conform with the_cauldron dataset schema
+        3. Ignoring image tokens in the loss computation
+        """
+        # Get the texts and images
+        texts = [example["texts"] for example in examples]
+        images = [example["images"] for example in examples]
+
+        # Tokenize the texts and process the images
+        batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
+
+        # The labels are the input_ids, and we mask the padding tokens in the loss computation
+        labels = batch["input_ids"].clone()
+        labels[labels == processor.tokenizer.pad_token_id] = -100
+
+        # Ignore the image token index in the loss computation
+        labels[labels == image_token_id] = -100
+        batch["labels"] = labels
+
+        return batch
 
     trainer = SFTTrainer(
         model=model,
-        args=
+        args=training_args,
+        data_collator=collate_fn,
+        max_seq_length=custom_args.max_seq_length,
+        dataset_text_field=custom_args.dataset_text_field,
         train_dataset=train_dataset,
         eval_dataset=eval_dataset,
-
+        tokenizer=processor.tokenizer,
+        callbacks=[EfficiencyCallback()],
     )
     trainer.train()
 
 
 if __name__ == "__main__":
-    train()
+    train()
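The core of the new `collate_fn` is the label-masking step: padding and image tokens are set to -100 so the loss ignores them. A toy sketch of just that logic, with made-up ids (`pad_token_id=0`, `image_token_id=32000` are illustrative values, not the real tokenizer's):

```python
import torch

input_ids = torch.tensor([[5, 7, 32000, 9, 0, 0]])  # one sequence: an image token plus right padding
labels = input_ids.clone()
labels[labels == 0] = -100      # padding contributes nothing to the loss
labels[labels == 32000] = -100  # image tokens are likewise ignored
print(labels)  # tensor([[   5,    7, -100,    9, -100, -100]])
```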
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.3.dev20250221162633"
+version = "0.5.3.dev20250221230243"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
src/liger_kernel/ops/tvd.py
@@ -1,4 +1,5 @@
-from typing import Literal
+from typing import Literal
+from typing import Optional
 
 import torch
 import triton
@@ -178,15 +179,13 @@ class LigerTVDLossFunction(torch.autograd.Function):
         """
         has_label = False
         if shift_labels is not None:
-            assert shift_labels.shape == (
-                p.shape[0],
-            )
+            assert shift_labels.shape == (p.shape[0],), (
+                f"the shape of shift_labels must be (BT,). Got: {shift_labels.shape}"
+            )
             shift_labels = shift_labels.contiguous()
             has_label = True
 
-        loss, grads = tv_distance_forward_triton(
-            p, q, shift_labels, reduction, ignore_index, has_label
-        )
+        loss, grads = tv_distance_forward_triton(p, q, shift_labels, reduction, ignore_index, has_label)
         ctx.save_for_backward(grads)
         return loss
 
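For context, `LigerTVDLossFunction` computes the total variation distance, which for discrete distributions is half the L1 distance between them. A hedged pure-PyTorch reference (the Triton kernel additionally handles `shift_labels`, `ignore_index`, and multiple reduction modes; the batch-mean reduction here is one simplifying assumption):

```python
import torch

def tvd_reference(p: torch.Tensor, q: torch.Tensor) -> torch.Tensor:
    # TVD(p, q) = 0.5 * sum_i |p_i - q_i|, here averaged over the batch dimension
    return 0.5 * (p - q).abs().sum(dim=-1).mean()
```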
src/liger_kernel/transformers/functional.py
@@ -14,6 +14,7 @@ from liger_kernel.ops.rope import LigerRopeFunction
 from liger_kernel.ops.swiglu import LigerSiLUMulFunction
 from liger_kernel.ops.tvd import LigerTVDLossFunction
 
+
 # conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
 # `weight` and `size_average` are placeholders and not implemented yet
 def liger_cross_entropy(
@@ -156,6 +157,7 @@ def liger_kl_div(
         eps,
     )
 
+
 def liger_tvd(
     input,
     target,
@@ -169,7 +171,8 @@ def liger_tvd(
         shift_labels,
         reduction,
         ignore_index,
-    )
+    )
+
 
 def liger_layer_norm(X, W, B, eps):
     return LigerLayerNormFunction.apply(X, W, B, eps)
src/liger_kernel/transformers/tvd.py
@@ -10,6 +10,4 @@ class LigerTVDLoss(nn.Module):
         self.ignore_index = ignore_index
 
     def forward(self, p, q, shift_labels=None):
-        return LigerTVDLossFunction.apply(
-            p, q, shift_labels, self.reduction, self.ignore_index
-        )
+        return LigerTVDLossFunction.apply(p, q, shift_labels, self.reduction, self.ignore_index)
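A hedged usage sketch of the one-line `forward` above. The `reduction` value and the `(BT, V)` input shapes are assumptions inferred from the `(BT,)` assert in the autograd function, and a CUDA device is needed for the Triton kernel:

```python
# Sketch only: reduction="batchmean" and the tensor shapes are assumed values.
import torch

from liger_kernel.transformers.tvd import LigerTVDLoss

tvd = LigerTVDLoss(reduction="batchmean")
p = torch.randn(4, 128, device="cuda").softmax(dim=-1)  # two probability distributions
q = torch.randn(4, 128, device="cuda").softmax(dim=-1)
loss = tvd(p, q)
```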
test/convergence/bf16/test_mini_models.py
@@ -7,8 +7,6 @@ from transformers.models.gemma import GemmaConfig
 from transformers.models.gemma import GemmaForCausalLM
 from transformers.models.gemma2 import Gemma2Config
 from transformers.models.gemma2 import Gemma2ForCausalLM
-from transformers.models.granite import GraniteConfig
-from transformers.models.granite import GraniteForCausalLM
 from transformers.models.llama import LlamaConfig
 from transformers.models.llama import LlamaForCausalLM
 from transformers.models.mistral import MistralConfig
@@ -65,44 +63,19 @@ try:
 except ImportError:
     QWEN2_VL_AVAILABLE = False
 
+try:
+    from transformers.models.granite import GraniteConfig
+    from transformers.models.granite import GraniteForCausalLM
+
+    GRANITE_AVAILABLE = True
+except ImportError:
+    GRANITE_AVAILABLE = False
+
 from liger_kernel.utils import infer_device
 
 device = infer_device()
 
 MINI_MODEL_SETUPS = {
-    "mini_granite3": MiniModelConfig(
-        liger_kernel_patch_func=apply_liger_kernel_to_granite,
-        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
-        model_class=GraniteForCausalLM,
-        mini_model_config=GraniteConfig(
-            attention_bias=False,
-            attention_dropout=0.1,
-            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 128000
-            eos_token_id=2,  # 128001
-            hidden_act="silu",
-            hidden_size=1024,  # 4096
-            initializer_range=0.02,
-            intermediate_size=2048,  # 14336
-            max_position_embeddings=8192,
-            num_attention_heads=8,  # 32
-            num_hidden_layers=4,  # 32
-            num_key_value_heads=2,  # 8
-            pretraining_tp=1,
-            rms_norm_eps=1e-5,
-            rope_scaling=None,
-            rope_theta=500000.0,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32000,  # 128256,
-            # At rope backward
-            # Eager produces incontiguous dq and dk
-            # SDPA produces contiguous dq and incontiguous dk
-            # Flash_attn produces contiguous dq and dk
-            attn_implementation="sdpa",  # default value, pytorch native attention
-        ),
-    ),
     "mini_llama3": MiniModelConfig(
         liger_kernel_patch_func=apply_liger_kernel_to_llama,
         liger_kernel_patch_revert_func=revert_liger_kernel_to_llama,
@@ -418,6 +391,41 @@ if QWEN2_VL_AVAILABLE:
     ),
 )
 
+if GRANITE_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_granite,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
+        model_class=GraniteForCausalLM,
+        mini_model_config=GraniteConfig(
+            attention_bias=False,
+            attention_dropout=0.1,
+            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+            bos_token_id=1,  # 128000
+            eos_token_id=2,  # 128001
+            hidden_act="silu",
+            hidden_size=1024,  # 4096
+            initializer_range=0.02,
+            intermediate_size=2048,  # 14336
+            max_position_embeddings=8192,
+            num_attention_heads=8,  # 32
+            num_hidden_layers=4,  # 32
+            num_key_value_heads=2,  # 8
+            pretraining_tp=1,
+            rms_norm_eps=1e-5,
+            rope_scaling=None,
+            rope_theta=500000.0,
+            tie_word_embeddings=False,
+            use_cache=True,
+            vocab_size=32000,  # 128256,
+            # At rope backward
+            # Eager produces incontiguous dq and dk
+            # SDPA produces contiguous dq and incontiguous dk
+            # Flash_attn produces contiguous dq and dk
+            attn_implementation="sdpa",  # default value, pytorch native attention
+        ),
+    )
+
 
 def create_model(model_name="mini_llama3"):
     """
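The two hunks above are the substance of this release's test refactor: Granite model classes only exist in newer `transformers` releases, so the unconditional import and dict entry are replaced by a guarded import plus conditional registration, with matching `pytest.mark.skipif` guards in the hunks that follow. A condensed sketch of the pattern, with shortened stand-in names:

```python
# Sketch only: the test function and the dict value are stand-ins for the
# real MiniModelConfig registration in the convergence test files.
import pytest

try:
    from transformers.models.granite import GraniteForCausalLM

    GRANITE_AVAILABLE = True
except ImportError:  # older transformers without Granite
    GRANITE_AVAILABLE = False

MINI_MODEL_SETUPS = {}
if GRANITE_AVAILABLE:
    MINI_MODEL_SETUPS["mini_granite3"] = GraniteForCausalLM

@pytest.mark.skipif(not GRANITE_AVAILABLE, reason="Granite not available in this version of transformers")
def test_granite_registered():
    assert "mini_granite3" in MINI_MODEL_SETUPS
```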
@@ -462,7 +470,8 @@ def run_mini_model(
     else:
         kwargs["swiglu"] = True
 
-    kwargs["fused_linear_cross_entropy"] = True
+    # fused_linear_cross_entropy is not supported in mini_granite3
+    kwargs["fused_linear_cross_entropy"] = True if model_name != "mini_granite3" else False
     kwargs["cross_entropy"] = False
 
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs)
@@ -518,7 +527,13 @@ def run_mini_model(
             1e-2,
             1e-2,
             1e-2,
-            marks=
+            marks=[
+                pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+                pytest.mark.skipif(
+                    not GRANITE_AVAILABLE,
+                    reason="Granite not available in this version of transformers",
+                ),
+            ],
         ),
         pytest.param(
             "mini_mllama",
test/convergence/bf16/test_mini_models_with_logits.py
@@ -7,8 +7,6 @@ from transformers.models.gemma import GemmaConfig
 from transformers.models.gemma import GemmaForCausalLM
 from transformers.models.gemma2 import Gemma2Config
 from transformers.models.gemma2 import Gemma2ForCausalLM
-from transformers.models.granite import GraniteConfig
-from transformers.models.granite import GraniteForCausalLM
 from transformers.models.llama import LlamaConfig
 from transformers.models.llama import LlamaForCausalLM
 from transformers.models.mistral import MistralConfig
@@ -65,6 +63,14 @@ try:
 except ImportError:
     QWEN2_VL_AVAILABLE = False
 
+try:
+    from transformers.models.granite import GraniteConfig
+    from transformers.models.granite import GraniteForCausalLM
+
+    GRANITE_AVAILABLE = True
+except ImportError:
+    GRANITE_AVAILABLE = False
+
 from liger_kernel.utils import infer_device
 
 device = infer_device()
@@ -103,40 +109,6 @@ MINI_MODEL_SETUPS = {
             attn_implementation="sdpa",  # default value, pytorch native attention
         ),
     ),
-    "mini_granite3": MiniModelConfig(
-        liger_kernel_patch_func=apply_liger_kernel_to_granite,
-        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
-        model_class=GraniteForCausalLM,
-        mini_model_config=GraniteConfig(
-            attention_bias=False,
-            attention_dropout=0.0,
-            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 128000
-            eos_token_id=2,  # 128001
-            hidden_act="silu",
-            hidden_size=1024,  # 4096
-            initializer_range=0.02,
-            intermediate_size=2048,  # 14336
-            max_position_embeddings=8192,
-            num_attention_heads=8,  # 32
-            num_hidden_layers=4,  # 32
-            num_key_value_heads=2,  # 8
-            pretraining_tp=1,
-            rms_norm_eps=1e-5,
-            rope_scaling=None,
-            rope_theta=500000.0,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32000,  # 128256,
-            logits_scaling=8.0,
-            # At rope backward
-            # Eager produces incontiguous dq and dk
-            # SDPA produces contiguous dq and incontiguous dk
-            # Flash_attn produces contiguous dq and dk
-            attn_implementation="sdpa",  # default value, pytorch native attention
-        ),
-    ),
     "mini_qwen2": MiniModelConfig(
         liger_kernel_patch_func=apply_liger_kernel_to_qwen2,
         liger_kernel_patch_revert_func=revert_liger_kernel_to_qwen2,
@@ -419,6 +391,42 @@ if QWEN2_VL_AVAILABLE:
     ),
 )
 
+if GRANITE_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_granite,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
+        model_class=GraniteForCausalLM,
+        mini_model_config=GraniteConfig(
+            attention_bias=False,
+            attention_dropout=0.0,
+            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+            bos_token_id=1,  # 128000
+            eos_token_id=2,  # 128001
+            hidden_act="silu",
+            hidden_size=1024,  # 4096
+            initializer_range=0.02,
+            intermediate_size=2048,  # 14336
+            max_position_embeddings=8192,
+            num_attention_heads=8,  # 32
+            num_hidden_layers=4,  # 32
+            num_key_value_heads=2,  # 8
+            pretraining_tp=1,
+            rms_norm_eps=1e-5,
+            rope_scaling=None,
+            rope_theta=500000.0,
+            tie_word_embeddings=False,
+            use_cache=True,
+            vocab_size=32000,  # 128256,
+            logits_scaling=8.0,
+            # At rope backward
+            # Eager produces incontiguous dq and dk
+            # SDPA produces contiguous dq and incontiguous dk
+            # Flash_attn produces contiguous dq and dk
+            attn_implementation="sdpa",  # default value, pytorch native attention
+        ),
+    )
+
 
 def create_model(model_name="mini_llama3"):
     """
@@ -518,7 +526,13 @@ def run_mini_model(
             1e-2,  # logits rtol
             1e-2,
             1e-2,
-            marks=
+            marks=[
+                pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"),
+                pytest.mark.skipif(
+                    not GRANITE_AVAILABLE,
+                    reason="Granite not available in this version of transformers",
+                ),
+            ],
         ),
         pytest.param(
             "mini_mllama",
test/convergence/fp32/test_mini_models.py
@@ -7,8 +7,6 @@ from transformers.models.gemma import GemmaConfig
 from transformers.models.gemma import GemmaForCausalLM
 from transformers.models.gemma2 import Gemma2Config
 from transformers.models.gemma2 import Gemma2ForCausalLM
-from transformers.models.granite import GraniteConfig
-from transformers.models.granite import GraniteForCausalLM
 from transformers.models.llama import LlamaConfig
 from transformers.models.llama import LlamaForCausalLM
 from transformers.models.mistral import MistralConfig
@@ -64,44 +62,19 @@ try:
 except ImportError:
     QWEN2_VL_AVAILABLE = False
 
+try:
+    from transformers.models.granite import GraniteConfig
+    from transformers.models.granite import GraniteForCausalLM
+
+    GRANITE_AVAILABLE = True
+except ImportError:
+    GRANITE_AVAILABLE = False
+
 from liger_kernel.utils import infer_device
 
 device = infer_device()
 
 MINI_MODEL_SETUPS = {
-    "mini_granite3": MiniModelConfig(
-        liger_kernel_patch_func=apply_liger_kernel_to_granite,
-        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
-        model_class=GraniteForCausalLM,
-        mini_model_config=GraniteConfig(
-            attention_bias=False,
-            attention_dropout=0.1,
-            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
-            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
-            bos_token_id=1,  # 128000
-            eos_token_id=2,  # 128001
-            hidden_act="silu",
-            hidden_size=1024,  # 4096
-            initializer_range=0.02,
-            intermediate_size=2048,  # 14336
-            max_position_embeddings=8192,
-            num_attention_heads=8,  # 32
-            num_hidden_layers=4,  # 32
-            num_key_value_heads=2,  # 8
-            pretraining_tp=1,
-            rms_norm_eps=1e-5,
-            rope_scaling=None,
-            rope_theta=500000.0,
-            tie_word_embeddings=False,
-            use_cache=True,
-            vocab_size=32000,  # 128256,
-            # At rope backward
-            # Eager produces incontiguous dq and dk
-            # SDPA produces contiguous dq and incontiguous dk
-            # Flash_attn produces contiguous dq and dk
-            attn_implementation="sdpa",  # default value, pytorch native attention
-        ),
-    ),
     "mini_llama3": MiniModelConfig(
         liger_kernel_patch_func=apply_liger_kernel_to_llama,
         liger_kernel_patch_revert_func=revert_liger_kernel_to_llama,
@@ -417,6 +390,41 @@ if QWEN2_VL_AVAILABLE:
     ),
 )
 
+if GRANITE_AVAILABLE:
+    MINI_MODEL_SETUPS["mini_granite3"] = MiniModelConfig(
+        liger_kernel_patch_func=apply_liger_kernel_to_granite,
+        liger_kernel_patch_revert_func=revert_liger_kernel_to_granite,
+        model_class=GraniteForCausalLM,
+        mini_model_config=GraniteConfig(
+            attention_bias=False,
+            attention_dropout=0.1,
+            # Special token ids/vocab size to match Mistral-7B tokenizer used to create the tokenized dataset
+            # https://huggingface.co/mistralai/Mistral-7B-v0.1/blob/main/config.json
+            bos_token_id=1,  # 128000
+            eos_token_id=2,  # 128001
+            hidden_act="silu",
+            hidden_size=1024,  # 4096
+            initializer_range=0.02,
+            intermediate_size=2048,  # 14336
+            max_position_embeddings=8192,
+            num_attention_heads=8,  # 32
+            num_hidden_layers=4,  # 32
+            num_key_value_heads=2,  # 8
+            pretraining_tp=1,
+            rms_norm_eps=1e-5,
+            rope_scaling=None,
+            rope_theta=500000.0,
+            tie_word_embeddings=False,
+            use_cache=True,
+            vocab_size=32000,  # 128256,
+            # At rope backward
+            # Eager produces incontiguous dq and dk
+            # SDPA produces contiguous dq and incontiguous dk
+            # Flash_attn produces contiguous dq and dk
+            attn_implementation="sdpa",  # default value, pytorch native attention
+        ),
+    )
+
 
 def create_model(model_name="mini_llama3"):
     """
@@ -461,7 +469,8 @@ def run_mini_model(
     else:
         kwargs["swiglu"] = True
 
-    kwargs["fused_linear_cross_entropy"] = True
+    # fused_linear_cross_entropy is not supported in mini_granite3
+    kwargs["fused_linear_cross_entropy"] = True if model_name != "mini_granite3" else False
     kwargs["cross_entropy"] = False
 
     MINI_MODEL_SETUPS[model_name].liger_kernel_patch_func(**kwargs)
@@ -535,7 +544,22 @@ def run_mini_model(
         ("mini_gemma1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         ("mini_gemma1.1", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
         ("mini_gemma2", 32, 1e-4, torch.float32, 1e-8, 1e-4, 5e-3, 1e-5, 5e-3, 1e-5),
-        (
+        pytest.param(
+            "mini_granite3",
+            32,
+            1e-4,
+            torch.float32,
+            1e-8,
+            1e-4,
+            5e-3,
+            1e-5,
+            5e-3,
+            1e-5,
+            marks=pytest.mark.skipif(
+                not GRANITE_AVAILABLE,
+                reason="Granite not available in this version of transformers",
+            ),
+        ),
     ],
 )
 def test_mini_model(
|