PyPI - liger-kernel - Versions diffs - 0.6.4__tar.gz → 0.6.5__tar.gz - Mend

liger-kernel 0.6.4__tar.gz → 0.6.5__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (326) hide show

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/amd-ci.yml RENAMED Viewed

@@ -26,10 +26,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'
@@ -50,10 +50,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/benchmark.yml RENAMED Viewed

@@ -35,22 +35,26 @@ jobs:
       OUTPUT_DIR: benchmarks
       OUTPUT_FILENAME: benchmark.csv
       GENERATED_CSV: benchmark/data/all_benchmark_data.csv
+      # Sanitize user-controlled inputs by declaring them as environment variables
+      # This prevents command injection attacks by filtering dangerous characters
+      INPUT_COMMIT_HASH: ${{ github.event.inputs.commit_hash }}
+      INPUT_OVERWRITE: ${{ github.event.inputs.overwrite }}
     steps:
       # Step: Decide the commit hash to use
       # Step: Checkout full history so we can check out any commit
       - name: Checkout full repo history
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6
         with:
           fetch-depth: 0  # Important: so we can checkout arbitrary commit
       - name: Determine commit hash to checkout
         id: choose_commit
         run: |
-          if [ "${{ github.event_name}}" == "workflow_dispatch" ] && [ "${{ github.event.inputs.commit_hash }}" != "main" ]; then
-            echo "Using manual input commit: ${{ github.event.inputs.commit_hash }}"
-            echo "hash=${{ github.event.inputs.commit_hash }}" >> $GITHUB_OUTPUT
+          if [ "${{ github.event_name}}" == "workflow_dispatch" ] && [ "$INPUT_COMMIT_HASH" != "main" ]; then
+            echo "Using manual input commit: $INPUT_COMMIT_HASH"
+            echo "hash=$INPUT_COMMIT_HASH" >> $GITHUB_OUTPUT
           else
             echo "Using latest commit from main"
             echo "hash=$(git rev-parse --short HEAD)" >> $GITHUB_OUTPUT
@@ -60,10 +64,10 @@ jobs:
       - name: Replace benchmark folder from main (manual only, commit ≠ main)
         if: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.commit_hash != 'main' }}
         run: |
-          echo "Detected manual trigger with commit_hash = ${{ github.event.inputs.commit_hash }}"
+          echo "Detected manual trigger with commit_hash = $INPUT_COMMIT_HASH"
           # Save current branch (detached HEAD at old commit)
-          ORIG_COMMIT=${{ github.event.inputs.commit_hash }}
+          ORIG_COMMIT="$INPUT_COMMIT_HASH"
           # Fetch and checkout main
           git fetch origin main
@@ -72,7 +76,7 @@ jobs:
           # Save benchmark folder from main
           cp -r benchmark /tmp/benchmark_main
           # Checkout back to target commit
-          git checkout $ORIG_COMMIT
+          git checkout "$ORIG_COMMIT"
           # Replace old benchmark with one from main
           rm -rf benchmark
           cp -r /tmp/benchmark_main benchmark
@@ -85,7 +89,7 @@ jobs:
           if curl --output /dev/null --silent --head --fail "$BENCHMARK_URL"; then
             echo "Benchmark already exists for commit $COMMIT_HASH"
-            if [ "${{ github.event.inputs.overwrite }}" != "true" ]; then
+            if [ "$INPUT_OVERWRITE" != "true" ]; then
               echo "Overwrite is false - exiting"
               exit 1
             else
@@ -96,7 +100,7 @@ jobs:
           fi
       - name: Set up Python
-        uses: actions/setup-python@v3
+        uses: actions/setup-python@v6
         with:
           python-version: '3.10'
@@ -117,7 +121,7 @@ jobs:
        # Step 5: Checkout gh-pages branch in a subfolderAdd commentMore actions
       - name: Checkout gh-pages
-        uses: actions/checkout@v3
+        uses: actions/checkout@v6
         with:
           ref: gh-pages
           path: gh-pages

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/docs.yml RENAMED Viewed

@@ -13,16 +13,16 @@ jobs:
   deploy:
     runs-on: ubuntu-latest
     steps:
-      - uses: actions/checkout@v4
+      - uses: actions/checkout@v6
       - name: Configure Git Credentials
         run: |
           git config user.name github-actions[bot]
           git config user.email 41898282+github-actions[bot]@users.noreply.github.com
-      - uses: actions/setup-python@v5
+      - uses: actions/setup-python@v6
         with:
           python-version: 3.x
       - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
-      - uses: actions/cache@v4
+      - uses: actions/cache@v5
         with:
           key: mkdocs-material-${{ env.cache_id }}
           path: .cache

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/intel-ci.yml RENAMED Viewed

@@ -26,10 +26,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'
@@ -58,7 +58,7 @@ jobs:
         apt-get clean && rm -rf /var/lib/apt/lists/*
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Setup Dependencies
       shell: bash

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/nvi-ci.yml RENAMED Viewed

@@ -25,10 +25,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'
@@ -49,10 +49,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'
@@ -75,10 +75,10 @@ jobs:
     steps:
     - name: Checkout code
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/publish-nightly.yml RENAMED Viewed

@@ -13,10 +13,10 @@ jobs:
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.8'

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/.github/workflows/publish-release.yml RENAMED Viewed

@@ -10,10 +10,10 @@ jobs:
     steps:
     - name: Checkout repository
-      uses: actions/checkout@v3
+      uses: actions/checkout@v6
     - name: Set up Python
-      uses: actions/setup-python@v3
+      uses: actions/setup-python@v6
       with:
         python-version: '3.10'

liger_kernel-0.6.5/.pre-commit-config.yaml ADDED Viewed

@@ -0,0 +1,10 @@
+repos:
+- repo: https://github.com/astral-sh/ruff-pre-commit
+  # Ruff version.
+  rev: v0.14.11
+  hooks:
+    # Run the linter.
+    - id: ruff-check
+      args: [ --fix ]
+    # Run the formatter.
+    - id: ruff-format

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: liger_kernel
-Version: 0.6.4
+Version: 0.6.5
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation
@@ -33,7 +33,7 @@ License-File: NOTICE
 Requires-Dist: torch>=2.1.2
 Requires-Dist: triton>=2.3.1
 Provides-Extra: dev
-Requires-Dist: transformers>=4.49.0; extra == "dev"
+Requires-Dist: transformers<5.0.0,>=4.49.0; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
 Requires-Dist: ruff>=0.12.0; extra == "dev"
 Requires-Dist: pytest>=7.1.2; extra == "dev"
@@ -45,6 +45,7 @@ Requires-Dist: datasets>=2.19.2; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
 Requires-Dist: mkdocs-material; extra == "dev"
 Requires-Dist: torchvision>=0.20; extra == "dev"
+Requires-Dist: prek>=0.2.28; extra == "dev"
 Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-dist
@@ -82,8 +83,8 @@ Dynamic: requires-dist
             </a>
         </td>
         <td style="padding: 10px;">
-            <a href="https://discord.gg/gpumode">
-                <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
+            <a href="https://discord.gg/X4MaxPgA">
+                <img src="https://dcbadge.limes.pink/api/server/https://discord.gg/X4MaxPgA?style=flat" alt="Join Our Discord">
             </a>
         </td>
     </tr>
@@ -98,6 +99,7 @@ Dynamic: requires-dist
 <details>
   <summary>Latest News 🔥</summary>
+  - [2025/12/19] We announced a liger kernel discord channel at https://discord.gg/X4MaxPgA; We will be hosting Liger Kernel x Triton China Meetup in mid of January 2026
   - [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
   - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
   - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
@@ -116,6 +118,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
 You can view the documentation site for additional installation, usage examples, and API references:https://linkedin.github.io/Liger-Kernel/
+You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
 ## Supercharge Your Model with Liger Kernel
 ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -315,6 +319,7 @@ loss.backward()
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Olmo3   | `liger_kernel.transformers.apply_liger_kernel_to_olmo3`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| GPT-OSS   | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss`     | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
 | InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | HunyuanV1   | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense`    |  RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
 | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
@@ -444,3 +449,5 @@ url={https://openreview.net/forum?id=36SjAIT42G}
         ↑ Back to Top ↑
     </a>
 </p>

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/README.md RENAMED Viewed

@@ -31,8 +31,8 @@
             </a>
         </td>
         <td style="padding: 10px;">
-            <a href="https://discord.gg/gpumode">
-                <img src="https://dcbadge.limes.pink/api/server/gpumode?style=flat" alt="Join Our Discord">
+            <a href="https://discord.gg/X4MaxPgA">
+                <img src="https://dcbadge.limes.pink/api/server/https://discord.gg/X4MaxPgA?style=flat" alt="Join Our Discord">
             </a>
         </td>
     </tr>
@@ -47,6 +47,7 @@
 <details>
   <summary>Latest News 🔥</summary>
+  - [2025/12/19] We announced a liger kernel discord channel at https://discord.gg/X4MaxPgA; We will be hosting Liger Kernel x Triton China Meetup in mid of January 2026
   - [2025/03/06] We release a joint blog post on TorchTune × Liger - [Peak Performance, Minimized Memory: Optimizing torchtune’s performance with torch.compile & Liger Kernel](https://pytorch.org/blog/peak-performance-minimized-memory/)
   - [2024/12/11] We release [v0.5.0](https://github.com/linkedin/Liger-Kernel/releases/tag/v0.5.0): 80% more memory efficient post training losses (DPO, ORPO, CPO, etc)!
   - [2024/12/5] We release LinkedIn Engineering Blog - [Liger-Kernel: Empowering an open source ecosystem of Triton Kernels for Efficient LLM Training](https://www.linkedin.com/blog/engineering/open-source/liger-kernel-open-source-ecosystem-for-efficient-llm-training)
@@ -65,6 +66,8 @@ We've also added optimized Post-Training kernels that deliver **up to 80% memory
 You can view the documentation site for additional installation, usage examples, and API references:https://linkedin.github.io/Liger-Kernel/
+You can view the Liger Kernel Technical Report: https://openreview.net/forum?id=36SjAIT42G
 ## Supercharge Your Model with Liger Kernel
 ![Banner](https://raw.githubusercontent.com/linkedin/Liger-Kernel/main/docs/images/banner.GIF)
@@ -264,6 +267,7 @@ loss.backward()
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | Olmo3   | `liger_kernel.transformers.apply_liger_kernel_to_olmo3`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| GPT-OSS   | `liger_kernel.transformers.apply_liger_kernel_to_gpt_oss`     | RoPE, RMSNorm, CrossEntropyLoss, FusedLinearCrossEntropy |
 | InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | HunyuanV1   | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense`    |  RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
 | HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy       |
@@ -393,3 +397,5 @@ url={https://openreview.net/forum?id=36SjAIT42G}
         ↑ Back to Top ↑
     </a>
 </p>

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/benchmark/scripts/benchmark_distill_jsd_loss.py RENAMED Viewed

@@ -12,6 +12,7 @@ from utils import parse_benchmark_script_args
 from utils import run_benchmarks
 from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction
+from liger_kernel.utils import get_total_gpu_memory
 from liger_kernel.utils import infer_device
 device = infer_device()
@@ -224,12 +225,20 @@ def bench_speed_jsd_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
 if __name__ == "__main__":
     args = parse_benchmark_script_args()
+    gpu_memory_gbs = get_total_gpu_memory()
+    # We know that the full test will require 69GBs for vocab size 2^13 and 39GBs for vocab size 2^12 on torch
+    if gpu_memory_gbs >= 69:
+        x_max = 13
+    elif gpu_memory_gbs >= 39:
+        x_max = 12
+    else:
+        x_max = 11
     common_configs = {
         "kernel_name": "distill_jsd_loss",
         "x_name": "BT",
         "x_label": "B x T",
-        "x_values": [2**i for i in range(10, 14)],
+        "x_values": [2**i for i in range(10, x_max + 1)],
         "kernel_providers": ["liger", "torch"],
         "extra_benchmark_configs": [
             {

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/benchmark/scripts/benchmark_jsd.py RENAMED Viewed

@@ -9,6 +9,7 @@ from utils import parse_benchmark_script_args
 from utils import run_benchmarks
 from liger_kernel.transformers.jsd import LigerJSD
+from liger_kernel.utils import get_total_gpu_memory
 from liger_kernel.utils import infer_device
 device = infer_device()
@@ -123,11 +124,17 @@ def bench_memory_jsd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput
 if __name__ == "__main__":
     args = parse_benchmark_script_args()
+    gpu_memory_gbs = get_total_gpu_memory()
+    # We know that the full test will require 54GBs for vocab size 2^17 on torch
+    if gpu_memory_gbs >= 54:
+        x_max = 17
+    else:
+        x_max = 16
     common_args = {
         "kernel_name": "jsd",
         "x_name": "V",
         "x_label": "vocab size",
-        "x_values": [2**i for i in range(12, 18)],
+        "x_values": [2**i for i in range(12, x_max + 1)],
         "kernel_providers": ["liger", "torch"],
         "extra_benchmark_configs": [{"B": 4, "T": 2048}],
         "overwrite": args.overwrite,

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/benchmark/scripts/benchmark_qwen2vl_mrope.py RENAMED Viewed

@@ -1,6 +1,7 @@
 import torch
 import triton
+from transformers.models.qwen2_vl.configuration_qwen2_vl import Qwen2VLTextConfig
 from transformers.models.qwen2_vl.modeling_qwen2_vl import Qwen2VLRotaryEmbedding
 from transformers.models.qwen2_vl.modeling_qwen2_vl import apply_multimodal_rotary_pos_emb
 from utils import QUANTILES
@@ -32,7 +33,20 @@ def bench_speed_qwen2vl_mrope(
     seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x
     head_dim = hidden_size // num_q_heads
-    rotary_emb = Qwen2VLRotaryEmbedding(head_dim, device=device)
+    mrope_section_hw = head_dim * 3 // 16
+    mrope_section = [
+        head_dim // 2 - 2 * mrope_section_hw,
+        mrope_section_hw,
+        mrope_section_hw,
+    ]
+    config = Qwen2VLTextConfig(
+        hidden_size=hidden_size,
+        num_attention_heads=num_q_heads,
+        num_key_value_heads=num_kv_heads,
+        rope_theta=1000000.0,
+        mrope_section=mrope_section,
+    )
+    rotary_emb = Qwen2VLRotaryEmbedding(config, device=device)
     q = torch.randn(
         (1, seq_len, num_q_heads, head_dim),
         device=device,
@@ -47,18 +61,11 @@ def bench_speed_qwen2vl_mrope(
     ).transpose(1, 2)
     dq, dk = (
         torch.randn_like(q, device=device, dtype=dtype),
-        torch.randn_like(k, device=device),
+        torch.randn_like(k, device=device, dtype=dtype),
     )
     pos_ids = torch.arange(seq_len * 3, device=device, dtype=torch.long).view(3, 1, -1)
     cos, sin = rotary_emb(k, pos_ids)
-    mrope_section_hw = head_dim * 3 // 16
-    mrope_section = [
-        head_dim // 2 - 2 * mrope_section_hw,
-        mrope_section_hw,
-        mrope_section_hw,
-    ]
     def fwd():
         if provider == "liger":
             return liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section)
@@ -116,7 +123,21 @@ def bench_memory_qwen2vl_mrope(
     seq_len = extra_benchmark_config["seq_len"] if "seq_len" in extra_benchmark_config else input.x
     head_dim = hidden_size // num_q_heads
-    rotary_emb = Qwen2VLRotaryEmbedding(head_dim, device=device)
+    mrope_section_hw = head_dim * 3 // 16
+    mrope_section = [
+        head_dim // 2 - 2 * mrope_section_hw,
+        mrope_section_hw,
+        mrope_section_hw,
+    ]
+    config = Qwen2VLTextConfig(
+        hidden_size=hidden_size,
+        num_attention_heads=num_q_heads,
+        num_key_value_heads=num_kv_heads,
+        rope_theta=1000000.0,
+        mrope_section=mrope_section,
+    )
+    rotary_emb = Qwen2VLRotaryEmbedding(config, device=device)
     q = torch.randn(
         (1, seq_len, num_q_heads, head_dim),
         device=device,
@@ -131,18 +152,11 @@ def bench_memory_qwen2vl_mrope(
     ).transpose(1, 2)
     dq, dk = (
         torch.randn_like(q, device=device, dtype=dtype),
-        torch.randn_like(k, device=device),
+        torch.randn_like(k, device=device, dtype=dtype),
     )
     pos_ids = torch.arange(seq_len * 3, device=device, dtype=torch.long).view(3, 1, -1)
     cos, sin = rotary_emb(k, pos_ids)
-    mrope_section_hw = head_dim * 3 // 16
-    mrope_section = [
-        head_dim // 2 - 2 * mrope_section_hw,
-        mrope_section_hw,
-        mrope_section_hw,
-    ]
     def full():
         if provider == "liger":
             q_out, k_out = liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section)

{liger_kernel-0.6.4 → liger_kernel-0.6.5}/benchmark/scripts/benchmark_tvd.py RENAMED Viewed

@@ -9,6 +9,10 @@ from utils import parse_benchmark_script_args
 from utils import run_benchmarks
 from liger_kernel.transformers.tvd import LigerTVDLoss
+from liger_kernel.utils import get_total_gpu_memory
+from liger_kernel.utils import infer_device
+device = infer_device()
 class TorchTVDLoss(torch.nn.Module):
@@ -40,8 +44,8 @@ def bench_speed_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
     torch_tvd = TorchTVDLoss(reduction=reduction)
     liger_tvd = LigerTVDLoss(reduction=reduction)
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").softmax(dim=-1)
-    target = torch.randn(B * T, V, device="cuda").softmax(dim=-1)
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).softmax(dim=-1)
     def fwd():
         if input.kernel_provider == "liger":
@@ -82,8 +86,8 @@ def bench_memory_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput
     V = input.x
     B, T = input.extra_benchmark_config["B"], input.extra_benchmark_config["T"]
-    _input = torch.randn(B * T, V, requires_grad=True, device="cuda").softmax(dim=-1)
-    target = torch.randn(B * T, V, device="cuda").softmax(dim=-1)
+    _input = torch.randn(B * T, V, requires_grad=True, device=device).softmax(dim=-1)
+    target = torch.randn(B * T, V, device=device).softmax(dim=-1)
     def fwd():
         if input.kernel_provider == "liger":
@@ -106,11 +110,17 @@ def bench_memory_tvd(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput
 if __name__ == "__main__":
     args = parse_benchmark_script_args()
+    gpu_memory_gbs = get_total_gpu_memory()
+    # We know that the full test will require 66GBs for vocab size 2^17
+    if gpu_memory_gbs >= 66:
+        x_max = 17
+    else:
+        x_max = 16
     common_args = {
         "kernel_name": "tvd",
         "x_name": "V",
         "x_label": "vocab size",
-        "x_values": [2**i for i in range(12, 18)],
+        "x_values": [2**i for i in range(12, x_max + 1)],
         "kernel_providers": ["liger", "torch"],
         "extra_benchmark_configs": [{"B": 8, "T": 2048}],
         "overwrite": args.overwrite,

liger-kernel 0.6.4__tar.gz → 0.6.5__tar.gz

liger-kernel 0.6.4__tar.gz → 0.6.5__tar.gz