PyPI - liger-kernel - Versions diffs - 0.6.2__tar.gz → 0.6.3__tar.gz - Mend

liger-kernel 0.6.2.tar.gz → 0.6.3.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (299) hide show

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/.github/workflows/amd-ci.yml RENAMED Viewed

@@ -13,9 +13,6 @@ on:
     paths:
       - "src/**"
       - "test/**"
-  schedule:
-    # Runs at 00:00 UTC daily
-    - cron: '0 0 * * *'
   workflow_dispatch:  # Enables manual trigger
 concurrency:
@@ -64,7 +61,8 @@ jobs:
       run: |
         rocm-smi
         python -m pip install --upgrade pip
-        pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}
+        pip install -e .[dev]
+        pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm${{ matrix.rocm_version }}/
     - name: List Python Environments
       run: python -m pip list

liger_kernel-0.6.3/.github/workflows/docs.yml ADDED Viewed

@@ -0,0 +1,64 @@
+name: Publish documentation
+on:
+  push:
+    branches:
+      - main
+    paths:
+      - 'docs/**'
+      - 'mkdocs.yml'
+permissions:
+  contents: write
+jobs:
+  deploy:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - name: Configure Git Credentials
+        run: |
+          git config user.name github-actions[bot]
+          git config user.email 41898282+github-actions[bot]@users.noreply.github.com
+      - uses: actions/setup-python@v5
+        with:
+          python-version: 3.x
+      - run: echo "cache_id=$(date --utc '+%V')" >> $GITHUB_ENV
+      - uses: actions/cache@v4
+        with:
+          key: mkdocs-material-${{ env.cache_id }}
+          path: .cache
+          restore-keys: |
+            mkdocs-material-
+      - run: pip install mkdocs-material mkdocstrings[python]
+      # ====== Backup the benchmarks from gh-pages ======
+      # This is necessary because the benchmarks are not included in the documentation build process.
+      # So we need to backup the benchmarks from gh-pages and restore them after the documentation is built.
+      - name: Backup benchmarks from gh-pages
+        run: |
+          git fetch origin gh-pages
+          # create worktree bound to local gh-pages, tracking origin/gh-pages
+          git branch -f gh-pages origin/gh-pages || true
+          mkdir -p ghp && git worktree add ghp gh-pages || true
+          if [ -d ghp/benchmarks ]; then
+            tar -C ghp -czf /tmp/benchmarks.tgz benchmarks
+          fi
+          # IMPORTANT: remove worktree so gh-pages isn't checked out anywhere
+          git worktree remove ghp --force || true
+          echo "Backed up benchmarks from gh-pages"
+      # ====== Deploy the documentation ======
+      - name: Deploy documentation
+        run: mkdocs gh-deploy --force
+      # ====== Restore the benchmarks onto gh-pages ======
+      # This is necessary because the benchmarks are not included in the documentation build process.
+      # So we need to restore the benchmarks onto gh-pages after the documentation is built.
+      - name: Restore benchmarks onto gh-pages
+        run: |
+          # Refresh remote tracking and recreate a clean worktree
+          git fetch origin gh-pages
+          git worktree add -B gh-pages ghp origin/gh-pages
+          if [ -f /tmp/benchmarks.tgz ]; then
+            tar -C ghp -xzf /tmp/benchmarks.tgz
+            git -C ghp add -A
+            git -C ghp commit -m "Restore benchmarks after gh-deploy" || echo "No changes"
+            git -C ghp push origin gh-pages
+          fi
+          git worktree remove ghp --force || true

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/.github/workflows/intel-ci.yml RENAMED Viewed

@@ -13,9 +13,6 @@ on:
     paths:
       - "src/**"
       - "test/**"
-  schedule:
-    # Runs at 00:00 UTC daily
-    - cron: '0 0 * * *'
   workflow_dispatch:  # Enables manual trigger
 concurrency:

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/.github/workflows/nvi-ci.yml RENAMED Viewed

@@ -13,9 +13,6 @@ on:
     paths:
       - "src/**"
       - "test/**"
-  schedule:
-    # Runs at 00:00 UTC daily
-    - cron: '0 0 * * *'
   workflow_dispatch:  # Enables manual trigger
 concurrency:

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/.gitignore RENAMED Viewed

@@ -23,4 +23,5 @@ uv.lock
 # Benchmark images
 benchmark/visualizations
-.vscode/
+.vscode/
+.coverage

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/Makefile RENAMED Viewed

@@ -5,7 +5,15 @@ all: checkstyle test test-convergence
 # Command to run pytest for correctness tests
 test:
-	python -m pytest --disable-warnings test/ --ignore=test/convergence
+	python -m pytest --disable-warnings \
+		--cov=src/liger_kernel \
+		--cov-report=term-missing \
+		--ignore=test/convergence \
+		test/
+# Command to run coverage report
+coverage:
+	coverage report -m
 # Command to run ruff for linting and formatting code
 checkstyle:

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: liger_kernel
-Version: 0.6.2
+Version: 0.6.3
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation
@@ -35,15 +35,14 @@ Requires-Dist: triton>=2.3.1
 Provides-Extra: dev
 Requires-Dist: transformers>=4.49.0; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
-Requires-Dist: flake8>=4.0.1.1; extra == "dev"
-Requires-Dist: black>=24.4.2; extra == "dev"
-Requires-Dist: isort>=5.13.2; extra == "dev"
+Requires-Dist: ruff>=0.12.0; extra == "dev"
 Requires-Dist: pytest>=7.1.2; extra == "dev"
 Requires-Dist: pytest-xdist; extra == "dev"
+Requires-Dist: pytest-cov; extra == "dev"
+Requires-Dist: pytest-asyncio; extra == "dev"
 Requires-Dist: pytest-rerunfailures; extra == "dev"
 Requires-Dist: datasets>=2.19.2; extra == "dev"
 Requires-Dist: seaborn; extra == "dev"
-Requires-Dist: mkdocs; extra == "dev"
 Requires-Dist: mkdocs-material; extra == "dev"
 Requires-Dist: torchvision>=0.20; extra == "dev"
 Dynamic: license-file
@@ -181,8 +180,8 @@ y = orpo_loss(lm_head.weight, x, target)
 - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
 ```bash
-# Need to pass the url when installing
-pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+pip install -e .[dev]
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
 ### Optional Dependencies
@@ -216,6 +215,9 @@ pip install -e .
 # Setup Development Dependencies
 pip install -e ".[dev]"
+# NOTE -> For AMD users only
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
@@ -312,6 +314,7 @@ loss.backward()
 | Granite 3.0 & 3.1   | `liger_kernel.transformers.apply_liger_kernel_to_granite`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 ## Low-level APIs
@@ -391,17 +394,17 @@ loss.backward()
         <td style="padding: 10px;">
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
         </td>

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/README.md RENAMED Viewed

@@ -129,8 +129,8 @@ y = orpo_loss(lm_head.weight, x, target)
 - `triton >= 3.0.0` Install from pypi. (e.g. `pip install triton==3.0.0`)
 ```bash
-# Need to pass the url when installing
-pip install -e .[dev] --extra-index-url https://download.pytorch.org/whl/nightly/rocm6.2
+pip install -e .[dev]
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
 ### Optional Dependencies
@@ -164,6 +164,9 @@ pip install -e .
 # Setup Development Dependencies
 pip install -e ".[dev]"
+# NOTE -> For AMD users only
+pip3 install --pre torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/rocm6.3/
 ```
@@ -260,6 +263,7 @@ loss.backward()
 | Granite 3.0 & 3.1   | `liger_kernel.transformers.apply_liger_kernel_to_granite`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss |
 | OLMo2   | `liger_kernel.transformers.apply_liger_kernel_to_olmo2`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 | GLM-4   | `liger_kernel.transformers.apply_liger_kernel_to_glm4`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
+| InternVL3   | `liger_kernel.transformers.apply_liger_kernel_to_internvl`     | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy |
 ## Low-level APIs
@@ -339,17 +343,17 @@ loss.backward()
         <td style="padding: 10px;">
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/nvi-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/amd-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
             <div style="display: block;">
                 <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml">
-                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?event=schedule" alt="Build">
+                    <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/intel-ci.yml/badge.svg?branch=main&event=push" alt="Build">
                 </a>
             </div>
         </td>

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/benchmark/data/all_benchmark_data.csv RENAMED Viewed

@@ -1575,6 +1575,70 @@ fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,4096,416.11767578
 fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,8192,832.22705078125,832.22705078125,832.22705078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
 fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,16384,1544.44580078125,1544.44580078125,1544.44580078125,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
 fused_add_rms_norm,liger_rms_norm,full,memory,MB,H,hidden size,32768,2960.8837890625,2960.8837890625,2960.8837890625,"{""M"": 2048, ""dtype"": ""torch.float32"", ""eps"": 1e-06}",NVIDIA H100 NVL,2025-07-16 07:04:45,0.6.0
+fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,2,40.75366401672363,40.749671173095706,40.75765686035156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,4,80.95231628417969,80.95231628417969,80.95231628417969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,8,163.58604431152344,163.58604431152344,163.58604431152344,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+fused_linear_grpo_loss_token,liger,forward,speed,ms,B,B,16,323.6761474609375,323.6761474609375,323.6761474609375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:58:45,0.6.1
+fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,2,23.71225643157959,23.612825775146483,23.8354434967041,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,4,46.86131286621094,46.80355911254883,46.91906661987304,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,8,94.54898834228516,94.54898834228516,94.54898834228516,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+fused_linear_grpo_loss_token,torch,forward,speed,ms,B,B,16,189.99501037597656,189.99501037597656,189.99501037597656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-04 23:59:51,0.6.1
+fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,2,42.67263984680176,42.54085083007813,42.80442886352539,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,4,82.2446060180664,82.2446060180664,82.2446060180664,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,8,167.00416564941406,167.00416564941406,167.00416564941406,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+fused_linear_grpo_loss_token,liger,full,speed,ms,B,B,16,327.0911865234375,327.0911865234375,327.0911865234375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:00:58,0.6.1
+fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,2,45.36115264892578,45.241344451904304,45.480960845947266,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,4,90.00038146972656,90.00038146972656,90.00038146972656,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,8,177.22674560546875,177.22674560546875,177.22674560546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+fused_linear_grpo_loss_token,torch,full,speed,ms,B,B,16,356.5383605957031,356.5383605957031,356.5383605957031,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:02:07,0.6.1
+fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,2,1.814527988433838,1.8124799728393555,1.8167808055877686,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,4,1.84934401512146,1.8472959995269775,1.8524160385131836,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8872319459915161,1.893990397453308,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+fused_linear_grpo_loss_token,liger,backward,speed,ms,B,B,16,1.9722239971160889,1.9660799503326416,1.9763200283050537,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:03:11,0.6.1
+fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,2,22.014975547790527,21.710438537597657,22.19417533874512,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,4,41.83603096008301,41.752165222167974,41.91989669799805,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,8,81.66400146484375,81.66400146484375,81.66400146484375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+fused_linear_grpo_loss_token,torch,backward,speed,ms,B,B,16,162.6429443359375,162.6429443359375,162.6429443359375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:04:16,0.6.1
+fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+fused_linear_grpo_loss_token,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:05:31,0.6.1
+fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+fused_linear_grpo_loss_token,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""token"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:06:37,0.6.1
+fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,2,40.72038269042969,40.71178131103516,40.728984069824214,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,4,81.69369506835938,81.69369506835938,81.69369506835938,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,8,162.79653930664062,162.79653930664062,162.79653930664062,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+fused_linear_grpo_loss_sequence,liger,forward,speed,ms,B,B,16,323.6546630859375,323.6546630859375,323.6546630859375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:07:48,0.6.1
+fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,2,23.70047950744629,23.628594589233398,23.732429122924806,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,4,47.36921691894531,47.085364532470706,47.65306930541992,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,8,94.83366394042969,94.83366394042969,94.83366394042969,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+fused_linear_grpo_loss_sequence,torch,forward,speed,ms,B,B,16,190.0963897705078,190.0963897705078,190.0963897705078,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:08:54,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,2,42.318336486816406,42.15214080810547,42.48453216552734,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,4,82.4616928100586,82.4616928100586,82.4616928100586,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,8,163.43756103515625,163.43756103515625,163.43756103515625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,speed,ms,B,B,16,325.4384765625,325.4384765625,325.4384765625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:10:02,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,2,45.99193572998047,45.80761489868165,46.176256561279295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,4,88.57190704345703,88.57190704345703,88.57190704345703,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,8,176.94105529785156,176.94105529785156,176.94105529785156,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,speed,ms,B,B,16,356.0478820800781,356.0478820800781,356.0478820800781,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:11:10,0.6.1
+fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,2,1.8242560029029846,1.8102271556854248,1.8309119939804077,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,4,1.84934401512146,1.846886396408081,1.8534400463104248,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,8,1.891327977180481,1.8892799615859985,1.8933759927749634,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+fused_linear_grpo_loss_sequence,liger,backward,speed,ms,B,B,16,1.9752960205078125,1.9722239971160889,1.977344036102295,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:12:14,0.6.1
+fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,2,22.0262393951416,21.80997085571289,22.20482559204102,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,4,41.54521560668945,41.224806213378905,41.865625,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,8,81.21753692626953,81.21753692626953,81.21753692626953,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+fused_linear_grpo_loss_sequence,torch,backward,speed,ms,B,B,16,160.82022094726562,160.82022094726562,160.82022094726562,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:13:20,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,2,7344.77685546875,7344.77685546875,7344.77685546875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,4,7408.80029296875,7408.80029296875,7408.80029296875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,8,7536.84716796875,7536.84716796875,7536.84716796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+fused_linear_grpo_loss_sequence,liger,full,memory,MB,B,B,16,7792.94091796875,7792.94091796875,7792.94091796875,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:14:28,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,2,9083.28125,9083.28125,9083.28125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,4,13138.3125,13138.3125,13138.3125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,8,21250.375,21250.375,21250.375,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
+fused_linear_grpo_loss_sequence,torch,full,memory,MB,B,B,16,37474.5,37474.5,37474.5,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""importance_sampling_level"": ""sequence"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2025-08-05 00:15:31,0.6.1
 llama4_rope,liger,forward,speed,ms,H,hidden size,512,0.08249600231647491,0.08102399855852127,0.08432000130414963,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
 llama4_rope,liger,forward,speed,ms,H,hidden size,2048,0.08169600367546082,0.08037760108709335,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1
 llama4_rope,liger,forward,speed,ms,H,hidden size,8192,0.08128000050783157,0.07980799674987793,0.08329600095748901,"{""dtype"": ""torch.bfloat16"", ""seq_len"": 2048, ""num_q_heads"": 32, ""num_kv_heads"": 8}",NVIDIA H100 80GB HBM3,2025-08-07 21:42:01,0.6.1

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_cross_entropy.py RENAMED Viewed

@@ -70,6 +70,9 @@ def bench_speed_cross_entropy(
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
+    elif mode == "no-grad-forward":
+        with torch.no_grad():
+            ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, rep=100, quantiles=QUANTILES)
     elif mode == "backward":
         y = fwd()
@@ -109,7 +112,7 @@ if __name__ == "__main__":
     run_benchmarks(
         bench_test_fn=bench_speed_cross_entropy,
-        kernel_operation_modes=["forward", "backward", "full"],
+        kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"],
         metric_name="speed",
         metric_unit="ms",
         **common_configs,

{liger_kernel-0.6.2 → liger_kernel-0.6.3}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py RENAMED Viewed

@@ -59,26 +59,26 @@ def bench_memory_fused_linear_cross_entropy(
     dtype = input.extra_benchmark_config["dtype"]
     provider = input.kernel_provider
-    torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
-    liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
-    liger_lm_head_ce_fp32_accum = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    lm_head_ce = None
+    if provider == "liger":
+        lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
+    elif provider == "liger-fp32-accum":
+        lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    else:
+        lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
     def fwd():
-        if provider == "liger":
-            return liger_lm_head_ce(_input, target)
-        elif provider == "liger-fp32-accum":
-            return liger_lm_head_ce_fp32_accum(_input, target)
-        elif provider == "huggingface":
-            return torch_lm_head_ce(_input, target)
+        return lm_head_ce(_input, target)
     def full():
         y = fwd()
         y.backward()
     mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
     return SingleBenchmarkRunOutput(
         y_20=mem_20,
         y_50=mem_50,
@@ -101,20 +101,19 @@ def bench_speed_fused_linear_cross_entropy(
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
-    torch_lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
-    liger_lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
-    liger_lm_head_ce_fp32_accum = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    lm_head_ce = None
+    if provider == "liger":
+        lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype).to(device)
+    elif provider == "liger-fp32-accum":
+        lm_head_ce = LigerLMHeadCE(H=H, V=V, dtype=dtype, accum_dtype=torch.float32).to(device)
+    else:
+        lm_head_ce = TorchLMHeadCE(H=H, V=V, dtype=dtype).to(device)
     _input = torch.randn(BT, H, requires_grad=True, dtype=dtype, device=device)
     target = torch.randint(V, (BT, 1), dtype=torch.long, device=device).squeeze(1)
     def fwd():
-        if provider == "liger":
-            return liger_lm_head_ce(_input, target)
-        elif provider == "liger-fp32-accum":
-            return liger_lm_head_ce_fp32_accum(_input, target)
-        elif provider == "huggingface":
-            return torch_lm_head_ce(_input, target)
+        return lm_head_ce(_input, target)
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(
@@ -122,6 +121,13 @@ def bench_speed_fused_linear_cross_entropy(
             rep=100,
             quantiles=QUANTILES,
         )
+    elif mode == "no-grad-forward":
+        with torch.no_grad():
+            ms_50, ms_20, ms_80 = triton.testing.do_bench(
+                fwd,
+                rep=100,
+                quantiles=QUANTILES,
+            )
     elif mode == "backward":
         y = fwd()
@@ -164,7 +170,7 @@ if __name__ == "__main__":
     run_benchmarks(
         bench_test_fn=bench_speed_fused_linear_cross_entropy,
-        kernel_operation_modes=["forward", "backward", "full"],
+        kernel_operation_modes=["forward", "backward", "full", "no-grad-forward"],
         metric_name="speed",
         metric_unit="ms",
         **common_configs,

liger-kernel 0.6.2__tar.gz → 0.6.3__tar.gz

liger-kernel 0.6.2tar.gz → 0.6.3tar.gz