PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20241223032630__tar.gz → 0.5.2.dev20241228022953__tar.gz - Mend

liger-kernel-nightly 0.5.2.dev20241223032630__tar.gz → 0.5.2.dev20241228022953__tar.gz

Files changed (204) hide show

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/Makefile RENAMED Viewed

@@ -10,10 +10,9 @@ test:
 # Command to run flake8 (code style check), isort (import ordering), and black (code formatting)
 # Subsequent commands still run if the previous fails, but return failure at the end
 checkstyle:
-	flake8 .; flake8_status=$$?; \
-	isort .; isort_status=$$?; \
-	black .; black_status=$$?; \
-	if [ $$flake8_status -ne 0 ] || [ $$isort_status -ne 0 ] || [ $$black_status -ne 0 ]; then \
+	ruff check . --fix; ruff_check_status=$$?; \
+	ruff format .; ruff_format_status=$$?; \
+	if [ $$ruff_check_status -ne 0 ] || [ $$ruff_format_status -ne 0 ]; then \
 		exit 1; \
 	fi

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20241223032630
+Version: 0.5.2.dev20241228022953
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/benchmarks_visualizer.py RENAMED Viewed

@@ -1,5 +1,6 @@
 import json
 import os
 from argparse import ArgumentParser
 from dataclasses import dataclass
@@ -39,9 +40,7 @@ def parse_args() -> VisualizationsConfig:
         VisualizationsConfig: Configuration object for the visualizations script.
     """
     parser = ArgumentParser()
-    parser.add_argument(
-        "--kernel-name", type=str, required=True, help="Kernel name to benchmark"
-    )
+    parser.add_argument("--kernel-name", type=str, required=True, help="Kernel name to benchmark")
     parser.add_argument(
         "--metric-name",
         type=str,
@@ -54,9 +53,7 @@ def parse_args() -> VisualizationsConfig:
         required=True,
         help="Kernel operation mode to visualize (forward/backward/full)",
     )
-    parser.add_argument(
-        "--display", action="store_true", help="Display the visualization"
-    )
+    parser.add_argument("--display", action="store_true", help="Display the visualization")
     parser.add_argument(
         "--overwrite",
         action="store_true",
@@ -126,7 +123,7 @@ def plot_data(df: pd.DataFrame, config: VisualizationsConfig):
     lines = ax.get_lines()
     colors = [line.get_color() for line in lines]
-    for (_, group_data), color in zip(df.groupby("kernel_provider"), colors):
+    for (_, group_data), color in zip(df.groupby("kernel_provider"), colors, strict=False):
         # for i, row in group_data.iterrows():
         y_error_lower = group_data["y_value_50"] - group_data["y_value_20"]
         y_error_upper = group_data["y_value_80"] - group_data["y_value_50"]
@@ -145,9 +142,7 @@ def plot_data(df: pd.DataFrame, config: VisualizationsConfig):
     plt.ylabel(ylabel)
     plt.tight_layout()
-    out_path = os.path.join(
-        VISUALIZATIONS_PATH, f"{config.kernel_name}_{config.metric_name}.png"
-    )
+    out_path = os.path.join(VISUALIZATIONS_PATH, f"{config.kernel_name}_{config.metric_name}.png")
     if config.display:
         plt.show()

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_cpo_loss.py RENAMED Viewed

@@ -3,14 +3,13 @@ import sys
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.utils import infer_device
@@ -33,9 +32,7 @@ class TorchLMHeadCPO(torch.nn.Module):
         from test.chunked_loss.test_cpo_loss import HFCPOLoss
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)
         self.cpo_loss = HFCPOLoss().get_batch_loss_metrics
     def forward(self, x, y):
@@ -45,9 +42,7 @@ class TorchLMHeadCPO(torch.nn.Module):
 class LigerLMHeadCPO(torch.nn.Module):
     def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100):
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)
         self.cpo_loss = LigerFusedLinearCPOFunction.apply
     def forward(self, x, y):
@@ -180,12 +175,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_fused_linear_cpo_loss,
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_cross_entropy.py RENAMED Viewed

@@ -1,14 +1,13 @@
 import torch
 import triton
 from torch.nn import CrossEntropyLoss
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
 from liger_kernel.utils import infer_device
@@ -86,9 +85,7 @@ def bench_speed_cross_entropy(
             y = fwd()
             y.backward()
-        ms_50, ms_20, ms_80 = triton.testing.do_bench(
-            full, rep=100, quantiles=QUANTILES
-        )
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(full, rep=100, quantiles=QUANTILES)
     return SingleBenchmarkRunOutput(
         y_20=ms_20,
@@ -115,12 +112,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_cross_entropy,
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_dpo_loss.py RENAMED Viewed

@@ -1,15 +1,13 @@
-from test.chunked_loss.test_dpo_loss import HF_DPO_Loss
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from test.chunked_loss.test_dpo_loss import HF_DPO_Loss
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
 from liger_kernel.utils import infer_device
@@ -28,9 +26,7 @@ class TorchDPOLoss(torch.nn.Module):
         bias: bool = False,
     ):
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=bias, dtype=dtype
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
         self.dpo_loss = HF_DPO_Loss(beta=beta, ignore_index=ignore_index)
     def forward(self, x, target):
@@ -53,9 +49,7 @@ class LigerDPOLoss(torch.nn.Module):
         bias: bool = False,
     ):
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=bias, dtype=dtype
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=bias, dtype=dtype)
         self.beta = beta
         self.ignore_index = ignore_index
@@ -82,12 +76,8 @@ def bench_memory_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
     ignore_index = input.extra_benchmark_config["ignore_index"]
     provider = input.kernel_provider
-    torch_dpo_loss = TorchDPOLoss(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)
-    liger_dpo_loss = LigerDPOLoss(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)
+    torch_dpo_loss = TorchDPOLoss(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    liger_dpo_loss = LigerDPOLoss(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
     # Input shape: [B, T, H]
     _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -129,12 +119,8 @@ def bench_speed_dpo_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOu
     provider = input.kernel_provider
     mode = input.kernel_operation_mode
-    torch_dpo_loss = TorchDPOLoss(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)
-    liger_dpo_loss = LigerDPOLoss(
-        H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias
-    ).to(device)
+    torch_dpo_loss = TorchDPOLoss(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
+    liger_dpo_loss = LigerDPOLoss(H=H, V=V, dtype=dtype, beta=beta, ignore_index=ignore_index, bias=bias).to(device)
     # Input shape: [B, T, H]
     _input = torch.randn(B, T, H, device=device, dtype=dtype)
@@ -215,7 +201,7 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
@@ -223,5 +209,5 @@ if __name__ == "__main__":
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_embedding.py RENAMED Viewed

@@ -1,14 +1,13 @@
 import torch
 import triton
 from torch.nn import Embedding
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.transformers.experimental.embedding import LigerEmbedding
 from liger_kernel.utils import infer_device
@@ -50,9 +49,7 @@ def bench_speed_embedding(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunO
     if mode == "forward":
         ms_50, ms_20, ms_80 = triton.testing.do_bench(fwd, quantiles=QUANTILES, rep=100)
     elif mode == "full":
-        ms_50, ms_20, ms_80 = triton.testing.do_bench(
-            full, quantiles=QUANTILES, rep=100
-        )
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, rep=100)
     return SingleBenchmarkRunOutput(
         y_20=ms_20,
         y_50=ms_50,
@@ -118,12 +115,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_embedding,
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_fused_linear_cross_entropy.py RENAMED Viewed

@@ -1,17 +1,14 @@
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
-from liger_kernel.transformers.fused_linear_cross_entropy import (
-    LigerFusedLinearCrossEntropyLoss,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
+from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
 from liger_kernel.utils import infer_device
 device = infer_device()
@@ -28,12 +25,8 @@ class TorchLMHeadCE(torch.nn.Module):
     def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100):
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype
-        )
-        self.ce_loss = torch.nn.CrossEntropyLoss(
-            ignore_index=ignore_index, reduction="mean"
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)
+        self.ce_loss = torch.nn.CrossEntropyLoss(ignore_index=ignore_index, reduction="mean")
     def forward(self, x, y):
         logits = self.lin(x)
@@ -43,12 +36,8 @@ class TorchLMHeadCE(torch.nn.Module):
 class LigerLMHeadCE(torch.nn.Module):
     def __init__(self, H: int, V: int, dtype: torch.dtype, ignore_index: int = -100):
         super().__init__()
-        self.lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype
-        )
-        self.ce_loss = LigerFusedLinearCrossEntropyLoss(
-            ignore_index=ignore_index, reduction="mean"
-        )
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype)
+        self.ce_loss = LigerFusedLinearCrossEntropyLoss(ignore_index=ignore_index, reduction="mean")
     def forward(self, x, y):
         return self.ce_loss(self.lin.weight, x, y)
@@ -161,9 +150,7 @@ if __name__ == "__main__":
         "x_label": "B x T",
         "x_values": [2**i for i in range(12, 16)],
         "kernel_providers": ["liger", "huggingface"],
-        "extra_benchmark_configs": [
-            {"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}
-        ],
+        "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}],
         "overwrite": args.overwrite,
     }
@@ -172,12 +159,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_fused_linear_cross_entropy,
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_fused_linear_jsd.py RENAMED Viewed

@@ -1,13 +1,12 @@
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.transformers.fused_linear_jsd import LigerFusedLinearJSD
 from liger_kernel.utils import infer_device
@@ -37,9 +36,9 @@ class TorchJSD(torch.nn.Module):
         log_p, log_q = log_p.to(torch.float), log_q.to(torch.float)
         log_p, log_q = log_p.view(-1, log_p.size(-1)), log_q.view(-1, log_q.size(-1))
         m = torch.lerp(torch.exp(log_q), torch.exp(log_p), self.beta)
-        loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (
-            1 - self.beta
-        ) * self.kl(torch.log(m), log_q).sum(dim=-1)
+        loss = self.beta * self.kl(torch.log(m), log_p).sum(dim=-1) + (1 - self.beta) * self.kl(
+            torch.log(m), log_q
+        ).sum(dim=-1)
         if label is not None:
             loss = torch.where(label != self.ignore_index, loss, 0.0)
@@ -73,12 +72,8 @@ class TorchLMHeadJSD(torch.nn.Module):
         temperature: float = 1.0,
     ):
         super().__init__()
-        self.student_lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype, device=device
-        )
-        self.teacher_lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype, device=device
-        )
+        self.student_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device)
+        self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device)
         self.jsd = TorchJSD(beta=beta, ignore_index=ignore_index, dtype=dtype)
         self.temperature = temperature
@@ -103,15 +98,9 @@ class LigerLMHeadJSD(torch.nn.Module):
         temperature: float = 1.0,
     ):
         super().__init__()
-        self.student_lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype, device=device
-        )
-        self.teacher_lin = torch.nn.Linear(
-            in_features=H, out_features=V, bias=False, dtype=dtype, device=device
-        )
-        self.fused_jsd = LigerFusedLinearJSD(
-            jsd_beta=beta, ignore_index=ignore_index, temperature=temperature
-        )
+        self.student_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device)
+        self.teacher_lin = torch.nn.Linear(in_features=H, out_features=V, bias=False, dtype=dtype, device=device)
+        self.fused_jsd = LigerFusedLinearJSD(jsd_beta=beta, ignore_index=ignore_index, temperature=temperature)
     def forward(self, student_input, teacher_input, label=None):
         return self.fused_jsd(
@@ -141,12 +130,12 @@ def bench_memory_fused_linear_jsd(
     liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
     # init the linear in all FusedLinearJSDs with the same weights
-    torch_lm_head_jsd.student_lin.weight.data = (
-        liger_lm_head_jsd.student_lin.weight.data
-    ) = torch.rand(V, H, device=device, dtype=dtype)
-    torch_lm_head_jsd.teacher_lin.weight.data = (
-        liger_lm_head_jsd.teacher_lin.weight.data
-    ) = torch.rand(V, H, device=device, dtype=dtype)
+    torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand(
+        V, H, device=device, dtype=dtype
+    )
+    torch_lm_head_jsd.teacher_lin.weight.data = liger_lm_head_jsd.teacher_lin.weight.data = torch.rand(
+        V, H, device=device, dtype=dtype
+    )
     student_input = torch.rand(BT, H, requires_grad=True, dtype=dtype, device=device)
     teacher_input = torch.rand(BT, H, dtype=dtype, device=device)
@@ -189,12 +178,12 @@ def bench_speed_fused_linear_jsd(
     liger_lm_head_jsd = LigerLMHeadJSD(H=H, V=V, dtype=dtype, device=device).to(device)
     # init the linear in all FusedLinearJSDs with the same weights
-    torch_lm_head_jsd.student_lin.weight.data = (
-        liger_lm_head_jsd.student_lin.weight.data
-    ) = torch.rand(V, H, device=device, dtype=dtype)
-    torch_lm_head_jsd.teacher_lin.weight.data = (
-        liger_lm_head_jsd.teacher_lin.weight.data
-    ) = torch.rand(V, H, device=device, dtype=dtype)
+    torch_lm_head_jsd.student_lin.weight.data = liger_lm_head_jsd.student_lin.weight.data = torch.rand(
+        V, H, device=device, dtype=dtype
+    )
+    torch_lm_head_jsd.teacher_lin.weight.data = liger_lm_head_jsd.teacher_lin.weight.data = torch.rand(
+        V, H, device=device, dtype=dtype
+    )
     student_input = torch.rand(BT, H, requires_grad=True, dtype=dtype, device=device)
     teacher_input = torch.rand(BT, H, dtype=dtype, device=device)
@@ -251,9 +240,7 @@ if __name__ == "__main__":
         "x_label": "B x T",
         "x_values": [2**i for i in range(10, 14)],
         "kernel_providers": ["liger", "torch"],
-        "extra_benchmark_configs": [
-            {"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}
-        ],
+        "extra_benchmark_configs": [{"H": 4096, "V": 128256, "mode": "forward", "dtype": torch.bfloat16}],
         "overwrite": args.overwrite,
     }
@@ -262,12 +249,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_fused_linear_jsd,
         kernel_operation_modes=["full"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_geglu.py RENAMED Viewed

@@ -1,15 +1,14 @@
 import torch
 import triton
 from transformers.models.llama.configuration_llama import LlamaConfig
 from transformers.models.llama.modeling_llama import LlamaMLP
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.transformers.geglu import LigerGEGLUMLP
 from liger_kernel.utils import infer_device

{liger_kernel_nightly-0.5.2.dev20241223032630 → liger_kernel_nightly-0.5.2.dev20241228022953}/benchmark/scripts/benchmark_group_norm.py RENAMED Viewed

@@ -1,13 +1,12 @@
 import torch
 import triton
-from utils import (
-    QUANTILES,
-    SingleBenchmarkRunInput,
-    SingleBenchmarkRunOutput,
-    _test_memory,
-    parse_benchmark_script_args,
-    run_benchmarks,
-)
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
 from liger_kernel.transformers.group_norm import LigerGroupNorm
 from liger_kernel.utils import infer_device
@@ -27,12 +26,8 @@ def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
     dtype = extra_benchmark_config["dtype"]
     x_shape = (M, C, H)
-    triton_ln = LigerGroupNorm(
-        num_channels=C, num_groups=C // channels_per_group, eps=eps
-    ).to(device)
-    torch_ln = torch.nn.GroupNorm(
-        num_groups=C // channels_per_group, num_channels=C, eps=eps
-    ).to(device)
+    triton_ln = LigerGroupNorm(num_channels=C, num_groups=C // channels_per_group, eps=eps).to(device)
+    torch_ln = torch.nn.GroupNorm(num_groups=C // channels_per_group, num_channels=C, eps=eps).to(device)
     x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
@@ -45,9 +40,7 @@ def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
             return torch_ln(x)
     if mode == "forward":
-        ms_50, ms_20, ms_80 = triton.testing.do_bench(
-            y_fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500
-        )
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(y_fwd, quantiles=QUANTILES, grad_to_none=[x], rep=500)
     elif mode == "backward":
         y = y_fwd()
         ms_50, ms_20, ms_80 = triton.testing.do_bench(
@@ -62,9 +55,7 @@ def bench_speed_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRun
             y = y_fwd()
             y.backward(dy, retain_graph=True)
-        ms_50, ms_20, ms_80 = triton.testing.do_bench(
-            full, quantiles=QUANTILES, grad_to_none=[x], rep=500
-        )
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(full, quantiles=QUANTILES, grad_to_none=[x], rep=500)
     return SingleBenchmarkRunOutput(
         y_20=ms_20,
@@ -84,12 +75,8 @@ def bench_memory_group_norm(input: SingleBenchmarkRunInput) -> SingleBenchmarkRu
     dtype = extra_benchmark_config["dtype"]
     x_shape = (M, C, H)
-    triton_ln = LigerGroupNorm(
-        num_channels=C, num_groups=C // channels_per_group, eps=eps
-    ).to(device)
-    torch_ln = torch.nn.GroupNorm(
-        num_groups=C // channels_per_group, num_channels=C, eps=eps
-    ).to(device)
+    triton_ln = LigerGroupNorm(num_channels=C, num_groups=C // channels_per_group, eps=eps).to(device)
+    torch_ln = torch.nn.GroupNorm(num_groups=C // channels_per_group, num_channels=C, eps=eps).to(device)
     x = torch.randn(x_shape, dtype=dtype, device=device)
     dy = torch.randn_like(x)
@@ -139,12 +126,12 @@ if __name__ == "__main__":
         kernel_operation_modes=["forward", "full", "backward"],
         metric_name="speed",
         metric_unit="ms",
-        **common_configs
+        **common_configs,
     )
     run_benchmarks(
         bench_test_fn=bench_memory_group_norm,
         kernel_operation_modes=["full", "forward", "backward"],
         metric_name="memory",
         metric_unit="MB",
-        **common_configs
+        **common_configs,
     )

liger-kernel-nightly 0.5.2.dev20241223032630__tar.gz → 0.5.2.dev20241228022953__tar.gz

liger-kernel-nightly 0.5.2.dev20241223032630__tar.gz → 0.5.2.dev20241228022953__tar.gz