PyPI - liger-kernel-nightly - Versions diffs - 0.5.2.dev20250122005057__tar.gz → 0.5.2.dev20250122195349__tar.gz - Mend

liger-kernel-nightly 0.5.2.dev20250122005057tar.gz → 0.5.2.dev20250122195349tar.gz

Files changed (209) hide show

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/.gitignore RENAMED Viewed

@@ -5,6 +5,7 @@ site/
 .venv/
 venv/
 .ipynb_checkpoints/
+.vscode/
 # Misc
 .DS_Store

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.2.dev20250122005057
+Version: 0.5.2.dev20250122195349
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

liger_kernel_nightly-0.5.2.dev20250122195349/benchmark/README.md ADDED Viewed

@@ -0,0 +1,30 @@
+## Benchmarking Liger Kernels
+Follow these steps to benchmark and visualize kernel performance:
+1. Create a benchmark script
+   - Add your script under `benchmark/scripts/`
+   - Name it according to the kernel (e.g., `benchmark_<kernel_name>.py`)
+2. Run the benchmark
+   - Results will be saved to `benchmark/data/all_benchmark_data.csv`
+   Example: Benchmarking KTO Loss
+   ```bash
+   cd benchmark
+   python scripts/benchmark_kto_loss.py
+   ```
+3. Visualize results
+   - Use the visualization script with appropriate parameters
+   Example: Visualizing KTO Loss benchmark results
+   ```bash
+   python benchmarks_visualizer.py \
+       --kernel-name kto_loss \
+       --metric-name memory \
+       --kernel-operation-mode full
+   ```
+4. View results
+   - Generated plots will be saved in `benchmark/visualizations/`

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/benchmark/data/all_benchmark_data.csv RENAMED Viewed

@@ -715,3 +715,33 @@ fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,2,8645.314453125,8645.314
 fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,4,12184.330078125,12184.330078125,12184.330078125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
 fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,8,19262.361328125,19262.361328125,19262.361328125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
 fused_linear_simpo_loss,huggingface,full,memory,MB,B,B,16,33418.42578125,33418.42578125,33418.42578125,"{""T"": 1024, ""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16""}",NVIDIA A100-SXM4-80GB,2024-11-15 14:30:01,0.4.1
+kto_loss,liger,forward,speed,ms,B,Batch Size (B),2,7.841599941253662,7.801983833312988,7.849664211273193,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
+kto_loss,liger,forward,speed,ms,B,Batch Size (B),4,15.568096160888672,15.555737495422363,16.054176330566406,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
+kto_loss,liger,forward,speed,ms,B,Batch Size (B),8,31.145376205444336,30.750951766967773,31.5398006439209,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
+kto_loss,liger,forward,speed,ms,B,Batch Size (B),16,61.49708938598633,61.49708938598633,61.49708938598633,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
+kto_loss,liger,forward,speed,ms,B,Batch Size (B),32,122.01449584960938,122.01449584960938,122.01449584960938,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:06,0.5.2
+kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),2,7.892335891723633,7.8687615394592285,8.03729248046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
+kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),4,14.16302490234375,13.813311576843262,15.860223770141602,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
+kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),8,25.56470489501953,25.564167022705078,25.641658782958984,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
+kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),16,53.0928955078125,53.0928955078125,53.0928955078125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
+kto_loss,huggingface,forward,speed,ms,B,Batch Size (B),32,108.76080322265625,108.76080322265625,108.76080322265625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:28,0.5.2
+kto_loss,liger,full,speed,ms,B,Batch Size (B),2,8.662687301635742,8.488287925720215,9.611334800720215,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
+kto_loss,liger,full,speed,ms,B,Batch Size (B),4,18.40096092224121,17.99224281311035,18.57883644104004,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
+kto_loss,liger,full,speed,ms,B,Batch Size (B),8,32.09159851074219,31.708070755004883,32.475128173828125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
+kto_loss,liger,full,speed,ms,B,Batch Size (B),16,69.30239868164062,69.30239868164062,69.30239868164062,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
+kto_loss,liger,full,speed,ms,B,Batch Size (B),32,124.2437744140625,124.2437744140625,124.2437744140625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:33:50,0.5.2
+kto_loss,huggingface,full,speed,ms,B,Batch Size (B),2,11.449472427368164,11.407564163208008,11.773555755615234,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
+kto_loss,huggingface,full,speed,ms,B,Batch Size (B),4,20.871471405029297,20.862951278686523,20.879276275634766,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
+kto_loss,huggingface,full,speed,ms,B,Batch Size (B),8,41.16409683227539,40.760780334472656,41.567413330078125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
+kto_loss,huggingface,full,speed,ms,B,Batch Size (B),16,77.720703125,77.720703125,77.720703125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
+kto_loss,huggingface,full,speed,ms,B,Batch Size (B),32,156.25794982910156,156.25794982910156,156.25794982910156,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:11,0.5.2
+kto_loss,liger,full,memory,MB,B,Batch Size (B),2,2027.48583984375,2027.48583984375,2027.48583984375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
+kto_loss,liger,full,memory,MB,B,Batch Size (B),4,2789.736328125,2789.736328125,2789.736328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
+kto_loss,liger,full,memory,MB,B,Batch Size (B),8,2801.751953125,2801.751953125,2801.751953125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
+kto_loss,liger,full,memory,MB,B,Batch Size (B),16,2825.783203125,2825.783203125,2825.783203125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
+kto_loss,liger,full,memory,MB,B,Batch Size (B),32,2873.845703125,2873.845703125,2873.845703125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:36,0.5.2
+kto_loss,huggingface,full,memory,MB,B,Batch Size (B),2,3786.7373046875,3786.7373046875,3786.7373046875,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
+kto_loss,huggingface,full,memory,MB,B,Batch Size (B),4,5544.25390625,5544.25390625,5544.25390625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
+kto_loss,huggingface,full,memory,MB,B,Batch Size (B),8,9057.287109375,9057.287109375,9057.287109375,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
+kto_loss,huggingface,full,memory,MB,B,Batch Size (B),16,16087.353515625,16087.353515625,16087.353515625,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2
+kto_loss,huggingface,full,memory,MB,B,Batch Size (B),32,30147.486328125,30147.486328125,30147.486328125,"{""T"": 512, ""H"": 1024, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": true, ""beta"": 0.1, ""ignore_index"": 42}",NVIDIA A100-SXM4-80GB,2024-12-23 23:34:59,0.5.2

liger_kernel_nightly-0.5.2.dev20250122195349/benchmark/scripts/benchmark_kto_loss.py ADDED Viewed

@@ -0,0 +1,314 @@
+import os
+import sys
+import torch
+import triton
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
+from liger_kernel.chunked_loss import LigerFusedLinearKTOLoss
+from liger_kernel.utils import infer_device
+device = infer_device()
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+class TorchLMHeadKTO(torch.nn.Module):
+    def __init__(
+        self,
+        H: int,
+        V: int,
+        dtype: torch.dtype,
+        use_bias: bool = False,
+        use_ref_bias: bool = False,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+    ):
+        from test.chunked_loss.test_kto_loss import HFKTOLoss
+        super().__init__()
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_bias, dtype=dtype)
+        self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_ref_bias, dtype=dtype)
+        self.KTO_loss = HFKTOLoss(
+            ignore_index=ignore_index,
+            beta=beta,
+            use_ref_model=True,
+        ).get_batch_loss_metrics
+    def forward(self, x, ref_x, y, preference_labels, kl=None):
+        return self.KTO_loss(
+            weight=self.lin.weight,
+            _input=x,
+            target=y,
+            bias=self.lin.bias,
+            ref_input=ref_x,
+            ref_weight=self.ref_lin.weight,
+            ref_bias=self.ref_lin.bias,
+            preference_labels=preference_labels,
+            kl=kl,
+        )
+class LigerLMHeadKTO(torch.nn.Module):
+    def __init__(
+        self,
+        H: int,
+        V: int,
+        dtype: torch.dtype,
+        use_bias: bool = False,
+        use_ref_bias: bool = False,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+    ):
+        super().__init__()
+        self.lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_bias, dtype=dtype)
+        self.ref_lin = torch.nn.Linear(in_features=H, out_features=V, bias=use_ref_bias, dtype=dtype)
+        self.KTO_loss = LigerFusedLinearKTOLoss(
+            ignore_index=ignore_index,
+            beta=beta,
+            use_ref_model=True,
+        )
+    def forward(self, x, ref_x, y, preference_labels, kl=None):
+        return self.KTO_loss(
+            _input=x,
+            lin_weight=self.lin.weight,
+            target=y,
+            preference_labels=preference_labels,
+            bias=self.lin.bias,
+            ref_input=ref_x,
+            ref_weight=self.ref_lin.weight,
+            ref_bias=self.ref_lin.bias,
+            kl=kl,
+        )
+def bench_memory_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    bias = input.extra_benchmark_config["bias"]
+    beta = input.extra_benchmark_config["beta"]
+    ignore_index = input.extra_benchmark_config["ignore_index"]
+    provider = input.kernel_provider
+    torch_kto_loss = TorchLMHeadKTO(
+        H=H,
+        V=V,
+        dtype=dtype,
+        bias=bias,
+        ref_bias=bias,
+        ignore_index=ignore_index,
+        beta=beta,
+    ).to(device)
+    liger_kto_loss = LigerLMHeadKTO(
+        H=H,
+        V=V,
+        dtype=dtype,
+        bias=bias,
+        ref_bias=bias,
+        ignore_index=ignore_index,
+        beta=beta,
+    ).to(device)
+    # Input shape: [B, T, H]
+    _input = torch.randn(B, T, H, device=device, dtype=dtype)
+    # Target shape: [B, T]
+    target = torch.randint(V, (B, T), dtype=torch.long, device=device)
+    # Preference labels shape: [B]
+    # Create binary preference labels (0 or 1) for each sequence in the batch
+    # Used to indicate preferred sequences (1) vs non-preferred sequences (0)
+    preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device)
+    # Precomputed KL divergence between policy and reference distributions
+    kl = torch.randn(1, device=device, dtype=dtype)
+    # Add ignore_index tokens to simulate padding
+    num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item()
+    indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign]
+    target.view(-1)[indices_to_assign] = ignore_index
+    # Add ref_x with the same shape as _input
+    ref_input = torch.randn(B, T, H, device=device, dtype=dtype)
+    def fwd():
+        if provider == "liger":
+            return liger_kto_loss(
+                x=_input,
+                ref_x=ref_input,
+                y=target,
+                preference_labels=preference_labels,
+                kl=kl,
+            )
+        elif provider == "huggingface":
+            return torch_kto_loss(
+                x=_input,
+                ref_x=ref_input,
+                y=target,
+                preference_labels=preference_labels,
+                kl=kl,
+            )
+    def full():
+        y = fwd()
+        y.backward()
+    mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
+    return SingleBenchmarkRunOutput(
+        y_20=mem_20,
+        y_50=mem_50,
+        y_80=mem_80,
+    )
+def bench_speed_kto_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    B = input.x
+    T = input.extra_benchmark_config["T"]
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    bias = input.extra_benchmark_config["bias"]
+    beta = input.extra_benchmark_config["beta"]
+    ignore_index = input.extra_benchmark_config["ignore_index"]
+    provider = input.kernel_provider
+    mode = input.kernel_operation_mode
+    torch_kto_loss = TorchLMHeadKTO(
+        H=H,
+        V=V,
+        dtype=dtype,
+        beta=beta,
+        ignore_index=ignore_index,
+        bias=bias,
+    ).to(device)
+    liger_kto_loss = LigerLMHeadKTO(
+        H=H,
+        V=V,
+        dtype=dtype,
+        beta=beta,
+        ignore_index=ignore_index,
+        bias=bias,
+    ).to(device)
+    # Input shape: [B, T, H]
+    _input = torch.randn(B, T, H, device=device, dtype=dtype)
+    # Target shape: [B, T]
+    target = torch.randint(V, (B, T), device=device, dtype=torch.long)
+    # Preference labels shape: [B]
+    # Create binary preference labels (0 or 1) for each sequence in the batch
+    # Used to indicate preferred sequences (1) vs non-preferred sequences (0)
+    preference_labels = torch.randint(2, (B,), dtype=torch.bool, device=device)
+    # Precomputed KL divergence between policy and reference distributions
+    kl = torch.randn(1, device=device, dtype=dtype)
+    # Add ignore_index tokens
+    num_elements_to_assign = torch.randint(1, B * T // 2, (1,)).item()
+    indices_to_assign = torch.randperm(B * T)[:num_elements_to_assign]
+    target.view(-1)[indices_to_assign] = ignore_index
+    # Add ref_x with the same shape as _input
+    ref_input = torch.randn(B, T, H, device=device, dtype=dtype)
+    def fwd():
+        if provider == "liger":
+            return liger_kto_loss(
+                x=_input,
+                ref_x=ref_input,
+                y=target,
+                preference_labels=preference_labels,
+                kl=kl,
+            )
+        elif provider == "huggingface":
+            return torch_kto_loss(
+                x=_input,
+                ref_x=ref_input,
+                y=target,
+                preference_labels=preference_labels,
+                kl=kl,
+            )
+    if mode == "forward":
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            fwd,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "backward":
+        y = fwd()
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            lambda: y.backward(retain_graph=True),
+            grad_to_none=[_input],
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "full":
+        def full():
+            y = fwd()
+            y.backward()
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            full,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    return SingleBenchmarkRunOutput(
+        y_20=ms_20,
+        y_50=ms_50,
+        y_80=ms_80,
+    )
+if __name__ == "__main__":
+    args = parse_benchmark_script_args()
+    common_configs = {
+        "kernel_name": "kto_loss",
+        "x_name": "B",
+        "x_label": "Batch Size (B)",
+        "x_values": [2**i for i in range(1, 6)],
+        "kernel_providers": ["liger", "huggingface"],
+        "extra_benchmark_configs": [
+            {
+                "T": 512,
+                "H": 1024,
+                "V": 128256,
+                "mode": "forward",
+                "dtype": torch.bfloat16,
+                "bias": True,
+                "beta": 0.1,
+                "ignore_index": 42,
+            }
+        ],
+        "overwrite": args.overwrite,
+    }
+    run_benchmarks(
+        bench_test_fn=bench_speed_kto_loss,
+        kernel_operation_modes=["forward", "full"],
+        metric_name="speed",
+        metric_unit="ms",
+        **common_configs,
+    )
+    run_benchmarks(
+        bench_test_fn=bench_memory_kto_loss,
+        kernel_operation_modes=["full"],
+        metric_name="memory",
+        metric_unit="MB",
+        **common_configs,
+    )

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.2.dev20250122005057"
+version = "0.5.2.dev20250122195349"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/src/liger_kernel/chunked_loss/README.md RENAMED Viewed

@@ -1,6 +1,6 @@
 # Liger FlexChunkLoss: Alignment and Distillation loss
-Liger FlexChunkLoss offers a versatile interface, delivering up to 80% memory savings and a 10% throughput boost for post-training loss functions, including alignment (DPO, ORPO, CPO) and very soon, distillation. Its flexible design supports custom losses, ensuring efficiency gains across diverse use cases.
+Liger FlexChunkLoss offers a versatile interface, delivering up to 80% memory savings and a 10% throughput boost for post-training loss functions, including alignment (DPO, ORPO, CPO, KTO) and very soon, distillation. Its flexible design supports custom losses, ensuring efficiency gains across diverse use cases.
 ### User interface

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/src/liger_kernel/chunked_loss/__init__.py RENAMED Viewed

@@ -1,4 +1,5 @@
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOLoss  # noqa: F401
 from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOLoss  # noqa: F401

{liger_kernel_nightly-0.5.2.dev20250122005057 → liger_kernel_nightly-0.5.2.dev20250122195349}/src/liger_kernel/chunked_loss/functional.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
+from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction
 from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction
 from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
@@ -7,3 +8,4 @@ liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
+liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply

liger-kernel-nightly 0.5.2.dev20250122005057__tar.gz → 0.5.2.dev20250122195349__tar.gz

liger-kernel-nightly 0.5.2.dev20250122005057tar.gz → 0.5.2.dev20250122195349tar.gz