PyPI - liger-kernel-nightly - Versions diffs - 0.5.10.dev20250630171450__tar.gz → 0.5.10.dev20250702150221__tar.gz - Mend

liger-kernel-nightly 0.5.10.dev20250630171450__tar.gz → 0.5.10.dev20250702150221__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (274) hide show

{liger_kernel_nightly-0.5.10.dev20250630171450 → liger_kernel_nightly-0.5.10.dev20250702150221}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.10.dev20250630171450
+Version: 0.5.10.dev20250702150221
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.10.dev20250630171450 → liger_kernel_nightly-0.5.10.dev20250702150221}/benchmark/data/all_benchmark_data.csv RENAMED Viewed

@@ -1469,3 +1469,27 @@ fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,512,15
 fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,1024,369.0234375,369.0234375,369.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10
 fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,2048,1176.0234375,1176.0234375,1176.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10
 fused_neighborhood_attention,torch,full,memory,MB,seq_len,sequence length,4096,4332.0234375,4332.0234375,4332.0234375,"{""batch_size"": 2, ""hidden_size"": 512, ""num_heads"": 8, ""kernel_size"": 7, ""dilation"": 2, ""bias"": true, ""dtype"": ""torch.float32""}",NVIDIA H100 80GB HBM3,2025-05-27 19:27:39,0.5.10
+distill_cosine_loss,liger,forward,speed,ms,BT,B x T,1024,13.828096389770508,13.821133041381836,13.885849952697754,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10
+distill_cosine_loss,liger,forward,speed,ms,BT,B x T,2048,27.57427215576172,27.52573432922363,27.579801940917967,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10
+distill_cosine_loss,liger,forward,speed,ms,BT,B x T,4096,54.79423904418945,54.79423904418945,54.79423904418945,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10
+distill_cosine_loss,liger,forward,speed,ms,BT,B x T,8192,109.73490905761719,109.73490905761719,109.73490905761719,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:19:52,0.5.10
+distill_cosine_loss,torch,forward,speed,ms,BT,B x T,1024,16.456703186035156,15.045836448669434,16.761650466918944,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10
+distill_cosine_loss,torch,forward,speed,ms,BT,B x T,2048,29.703168869018555,29.69333839416504,29.71177024841309,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10
+distill_cosine_loss,torch,forward,speed,ms,BT,B x T,4096,59.177982330322266,59.177982330322266,59.177982330322266,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10
+distill_cosine_loss,torch,forward,speed,ms,BT,B x T,8192,118.3815689086914,118.3815689086914,118.3815689086914,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:20:34,0.5.10
+distill_cosine_loss,liger,full,speed,ms,BT,B x T,1024,14.654463768005371,14.63398380279541,14.68006420135498,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10
+distill_cosine_loss,liger,full,speed,ms,BT,B x T,2048,28.274688720703125,28.27284507751465,28.279603958129883,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10
+distill_cosine_loss,liger,full,speed,ms,BT,B x T,4096,55.96672058105469,55.96672058105469,55.96672058105469,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10
+distill_cosine_loss,liger,full,speed,ms,BT,B x T,8192,111.38764953613281,111.38764953613281,111.38764953613281,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:21:16,0.5.10
+distill_cosine_loss,torch,full,speed,ms,BT,B x T,1024,37.45382308959961,37.42556076049805,37.482085418701175,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10
+distill_cosine_loss,torch,full,speed,ms,BT,B x T,2048,73.56620788574219,73.56620788574219,73.56620788574219,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10
+distill_cosine_loss,torch,full,speed,ms,BT,B x T,4096,145.73056030273438,145.73056030273438,145.73056030273438,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10
+distill_cosine_loss,torch,full,speed,ms,BT,B x T,8192,291.5000305175781,291.5000305175781,291.5000305175781,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:01,0.5.10
+distill_cosine_loss,liger,full,memory,MB,BT,B x T,1024,5059.26806640625,5059.26806640625,5059.26806640625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10
+distill_cosine_loss,liger,full,memory,MB,BT,B x T,2048,5087.27587890625,5087.27587890625,5087.27587890625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10
+distill_cosine_loss,liger,full,memory,MB,BT,B x T,4096,5143.29150390625,5143.29150390625,5143.29150390625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10
+distill_cosine_loss,liger,full,memory,MB,BT,B x T,8192,5255.32275390625,5255.32275390625,5255.32275390625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:22:43,0.5.10
+distill_cosine_loss,torch,full,memory,MB,BT,B x T,1024,7566.2822265625,7566.2822265625,7566.2822265625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
+distill_cosine_loss,torch,full,memory,MB,BT,B x T,2048,11590.3134765625,11590.3134765625,11590.3134765625,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
+distill_cosine_loss,torch,full,memory,MB,BT,B x T,4096,19654.375,19654.375,19654.375,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10
+distill_cosine_loss,torch,full,memory,MB,BT,B x T,8192,35782.5,35782.5,35782.5,"{""H"": 4096, ""V"": 128256, ""mode"": ""forward"", ""dtype"": ""torch.bfloat16"", ""bias"": false, ""weight_hard_loss"": 0.5, ""weight_soft_loss"": 0.5, ""ignore_index"": -100}",NVIDIA A100-SXM4-80GB,2025-06-27 09:23:28,0.5.10

liger_kernel_nightly-0.5.10.dev20250702150221/benchmark/scripts/benchmark_distill_cosine_loss.py ADDED Viewed

@@ -0,0 +1,266 @@
+import os
+import sys
+import torch
+import torch.nn as nn
+import triton
+from utils import QUANTILES
+from utils import SingleBenchmarkRunInput
+from utils import SingleBenchmarkRunOutput
+from utils import _test_memory
+from utils import parse_benchmark_script_args
+from utils import run_benchmarks
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction
+from liger_kernel.utils import infer_device
+# Device returned by liger_kernel.utils.infer_device(); used below for module and tensor placement.
+device = infer_device()
+# Ensure the project root (two directory levels above this script) is in the path.
+# NOTE(review): this runs after the top-level imports above, so it can only affect imports
+# executed later, e.g. the lazy `test.chunked_loss` import inside TorchCosineSimilarityLoss.
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
+class TorchCosineSimilarityLoss(nn.Module):
+    """Reference cosine distillation loss used as the "torch" benchmark provider.
+    Owns a student projection (``H // 2 -> V``) and a teacher projection (``H -> V``) and
+    delegates the loss computation to ``HFCosineLoss.get_batch_loss_metrics`` from the test suite.
+    Only the projection weights are handed to the reference loss; biases are not forwarded.
+    """
+    def __init__(
+        self,
+        H: int,
+        V: int,
+        dtype: torch.dtype,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        bias: bool = False,
+    ):
+        # Imported lazily so the reference implementation is only required when this module is built.
+        from test.chunked_loss.test_cosine_loss import HFCosineLoss
+        super().__init__()
+        # Keep student-before-teacher construction order so parameter initialisation is unchanged.
+        student_projection = nn.Linear(in_features=H // 2, out_features=V, bias=bias)
+        self.student_lin = student_projection.to(dtype=dtype)
+        teacher_projection = nn.Linear(in_features=H, out_features=V, bias=bias)
+        self.teacher_lin = teacher_projection.to(dtype=dtype)
+        reference_loss = HFCosineLoss(
+            ignore_index=ignore_index,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            temperature=temperature,
+        )
+        # Store the bound method; forward() calls it directly.
+        self.cosine_loss = reference_loss.get_batch_loss_metrics
+    def forward(self, student: torch.Tensor, teacher: torch.Tensor, target: torch.Tensor):
+        """Return the reference loss for the given student/teacher inputs and target labels."""
+        student_weight = self.student_lin.weight
+        teacher_weight = self.teacher_lin.weight
+        return self.cosine_loss(student, student_weight, teacher, teacher_weight, target)
+class LigerCosineSimilarityLoss(nn.Module):
+    """Liger fused-linear cosine distillation loss used as the "liger" benchmark provider.
+    Owns a student projection (``H // 2 -> V``) and a teacher projection (``H -> V``) and calls
+    ``LigerFusedLinearCosineSimilarityFunction.apply`` with their weights and biases.
+    Args:
+        H: Teacher hidden size; the student projection uses ``H // 2`` input features.
+        V: Output features of both projections.
+        dtype: Parameter dtype of both projections.
+        weight_hard_loss: Forwarded to the fused function.
+        weight_soft_loss: Forwarded to the fused function.
+        ignore_index: Forwarded to the fused function.
+        temperature: Forwarded to the fused function.
+        bias: Whether the projections carry a bias term.
+        beta: Forwarded to the fused function; the default equals the default of
+            ``LigerFusedLinearCosineSimilarityFunction.forward``.
+    """
+    def __init__(
+        self,
+        H: int,
+        V: int,
+        dtype: torch.dtype,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        bias: bool = False,
+        beta: float = 0.5,
+    ):
+        super().__init__()
+        self.student_lin = nn.Linear(in_features=H // 2, out_features=V, bias=bias).to(dtype=dtype)
+        self.teacher_lin = nn.Linear(in_features=H, out_features=V, bias=bias).to(dtype=dtype)
+        self.weight_hard_loss = weight_hard_loss
+        self.weight_soft_loss = weight_soft_loss
+        self.beta = beta
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+        self.cosine_loss = LigerFusedLinearCosineSimilarityFunction.apply
+    def forward(self, student: torch.Tensor, teacher: torch.Tensor, target: torch.Tensor):
+        """Return the fused cosine distillation loss for the given inputs and target labels."""
+        # The arguments are passed positionally, so beta has to be supplied in order to reach
+        # ignore_index and temperature. Previously those two were stored on the module but never
+        # forwarded, so non-default values were silently ignored.
+        return self.cosine_loss(
+            student,
+            self.student_lin.weight,
+            teacher,
+            self.teacher_lin.weight,
+            target,
+            self.student_lin.bias,
+            self.teacher_lin.bias,
+            self.weight_hard_loss,
+            self.weight_soft_loss,
+            self.beta,
+            self.ignore_index,
+            self.temperature,
+        )
+def bench_memory_cosine_similarity_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    """Measure memory of one forward + backward pass for the selected provider.
+    Args:
+        input: Benchmark descriptor. ``input.x`` is the number of rows (``BT``),
+            ``input.kernel_provider`` selects ``"liger"`` or ``"torch"``, and
+            ``input.extra_benchmark_config`` supplies ``H``, ``V``, ``dtype``, ``bias``,
+            ``weight_hard_loss``, ``weight_soft_loss`` and ``ignore_index``.
+    Returns:
+        SingleBenchmarkRunOutput filled from the three values returned by ``_test_memory``.
+    Raises:
+        ValueError: If the provider is neither ``"liger"`` nor ``"torch"``.
+    """
+    BT = input.x
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    bias = input.extra_benchmark_config["bias"]
+    weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"]
+    weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"]
+    ignore_index = input.extra_benchmark_config["ignore_index"]
+    provider = input.kernel_provider
+    # ignore_index is now passed to the torch reference as well, so both providers (and the
+    # speed benchmark) are configured identically.
+    torch_cosine_loss = TorchCosineSimilarityLoss(
+        H=H,
+        V=V,
+        dtype=dtype,
+        ignore_index=ignore_index,
+        bias=bias,
+        weight_hard_loss=weight_hard_loss,
+        weight_soft_loss=weight_soft_loss,
+    ).to(device)
+    liger_cosine_loss = LigerCosineSimilarityLoss(
+        H=H,
+        V=V,
+        dtype=dtype,
+        ignore_index=ignore_index,
+        bias=bias,
+        weight_hard_loss=weight_hard_loss,
+        weight_soft_loss=weight_soft_loss,
+    ).to(device)
+    # Each provider gets its own leaf copy of the same student input.
+    _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype)
+    student_input1 = _tensor.detach().clone().requires_grad_(True)
+    student_input2 = _tensor.detach().clone().requires_grad_(True)
+    teacher_input = torch.rand(BT, H, device=device, dtype=dtype)
+    target = torch.randint(0, V, (BT,), device=device, dtype=torch.long)
+    def fwd():
+        if provider == "liger":
+            return liger_cosine_loss(student_input1, teacher_input, target)
+        elif provider == "torch":
+            return torch_cosine_loss(student_input2, teacher_input, target)
+        # Fail loudly instead of returning None and crashing later in y.backward().
+        raise ValueError(f"Unsupported kernel provider: {provider!r}")
+    def full():
+        y = fwd()
+        y.backward()
+    mem_50, mem_20, mem_80 = _test_memory(full, _iter=10, quantiles=QUANTILES)
+    return SingleBenchmarkRunOutput(
+        y_20=mem_20,
+        y_50=mem_50,
+        y_80=mem_80,
+    )
+def bench_speed_cosine_similarity_loss(input: SingleBenchmarkRunInput) -> SingleBenchmarkRunOutput:
+    """Time the cosine distillation loss for the selected provider and operation mode.
+    Args:
+        input: Benchmark descriptor. ``input.x`` is the number of rows (``BT``),
+            ``input.kernel_provider`` selects ``"liger"`` or ``"torch"``,
+            ``input.kernel_operation_mode`` selects ``"forward"``, ``"backward"`` or ``"full"``,
+            and ``input.extra_benchmark_config`` supplies ``H``, ``V``, ``dtype``, ``bias``,
+            ``weight_hard_loss``, ``weight_soft_loss`` and ``ignore_index``.
+    Returns:
+        SingleBenchmarkRunOutput filled from the three values returned by
+        ``triton.testing.do_bench`` for ``QUANTILES``.
+    Raises:
+        ValueError: If the provider or the operation mode is not one of the supported values.
+    """
+    BT = input.x
+    H = input.extra_benchmark_config["H"]
+    V = input.extra_benchmark_config["V"]
+    dtype = input.extra_benchmark_config["dtype"]
+    bias = input.extra_benchmark_config["bias"]
+    weight_hard_loss = input.extra_benchmark_config["weight_hard_loss"]
+    weight_soft_loss = input.extra_benchmark_config["weight_soft_loss"]
+    ignore_index = input.extra_benchmark_config["ignore_index"]
+    provider = input.kernel_provider
+    mode = input.kernel_operation_mode
+    torch_cosine_loss = TorchCosineSimilarityLoss(
+        H=H,
+        V=V,
+        dtype=dtype,
+        ignore_index=ignore_index,
+        bias=bias,
+        weight_hard_loss=weight_hard_loss,
+        weight_soft_loss=weight_soft_loss,
+    ).to(device)
+    liger_cosine_loss = LigerCosineSimilarityLoss(
+        H=H,
+        V=V,
+        dtype=dtype,
+        ignore_index=ignore_index,
+        bias=bias,
+        weight_hard_loss=weight_hard_loss,
+        weight_soft_loss=weight_soft_loss,
+    ).to(device)
+    # Each provider gets its own leaf copy of the same student input.
+    _tensor = torch.rand(BT, H // 2, device=device, dtype=dtype)
+    student_input1 = _tensor.detach().clone().requires_grad_(True)
+    student_input2 = _tensor.detach().clone().requires_grad_(True)
+    teacher_input = torch.rand(BT, H, device=device, dtype=dtype)
+    target = torch.randint(0, V, (BT,), device=device, dtype=torch.long)
+    def fwd():
+        if provider == "liger":
+            return liger_cosine_loss(student_input1, teacher_input, target)
+        elif provider == "torch":
+            return torch_cosine_loss(student_input2, teacher_input, target)
+        # Fail loudly instead of silently benchmarking a function that returns None.
+        raise ValueError(f"Unsupported kernel provider: {provider!r}")
+    if mode == "forward":
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            fwd,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "backward":
+        # A duplicated "backward" branch used to sit above this one; it timed `fwd` and made
+        # this branch unreachable. Only the real backward timing is kept.
+        y = fwd()
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            # retain_graph=True lets the same graph be differentiated on every repetition.
+            lambda: y.backward(retain_graph=True),
+            grad_to_none=[student_input1, student_input2],
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    elif mode == "full":
+        def full():
+            y = fwd()
+            y.backward()
+        ms_50, ms_20, ms_80 = triton.testing.do_bench(
+            full,
+            rep=100,
+            quantiles=QUANTILES,
+        )
+    else:
+        # Previously an unknown mode fell through to an unbound-variable error at the return.
+        raise ValueError(f"Unsupported kernel operation mode: {mode!r}")
+    return SingleBenchmarkRunOutput(
+        y_20=ms_20,
+        y_50=ms_50,
+        y_80=ms_80,
+    )
+if __name__ == "__main__":
+    # Entry point: run the speed benchmarks first, then the memory benchmark, on one shared config.
+    args = parse_benchmark_script_args()
+    # Single loss/model configuration evaluated at every x value.
+    benchmark_config = {
+        "H": 4096,
+        "V": 128256,
+        "mode": "forward",
+        "dtype": torch.bfloat16,
+        "bias": False,
+        "weight_hard_loss": 0.5,
+        "weight_soft_loss": 0.5,
+        "ignore_index": -100,
+    }
+    common_configs = {
+        "kernel_name": "distill_cosine_loss",
+        "x_name": "BT",
+        "x_label": "B x T",
+        # Powers of two from 2**10 up to 2**13.
+        "x_values": [2**exponent for exponent in range(10, 14)],
+        "kernel_providers": ["liger", "torch"],
+        "extra_benchmark_configs": [benchmark_config],
+        "overwrite": args.overwrite,
+    }
+    # (benchmark function, operation modes, metric name, metric unit), in execution order.
+    benchmark_plan = (
+        (bench_speed_cosine_similarity_loss, ["forward", "full"], "speed", "ms"),
+        (bench_memory_cosine_similarity_loss, ["full"], "memory", "MB"),
+    )
+    for bench_fn, operation_modes, metric, unit in benchmark_plan:
+        run_benchmarks(
+            bench_test_fn=bench_fn,
+            kernel_operation_modes=operation_modes,
+            metric_name=metric,
+            metric_unit=unit,
+            **common_configs,
+        )

{liger_kernel_nightly-0.5.10.dev20250630171450 → liger_kernel_nightly-0.5.10.dev20250702150221}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.10.dev20250630171450"
+version = "0.5.10.dev20250702150221"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.10.dev20250630171450 → liger_kernel_nightly-0.5.10.dev20250702150221}/src/liger_kernel/chunked_loss/__init__.py RENAMED Viewed

@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityLoss  # noqa:F401
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401

liger_kernel_nightly-0.5.10.dev20250702150221/src/liger_kernel/chunked_loss/cosine_similarity_loss.py ADDED Viewed

@@ -0,0 +1,127 @@
+import torch
+import torch.nn.functional as F
+from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+class LigerFusedLinearCosineSimilarityFunction(LigerFusedLinearDistillationBase):
+    """Fused-linear distillation function whose soft loss is a cosine-similarity loss.
+    Supplies ``distillation_loss_fn`` and delegates ``forward``/``backward`` to
+    ``LigerFusedLinearDistillationBase``.
+    """
+    @staticmethod
+    def distillation_loss_fn(student_logits, teacher_logits, beta=1.0):
+        """
+        Compute the cosine similarity loss, ``beta * (1 - cosine_similarity)``, summed over all rows.
+        Both inputs are L2-normalized along the last dimension before the cosine similarity is
+        taken along that same dimension.
+        Args:
+            student_logits (torch.Tensor): Logits of student tokens.
+            teacher_logits (torch.Tensor): Logits of teacher tokens.
+                NOTE(review): the previous docstring gave the shape as (batch_size * seq_len,),
+                but the reduction runs over the last dimension, so presumably the inputs are
+                (batch_size * seq_len, vocab_size) -- confirm against the base class.
+            beta (float): Scaling coefficient applied to ``1 - cosine_similarity``. Default: `1.0`.
+        Returns:
+            torch.Tensor: Scalar tensor holding the summed cosine similarity loss.
+        """
+        student_norm = F.normalize(student_logits, p=2, dim=-1)
+        teacher_norm = F.normalize(teacher_logits, p=2, dim=-1)
+        cosine_sim = F.cosine_similarity(student_norm, teacher_norm, dim=-1)
+        loss = beta * (1 - cosine_sim)
+        return loss.sum()
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        """Delegate to the parent class's ``forward``, passing every argument by keyword.
+        ``true_labels`` is handed over as ``target`` and ``cls`` is passed explicitly; all
+        other arguments are forwarded under their own names.
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            student_input=student_input,
+            student_weight=student_weight,
+            teacher_input=teacher_input,
+            teacher_weight=teacher_weight,
+            target=true_labels,
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
+            weight_hard_loss=weight_hard_loss,
+            weight_soft_loss=weight_soft_loss,
+            beta=beta,
+            ignore_index=ignore_index,
+            temperature=temperature,
+            compiled=compiled,
+        )
+    @staticmethod
+    def backward(ctx, grad_output):
+        """Return one gradient slot per ``forward`` input (14 in total).
+        The first six entries come from ``LigerFusedLinearDistillationBase.backward``; the
+        remaining eight inputs receive ``None``.
+        """
+        # Keep only the first six gradients produced by the base implementation.
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+        )
+class LigerFusedLinearCosineSimilarityLoss(torch.nn.Module):
+    """Module wrapper around ``LigerFusedLinearCosineSimilarityFunction``.
+    The loss hyper-parameters are captured at construction time and passed positionally to
+    the autograd function on every ``forward`` call.
+    """
+    def __init__(
+        self,
+        weight_hard_loss: float = 0.5,
+        weight_soft_loss: float = 0.5,
+        beta: float = 0.5,
+        ignore_index: int = -100,
+        temperature: float = 1.0,
+        compiled: bool = True,
+        chunk_size: int = 1024,
+    ):
+        super().__init__()
+        # A zero temperature is rejected up front.
+        assert temperature != 0, "Temperature cannot be 0."
+        # Loss weighting.
+        self.weight_hard_loss = weight_hard_loss
+        self.weight_soft_loss = weight_soft_loss
+        self.beta = beta
+        # Label handling and logit scaling.
+        self.ignore_index = ignore_index
+        self.temperature = temperature
+        # Execution options.
+        self.compiled = compiled
+        self.chunk_size = chunk_size
+    def forward(
+        self,
+        student_input: torch.Tensor,
+        student_weight: torch.Tensor,
+        teacher_input: torch.Tensor,
+        teacher_weight: torch.Tensor,
+        true_labels: torch.LongTensor,
+        student_bias: torch.Tensor = None,
+        teacher_bias: torch.Tensor = None,
+    ) -> torch.Tensor:
+        """Apply the fused cosine similarity function to the given tensors and return its result."""
+        # Tensor arguments first, in the positional order the autograd function expects.
+        tensor_args = (
+            student_input,
+            student_weight,
+            teacher_input,
+            teacher_weight,
+            true_labels,
+            student_bias,
+            teacher_bias,
+        )
+        # Then the hyper-parameters stored on the module, in the same order as the signature.
+        hyper_params = (
+            self.weight_hard_loss,
+            self.weight_soft_loss,
+            self.beta,
+            self.ignore_index,
+            self.temperature,
+            self.compiled,
+            self.chunk_size,
+        )
+        return LigerFusedLinearCosineSimilarityFunction.apply(*tensor_args, *hyper_params)

{liger_kernel_nightly-0.5.10.dev20250630171450 → liger_kernel_nightly-0.5.10.dev20250702150221}/src/liger_kernel/chunked_loss/functional.py RENAMED Viewed

@@ -1,3 +1,4 @@
+from liger_kernel.chunked_loss.cosine_similarity_loss import LigerFusedLinearCosineSimilarityFunction
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
 from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOFunction
@@ -9,6 +10,7 @@ from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction
 liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
 liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
 liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
+liger_fused_linear_cosine = LigerFusedLinearCosineSimilarityFunction.apply
 liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
 liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
 liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply

liger-kernel-nightly 0.5.10.dev20250630171450__tar.gz → 0.5.10.dev20250702150221__tar.gz

liger-kernel-nightly 0.5.10.dev20250630171450__tar.gz → 0.5.10.dev20250702150221__tar.gz