liger-kernel-nightly 0.5.2.dev20250130024630-py3-none-any.whl → 0.5.2.dev20250130172806-py3-none-any.whl

liger_kernel/chunked_loss/__init__.py
@@ -1,5 +1,6 @@
  from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss # noqa: F401
  from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss # noqa: F401
+ from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss # noqa: F401
  from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOLoss # noqa: F401
  from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss # noqa: F401
  from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOLoss # noqa: F401
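
The new export makes the JSD distillation loss importable from the package root of chunked_loss. A minimal usage sketch (untested; the shapes and random weights below are made up for illustration):

import torch
from liger_kernel.chunked_loss import LigerFusedLinearJSDLoss

# Hypothetical sizes: 8 tokens, student hidden 16, teacher hidden 32, vocab 100.
jsd_loss = LigerFusedLinearJSDLoss(weight_hard_loss=0.5, weight_soft_loss=0.5, beta=0.5)

student_hidden = torch.randn(8, 16, requires_grad=True)     # pre-lm_head activations
student_lm_head = torch.randn(100, 16, requires_grad=True)  # (vocab, hidden_student)
teacher_hidden = torch.randn(8, 32)
teacher_lm_head = torch.randn(100, 32)                      # (vocab, hidden_teacher)
labels = torch.randint(0, 100, (8,))

loss = jsd_loss(student_hidden, student_lm_head, teacher_hidden, teacher_lm_head, labels)
loss.backward()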

liger_kernel/chunked_loss/functional.py
@@ -1,11 +1,13 @@
  from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOFunction
  from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOFunction
+ from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction
  from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOFunction
  from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOFunction
  from liger_kernel.chunked_loss.simpo_loss import LigerFusedLinearSimPOFunction

  liger_fused_linear_orpo = LigerFusedLinearORPOFunction.apply
  liger_fused_linear_dpo = LigerFusedLinearDPOFunction.apply
+ liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
  liger_fused_linear_cpo = LigerFusedLinearCPOFunction.apply
  liger_fused_linear_simpo = LigerFusedLinearSimPOFunction.apply
  liger_fused_linear_kto = LigerFusedLinearKTOFunction.apply

liger_kernel/chunked_loss/fused_linear_distillation.py
@@ -17,6 +17,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
          Args:
              student_logits (torch.Tensor): Raw (temperature-scaled) logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
              teacher_logits (torch.Tensor): Raw (temperature-scaled) logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+         Returns:
+             torch.Tensor: Sum of distillation losses for the chunk. The class will handle
+             converting this to mean loss by dividing by the full batch size * sequence length in _compute_loss.
          """
          raise NotImplementedError("Distillation loss function must be implemented.")

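The added Returns note pins down the chunking contract: each distillation_loss_fn call returns an unnormalized sum over its chunk, and _compute_loss divides once by the full token count. A standalone toy sketch of why summing per chunk and normalizing once reproduces the global mean (not Liger code):

import torch

per_token_loss = torch.randn(12).abs()  # stand-in for any per-token loss

# Sum within each chunk, accumulate, then normalize once by the full length.
total = sum(chunk.sum() for chunk in torch.chunk(per_token_loss, chunks=4, dim=0))
assert torch.allclose(total / per_token_loss.shape[0], per_token_loss.mean())
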
@@ -71,10 +74,11 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
          weight_hard_loss=0.5,
          weight_soft_loss=0.5,
          compute_ce_loss=True,
+         temperature=1,
          **loss_kwargs,
      ):
          """
-         Compute the total loss for a chunk of input and target, while using an knowleedge distillation loss function.
+         Compute the total loss for a chunk of input and target, while using a knowledge distillation loss function.
          Args:
              distillation_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
              student_input_chunk (torch.Tensor): Chunk of input tensor. Shape: (chunk_size, student_hidden_size).
@@ -84,11 +88,12 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
              target_chunk (torch.Tensor): Chunk of target tensor. Shape: (chunk_size,).
              student_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
              teacher_bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
-             full_target (torch.Tensor): Full target tensor. Shape: (chunk_size,).
+             full_target (torch.Tensor): Full target tensor. Shape: (batch_size * sequence_length,).
              ignore_index (int): Index to ignore for loss computation.
              weight_hard_loss (float): Weight for hard loss.
              weight_soft_loss (float): Weight for soft loss.
              compute_ce_loss (bool): Whether to compute CE loss.
+             temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
              loss_kwargs (dict): Additional arguments for the loss function.
          """
          (
@@ -107,6 +112,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
              compute_ce_loss=compute_ce_loss,
          )

+         student_logits_chunk /= temperature
+         teacher_logits_chunk /= temperature
+
          hard_loss /= full_target.shape[0]

          soft_loss = distillation_loss_fn(student_logits_chunk, teacher_logits_chunk)
@@ -130,6 +138,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
          ignore_index=-100,
          weight_hard_loss=0.5,
          weight_soft_loss=0.5,
+         beta=0.5,
          compute_ce_loss=True,
          temperature=1.0,
          compiled=True,
@@ -152,6 +161,7 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
              ignore_index (int): Index to ignore for loss computation.
              weight_hard_loss (float): Weight for hard/task loss.
              weight_soft_loss (float): Weight for soft/distillation loss.
+             beta (float): Interpolation coefficient between 0 and 1 (default: 0.5).
              compute_ce_loss (bool): Whether to compute CE loss.
              temperature (float): Temperature to control the input probability distribution. Default: `1.0` (i.e. no scale)
              compiled (bool): Whether to use torch compile for chunk accumulation.
@@ -170,7 +180,9 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
              ignore_index=ignore_index,
              weight_hard_loss=weight_hard_loss,
              weight_soft_loss=weight_soft_loss,
+             beta=beta,
              compute_ce_loss=compute_ce_loss,
+             temperature=temperature,
              **loss_kwargs,
          )

@@ -225,9 +237,6 @@ class LigerFusedLinearDistillationBase(torch.autograd.Function):
          if compiled:
              accumulate_chunk = torch.compile(accumulate_chunk)

-         student_input /= temperature
-         teacher_input /= temperature
-
          num_chunks = max(1, student_input.shape[0] // CHUNK_SIZE)
          _student_input_chunks = torch.chunk(student_input, chunks=num_chunks, dim=0)
          _teacher_input_chunks = torch.chunk(teacher_input, chunks=num_chunks, dim=0)
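
Together with the earlier hunk that divides student_logits_chunk and teacher_logits_chunk by temperature, this removal moves the scaling from the hidden states (before the lm_head projection) to the logits (after it). The two placements only agree when the projection has no bias; softmax temperature is conventionally defined on logits. A standalone sketch of the difference (hypothetical shapes, bias included to show the discrepancy):

import torch
import torch.nn.functional as F

torch.manual_seed(0)
temperature = 2.0
x = torch.randn(4, 16)   # hidden states: (tokens, hidden)
w = torch.randn(32, 16)  # lm_head weight: (vocab, hidden)
b = torch.randn(32)      # optional lm_head bias: (vocab,)

old = F.linear(x / temperature, w, b)  # previous behavior: scale hidden states
new = F.linear(x, w, b) / temperature  # new behavior: scale the logits

assert not torch.allclose(old, new)  # the bias term breaks the equivalence
assert torch.allclose(F.linear(x / temperature, w), F.linear(x, w) / temperature)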

liger_kernel/chunked_loss/jsd_loss.py (new file)
@@ -0,0 +1,154 @@
+ import torch
+ import torch.nn.functional as F
+
+ from liger_kernel.chunked_loss.fused_linear_distillation import LigerFusedLinearDistillationBase
+
+
+ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
+     @staticmethod
+     def distillation_loss_fn(student_logits, teacher_logits, beta=0.5):
+         """
+         Compute JSD loss (Jensen-Shannon Divergence Loss).
+         Args:
+             student_logits (torch.Tensor): Logits of student tokens. Shape: (batch_size * seq_len, vocab_size).
+             teacher_logits (torch.Tensor): Logits of teacher tokens. Shape: (batch_size * seq_len, vocab_size).
+             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+         Returns:
+             torch.Tensor: Jensen-Shannon Divergence loss
+         """
+         student_log_probs = F.log_softmax(student_logits, dim=-1)
+         teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)
+
+         # Compute probabilities (only required for mean calculation)
+         mean_probs = beta * student_log_probs.exp() + (1 - beta) * teacher_log_probs.exp()
+         log_mean_probs = mean_probs.log()
+
+         student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
+         teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)
+
+         # JSD is the weighted average of the KL divergences
+         jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
+         return jsd_loss
+
+     @staticmethod
+     def forward(
+         ctx,
+         student_input: torch.Tensor,
+         student_weight: torch.Tensor,
+         teacher_input: torch.Tensor,
+         teacher_weight: torch.Tensor,
+         true_labels: torch.LongTensor,
+         weight_hard_loss: float = 0.5,
+         weight_soft_loss: float = 0.5,
+         beta: float = 0.5,
+         ignore_index: int = -100,
+         temperature: float = 1.0,
+         compiled: bool = True,
+     ):
+         """
+         Fused linear layer with JSD distillation loss.
+         Args:
+             student_input (torch.Tensor): Student input tensor. Shape: (batch_size * seq_len, hidden_size_student)
+             student_weight (torch.Tensor): Student weight tensor. Shape: (vocab_size, hidden_size_student)
+             teacher_input (torch.Tensor): Teacher input tensor. Shape: (batch_size * seq_len, hidden_size_teacher)
+             teacher_weight (torch.Tensor): Teacher weight tensor. Shape: (vocab_size, hidden_size_teacher)
+             true_labels (torch.LongTensor): Target tensor. Shape: (batch_size * seq_len,)
+             weight_hard_loss (float): Weight for hard loss.
+             weight_soft_loss (float): Weight for soft loss.
+             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+             ignore_index (int): Index to ignore in loss computation
+             temperature (float): Temperature for softening/sharpening distributions
+             compiled (bool): Whether to use torch compile
+         Returns:
+             torch.Tensor: Computed loss
+         """
+         return LigerFusedLinearDistillationBase.forward(
+             ctx=ctx,
+             student_input=student_input,
+             student_weight=student_weight,
+             teacher_input=teacher_input,
+             teacher_weight=teacher_weight,
+             target=true_labels,
+             loss_fn=LigerFusedLinearJSDFunction.distillation_loss_fn,
+             chunk_size=1,
+             weight_hard_loss=weight_hard_loss,
+             weight_soft_loss=weight_soft_loss,
+             beta=beta,
+             ignore_index=ignore_index,
+             temperature=temperature,
+             compiled=compiled,
+         )
+
+     @staticmethod
+     def backward(ctx, grad_output):
+         grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:4]
+
+         return (*grads, None, None, None, None, None, None, None)
+
+
+ class LigerFusedLinearJSDLoss(torch.nn.Module):
+     """
+     Fused linear layer with JSD distillation loss.
+     """
+
+     def __init__(
+         self,
+         weight_hard_loss: float = 0.5,
+         weight_soft_loss: float = 0.5,
+         beta: float = 0.5,
+         ignore_index: int = -100,
+         temperature: float = 1.0,
+         compiled: bool = True,
+     ):
+         """
+         Args:
+             weight_hard_loss (float): Weight for hard loss.
+             weight_soft_loss (float): Weight for soft loss.
+             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+             ignore_index (int): Index to ignore in the loss
+             temperature (float): Temperature for softening distributions
+             compiled (bool): Whether to use torch compile
+         """
+         super().__init__()
+         assert temperature != 0, "Temperature cannot be 0."
+         self.weight_hard_loss = weight_hard_loss
+         self.weight_soft_loss = weight_soft_loss
+         self.ignore_index = ignore_index
+         self.temperature = temperature
+         self.compiled = compiled
+         self.beta = beta
+
+     def forward(
+         self,
+         student_input: torch.Tensor,
+         student_weight: torch.Tensor,
+         teacher_input: torch.Tensor,
+         teacher_weight: torch.Tensor,
+         true_labels: torch.LongTensor,
+     ) -> torch.Tensor:
+         """
+         Compute the JSD distillation loss.
+
+         Args:
+             student_input (torch.Tensor): Student input tensor
+             student_weight (torch.Tensor): Student weight tensor
+             teacher_input (torch.Tensor): Teacher input tensor
+             teacher_weight (torch.Tensor): Teacher weight tensor
+             true_labels (torch.LongTensor): Target labels tensor
+
+         Returns:
+             torch.Tensor: Computed loss
+         """
+         return LigerFusedLinearJSDFunction.apply(
+             student_input,
+             student_weight,
+             teacher_input,
+             teacher_weight,
+             true_labels,
+             self.weight_hard_loss,
+             self.weight_soft_loss,
+             self.beta,
+             self.ignore_index,
+             self.temperature,
+             self.compiled,
+         )
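
As a sanity check on the loss above: distillation_loss_fn computes the generalized JSD against the mixture m = beta * p_student + (1 - beta) * p_teacher, since F.kl_div(log_m, log_p, log_target=True) with reduction="sum" evaluates KL(p || m). A standalone verification sketch against a direct computation (not part of the package):

import torch
import torch.nn.functional as F

from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDFunction

def jsd_reference(student_logits, teacher_logits, beta=0.5):
    # Direct form: m = beta * p_s + (1 - beta) * p_t, then
    # beta * KL(p_t || m) + (1 - beta) * KL(p_s || m), summed over all entries.
    p_s = F.softmax(student_logits, dim=-1)
    p_t = F.softmax(teacher_logits, dim=-1)
    m = beta * p_s + (1 - beta) * p_t
    kl_s = (p_s * (p_s / m).log()).sum()
    kl_t = (p_t * (p_t / m).log()).sum()
    return beta * kl_t + (1 - beta) * kl_s

torch.manual_seed(0)
s, t = torch.randn(6, 10), torch.randn(6, 10)
liger = LigerFusedLinearJSDFunction.distillation_loss_fn(s, t, beta=0.5)
assert torch.allclose(liger, jsd_reference(s, t, beta=0.5), atol=1e-6)

At beta = 0.5 this is the symmetric Jensen-Shannon divergence; at beta = 0 it collapses to KL(student || teacher), and at beta = 1 to KL(teacher || student).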

liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/METADATA
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel_nightly
- Version: 0.5.2.dev20250130024630
+ Version: 0.5.2.dev20250130172806
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation

liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/RECORD
@@ -2,13 +2,14 @@ liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
  liger_kernel/env_report.py,sha256=uhdEC8OydxoZlb7B6YYcAaBF3crGFdIck-4cxaW4NJY,1728
  liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
  liger_kernel/chunked_loss/README.md,sha256=0FmkFC3hKBqyoDT5uTlIYmrvRkF-EOCR1y-EBU1LpWU,2248
- liger_kernel/chunked_loss/__init__.py,sha256=CI6hBI7VldTX748c7F6F8YpHTn1q4gv5-lMXf273oXQ,431
+ liger_kernel/chunked_loss/__init__.py,sha256=48m-8IMOAReZbi0HK5aV-KGBp2IsZSwFvdnzTNrS4bk,516
  liger_kernel/chunked_loss/cpo_loss.py,sha256=OdBR8WYdHTKpLI_c9DcuwqKSWPeAAeTyREz46Vu_cAY,3682
  liger_kernel/chunked_loss/dpo_loss.py,sha256=VYZMOafdvE8xlhvTtwjrz81tIzxR1mHF4lXdsADnIQg,4373
- liger_kernel/chunked_loss/functional.py,sha256=dO0DYMPTBxwPtEUQ1DUV2zCmZ6i-k3B7COeR3-IwA6M,683
- liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=uQtwtu-kaUZJTjNhAnIr3O794oUlUZ98XR5shYtwP5k,10440
+ liger_kernel/chunked_loss/functional.py,sha256=THWWpCnRVhTVfnPnyvQjdBvo1JDtxhwLmtZE_yiBBqM,817
+ liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=5V8rdva89WyHVbmJ8JOmC4DYNOR6ByXfx3qlUieOZkI,11002
  liger_kernel/chunked_loss/fused_linear_preference.py,sha256=idK9V9NivoVITqVpiG0fEGUHSvinYWkn9-EYXZjR-KQ,18356
  liger_kernel/chunked_loss/fused_linear_unpaired_preference.py,sha256=ZqYlXXhIphkJPxOS7iI70avgrr6x0skEtgpckZTYau0,9819
+ liger_kernel/chunked_loss/jsd_loss.py,sha256=yRCQdvd3ruTWP4A_BfU8VcZ6LepSUfO0Ob7stGnueQY,6052
  liger_kernel/chunked_loss/kto_loss.py,sha256=eVNW6HVCAm32shpfhbRlk92Flnjd7G32v0gK9DUUSOQ,5655
  liger_kernel/chunked_loss/orpo_loss.py,sha256=yjcrrbVeemLYodoSKT-FMSnaPtyKAZ3aOrvPD6tTY6Y,3617
  liger_kernel/chunked_loss/simpo_loss.py,sha256=3TTc7U79Orjgi-Wu81WZkWk5MgsdqKXIOBHgIvDazPw,3865
@@ -60,9 +61,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
  liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/METADATA,sha256=1iagCXBr_TV35IIu5T8wUpFhYmmrxMbbKkOwPZF7pgc,21205
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
- liger_kernel_nightly-0.5.2.dev20250130024630.dist-info/RECORD,,
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/METADATA,sha256=LCLD7LcN4x7h1_LMuYPAtIUrhHGi2eoON-NOBtofCN0,21205
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+ liger_kernel_nightly-0.5.2.dev20250130172806.dist-info/RECORD,,