liger-kernel-nightly 0.4.2.dev20241121224158__tar.gz → 0.4.2.dev20241122052539__tar.gz

This diff compares the contents of two publicly released versions of this package, as they appear in their respective public registries. It is provided for informational purposes only.
Files changed (66)
  1. {liger_kernel_nightly-0.4.2.dev20241121224158/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.2.dev20241122052539}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/pyproject.toml +1 -1
  3. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/dpo_loss.py +36 -4
  4. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/fused_linear_preference.py +79 -27
  5. liger_kernel_nightly-0.4.2.dev20241122052539/src/liger_kernel/transformers/functional.py +173 -0
  6. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539/src/liger_kernel_nightly.egg-info}/PKG-INFO +1 -1
  7. liger_kernel_nightly-0.4.2.dev20241121224158/src/liger_kernel/transformers/functional.py +0 -58
  8. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/LICENSE +0 -0
  9. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/NOTICE +0 -0
  10. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/README.md +0 -0
  11. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/setup.cfg +0 -0
  12. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  13. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  14. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/functional.py +0 -0
  15. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  16. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  17. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/env_report.py +0 -0
  18. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/__init__.py +0 -0
  19. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/cross_entropy.py +0 -0
  20. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  21. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  22. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
  23. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  24. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/geglu.py +0 -0
  25. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/group_norm.py +0 -0
  26. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/jsd.py +0 -0
  27. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/kl_div.py +0 -0
  28. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/layer_norm.py +0 -0
  29. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  30. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/rms_norm.py +0 -0
  31. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/rope.py +0 -0
  32. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/swiglu.py +0 -0
  33. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/ops/utils.py +0 -0
  34. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/__init__.py +0 -0
  35. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/auto_model.py +0 -0
  36. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  37. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  38. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  39. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  40. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/geglu.py +0 -0
  41. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/group_norm.py +0 -0
  42. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/jsd.py +0 -0
  43. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/kl_div.py +0 -0
  44. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/layer_norm.py +0 -0
  45. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/__init__.py +0 -0
  46. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/gemma.py +0 -0
  47. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  48. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/llama.py +0 -0
  49. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/mistral.py +0 -0
  50. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  51. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/mllama.py +0 -0
  52. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/phi3.py +0 -0
  53. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  54. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  55. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  56. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  57. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/rms_norm.py +0 -0
  58. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/rope.py +0 -0
  59. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/swiglu.py +0 -0
  60. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  61. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/triton/__init__.py +0 -0
  62. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/triton/monkey_patch.py +0 -0
  63. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  64. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  65. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  66. {liger_kernel_nightly-0.4.2.dev20241121224158 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121224158
+Version: 0.4.2.dev20241122052539
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241121224158"
+version = "0.4.2.dev20241122052539"
description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -9,15 +9,31 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(
+        chosen_logps,
+        rejected_logps,
+        ref_chosen_logps=None,
+        ref_rejected_logps=None,
+        beta=0.1,
+    ):
         """
         Compute DPO loss (Direct Preference Optimization).
         Args:
            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+           ref_chosen_logps (torch.Tensor, optional): Reference log probabilities of chosen tokens. Shape: (batch_size,).
+           ref_rejected_logps (torch.Tensor, optional): Reference log probabilities of rejected tokens. Shape: (batch_size,).
            beta (float): Weight for the direct preference loss.
         """
-        logits_diff = beta * (chosen_logps - rejected_logps)
+        if ref_chosen_logps is None:
+            ref_chosen_logps = torch.tensor(0.0, device=chosen_logps.device)
+        if ref_rejected_logps is None:
+            ref_rejected_logps = torch.tensor(0.0, device=rejected_logps.device)
+
+        chosen_logratios = chosen_logps - ref_chosen_logps
+        rejected_logratios = rejected_logps - ref_rejected_logps
+
+        logits_diff = beta * (chosen_logratios - rejected_logratios)
         losses = -F.logsigmoid(logits_diff)
         return losses.sum()
 
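The rewritten loss implements the standard DPO objective against a reference model, -logsigmoid(beta * ((chosen - ref_chosen) - (rejected - ref_rejected))), falling back to the reference-free form when the reference log-probs are omitted. A minimal standalone sketch of the same computation on toy values (numbers and shapes are illustrative only, not from the package):

import torch
import torch.nn.functional as F

beta = 0.1
# Policy and reference average log-probs per sequence (toy values)
chosen_logps = torch.tensor([-1.0, -0.8])
rejected_logps = torch.tensor([-1.5, -1.2])
ref_chosen_logps = torch.tensor([-1.1, -0.9])
ref_rejected_logps = torch.tensor([-1.4, -1.3])

# beta * (chosen log-ratio minus rejected log-ratio), as in the new preference_loss_fn
logits_diff = beta * (
    (chosen_logps - ref_chosen_logps) - (rejected_logps - ref_rejected_logps)
)
loss = -F.logsigmoid(logits_diff).sum()  # summed over the batch, matching the diff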
@@ -28,10 +44,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         weight,
         target,
         bias=None,
+        ref_weight=None,
+        ref_bias=None,
         ignore_index=-100,
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=True,
     ):
         """
         Fused linear layer with DPO (Direct Preference Optimization) loss.
@@ -48,6 +67,9 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
         )
 
     @staticmethod
@@ -55,7 +77,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None, None, None
 
 
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -69,26 +91,36 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        use_ref_model: bool = False,
     ):
         """
         Args:
            ignore_index (int): Index to ignore in the loss.
            beta (float): Weight for the odds ratio loss.
+           compute_nll_loss (bool): Whether to compute the NLL loss.
+           compiled (bool): Whether to use the torch compiled kernel.
+           use_ref_model (bool): Whether to use a reference model for the DPO loss.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.use_ref_model = use_ref_model
 
-    def forward(self, lin_weight, _input, target, bias=None):
+    def forward(
+        self, lin_weight, _input, target, bias=None, ref_weight=None, ref_bias=None
+    ):
         return LigerFusedLinearDPOFunction.apply(
             _input,
             lin_weight,
             target,
             bias,
+            ref_weight,
+            ref_bias,
             self.ignore_index,
             self.beta,
             self.compute_nll_loss,
             self.compiled,
+            self.use_ref_model,
         )
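Taken together, the dpo_loss.py changes let callers pass a frozen reference head through the module API. A hypothetical usage sketch (shapes, sizes, and the requires_grad setup below are illustrative assumptions, not from the package):

import torch
from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss

B, T, H, V = 2, 16, 64, 128  # pairs, seq len, hidden dim, vocab (toy sizes)
loss_fn = LigerFusedLinearDPOLoss(beta=0.1, use_ref_model=True)

lin_weight = torch.randn(V, H, requires_grad=True)  # policy lm_head weight
ref_weight = torch.randn(V, H)                      # frozen reference lm_head weight
_input = torch.randn(2 * B, T, H)                   # chosen rows first, then rejected
target = torch.randint(0, V, (2 * B, T))

loss = loss_fn(lin_weight, _input, target, ref_weight=ref_weight)
loss.backward()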
@@ -18,6 +18,42 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         """
         raise NotImplementedError("Preference loss function must be implemented.")
 
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        ignore_index=-100,
+        compute_nll_loss=True,
+    ):
+        len_chosen_chunk = target_chunk.shape[0] // 2
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+
+        chosen_nll_loss = 0.0
+        if compute_nll_loss:
+            chosen_nll_loss = F.nll_loss(
+                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
+                target_chunk[:len_chosen_chunk].view(-1),
+                reduction="sum",
+                ignore_index=ignore_index,
+            )
+
+        loss_mask = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask, target_chunk, 0)
+
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
+            -1
+        )
+        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+
+        chosen_logps = average_log_prob[:len_chosen_chunk]
+        rejected_logps = average_log_prob[len_chosen_chunk:]
+        return chosen_logps, rejected_logps, chosen_nll_loss
+
     @staticmethod
     def forward(
         ctx,
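The new chunk_forward helper centralizes the per-chunk projection and log-prob bookkeeping so the policy pass and the reference pass can share it. Its contract, checked on toy tensors (sizes are illustrative): the first half of each chunk is treated as chosen, the second half as rejected, and it returns per-sequence average log-probs for each half.

import torch
from liger_kernel.chunked_loss.fused_linear_preference import (
    LigerFusedLinearPreferenceBase,
)

B, T, H, V = 2, 5, 8, 16
input_chunk = torch.randn(2 * B, T, H)          # chosen rows first, rejected rows last
weight = torch.randn(V, H)                      # lm_head weight: (vocab, hidden)
target_chunk = torch.randint(0, V, (2 * B, T))

chosen_logps, rejected_logps, nll = LigerFusedLinearPreferenceBase.chunk_forward(
    input_chunk, weight, target_chunk
)
assert chosen_logps.shape == rejected_logps.shape == (B,)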
@@ -32,6 +68,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
@@ -49,7 +88,11 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            ignore_index (int): Index to ignore for loss computation.
            alpha (float): Weight for the NLL loss.
            beta (float): Weight for the odds ratio loss.
+           compute_nll_loss (bool): Whether to compute NLL loss.
            compiled (bool): Whether to use torch compile for chunk accumulation.
+           use_ref_model (bool): Whether to use a reference model for the alignment loss.
+           ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+           ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
            loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -61,7 +104,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         grad_bias = torch.zeros_like(bias) if bias is not None else None
         loss_acc = torch.zeros((), device=_input.device)
 
-        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         loss_func_to_call = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
@@ -70,6 +112,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
             **loss_kwargs,
         )
 
@@ -101,6 +146,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
            accumulate_chunk = torch.compile(accumulate_chunk)
 
         len_chosen = target.shape[0] // 2
+        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
         _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0)
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
@@ -159,6 +205,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
  """
@@ -173,38 +222,41 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
173
222
  ignore_index (int): Index to ignore for loss computation.
174
223
  alpha (float): Weight for the NLL loss.
175
224
  beta (float): Weight for the odds ratio loss.
225
+ compute_nll_loss (bool): Whether to compute NLL loss.
226
+ use_ref_model (bool): Whether to use a reference model for the alignment loss.
227
+ ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
228
+ ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
176
229
  loss_kwargs (dict): Additional arguments for the loss function.
177
230
  """
178
- len_chosen_chunk = target_chunk.shape[0] // 2
179
-
180
- logits_chunk = input_chunk @ weight.t() # chunk_size x V
181
- if bias is not None:
182
- logits_chunk = logits_chunk + bias
183
- log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
184
-
185
- chosen_nll_loss = 0.0
186
- if compute_nll_loss:
187
- chosen_nll_loss = F.nll_loss(
188
- log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
189
- target_chunk[:len_chosen_chunk].view(-1),
190
- reduction="sum",
231
+ chosen_logps, rejected_logps, chosen_nll_loss = (
232
+ LigerFusedLinearPreferenceBase.chunk_forward(
233
+ input_chunk,
234
+ weight,
235
+ target_chunk,
236
+ bias=bias,
191
237
  ignore_index=ignore_index,
238
+ compute_nll_loss=compute_nll_loss,
192
239
  )
193
- chosen_nll_loss = (
194
- chosen_nll_loss
195
- / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
196
- )
197
-
198
- loss_mask = target_chunk != ignore_index
199
- label_chunk = torch.where(loss_mask, target_chunk, 0)
200
-
201
- per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
202
- -1
203
240
  )
204
- average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
241
+ chosen_nll_loss = (
242
+ chosen_nll_loss
243
+ / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
244
+ )
205
245
 
206
- chosen_logps = average_log_prob[:len_chosen_chunk]
207
- rejected_logps = average_log_prob[len_chosen_chunk:]
246
+ if use_ref_model:
247
+ with torch.no_grad():
248
+ ref_chosen_logps, ref_rejected_logps, _ = (
249
+ LigerFusedLinearPreferenceBase.chunk_forward(
250
+ input_chunk,
251
+ ref_weight,
252
+ target_chunk,
253
+ ref_bias,
254
+ ignore_index=ignore_index,
255
+ compute_nll_loss=False,
256
+ )
257
+ )
258
+ loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
259
+ loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
208
260
 
209
261
  alignment_loss = preference_loss_fn(
210
262
  chosen_logps, rejected_logps, beta=beta, **loss_kwargs
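With the chunk count now computed next to the torch.chunk calls, the accumulation path reads straight through: the concatenated batch is split into chosen and rejected halves, each half is chunked in lockstep, and each accumulate step consumes one matched chosen/rejected pair. A small sketch of that slicing arithmetic (CHUNK_SIZE and shapes below are toy values, not the package's tuned defaults):

import torch

CHUNK_SIZE = 1
B = 4                                  # preference pairs
_input = torch.randn(2 * B, 3, 8)      # first B rows chosen, last B rejected
target = torch.randint(0, 16, (2 * B, 3))

len_chosen = target.shape[0] // 2
chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
chosen_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
rejected_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
assert len(chosen_chunks) == len(rejected_chunks) == chunks  # matched pairs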
@@ -0,0 +1,173 @@
+from typing import Optional
+
+from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
+from liger_kernel.ops.fused_linear_cross_entropy import (
+    LigerFusedLinearCrossEntropyFunction,
+)
+from liger_kernel.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
+from liger_kernel.ops.geglu import LigerGELUMulFunction
+from liger_kernel.ops.group_norm import LigerGroupNormFunction
+from liger_kernel.ops.jsd import LigerJSDFunction
+from liger_kernel.ops.kl_div import LigerKLDivLossFunction
+from liger_kernel.ops.layer_norm import LigerLayerNormFunction
+from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction
+from liger_kernel.ops.rms_norm import LigerRMSNormFunction
+from liger_kernel.ops.rope import LigerRopeFunction
+from liger_kernel.ops.swiglu import LigerSiLUMulFunction
+
+
+# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
+# `weight` and `size_average` are placeholders and not implemented yet
+def liger_cross_entropy(
+    input,
+    target,
+    weight=None,
+    size_average=None,
+    ignore_index: int = -100,
+    reduce=None,
+    reduction: str = "mean",
+    label_smoothing: float = 0.0,
+    lse_square_scale: float = 0.0,
+    softcap: Optional[float] = None,
+    return_z_loss: bool = False,
+):
+    loss, z_loss = LigerCrossEntropyFunction.apply(
+        input,
+        target,
+        ignore_index,
+        lse_square_scale,
+        label_smoothing,
+        reduction,
+        softcap,
+        return_z_loss,
+    )
+    if not return_z_loss:
+        return loss
+    return loss, z_loss
+
+
+def liger_fused_linear_cross_entropy(
+    input,
+    weight,
+    target,
+    bias=None,
+    ignore_index: int = -100,
+    lse_square_scale: float = 0.0,
+    label_smoothing: float = 0.0,
+    reduction: str = "mean",
+    softcap: Optional[float] = None,
+):
+    return LigerFusedLinearCrossEntropyFunction.apply(
+        input,
+        weight,
+        target,
+        bias,
+        ignore_index,
+        lse_square_scale,
+        label_smoothing,
+        reduction,
+        softcap,
+    )
+
+
+def liger_fused_linear_jsd(
+    student_input,
+    student_weight,
+    teacher_input,
+    teacher_weight,
+    shift_labels=None,
+    jsd_beta: float = 0.5,
+    ignore_index: int = -100,
+    temperature: float = 1.0,
+):
+    return LigerFusedLinearJSDFunction.apply(
+        student_input,
+        student_weight,
+        teacher_input,
+        teacher_weight,
+        shift_labels,
+        jsd_beta,
+        ignore_index,
+        temperature,
+    )
+
+
+def liger_geglu(a, b):
+    return LigerGELUMulFunction.apply(a, b)
+
+
+def liger_group_norm(
+    X,
+    affine_scaling_weight,
+    affine_shifting_bias,
+    num_channels,
+    num_groups,
+    eps,
+):
+    return LigerGroupNormFunction.apply(
+        X,
+        affine_scaling_weight,
+        affine_shifting_bias,
+        num_channels,
+        num_groups,
+        eps,
+    )
+
+
+def liger_jsd(
+    input,
+    target,
+    shift_labels=None,
+    beta: float = 0.5,
+    ignore_index: int = -100,
+):
+    return LigerJSDFunction.apply(
+        input,
+        target,
+        shift_labels,
+        beta,
+        ignore_index,
+    )
+
+
+# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.kl_div.html#torch.nn.functional.kl_div
+# `size_average` and `mean` are being deprecated in torch API and are placeholders here
+def liger_kl_div(
+    input,
+    target,
+    size_average: bool = True,
+    reduce: bool = True,
+    reduction: str = "mean",
+    log_target: bool = False,
+    eps: float = 1e-10,
+):
+    # Note: the default reduction in torch is `mean`, but being `batchmean` in Liger
+    return LigerKLDivLossFunction.apply(
+        input,
+        target,
+        reduction,
+        log_target,
+        eps,
+    )
+
+
+def liger_layer_norm(X, W, B, eps):
+    return LigerLayerNormFunction.apply(X, W, B, eps)
+
+
+def liger_qwen2vl_mrope(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+    return LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section, unsqueeze_dim)
+
+
+def liger_rms_norm(
+    X, W, eps, offset: float = 0.0, casting_mode: str = "llama", in_place: bool = True
+):
+    return LigerRMSNormFunction.apply(X, W, eps, offset, casting_mode, in_place)
+
+
+def liger_rope(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    return LigerRopeFunction.apply(q, k, cos, sin, position_ids, unsqueeze_dim)
+
+
+def liger_swiglu(a, b):
+    return LigerSiLUMulFunction.apply(a, b)
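The functional module is rewritten from bare `Function.apply` aliases into explicit wrappers, so each op now has a real Python signature with keyword arguments (and, for liger_cross_entropy and liger_kl_div, signatures that mirror their torch.nn.functional counterparts). A hypothetical call site (toy shapes; the underlying Triton kernels expect CUDA tensors):

import torch
from liger_kernel.transformers.functional import liger_cross_entropy

logits = torch.randn(8, 32, device="cuda", requires_grad=True)
labels = torch.randint(0, 32, (8,), device="cuda")
# Keyword arguments now work, unlike with the old positional-only `.apply` alias
loss = liger_cross_entropy(logits, labels, ignore_index=-100, reduction="mean")
loss.backward()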
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121224158
+Version: 0.4.2.dev20241122052539
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -1,58 +0,0 @@
-from typing import Optional
-
-from liger_kernel.ops.cross_entropy import LigerCrossEntropyFunction
-from liger_kernel.ops.fused_linear_cross_entropy import (
-    LigerFusedLinearCrossEntropyFunction,
-)
-from liger_kernel.ops.fused_linear_jsd import LigerFusedLinearJSDFunction
-from liger_kernel.ops.geglu import LigerGELUMulFunction
-from liger_kernel.ops.group_norm import LigerGroupNormFunction
-from liger_kernel.ops.jsd import LigerJSDFunction
-from liger_kernel.ops.kl_div import LigerKLDivLossFunction
-from liger_kernel.ops.layer_norm import LigerLayerNormFunction
-from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction
-from liger_kernel.ops.rms_norm import LigerRMSNormFunction
-from liger_kernel.ops.rope import LigerRopeFunction
-from liger_kernel.ops.swiglu import LigerSiLUMulFunction
-
-liger_swiglu = LigerSiLUMulFunction.apply
-liger_fused_linear_cross_entropy = LigerFusedLinearCrossEntropyFunction.apply
-liger_geglu = LigerGELUMulFunction.apply
-liger_rms_norm = LigerRMSNormFunction.apply
-liger_rope = LigerRopeFunction.apply
-liger_qwen2vl_mrope = LigerQwen2VLMRopeFunction.apply
-liger_layer_norm = LigerLayerNormFunction.apply
-liger_kl_div = LigerKLDivLossFunction.apply
-liger_jsd = LigerJSDFunction.apply
-liger_fused_linear_jsd = LigerFusedLinearJSDFunction.apply
-liger_group_norm = LigerGroupNormFunction.apply
-
-
-# conform to the function signature in https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html
-# `weight` and `size_average` are placeholders and not implemented yet
-def liger_cross_entropy(
-    input,
-    target,
-    weight=None,
-    size_average=None,
-    ignore_index: int = -100,
-    reduce=None,
-    reduction: str = "mean",
-    label_smoothing: float = 0.0,
-    lse_square_scale: float = 0.0,
-    softcap: Optional[float] = None,
-    return_z_loss: bool = False,
-):
-    loss, z_loss = LigerCrossEntropyFunction.apply(
-        input,
-        target,
-        ignore_index,
-        lse_square_scale,
-        label_smoothing,
-        reduction,
-        softcap,
-        return_z_loss,
-    )
-    if not return_z_loss:
-        return loss
-    return loss, z_loss