PyPI - liger-kernel-nightly - Versions diffs - 0.4.2.dev20241121225747__tar.gz → 0.4.2.dev20241122052539__tar.gz - Mend

liger-kernel-nightly 0.4.2.dev20241121225747tar.gz → 0.4.2.dev20241122052539tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (65) hide show

{liger_kernel_nightly-0.4.2.dev20241121225747/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.2.dev20241122052539}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121225747
+Version: 0.4.2.dev20241122052539
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122052539}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241121225747"
+version = "0.4.2.dev20241122052539"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/dpo_loss.py RENAMED Viewed

@@ -9,15 +9,31 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(
+        chosen_logps,
+        rejected_logps,
+        ref_chosen_logps=None,
+        ref_rejected_logps=None,
+        beta=0.1,
+    ):
         """
         Compute DPO loss (Direct Preference Optimization).
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            ref_chosen_logps (torch.Tensor, optional): Reference log probabilities of chosen tokens. Shape: (batch_size,).
+            ref_rejected_logps (torch.Tensor, optional): Reference log probabilities of rejected tokens. Shape: (batch_size,).
             beta (float): Weight for the direct preference loss.
         """
-        logits_diff = beta * (chosen_logps - rejected_logps)
+        if ref_chosen_logps is None:
+            ref_chosen_logps = torch.tensor(0.0, device=chosen_logps.device)
+        if ref_rejected_logps is None:
+            ref_rejected_logps = torch.tensor(0.0, device=rejected_logps.device)
+        chosen_logratios = chosen_logps - ref_chosen_logps
+        rejected_logratios = rejected_logps - ref_rejected_logps
+        logits_diff = beta * (chosen_logratios - rejected_logratios)
         losses = -F.logsigmoid(logits_diff)
         return losses.sum()
@@ -28,10 +44,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         weight,
         target,
         bias=None,
+        ref_weight=None,
+        ref_bias=None,
         ignore_index=-100,
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=True,
     ):
         """
         Fused linear layer with DPO (Direct Preference Optimization) loss.
@@ -48,6 +67,9 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
         )
     @staticmethod
@@ -55,7 +77,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None, None, None
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -69,26 +91,36 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        use_ref_model: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute the NLL loss.
+            compiled (bool): Whether to use the torch compiled kernel.
+            use_ref_model (bool): Whether to use a reference model for the DPO loss.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.use_ref_model = use_ref_model
-    def forward(self, lin_weight, _input, target, bias=None):
+    def forward(
+        self, lin_weight, _input, target, bias=None, ref_weight=None, ref_bias=None
+    ):
         return LigerFusedLinearDPOFunction.apply(
             _input,
             lin_weight,
             target,
             bias,
+            ref_weight,
+            ref_bias,
             self.ignore_index,
             self.beta,
             self.compute_nll_loss,
             self.compiled,
+            self.use_ref_model,
         )

{liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122052539}/src/liger_kernel/chunked_loss/fused_linear_preference.py RENAMED Viewed

@@ -18,6 +18,42 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         """
         raise NotImplementedError("Preference loss function must be implemented.")
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        ignore_index=-100,
+        compute_nll_loss=True,
+    ):
+        len_chosen_chunk = target_chunk.shape[0] // 2
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+        chosen_nll_loss = 0.0
+        if compute_nll_loss:
+            chosen_nll_loss = F.nll_loss(
+                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
+                target_chunk[:len_chosen_chunk].view(-1),
+                reduction="sum",
+                ignore_index=ignore_index,
+            )
+        loss_mask = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask, target_chunk, 0)
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
+            -1
+        )
+        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        chosen_logps = average_log_prob[:len_chosen_chunk]
+        rejected_logps = average_log_prob[len_chosen_chunk:]
+        return chosen_logps, rejected_logps, chosen_nll_loss
     @staticmethod
     def forward(
         ctx,
@@ -32,6 +68,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
@@ -49,7 +88,11 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -61,7 +104,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         grad_bias = torch.zeros_like(bias) if bias is not None else None
         loss_acc = torch.zeros((), device=_input.device)
-        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         loss_func_to_call = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
@@ -70,6 +112,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
             **loss_kwargs,
         )
@@ -101,6 +146,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             accumulate_chunk = torch.compile(accumulate_chunk)
         len_chosen = target.shape[0] // 2
+        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
         _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0)
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
@@ -159,6 +205,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
@@ -173,38 +222,41 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-        len_chosen_chunk = target_chunk.shape[0] // 2
-        logits_chunk = input_chunk @ weight.t()  # chunk_size x V
-        if bias is not None:
-            logits_chunk = logits_chunk + bias
-        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-        chosen_nll_loss = 0.0
-        if compute_nll_loss:
-            chosen_nll_loss = F.nll_loss(
-                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
-                reduction="sum",
+        chosen_logps, rejected_logps, chosen_nll_loss = (
+            LigerFusedLinearPreferenceBase.chunk_forward(
+                input_chunk,
+                weight,
+                target_chunk,
+                bias=bias,
                 ignore_index=ignore_index,
+                compute_nll_loss=compute_nll_loss,
             )
-            chosen_nll_loss = (
-                chosen_nll_loss
-                / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-            )
-        loss_mask = target_chunk != ignore_index
-        label_chunk = torch.where(loss_mask, target_chunk, 0)
-        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
-            -1
         )
-        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        chosen_nll_loss = (
+            chosen_nll_loss
+            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        )
-        chosen_logps = average_log_prob[:len_chosen_chunk]
-        rejected_logps = average_log_prob[len_chosen_chunk:]
+        if use_ref_model:
+            with torch.no_grad():
+                ref_chosen_logps, ref_rejected_logps, _ = (
+                    LigerFusedLinearPreferenceBase.chunk_forward(
+                        input_chunk,
+                        ref_weight,
+                        target_chunk,
+                        ref_bias,
+                        ignore_index=ignore_index,
+                        compute_nll_loss=False,
+                    )
+                )
+            loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
+            loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
         alignment_loss = preference_loss_fn(
             chosen_logps, rejected_logps, beta=beta, **loss_kwargs

{liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122052539/src/liger_kernel_nightly.egg-info}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121225747
+Version: 0.4.2.dev20241122052539
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation