liger-kernel-nightly 0.4.2.dev20241121225747__tar.gz → 0.4.2.dev20241122175637__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {liger_kernel_nightly-0.4.2.dev20241121225747/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.2.dev20241122175637}/PKG-INFO +1 -1
  2. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/pyproject.toml +1 -1
  3. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/dpo_loss.py +36 -4
  4. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/fused_linear_preference.py +79 -27
  5. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/cross_entropy.py +12 -6
  6. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -11
  7. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637/src/liger_kernel_nightly.egg-info}/PKG-INFO +1 -1
  8. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/LICENSE +0 -0
  9. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/NOTICE +0 -0
  10. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/README.md +0 -0
  11. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/setup.cfg +0 -0
  12. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/__init__.py +0 -0
  13. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/cpo_loss.py +0 -0
  14. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/functional.py +0 -0
  15. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/orpo_loss.py +0 -0
  16. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/chunked_loss/simpo_loss.py +0 -0
  17. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/env_report.py +0 -0
  18. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/__init__.py +0 -0
  19. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/experimental/embedding.py +0 -0
  20. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
  21. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
  22. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/geglu.py +0 -0
  23. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/group_norm.py +0 -0
  24. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/jsd.py +0 -0
  25. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/kl_div.py +0 -0
  26. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/layer_norm.py +0 -0
  27. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
  28. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/rms_norm.py +0 -0
  29. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/rope.py +0 -0
  30. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/swiglu.py +0 -0
  31. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/ops/utils.py +0 -0
  32. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/__init__.py +0 -0
  33. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/auto_model.py +0 -0
  34. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/cross_entropy.py +0 -0
  35. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
  36. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/functional.py +0 -0
  37. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
  38. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
  39. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/geglu.py +0 -0
  40. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/group_norm.py +0 -0
  41. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/jsd.py +0 -0
  42. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/kl_div.py +0 -0
  43. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/layer_norm.py +0 -0
  44. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/__init__.py +0 -0
  45. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/gemma.py +0 -0
  46. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/gemma2.py +0 -0
  47. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/llama.py +0 -0
  48. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/mistral.py +0 -0
  49. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/mixtral.py +0 -0
  50. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/mllama.py +0 -0
  51. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/phi3.py +0 -0
  52. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/qwen2.py +0 -0
  53. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
  54. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/monkey_patch.py +0 -0
  55. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
  56. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/rms_norm.py +0 -0
  57. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/rope.py +0 -0
  58. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/swiglu.py +0 -0
  59. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/transformers/trainer_integration.py +0 -0
  60. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/triton/__init__.py +0 -0
  61. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel/triton/monkey_patch.py +0 -0
  62. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel_nightly.egg-info/SOURCES.txt +0 -0
  63. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
  64. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel_nightly.egg-info/requires.txt +0 -0
  65. {liger_kernel_nightly-0.4.2.dev20241121225747 → liger_kernel_nightly-0.4.2.dev20241122175637}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0
PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121225747
+Version: 0.4.2.dev20241122175637
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241121225747"
+version = "0.4.2.dev20241122175637"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
src/liger_kernel/chunked_loss/dpo_loss.py
@@ -9,15 +9,31 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(
+        chosen_logps,
+        rejected_logps,
+        ref_chosen_logps=None,
+        ref_rejected_logps=None,
+        beta=0.1,
+    ):
         """
         Compute DPO loss (Direct Preference Optimization).
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            ref_chosen_logps (torch.Tensor, optional): Reference log probabilities of chosen tokens. Shape: (batch_size,).
+            ref_rejected_logps (torch.Tensor, optional): Reference log probabilities of rejected tokens. Shape: (batch_size,).
             beta (float): Weight for the direct preference loss.
         """
-        logits_diff = beta * (chosen_logps - rejected_logps)
+        if ref_chosen_logps is None:
+            ref_chosen_logps = torch.tensor(0.0, device=chosen_logps.device)
+        if ref_rejected_logps is None:
+            ref_rejected_logps = torch.tensor(0.0, device=rejected_logps.device)
+
+        chosen_logratios = chosen_logps - ref_chosen_logps
+        rejected_logratios = rejected_logps - ref_rejected_logps
+
+        logits_diff = beta * (chosen_logratios - rejected_logratios)
         losses = -F.logsigmoid(logits_diff)
         return losses.sum()
 
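Note: with ref_chosen_logps/ref_rejected_logps supplied, the loss above is the standard reference-model DPO objective, -logsigmoid(beta * ((chosen - ref_chosen) - (rejected - ref_rejected))); when they are omitted it falls back to the previous reference-free behavior. A minimal stand-alone sketch of the same computation, with illustrative names that are not part of the package:

import torch
import torch.nn.functional as F

def dpo_loss_sketch(chosen_logps, rejected_logps,
                    ref_chosen_logps=None, ref_rejected_logps=None, beta=0.1):
    # Without a reference model, the log-ratios reduce to the raw policy log-probs.
    if ref_chosen_logps is None:
        ref_chosen_logps = torch.zeros_like(chosen_logps)
    if ref_rejected_logps is None:
        ref_rejected_logps = torch.zeros_like(rejected_logps)
    chosen_logratios = chosen_logps - ref_chosen_logps
    rejected_logratios = rejected_logps - ref_rejected_logps
    # Reward margin scaled by beta, pushed through -logsigmoid, summed over the batch.
    return -F.logsigmoid(beta * (chosen_logratios - rejected_logratios)).sum()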
@@ -28,10 +44,13 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         weight,
         target,
         bias=None,
+        ref_weight=None,
+        ref_bias=None,
         ignore_index=-100,
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=True,
     ):
         """
         Fused linear layer with DPO (Direct Preference Optimization) loss.
@@ -48,6 +67,9 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             compiled=compiled,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
         )
 
     @staticmethod
@@ -55,7 +77,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
-        return *grads, None, None, None, None
+        return *grads, None, None, None, None, None, None, None
 
 
 class LigerFusedLinearDPOLoss(torch.nn.Module):
@@ -69,26 +91,36 @@ class LigerFusedLinearDPOLoss(torch.nn.Module):
         beta: float = 0.1,
         compute_nll_loss: bool = True,
         compiled: bool = True,
+        use_ref_model: bool = False,
     ):
         """
         Args:
             ignore_index (int): Index to ignore in the loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute the NLL loss.
+            compiled (bool): Whether to use the torch compiled kernel.
+            use_ref_model (bool): Whether to use a reference model for the DPO loss.
         """
         super().__init__()
         self.ignore_index = ignore_index
         self.beta = beta
         self.compute_nll_loss = compute_nll_loss
         self.compiled = compiled
+        self.use_ref_model = use_ref_model
 
-    def forward(self, lin_weight, _input, target, bias=None):
+    def forward(
+        self, lin_weight, _input, target, bias=None, ref_weight=None, ref_bias=None
+    ):
         return LigerFusedLinearDPOFunction.apply(
             _input,
             lin_weight,
             target,
             bias,
+            ref_weight,
+            ref_bias,
             self.ignore_index,
             self.beta,
             self.compute_nll_loss,
             self.compiled,
+            self.use_ref_model,
         )
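A hypothetical smoke test of the extended signature, assuming the chunked preference convention that the chosen half of the batch is concatenated before the rejected half (all sizes below are illustrative, not taken from the package's tests):

import torch
from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss

B, T, H, V = 2, 8, 16, 32
loss_fn = LigerFusedLinearDPOLoss(beta=0.1, use_ref_model=True)

_input = torch.randn(2 * B, T, H, requires_grad=True)  # chosen rows first, then rejected
target = torch.randint(0, V, (2 * B, T))
lin_weight = torch.randn(V, H, requires_grad=True)
ref_weight = torch.randn(V, H)  # frozen reference head; no gradient is needed or produced

loss = loss_fn(lin_weight, _input, target, ref_weight=ref_weight)
loss.backward()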
src/liger_kernel/chunked_loss/fused_linear_preference.py
@@ -18,6 +18,42 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         """
         raise NotImplementedError("Preference loss function must be implemented.")
 
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        ignore_index=-100,
+        compute_nll_loss=True,
+    ):
+        len_chosen_chunk = target_chunk.shape[0] // 2
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+
+        chosen_nll_loss = 0.0
+        if compute_nll_loss:
+            chosen_nll_loss = F.nll_loss(
+                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
+                target_chunk[:len_chosen_chunk].view(-1),
+                reduction="sum",
+                ignore_index=ignore_index,
+            )
+
+        loss_mask = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask, target_chunk, 0)
+
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
+            -1
+        )
+        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+
+        chosen_logps = average_log_prob[:len_chosen_chunk]
+        rejected_logps = average_log_prob[len_chosen_chunk:]
+        return chosen_logps, rejected_logps, chosen_nll_loss
+
     @staticmethod
     def forward(
         ctx,
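The gather-and-mask section of the new chunk_forward is what turns token-level log-softmax output into the per-sequence average log-probabilities that the preference losses consume. The same pattern in isolation, with toy shapes chosen purely for illustration:

import torch
import torch.nn.functional as F

logits = torch.randn(4, 6, 10)             # (batch, seq_len, vocab)
target = torch.randint(0, 10, (4, 6))
target[:, -1] = -100                       # pretend the last token is padding

log_probs = F.log_softmax(logits.float(), dim=-1)
loss_mask = target != -100
label = torch.where(loss_mask, target, 0)  # swap ignored labels for any valid index
per_token_logps = log_probs.gather(-1, label.unsqueeze(-1)).squeeze(-1)
avg_logps = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)  # (batch,)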
@@ -32,6 +68,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         beta=0.1,
         compute_nll_loss=True,
         compiled=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
@@ -49,7 +88,11 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -61,7 +104,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         grad_bias = torch.zeros_like(bias) if bias is not None else None
         loss_acc = torch.zeros((), device=_input.device)
 
-        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         loss_func_to_call = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
@@ -70,6 +112,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
             **loss_kwargs,
         )
 
@@ -101,6 +146,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             accumulate_chunk = torch.compile(accumulate_chunk)
 
         len_chosen = target.shape[0] // 2
+        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
         _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0)
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
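Relocating the chunks computation below the torch.compile setup leaves the arithmetic unchanged: the batch holds the chosen and rejected halves, and each half is split into the same number of pieces. A small worked example (CHUNK_SIZE is defined elsewhere in this file; the value here is only for illustration):

import torch

CHUNK_SIZE = 1
batch = 8                                   # chosen + rejected combined
chunks = max(1, batch // (2 * CHUNK_SIZE))  # -> 4 chunks per half
pieces = torch.chunk(torch.randn(batch // 2, 16), chunks=chunks, dim=0)
assert len(pieces) == chunks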
@@ -159,6 +205,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
         **loss_kwargs,
     ):
         """
@@ -173,38 +222,41 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the odds ratio loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-        len_chosen_chunk = target_chunk.shape[0] // 2
-
-        logits_chunk = input_chunk @ weight.t()  # chunk_size x V
-        if bias is not None:
-            logits_chunk = logits_chunk + bias
-        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-
-        chosen_nll_loss = 0.0
-        if compute_nll_loss:
-            chosen_nll_loss = F.nll_loss(
-                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
-                reduction="sum",
+        chosen_logps, rejected_logps, chosen_nll_loss = (
+            LigerFusedLinearPreferenceBase.chunk_forward(
+                input_chunk,
+                weight,
+                target_chunk,
+                bias=bias,
                 ignore_index=ignore_index,
+                compute_nll_loss=compute_nll_loss,
             )
-            chosen_nll_loss = (
-                chosen_nll_loss
-                / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-            )
-
-        loss_mask = target_chunk != ignore_index
-        label_chunk = torch.where(loss_mask, target_chunk, 0)
-
-        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
-            -1
         )
-        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        chosen_nll_loss = (
+            chosen_nll_loss
+            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        )
 
-        chosen_logps = average_log_prob[:len_chosen_chunk]
-        rejected_logps = average_log_prob[len_chosen_chunk:]
+        if use_ref_model:
+            with torch.no_grad():
+                ref_chosen_logps, ref_rejected_logps, _ = (
+                    LigerFusedLinearPreferenceBase.chunk_forward(
+                        input_chunk,
+                        ref_weight,
+                        target_chunk,
+                        ref_bias,
+                        ignore_index=ignore_index,
+                        compute_nll_loss=False,
+                    )
+                )
+            loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
+            loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
 
         alignment_loss = preference_loss_fn(
             chosen_logps, rejected_logps, beta=beta, **loss_kwargs
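The refactored _compute_loss reuses chunk_forward for the frozen reference model, wraps that pass in torch.no_grad() so no autograd graph is built for it, and threads the resulting log-probs to preference_loss_fn through loss_kwargs. The control flow reduced to a skeleton (hypothetical helper names, illustrative only):

import torch

def policy_and_ref_pass(x, weight, ref_weight, use_ref_model=True):
    loss_kwargs = {}
    logps = torch.log_softmax(x @ weight.t(), dim=-1)  # differentiable policy pass
    if use_ref_model:
        with torch.no_grad():  # reference pass contributes values, never gradients
            loss_kwargs["ref_logps"] = torch.log_softmax(x @ ref_weight.t(), dim=-1)
    return logps, loss_kwargs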
src/liger_kernel/ops/cross_entropy.py
@@ -92,8 +92,8 @@ def liger_cross_entropy_kernel(
     # 3. [Online softmax] first pass: find max + sum
     m = float("-inf")  # m is the max value. use the notation from the paper
     d = 0.0  # d is the sum. use the notation from the paper
-    ori_X_y = tl.load(
-        X_ptr + y
+    ori_X_y = tl.load(X_ptr + y).cast(
+        tl.float32
     )  # we need to store the original value of X_y for the loss calculation
     if HAS_SOFTCAPPING:
         ori_X_y = softcap * tanh(ori_X_y / softcap)
@@ -106,8 +106,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             X_block = softcap * tanh(X_block / softcap)
         block_max = tl.max(X_block)
@@ -141,8 +144,11 @@ def liger_cross_entropy_kernel(
     for i in range(0, n_cols, BLOCK_SIZE):
         X_offsets = i + tl.arange(0, BLOCK_SIZE)
         X_block = tl.load(
-            X_ptr + X_offsets, mask=X_offsets < n_cols, other=float("-inf")
-        )
+            X_ptr + X_offsets,
+            mask=X_offsets < n_cols,
+            other=float("-inf"),
+            # Ensure float32 precision for softmax calculation
+        ).cast(tl.float32)
         if HAS_SOFTCAPPING:
             intermediate = tanh(X_block / softcap)
             X_block = softcap * intermediate
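Both softmax passes now upcast each loaded block to float32 inside the kernel, rather than relying on the Python wrapper to upcast the whole logits chunk (see the fused_linear_cross_entropy.py hunks below). A rough PyTorch illustration of why the upcast matters at large vocabulary sizes, not the Triton code path itself:

import torch

x = (torch.randn(8, 32000) * 20).to(torch.bfloat16)  # wide, low-precision logits
ref = torch.log_softmax(x.float(), dim=-1)           # fp32 math, as the kernel now does
low = torch.log_softmax(x, dim=-1).float()           # bf16 math throughout
print((ref - low).abs().max())                       # noticeably larger rounding error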
src/liger_kernel/ops/fused_linear_cross_entropy.py
@@ -26,7 +26,6 @@ def fused_linear_cross_entropy_forward(
     reduction="mean",
     softcap=None,
 ):
-    dtype = _input.dtype
     device = _input.device
 
     # inputs have shape: BT x H
@@ -74,9 +73,6 @@ def fused_linear_cross_entropy_forward(
         loss_1d_slice = loss_1d[start_idx:end_idx]  # chunk_size,
         n_non_ignore = (target_chunk != ignore_index).sum().item()
 
-        # when doing CE, use the upcasted precision
-        logits_chunk = logits_chunk.float()
-
         # ensure _input and target are contiguous
         logits_chunk = logits_chunk.contiguous()
         target_chunk = target_chunk.contiguous()
@@ -103,13 +99,6 @@ def fused_linear_cross_entropy_forward(
             num_warps=32 if not is_hip() else 16,
         )
 
-        # gradient of logits_chunk is computed in-place by the above triton kernel.
-        # Following HuggingFace model source code, we do the forward and backward
-        # w.r.t. logits in fp32 for numerical stability especially as the num classes (vocab size) is huge.
-        # (reference: https://github.com/huggingface/transformers/blob/v4.42.4/src/transformers/models/llama/modeling_llama.py#L1194)
-        # Propagating to lm_head's backward, we'll switch back to the original dtype.
-        logits_chunk = logits_chunk.to(dtype)
-
         # gradient of logits_chunk is computed in-place by the above triton kernel and is of shape: chunk_size x V
         # thus grad_input[start_idx: end_idx] should be of shape: chunk_size x H
         # additionally, since we are chunking the inputs, observe that the loss and gradients are calculated only
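Taken together with the cross_entropy.py change above, the fp32 upcast now happens inside the Triton kernel via .cast(tl.float32), so the Python-side logits_chunk.float() upcast and the logits_chunk.to(dtype) downcast, along with the now-unused dtype binding at the top of the function, can be dropped; the in-place gradient the kernel stores still comes back in the logits tensor's original dtype.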
src/liger_kernel_nightly.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241121225747
+Version: 0.4.2.dev20241122175637
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation