liger-kernel-nightly 0.4.2.dev20241204180758__tar.gz → 0.4.2.dev20241207011709__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {liger_kernel_nightly-0.4.2.dev20241204180758/src/liger_kernel_nightly.egg-info → liger_kernel_nightly-0.4.2.dev20241207011709}/PKG-INFO +5 -4
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/README.md +3 -3
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/pyproject.toml +2 -1
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/cpo_loss.py +3 -3
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/dpo_loss.py +4 -3
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/fused_linear_preference.py +146 -44
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/orpo_loss.py +11 -3
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/simpo_loss.py +5 -3
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/__init__.py +1 -0
- liger_kernel_nightly-0.4.2.dev20241207011709/src/liger_kernel/transformers/orpo_trainer.py +171 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709/src/liger_kernel_nightly.egg-info}/PKG-INFO +5 -4
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel_nightly.egg-info/SOURCES.txt +1 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel_nightly.egg-info/requires.txt +1 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/LICENSE +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/NOTICE +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/setup.cfg +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/__init__.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/__init__.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/chunked_loss/functional.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/env_report.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/__init__.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/experimental/mm_int8int2.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/geglu.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/group_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/jsd.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/kl_div.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/layer_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/rms_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/rope.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/swiglu.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/ops/utils.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/auto_model.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/cross_entropy.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/experimental/embedding.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/functional.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/fused_linear_cross_entropy.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/fused_linear_jsd.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/geglu.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/group_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/jsd.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/kl_div.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/layer_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/__init__.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/gemma.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/gemma2.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/llama.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/mistral.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/mixtral.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/mllama.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/phi3.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/qwen2.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/model/qwen2_vl.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/qwen2vl_mrope.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/rms_norm.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/rope.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/swiglu.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/transformers/trainer_integration.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/triton/__init__.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/triton/monkey_patch.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel/utils.py +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel_nightly.egg-info/dependency_links.txt +0 -0
- {liger_kernel_nightly-0.4.2.dev20241204180758 → liger_kernel_nightly-0.4.2.dev20241207011709}/src/liger_kernel_nightly.egg-info/top_level.txt +0 -0

PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241204180758
+Version: 0.4.2.dev20241207011709
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -36,6 +36,7 @@ Provides-Extra: transformers
 Requires-Dist: transformers~=4.0; extra == "transformers"
 Provides-Extra: dev
 Requires-Dist: transformers>=4.44.2; extra == "dev"
+Requires-Dist: trl>=0.11.0; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
 Requires-Dist: flake8>=4.0.1.1; extra == "dev"
 Requires-Dist: black>=24.4.2; extra == "dev"
@@ -55,7 +56,7 @@ Requires-Dist: seaborn; extra == "dev"
         <th style="padding: 10px;" colspan="2">Stable</th>
         <th style="padding: 10px;" colspan="2">Nightly</th>
         <th style="padding: 10px;">Discord</th>
-        <th style="padding: 10px;">
+        <th style="padding: 10px;">Build</th>
     </tr>
     <tr>
         <td style="padding: 10px;">
@@ -84,8 +85,8 @@ Requires-Dist: seaborn; extra == "dev"
         </a>
     </td>
     <td style="padding: 10px;">
-        <a href="https://
-        <img src="https://
+        <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml">
+        <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml/badge.svg?event=schedule" alt="Build">
         </a>
     </td>
 </tr>

README.md
@@ -8,7 +8,7 @@
         <th style="padding: 10px;" colspan="2">Stable</th>
         <th style="padding: 10px;" colspan="2">Nightly</th>
         <th style="padding: 10px;">Discord</th>
-        <th style="padding: 10px;">
+        <th style="padding: 10px;">Build</th>
     </tr>
     <tr>
         <td style="padding: 10px;">
@@ -37,8 +37,8 @@
         </a>
     </td>
     <td style="padding: 10px;">
-        <a href="https://
-        <img src="https://
+        <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml">
+        <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml/badge.svg?event=schedule" alt="Build">
         </a>
     </td>
 </tr>

pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "liger_kernel_nightly"
-version = "0.4.2.dev20241204180758"
+version = "0.4.2.dev20241207011709"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }
@@ -21,6 +21,7 @@ transformers = [
 
 dev = [
     "transformers>=4.44.2",
+    "trl>=0.11.0",
     "matplotlib>=3.7.2",
     "flake8>=4.0.1.1",
     "black>=24.4.2",

src/liger_kernel/chunked_loss/cpo_loss.py
@@ -9,7 +9,7 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
         Compute odds-ratio loss.
         Args:
@@ -18,7 +18,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
             beta (float): Weight for the odds ratio loss.
         """
         logits = beta * (chosen_logps - rejected_logps)
-        loss = F.logsigmoid(logits).
+        loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
         return loss
 
     @staticmethod
@@ -55,7 +55,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
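
The recurring change across these chunked losses is the new full_target argument: each preference_loss_fn now divides its summed log-sigmoid term by the number of preference pairs, i.e. half of the concatenated chosen-plus-rejected batch. A minimal standalone sketch of the CPO term above (illustrative shapes and random inputs, not the library's fused path):

import torch
import torch.nn.functional as F

def cpo_preference_loss(chosen_logps, rejected_logps, full_target, beta=0.1):
    logits = beta * (chosen_logps - rejected_logps)
    # full_target stacks chosen and rejected targets, so shape[0] // 2 is the pair count
    return F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)

pairs, seq_len = 4, 16
chosen_logps = torch.randn(pairs)
rejected_logps = torch.randn(pairs)
full_target = torch.zeros(2 * pairs, seq_len, dtype=torch.long)
print(cpo_preference_loss(chosen_logps, rejected_logps, full_target))  # scalar tensor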

src/liger_kernel/chunked_loss/dpo_loss.py
@@ -12,6 +12,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
     def preference_loss_fn(
         chosen_logps,
         rejected_logps,
+        full_target,
         ref_chosen_logps=None,
         ref_rejected_logps=None,
         beta=0.1,
@@ -34,8 +35,8 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         rejected_logratios = rejected_logps - ref_rejected_logps
 
         logits_diff = beta * (chosen_logratios - rejected_logratios)
-
-        return
+        loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+        return loss
 
     @staticmethod
     def forward(
@@ -73,7 +74,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
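
Read together with the hunk above, the completed DPO term is the standard Bradley-Terry objective on policy-versus-reference log-ratios. A hedged sketch (the zero fallback for absent reference log-probs is an assumption made for illustration, not something this file shows):

import torch
import torch.nn.functional as F

def dpo_preference_loss(chosen_logps, rejected_logps, full_target,
                        ref_chosen_logps=None, ref_rejected_logps=None, beta=0.1):
    # Assumption for the sketch: treat missing reference log-probs as zeros.
    if ref_chosen_logps is None:
        ref_chosen_logps = torch.zeros_like(chosen_logps)
    if ref_rejected_logps is None:
        ref_rejected_logps = torch.zeros_like(rejected_logps)
    chosen_logratios = chosen_logps - ref_chosen_logps
    rejected_logratios = rejected_logps - ref_rejected_logps
    logits_diff = beta * (chosen_logratios - rejected_logratios)
    # Negative log-sigmoid, normalized by the number of preference pairs
    return -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)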

src/liger_kernel/chunked_loss/fused_linear_preference.py
@@ -52,7 +52,17 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
 
         chosen_logps = average_log_prob[:len_chosen_chunk]
         rejected_logps = average_log_prob[len_chosen_chunk:]
-
+
+        chosen_logits = logits_chunk[:len_chosen_chunk]
+        rejected_logits = logits_chunk[len_chosen_chunk:]
+
+        return (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        )
 
     @staticmethod
     def forward(
@@ -103,6 +113,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         grad_rejected_inputs = []
         grad_bias = torch.zeros_like(bias) if bias is not None else None
         loss_acc = torch.zeros((), device=_input.device)
+        policy_chosen_logps = []
+        policy_rejected_logps = []
+        policy_chosen_logits_mean = torch.zeros((), device=_input.device)
+        policy_rejected_logits_mean = torch.zeros((), device=_input.device)
+        policy_nll_loss = torch.zeros((), device=_input.device)
+        aggregated_aux_outputs = []  # aggregated aux outputs from all chunks
 
         loss_func_to_call = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
@@ -118,32 +134,72 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             **loss_kwargs,
         )
 
+        def accumulate_helper(input_chunk, target_chunk):
+            if bias is not None:
+                return torch.func.grad_and_value(
+                    loss_func_to_call, argnums=(0, 1, 3), has_aux=True
+                )(input_chunk, weight, target_chunk, bias)
+            else:
+                return torch.func.grad_and_value(
+                    loss_func_to_call, argnums=(0, 1), has_aux=True
+                )(input_chunk, weight, target_chunk)
+
         def accumulate_chunk(input_chunk, target_chunk):
             if bias is not None:
                 (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (
                     chunk_loss,
-                    (
-
-
-
-
-
-
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = accumulate_helper(input_chunk, target_chunk)
+                grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
                 (chunk_grad_input, chunk_grad_weight), (
                     chunk_loss,
-                    (
-
-
-
-
-
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = accumulate_helper(input_chunk, target_chunk)
+
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            policy_chosen_logps.append(chunk_chosen_logps)
+            policy_rejected_logps.append(chunk_rejected_logps)
+            policy_chosen_logits_mean.add_(chunk_chosen_logits_mean)
+            policy_rejected_logits_mean.add_(chunk_rejected_logits_mean)
+            policy_nll_loss.add_(chunk_nll_loss)
+
+            # Initialize storage for aux_outputs
+            if len(aggregated_aux_outputs) == 0:
+                for aux in aux_outputs:
+                    if aux.ndim == 0:
+                        aggregated_aux_outputs.append(
+                            torch.zeros((), device=aux.device)
+                        )
+                    else:
+                        aggregated_aux_outputs.append([])
+
+            # Process each aux_output
+            for i, aux in enumerate(aux_outputs):
+                if aux.ndim == 0:
+                    aggregated_aux_outputs[i].add_(aux)
+                else:
+                    aggregated_aux_outputs[i].append(aux)
+
             return chunk_grad_input
 
         if compiled:
-
+            accumulate_helper = torch.compile(accumulate_helper)
 
         len_chosen = target.shape[0] // 2
         chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
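
The new accumulate_helper exists so that only the differentiation step is compiled: torch.func.grad_and_value(..., has_aux=True) requires the wrapped function to return (loss, aux) and hands back (grads, (loss, aux)) without differentiating through aux, which is how each chunk's metrics ride along with its gradients. A toy sketch of that contract (stand-in function, not the fused loss):

import torch

def loss_with_metrics(w, x):
    # Stand-in for _compute_loss: returns (scalar_loss, aux_metrics)
    pred = x @ w
    loss = (pred ** 2).mean()
    aux = (pred.detach().mean(),)  # reported back, but not differentiated
    return loss, aux

w = torch.randn(3)
x = torch.randn(5, 3)
grad_w, (loss, aux) = torch.func.grad_and_value(
    loss_with_metrics, argnums=0, has_aux=True
)(w, x)
print(grad_w.shape, loss, aux)  # gradient w.r.t. w only; aux passed through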
@@ -168,6 +224,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 [chosen_target_chunk, rejected_target_chunk], dim=0
             )
 
+            # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation
+            torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(target_chunk, 1)
+            torch._dynamo.mark_dynamic(target, 1)
+
+            # accumulate loss, gradients, and metrics
             grad_input = accumulate_chunk(input_chunk, target_chunk)
 
             grad_chosen_inputs.append(grad_input[: chosen_target_chunk.shape[0]])
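
Chunk sizes along the sequence dimension vary from call to call, so a compiled accumulate_helper would otherwise re-trace for every new shape; torch._dynamo.mark_dynamic(tensor, dim) declares that dimension symbolic up front. A hedged toy illustration of the same idea:

import torch

@torch.compile
def rowsum(x):
    return x.sum(dim=1)

x = torch.randn(4, 16)
# Mark dim 1 as dynamic before the first call so the trace uses a symbolic size.
torch._dynamo.mark_dynamic(x, 1)
print(rowsum(x))
print(rowsum(torch.randn(4, 32)))  # typically reuses the dynamic-shape graph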
@@ -175,21 +237,37 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
 
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
+        policy_chosen_logps = torch.cat(policy_chosen_logps, dim=0)
+        policy_rejected_logps = torch.cat(policy_rejected_logps, dim=0)
+
+        # Aggregate aux outputs lists into tensors
+        for i, aux in enumerate(aggregated_aux_outputs):
+            if isinstance(aux, list):
+                aggregated_aux_outputs[i] = torch.cat(aux, dim=0)
 
         ctx.save_for_backward(
             torch.cat(grad_inputs, dim=0),
             grad_weight,
             grad_bias,
         )
-
+        return_vars = (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits_mean,
+            policy_rejected_logits_mean,
+            policy_nll_loss,
+        )
+        return loss_acc, (*return_vars, *aggregated_aux_outputs)
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
-        if torch.ne(
-
-
-
+        if torch.ne(
+            grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)
+        ):
+            grad_input = grad_input * grad_output[0][0]
+            grad_weight = grad_weight * grad_output[0][0]
+            grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
 
         return grad_input, grad_weight, None, grad_bias, None, None, None
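
The signature change to backward(ctx, *grad_output) follows from the forward change: an autograd.Function receives one incoming gradient per forward output, so once forward returns (loss, metrics_tuple) the old single-argument backward would no longer match. A toy sketch of that rule (not the library's class):

import torch

class TwoOutputs(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        # Two outputs -> backward receives two gradient arguments
        return x * 2, x.detach() + 1

    @staticmethod
    def backward(ctx, *grad_output):
        grad_main, grad_aux = grad_output  # one grad per forward output
        return grad_main * 2  # only the first output is differentiable here

x = torch.ones(3, requires_grad=True)
y, aux = TwoOutputs.apply(x)
y.sum().backward()
print(x.grad)  # tensor([2., 2., 2.])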
@@ -228,40 +306,64 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-
-
-
-
-
-
-
-
-
+        (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        ) = LigerFusedLinearPreferenceBase.chunk_forward(
+            input_chunk,
+            weight,
+            target_chunk,
+            bias=bias,
+            ignore_index=ignore_index,
+            compute_nll_loss=compute_nll_loss,
         )
         chosen_nll_loss = (
             chosen_nll_loss
             / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
         )
+        chosen_logits_mean = chosen_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        rejected_logits_mean = rejected_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
 
         if use_ref_model:
             with torch.no_grad():
-
-
-
-
-
-
-
-
-
+                (
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    ref_chosen_logits,
+                    ref_rejected_logits,
+                    ref_chosen_nll_loss,
+                ) = LigerFusedLinearPreferenceBase.chunk_forward(
+                    input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
 
-
-            chosen_logps, rejected_logps, beta=beta, **loss_kwargs
+        preference_loss_outputs = preference_loss_fn(
+            chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs
         )
-
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss, aux_outputs = preference_loss_outputs, []
 
-        loss = alpha * chosen_nll_loss -
-
+        loss = alpha * chosen_nll_loss - preference_loss
+        return_vars = (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits_mean,
+            rejected_logits_mean,
+            chosen_nll_loss,
+        )
+        return loss, (*return_vars, *aux_outputs)
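
_compute_loss now tolerates both return styles from preference_loss_fn: a bare scalar (CPO, DPO, SimPO) or a (loss, *metrics) tuple (ORPO), forwarding any extra metrics up through forward. A hypothetical restatement of just that dispatch:

def unpack_preference_outputs(outputs):
    # Mirrors the isinstance(...) branch above; names are illustrative.
    if isinstance(outputs, tuple):
        preference_loss, *aux_outputs = outputs
    else:
        preference_loss, aux_outputs = outputs, []
    return preference_loss, aux_outputs

print(unpack_preference_outputs(0.5))              # (0.5, [])
print(unpack_preference_outputs((0.5, 1.0, 2.0)))  # (0.5, [1.0, 2.0])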

src/liger_kernel/chunked_loss/orpo_loss.py
@@ -9,7 +9,7 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
         Compute odds-ratio loss.
         Args:
@@ -22,7 +22,15 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             - torch.log1p(-torch.exp(rejected_logps))
         )
         ratio = F.logsigmoid(log_odds)
-
+        loss = beta * ratio.sum() / (full_target.shape[0] // 2)
+
+        chosen_rewards = beta * chosen_logps
+        rejected_rewards = beta * rejected_logps
+
+        log_odds_ratio = torch.sum(ratio) / (full_target.shape[0] // 2)
+        log_odds_chosen = torch.sum(log_odds) / (full_target.shape[0] // 2)
+
+        return loss, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen
 
     @staticmethod
     def forward(
@@ -56,7 +64,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
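
ORPO is the one loss here that uses the tuple return: besides the scalar loss it emits the reward and log-odds statistics that the new LigerORPOTrainer (added below) logs as metrics. As a restatement of the code rather than new math, writing $p_c = \exp(\text{chosen\_logps})$ and $p_r = \exp(\text{rejected\_logps})$, with $B$ the concatenated batch size:

$$\text{log\_odds} = \log\frac{p_c}{1-p_c} - \log\frac{p_r}{1-p_r}, \qquad \text{loss} = \frac{\beta}{B/2}\sum \log\sigma(\text{log\_odds})$$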

src/liger_kernel/chunked_loss/simpo_loss.py
@@ -9,7 +9,9 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(
+    def preference_loss_fn(
+        chosen_logps, rejected_logps, full_target, beta=0.1, gamma=0.5
+    ):
         """
         Compute odds-ratio loss.
         Args:
@@ -19,7 +21,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
             gamma (float): The simpo gamma, margin term.
         """
         logits = beta * (chosen_logps - rejected_logps) - gamma
-        loss = F.logsigmoid(logits).
+        loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
         return loss
 
     @staticmethod
@@ -58,7 +60,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs
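
SimPO's term differs from CPO's only by the margin gamma (the log-probs are already length-normalized upstream in chunk_forward's average_log_prob). Matching the code above, the quantity later subtracted from the NLL term in _compute_loss is

$$\frac{1}{B/2}\sum \log\sigma\big(\beta(\text{chosen\_logps} - \text{rejected\_logps}) - \gamma\big)$$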

src/liger_kernel/transformers/__init__.py
@@ -22,6 +22,7 @@ from liger_kernel.transformers.monkey_patch import (  # noqa: F401
     apply_liger_kernel_to_qwen2,
     apply_liger_kernel_to_qwen2_vl,
 )
+from liger_kernel.transformers.orpo_trainer import LigerORPOTrainer  # noqa: F401
 from liger_kernel.transformers.rms_norm import LigerRMSNorm  # noqa: F401
 from liger_kernel.transformers.rope import liger_rotary_pos_emb  # noqa: F401
 from liger_kernel.transformers.swiglu import (  # noqa: F401

src/liger_kernel/transformers/orpo_trainer.py
@@ -0,0 +1,171 @@
+from typing import Any, Callable, Dict, List, Literal, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.distributed.fsdp import FullyShardedDataParallel
+from trl.trainer import ORPOTrainer
+
+from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
+
+
+class _FSDPForwardRedirection:
+    """
+    Modified based on
+    https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648
+    Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and
+    post-forward can be properly executed around the method call.
+    This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
+    the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
+    GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
+    will not work because the first `nn.Emebedding` layer is not independently wrapped as a FSDP module (because of
+    the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather
+    its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
+    the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
+    """
+
+    def __call__(
+        self,
+        wrapper_module: FullyShardedDataParallel,
+        method: Callable,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        """Reroutes a method call through the `wrapper_module`'s `forward` method.
+        Args:
+            wrapper_module: The module that has `original_module` wrapped.
+            original_module: The module that was wrapped inside `wrapper_module`.
+            method_name: The name of the method that should be called on the `original_module` after inputs get
+                redirected through the `wrapper_module`'s `forward` method.
+            *args: The positional arguments to the method `method_name`. They will get passed to a patched
+                `forward` method instead.
+            **kwargs: The keyword arguments to the method `method_name`. They will get passed to a patched
+                `forward` method instead.
+        """
+        assert isinstance(wrapper_module, FullyShardedDataParallel)
+        original_module = wrapper_module._fsdp_wrapped_module
+        original_forward = original_module.forward
+
+        def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
+            # Unpatch ourselves immediately before calling the method `method_name`
+            # because itself may want to call the real `forward`
+            original_module.forward = original_forward  # type: ignore[method-assign]
+            # Call the actual method e.g. `.training_step(...)`
+            out = method(*_args, **_kwargs)
+            return out
+
+        # Patch the original_module's forward so we can redirect the arguments back to the real method
+        original_module.forward = wrapped_forward  # type: ignore[method-assign]
+        wrapper_output = wrapper_module(*args, **kwargs)
+        return wrapper_output
+
+
+class LigerORPOTrainer(ORPOTrainer):
+    def concatenated_forward(
+        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
+    ) -> Tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """
+        Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        # if self.accelerator.is_main_process:
+        #     import pdb; pdb.set_trace()
+        # torch.distributed.barrier()
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(
+                    concatenated_batch["concatenated_labels"]
+                ),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        if isinstance(model, FullyShardedDataParallel):
+            outputs = _FSDPForwardRedirection()(
+                model,
+                model._fsdp_wrapped_module.model,
+                concatenated_batch["concatenated_input_ids"],
+                attention_mask=concatenated_batch["concatenated_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+        else:
+            if isinstance(model, torch.nn.DataParallel):
+                model = model.module
+            outputs = model.model(
+                concatenated_batch["concatenated_input_ids"],
+                attention_mask=concatenated_batch["concatenated_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+
+        orpo_loss_fn = LigerFusedLinearORPOLoss(
+            ignore_index=self.label_pad_token_id, beta=self.beta
+        )
+
+        def orpo_partial(lm_head, last_hidden_state, concatenated_labels):
+            return orpo_loss_fn(
+                lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias
+            )
+
+        orpo_loss, aux_outputs = _FSDPForwardRedirection()(
+            model,
+            orpo_partial,
+            model.lm_head,
+            outputs.last_hidden_state,
+            concatenated_batch["concatenated_labels"],
+        )
+        return orpo_loss, aux_outputs
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: Dict[str, Union[List, torch.LongTensor]],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        loss, aux_outputs = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = aux_outputs[:5]
+
+        # return loss, metrics
+        chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = aux_outputs[
+            5:
+        ]
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean()
+        metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean()
+        metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean()
+        metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean()
+        metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().mean()
+        metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().mean()
+        metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().mean()
+        metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().mean()
+        metrics[f"{prefix}nll_loss"] = policy_nll_loss.detach().mean()
+        metrics[f"{prefix}log_odds_ratio"] = log_odds_ratio
+        metrics[f"{prefix}log_odds_chosen"] = log_odds_chosen
+        for k, v in metrics.items():
+            metrics[k] = v.item()
+
+        return loss, metrics
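
A hedged usage sketch for the new trainer: it drops in where trl's ORPOTrainer would go, with the fused linear-plus-ORPO loss replacing full-logit materialization. The model and dataset names are placeholders, and the ORPOConfig fields follow trl's documented API rather than anything shown in this diff:

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import ORPOConfig

from liger_kernel.transformers import LigerORPOTrainer

# Placeholder model/dataset; requires trl>=0.11.0, the new dev dependency above.
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-1B-Instruct", torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
train_dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = LigerORPOTrainer(
    model=model,
    args=ORPOConfig(output_dir="orpo-liger", beta=0.1, max_length=1024),
    train_dataset=train_dataset,
    tokenizer=tokenizer,
)
trainer.train()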

src/liger_kernel_nightly.egg-info/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241204180758
+Version: 0.4.2.dev20241207011709
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -36,6 +36,7 @@ Provides-Extra: transformers
 Requires-Dist: transformers~=4.0; extra == "transformers"
 Provides-Extra: dev
 Requires-Dist: transformers>=4.44.2; extra == "dev"
+Requires-Dist: trl>=0.11.0; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
 Requires-Dist: flake8>=4.0.1.1; extra == "dev"
 Requires-Dist: black>=24.4.2; extra == "dev"
@@ -55,7 +56,7 @@ Requires-Dist: seaborn; extra == "dev"
         <th style="padding: 10px;" colspan="2">Stable</th>
         <th style="padding: 10px;" colspan="2">Nightly</th>
         <th style="padding: 10px;">Discord</th>
-        <th style="padding: 10px;">
+        <th style="padding: 10px;">Build</th>
     </tr>
     <tr>
         <td style="padding: 10px;">
@@ -84,8 +85,8 @@ Requires-Dist: seaborn; extra == "dev"
         </a>
     </td>
     <td style="padding: 10px;">
-        <a href="https://
-        <img src="https://
+        <a href="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml">
+        <img src="https://github.com/linkedin/Liger-Kernel/actions/workflows/ci.yml/badge.svg?event=schedule" alt="Build">
         </a>
     </td>
 </tr>

src/liger_kernel_nightly.egg-info/SOURCES.txt
@@ -40,6 +40,7 @@ src/liger_kernel/transformers/jsd.py
 src/liger_kernel/transformers/kl_div.py
 src/liger_kernel/transformers/layer_norm.py
 src/liger_kernel/transformers/monkey_patch.py
+src/liger_kernel/transformers/orpo_trainer.py
 src/liger_kernel/transformers/qwen2vl_mrope.py
 src/liger_kernel/transformers/rms_norm.py
 src/liger_kernel/transformers/rope.py