liger-kernel-nightly 0.4.2.dev20241203232039__py3-none-any.whl → 0.4.2.dev20241206180928__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
liger_kernel/chunked_loss/cpo_loss.py
@@ -9,7 +9,7 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
         Compute CPO loss.
         Args:
@@ -18,7 +18,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
             beta (float): Weight for the odds ratio loss.
         """
         logits = beta * (chosen_logps - rejected_logps)
-        loss = F.logsigmoid(logits).mean()
+        loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
         return loss
 
     @staticmethod
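
The normalization here changes from a per-chunk `.mean()` to `.sum() / (full_target.shape[0] // 2)`, i.e. a sum divided by the number of preference pairs in the *full* batch. Because the loss function is invoked once per chunk and the results accumulated, the chunk-wise sums now add up to a batch-level mean. A minimal sketch of why (made-up shapes, not the package's code):

```python
import torch
import torch.nn.functional as F

# Hypothetical shapes: 8 preference pairs, processed in 4 chunks.
beta = 0.1
chosen_logps, rejected_logps = torch.randn(8), torch.randn(8)
full_target = torch.zeros(16, 32)  # stand-in for the full concatenated target

num_pairs = full_target.shape[0] // 2
whole = F.logsigmoid(beta * (chosen_logps - rejected_logps)).sum() / num_pairs
chunked = sum(
    F.logsigmoid(beta * (c - r)).sum() / num_pairs
    for c, r in zip(chosen_logps.chunk(4), rejected_logps.chunk(4))
)
assert torch.allclose(whole, chunked)  # chunk-wise .mean() would not satisfy this
```
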
@@ -55,7 +55,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs

liger_kernel/chunked_loss/dpo_loss.py
@@ -12,6 +12,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
     def preference_loss_fn(
         chosen_logps,
         rejected_logps,
+        full_target,
         ref_chosen_logps=None,
         ref_rejected_logps=None,
         beta=0.1,
@@ -34,8 +35,8 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         rejected_logratios = rejected_logps - ref_rejected_logps
 
         logits_diff = beta * (chosen_logratios - rejected_logratios)
-        losses = -F.logsigmoid(logits_diff)
-        return losses.sum()
+        loss = -F.logsigmoid(logits_diff).sum() / (full_target.shape[0] // 2)
+        return loss
 
     @staticmethod
     def forward(
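
For reference, the quantity assembled in this hunk is the standard DPO objective, now averaged over the $B/2$ preference pairs of the full batch instead of summed:

$$
\mathcal{L}_{\mathrm{DPO}} = -\frac{1}{B/2}\sum_{i=1}^{B/2} \log\sigma\Big(\beta\big[(\log\pi_\theta(y_c^{(i)}) - \log\pi_{\mathrm{ref}}(y_c^{(i)})) - (\log\pi_\theta(y_r^{(i)}) - \log\pi_{\mathrm{ref}}(y_r^{(i)}))\big]\Big)
$$

where the two bracketed log-ratio terms are `chosen_logratios` and `rejected_logratios` in the code.
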
@@ -73,7 +74,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs

liger_kernel/chunked_loss/fused_linear_preference.py
@@ -52,7 +52,17 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
 
         chosen_logps = average_log_prob[:len_chosen_chunk]
         rejected_logps = average_log_prob[len_chosen_chunk:]
-        return chosen_logps, rejected_logps, chosen_nll_loss
+
+        chosen_logits = logits_chunk[:len_chosen_chunk]
+        rejected_logits = logits_chunk[len_chosen_chunk:]
+
+        return (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        )
 
     @staticmethod
     def forward(
@@ -103,6 +113,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         grad_rejected_inputs = []
         grad_bias = torch.zeros_like(bias) if bias is not None else None
         loss_acc = torch.zeros((), device=_input.device)
+        policy_chosen_logps = []
+        policy_rejected_logps = []
+        policy_chosen_logits_mean = torch.zeros((), device=_input.device)
+        policy_rejected_logits_mean = torch.zeros((), device=_input.device)
+        policy_nll_loss = torch.zeros((), device=_input.device)
+        aggregated_aux_outputs = []  # aggregated aux outputs from all chunks
 
         loss_func_to_call = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
@@ -118,32 +134,72 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             **loss_kwargs,
         )
 
+        def accumulate_helper(input_chunk, target_chunk):
+            if bias is not None:
+                return torch.func.grad_and_value(
+                    loss_func_to_call, argnums=(0, 1, 3), has_aux=True
+                )(input_chunk, weight, target_chunk, bias)
+            else:
+                return torch.func.grad_and_value(
+                    loss_func_to_call, argnums=(0, 1), has_aux=True
+                )(input_chunk, weight, target_chunk)
+
         def accumulate_chunk(input_chunk, target_chunk):
             if bias is not None:
                 (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (
                     chunk_loss,
-                    (chunk_or_loss, chunk_chosen_logps, chunk_rejected_logps),
-                ) = torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1, 3), has_aux=True
-                )(
-                    input_chunk, weight, target_chunk, bias
-                )
-                grad_bias.add_(chunk_grad_bias)
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = accumulate_helper(input_chunk, target_chunk)
+                grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
                 (chunk_grad_input, chunk_grad_weight), (
                     chunk_loss,
-                    (chunk_or_loss, chunk_chosen_logps, chunk_rejected_logps),
-                ) = torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1), has_aux=True
-                )(
-                    input_chunk, weight, target_chunk
-                )
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = accumulate_helper(input_chunk, target_chunk)
+
             grad_weight.add_(chunk_grad_weight)
             loss_acc.add_(chunk_loss)
+            policy_chosen_logps.append(chunk_chosen_logps)
+            policy_rejected_logps.append(chunk_rejected_logps)
+            policy_chosen_logits_mean.add_(chunk_chosen_logits_mean)
+            policy_rejected_logits_mean.add_(chunk_rejected_logits_mean)
+            policy_nll_loss.add_(chunk_nll_loss)
+
+            # Initialize storage for aux_outputs
+            if len(aggregated_aux_outputs) == 0:
+                for aux in aux_outputs:
+                    if aux.ndim == 0:
+                        aggregated_aux_outputs.append(
+                            torch.zeros((), device=aux.device)
+                        )
+                    else:
+                        aggregated_aux_outputs.append([])
+
+            # Process each aux_output
+            for i, aux in enumerate(aux_outputs):
+                if aux.ndim == 0:
+                    aggregated_aux_outputs[i].add_(aux)
+                else:
+                    aggregated_aux_outputs[i].append(aux)
+
             return chunk_grad_input
 
         if compiled:
-            accumulate_chunk = torch.compile(accumulate_chunk)
+            accumulate_helper = torch.compile(accumulate_helper)
 
         len_chosen = target.shape[0] // 2
         chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
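
The differentiable work is now isolated in `accumulate_helper`, and `torch.compile` is applied to that helper rather than to the whole of `accumulate_chunk`, whose Python-side list appends and in-place accumulation stay eager. A self-contained sketch of the `torch.func.grad_and_value(..., has_aux=True)` pattern the helper relies on, with a toy function and names that are not the package's:

```python
import torch

def toy_loss(x, w):
    loss = ((x @ w) ** 2).mean()
    aux = (x.detach().sum(), w.detach().sum())  # extra metrics, passed through untouched
    return loss, aux

x, w = torch.randn(4, 3), torch.randn(3, 2)
# has_aux=True -> returns (grads, (loss, aux)); argnums picks which inputs get grads
(grad_x, grad_w), (loss, aux) = torch.func.grad_and_value(
    toy_loss, argnums=(0, 1), has_aux=True
)(x, w)
```
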
@@ -168,6 +224,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 [chosen_target_chunk, rejected_target_chunk], dim=0
             )
 
+            # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation
+            torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(target_chunk, 1)
+            torch._dynamo.mark_dynamic(target, 1)
+
+            # accumulate loss, gradients, and metrics
             grad_input = accumulate_chunk(input_chunk, target_chunk)
 
             grad_chosen_inputs.append(grad_input[: chosen_target_chunk.shape[0]])
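
`torch._dynamo.mark_dynamic(t, dim)` (a private, subject-to-change API) declares a dimension symbolic up front, so the compiled helper is traced once with a dynamic sequence length instead of recompiling for every distinct chunk shape. A toy illustration, not the package's code:

```python
import torch

@torch.compile
def f(x):
    return x.sin().sum()

x = torch.randn(4, 128)
torch._dynamo.mark_dynamic(x, 1)  # treat dim 1 (sequence length) as symbolic
f(x)                    # compiles once with a dynamic dim 1
f(torch.randn(4, 256))  # different length, same compiled graph
```
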
@@ -175,21 +237,37 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
 
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
+        policy_chosen_logps = torch.cat(policy_chosen_logps, dim=0)
+        policy_rejected_logps = torch.cat(policy_rejected_logps, dim=0)
+
+        # Aggregate aux outputs lists into tensors
+        for i, aux in enumerate(aggregated_aux_outputs):
+            if isinstance(aux, list):
+                aggregated_aux_outputs[i] = torch.cat(aux, dim=0)
 
         ctx.save_for_backward(
             torch.cat(grad_inputs, dim=0),
             grad_weight,
             grad_bias,
         )
-        return loss_acc
+        return_vars = (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits_mean,
+            policy_rejected_logits_mean,
+            policy_nll_loss,
+        )
+        return loss_acc, (*return_vars, *aggregated_aux_outputs)
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
-        if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
-            grad_input = grad_input * grad_output
-            grad_weight = grad_weight * grad_output
-            grad_bias = grad_bias * grad_output if grad_bias is not None else None
+        if torch.ne(
+            grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)
+        ):
+            grad_input = grad_input * grad_output[0][0]
+            grad_weight = grad_weight * grad_output[0][0]
+            grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
 
         return grad_input, grad_weight, None, grad_bias, None, None, None
 
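Since `forward` now returns `(loss, metrics)` instead of a single tensor, autograd hands `backward` one incoming gradient per output, hence the variadic `*grad_output`. A toy sketch of the pattern (hypothetical `Toy` function, with simpler indexing than the code above):

```python
import torch

class Toy(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x.sum(), x.detach() * 2  # (loss, extra metrics)

    @staticmethod
    def backward(ctx, *grad_output):  # one incoming gradient per forward output
        (x,) = ctx.saved_tensors
        return grad_output[0] * torch.ones_like(x)  # grad_output[0] is d(loss)

loss, metrics = Toy.apply(torch.randn(3, requires_grad=True))
loss.backward()
```
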
@@ -228,40 +306,64 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-        chosen_logps, rejected_logps, chosen_nll_loss = (
-            LigerFusedLinearPreferenceBase.chunk_forward(
-                input_chunk,
-                weight,
-                target_chunk,
-                bias=bias,
-                ignore_index=ignore_index,
-                compute_nll_loss=compute_nll_loss,
-            )
+        (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        ) = LigerFusedLinearPreferenceBase.chunk_forward(
+            input_chunk,
+            weight,
+            target_chunk,
+            bias=bias,
+            ignore_index=ignore_index,
+            compute_nll_loss=compute_nll_loss,
         )
         chosen_nll_loss = (
             chosen_nll_loss
             / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
         )
+        chosen_logits_mean = chosen_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        rejected_logits_mean = rejected_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
 
         if use_ref_model:
             with torch.no_grad():
-                ref_chosen_logps, ref_rejected_logps, _ = (
-                    LigerFusedLinearPreferenceBase.chunk_forward(
-                        input_chunk,
-                        ref_weight,
-                        target_chunk,
-                        ref_bias,
-                        ignore_index=ignore_index,
-                        compute_nll_loss=False,
-                    )
+                (
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    ref_chosen_logits,
+                    ref_rejected_logits,
+                    ref_chosen_nll_loss,
+                ) = LigerFusedLinearPreferenceBase.chunk_forward(
+                    input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
 
-        alignment_loss = preference_loss_fn(
-            chosen_logps, rejected_logps, beta=beta, **loss_kwargs
+        preference_loss_outputs = preference_loss_fn(
+            chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs
         )
-        alignment_loss = alignment_loss / (full_target.shape[0] // 2)
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss, aux_outputs = preference_loss_outputs, []
 
-        loss = alpha * chosen_nll_loss - alignment_loss
-        return loss, (alignment_loss, chosen_logps, rejected_logps)
+        loss = alpha * chosen_nll_loss - preference_loss
+        return_vars = (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits_mean,
+            rejected_logits_mean,
+            chosen_nll_loss,
+        )
+        return loss, (*return_vars, *aux_outputs)
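
`_compute_loss` now tolerates both return styles from `preference_loss_fn`: a bare scalar (CPO, DPO, SimPO) or a `(loss, *aux)` tuple (ORPO). A hedged sketch of a custom loss function exploiting the tuple form; the name and the aux metric are illustrative, not part of the package:

```python
import torch.nn.functional as F

def my_preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
    logits = beta * (chosen_logps - rejected_logps)
    loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
    margin = (chosen_logps - rejected_logps).mean()  # illustrative aux metric
    return loss, margin  # returning just `loss` is equally valid
```
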

liger_kernel/chunked_loss/orpo_loss.py
@@ -9,7 +9,7 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
         Compute odds-ratio loss.
         Args:
@@ -22,7 +22,15 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             - torch.log1p(-torch.exp(rejected_logps))
         )
         ratio = F.logsigmoid(log_odds)
-        return beta * ratio.sum()
+        loss = beta * ratio.sum() / (full_target.shape[0] // 2)
+
+        chosen_rewards = beta * chosen_logps
+        rejected_rewards = beta * rejected_logps
+
+        log_odds_ratio = torch.sum(ratio) / (full_target.shape[0] // 2)
+        log_odds_chosen = torch.sum(log_odds) / (full_target.shape[0] // 2)
+
+        return loss, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen
 
     @staticmethod
     def forward(
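
For context, the new ORPO return values match the odds-ratio definitions: writing $p_c = e^{\log p_c}$ and $p_r = e^{\log p_r}$ for the average per-token probabilities of the chosen and rejected sequences,

$$
\mathrm{log\_odds} = \log\frac{p_c/(1-p_c)}{p_r/(1-p_r)}, \qquad
\mathrm{loss} = \frac{\beta}{B/2}\sum_i \log\sigma(\mathrm{log\_odds}_i), \qquad
r_c = \beta\log p_c, \quad r_r = \beta\log p_r,
$$

so the extra outputs are the per-sequence rewards plus the batch-averaged `log_odds_ratio` and `log_odds_chosen` that the new `LigerORPOTrainer` (added below) logs as metrics.
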
@@ -56,7 +64,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs

liger_kernel/chunked_loss/simpo_loss.py
@@ -9,7 +9,9 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1, gamma=0.5):
+    def preference_loss_fn(
+        chosen_logps, rejected_logps, full_target, beta=0.1, gamma=0.5
+    ):
         """
         Compute SimPO loss.
         Args:
@@ -19,7 +21,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
             gamma (float): The simpo gamma, margin term.
         """
         logits = beta * (chosen_logps - rejected_logps) - gamma
-        loss = F.logsigmoid(logits).mean()
+        loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
         return loss
 
     @staticmethod
@@ -58,7 +60,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
         # Return these gradients, followed by None for the remaining inputs

liger_kernel/transformers/__init__.py
@@ -22,6 +22,7 @@ from liger_kernel.transformers.monkey_patch import ( # noqa: F401
     apply_liger_kernel_to_qwen2,
     apply_liger_kernel_to_qwen2_vl,
 )
+from liger_kernel.transformers.orpo_trainer import LigerORPOTrainer  # noqa: F401
 from liger_kernel.transformers.rms_norm import LigerRMSNorm  # noqa: F401
 from liger_kernel.transformers.rope import liger_rotary_pos_emb  # noqa: F401
 from liger_kernel.transformers.swiglu import (  # noqa: F401

liger_kernel/transformers/orpo_trainer.py
@@ -0,0 +1,171 @@
+from typing import Any, Callable, Dict, List, Literal, Tuple, Union
+
+import torch
+import torch.nn as nn
+from torch.distributed.fsdp import FullyShardedDataParallel
+from trl.trainer import ORPOTrainer
+
+from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
+
+
+class _FSDPForwardRedirection:
+    """
+    Modified based on
+    https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648
+    Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and
+    post-forward can be properly executed around the method call.
+    This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
+    the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
+    GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
+    will not work because the first `nn.Embedding` layer is not independently wrapped as a FSDP module (because of
+    the transformer-based wrapping policy), and not calling it through the FSDP root module's forward will not
+    all-gather its parameters, thus resulting in "RuntimeError: 'weight' must be 2-D". Similarly, if we want to
+    call just the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
+    """
+
+    def __call__(
+        self,
+        wrapper_module: FullyShardedDataParallel,
+        method: Callable,
+        *args: Any,
+        **kwargs: Any,
+    ):
+        """Reroutes a method call through the `wrapper_module`'s `forward` method.
+        Args:
+            wrapper_module: The FSDP-wrapped module whose root forward should bracket the call.
+            method: The method to call on the wrapped module; its inputs get redirected through
+                the `wrapper_module`'s `forward` method.
+            *args: The positional arguments to `method`. They will get passed to a patched
+                `forward` method instead.
+            **kwargs: The keyword arguments to `method`. They will get passed to a patched
+                `forward` method instead.
+        """
+        assert isinstance(wrapper_module, FullyShardedDataParallel)
+        original_module = wrapper_module._fsdp_wrapped_module
+        original_forward = original_module.forward
+
+        def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
+            # Unpatch ourselves immediately before calling `method`,
+            # because `method` itself may want to call the real `forward`
+            original_module.forward = original_forward  # type: ignore[method-assign]
+            # Call the actual method e.g. `.training_step(...)`
+            out = method(*_args, **_kwargs)
+            return out
+
+        # Patch the original_module's forward so we can redirect the arguments back to the real method
+        original_module.forward = wrapped_forward  # type: ignore[method-assign]
+        wrapper_output = wrapper_module(*args, **kwargs)
+        return wrapper_output
+
+
+class LigerORPOTrainer(ORPOTrainer):
+    def concatenated_forward(
+        self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
+    ) -> Tuple[
+        torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+    ]:
+        """
+        Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+        We do this to avoid doing two forward passes, because it's faster for FSDP.
+        """
+        concatenated_batch = self.concatenated_inputs(
+            batch,
+            is_encoder_decoder=self.is_encoder_decoder,
+            label_pad_token_id=self.label_pad_token_id,
+            padding_value=self.padding_value,
+            device=self.accelerator.device,
+        )
+        model_kwargs = (
+            {
+                "decoder_input_ids": self._shift_right(
+                    concatenated_batch["concatenated_labels"]
+                ),
+            }
+            if self.is_encoder_decoder
+            else {}
+        )
+
+        if self.aux_loss_enabled:
+            model_kwargs["output_router_logits"] = True
+
+        if isinstance(model, FullyShardedDataParallel):
+            outputs = _FSDPForwardRedirection()(
+                model,
+                model._fsdp_wrapped_module.model,
+                concatenated_batch["concatenated_input_ids"],
+                attention_mask=concatenated_batch["concatenated_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+        else:
+            if isinstance(model, torch.nn.DataParallel):
+                model = model.module
+            outputs = model.model(
+                concatenated_batch["concatenated_input_ids"],
+                attention_mask=concatenated_batch["concatenated_attention_mask"],
+                use_cache=False,
+                **model_kwargs,
+            )
+
+        orpo_loss_fn = LigerFusedLinearORPOLoss(
+            ignore_index=self.label_pad_token_id, beta=self.beta
+        )
+
+        def orpo_partial(lm_head, last_hidden_state, concatenated_labels):
+            return orpo_loss_fn(
+                lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias
+            )
+
+        orpo_loss, aux_outputs = _FSDPForwardRedirection()(
+            model,
+            orpo_partial,
+            model.lm_head,
+            outputs.last_hidden_state,
+            concatenated_batch["concatenated_labels"],
+        )
+        return orpo_loss, aux_outputs
+
+    def get_batch_loss_metrics(
+        self,
+        model,
+        batch: Dict[str, Union[List, torch.LongTensor]],
+        train_eval: Literal["train", "eval"] = "train",
+    ):
+        """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
+        metrics = {}
+        loss, aux_outputs = self.concatenated_forward(model, batch)
+        (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits,
+            policy_rejected_logits,
+            policy_nll_loss,
+        ) = aux_outputs[:5]
+
+        chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = aux_outputs[
+            5:
+        ]
+
+        reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+        prefix = "eval_" if train_eval == "eval" else ""
+        metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean()
+        metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean()
+        metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean()
+        metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean()
+        metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().mean()
+        metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().mean()
+        metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().mean()
+        metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().mean()
+        metrics[f"{prefix}nll_loss"] = policy_nll_loss.detach().mean()
+        metrics[f"{prefix}log_odds_ratio"] = log_odds_ratio
+        metrics[f"{prefix}log_odds_chosen"] = log_odds_chosen
+        for k, v in metrics.items():
+            metrics[k] = v.item()
+
+        return loss, metrics
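
Since `LigerORPOTrainer` only overrides `concatenated_forward` and `get_batch_loss_metrics`, it should be constructed exactly like trl's `ORPOTrainer`. A hedged usage sketch; the model, dataset, and config values below are placeholders, not recommendations from the package:

```python
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import ORPOConfig

from liger_kernel.transformers import LigerORPOTrainer

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B-Instruct")
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

trainer = LigerORPOTrainer(
    model=model,
    args=ORPOConfig(output_dir="orpo-out", beta=0.1, max_length=1024),
    train_dataset=dataset,
    tokenizer=tokenizer,  # trl>=0.11; newer trl versions use `processing_class`
)
trainer.train()
```
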

liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/METADATA → liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/METADATA
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241203232039
+Version: 0.4.2.dev20241206180928
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
 Copyright 2024 LinkedIn Corporation
@@ -34,6 +34,7 @@ Requires-Dist: torch>=2.1.2
 Requires-Dist: triton>=2.3.1
 Provides-Extra: dev
 Requires-Dist: transformers>=4.44.2; extra == "dev"
+Requires-Dist: trl>=0.11.0; extra == "dev"
 Requires-Dist: matplotlib>=3.7.2; extra == "dev"
 Requires-Dist: flake8>=4.0.1.1; extra == "dev"
 Requires-Dist: black>=24.4.2; extra == "dev"

liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/RECORD → liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/RECORD
@@ -2,12 +2,12 @@ liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/env_report.py,sha256=jye8RvUkmhqaIshdeIpoUABoAu7FPKJUib4FnAfvkpw,1132
 liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
 liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMHj05g9Mm8,346
-liger_kernel/chunked_loss/cpo_loss.py,sha256=H2L6mNtU8RMJ17u4aMZ9FHEfBvg1Z_hliY5-jZxiDBM,3079
-liger_kernel/chunked_loss/dpo_loss.py,sha256=XcCGLVmTVdEX30q41XRXXK_c-MSumVJ-l4tQwobUv2w,4228
+liger_kernel/chunked_loss/cpo_loss.py,sha256=P20txjErLCSfSfToFT8pnuVPqFU4Bbybt3zRXfGEV-0,3122
+liger_kernel/chunked_loss/dpo_loss.py,sha256=NZyM4ju56MBVrUTI_7-jGMx5pWWDYzwx7ALoMj1G8Ec,4276
 liger_kernel/chunked_loss/functional.py,sha256=9Gr-YXIuEzEJkBUhDx3G2fuQayckLor7cC7svhmPML4,549
-liger_kernel/chunked_loss/fused_linear_preference.py,sha256=nkEpNWTHh5GmlnHOnGx5ifjigbOuUhc3hRy7RehXDbE,10838
-liger_kernel/chunked_loss/orpo_loss.py,sha256=DZ-_hm1twllBWujEV4M4-VDBkxMDBvoGqMGe-aGP1hA,3147
-liger_kernel/chunked_loss/simpo_loss.py,sha256=Jpl_U6DfxlzyHnlKN2i05K0vwz-ouiTmxlLGb439FwY,3328
+liger_kernel/chunked_loss/fused_linear_preference.py,sha256=nod7GcsTBV_L6RGRd55meB2D5KWzETVSnIz6xFbjVCc,14891
+liger_kernel/chunked_loss/orpo_loss.py,sha256=GGwc3pLGGJzb_P_C7IogcA1EfdAcM1uktfKPmI1z2jk,3523
+liger_kernel/chunked_loss/simpo_loss.py,sha256=FtURWbXGjoAKyiVYF7fkMv8Us7uk3UrSg21pWOFk11Y,3385
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/ops/cross_entropy.py,sha256=VqaYB9Zirc51eZ28OmjEZRrrV9UysRjS_vhIftB9sKo,15753
 liger_kernel/ops/fused_linear_cross_entropy.py,sha256=Tnw4gyAYVVdnCOqhOuLEzbUQ3goOTnoAfk3pqSIM5ac,9301
@@ -24,7 +24,7 @@ liger_kernel/ops/swiglu.py,sha256=Fwxtd76rhHKT9ShQAGca9RsnASplAVxtYKHmiT73_yA,29
 liger_kernel/ops/utils.py,sha256=_VQvd1PX5JXm5xaiBrk2gANp3qr4kM7qYG3ypkBwkMs,3850
 liger_kernel/ops/experimental/embedding.py,sha256=LYR66dB-jhvhtUjeV4PnNro-n77J1mdlmpSLSxB3Y6U,4186
 liger_kernel/ops/experimental/mm_int8int2.py,sha256=JpGVZCgRC6T8XMUJ_QbZRS2XU1bh0urIZphs5DTc1mY,13358
-liger_kernel/transformers/__init__.py,sha256=gia-eBxr7TLxU0GdDf8AfCY4WgDlFLqIGSt7EoQGsBA,1336
+liger_kernel/transformers/__init__.py,sha256=P5JR3fI-znhG92nRrFS2j0TIJTLhP-xD5dvEy4HP9ik,1418
 liger_kernel/transformers/auto_model.py,sha256=RMIwQHSiXoksXFTIqFZ4PLBgoqkxJJAT3q1Qh47bGN8,1552
 liger_kernel/transformers/cross_entropy.py,sha256=yEm_YQ7oa3_BzT3hdW6KrAslduhSqWcJQVNZZDcWCg4,1758
 liger_kernel/transformers/functional.py,sha256=sUBoU8Vb4pLpr9G6IdkRsToYgh-rCXL4OLYat7Tv_GU,4450
@@ -36,6 +36,7 @@ liger_kernel/transformers/jsd.py,sha256=sbr8DnKSYZJH9pv2rpmboNijYGpZKbhb2-WSGp5_
 liger_kernel/transformers/kl_div.py,sha256=qVhjBg6tjRyue5iZ3NFxo8uySY4JuIFJyv0IM_50F24,431
 liger_kernel/transformers/layer_norm.py,sha256=fd6o4kSHJWolQMWxh-l1qObfgL08ruNbUoBiANKX1ow,972
 liger_kernel/transformers/monkey_patch.py,sha256=Fk2v4GZQDJzfh3Cpc6BHNJbs_tungDyWmqS9nuG9Lc4,38406
+liger_kernel/transformers/orpo_trainer.py,sha256=mC8ePS-Oq-BrdM0lKpgSBLuYLqYsWxH_4Q2RnDthz5M,7643
 liger_kernel/transformers/qwen2vl_mrope.py,sha256=SfSQVwOe7ArrVfpmIdfZrdzCxmcj7V-YQp9zDu17-ao,1043
 liger_kernel/transformers/rms_norm.py,sha256=AHstklNIO1PLHjjCBU-TPuUD-Fl_pycJUTLlJNojbV8,1189
 liger_kernel/transformers/rope.py,sha256=m-ah8vZBYW8tfplTXCiAPMHJWlB1tdp_JPXJeWE-Boo,943
@@ -54,9 +55,9 @@ liger_kernel/transformers/model/qwen2.py,sha256=EyhSSzQOskGjSnCsKMZpd1s5IAIlHd5P
 liger_kernel/transformers/model/qwen2_vl.py,sha256=bIQe2bWiY--G84FhCD29Gdi64_qHP6vbcGsK6vKysQE,8547
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/METADATA,sha256=GD7sOJhLqOExLzto7Qhlp554vRb1JDkM_zULsZ8HhYU,21897
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.4.2.dev20241203232039.dist-info/RECORD,,
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/METADATA,sha256=WAAJkbzUZII072MIUuE8_72lDZNPoRac1suRYzGTrsg,21940
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.4.2.dev20241206180928.dist-info/RECORD,,