PyPI - liger-kernel-nightly - Versions diffs - 0.4.2.dev20241209195823__py3-none-any.whl → 0.4.2.dev20241209234352__py3-none-any.whl - Mend

liger-kernel-nightly 0.4.2.dev20241209195823__py3-none-any.whl → 0.4.2.dev20241209234352__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

liger_kernel/chunked_loss/cpo_loss.py CHANGED Viewed

@@ -11,11 +11,25 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
-        Compute odds-ratio loss.
+        Paper: https://arxiv.org/pdf/2401.08417
+        Formula:
+        L(π_θ; U) = -E_(x,y_w,y_l)~D[log σ(β log π_θ(y_w|x) - β log π_θ(y_l|x))]
+        Where:
+        - π_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - σ: Sigmoid function
+        - β: Temperature parameter
+        - E: Expected value over the dataset D
+        - D: Dataset of preferences
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
-            beta (float): Weight for the odds ratio loss.
+            full_target (torch.Tensor): Non chunked full target tensor
+            beta (float): Weight for the CPO loss
         """
         logits = beta * (chosen_logps - rejected_logps)
         loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
@@ -34,12 +48,6 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
         compute_nll_loss=True,
         compiled=True,
     ):
-        """
-        Fused linear layer with CPO (Odds-Ratio Preference Optimization) loss.
-        Handles both the forward and backward pass of the final linear layer with CPO loss.
-        Inspired from LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989) which fuses final linear layer and CE loss.
-        """
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
             _input,
@@ -56,9 +64,7 @@ class LigerFusedLinearCPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def backward(ctx, *grad_output):
-        # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        # Return these gradients, followed by None for the remaining inputs
         return *grads, None, None, None, None, None

liger_kernel/chunked_loss/dpo_loss.py CHANGED Viewed

@@ -18,14 +18,28 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         beta=0.1,
     ):
         """
-        Compute DPO loss (Direct Preference Optimization).
+        Paper: https://arxiv.org/pdf/2305.18290
+        Formula:
+        L_DPO = -E[ log_sigmoid( β * (log(π(y_w|x)/π_ref(y_w|x)) - log(π(y_l|x)/π_ref(y_l|x))) ) ]
+        Where:
+        - π(y|x): Policy (model) probability
+        - π_ref(y|x): Reference model probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - β: Weight for the direct preference loss
+        - E: Expected value over the dataset
         Args:
-            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
-            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
-            ref_chosen_logps (torch.Tensor, optional): Reference log probabilities of chosen tokens. Shape: (batch_size,).
-            ref_rejected_logps (torch.Tensor, optional): Reference log probabilities of rejected tokens. Shape: (batch_size,).
-            beta (float): Weight for the direct preference loss.
+            chosen_logps: Log probabilities of chosen tokens (batch_size,)
+            rejected_logps: Log probabilities of rejected tokens (batch_size,)
+            full_target: Non chunked full target tensor
+            ref_chosen_logps: Reference log probs of chosen tokens (batch_size,)
+            ref_rejected_logps: Reference log probs of rejected tokens (batch_size,)
+            beta: Weight for the direct preference loss
         """
         if ref_chosen_logps is None:
             ref_chosen_logps = torch.tensor(0.0, device=chosen_logps.device)
         if ref_rejected_logps is None:
@@ -53,10 +67,6 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
         compiled=True,
         use_ref_model=True,
     ):
-        """
-        Fused linear layer with DPO (Direct Preference Optimization) loss.
-        Handles both the forward and backward pass of the final linear layer with DPO loss.
-        """
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
             _input=_input,
@@ -75,9 +85,7 @@ class LigerFusedLinearDPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def backward(ctx, *grad_output):
-        # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        # Return these gradients, followed by None for the remaining inputs
         return *grads, None, None, None, None, None, None, None

liger_kernel/chunked_loss/fused_linear_preference.py CHANGED Viewed

@@ -8,159 +8,12 @@ from torch.nn import functional as F
 class LigerFusedLinearPreferenceBase(torch.autograd.Function):
     @abstractmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(*args, **kwargs):
         """
-        Compute preference loss.
-        Args:
-            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
-            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
-            beta (float): Weight for the odds ratio loss.
+        To be extended by subclasses.
         """
         raise NotImplementedError("Preference loss function must be implemented.")
-    @staticmethod
-    def chunk_forward(
-        input_chunk,
-        weight,
-        target_chunk,
-        bias=None,
-        ignore_index=-100,
-        compute_nll_loss=True,
-    ):
-        len_chosen_chunk = target_chunk.shape[0] // 2
-        logits_chunk = input_chunk @ weight.t()
-        if bias is not None:
-            logits_chunk = logits_chunk + bias
-        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-        chosen_nll_loss = 0.0
-        if compute_nll_loss:
-            chosen_nll_loss = F.nll_loss(
-                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
-                reduction="sum",
-                ignore_index=ignore_index,
-            )
-        loss_mask = target_chunk != ignore_index
-        label_chunk = torch.where(loss_mask, target_chunk, 0)
-        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
-            -1
-        )
-        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
-        chosen_logps = average_log_prob[:len_chosen_chunk]
-        rejected_logps = average_log_prob[len_chosen_chunk:]
-        chosen_logits = logits_chunk[:len_chosen_chunk]
-        rejected_logits = logits_chunk[len_chosen_chunk:]
-        return (
-            chosen_logps,
-            rejected_logps,
-            chosen_logits,
-            rejected_logits,
-            chosen_nll_loss,
-        )
-    @staticmethod
-    def _compute_loss(
-        input_chunk,
-        weight,
-        target_chunk,
-        bias=None,
-        preference_loss_fn=None,
-        full_target=None,
-        ignore_index=-100,
-        alpha=1.0,
-        beta=0.1,
-        compute_nll_loss=True,
-        use_ref_model=False,
-        ref_weight=None,
-        ref_bias=None,
-        **loss_kwargs,
-    ):
-        """
-        Compute the total loss for a chunk of input and target, while using an alignment/preference loss function.
-        Args:
-            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
-            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size).
-            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
-            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length).
-            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
-            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
-            ignore_index (int): Index to ignore for loss computation.
-            alpha (float): Weight for the NLL loss.
-            beta (float): Weight for the odds ratio loss.
-            compute_nll_loss (bool): Whether to compute NLL loss.
-            use_ref_model (bool): Whether to use a reference model for the alignment loss.
-            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
-            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
-            loss_kwargs (dict): Additional arguments for the loss function.
-        """
-        (
-            chosen_logps,
-            rejected_logps,
-            chosen_logits,
-            rejected_logits,
-            chosen_nll_loss,
-        ) = LigerFusedLinearPreferenceBase.chunk_forward(
-            input_chunk,
-            weight,
-            target_chunk,
-            bias=bias,
-            ignore_index=ignore_index,
-            compute_nll_loss=compute_nll_loss,
-        )
-        chosen_nll_loss = (
-            chosen_nll_loss
-            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-        )
-        chosen_logits_mean = chosen_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
-        rejected_logits_mean = rejected_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
-        if use_ref_model:
-            with torch.no_grad():
-                (
-                    ref_chosen_logps,
-                    ref_rejected_logps,
-                    ref_chosen_logits,
-                    ref_rejected_logits,
-                    ref_chosen_nll_loss,
-                ) = LigerFusedLinearPreferenceBase.chunk_forward(
-                    input_chunk,
-                    ref_weight,
-                    target_chunk,
-                    ref_bias,
-                    ignore_index=ignore_index,
-                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
-                )
-            loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
-            loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
-        preference_loss_outputs = preference_loss_fn(
-            chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs
-        )
-        if isinstance(preference_loss_outputs, tuple):
-            preference_loss, *aux_outputs = preference_loss_outputs
-        else:
-            preference_loss, aux_outputs = preference_loss_outputs, []
-        loss = alpha * chosen_nll_loss - preference_loss
-        return_vars = (
-            chosen_logps,
-            rejected_logps,
-            chosen_logits_mean,
-            rejected_logits_mean,
-            chosen_nll_loss,
-        )
-        return loss, (*return_vars, *aux_outputs)
     @staticmethod
     def forward(
         ctx,
@@ -176,6 +29,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         compute_nll_loss=True,
         compiled=True,
         use_ref_model=False,
+        # TODO: ref input
         ref_weight=None,
         ref_bias=None,
         **loss_kwargs,
@@ -184,6 +38,14 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         Base class for fused linear layer with preference loss.
         Expects _input to be stacked with chosen and rejected inputs on the batch dimension.
+        The mental model is:
+        forward()
+        ├── Loop over chunks
+            └── compute_loss()
+                ├── chunk_forward()  # Compute logits and log probs
+                └── prefer_loss()    # Calculate preference loss
         Args:
             _input (torch.Tensor): Input tensor. Shape: (batch_size, seq_len, hidden_size).
             weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
@@ -191,10 +53,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
             loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
             chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs).
-            compute_nll_loss (bool): Whether to compute NLL loss.
             ignore_index (int): Index to ignore for loss computation.
             alpha (float): Weight for the NLL loss.
-            beta (float): Weight for the odds ratio loss.
+            beta (float): Weight for the preference loss.
             compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
@@ -205,11 +66,16 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
         CHUNK_SIZE = chunk_size
+        # Gradients to be accumulated
         grad_weight = torch.zeros_like(weight)
         grad_chosen_inputs = []
         grad_rejected_inputs = []
         grad_bias = torch.zeros_like(bias) if bias is not None else None
+        # Loss to be accumulated
         loss_acc = torch.zeros((), device=_input.device)
+        # Metrics to be recorded
         policy_chosen_logps = []
         policy_rejected_logps = []
         policy_chosen_logits_mean = torch.zeros((), device=_input.device)
@@ -217,7 +83,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         policy_nll_loss = torch.zeros((), device=_input.device)
         aggregated_aux_outputs = []  # aggregated aux outputs from all chunks
-        loss_func_to_call = partial(
+        compute_loss = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
             ignore_index=ignore_index,
@@ -231,14 +97,17 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             **loss_kwargs,
         )
-        def accumulate_core(input_chunk, target_chunk):
+        def fused_fwd_bwd(input_chunk, target_chunk):
+            """
+            Fused forward and backward pass for a chunk of input and target.
+            """
             if bias is not None:
                 return torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1, 3), has_aux=True
+                    compute_loss, argnums=(0, 1, 3), has_aux=True
                 )(input_chunk, weight, target_chunk, bias)
             else:
                 return torch.func.grad_and_value(
-                    loss_func_to_call, argnums=(0, 1), has_aux=True
+                    compute_loss, argnums=(0, 1), has_aux=True
                 )(input_chunk, weight, target_chunk)
         def accumulate_chunk(input_chunk, target_chunk):
@@ -253,7 +122,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                         chunk_nll_loss,
                         *aux_outputs,
                     ),
-                ) = accumulate_core(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk)
                 grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
                 (chunk_grad_input, chunk_grad_weight), (
@@ -266,16 +135,26 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                         chunk_nll_loss,
                         *aux_outputs,
                     ),
-                ) = accumulate_core(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk)
+            # Accumulate gradients
             grad_weight.add_(chunk_grad_weight)
+            grad_chosen_inputs.append(chunk_grad_input[: chosen_target_chunk.shape[0]])
+            grad_rejected_inputs.append(
+                chunk_grad_input[chosen_target_chunk.shape[0] :]
+            )
+            # Accumulate loss
             loss_acc.add_(chunk_loss)
+            # Accumulate metrics
             policy_chosen_logps.append(chunk_chosen_logps)
             policy_rejected_logps.append(chunk_rejected_logps)
             policy_chosen_logits_mean.add_(chunk_chosen_logits_mean)
             policy_rejected_logits_mean.add_(chunk_rejected_logits_mean)
             policy_nll_loss.add_(chunk_nll_loss)
+            # aux_outputs
             # Initialize storage for aux_outputs
             if len(aggregated_aux_outputs) == 0:
                 for aux in aux_outputs:
@@ -293,10 +172,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 else:
                     aggregated_aux_outputs[i].append(aux)
-            return chunk_grad_input
         if compiled:
-            accumulate_core = torch.compile(accumulate_core)
+            fused_fwd_bwd = torch.compile(fused_fwd_bwd)
         len_chosen = target.shape[0] // 2
         chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
@@ -327,10 +204,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             torch._dynamo.mark_dynamic(target, 1)
             # accumulate loss, gradients, and metrics
-            grad_input = accumulate_chunk(input_chunk, target_chunk)
-            grad_chosen_inputs.append(grad_input[: chosen_target_chunk.shape[0]])
-            grad_rejected_inputs.append(grad_input[chosen_target_chunk.shape[0] :])
+            accumulate_chunk(input_chunk, target_chunk)
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
@@ -367,3 +241,146 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
         return grad_input, grad_weight, None, grad_bias, None, None, None
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        ignore_index=-100,
+        compute_nll_loss=True,
+    ):
+        len_chosen_chunk = target_chunk.shape[0] // 2
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+        chosen_nll_loss = 0.0
+        if compute_nll_loss:
+            chosen_nll_loss = F.nll_loss(
+                log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
+                target_chunk[:len_chosen_chunk].view(-1),
+                reduction="sum",
+                ignore_index=ignore_index,
+            )
+        loss_mask = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask, target_chunk, 0)
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
+            -1
+        )
+        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        chosen_logps = average_log_prob[:len_chosen_chunk]
+        rejected_logps = average_log_prob[len_chosen_chunk:]
+        chosen_logits = logits_chunk[:len_chosen_chunk]
+        rejected_logits = logits_chunk[len_chosen_chunk:]
+        return (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        )
+    @staticmethod
+    def _compute_loss(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        preference_loss_fn=None,
+        full_target=None,
+        ignore_index=-100,
+        alpha=1.0,
+        beta=0.1,
+        compute_nll_loss=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
+        **loss_kwargs,
+    ):
+        """
+        Compute the total loss for a chunk of input and target, while using an alignment/preference loss function.
+        Args:
+            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size).
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
+            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length).
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
+            ignore_index (int): Index to ignore for loss computation.
+            alpha (float): Weight for the NLL loss.
+            beta (float): Weight for the preference loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            loss_kwargs (dict): Additional arguments for the loss function.
+        """
+        (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        ) = LigerFusedLinearPreferenceBase.chunk_forward(
+            input_chunk,
+            weight,
+            target_chunk,
+            bias=bias,
+            ignore_index=ignore_index,
+            compute_nll_loss=compute_nll_loss,
+        )
+        chosen_nll_loss = (
+            chosen_nll_loss
+            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        )
+        chosen_logits_mean = chosen_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        rejected_logits_mean = rejected_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        if use_ref_model:
+            with torch.no_grad():
+                (
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    ref_chosen_logits,
+                    ref_rejected_logits,
+                    ref_chosen_nll_loss,
+                ) = LigerFusedLinearPreferenceBase.chunk_forward(
+                    input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                    compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                )
+            loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
+            loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
+        preference_loss_outputs = preference_loss_fn(
+            chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs
+        )
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss, aux_outputs = preference_loss_outputs, []
+        loss = alpha * chosen_nll_loss - preference_loss
+        return_vars = (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits_mean,
+            rejected_logits_mean,
+            chosen_nll_loss,
+        )
+        return loss, (*return_vars, *aux_outputs)

liger_kernel/chunked_loss/orpo_loss.py CHANGED Viewed

@@ -11,10 +11,24 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
-        Compute odds-ratio loss.
+        Paper: https://arxiv.org/pdf/2403.07691
+        Formula:
+        Compute odds-ratio loss: L_OR = -log(σ(log(odds_θ(y_w|x) / odds_θ(y_l|x))))
+        where odds_θ(y|x) = P_θ(y|x) / (1 - P_θ(y|x))
+        Where:
+        - P_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - σ: Sigmoid function
+        - β: Weight for the odds ratio loss
+        - odds_θ: Odds function for the policy
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            full_target (torch.Tensor): Non chunked full target tensor
             beta (float): Weight for the odds ratio loss.
         """
         log_odds = (chosen_logps - rejected_logps) - (
@@ -44,12 +58,6 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         compute_nll_loss=True,
         compiled=True,
     ):
-        """
-        Fused linear layer with ORPO (Odds-Ratio Preference Optimization) loss.
-        Handles both the forward and backward pass of the final linear layer with ORPO loss.
-        Inspired from LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989) which fuses final linear layer and CE loss.
-        """
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
             _input=_input,
@@ -65,9 +73,7 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def backward(ctx, *grad_output):
-        # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        # Return these gradients, followed by None for the remaining inputs
         return *grads, None, None, None, None

liger_kernel/chunked_loss/simpo_loss.py CHANGED Viewed

@@ -13,12 +13,26 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
         chosen_logps, rejected_logps, full_target, beta=0.1, gamma=0.5
     ):
         """
-        Compute odds-ratio loss.
+        Paper: https://arxiv.org/pdf/2405.14734
+        Formula:
+        L_SimPO(π_θ) = -E [log σ(β/|y_w| log π_θ(y_w|x) - β/|y_l| log π_θ(y_l|x) - γ)]
+        Where:
+        - π_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - |y_w|, |y_l|: Sequence lengths
+        - σ: Sigmoid function
+        - β: beta weight
+        - γ: gemma margin term
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
-            beta (float): Weight for the odds ratio loss.
-            gamma (float): The simpo gamma, margin term.
+            full_target: Non chunked full target tensor
+            beta (float): beta weight
+            gamma (float): gemma margin term
         """
         logits = beta * (chosen_logps - rejected_logps) - gamma
         loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
@@ -38,12 +52,6 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
         compiled=True,
         gamma=0.5,
     ):
-        """
-        Fused linear layer with SimPO (Simple Preference Optimization) loss. https://arxiv.org/pdf/2405.14734
-        Handles both the forward and backward pass of the final linear layer with SimPO loss.
-        Inspired from LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989) which fuses final linear layer and CE loss.
-        """
         return LigerFusedLinearPreferenceBase.forward(
             ctx,
             _input,
@@ -61,9 +69,7 @@ class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
     @staticmethod
     def backward(ctx, *grad_output):
-        # Get gradients for _input, weight, bias, and target from the base class
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        # Return these gradients, followed by None for the remaining inputs
         return *grads, None, None, None, None, None, None

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.4.2.dev20241209195823
+Version: 0.4.2.dev20241209234352
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/RECORD RENAMED Viewed

@@ -2,13 +2,13 @@ liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/env_report.py,sha256=1ETxx6HW4bKMK5aa5xaFzEmx0Ibc_kNryL_gXBVyyrI,1374
 liger_kernel/utils.py,sha256=HJa-xVKOohDn6pLVIx-Fv0V9h0QAL3qZGQNRICI-OpI,249
 liger_kernel/chunked_loss/__init__.py,sha256=R2wCcz4Y0kTAve926DH3k182XKezpXeACMHj05g9Mm8,346
-liger_kernel/chunked_loss/cpo_loss.py,sha256=P20txjErLCSfSfToFT8pnuVPqFU4Bbybt3zRXfGEV-0,3122
-liger_kernel/chunked_loss/dpo_loss.py,sha256=NZyM4ju56MBVrUTI_7-jGMx5pWWDYzwx7ALoMj1G8Ec,4276
+liger_kernel/chunked_loss/cpo_loss.py,sha256=Qu1Ul2A12sp6CqIT-atPbHWFb_LLtINEA9mOpIRx_0g,3097
+liger_kernel/chunked_loss/dpo_loss.py,sha256=H9_RRhclckHYM2sd75tgbnf8IxC_PU2JCALbgtPQvwc,4222
 liger_kernel/chunked_loss/functional.py,sha256=9Gr-YXIuEzEJkBUhDx3G2fuQayckLor7cC7svhmPML4,549
 liger_kernel/chunked_loss/fused_linear_distillation.py,sha256=2BH6DCPjsR2zS6zcwFPcIIZRhLF8SohjGdKsAJ_301o,10222
-liger_kernel/chunked_loss/fused_linear_preference.py,sha256=_4MDZMzrNNgm91c6qdLEuXG1M8HyglZioiufv5opJOI,14881
-liger_kernel/chunked_loss/orpo_loss.py,sha256=GGwc3pLGGJzb_P_C7IogcA1EfdAcM1uktfKPmI1z2jk,3523
-liger_kernel/chunked_loss/simpo_loss.py,sha256=FtURWbXGjoAKyiVYF7fkMv8Us7uk3UrSg21pWOFk11Y,3385
+liger_kernel/chunked_loss/fused_linear_preference.py,sha256=vlWfaaIECWvCQhY9PM7zRI0vKThIrydMf6P44bXn1EE,15114
+liger_kernel/chunked_loss/orpo_loss.py,sha256=ZuKGjbkIYzV4UzvupNdq6vyxCp7-BztQkUt8ZnFvKos,3531
+liger_kernel/chunked_loss/simpo_loss.py,sha256=Wa4LOlDG9PbJkOOkKg8hbKvnKgg7OTBz6-qIkwPK1yw,3275
 liger_kernel/ops/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 liger_kernel/ops/cross_entropy.py,sha256=VqaYB9Zirc51eZ28OmjEZRrrV9UysRjS_vhIftB9sKo,15753
 liger_kernel/ops/fused_linear_cross_entropy.py,sha256=Tnw4gyAYVVdnCOqhOuLEzbUQ3goOTnoAfk3pqSIM5ac,9301
@@ -56,9 +56,9 @@ liger_kernel/transformers/model/qwen2.py,sha256=EyhSSzQOskGjSnCsKMZpd1s5IAIlHd5P
 liger_kernel/transformers/model/qwen2_vl.py,sha256=bIQe2bWiY--G84FhCD29Gdi64_qHP6vbcGsK6vKysQE,8547
 liger_kernel/triton/__init__.py,sha256=yfRe0zMb47QnqjecZWG7LnanfCTzeku7SgWRAwNVmzU,101
 liger_kernel/triton/monkey_patch.py,sha256=5BcGKTtdqeYchypBIBopGIWPx1-cFALz7sOKoEsqXJ0,1584
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/METADATA,sha256=rdhqAHF-DhOwy_DKk5SVEAC65LcW-IeyMY5QcYRUwSg,22801
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
-liger_kernel_nightly-0.4.2.dev20241209195823.dist-info/RECORD,,
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/METADATA,sha256=DXgBwRWN509ykIXn_83UuDRiwhZ-1RQPv4ubuieBXBA,22801
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/WHEEL,sha256=P9jw-gEje8ByB7_hXoICnHtVCrEwMQh-630tKvQWehc,91
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+liger_kernel_nightly-0.4.2.dev20241209234352.dist-info/RECORD,,

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/LICENSE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/NOTICE RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/WHEEL RENAMED Viewed

File without changes

{liger_kernel_nightly-0.4.2.dev20241209195823.dist-info → liger_kernel_nightly-0.4.2.dev20241209234352.dist-info}/top_level.txt RENAMED Viewed

File without changes

liger-kernel-nightly 0.4.2.dev20241209195823__py3-none-any.whl → 0.4.2.dev20241209234352__py3-none-any.whl

liger-kernel-nightly 0.4.2.dev20241209195823__py3-none-any.whl → 0.4.2.dev20241209234352__py3-none-any.whl