liger-kernel 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/__init__.py +0 -0
- liger_kernel/chunked_loss/__init__.py +4 -0
- liger_kernel/chunked_loss/cpo_loss.py +107 -0
- liger_kernel/chunked_loss/dpo_loss.py +95 -17
- liger_kernel/chunked_loss/functional.py +9 -0
- liger_kernel/chunked_loss/fused_linear_distillation.py +252 -0
- liger_kernel/chunked_loss/fused_linear_preference.py +245 -65
- liger_kernel/chunked_loss/orpo_loss.py +63 -13
- liger_kernel/chunked_loss/simpo_loss.py +115 -0
- liger_kernel/env_report.py +22 -0
- liger_kernel/ops/cross_entropy.py +17 -10
- liger_kernel/ops/fused_linear_cross_entropy.py +0 -11
- liger_kernel/ops/fused_linear_jsd.py +1 -1
- liger_kernel/ops/jsd.py +19 -10
- liger_kernel/ops/layer_norm.py +6 -1
- liger_kernel/ops/qwen2vl_mrope.py +238 -0
- liger_kernel/ops/rms_norm.py +6 -1
- liger_kernel/ops/utils.py +5 -2
- liger_kernel/transformers/functional.py +128 -11
- liger_kernel/transformers/fused_linear_jsd.py +1 -4
- liger_kernel/transformers/jsd.py +1 -4
- liger_kernel/transformers/monkey_patch.py +6 -4
- liger_kernel/transformers/qwen2vl_mrope.py +20 -0
- liger_kernel/transformers/trainer/__init__.py +6 -0
- liger_kernel/transformers/trainer/orpo_trainer.py +169 -0
- liger_kernel/utils.py +13 -0
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/METADATA +71 -47
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/RECORD +32 -22
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/WHEEL +1 -1
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/LICENSE +0 -0
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/NOTICE +0 -0
- {liger_kernel-0.4.2.dist-info → liger_kernel-0.5.1.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/fused_linear_preference.py
CHANGED

@@ -8,13 +8,9 @@ from torch.nn import functional as F
 class LigerFusedLinearPreferenceBase(torch.autograd.Function):
 
     @abstractmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(*args, **kwargs):
         """
-
-        Args:
-            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
-            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
-            beta (float): Weight for the odds ratio loss.
+        To be extended by subclasses.
         """
         raise NotImplementedError("Preference loss function must be implemented.")
 
@@ -27,15 +23,29 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         bias=None,
         loss_fn=None,
         chunk_size=1,
-        compute_nll_loss=True,
         ignore_index=-100,
+        alpha=1.0,
         beta=0.1,
+        compute_nll_loss=True,
         compiled=True,
+        use_ref_model=False,
+        # TODO: ref input
+        ref_weight=None,
+        ref_bias=None,
+        **loss_kwargs,
     ):
         """
         Base class for fused linear layer with preference loss.
         Expects _input to be stacked with chosen and rejected inputs on the batch dimension.
 
+        The mental model is:
+
+        forward()
+        ├── Loop over chunks
+        └── compute_loss()
+            ├── chunk_forward() # Compute logits and log probs
+            └── prefer_loss() # Calculate preference loss
+
         Args:
             _input (torch.Tensor): Input tensor. Shape: (batch_size, seq_len, hidden_size).
             weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
@@ -43,55 +53,130 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
             loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
             chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs).
-            compute_nll_loss (bool): Whether to compute NLL loss.
             ignore_index (int): Index to ignore for loss computation.
-
+            alpha (float): Weight for the NLL loss.
+            beta (float): Weight for the preference loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
             compiled (bool): Whether to use torch compile for chunk accumulation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
         CHUNK_SIZE = chunk_size
 
+        # Gradients to be accumulated
         grad_weight = torch.zeros_like(weight)
         grad_chosen_inputs = []
         grad_rejected_inputs = []
         grad_bias = torch.zeros_like(bias) if bias is not None else None
+
+        # Loss to be accumulated
         loss_acc = torch.zeros((), device=_input.device)
 
-
-
+        # Metrics to be recorded
+        policy_chosen_logps = []
+        policy_rejected_logps = []
+        policy_chosen_logits_mean = torch.zeros((), device=_input.device)
+        policy_rejected_logits_mean = torch.zeros((), device=_input.device)
+        policy_nll_loss = torch.zeros((), device=_input.device)
+        aggregated_aux_outputs = [] # aggregated aux outputs from all chunks
+
+        compute_loss = partial(
             LigerFusedLinearPreferenceBase._compute_loss,
             preference_loss_fn=loss_fn,
             ignore_index=ignore_index,
+            alpha=alpha,
             beta=beta,
             compute_nll_loss=compute_nll_loss,
             full_target=target,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            **loss_kwargs,
         )
 
+        def fused_fwd_bwd(input_chunk, target_chunk):
+            """
+            Fused forward and backward pass for a chunk of input and target.
+            """
+            if bias is not None:
+                return torch.func.grad_and_value(
+                    compute_loss, argnums=(0, 1, 3), has_aux=True
+                )(input_chunk, weight, target_chunk, bias)
+            else:
+                return torch.func.grad_and_value(
+                    compute_loss, argnums=(0, 1), has_aux=True
+                )(input_chunk, weight, target_chunk)
+
         def accumulate_chunk(input_chunk, target_chunk):
             if bias is not None:
                 (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (
                     chunk_loss,
-                    (
-
-
-
-
-
-
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = fused_fwd_bwd(input_chunk, target_chunk)
+                grad_bias.add_(chunk_grad_bias) # accumulate bias gradient
             else:
                 (chunk_grad_input, chunk_grad_weight), (
                     chunk_loss,
-                    (
-
-
-
-
-
+                    (
+                        chunk_chosen_logps,
+                        chunk_rejected_logps,
+                        chunk_chosen_logits_mean,
+                        chunk_rejected_logits_mean,
+                        chunk_nll_loss,
+                        *aux_outputs,
+                    ),
+                ) = fused_fwd_bwd(input_chunk, target_chunk)
+
+            # Accumulate gradients
             grad_weight.add_(chunk_grad_weight)
+            grad_chosen_inputs.append(chunk_grad_input[: chosen_target_chunk.shape[0]])
+            grad_rejected_inputs.append(
+                chunk_grad_input[chosen_target_chunk.shape[0] :]
+            )
+
+            # Accumulate loss
             loss_acc.add_(chunk_loss)
-
+
+            # Accumulate metrics
+            policy_chosen_logps.append(chunk_chosen_logps)
+            policy_rejected_logps.append(chunk_rejected_logps)
+            policy_chosen_logits_mean.add_(chunk_chosen_logits_mean)
+            policy_rejected_logits_mean.add_(chunk_rejected_logits_mean)
+            policy_nll_loss.add_(chunk_nll_loss)
+
+            # aux_outputs
+            # Initialize storage for aux_outputs
+            if len(aggregated_aux_outputs) == 0:
+                for aux in aux_outputs:
+                    if aux.ndim == 0:
+                        aggregated_aux_outputs.append(
+                            torch.zeros((), device=aux.device)
+                        )
+                    else:
+                        aggregated_aux_outputs.append([])
+
+            # Process each aux_output
+            for i, aux in enumerate(aux_outputs):
+                if aux.ndim == 0:
+                    aggregated_aux_outputs[i].add_(aux)
+                else:
+                    aggregated_aux_outputs[i].append(aux)
+
+        if compiled:
+            fused_fwd_bwd = torch.compile(fused_fwd_bwd)
 
         len_chosen = target.shape[0] // 2
+        chunks = max(1, _input.shape[0] // (2 * CHUNK_SIZE))
         _chosen_input_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
         _chosen_target_chunks = torch.chunk(target[:len_chosen], chunks=chunks, dim=0)
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
@@ -113,62 +198,61 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 [chosen_target_chunk, rejected_target_chunk], dim=0
             )
 
-
-
-
+            # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation
+            torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(target_chunk, 1)
+            torch._dynamo.mark_dynamic(target, 1)
 
-
-
+            # accumulate loss, gradients, and metrics
+            accumulate_chunk(input_chunk, target_chunk)
 
         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
+        policy_chosen_logps = torch.cat(policy_chosen_logps, dim=0)
+        policy_rejected_logps = torch.cat(policy_rejected_logps, dim=0)
+
+        # Aggregate aux outputs lists into tensors
+        for i, aux in enumerate(aggregated_aux_outputs):
+            if isinstance(aux, list):
+                aggregated_aux_outputs[i] = torch.cat(aux, dim=0)
 
         ctx.save_for_backward(
             torch.cat(grad_inputs, dim=0),
             grad_weight,
             grad_bias,
         )
-
+        return_vars = (
+            policy_chosen_logps,
+            policy_rejected_logps,
+            policy_chosen_logits_mean,
+            policy_rejected_logits_mean,
+            policy_nll_loss,
+        )
+        return loss_acc, (*return_vars, *aggregated_aux_outputs)
 
     @staticmethod
-    def backward(ctx, grad_output):
+    def backward(ctx, *grad_output):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
-        if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
-            grad_input = grad_input * grad_output
-            grad_weight = grad_weight * grad_output
-            grad_bias = grad_bias * grad_output if grad_bias is not None else None
+        if torch.ne(
+            grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)
+        ):
+            grad_input = grad_input * grad_output[0][0]
+            grad_weight = grad_weight * grad_output[0][0]
+            grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
 
         return grad_input, grad_weight, None, grad_bias, None, None, None
 
     @staticmethod
-    def _compute_loss(
+    def chunk_forward(
         input_chunk,
         weight,
         target_chunk,
         bias=None,
-        preference_loss_fn=None,
-        full_target=None,
         ignore_index=-100,
-        beta=0.1,
         compute_nll_loss=True,
-        **loss_kwargs,
     ):
-        """
-        Compute the total loss for a chunk of input and target, while using an alignment/preference loss function.
-        Args:
-            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
-            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size).
-            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
-            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length).
-            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
-            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
-            ignore_index (int): Index to ignore for loss computation.
-            beta (float): Weight for the odds ratio loss.
-            loss_kwargs (dict): Additional arguments for the loss function.
-        """
         len_chosen_chunk = target_chunk.shape[0] // 2
-
-        logits_chunk = input_chunk @ weight.t() # chunk_size x V
+        logits_chunk = input_chunk @ weight.t()
         if bias is not None:
             logits_chunk = logits_chunk + bias
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
@@ -181,10 +265,6 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 reduction="sum",
                 ignore_index=ignore_index,
             )
-            chosen_nll_loss = (
-                chosen_nll_loss
-                / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-            )
 
         loss_mask = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask, target_chunk, 0)
@@ -197,10 +277,110 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         chosen_logps = average_log_prob[:len_chosen_chunk]
         rejected_logps = average_log_prob[len_chosen_chunk:]
 
-
-
+        chosen_logits = logits_chunk[:len_chosen_chunk]
+        rejected_logits = logits_chunk[len_chosen_chunk:]
+
+        return (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        )
+
+    @staticmethod
+    def _compute_loss(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        preference_loss_fn=None,
+        full_target=None,
+        ignore_index=-100,
+        alpha=1.0,
+        beta=0.1,
+        compute_nll_loss=True,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
+        **loss_kwargs,
+    ):
+        """
+        Compute the total loss for a chunk of input and target, while using an alignment/preference loss function.
+        Args:
+            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size).
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
+            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length).
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
+            ignore_index (int): Index to ignore for loss computation.
+            alpha (float): Weight for the NLL loss.
+            beta (float): Weight for the preference loss.
+            compute_nll_loss (bool): Whether to compute NLL loss.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            loss_kwargs (dict): Additional arguments for the loss function.
+        """
+        (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits,
+            rejected_logits,
+            chosen_nll_loss,
+        ) = LigerFusedLinearPreferenceBase.chunk_forward(
+            input_chunk,
+            weight,
+            target_chunk,
+            bias=bias,
+            ignore_index=ignore_index,
+            compute_nll_loss=compute_nll_loss,
+        )
+        chosen_nll_loss = (
+            chosen_nll_loss
+            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+        )
+        chosen_logits_mean = chosen_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+        rejected_logits_mean = rejected_logits.sum() / (
+            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
+        )
+
+        if use_ref_model:
+            with torch.no_grad():
+                (
+                    ref_chosen_logps,
+                    ref_rejected_logps,
+                    ref_chosen_logits,
+                    ref_rejected_logits,
+                    ref_chosen_nll_loss,
+                ) = LigerFusedLinearPreferenceBase.chunk_forward(
+                    input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                    compute_nll_loss=False, # We don't need NLL loss for the reference model
+                )
+            loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
+            loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
+
+        preference_loss_outputs = preference_loss_fn(
+            chosen_logps, rejected_logps, full_target, beta=beta, **loss_kwargs
         )
-
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss, aux_outputs = preference_loss_outputs, []
 
-        loss = chosen_nll_loss -
-
+        loss = alpha * chosen_nll_loss - preference_loss
+        return_vars = (
+            chosen_logps,
+            rejected_logps,
+            chosen_logits_mean,
+            rejected_logits_mean,
+            chosen_nll_loss,
+        )
+        return loss, (*return_vars, *aux_outputs)
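For readers unfamiliar with the `torch.func` pattern that the new `fused_fwd_bwd` helper builds on, here is a minimal, self-contained sketch. The toy shapes and the plain cross-entropy loss are stand-ins of my own, not the library's actual kernels: `grad_and_value` with `has_aux=True` returns the gradients for the selected `argnums` together with the scalar loss and any auxiliary outputs, which is how the chunked forward above collects per-chunk gradients and metrics in a single call.

```python
import torch
import torch.nn.functional as F

# Toy stand-ins for one chunk: (2 * chunk_size, hidden) activations and a linear head.
input_chunk = torch.randn(4, 8)
weight = torch.randn(16, 8)
target_chunk = torch.randint(0, 16, (4,))

def compute_loss(input_chunk, weight, target_chunk):
    logits = input_chunk @ weight.t()
    loss = F.cross_entropy(logits, target_chunk)
    # Auxiliary outputs ride along without being differentiated.
    return loss, (logits.mean().detach(),)

# Differentiate w.r.t. argnums 0 and 1 (input and weight), as fused_fwd_bwd does
# when no bias is present; has_aux=True keeps the extra outputs alongside the loss.
(grad_input, grad_weight), (loss, aux) = torch.func.grad_and_value(
    compute_loss, argnums=(0, 1), has_aux=True
)(input_chunk, weight, target_chunk)
```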
liger_kernel/chunked_loss/orpo_loss.py
CHANGED

@@ -9,12 +9,26 @@ from liger_kernel.chunked_loss.fused_linear_preference import (
 class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
 
     @staticmethod
-    def preference_loss_fn(chosen_logps, rejected_logps, beta=0.1):
+    def preference_loss_fn(chosen_logps, rejected_logps, full_target, beta=0.1):
         """
-
+        Paper: https://arxiv.org/pdf/2403.07691
+
+        Formula:
+        Compute odds-ratio loss: L_OR = -log(σ(log(odds_θ(y_w|x) / odds_θ(y_l|x))))
+        where odds_θ(y|x) = P_θ(y|x) / (1 - P_θ(y|x))
+
+        Where:
+        - P_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - σ: Sigmoid function
+        - β: Weight for the odds ratio loss
+        - odds_θ: Odds function for the policy
+
         Args:
             chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
             rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            full_target (torch.Tensor): Non chunked full target tensor
             beta (float): Weight for the odds ratio loss.
         """
         log_odds = (chosen_logps - rejected_logps) - (
@@ -22,7 +36,15 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             - torch.log1p(-torch.exp(rejected_logps))
         )
         ratio = F.logsigmoid(log_odds)
-
+        loss = beta * ratio.sum() / (full_target.shape[0] // 2)
+
+        chosen_rewards = beta * chosen_logps
+        rejected_rewards = beta * rejected_logps
+
+        log_odds_ratio = torch.sum(ratio) / (full_target.shape[0] // 2)
+        log_odds_chosen = torch.sum(log_odds) / (full_target.shape[0] // 2)
+
+        return loss, chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen
 
     @staticmethod
     def forward(
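As a quick sanity check of the odds-ratio term in the two hunks above, here is a self-contained sketch with made-up log-probabilities. The intermediate `torch.log1p(-torch.exp(chosen_logps))` line falls between the hunks and is filled in here from the stated formula, so treat it as a reconstruction rather than a quote of the source.

```python
import torch
import torch.nn.functional as F

# Made-up average log-probabilities for two chosen/rejected pairs.
chosen_logps = torch.tensor([-0.7, -1.0])
rejected_logps = torch.tensor([-1.5, -1.8])
beta = 0.1

# log(odds(y_w)) - log(odds(y_l)), with odds(y) = p / (1 - p).
log_odds = (chosen_logps - rejected_logps) - (
    torch.log1p(-torch.exp(chosen_logps)) - torch.log1p(-torch.exp(rejected_logps))
)
ratio = F.logsigmoid(log_odds)
loss = beta * ratio.sum() / chosen_logps.shape[0]  # per-pair normalization, as in the new code
```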
@@ -36,12 +58,6 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
         compute_nll_loss=True,
         compiled=True,
     ):
-        """
-        Fused linear layer with ORPO (Odds-Ratio Preference Optimization) loss.
-        Handles both the forward and backward pass of the final linear layer with ORPO loss.
-        Inspired from LigerFusedLinearCrossEntropyFunction (https://arxiv.org/abs/2410.10989) which fuses final linear layer and CE loss.
-        """
-
         return LigerFusedLinearPreferenceBase.forward(
             ctx=ctx,
             _input=_input,
@@ -49,15 +65,49 @@ class LigerFusedLinearORPOFunction(LigerFusedLinearPreferenceBase):
             target=target,
             bias=bias,
             loss_fn=LigerFusedLinearORPOFunction.preference_loss_fn,
-            compute_nll_loss=compute_nll_loss,
             ignore_index=ignore_index,
             beta=beta,
+            compute_nll_loss=compute_nll_loss,
             compiled=compiled,
         )
 
     @staticmethod
-    def backward(ctx, grad_output):
-        # Get gradients for _input, weight, bias, and target from the base class
+    def backward(ctx, *grad_output):
         grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
-        # Return these gradients, followed by None for the remaining inputs
         return *grads, None, None, None, None
+
+
+class LigerFusedLinearORPOLoss(torch.nn.Module):
+    """
+    Fused linear layer with ORPO (Odds-Ratio Preference Optimization) loss.
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        compute_nll_loss: bool = True,
+        compiled: bool = True,
+    ):
+        """
+        Args:
+            ignore_index (int): Index to ignore in the loss.
+            beta (float): Weight for the odds ratio loss.
+        """
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.beta = beta
+        self.compute_nll_loss = compute_nll_loss
+        self.compiled = compiled
+
+    def forward(self, lin_weight, _input, target, bias=None):
+        return LigerFusedLinearORPOFunction.apply(
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ignore_index,
+            self.beta,
+            self.compute_nll_loss,
+            self.compiled,
+        )
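The new `LigerFusedLinearORPOLoss` module takes the LM head weight and the pre-head hidden states directly, with chosen and rejected sequences stacked on the batch dimension. A usage sketch under assumed toy shapes; per the base-class forward above, the call returns the scalar loss plus a tuple of logged metrics.

```python
import torch
from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss

B, T, H, V = 2, 16, 64, 128  # illustrative sizes, not from the package
hidden = torch.randn(2 * B, T, H, requires_grad=True)  # chosen rows first, then rejected
target = torch.randint(0, V, (2 * B, T))
lm_head_weight = torch.randn(V, H, requires_grad=True)

orpo = LigerFusedLinearORPOLoss(ignore_index=-100, beta=0.1)
loss, metrics = orpo(lm_head_weight, hidden, target)
loss.backward()  # gradients flow to hidden and lm_head_weight via the custom Function
```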
liger_kernel/chunked_loss/simpo_loss.py
ADDED

@@ -0,0 +1,115 @@
+import torch
+import torch.nn.functional as F
+
+from liger_kernel.chunked_loss.fused_linear_preference import (
+    LigerFusedLinearPreferenceBase,
+)
+
+
+class LigerFusedLinearSimPOFunction(LigerFusedLinearPreferenceBase):
+
+    @staticmethod
+    def preference_loss_fn(
+        chosen_logps, rejected_logps, full_target, beta=0.1, gamma=0.5
+    ):
+        """
+        Paper: https://arxiv.org/pdf/2405.14734
+
+        Formula:
+        L_SimPO(π_θ) = -E [log σ(β/|y_w| log π_θ(y_w|x) - β/|y_l| log π_θ(y_l|x) - γ)]
+
+        Where:
+        - π_θ(y|x): Policy (model) probability
+        - y_w: Chosen sequence
+        - y_l: Rejected sequence
+        - |y_w|, |y_l|: Sequence lengths
+        - σ: Sigmoid function
+        - β: beta weight
+        - γ: gemma margin term
+
+        Args:
+            chosen_logps (torch.Tensor): Avg log probabilities of chosen tokens. Shape: (batch_size,).
+            rejected_logps (torch.Tensor): Avg log probabilities of rejected tokens. Shape: (batch_size,).
+            full_target: Non chunked full target tensor
+            beta (float): beta weight
+            gamma (float): gemma margin term
+        """
+        logits = beta * (chosen_logps - rejected_logps) - gamma
+        loss = F.logsigmoid(logits).sum() / (full_target.shape[0] // 2)
+        return loss
+
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        target,
+        bias=None,
+        ignore_index=-100,
+        beta=0.1,
+        alpha=1.0,
+        compute_nll_loss=False,
+        compiled=True,
+        gamma=0.5,
+    ):
+        return LigerFusedLinearPreferenceBase.forward(
+            ctx,
+            _input,
+            weight,
+            target,
+            bias,
+            loss_fn=LigerFusedLinearSimPOFunction.preference_loss_fn,
+            compute_nll_loss=compute_nll_loss,
+            ignore_index=ignore_index,
+            alpha=alpha,
+            beta=beta,
+            compiled=compiled,
+            gamma=gamma,
+        )
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grads = LigerFusedLinearPreferenceBase.backward(ctx, grad_output)[:4]
+        return *grads, None, None, None, None, None, None
+
+
+class LigerFusedLinearSimPOLoss(torch.nn.Module):
+    """
+    Fused linear layer with SimPO loss.
+    """
+
+    def __init__(
+        self,
+        ignore_index: int = -100,
+        beta: float = 0.1,
+        alpha: float = 1.0,
+        compute_nll_loss: bool = True,
+        compiled: bool = True,
+        gamma: float = 0.5,
+    ):
+        """
+        Args:
+            ignore_index (int): Index to ignore in the loss.
+            beta (float): Weight for the odds ratio loss.
+        """
+        super().__init__()
+        self.ignore_index = ignore_index
+        self.beta = beta
+        self.alpha = alpha
+        self.compute_nll_loss = compute_nll_loss
+        self.compiled = compiled
+        self.gamma = gamma
+
+    def forward(self, lin_weight, _input, target, bias=None):
+        return LigerFusedLinearSimPOFunction.apply(
+            _input,
+            lin_weight,
+            target,
+            bias,
+            self.ignore_index,
+            self.beta,
+            self.alpha,
+            self.compute_nll_loss,
+            self.compiled,
+            self.gamma,
+        )
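The SimPO preference term added above reduces to a margin on length-normalized log-probabilities pushed through log σ. A self-contained numeric sketch with made-up values, mirroring the two lines in `preference_loss_fn`; the base class then combines it as `loss = alpha * nll_loss - preference_term`.

```python
import torch
import torch.nn.functional as F

chosen_logps = torch.tensor([-0.9, -1.1])    # made-up average log-probs, chosen
rejected_logps = torch.tensor([-1.6, -1.4])  # made-up average log-probs, rejected
beta, gamma = 2.0, 0.5

logits = beta * (chosen_logps - rejected_logps) - gamma
preference_term = F.logsigmoid(logits).sum() / chosen_logps.shape[0]
```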
liger_kernel/env_report.py
CHANGED
@@ -1,5 +1,6 @@
 import platform
 import sys
+from importlib.metadata import version
 
 
 def print_env_report():
@@ -17,6 +18,11 @@ def print_env_report():
     print(f"Operating System: {platform.platform()}")
     print(f"Python version: {sys.version.split()[0]}")
 
+    try:
+        print(f"Liger Kernel version: {version('liger-kernel')}")
+    except ImportError:
+        print("Liger Kernel: Not installed")
+
     try:
         import torch
 
@@ -25,9 +31,17 @@ def print_env_report():
             torch.version.cuda if torch.cuda.is_available() else "Not available"
         )
         print(f"CUDA version: {cuda_version}")
+        hip_version = (
+            torch.version.hip
+            if torch.cuda.is_available() and torch.version.hip
+            else "Not available"
+        )
+        print(f"HIP(ROCm) version: {hip_version}")
+
     except ImportError:
         print("PyTorch: Not installed")
         print("CUDA version: Unable to query")
+        print("HIP(ROCm) version: Unable to query")
 
     try:
         import triton
@@ -43,6 +57,14 @@ def print_env_report():
     except ImportError:
         print("Transformers: Not installed")
 
+    try:
+        xpu_version = (
+            torch.version.xpu if torch.xpu.is_available() else "XPU Not Available"
+        )
+        print(f"XPU version: {xpu_version}")
+    except ImportError:
+        print("XPU version: Unable to query")
+
 
 if __name__ == "__main__":
     print_env_report()