liger-kernel 0.5.2__py3-none-any.whl → 0.5.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/README.md +25 -0
- liger_kernel/chunked_loss/__init__.py +2 -0
- liger_kernel/chunked_loss/cpo_loss.py +18 -8
- liger_kernel/chunked_loss/dpo_loss.py +20 -10
- liger_kernel/chunked_loss/functional.py +4 -0
- liger_kernel/chunked_loss/fused_linear_distillation.py +58 -44
- liger_kernel/chunked_loss/fused_linear_preference.py +108 -60
- liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +246 -0
- liger_kernel/chunked_loss/jsd_loss.py +154 -0
- liger_kernel/chunked_loss/kto_loss.py +172 -0
- liger_kernel/chunked_loss/orpo_loss.py +8 -9
- liger_kernel/chunked_loss/simpo_loss.py +22 -8
- liger_kernel/env_report.py +5 -12
- liger_kernel/ops/cross_entropy.py +102 -51
- liger_kernel/ops/experimental/embedding.py +1 -3
- liger_kernel/ops/experimental/mm_int8int2.py +3 -9
- liger_kernel/ops/fused_linear_cross_entropy.py +89 -55
- liger_kernel/ops/fused_linear_jsd.py +11 -29
- liger_kernel/ops/geglu.py +6 -17
- liger_kernel/ops/group_norm.py +11 -28
- liger_kernel/ops/jsd.py +2 -6
- liger_kernel/ops/kl_div.py +8 -11
- liger_kernel/ops/layer_norm.py +3 -5
- liger_kernel/ops/qwen2vl_mrope.py +8 -25
- liger_kernel/ops/rms_norm.py +14 -32
- liger_kernel/ops/rope.py +31 -33
- liger_kernel/ops/swiglu.py +4 -8
- liger_kernel/ops/utils.py +2 -0
- liger_kernel/transformers/__init__.py +16 -24
- liger_kernel/transformers/auto_model.py +6 -13
- liger_kernel/transformers/cross_entropy.py +4 -6
- liger_kernel/transformers/experimental/embedding.py +1 -3
- liger_kernel/transformers/functional.py +11 -7
- liger_kernel/transformers/fused_linear_cross_entropy.py +12 -7
- liger_kernel/transformers/geglu.py +1 -4
- liger_kernel/transformers/group_norm.py +3 -9
- liger_kernel/transformers/jsd.py +1 -3
- liger_kernel/transformers/kl_div.py +1 -3
- liger_kernel/transformers/layer_norm.py +3 -9
- liger_kernel/transformers/model/gemma.py +18 -40
- liger_kernel/transformers/model/gemma2.py +19 -41
- liger_kernel/transformers/model/llama.py +22 -48
- liger_kernel/transformers/model/mistral.py +14 -26
- liger_kernel/transformers/model/mixtral.py +24 -54
- liger_kernel/transformers/model/mllama.py +16 -36
- liger_kernel/transformers/model/phi3.py +18 -40
- liger_kernel/transformers/model/qwen2.py +18 -40
- liger_kernel/transformers/model/qwen2_vl.py +36 -32
- liger_kernel/transformers/monkey_patch.py +43 -117
- liger_kernel/transformers/rms_norm.py +4 -4
- liger_kernel/transformers/rope.py +2 -2
- liger_kernel/transformers/swiglu.py +2 -8
- liger_kernel/transformers/trainer/__init__.py +1 -3
- liger_kernel/transformers/trainer/orpo_trainer.py +31 -18
- liger_kernel/triton/__init__.py +1 -3
- liger_kernel/triton/monkey_patch.py +1 -3
- {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.3.dist-info}/METADATA +38 -25
- liger_kernel-0.5.3.dist-info/RECORD +69 -0
- {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.3.dist-info}/WHEEL +1 -1
- liger_kernel-0.5.2.dist-info/RECORD +0 -65
- {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.3.dist-info}/LICENSE +0 -0
- {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.3.dist-info}/NOTICE +0 -0
- {liger_kernel-0.5.2.dist-info → liger_kernel-0.5.3.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/fused_linear_preference.py

@@ -2,11 +2,11 @@ from abc import abstractmethod
 from functools import partial

 import torch
+
 from torch.nn import functional as F


 class LigerFusedLinearPreferenceBase(torch.autograd.Function):
-
     @abstractmethod
     def preference_loss_fn(*args, **kwargs):
         """
@@ -27,11 +27,13 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         alpha=1.0,
         beta=0.1,
         compute_nll_loss=True,
+        nll_target=None,
         compiled=True,
         use_ref_model=False,
-
+        ref_input=None,
         ref_weight=None,
         ref_bias=None,
+        average_log_prob=True,
         **loss_kwargs,
     ):
         """
@@ -57,10 +59,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             alpha (float): Weight for the NLL loss.
             beta (float): Weight for the preference loss.
             compute_nll_loss (bool): Whether to compute NLL loss.
+            nll_target (torch.Tensor, optional): Target tensor for NLL loss. Shape: (batch_size, seq_len). If not provided the target is used.
             compiled (bool): Whether to use torch compile for chunk accumulation.
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average log probabilities or to sum them over the completion.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
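The new `nll_target` lets the auxiliary NLL term be computed against labels that differ from the preference `target`; the summed NLL is later normalized by the number of non-ignored tokens (see the `_compute_loss` hunk below). A minimal standalone sketch of that computation with made-up shapes, not taken from the package:

    import torch
    import torch.nn.functional as F

    T, V = 6, 11                                      # toy sequence length and vocab size
    log_probs = F.log_softmax(torch.randn(T, V), dim=-1)
    labels = torch.tensor([5, 2, -100, 7, -100, 1])   # -100 is the ignore_index

    nll_sum = F.nll_loss(log_probs, labels, reduction="sum", ignore_index=-100)
    nll = nll_sum / (labels != -100).sum()            # normalize by non-ignored tokens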
@@ -94,55 +98,70 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            full_nll_target=nll_target,
+            average_log_prob=average_log_prob,
             **loss_kwargs,
         )

-        def fused_fwd_bwd(input_chunk, target_chunk):
+        def fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk):
             """
             Fused forward and backward pass for a chunk of input and target.
             """
             if bias is not None:
-                return torch.func.grad_and_value(
-                    compute_loss, argnums=(0, 1, 3), has_aux=True
-                )(input_chunk, weight, target_chunk, bias)
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1, 3), has_aux=True)(
+                    input_chunk,
+                    weight,
+                    target_chunk,
+                    bias,
+                    ref_input_chunk=ref_input_chunk,
+                    chosen_nll_target_chunk=chosen_nll_target_chunk,
+                )
             else:
-                return torch.func.grad_and_value(
-                    compute_loss, argnums=(0, 1), has_aux=True
-                )(input_chunk, weight, target_chunk)
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1), has_aux=True)(
+                    input_chunk,
+                    weight,
+                    target_chunk,
+                    ref_input_chunk=ref_input_chunk,
+                    chosen_nll_target_chunk=chosen_nll_target_chunk,
+                )

-        def accumulate_chunk(input_chunk, target_chunk):
+        def accumulate_chunk(input_chunk, target_chunk, ref_input_chunk=None, chosen_nll_target_chunk=None):
             if bias is not None:
-                (
-
+                (
+                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
                     (
-
-
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_chosen_logps,
+                            chunk_rejected_logps,
+                            chunk_chosen_logits_mean,
+                            chunk_rejected_logits_mean,
+                            chunk_nll_loss,
+                            *aux_outputs,
+                        ),
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)
                 grad_bias.add_(chunk_grad_bias)  # accumulate bias gradient
             else:
-                (
-
+                (
+                    (chunk_grad_input, chunk_grad_weight),
                     (
-
-
-
-
-
-
+                        chunk_loss,
+                        (
+                            chunk_chosen_logps,
+                            chunk_rejected_logps,
+                            chunk_chosen_logits_mean,
+                            chunk_rejected_logits_mean,
+                            chunk_nll_loss,
+                            *aux_outputs,
+                        ),
+                        ),
                     ),
-                ) = fused_fwd_bwd(input_chunk, target_chunk)
+                ) = fused_fwd_bwd(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)

             # Accumulate gradients
             grad_weight.add_(chunk_grad_weight)
             grad_chosen_inputs.append(chunk_grad_input[: chosen_target_chunk.shape[0]])
-            grad_rejected_inputs.append(
-                chunk_grad_input[chosen_target_chunk.shape[0] :]
-            )
+            grad_rejected_inputs.append(chunk_grad_input[chosen_target_chunk.shape[0] :])

             # Accumulate loss
             loss_acc.add_(chunk_loss)
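Both branches of `fused_fwd_bwd` now build the `torch.func.grad_and_value` transform in one expression and forward the extra chunk tensors as keyword arguments. A minimal sketch of how `grad_and_value` with a tuple `argnums` and `has_aux=True` behaves, using a toy loss instead of the package's `compute_loss`:

    import torch

    def toy_loss(x, w, y):
        logits = x @ w.t()
        loss = torch.nn.functional.cross_entropy(logits, y)
        return loss, logits.detach()                  # (output, aux) because has_aux=True

    x, w = torch.randn(4, 8), torch.randn(3, 8)
    y = torch.randint(0, 3, (4,))

    # gradients are returned for the positions named in argnums; the value is the (loss, aux) pair
    (grad_x, grad_w), (loss, logits) = torch.func.grad_and_value(toy_loss, argnums=(0, 1), has_aux=True)(x, w, y)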
@@ -159,9 +178,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             if len(aggregated_aux_outputs) == 0:
                 for aux in aux_outputs:
                     if aux.ndim == 0:
-                        aggregated_aux_outputs.append(
-                            torch.zeros((), device=aux.device)
-                        )
+                        aggregated_aux_outputs.append(torch.zeros((), device=aux.device))
                     else:
                         aggregated_aux_outputs.append([])

@@ -182,29 +199,46 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         _rejected_input_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
         _rejected_target_chunks = torch.chunk(target[len_chosen:], chunks=chunks, dim=0)

+        if nll_target is not None:
+            _chosen_nll_target_chunks = torch.chunk(nll_target[:len_chosen], chunks=chunks, dim=0)
+
+        if use_ref_model:
+            _ref_chosen_input_chunks = torch.chunk(ref_input[:len_chosen], chunks=chunks, dim=0)
+            _ref_rejected_input_chunks = torch.chunk(ref_input[len_chosen:], chunks=chunks, dim=0)
+
         for (
             chosen_input_chunk,
             rejected_input_chunk,
             chosen_target_chunk,
             rejected_target_chunk,
+            ref_chosen_input_chunk,
+            ref_rejected_input_chunk,
+            chosen_nll_target_chunk,
         ) in zip(
             _chosen_input_chunks,
             _rejected_input_chunks,
             _chosen_target_chunks,
             _rejected_target_chunks,
+            (_ref_chosen_input_chunks if use_ref_model else [None] * len(_chosen_input_chunks)),
+            (_ref_rejected_input_chunks if use_ref_model else [None] * len(_rejected_input_chunks)),
+            (_chosen_nll_target_chunks if nll_target is not None else [None] * len(_chosen_input_chunks)),
+            strict=False,
         ):
             input_chunk = torch.cat([chosen_input_chunk, rejected_input_chunk], dim=0)
-            target_chunk = torch.cat(
-                [chosen_target_chunk, rejected_target_chunk], dim=0
+            ref_input_chunk = (
+                torch.cat([ref_chosen_input_chunk, ref_rejected_input_chunk], dim=0) if use_ref_model else None
             )
+            target_chunk = torch.cat([chosen_target_chunk, rejected_target_chunk], dim=0)

             # mark input_chunk, target_chunk, and target dimension 1 as dynamic to prevent torch.compile recompilation
             torch._dynamo.mark_dynamic(input_chunk, 1)
             torch._dynamo.mark_dynamic(target_chunk, 1)
             torch._dynamo.mark_dynamic(target, 1)
+            torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None
+            torch._dynamo.mark_dynamic(chosen_nll_target_chunk, 1) if nll_target is not None else None

             # accumulate loss, gradients, and metrics
-            accumulate_chunk(input_chunk, target_chunk)
+            accumulate_chunk(input_chunk, target_chunk, ref_input_chunk, chosen_nll_target_chunk)

         # combine grad_chosen_inputs and grad_rejected_inputs
         grad_inputs = grad_chosen_inputs + grad_rejected_inputs
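The loop relies on `_input` and `target` being stacked with all chosen rows first and all rejected rows second, so each half is chunked separately and re-concatenated per chunk, while the optional reference-model and NLL-target streams are padded with `None`. A small sketch of that stacking and chunking pattern with toy shapes (names are illustrative, not from the package):

    import torch

    B, T, H = 4, 5, 8                        # toy sizes: B chosen rows followed by B rejected rows
    _input = torch.randn(2 * B, T, H)
    len_chosen = B
    chunks = 2

    chosen_chunks = torch.chunk(_input[:len_chosen], chunks=chunks, dim=0)
    rejected_chunks = torch.chunk(_input[len_chosen:], chunks=chunks, dim=0)
    optional_stream = [None] * len(chosen_chunks)     # placeholder when a stream is unused

    for chosen, rejected, extra in zip(chosen_chunks, rejected_chunks, optional_stream):
        chunk = torch.cat([chosen, rejected], dim=0)  # each chunk keeps the chosen-then-rejected layout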
@@ -233,14 +267,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
     @staticmethod
     def backward(ctx, *grad_output):
         grad_input, grad_weight, grad_bias = ctx.saved_tensors
-        if torch.ne(
-            grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)
-        ):
+        if torch.ne(grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)):
             grad_input = grad_input * grad_output[0][0]
             grad_weight = grad_weight * grad_output[0][0]
             grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None

-        return grad_input, grad_weight, None, grad_bias, None, None, None
+        return grad_input, grad_weight, None, grad_bias, None, None, None, None

     @staticmethod
     def chunk_forward(
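`backward` still only rescales gradients that were already accumulated during `forward`; the visible change is one more trailing `None` in the returned gradient tuple. A toy `autograd.Function` (not from the package) showing why the precomputed gradient is multiplied by the incoming `grad_output`:

    import torch

    class PrecomputedSquare(torch.autograd.Function):
        @staticmethod
        def forward(ctx, x):
            ctx.save_for_backward(2 * x)              # gradient of (x * x).sum(), computed eagerly
            return (x * x).sum()

        @staticmethod
        def backward(ctx, grad_output):
            (grad_x,) = ctx.saved_tensors
            if torch.ne(grad_output, torch.tensor(1.0, device=grad_output.device)):
                grad_x = grad_x * grad_output         # honor upstream scaling, e.g. 0.5 * loss
            return grad_x

    x = torch.randn(3, requires_grad=True)
    (0.5 * PrecomputedSquare.apply(x)).backward()
    assert torch.allclose(x.grad, x)                  # 0.5 * 2x == x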
@@ -250,6 +282,8 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         bias=None,
         ignore_index=-100,
         compute_nll_loss=True,
+        chosen_nll_target_chunk=None,
+        average_log_prob=True,
     ):
         len_chosen_chunk = target_chunk.shape[0] // 2
         logits_chunk = input_chunk @ weight.t()
@@ -259,9 +293,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):

         chosen_nll_loss = 0.0
         if compute_nll_loss:
+            nll_labels = (
+                chosen_nll_target_chunk if chosen_nll_target_chunk is not None else target_chunk[:len_chosen_chunk]
+            )
             chosen_nll_loss = F.nll_loss(
                 log_probs_chunk[:len_chosen_chunk].view(-1, log_probs_chunk.shape[-1]),
-                target_chunk[:len_chosen_chunk].view(-1),
+                nll_labels.view(-1),
                 reduction="sum",
                 ignore_index=ignore_index,
             )
@@ -269,13 +306,14 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         loss_mask = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask, target_chunk, 0)

-        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(
-            -1
-        )
-        average_log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        per_token_logps = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
+        if average_log_prob:
+            log_prob = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
+        else:
+            log_prob = (per_token_logps * loss_mask).sum(-1)

-        chosen_logps = average_log_prob[:len_chosen_chunk]
-        rejected_logps = average_log_prob[len_chosen_chunk:]
+        chosen_logps = log_prob[:len_chosen_chunk]
+        rejected_logps = log_prob[len_chosen_chunk:]

         chosen_logits = logits_chunk[:len_chosen_chunk]
         rejected_logits = logits_chunk[len_chosen_chunk:]
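The `average_log_prob` switch only changes the final reduction over completion tokens: summing gives the sequence log-probability used by DPO-style losses, while dividing by the token count gives the length-normalized form used by SimPO-style losses. A standalone sketch of the gather/mask/reduce steps with toy tensors (not lifted from the package):

    import torch
    import torch.nn.functional as F

    B, T, V = 2, 5, 11
    log_probs = F.log_softmax(torch.randn(B, T, V), dim=-1)
    target = torch.tensor([[4, 7, 1, -100, -100], [2, 2, 9, 3, -100]])

    loss_mask = target != -100
    labels = torch.where(loss_mask, target, 0)                      # make gather indices valid
    per_token_logps = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)

    summed = (per_token_logps * loss_mask).sum(-1)                  # sequence log-probability
    averaged = summed / loss_mask.sum(-1)                           # length-normalized version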
@@ -301,8 +339,12 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         beta=0.1,
         compute_nll_loss=True,
         use_ref_model=False,
+        ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
+        full_nll_target=None,
+        chosen_nll_target_chunk=None,
+        average_log_prob=True,
         **loss_kwargs,
     ):
         """
@@ -321,6 +363,9 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            full_nll_target (torch.Tensor, optional): Full target tensor for NLL loss. Shape: (batch_size, sequence_length).
+            chosen_nll_target_chunk (torch.Tensor, optional): Target tensor for NLL loss. Shape: (chunk_size, sequence_length) If not provided the target_chunk is used.
+            average_log_prob (bool): Whether to average log probabilities or the sum.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
         (
@@ -336,14 +381,15 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
             bias=bias,
             ignore_index=ignore_index,
             compute_nll_loss=compute_nll_loss,
+            chosen_nll_target_chunk=chosen_nll_target_chunk,
+            average_log_prob=average_log_prob,
         )
-        chosen_nll_loss = (
-            chosen_nll_loss
-            / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
-        )
-        chosen_logits_mean = chosen_logits.sum() / (
-            full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
-        )
+        if full_nll_target is not None:
+            chosen_nll_loss = chosen_nll_loss / (full_nll_target[: full_nll_target.shape[0] // 2] != ignore_index).sum()
+        else:
+            chosen_nll_loss = chosen_nll_loss / (full_target[: full_target.shape[0] // 2] != ignore_index).sum()
+
+        chosen_logits_mean = chosen_logits.sum() / (full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0])
         rejected_logits_mean = rejected_logits.sum() / (
             full_target.shape[0] // 2 * input_chunk.shape[1] * weight.shape[0]
         )
@@ -353,16 +399,18 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
                 (
                     ref_chosen_logps,
                     ref_rejected_logps,
-
-
-
+                    _,
+                    _,
+                    _,
                 ) = LigerFusedLinearPreferenceBase.chunk_forward(
-                    input_chunk,
+                    ref_input_chunk,
                     ref_weight,
                     target_chunk,
                     ref_bias,
                     ignore_index=ignore_index,
                     compute_nll_loss=False,  # We don't need NLL loss for the reference model
+                    chosen_nll_target_chunk=None,
+                    average_log_prob=average_log_prob,
                 )
             loss_kwargs["ref_chosen_logps"] = ref_chosen_logps
             loss_kwargs["ref_rejected_logps"] = ref_rejected_logps
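When a reference model is used, its log-probabilities are now computed from the separate `ref_input_chunk` and kept outside the autograd graph. A toy sketch of that pattern (hypothetical tensors, not package code):

    import torch

    T, H, V = 5, 8, 11
    policy_w = torch.randn(V, H, requires_grad=True)
    ref_w = torch.randn(V, H)                         # frozen reference weights
    input_chunk, ref_input_chunk = torch.randn(2, T, H), torch.randn(2, T, H)

    policy_logits = input_chunk @ policy_w.t()        # tracked by autograd
    with torch.no_grad():                             # reference pass contributes no gradients
        ref_logits = ref_input_chunk @ ref_w.t()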
@@ -375,7 +423,7 @@ class LigerFusedLinearPreferenceBase(torch.autograd.Function):
         else:
             preference_loss, aux_outputs = preference_loss_outputs, []

-        loss = alpha * chosen_nll_loss
+        loss = alpha * chosen_nll_loss + preference_loss
         return_vars = (
             chosen_logps,
             rejected_logps,
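With this change the chunk loss is `alpha * chosen_nll_loss + preference_loss`, where `preference_loss` comes from the subclass's `preference_loss_fn` applied to the chosen/rejected log-probabilities. A DPO-flavoured toy version of that last step (illustration only; the packaged `dpo_loss.py` is not shown in this excerpt):

    import torch
    import torch.nn.functional as F

    alpha, beta = 1.0, 0.1
    chosen_logps = torch.tensor([-12.3, -9.8])
    rejected_logps = torch.tensor([-14.1, -9.5])
    chosen_nll_loss = torch.tensor(2.7)

    preference_loss = -F.logsigmoid(beta * (chosen_logps - rejected_logps)).mean()
    loss = alpha * chosen_nll_loss + preference_loss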
liger_kernel/chunked_loss/fused_linear_unpaired_preference.py (new file)

@@ -0,0 +1,246 @@
+from abc import abstractmethod
+from functools import partial
+
+import torch
+
+from torch.nn import functional as F
+
+
+class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
+    @abstractmethod
+    def preference_loss_fn(*args, **kwargs):
+        """
+        To be extended by subclasses.
+        """
+        raise NotImplementedError("Preference loss function must be implemented.")
+
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        target,
+        preference_labels,
+        bias=None,
+        loss_fn=None,
+        chunk_size=1,
+        ignore_index=-100,
+        compiled=True,
+        use_ref_model=False,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        **loss_kwargs,
+    ):
+        """
+        Base class for fused linear layer with unpaired preference loss like KTO
+        Expects _input to be stacked with chosen and rejected inputs on the batch dimension.
+
+        The mental model is:
+
+        forward()
+        ├── Loop over chunks
+        └── compute_loss()
+            ├── chunk_forward()  # Compute logits and log probs
+            └── prefer_loss()  # Calculate preference loss
+
+        Args:
+            _input (torch.Tensor): Input tensor. Shape: (batch_size, seq_len, hidden_size).
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
+            target (torch.Tensor): Target tensor. Shape: (batch_size, seq_len).
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            chunk_size (int): Size of a chunk (# of batches of stacked chosen and rejected inputs).
+            ignore_index (int): Index to ignore for loss computation.
+            beta (float): Weight for the preference loss.
+            compiled (bool): Whether to use torch compile for chunk accumulation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            preference_labels (torch.Tensor): Boolean tensor indicating chosen (True) vs rejected (False) examples.
+                Shape: (batch_size,).
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            loss_kwargs (dict): Other possible arguments that a loss function might need
+        """
+        # TODO: Tune CHUNK_SIZE to fully utilize the GPU
+        CHUNK_SIZE = chunk_size
+
+        # Gradients to be accumulated
+        grad_inputs = []
+        grad_weight = torch.zeros_like(weight)
+        grad_bias = torch.zeros_like(bias) if bias is not None else None
+
+        # Loss to be accumulated
+        loss_acc = torch.zeros((), device=_input.device)
+
+        compute_loss = partial(
+            LigerFusedLinearUnpairedPreferenceBase._compute_loss,
+            preference_loss_fn=loss_fn,
+            full_target=target,
+            ignore_index=ignore_index,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            **loss_kwargs,
+        )
+
+        def fused_fwd_bwd(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk):
+            """
+            Fused forward and backward pass for a chunk of input and target.
+            """
+            argnums = (0, 1, 4) if bias is not None else (0, 1)
+            return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=False)(
+                input_chunk,
+                weight,
+                target_chunk,
+                preference_labels_chunk,
+                bias,
+                ref_input_chunk=ref_input_chunk,
+            )
+
+        def accumulate_chunk(
+            input_chunk,
+            target_chunk,
+            preference_labels_chunk=None,
+            ref_input_chunk=None,
+        ):
+            (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), (chunk_loss) = fused_fwd_bwd(
+                input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk
+            )
+            if bias is not None:
+                grad_bias.add_(chunk_grad_bias[0])  # accumulate bias gradient
+
+            # Accumulate gradients
+            grad_weight.add_(chunk_grad_weight)
+            grad_inputs.append(chunk_grad_input)
+
+            # Accumulate loss
+            loss_acc.add_(chunk_loss)
+
+        if compiled:
+            fused_fwd_bwd = torch.compile(fused_fwd_bwd)
+
+        # When not paired, use labels to separate chosen and rejected
+        assert preference_labels is not None, "preference_labels must be provided for unpaired preference loss"
+
+        chunks = max(1, _input.shape[0] // CHUNK_SIZE)
+        _input_chunks = torch.chunk(_input, chunks=chunks, dim=0)
+        _target_chunks = torch.chunk(target, chunks=chunks, dim=0)
+        _preference_labels_chunks = torch.chunk(preference_labels, chunks=chunks, dim=0)
+
+        if use_ref_model:
+            _ref_input_chunks = torch.chunk(ref_input, chunks=chunks, dim=0)
+
+        for (
+            input_chunk,
+            target_chunk,
+            ref_input_chunk,
+            preference_labels_chunk,
+        ) in zip(
+            _input_chunks,
+            _target_chunks,
+            (_ref_input_chunks if use_ref_model else [None] * len(_input_chunks)),
+            _preference_labels_chunks,
+        ):
+            # mark input_chunk, target_chunk, and target dimension 1 (sequence length) as dynamic to prevent torch.compile recompilation
+            torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(target_chunk, 1)
+            torch._dynamo.mark_dynamic(target, 1)
+            torch._dynamo.mark_dynamic(ref_input_chunk, 1) if use_ref_model else None
+            torch._dynamo.mark_dynamic(preference_labels_chunk, 1)
+
+            # accumulate loss, gradients, and metrics
+            accumulate_chunk(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk)
+
+        ctx.save_for_backward(
+            torch.cat(grad_inputs, dim=0),
+            grad_weight,
+            grad_bias,
+        )
+        return loss_acc
+
+    @staticmethod
+    def backward(ctx, *grad_output):
+        grad_input, grad_weight, grad_bias = ctx.saved_tensors
+        if torch.ne(grad_output[0][0], torch.tensor(1.0, device=grad_output[0][0].device)):
+            grad_input = grad_input * grad_output[0][0]
+            grad_weight = grad_weight * grad_output[0][0]
+            grad_bias = grad_bias * grad_output[0][0] if grad_bias is not None else None
+
+        return grad_input, grad_weight, None, None, grad_bias
+
+    @staticmethod
+    def chunk_forward(
+        input_chunk,
+        weight,
+        target_chunk,
+        bias=None,
+        ignore_index=-100,
+    ):
+        logits_chunk = input_chunk @ weight.t()
+        if bias is not None:
+            logits_chunk = logits_chunk + bias
+        log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
+
+        loss_mask_chunk = target_chunk != ignore_index
+        label_chunk = torch.where(loss_mask_chunk, target_chunk, 0)
+
+        per_token_logps_chunk = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
+        average_log_prob_chunk = (per_token_logps_chunk * loss_mask_chunk).sum(-1) / loss_mask_chunk.sum(-1)
+
+        return average_log_prob_chunk
+
+    @staticmethod
+    def _compute_loss(
+        input_chunk,
+        weight,
+        target_chunk,
+        preference_labels_chunk,
+        bias=None,
+        preference_loss_fn=None,
+        full_target=None,
+        ignore_index=-100,
+        use_ref_model=False,
+        ref_input_chunk=None,
+        ref_weight=None,
+        ref_bias=None,
+        **loss_kwargs,
+    ):
+        """
+        Compute the total loss for a chunk of input and target, while using an alignment/preference loss function.
+        Args:
+            preference_loss_fn (callable): Loss function to compute the loss on a chunk of input/target.
+            input_chunk (torch.Tensor): Chunk of input tensor. Shape: (2 * chunk_size, sequence_length, hidden_size).
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size).
+            target_chunk (torch.Tensor): Chunk of target tensor. Shape: (2 * chunk_size, sequence_length).
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,).
+            full_target (torch.Tensor): Full target tensor. Shape: (batch_size, sequence_length).
+            ignore_index (int): Index to ignore for loss computation.
+            use_ref_model (bool): Whether to use a reference model for the alignment loss.
+            ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
+            ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            loss_kwargs (dict): Additional arguments for the loss function.
+        """
+        average_log_prob_chunk = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
+            input_chunk,
+            weight,
+            target_chunk,
+            bias=bias,
+            ignore_index=ignore_index,
+        )
+
+        if use_ref_model:
+            with torch.no_grad():
+                ref_average_log_prob_chunk = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
+                    ref_input_chunk,
+                    ref_weight,
+                    target_chunk,
+                    ref_bias,
+                    ignore_index=ignore_index,
+                )
+            loss_kwargs["ref_average_log_prob_chunk"] = ref_average_log_prob_chunk
+
+        preference_loss_chunk = preference_loss_fn(
+            average_log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs
+        )
+
+        return preference_loss_chunk
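`LigerFusedLinearUnpairedPreferenceBase._compute_loss` calls `preference_loss_fn(average_log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs)`, injecting `ref_average_log_prob_chunk` into `loss_kwargs` when a reference model is used. A toy, KTO-flavoured loss function with that call signature (illustrative only, not the packaged `kto_loss.py`):

    import torch
    import torch.nn.functional as F

    def toy_unpaired_loss(
        average_log_prob_chunk,                 # (chunk_size,) policy log-probs averaged over completion tokens
        preference_labels_chunk,                # (chunk_size,) bool, True = chosen, False = rejected
        full_target,                            # (batch_size, seq_len), used here only for normalization
        ref_average_log_prob_chunk=None,
        beta=0.1,
    ):
        ref = ref_average_log_prob_chunk if ref_average_log_prob_chunk is not None else 0.0
        log_ratio = beta * (average_log_prob_chunk - ref)
        sign = preference_labels_chunk.float() * 2.0 - 1.0      # +1 for chosen, -1 for rejected
        losses = -F.logsigmoid(sign * log_ratio)
        return losses.sum() / full_target.shape[0]              # per-chunk sums add up to a batch mean

Since the base class simply sums `preference_loss_chunk` across chunks into `loss_acc`, normalizing by the full batch size inside the loss keeps the accumulated value equal to a batch-level mean.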