liger-kernel 0.5.3__py3-none-any.whl → 0.5.5__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/__init__.py +1 -0
- liger_kernel/chunked_loss/cpo_loss.py +51 -11
- liger_kernel/chunked_loss/dpo_loss.py +30 -4
- liger_kernel/chunked_loss/fused_linear_distillation.py +3 -3
- liger_kernel/chunked_loss/fused_linear_preference.py +2 -2
- liger_kernel/chunked_loss/fused_linear_rlhf.py +240 -0
- liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +112 -17
- liger_kernel/chunked_loss/grpo_loss.py +194 -0
- liger_kernel/chunked_loss/jsd_loss.py +31 -6
- liger_kernel/chunked_loss/kto_loss.py +53 -15
- liger_kernel/chunked_loss/orpo_loss.py +37 -5
- liger_kernel/chunked_loss/simpo_loss.py +47 -11
- liger_kernel/ops/cross_entropy.py +7 -3
- liger_kernel/ops/fused_linear_cross_entropy.py +3 -3
- liger_kernel/ops/fused_linear_jsd.py +3 -3
- liger_kernel/ops/jsd.py +3 -3
- liger_kernel/ops/layer_norm.py +20 -7
- liger_kernel/ops/tvd.py +207 -0
- liger_kernel/ops/utils.py +1 -2
- liger_kernel/transformers/__init__.py +4 -0
- liger_kernel/transformers/cross_entropy.py +3 -3
- liger_kernel/transformers/functional.py +17 -0
- liger_kernel/transformers/fused_linear_cross_entropy.py +3 -3
- liger_kernel/transformers/group_norm.py +6 -6
- liger_kernel/transformers/model/olmo2.py +124 -0
- liger_kernel/transformers/model/qwen2_5_vl.py +205 -0
- liger_kernel/transformers/monkey_patch.py +239 -27
- liger_kernel/transformers/tvd.py +13 -0
- liger_kernel/utils.py +48 -1
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/METADATA +19 -4
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/RECORD +35 -29
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/WHEEL +1 -1
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/LICENSE +0 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/NOTICE +0 -0
- {liger_kernel-0.5.3.dist-info → liger_kernel-0.5.5.dist-info}/top_level.txt +0 -0

liger_kernel/chunked_loss/fused_linear_unpaired_preference.py

```diff
@@ -16,13 +16,13 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
 
     @staticmethod
     def forward(
+        cls,
         ctx,
         _input,
         weight,
         target,
         preference_labels,
         bias=None,
-        loss_fn=None,
         chunk_size=1,
         ignore_index=-100,
         compiled=True,
@@ -30,6 +30,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
+        average_log_prob=False,
         **loss_kwargs,
     ):
         """
@@ -59,6 +60,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
                 Shape: (batch_size,).
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
            loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -72,14 +74,22 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         # Loss to be accumulated
         loss_acc = torch.zeros((), device=_input.device)
 
+        # Metrics to be recorded
+        chosen_logps_sum = torch.zeros((), device=_input.device)
+        rejected_logps_sum = torch.zeros((), device=_input.device)
+        chosen_logits_sum = torch.zeros((), device=_input.device)
+        rejected_logits_sum = torch.zeros((), device=_input.device)
+        aggregated_aux_outputs = []
+
         compute_loss = partial(
             LigerFusedLinearUnpairedPreferenceBase._compute_loss,
-            preference_loss_fn=loss_fn,
+            preference_loss_fn=cls.preference_loss_fn,
             full_target=target,
             ignore_index=ignore_index,
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            average_log_prob=average_log_prob,
             **loss_kwargs,
         )
 
@@ -88,7 +98,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             Fused forward and backward pass for a chunk of input and target.
             """
             argnums = (0, 1, 4) if bias is not None else (0, 1)
-            return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=False)(
+            return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=True)(
                 input_chunk,
                 weight,
                 target_chunk,
@@ -103,9 +113,19 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             preference_labels_chunk=None,
             ref_input_chunk=None,
         ):
-            (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias), (chunk_loss) = fused_fwd_bwd(
-                input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk
-            )
+            (
+                (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias),
+                (
+                    chunk_loss,
+                    (
+                        chunk_chosen_logps_sum,
+                        chunk_rejected_logps_sum,
+                        chunk_chosen_logits_sum,
+                        chunk_rejected_logits_sum,
+                        *aux_outputs,
+                    ),
+                ),
+            ) = fused_fwd_bwd(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk)
             if bias is not None:
                 grad_bias.add_(chunk_grad_bias[0])  # accumulate bias gradient
 
@@ -116,6 +136,23 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             # Accumulate loss
             loss_acc.add_(chunk_loss)
 
+            # Accumulate metrics
+            chosen_logps_sum.add_(chunk_chosen_logps_sum)
+            rejected_logps_sum.add_(chunk_rejected_logps_sum)
+            chosen_logits_sum.add_(chunk_chosen_logits_sum)
+            rejected_logits_sum.add_(chunk_rejected_logits_sum)
+
+            # aux_outputs
+            # Initialize storage for aux_outputs
+            if len(aggregated_aux_outputs) == 0:
+                for aux in aux_outputs:
+                    aggregated_aux_outputs.append(torch.zeros((), device=aux.device))
+
+            # Process each aux_output
+            for i, aux in enumerate(aux_outputs):
+                if aux.ndim == 0:
+                    aggregated_aux_outputs[i].add_(aux)
+
         if compiled:
             fused_fwd_bwd = torch.compile(fused_fwd_bwd)
 
@@ -151,12 +188,25 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             # accumulate loss, gradients, and metrics
             accumulate_chunk(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk)
 
+        # Aggregate aux outputs lists into tensors
+        for i, aux in enumerate(aggregated_aux_outputs):
+            if isinstance(aux, list):
+                aggregated_aux_outputs[i] = torch.cat(aux, dim=0)
+
         ctx.save_for_backward(
             torch.cat(grad_inputs, dim=0),
             grad_weight,
             grad_bias,
         )
-        return loss_acc
+
+        return_vars = (
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        )
+
+        return loss_acc, (*return_vars, *aggregated_aux_outputs)
 
     @staticmethod
     def backward(ctx, *grad_output):
@@ -173,21 +223,37 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         input_chunk,
         weight,
         target_chunk,
+        preference_labels_chunk,
         bias=None,
         ignore_index=-100,
+        average_log_prob=False,
     ):
         logits_chunk = input_chunk @ weight.t()
         if bias is not None:
             logits_chunk = logits_chunk + bias
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-
         loss_mask_chunk = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask_chunk, target_chunk, 0)
 
         per_token_logps_chunk = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
-        average_log_prob_chunk = (per_token_logps_chunk * loss_mask_chunk).sum(-1) / loss_mask_chunk.sum(-1)
-
-        return average_log_prob_chunk
+        if average_log_prob:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1) / loss_mask_chunk.sum(-1)
+        else:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1)
+
+        chosen_logps_sum = (log_probs * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logps_sum = (log_probs * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        chosen_logits_sum = (logits_chunk * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logits_sum = (logits_chunk * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        return (
+            log_probs,
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        )
 
     @staticmethod
     def _compute_loss(
@@ -203,6 +269,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
+        average_log_prob=False,
         **loss_kwargs,
     ):
         """
@@ -218,29 +285,57 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-        average_log_prob_chunk = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
+        (
+            log_prob_chunk,
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
             input_chunk,
             weight,
             target_chunk,
+            preference_labels_chunk,
             bias=bias,
             ignore_index=ignore_index,
+            average_log_prob=average_log_prob,
         )
 
         if use_ref_model:
             with torch.no_grad():
-                ref_average_log_prob_chunk = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
+                (
+                    ref_log_prob_chunk,
+                    _,
+                    _,
+                    _,
+                    _,
+                ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
                     ref_input_chunk,
                     ref_weight,
                     target_chunk,
+                    preference_labels_chunk,
                     ref_bias,
                     ignore_index=ignore_index,
+                    average_log_prob=average_log_prob,
                 )
-            loss_kwargs["ref_average_log_prob_chunk"] = ref_average_log_prob_chunk
+            loss_kwargs["ref_log_prob_chunk"] = ref_log_prob_chunk
 
-        preference_loss_chunk = preference_loss_fn(
-            average_log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs
+        preference_loss_outputs = preference_loss_fn(
+            log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs
+        )
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss_chunk, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss_chunk, aux_outputs = preference_loss_outputs, []
+
+        return_vars = (
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
         )
 
-        return preference_loss_chunk
+        return preference_loss_chunk, (*return_vars, *aux_outputs)
```
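
The net effect of the `fused_linear_unpaired_preference.py` changes above is that `forward` now returns `(loss, (chosen_logps_sum, rejected_logps_sum, chosen_logits_sum, rejected_logits_sum, *aux_outputs))` instead of a bare loss, and `chunk_forward` can either sum or average the per-token log-probabilities via the new `average_log_prob` flag. A minimal plain-PyTorch sketch of that per-chunk bookkeeping (shapes and values are invented for illustration, boolean indexing stands in for the masked sums, and nothing here is fused or chunked):

```python
import torch
import torch.nn.functional as F

# Invented chunk: 4 sequences, 6 tokens, vocab of 10; last two tokens ignored.
logits = torch.randn(4, 6, 10)
target = torch.randint(0, 10, (4, 6))
target[:, -2:] = -100
preference_labels = torch.tensor([True, False, True, False])
average_log_prob = True

log_probs = F.log_softmax(logits.float(), dim=-1)
loss_mask = target != -100
labels = torch.where(loss_mask, target, 0)
per_token_logps = log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)

# Per-sequence log-prob: averaged over non-masked tokens, or a plain sum.
if average_log_prob:
    seq_logps = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)
else:
    seq_logps = (per_token_logps * loss_mask).sum(-1)

# The new metrics split the per-sequence values by the unpaired preference label.
chosen_logps_sum = seq_logps[preference_labels].sum()
rejected_logps_sum = seq_logps[~preference_labels].sum()
print(chosen_logps_sum.item(), rejected_logps_sum.item())
```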

liger_kernel/chunked_loss/grpo_loss.py (new file)

```diff
@@ -0,0 +1,194 @@
+import torch
+
+from liger_kernel.chunked_loss.fused_linear_rlhf import LigerFusedLinearRLHFBase
+
+
+class LigerFusedLinearGRPOFunction(LigerFusedLinearRLHFBase):
+    @staticmethod
+    def rlhf_loss_fn(
+        log_probs,
+        attention_mask,
+        rewards,
+        ref_log_probs=None,
+        beta=0.1,
+        **kwargs,
+    ):
+        """GRPO Loss Function matching GRPOTrainer implementation."""
+        # Get chosen token probabilities
+        chosen_tokens = log_probs.argmax(dim=-1)  # (batch_size, seq_len)
+        chosen_token_logprobs = log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(
+            -1
+        )  # (batch_size, seq_len)
+
+        # Get reference model probabilities
+        if ref_log_probs is not None:
+            with torch.no_grad():
+                ref_token_logprobs = ref_log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(-1)
+        else:
+            ref_token_logprobs = chosen_token_logprobs.detach()
+
+        # Compute advantages per batch entry in a grouped fashion
+        mean_grouped_rewards = rewards.mean()  # [batch_size,]
+        std_grouped_rewards = rewards.std()  # [batch_size,]
+
+        # Calculate advantages using the same epsilon as in GRPOTrainer
+        eps = 1e-4
+        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + eps)
+
+        # Compute policy gradient loss with importance sampling ratio
+        ratio = torch.exp(chosen_token_logprobs - chosen_token_logprobs.detach())
+        policy_loss = -ratio * advantages.unsqueeze(1)
+
+        # Compute KL penalty
+        kl_div = (
+            torch.exp(ref_token_logprobs - chosen_token_logprobs) - (ref_token_logprobs - chosen_token_logprobs) - 1.0
+        )
+
+        # Combine losses
+        per_token_loss = policy_loss + beta * kl_div
+
+        # Apply masking and normalize
+        masked_loss = per_token_loss * attention_mask
+        seq_lengths = attention_mask.sum()
+        seq_lengths = torch.clamp(seq_lengths, min=1.0)
+        loss = masked_loss.sum() / seq_lengths
+
+        # Calculate metrics
+        metrics = (
+            chosen_token_logprobs.mean(),  # mean log prob
+            chosen_token_logprobs.std(),  # std log prob
+            log_probs.mean(),  # mean all log probs
+            ((kl_div * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)).mean(),  # mean KL div
+        )
+
+        return loss, metrics
+
+    @classmethod
+    def forward(
+        cls,
+        ctx,
+        _input,
+        weight,
+        attention_mask,
+        rewards,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        beta=0.1,
+        compiled=True,
+        use_ref_model=True,
+        num_generations=1,
+        chunk_size=1,
+    ):
+        """
+        Fused linear layer with GRPO loss.
+        Args:
+            _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size)
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size)
+            attention_mask (torch.Tensor): Attention mask tensor. Shape: (batch_size, seq_len)
+            rewards (torch.Tensor): Rewards tensor. Shape: (batch_size,)
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,)
+            ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size * seq_len, hidden_size)
+            ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
+            ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
+            beta (float): Weight for the KL penalty
+            compiled (bool): Whether to use torch compile
+            use_ref_model (bool): Whether to use a reference model
+            num_generations (int): Number of generations per prompt
+            chunk_size (int): Size of chunks for processing.
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            attention_mask=attention_mask,
+            rewards=rewards,
+            bias=bias,
+            ref_input=ref_input,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            beta=beta,
+            compiled=compiled,
+            use_ref_model=use_ref_model,
+            num_generations=num_generations,
+            chunk_size=chunk_size,
+        )
+
+    @staticmethod
+    def backward(ctx, grad_output, *grad_metrics):
+        """Backward pass for GRPO loss.
+
+        Args:
+            grad_output: Gradient of the loss (scalar)
+            grad_metrics: Gradients of the metrics (not used in backward computation)
+        """
+        grads = LigerFusedLinearRLHFBase.backward(ctx, grad_output)
+        return (
+            *grads[:5],  # grad_input, grad_weight, grad_attention_mask, grad_rewards, grad_bias
+            None,  # grad_ref_input
+            None,  # grad_ref_weight
+            None,  # grad_ref_bias
+            None,  # grad_beta
+            None,  # grad_compiled
+            None,  # grad_use_ref_model
+            None,  # grad_num_generations
+            None,  # grad_chunk_size
+        )
+
+
+class LigerFusedLinearGRPOLoss(torch.nn.Module):
+    """Fused linear layer with GRPO loss."""
+
+    def __init__(
+        self,
+        beta: float = 0.1,
+        compiled: bool = True,
+        use_ref_model: bool = True,
+        num_generations: int = 1,
+        chunk_size: int = 1,
+    ):
+        """
+        Args:
+            beta (float): Weight for the KL penalty.
+            compiled (bool): Whether to use torch compile.
+            use_ref_model (bool): Whether to use a reference model.
+            num_generations (int): Number of generations per prompt.
+            chunk_size (int): Size of chunks for processing.
+        """
+        super().__init__()
+        self.beta = beta
+        self.compiled = compiled
+        self.use_ref_model = use_ref_model
+        self.num_generations = num_generations
+        self.chunk_size = chunk_size
+
+    def forward(
+        self,
+        _input,
+        lin_weight,
+        attention_mask,
+        rewards,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+    ):
+        return LigerFusedLinearGRPOFunction.apply(
+            _input,
+            lin_weight,
+            attention_mask,
+            rewards,
+            bias,
+            ref_input,
+            ref_weight,
+            ref_bias,
+            self.beta,
+            self.compiled,
+            self.use_ref_model,
+            self.num_generations,
+            self.chunk_size,
+        )
```
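
The `rlhf_loss_fn` above is self-contained tensor math, so it can be sanity-checked outside the fused path. Below is a small stand-alone sketch of the same computation on dummy tensors (the shapes, values, and single-group setup are invented for illustration; the real class runs this chunked through `LigerFusedLinearRLHFBase`):

```python
import torch

# One group of 4 generations, 6 tokens each, vocab of 10 (invented sizes).
log_probs = torch.log_softmax(torch.randn(4, 6, 10), dim=-1)
ref_log_probs = torch.log_softmax(torch.randn(4, 6, 10), dim=-1)
attention_mask = torch.ones(4, 6)
rewards = torch.tensor([0.2, 1.0, 0.5, 0.0])
beta = 0.1

chosen_tokens = log_probs.argmax(dim=-1)
chosen_token_logprobs = log_probs.gather(-1, chosen_tokens.unsqueeze(-1)).squeeze(-1)
ref_token_logprobs = ref_log_probs.gather(-1, chosen_tokens.unsqueeze(-1)).squeeze(-1)

# Group-relative advantages: rewards standardized within the group.
advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-4)

# Policy-gradient term (the importance ratio is exactly 1 in this one-step form)
# plus the KL penalty against the reference model, as in rlhf_loss_fn above.
ratio = torch.exp(chosen_token_logprobs - chosen_token_logprobs.detach())
policy_loss = -ratio * advantages.unsqueeze(1)
kl_div = torch.exp(ref_token_logprobs - chosen_token_logprobs) - (ref_token_logprobs - chosen_token_logprobs) - 1.0

per_token_loss = policy_loss + beta * kl_div
loss = (per_token_loss * attention_mask).sum() / attention_mask.sum().clamp(min=1.0)
print(loss.item())
```

In use, `LigerFusedLinearGRPOLoss(beta=0.1, use_ref_model=True)` would typically be called with the model's hidden states as `_input`, the `lm_head` weight as `lin_weight`, the attention mask and per-generation rewards, plus the reference-model inputs when `use_ref_model` is enabled, following the `forward` signature above.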

liger_kernel/chunked_loss/jsd_loss.py

```diff
@@ -30,20 +30,24 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
         jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
         return jsd_loss
 
-    @staticmethod
+    @classmethod
     def forward(
+        cls,
         ctx,
         student_input: torch.Tensor,
         student_weight: torch.Tensor,
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
         weight_hard_loss: float = 0.5,
         weight_soft_loss: float = 0.5,
         beta: float = 0.5,
         ignore_index: int = -100,
         temperature: float = 1.0,
         compiled: bool = True,
+        chunk_size: int = 1024,
     ):
         """
         Fused linear layer with JSD distillation loss.
@@ -59,18 +63,21 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             ignore_index (int): Index to ignore in loss computation
             temperature (float): Temperature for softening/sharpening distributions
             compiled (bool): Whether to use torch compile
+            chunk_size (int): Size of chunks for processing.
         Returns:
             torch.Tensor: Computed loss
         """
-        return LigerFusedLinearDistillationBase.forward(
+        return super().forward(
+            cls=cls,
             ctx=ctx,
             student_input=student_input,
             student_weight=student_weight,
             teacher_input=teacher_input,
             teacher_weight=teacher_weight,
             target=true_labels,
-
-
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
             weight_hard_loss=weight_hard_loss,
             weight_soft_loss=weight_soft_loss,
             beta=beta,
@@ -81,9 +88,19 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
 
     @staticmethod
     def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:5]
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]
 
-        return (*grads, None, None, None, None, None, None)
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+        )
 
 
 class LigerFusedLinearJSDLoss(torch.nn.Module):
@@ -99,6 +116,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         ignore_index: int = -100,
         temperature: float = 1.0,
         compiled: bool = True,
+        chunk_size: int = 1024,
     ):
         """
         Args:
@@ -108,6 +126,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             temperature (float): Temperature for softening distributions
            compiled (bool): Whether to use torch compile
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+            chunk_size (int): Size of chunks for processing.
         """
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -117,6 +136,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         self.temperature = temperature
         self.compiled = compiled
         self.beta = beta
+        self.chunk_size = chunk_size
 
     def forward(
         self,
@@ -125,6 +145,8 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
     ) -> torch.Tensor:
         """
         Compute the JSD distillation loss.
@@ -145,10 +167,13 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             teacher_input,
             teacher_weight,
             true_labels,
+            student_bias,
+            teacher_bias,
             self.weight_hard_loss,
             self.weight_soft_loss,
             self.beta,
             self.ignore_index,
             self.temperature,
             self.compiled,
+            self.chunk_size,
         )
```