liger-kernel 0.5.4__py3-none-any.whl → 0.5.6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- liger_kernel/chunked_loss/cpo_loss.py +51 -11
- liger_kernel/chunked_loss/dpo_loss.py +30 -4
- liger_kernel/chunked_loss/functional.py +2 -0
- liger_kernel/chunked_loss/fused_linear_distillation.py +20 -5
- liger_kernel/chunked_loss/fused_linear_ppo.py +331 -0
- liger_kernel/chunked_loss/fused_linear_preference.py +2 -2
- liger_kernel/chunked_loss/fused_linear_unpaired_preference.py +112 -17
- liger_kernel/chunked_loss/grpo_loss.py +137 -61
- liger_kernel/chunked_loss/jsd_loss.py +43 -13
- liger_kernel/chunked_loss/kto_loss.py +50 -12
- liger_kernel/chunked_loss/orpo_loss.py +37 -5
- liger_kernel/chunked_loss/simpo_loss.py +47 -11
- liger_kernel/ops/cross_entropy.py +7 -2
- liger_kernel/ops/dyt.py +225 -0
- liger_kernel/ops/fused_linear_jsd.py +2 -1
- liger_kernel/ops/jsd.py +30 -11
- liger_kernel/ops/kl_div.py +2 -2
- liger_kernel/transformers/__init__.py +4 -0
- liger_kernel/transformers/dyt.py +20 -0
- liger_kernel/transformers/functional.py +5 -0
- liger_kernel/transformers/model/gemma.py +8 -16
- liger_kernel/transformers/model/gemma2.py +7 -16
- liger_kernel/transformers/model/llama.py +8 -15
- liger_kernel/transformers/model/llava.py +369 -0
- liger_kernel/transformers/model/loss_utils.py +57 -0
- liger_kernel/transformers/model/mistral.py +9 -10
- liger_kernel/transformers/model/mixtral.py +8 -15
- liger_kernel/transformers/model/mllama.py +8 -15
- liger_kernel/transformers/model/olmo2.py +8 -16
- liger_kernel/transformers/model/paligemma.py +397 -0
- liger_kernel/transformers/model/phi3.py +8 -15
- liger_kernel/transformers/model/qwen2.py +8 -15
- liger_kernel/transformers/model/qwen2_5_vl.py +204 -0
- liger_kernel/transformers/model/qwen2_vl.py +9 -10
- liger_kernel/transformers/monkey_patch.py +286 -12
- liger_kernel/utils.py +1 -3
- {liger_kernel-0.5.4.dist-info → liger_kernel-0.5.6.dist-info}/METADATA +11 -7
- liger_kernel-0.5.6.dist-info/RECORD +80 -0
- {liger_kernel-0.5.4.dist-info → liger_kernel-0.5.6.dist-info}/WHEEL +1 -1
- liger_kernel/chunked_loss/fused_linear_rlhf.py +0 -213
- liger_kernel-0.5.4.dist-info/RECORD +0 -74
- {liger_kernel-0.5.4.dist-info → liger_kernel-0.5.6.dist-info/licenses}/LICENSE +0 -0
- {liger_kernel-0.5.4.dist-info → liger_kernel-0.5.6.dist-info/licenses}/NOTICE +0 -0
- {liger_kernel-0.5.4.dist-info → liger_kernel-0.5.6.dist-info}/top_level.txt +0 -0
liger_kernel/chunked_loss/fused_linear_unpaired_preference.py

@@ -16,13 +16,13 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):

     @staticmethod
     def forward(
+        cls,
         ctx,
         _input,
         weight,
         target,
         preference_labels,
         bias=None,
-        loss_fn=None,
         chunk_size=1,
         ignore_index=-100,
         compiled=True,
@@ -30,6 +30,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
+        average_log_prob=False,
         **loss_kwargs,
     ):
         """
@@ -59,6 +60,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
                 Shape: (batch_size,).
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
             loss_kwargs (dict): Other possible arguments that a loss function might need
         """
         # TODO: Tune CHUNK_SIZE to fully utilize the GPU
@@ -72,14 +74,22 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         # Loss to be accumulated
         loss_acc = torch.zeros((), device=_input.device)

+        # Metrics to be recorded
+        chosen_logps_sum = torch.zeros((), device=_input.device)
+        rejected_logps_sum = torch.zeros((), device=_input.device)
+        chosen_logits_sum = torch.zeros((), device=_input.device)
+        rejected_logits_sum = torch.zeros((), device=_input.device)
+        aggregated_aux_outputs = []
+
         compute_loss = partial(
             LigerFusedLinearUnpairedPreferenceBase._compute_loss,
-            preference_loss_fn=
+            preference_loss_fn=cls.preference_loss_fn,
             full_target=target,
             ignore_index=ignore_index,
             use_ref_model=use_ref_model,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
+            average_log_prob=average_log_prob,
             **loss_kwargs,
         )

@@ -88,7 +98,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             Fused forward and backward pass for a chunk of input and target.
             """
             argnums = (0, 1, 4) if bias is not None else (0, 1)
-            return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=
+            return torch.func.grad_and_value(compute_loss, argnums=argnums, has_aux=True)(
                 input_chunk,
                 weight,
                 target_chunk,
@@ -103,9 +113,19 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             preference_labels_chunk=None,
             ref_input_chunk=None,
         ):
-            (
-
-
+            (
+                (chunk_grad_input, chunk_grad_weight, *chunk_grad_bias),
+                (
+                    chunk_loss,
+                    (
+                        chunk_chosen_logps_sum,
+                        chunk_rejected_logps_sum,
+                        chunk_chosen_logits_sum,
+                        chunk_rejected_logits_sum,
+                        *aux_outputs,
+                    ),
+                ),
+            ) = fused_fwd_bwd(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk)
             if bias is not None:
                 grad_bias.add_(chunk_grad_bias[0])  # accumulate bias gradient

@@ -116,6 +136,23 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             # Accumulate loss
             loss_acc.add_(chunk_loss)

+            # Accumulate metrics
+            chosen_logps_sum.add_(chunk_chosen_logps_sum)
+            rejected_logps_sum.add_(chunk_rejected_logps_sum)
+            chosen_logits_sum.add_(chunk_chosen_logits_sum)
+            rejected_logits_sum.add_(chunk_rejected_logits_sum)
+
+            # aux_outputs
+            # Initialize storage for aux_outputs
+            if len(aggregated_aux_outputs) == 0:
+                for aux in aux_outputs:
+                    aggregated_aux_outputs.append(torch.zeros((), device=aux.device))
+
+            # Process each aux_output
+            for i, aux in enumerate(aux_outputs):
+                if aux.ndim == 0:
+                    aggregated_aux_outputs[i].add_(aux)
+
         if compiled:
             fused_fwd_bwd = torch.compile(fused_fwd_bwd)

@@ -151,12 +188,25 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             # accumulate loss, gradients, and metrics
             accumulate_chunk(input_chunk, target_chunk, preference_labels_chunk, ref_input_chunk)

+        # Aggregate aux outputs lists into tensors
+        for i, aux in enumerate(aggregated_aux_outputs):
+            if isinstance(aux, list):
+                aggregated_aux_outputs[i] = torch.cat(aux, dim=0)
+
         ctx.save_for_backward(
             torch.cat(grad_inputs, dim=0),
             grad_weight,
             grad_bias,
         )
-
+
+        return_vars = (
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        )
+
+        return loss_acc, (*return_vars, *aggregated_aux_outputs)

     @staticmethod
     def backward(ctx, *grad_output):
@@ -173,21 +223,37 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         input_chunk,
         weight,
         target_chunk,
+        preference_labels_chunk,
         bias=None,
         ignore_index=-100,
+        average_log_prob=False,
     ):
         logits_chunk = input_chunk @ weight.t()
         if bias is not None:
             logits_chunk = logits_chunk + bias
         log_probs_chunk = F.log_softmax(logits_chunk.float(), dim=-1)
-
         loss_mask_chunk = target_chunk != ignore_index
         label_chunk = torch.where(loss_mask_chunk, target_chunk, 0)

         per_token_logps_chunk = log_probs_chunk.gather(-1, label_chunk.unsqueeze(-1)).squeeze(-1)
-
-
-
+        if average_log_prob:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1) / loss_mask_chunk.sum(-1)
+        else:
+            log_probs = (per_token_logps_chunk * loss_mask_chunk).sum(-1)
+
+        chosen_logps_sum = (log_probs * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logps_sum = (log_probs * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        chosen_logits_sum = (logits_chunk * preference_labels_chunk.unsqueeze(1)).sum()
+        rejected_logits_sum = (logits_chunk * (~preference_labels_chunk).unsqueeze(1)).sum()
+
+        return (
+            log_probs,
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        )

     @staticmethod
     def _compute_loss(
@@ -203,6 +269,7 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
         ref_input_chunk=None,
         ref_weight=None,
         ref_bias=None,
+        average_log_prob=False,
         **loss_kwargs,
     ):
         """
@@ -218,29 +285,57 @@ class LigerFusedLinearUnpairedPreferenceBase(torch.autograd.Function):
             use_ref_model (bool): Whether to use a reference model for the alignment loss.
             ref_weight (torch.Tensor): Reference weight tensor. Shape: (vocab_size, hidden_size).
             ref_bias (torch.Tensor, optional): Reference bias tensor. Shape: (vocab_size,).
+            average_log_prob (bool): Whether to average the log probability per non-masked token.
             loss_kwargs (dict): Additional arguments for the loss function.
         """
-
+        (
+            log_prob_chunk,
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
+        ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
             input_chunk,
             weight,
             target_chunk,
+            preference_labels_chunk,
             bias=bias,
             ignore_index=ignore_index,
+            average_log_prob=average_log_prob,
         )

         if use_ref_model:
             with torch.no_grad():
-
+                (
+                    ref_log_prob_chunk,
+                    _,
+                    _,
+                    _,
+                    _,
+                ) = LigerFusedLinearUnpairedPreferenceBase.chunk_forward(
                     ref_input_chunk,
                     ref_weight,
                     target_chunk,
+                    preference_labels_chunk,
                     ref_bias,
                     ignore_index=ignore_index,
+                    average_log_prob=average_log_prob,
                 )
-            loss_kwargs["
+            loss_kwargs["ref_log_prob_chunk"] = ref_log_prob_chunk

-
-
+        preference_loss_outputs = preference_loss_fn(
+            log_prob_chunk, preference_labels_chunk, full_target, **loss_kwargs
+        )
+        if isinstance(preference_loss_outputs, tuple):
+            preference_loss_chunk, *aux_outputs = preference_loss_outputs
+        else:
+            preference_loss_chunk, aux_outputs = preference_loss_outputs, []
+
+        return_vars = (
+            chosen_logps_sum,
+            rejected_logps_sum,
+            chosen_logits_sum,
+            rejected_logits_sum,
         )

-        return preference_loss_chunk
+        return preference_loss_chunk, (*return_vars, *aux_outputs)
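The new `average_log_prob` flag changes how per-sequence log-probabilities are reduced before `preference_loss_fn` sees them. A minimal standalone sketch of the two reductions on toy tensors (this mirrors the masked sum in `chunk_forward` above, not the fused kernel itself):

```python
import torch

# Toy per-token log-probs for 2 sequences of 4 tokens; False marks ignored (padded) positions.
per_token_logps = torch.tensor([[-0.5, -1.0, -0.2, -0.3],
                                [-0.1, -0.4, -0.6, -0.9]])
loss_mask = torch.tensor([[True, True, True, False],
                          [True, True, False, False]])

# average_log_prob=False: summed log-prob of the non-masked tokens, per sequence
summed_logps = (per_token_logps * loss_mask).sum(-1)

# average_log_prob=True: mean log-prob per non-masked token, per sequence
averaged_logps = (per_token_logps * loss_mask).sum(-1) / loss_mask.sum(-1)

print(summed_logps)    # tensor([-1.7000, -0.5000])
print(averaged_logps)  # tensor([-0.5667, -0.2500])
```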
liger_kernel/chunked_loss/grpo_loss.py

@@ -1,99 +1,143 @@
 import torch

-from liger_kernel.chunked_loss.
+from liger_kernel.chunked_loss.fused_linear_ppo import LigerFusedLinearPPOBase


-
+def k3_loss_fn(log_p, log_q):
+    # computes k3 estimate of KL[q, p]
+    # ref: http://joschu.net/blog/kl-approx.html
+    return torch.exp(log_p - log_q) - (log_p - log_q) - 1.0
+
+
+def clip_coef_fn(coef, epsilon_low, epsilon_high):
+    return torch.clamp(coef, 1 - epsilon_low, 1 + epsilon_high)
+
+
+class LigerFusedLinearGRPOFunction(LigerFusedLinearPPOBase):
     @staticmethod
-    def
+    def ppo_loss_fn(
         log_probs,
+        selected_token_ids,
         attention_mask,
-
-
-
+        advantages,
+        full_attention_mask,
+        ref_per_token_logps=None,  # shape: [chunk_size, seq_len]
+        old_per_token_logps=None,
+        ref_log_probs=None,  # used when ref_per_token_logps is None (shape: [chunk_size, seq_len, vocab_size])
+        epsilon_low=0.2,
+        epsilon_high=0.2,
+        beta=0.04,
         **kwargs,
     ):
         """GRPO Loss Function matching GRPOTrainer implementation."""
-
-        chosen_tokens = log_probs.argmax(dim=-1)  # (batch_size, seq_len)
-        chosen_token_logprobs = log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(
+        per_token_logps = log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(
             -1
         )  # (batch_size, seq_len)

         # Get reference model probabilities
-        if
-
-
-
-
-
-
-
-        std_grouped_rewards = rewards.std()  # [batch_size,]
-
-        # Calculate advantages using the same epsilon as in GRPOTrainer
-        eps = 1e-4
-        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + eps)
+        if ref_per_token_logps is None:
+            if ref_log_probs is not None:
+                with torch.no_grad():
+                    ref_per_token_logps = ref_log_probs.gather(dim=-1, index=selected_token_ids.unsqueeze(-1)).squeeze(
+                        -1
+                    )
+            else:
+                ref_per_token_logps = per_token_logps.detach()

         # Compute policy gradient loss with importance sampling ratio
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        old_per_token_logps = old_per_token_logps if old_per_token_logps is not None else per_token_logps.detach()
+        coef_1 = torch.exp(per_token_logps - old_per_token_logps)
+        coef_2 = clip_coef_fn(coef_1, epsilon_low, epsilon_high)
+        per_token_loss1 = coef_1 * advantages.unsqueeze(1)
+        per_token_loss2 = coef_2 * advantages.unsqueeze(1)
+        per_token_loss = -torch.min(per_token_loss1, per_token_loss2)
+        if beta != 0.0:
+            # Compute KL penalty (approximates KL[per_token_logps, ref_per_token_logps])
+            kl_div = k3_loss_fn(ref_per_token_logps, per_token_logps)
+            # Combine losses
+            per_token_loss = per_token_loss + beta * kl_div
+
+        # Note: We normalize by the number of tokens in the batch (using full_attention_mask),
+        # which is consistent with the DAPO loss implementation (https://arxiv.org/html/2503.14476v1)
+        # and TRL GRPO implementation
+        # (https://github.com/huggingface/trl/blob/e751a16df56e70190fb94bed4a2035eec3303777/trl/trainer/grpo_trainer.py#L966)
+        loss = (per_token_loss * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)

         # Calculate metrics
-        metrics =
-
-
-
-            (
+        metrics = []
+        if beta != 0.0:
+            metrics.append(((kl_div * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0)))
+        is_clipped = ((coef_1 < 1 - epsilon_low) & (advantages.unsqueeze(1) < 0)) | (
+            (coef_1 > 1 + epsilon_high) & (advantages.unsqueeze(1) > 0)
         )
-
+        metrics.append((is_clipped * attention_mask).sum() / torch.clamp(full_attention_mask.sum(), min=1.0))
         return loss, metrics

-    @
+    @classmethod
     def forward(
+        cls,
         ctx,
         _input,
         weight,
+        selected_token_ids,
         attention_mask,
-
+        advantages,
         bias=None,
+        ref_per_token_logps=None,
+        old_per_token_logps=None,
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
-        beta=0.
+        beta=0.04,
+        epsilon_low=0.2,
+        epsilon_high=0.2,
+        temperature=1.0,
         compiled=True,
         use_ref_model=True,
-
+        chunk_size=1,
     ):
-
+        """
+        Fused linear layer with GRPO loss.
+        Args:
+            _input (torch.Tensor): Input tensor. Shape: (batch_size * seq_len, hidden_size)
+            weight (torch.Tensor): Weight tensor. Shape: (vocab_size, hidden_size)
+            selected_token_ids (torch.Tensor): Selected token ids tensor. Shape: (batch_size, seq_len)
+            attention_mask (torch.Tensor): Attention mask tensor. Shape: (batch_size, seq_len)
+            advantages (torch.Tensor): Advantages tensor. Shape: (batch_size,)
+            bias (torch.Tensor, optional): Bias tensor. Shape: (vocab_size,)
+            ref_per_token_logps: Reference model log probs per token tensor. Shape:(batch_size, seq_len)
+            ref_input (torch.Tensor, optional): Reference model input tensor. Shape: (batch_size * seq_len, hidden_size)
+            ref_weight (torch.Tensor, optional): Reference model weight tensor. Shape: (vocab_size, hidden_size)
+            ref_bias (torch.Tensor, optional): Reference model bias tensor. Shape: (vocab_size,)
+            beta (float): Weight for the KL penalty
+            temperature (float): Temperature for the logits
+            compiled (bool): Whether to use torch compile
+            use_ref_model (bool): Whether to use a reference model
+            chunk_size (int): Size of chunks for processing.
+        Returns:
+            torch.Tensor: Computed loss
+        """
+        return super().forward(
+            cls=cls,
             ctx=ctx,
             _input=_input,
             weight=weight,
+            selected_token_ids=selected_token_ids,
             attention_mask=attention_mask,
-
-            rewards=rewards,
+            advantages=advantages,
             bias=bias,
+            ref_per_token_logps=ref_per_token_logps,
+            old_per_token_logps=old_per_token_logps,
             ref_input=ref_input,
             ref_weight=ref_weight,
             ref_bias=ref_bias,
             beta=beta,
+            epsilon_low=epsilon_low,
+            epsilon_high=epsilon_high,
+            temperature=temperature,
             compiled=compiled,
             use_ref_model=use_ref_model,
-
+            chunk_size=chunk_size,
         )

     @staticmethod
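For reference, the per-token objective the new `ppo_loss_fn` builds can be restated on plain tensors. A minimal sketch of the clipped importance-sampling surrogate plus the k3 KL penalty (toy shapes only; the fused path additionally sums and normalizes by `full_attention_mask` across chunks):

```python
import torch

per_token_logps = torch.randn(2, 5)                       # current policy log-probs of the sampled tokens
old_per_token_logps = per_token_logps + 0.05 * torch.randn(2, 5)
ref_per_token_logps = per_token_logps - 0.02 * torch.randn(2, 5)
advantages = torch.tensor([0.7, -0.3])                     # one advantage per sequence
attention_mask = torch.ones(2, 5)
epsilon_low = epsilon_high = 0.2
beta = 0.04

coef_1 = torch.exp(per_token_logps - old_per_token_logps)         # importance ratio
coef_2 = torch.clamp(coef_1, 1 - epsilon_low, 1 + epsilon_high)   # clipped ratio
per_token_loss = -torch.min(coef_1 * advantages.unsqueeze(1),
                            coef_2 * advantages.unsqueeze(1))
if beta != 0.0:
    # k3 estimator of KL[policy || ref], as in k3_loss_fn above
    kl_div = torch.exp(ref_per_token_logps - per_token_logps) - (ref_per_token_logps - per_token_logps) - 1.0
    per_token_loss = per_token_loss + beta * kl_div

loss = (per_token_loss * attention_mask).sum() / attention_mask.sum().clamp(min=1.0)
```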
@@ -104,16 +148,23 @@ class LigerFusedLinearGRPOFunction(LigerFusedLinearRLHFBase):
             grad_output: Gradient of the loss (scalar)
             grad_metrics: Gradients of the metrics (not used in backward computation)
         """
-        grads =
+        grads = LigerFusedLinearPPOBase.backward(ctx, grad_output)
         return (
-            *grads[
+            *grads[
+                :6
+            ],  # grad_input, grad_weight, grad_selected_token_ids, grad_attention_mask, grad_advantages, grad_bias
+            None,  # grad_ref_per_token_logps
+            None,  # grad_old_per_token_logps
             None,  # grad_ref_input
             None,  # grad_ref_weight
             None,  # grad_ref_bias
             None,  # grad_beta
+            None,  # grad_epsilon_low
+            None,  # grad_epsilon_high
+            None,  # grad_temperature
             None,  # grad_compiled
             None,  # grad_use_ref_model
-            None,  #
+            None,  # grad_chunk_size
         )


@@ -122,24 +173,43 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):

     def __init__(
         self,
-        beta: float = 0.
+        beta: float = 0.04,
         compiled: bool = True,
         use_ref_model: bool = True,
-
+        chunk_size: int = 1,
+        epsilon_low: float = 0.2,
+        epsilon_high: float = 0.2,
+        temperature: float = 1.0,
     ):
+        """
+        Args:
+            beta (float): Weight for the KL penalty.
+            compiled (bool): Whether to use torch compile.
+            use_ref_model (bool): Whether to use a reference model.
+            chunk_size (int): Size of chunks for processing.
+            epsilon_low (float): Lower bound for the importance sampling ratio.
+            epsilon_high (float): Upper bound for the importance sampling ratio.
+            temperature (float): Temperature for the logits.
+        """
         super().__init__()
         self.beta = beta
         self.compiled = compiled
         self.use_ref_model = use_ref_model
-        self.
+        self.chunk_size = chunk_size
+        self.epsilon_low = epsilon_low
+        self.epsilon_high = epsilon_high
+        self.temperature = temperature

     def forward(
         self,
         _input,
         lin_weight,
+        selected_token_ids,
         attention_mask,
-
+        advantages,
         bias=None,
+        ref_per_token_logps=None,
+        old_per_token_logps=None,
         ref_input=None,
         ref_weight=None,
         ref_bias=None,
@@ -147,14 +217,20 @@ class LigerFusedLinearGRPOLoss(torch.nn.Module):
         return LigerFusedLinearGRPOFunction.apply(
             _input,
             lin_weight,
+            selected_token_ids,
             attention_mask,
-
+            advantages,
             bias,
+            ref_per_token_logps,
+            old_per_token_logps,
             ref_input,
             ref_weight,
             ref_bias,
             self.beta,
+            self.epsilon_low,
+            self.epsilon_high,
+            self.temperature,
             self.compiled,
             self.use_ref_model,
-            self.
+            self.chunk_size,
         )
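Note that `LigerFusedLinearGRPOLoss` no longer accepts raw `rewards`; the reward normalization that 0.5.4 did inside the loss (the removed `mean_grouped_rewards` / `std_grouped_rewards` / `eps = 1e-4` lines) is now the caller's job, and the module takes precomputed `advantages`. A hedged sketch of one way to produce group-normalized advantages, assuming completions for the same prompt are stored contiguously (the helper name is illustrative, not part of the library):

```python
import torch

def group_normalized_advantages(rewards: torch.Tensor, num_generations: int, eps: float = 1e-4) -> torch.Tensor:
    # rewards: (batch_size,) with `num_generations` consecutive completions per prompt
    grouped = rewards.view(-1, num_generations)
    mean = grouped.mean(dim=1, keepdim=True)
    std = grouped.std(dim=1, keepdim=True)
    return ((grouped - mean) / (std + eps)).view(-1)

advantages = group_normalized_advantages(torch.randn(8), num_generations=4)  # shape (8,)
```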
liger_kernel/chunked_loss/jsd_loss.py

@@ -19,31 +19,40 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
         student_log_probs = F.log_softmax(student_logits, dim=-1)
         teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

-
-
-
+        if beta == 0:
+            jsd_loss = F.kl_div(student_log_probs, teacher_log_probs, reduction="sum", log_target=True)
+        elif beta == 1:
+            jsd_loss = F.kl_div(teacher_log_probs, student_log_probs, reduction="sum", log_target=True)
+        else:
+            # Compute probabilities (only required for mean calculation)
+            mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
+            log_mean_probs = mean_probs.log()

-
-
+            student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
+            teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)

-
-
+            # JSD is the weighted average of the KL divergences
+            jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
         return jsd_loss

-    @
+    @classmethod
     def forward(
+        cls,
         ctx,
         student_input: torch.Tensor,
         student_weight: torch.Tensor,
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
         weight_hard_loss: float = 0.5,
         weight_soft_loss: float = 0.5,
         beta: float = 0.5,
         ignore_index: int = -100,
         temperature: float = 1.0,
         compiled: bool = True,
+        chunk_size: int = 1024,
     ):
         """
         Fused linear layer with JSD distillation loss.
@@ -59,18 +68,21 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):
             ignore_index (int): Index to ignore in loss computation
             temperature (float): Temperature for softening/sharpening distributions
             compiled (bool): Whether to use torch compile
+            chunk_size (int): Size of chunks for processing.
         Returns:
             torch.Tensor: Computed loss
         """
-        return
+        return super().forward(
+            cls=cls,
             ctx=ctx,
             student_input=student_input,
             student_weight=student_weight,
             teacher_input=teacher_input,
             teacher_weight=teacher_weight,
             target=true_labels,
-
-
+            student_bias=student_bias,
+            teacher_bias=teacher_bias,
+            chunk_size=chunk_size,
             weight_hard_loss=weight_hard_loss,
             weight_soft_loss=weight_soft_loss,
             beta=beta,
@@ -81,9 +93,19 @@ class LigerFusedLinearJSDFunction(LigerFusedLinearDistillationBase):

     @staticmethod
     def backward(ctx, grad_output):
-        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:
+        grads = LigerFusedLinearDistillationBase.backward(ctx, grad_output)[:6]

-        return (
+        return (
+            *grads,
+            None,  # teacher_bias
+            None,  # weight_hard_loss
+            None,  # weight_soft_loss
+            None,  # beta
+            None,  # ignore_index
+            None,  # temperature
+            None,  # compiled
+            None,  # chunk_size
+        )


 class LigerFusedLinearJSDLoss(torch.nn.Module):
@@ -99,6 +121,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         ignore_index: int = -100,
         temperature: float = 1.0,
         compiled: bool = True,
+        chunk_size: int = 1024,
     ):
         """
         Args:
@@ -108,6 +131,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             temperature (float): Temperature for softening distributions
             compiled (bool): Whether to use torch compile
             beta (float): Coefficient beta of generalized JSD in the interval [0, 1]. Default: `0.5`.
+            chunk_size (int): Size of chunks for processing.
         """
         super().__init__()
         assert temperature != 0, "Temperature cannot be 0."
@@ -117,6 +141,7 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         self.temperature = temperature
         self.compiled = compiled
         self.beta = beta
+        self.chunk_size = chunk_size

     def forward(
         self,
@@ -125,6 +150,8 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
         teacher_input: torch.Tensor,
         teacher_weight: torch.Tensor,
         true_labels: torch.LongTensor,
+        student_bias: torch.Tensor,
+        teacher_bias: torch.Tensor,
     ) -> torch.Tensor:
         """
         Compute the JSD distillation loss.
@@ -145,10 +172,13 @@ class LigerFusedLinearJSDLoss(torch.nn.Module):
             teacher_input,
             teacher_weight,
             true_labels,
+            student_bias,
+            teacher_bias,
             self.weight_hard_loss,
             self.weight_soft_loss,
             self.beta,
             self.ignore_index,
             self.temperature,
             self.compiled,
+            self.chunk_size,
         )
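The rewritten `distillation_loss_fn` interpolates between the two KL directions via `beta` and, for intermediate values, measures each distribution against the beta-weighted mixture. A minimal standalone restatement of the `0 < beta < 1` branch on toy logits (eager tensors, not the fused chunked path):

```python
import torch
import torch.nn.functional as F

student_logits = torch.randn(4, 10)
teacher_logits = torch.randn(4, 10)
beta = 0.5

student_log_probs = F.log_softmax(student_logits, dim=-1)
teacher_log_probs = F.log_softmax(teacher_logits, dim=-1)

# Log of the beta-weighted mixture distribution
mean_probs = (1 - beta) * student_log_probs.exp() + beta * teacher_log_probs.exp()
log_mean_probs = mean_probs.log()

# F.kl_div(input, target, log_target=True) computes KL(target || input), summed over elements
student_kl = F.kl_div(log_mean_probs, student_log_probs, reduction="sum", log_target=True)
teacher_kl = F.kl_div(log_mean_probs, teacher_log_probs, reduction="sum", log_target=True)

jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
```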