PyPI - liger-kernel-nightly - Versions diffs - 0.5.3.dev20250218225514__tar.gz → 0.5.3.dev20250219232423__tar.gz - Mend

liger-kernel-nightly 0.5.3.dev20250218225514tar.gz → 0.5.3.dev20250219232423tar.gz

Files changed (221) hide show

{liger_kernel_nightly-0.5.3.dev20250218225514 → liger_kernel_nightly-0.5.3.dev20250219232423}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.3.dev20250218225514
+Version: 0.5.3.dev20250219232423
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.3.dev20250218225514 → liger_kernel_nightly-0.5.3.dev20250219232423}/pyproject.toml RENAMED Viewed

@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "liger_kernel_nightly"
-version = "0.5.3.dev20250218225514"
+version = "0.5.3.dev20250219232423"
 description = "Efficient Triton kernels for LLM Training"
 urls = { "Homepage" = "https://github.com/linkedin/Liger-Kernel" }
 readme = { file = "README.md", content-type = "text/markdown" }

{liger_kernel_nightly-0.5.3.dev20250218225514 → liger_kernel_nightly-0.5.3.dev20250219232423}/src/liger_kernel/chunked_loss/__init__.py RENAMED Viewed

@@ -1,5 +1,6 @@
 from liger_kernel.chunked_loss.cpo_loss import LigerFusedLinearCPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.dpo_loss import LigerFusedLinearDPOLoss  # noqa: F401
+from liger_kernel.chunked_loss.grpo_loss import LigerFusedLinearGRPOLoss  # noqa: F401
 from liger_kernel.chunked_loss.jsd_loss import LigerFusedLinearJSDLoss  # noqa: F401
 from liger_kernel.chunked_loss.kto_loss import LigerFusedLinearKTOLoss  # noqa: F401
 from liger_kernel.chunked_loss.orpo_loss import LigerFusedLinearORPOLoss  # noqa: F401

liger_kernel_nightly-0.5.3.dev20250219232423/src/liger_kernel/chunked_loss/fused_linear_rlhf.py ADDED Viewed

@@ -0,0 +1,213 @@
+from functools import partial
+import torch
+import torch.nn.functional as F
+class LigerFusedLinearRLHFBase(torch.autograd.Function):
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        attention_mask,
+        rewards,
+        bias=None,
+        loss_fn=None,
+        num_generations=4,
+        beta=0.1,
+        compiled=True,
+        use_ref_model=False,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+    ):
+        """Chunked forward pass for RLHF loss computation."""
+        # Save for backward
+        ctx.beta = beta
+        ctx.rewards = rewards
+        # Initialize accumulators
+        loss_acc = torch.zeros((), device=_input.device)
+        grad_weight = torch.zeros_like(weight)  # [V, H]
+        grad_inputs = []
+        grad_bias = torch.zeros_like(bias) if bias is not None else None  # [V]
+        aggregated_metrics = []
+        # Create a partial function with fixed arguments
+        compute_loss = partial(
+            LigerFusedLinearRLHFBase._compute_chunk_loss,
+            beta=beta,
+            use_ref_model=use_ref_model,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            rlhf_loss_fn=loss_fn,
+        )
+        def fused_fwd_bwd(input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk):
+            """Fused forward and backward for a chunk."""
+            if bias is not None:
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1, 5), has_aux=True)(
+                    input_chunk,  # arg 0
+                    weight,  # arg 1
+                    attention_mask_chunk,  # arg 2
+                    rewards_chunk,  # arg 3
+                    ref_input_chunk,  # arg 4
+                    bias,  # arg 5
+                )
+            else:
+                return torch.func.grad_and_value(compute_loss, argnums=(0, 1), has_aux=True)(
+                    input_chunk,  # arg 0
+                    weight,  # arg 1
+                    attention_mask_chunk,  # arg 2
+                    rewards_chunk,  # arg 3
+                    ref_input_chunk,  # arg 4
+                )
+        def accumulate_chunk(input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk=None):
+            if bias is not None:
+                (chunk_grad_input, chunk_grad_weight, chunk_grad_bias), (chunk_loss, chunk_metrics) = fused_fwd_bwd(
+                    input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk
+                )
+                grad_bias.add_(chunk_grad_bias)
+            else:
+                (chunk_grad_input, chunk_grad_weight), (chunk_loss, chunk_metrics) = fused_fwd_bwd(
+                    input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk
+                )
+            # Accumulate gradients and loss
+            grad_weight.add_(chunk_grad_weight)
+            grad_inputs.append(chunk_grad_input)
+            loss_acc.add_(chunk_loss)
+            # Initialize storage for metrics on first chunk
+            if len(aggregated_metrics) == 0:
+                for metric in chunk_metrics:
+                    if metric.ndim == 0:
+                        aggregated_metrics.append(torch.zeros((), device=metric.device))
+                    else:
+                        aggregated_metrics.append([])
+            # Accumulate metrics
+            for i, metric in enumerate(chunk_metrics):
+                if metric.ndim == 0:
+                    aggregated_metrics[i].add_(metric)
+                else:
+                    aggregated_metrics[i].append(metric)
+        if compiled:
+            accumulate_chunk = torch.compile(accumulate_chunk)
+        # Process input in chunks
+        chunks = max(1, _input.shape[0] // num_generations)
+        _input_chunks = torch.chunk(_input, chunks=chunks, dim=0)
+        _attention_mask_chunks = torch.chunk(attention_mask, chunks=chunks, dim=0)
+        _rewards_chunks = torch.chunk(rewards, chunks=chunks, dim=0)
+        _ref_input_chunks = torch.chunk(ref_input, chunks=chunks, dim=0) if use_ref_model else [None] * chunks
+        for input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk in zip(
+            _input_chunks, _attention_mask_chunks, _rewards_chunks, _ref_input_chunks
+        ):
+            # Mark dynamic dimensions
+            torch._dynamo.mark_dynamic(input_chunk, 1)
+            torch._dynamo.mark_dynamic(attention_mask_chunk, 1)
+            if ref_input_chunk is not None:
+                torch._dynamo.mark_dynamic(ref_input_chunk, 1)
+            accumulate_chunk(input_chunk, attention_mask_chunk, rewards_chunk, ref_input_chunk)
+        # Scale accumulated loss by number of chunks since we're averaging
+        loss_acc = loss_acc / chunks
+        # Combine gradients
+        grad_input = torch.cat(grad_inputs, dim=0)
+        # Save for backward
+        ctx.save_for_backward(grad_input, grad_weight, grad_bias)
+        # Finalize metrics
+        final_metrics = []
+        for metric in aggregated_metrics:
+            if isinstance(metric, list):
+                final_metrics.append(torch.cat(metric, dim=0))
+            else:
+                final_metrics.append(metric / chunks)
+        return loss_acc, tuple(final_metrics)
+    @staticmethod
+    def _compute_chunk_loss(
+        input_chunk,
+        weight,
+        attention_mask_chunk,
+        rewards_chunk,
+        ref_input_chunk=None,
+        bias=None,
+        beta=0.1,
+        use_ref_model=False,
+        ref_weight=None,
+        ref_bias=None,
+        rlhf_loss_fn=None,
+    ):
+        """Compute loss for a single chunk."""
+        # Get policy log probabilities using chunk_forward
+        log_probs, _, logits_mean = LigerFusedLinearRLHFBase.chunk_forward(input_chunk, weight, bias=bias)
+        # Get reference log probabilities if needed
+        ref_log_probs = None
+        if use_ref_model and ref_input_chunk is not None:
+            with torch.no_grad():
+                ref_log_probs, _, _ = LigerFusedLinearRLHFBase.chunk_forward(ref_input_chunk, ref_weight, bias=ref_bias)
+        # Compute chunk loss and metrics using the provided loss function
+        chunk_loss, chunk_metrics = rlhf_loss_fn(
+            log_probs=log_probs,
+            attention_mask=attention_mask_chunk,
+            rewards=rewards_chunk,
+            ref_log_probs=ref_log_probs,
+            beta=beta,
+        )
+        return chunk_loss, (logits_mean, *chunk_metrics)
+    @staticmethod
+    def chunk_forward(input_chunk, weight, bias=None):
+        """Forward pass computation for a single chunk without explicit reshaping."""
+        # Directly compute logits via batched matrix multiplication: [B, T, H] @ [H, V] -> [B, T, V]
+        logits = torch.matmul(input_chunk, weight.t())
+        if bias is not None:
+            logits = logits + bias  # Broadcasts bias to [B, T, V]
+        # Compute log probabilities using softmax over the last dimension
+        log_probs = F.log_softmax(logits.float(), dim=-1)
+        # Monitoring: compute mean of logits
+        batch_size, seq_len, _ = input_chunk.shape
+        logits_mean = logits.sum() / (batch_size * seq_len * weight.shape[0])
+        return log_probs, logits, logits_mean
+    @staticmethod
+    def backward(ctx, grad_output, *grad_metrics):
+        """Backward pass for RLHF loss."""
+        grad_input, grad_weight, grad_bias = ctx.saved_tensors
+        if grad_output != 1.0:
+            grad_input = grad_input * grad_output
+            grad_weight = grad_weight * grad_output
+            if grad_bias is not None:
+                grad_bias = grad_bias * grad_output
+        return (
+            grad_input,
+            grad_weight,
+            None,  # grad_attention_mask
+            None,  # grad_rewards
+            grad_bias,
+            None,  # grad_loss_fn
+            None,  # grad_chunk_size
+            None,  # grad_beta
+            None,  # grad_compiled
+            None,  # grad_use_ref_model
+            None,  # grad_ref_input
+            None,  # grad_ref_weight
+            None,  # grad_ref_bias
+        )

liger_kernel_nightly-0.5.3.dev20250219232423/src/liger_kernel/chunked_loss/grpo_loss.py ADDED Viewed

@@ -0,0 +1,160 @@
+import torch
+from liger_kernel.chunked_loss.fused_linear_rlhf import LigerFusedLinearRLHFBase
+class LigerFusedLinearGRPOFunction(LigerFusedLinearRLHFBase):
+    @staticmethod
+    def rlhf_loss_fn(
+        log_probs,
+        attention_mask,
+        rewards,
+        ref_log_probs=None,
+        beta=0.1,
+        **kwargs,
+    ):
+        """GRPO Loss Function matching GRPOTrainer implementation."""
+        # Get chosen token probabilities
+        chosen_tokens = log_probs.argmax(dim=-1)  # (batch_size, seq_len)
+        chosen_token_logprobs = log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(
+            -1
+        )  # (batch_size, seq_len)
+        # Get reference model probabilities
+        if ref_log_probs is not None:
+            with torch.no_grad():
+                ref_token_logprobs = ref_log_probs.gather(dim=-1, index=chosen_tokens.unsqueeze(-1)).squeeze(-1)
+        else:
+            ref_token_logprobs = chosen_token_logprobs.detach()
+        # Compute advantages per batch entry in a grouped fashion
+        mean_grouped_rewards = rewards.mean()  # [batch_size,]
+        std_grouped_rewards = rewards.std()  # [batch_size,]
+        # Calculate advantages using the same epsilon as in GRPOTrainer
+        eps = 1e-4
+        advantages = (rewards - mean_grouped_rewards) / (std_grouped_rewards + eps)
+        # Compute policy gradient loss with importance sampling ratio
+        ratio = torch.exp(chosen_token_logprobs - chosen_token_logprobs.detach())
+        policy_loss = -ratio * advantages.unsqueeze(1)
+        # Compute KL penalty
+        kl_div = (
+            torch.exp(ref_token_logprobs - chosen_token_logprobs) - (ref_token_logprobs - chosen_token_logprobs) - 1.0
+        )
+        # Combine losses
+        per_token_loss = policy_loss + beta * kl_div
+        # Apply masking and normalize
+        masked_loss = per_token_loss * attention_mask
+        seq_lengths = attention_mask.sum()
+        seq_lengths = torch.clamp(seq_lengths, min=1.0)
+        loss = masked_loss.sum() / seq_lengths
+        # Calculate metrics
+        metrics = (
+            chosen_token_logprobs.mean(),  # mean log prob
+            chosen_token_logprobs.std(),  # std log prob
+            log_probs.mean(),  # mean all log probs
+            ((kl_div * attention_mask).sum(dim=1) / attention_mask.sum(dim=1)).mean(),  # mean KL div
+        )
+        return loss, metrics
+    @staticmethod
+    def forward(
+        ctx,
+        _input,
+        weight,
+        attention_mask,
+        rewards,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+        beta=0.1,
+        compiled=True,
+        use_ref_model=True,
+        num_generations=1,
+    ):
+        return LigerFusedLinearRLHFBase.forward(
+            ctx=ctx,
+            _input=_input,
+            weight=weight,
+            attention_mask=attention_mask,
+            loss_fn=LigerFusedLinearGRPOFunction.rlhf_loss_fn,
+            rewards=rewards,
+            bias=bias,
+            ref_input=ref_input,
+            ref_weight=ref_weight,
+            ref_bias=ref_bias,
+            beta=beta,
+            compiled=compiled,
+            use_ref_model=use_ref_model,
+            num_generations=num_generations,
+        )
+    @staticmethod
+    def backward(ctx, grad_output, *grad_metrics):
+        """Backward pass for GRPO loss.
+        Args:
+            grad_output: Gradient of the loss (scalar)
+            grad_metrics: Gradients of the metrics (not used in backward computation)
+        """
+        grads = LigerFusedLinearRLHFBase.backward(ctx, grad_output)
+        return (
+            *grads[:5],  # grad_input, grad_weight, grad_attention_mask, grad_rewards, grad_bias
+            None,  # grad_ref_input
+            None,  # grad_ref_weight
+            None,  # grad_ref_bias
+            None,  # grad_beta
+            None,  # grad_compiled
+            None,  # grad_use_ref_model
+            None,  # grad_num_generations
+        )
+class LigerFusedLinearGRPOLoss(torch.nn.Module):
+    """Fused linear layer with GRPO loss."""
+    def __init__(
+        self,
+        beta: float = 0.1,
+        compiled: bool = True,
+        use_ref_model: bool = True,
+        num_generations: int = 1,
+    ):
+        super().__init__()
+        self.beta = beta
+        self.compiled = compiled
+        self.use_ref_model = use_ref_model
+        self.num_generations = num_generations
+    def forward(
+        self,
+        _input,
+        lin_weight,
+        attention_mask,
+        rewards,
+        bias=None,
+        ref_input=None,
+        ref_weight=None,
+        ref_bias=None,
+    ):
+        return LigerFusedLinearGRPOFunction.apply(
+            _input,
+            lin_weight,
+            attention_mask,
+            rewards,
+            bias,
+            ref_input,
+            ref_weight,
+            ref_bias,
+            self.beta,
+            self.compiled,
+            self.use_ref_model,
+            self.num_generations,
+        )

{liger_kernel_nightly-0.5.3.dev20250218225514 → liger_kernel_nightly-0.5.3.dev20250219232423}/src/liger_kernel_nightly.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: liger_kernel_nightly
-Version: 0.5.3.dev20250218225514
+Version: 0.5.3.dev20250219232423
 Summary: Efficient Triton kernels for LLM Training
 License: BSD 2-CLAUSE LICENSE
         Copyright 2024 LinkedIn Corporation

{liger_kernel_nightly-0.5.3.dev20250218225514 → liger_kernel_nightly-0.5.3.dev20250219232423}/src/liger_kernel_nightly.egg-info/SOURCES.txt RENAMED Viewed

@@ -111,7 +111,9 @@ src/liger_kernel/chunked_loss/dpo_loss.py
 src/liger_kernel/chunked_loss/functional.py
 src/liger_kernel/chunked_loss/fused_linear_distillation.py
 src/liger_kernel/chunked_loss/fused_linear_preference.py
+src/liger_kernel/chunked_loss/fused_linear_rlhf.py
 src/liger_kernel/chunked_loss/fused_linear_unpaired_preference.py
+src/liger_kernel/chunked_loss/grpo_loss.py
 src/liger_kernel/chunked_loss/jsd_loss.py
 src/liger_kernel/chunked_loss/kto_loss.py
 src/liger_kernel/chunked_loss/orpo_loss.py
@@ -175,6 +177,7 @@ test/utils.py
 test/chunked_loss/__init__.py
 test/chunked_loss/test_cpo_loss.py
 test/chunked_loss/test_dpo_loss.py
+test/chunked_loss/test_grpo_loss.py
 test/chunked_loss/test_jsd_loss.py
 test/chunked_loss/test_kto_loss.py
 test/chunked_loss/test_orpo_loss.py

liger-kernel-nightly 0.5.3.dev20250218225514__tar.gz → 0.5.3.dev20250219232423__tar.gz

liger-kernel-nightly 0.5.3.dev20250218225514tar.gz → 0.5.3.dev20250219232423tar.gz