liger-kernel-nightly 0.5.9.dev20250519011716__py3-none-any.whl → 0.5.9.dev20250519025610__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,310 @@
1
+ import torch
2
+ import triton
3
+ import triton.language as tl
4
+
5
+
6
+ @triton.jit
7
+ def _selective_log_softmax_kernel(
8
+ LOGITS,
9
+ INPUT_IDS,
10
+ LOG_P,
11
+ MASK,
12
+ TEMPERATURE,
13
+ stride_input_ids_b,
14
+ L: tl.constexpr,
15
+ N: tl.constexpr,
16
+ BLOCK_N: tl.constexpr = 4096,
17
+ ):
18
+ off_b = tl.program_id(0).cast(tl.int64)
19
+ off_l = tl.program_id(1).cast(tl.int64)
20
+
21
+ LOGITS += off_b * (L + 1) * N + off_l * N
22
+ INPUT_IDS += off_b * stride_input_ids_b + off_l
23
+ LOG_P += off_b * L + off_l
24
+
25
+ if MASK is not None:
26
+ MASK += off_b * stride_input_ids_b + off_l
27
+ not_skip = tl.load(MASK)
28
+ if not_skip == 0:
29
+ return
30
+
31
+ m_i = float("-inf")
32
+ l_i = 0.0
33
+ for start in range(0, N, BLOCK_N):
34
+ cols = start + tl.arange(0, BLOCK_N)
35
+ logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE
36
+ new_m_i = tl.maximum(m_i, tl.max(logits))
37
+ alpha = tl.exp(m_i - new_m_i)
38
+ l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i))
39
+ m_i = new_m_i
40
+ lse = m_i + tl.log(l_i)
41
+
42
+ ids = tl.load(INPUT_IDS)
43
+ x = tl.load(LOGITS + ids).to(tl.float32) / TEMPERATURE
44
+ logp = x - lse
45
+ tl.store(LOG_P, logp)
46
+
47
+
48
+ # compute old_logp and ref_logp; this reduces peak memory by ~10 GB and does not require grad
49
+ @torch.no_grad
50
+ def fused_selective_log_softmax(logits: torch.Tensor, input_ids: torch.Tensor, temperature: float = 0.9, mask=None):
51
+ assert logits.is_contiguous()
52
+ B, L_ADD_1, N = logits.shape
53
+ L = L_ADD_1 - 1
54
+ input_ids = input_ids[:, -L:]
55
+ if mask is not None:
56
+ mask = mask[:, -L:]
57
+ log_p = torch.zeros(B, L, dtype=torch.float32, device=logits.device)
58
+ kwargs = {"BLOCK_N": 2048, "num_stages": 4, "num_warps": 1}
59
+ _selective_log_softmax_kernel[(B, L)](
60
+ logits, input_ids, log_p, mask, temperature, input_ids.stride(0), L, N, **kwargs
61
+ )
62
+ return log_p
63
+
64
+
65
+ # @triton.autotune([triton.Config({"BLOCK_N":BLOCK_N}, num_stages=ns, num_warps=nw)
66
+ # for BLOCK_N in [2048, 4096, 8192]
67
+ # for ns in [1, 2, 4]
68
+ # for nw in [1, 2, 4, 8, 16]],
69
+ # key=['N'])
70
+ @triton.jit
71
+ def _grpo_loss_fwd_kernel(
72
+ LOGITS,
73
+ OLD_LOGP,
74
+ REF_LOGP,
75
+ INPUT_IDS,
76
+ COMPLETION_MASK,
77
+ ADVANTAGES,
78
+ LOSS,
79
+ LSE,
80
+ KL,
81
+ IS_CLIPPED,
82
+ TEMPERATURE,
83
+ BETA: tl.constexpr,
84
+ EPS_LOW,
85
+ EPS_HIGH,
86
+ L: tl.constexpr,
87
+ N: tl.constexpr,
88
+ BLOCK_N: tl.constexpr = 4096,
89
+ ):
90
+ off_b = tl.program_id(0).cast(tl.int64)
91
+ off_l = tl.program_id(1).cast(tl.int64)
92
+
93
+ if COMPLETION_MASK is not None:
94
+ COMPLETION_MASK += off_b * L + off_l
95
+ not_skip = tl.load(COMPLETION_MASK)
96
+ if not_skip == 0:
97
+ return
98
+
99
+ LOGITS += off_b * (L + 1) * N + off_l * N
100
+ INPUT_IDS += off_b * L + off_l
101
+ ADVANTAGES += off_b
102
+ LOSS += off_b * L + off_l
103
+ LSE += off_b * L + off_l
104
+ IS_CLIPPED += off_b * L + off_l
105
+
106
+ m_i = float("-inf")
107
+ l_i = 0.0
108
+ for start in range(0, N, BLOCK_N):
109
+ cols = start + tl.arange(0, BLOCK_N)
110
+ logits = tl.load(LOGITS + cols, mask=cols < N, other=float("-inf")).to(tl.float32) / TEMPERATURE
111
+ new_m_i = tl.maximum(m_i, tl.max(logits))
112
+ alpha = tl.exp(m_i - new_m_i)
113
+ l_i = l_i * alpha + tl.sum(tl.exp(logits - new_m_i))
114
+ m_i = new_m_i
115
+ lse = m_i + tl.log(l_i)
116
+
117
+ idx = tl.load(INPUT_IDS)
118
+ x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE
119
+ logp = x - lse
120
+ if OLD_LOGP is None:
121
+ old_logp = logp
122
+ else:
123
+ OLD_LOGP += off_b * L + off_l
124
+ old_logp = tl.load(OLD_LOGP).to(tl.float32)
125
+ coef_1 = tl.exp(logp - old_logp)
126
+ coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
127
+ advantage = tl.load(ADVANTAGES).to(tl.float32)
128
+ per_token_loss1 = coef_1 * advantage
129
+ per_token_loss2 = coef_2 * advantage
130
+ per_token_loss = -tl.minimum(per_token_loss1, per_token_loss2)
131
+ is_clipped = per_token_loss1 < per_token_loss2
132
+
133
+ if BETA != 0.0:
134
+ REF_LOGP += off_b * L + off_l
135
+ KL += off_b * L + off_l
136
+ ref_logp = tl.load(REF_LOGP).to(tl.float32)
137
+ kl = tl.exp(ref_logp - logp) - (ref_logp - logp) - 1
138
+ per_token_loss += BETA * kl
139
+ tl.store(KL, kl)
140
+
141
+ tl.store(LOSS, per_token_loss)
142
+ tl.store(LSE, lse)
143
+ tl.store(IS_CLIPPED, is_clipped)
144
+
145
+
146
+ # @triton.autotune([triton.Config({"BLOCK_N":BLOCK_N}, num_stages=ns, num_warps=nw)
147
+ # for BLOCK_N in [2048, 4096, 8192]
148
+ # for ns in [1, 2, 4]
149
+ # for nw in [1, 2, 4, 8, 16]],
150
+ # key=['N'])
151
+ @triton.jit
152
+ def _grpo_loss_bwd_kernel(
153
+ DLOSS,
154
+ DLOGITS,
155
+ LOGITS,
156
+ OLD_LOGP,
157
+ REF_LOGP,
158
+ INPUT_IDS,
159
+ ADVANTAGES,
160
+ COMPLETION_MASK,
161
+ LSE,
162
+ TEMPERATURE,
163
+ BETA: tl.constexpr,
164
+ EPS_LOW,
165
+ EPS_HIGH,
166
+ loss_stride0,
167
+ loss_stride1,
168
+ L: tl.constexpr,
169
+ N: tl.constexpr,
170
+ BLOCK_N: tl.constexpr = 4096,
171
+ ):
172
+ off_b = tl.program_id(0).cast(tl.int64)
173
+ off_l = tl.program_id(1).cast(tl.int64)
174
+
175
+ DLOGITS += off_b * (L + 1) * N + off_l * N
176
+ if COMPLETION_MASK is not None:
177
+ COMPLETION_MASK += off_b * L + off_l
178
+ not_skip = tl.load(COMPLETION_MASK)
179
+ if not_skip == 0:
180
+ for start in range(0, N, BLOCK_N):
181
+ cols = tl.arange(0, BLOCK_N) + start
182
+ tl.store(DLOGITS + cols, 0.0, mask=cols < N)
183
+ return
184
+
185
+ LOGITS += off_b * (L + 1) * N + off_l * N
186
+ DLOSS += off_b * loss_stride0 + off_l * loss_stride1
187
+ INPUT_IDS += off_b * L + off_l
188
+ ADVANTAGES += off_b
189
+ LSE += off_b * L + off_l
190
+
191
+ dloss = tl.load(DLOSS).to(tl.float32)
192
+ lse = tl.load(LSE).to(tl.float32)
193
+
194
+ idx = tl.load(INPUT_IDS)
195
+ x = tl.load(LOGITS + idx).to(tl.float32) / TEMPERATURE
196
+ logp = x - lse
197
+ if OLD_LOGP is None:
198
+ old_logp = logp
199
+ else:
200
+ OLD_LOGP += off_b * L + off_l
201
+ old_logp = tl.load(OLD_LOGP).to(tl.float32)
202
+ coef_1 = tl.exp(logp - old_logp)
203
+ coef_2 = tl.clamp(coef_1, 1 - EPS_LOW, 1 + EPS_HIGH)
204
+ advantage = tl.load(ADVANTAGES).to(tl.float32)
205
+ per_token_loss1 = coef_1 * advantage
206
+ per_token_loss2 = coef_2 * advantage
207
+ mask = per_token_loss2 >= per_token_loss1
208
+
209
+ dlogp = -per_token_loss1 * mask
210
+ if BETA != 0.0:
211
+ REF_LOGP += off_b * L + off_l
212
+ ref_logp = tl.load(REF_LOGP).to(tl.float32)
213
+ dlogp += BETA * (1 - tl.exp(ref_logp - logp))
214
+
215
+ dlogp = dlogp * dloss / TEMPERATURE
216
+ tl.debug_barrier()
217
+ for start_n in tl.range(0, N, BLOCK_N):
218
+ cols = start_n + tl.arange(0, BLOCK_N)
219
+ logits = tl.load(LOGITS + cols, mask=cols < N, other=-float("inf")).to(tl.float32) / TEMPERATURE
220
+ probs = tl.exp(logits - lse)
221
+ dlogits = tl.where(cols == idx, 1 - probs, -probs) * dlogp
222
+ tl.store(DLOGITS + cols, dlogits, mask=cols < N)
223
+
224
+
225
+ class GrpoLossFunction(torch.autograd.Function):
226
+ @staticmethod
227
+ def forward(
228
+ ctx,
229
+ logits,
230
+ old_logp,
231
+ ref_logp,
232
+ completion_ids,
233
+ advantages,
234
+ completion_mask,
235
+ temperature,
236
+ beta,
237
+ eps_low,
238
+ eps_high,
239
+ inplace,
240
+ ):
241
+ assert logits.is_contiguous() and completion_ids.is_contiguous()
242
+ assert old_logp is None or old_logp.is_contiguous()
243
+ assert (ref_logp is not None and ref_logp.is_contiguous()) if beta != 0.0 else True
244
+
245
+ B, L_ADD_1, N = logits.shape
246
+ L = L_ADD_1 - 1
247
+
248
+ if completion_mask is not None:
249
+ assert completion_mask.is_contiguous()
250
+
251
+ loss = torch.zeros(B, L, device=logits.device, dtype=torch.float32)
252
+ lse = torch.zeros_like(loss)
253
+ is_clipped = torch.zeros_like(loss)
254
+ kl = torch.zeros_like(loss) if beta != 0.0 else None
255
+ kwargs = {"BLOCK_N": 2048, "num_stages": 2, "num_warps": 1}
256
+ _grpo_loss_fwd_kernel[(B, L)](
257
+ logits,
258
+ old_logp,
259
+ ref_logp,
260
+ completion_ids,
261
+ completion_mask,
262
+ advantages,
263
+ loss,
264
+ lse,
265
+ kl,
266
+ is_clipped,
267
+ temperature,
268
+ beta,
269
+ eps_low,
270
+ eps_high,
271
+ L,
272
+ N,
273
+ **kwargs,
274
+ )
275
+ ctx.save_for_backward(logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse)
276
+ ctx.infos = (temperature, beta, eps_low, eps_high, inplace)
277
+ # return loss
278
+ return loss, kl, is_clipped
279
+
280
+ @staticmethod
281
+ def backward(ctx, *args):
282
+ dloss = args[0]
283
+ # print(dloss.shape)
284
+ logits, old_logp, ref_logp, completion_ids, advantages, completion_mask, lse = ctx.saved_tensors
285
+ temperature, beta, eps_low, eps_high, inplace = ctx.infos
286
+ B, L_ADD_1, N = logits.shape
287
+ L = L_ADD_1 - 1
288
+ dlogits = logits.data if inplace else torch.empty_like(logits)
289
+ kwargs = {"BLOCK_N": 4096, "num_stages": 1, "num_warps": 16}
290
+ _grpo_loss_bwd_kernel[(B, L)](
291
+ dloss,
292
+ dlogits,
293
+ logits,
294
+ old_logp,
295
+ ref_logp,
296
+ completion_ids,
297
+ advantages,
298
+ completion_mask,
299
+ lse,
300
+ temperature,
301
+ beta,
302
+ eps_low,
303
+ eps_high,
304
+ *dloss.stride(),
305
+ L,
306
+ N,
307
+ **kwargs,
308
+ )
309
+ dlogits[:, -1, :] = 0
310
+ return dlogits, None, None, None, None, None, None, None, None, None, None
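
As a sanity check for the kernel above, here is a minimal eager-mode sketch of what `fused_selective_log_softmax` computes. The tensor sizes, vocabulary size, and tolerances are illustrative assumptions; only the temperature scaling and the `input_ids[:, -L:]` slicing semantics are taken from the code above.

import torch
import torch.nn.functional as F

from liger_kernel.ops.grpo_loss import fused_selective_log_softmax

# Illustrative sizes; the kernel only requires contiguous (B, L+1, N) logits.
B, L, N = 2, 8, 32000
logits = torch.randn(B, L + 1, N, device="cuda", dtype=torch.bfloat16)
input_ids = torch.randint(0, N, (B, L + 1), device="cuda")

# Eager reference: temperature-scaled log-softmax over the first L positions,
# gathered at the last L input ids (the wrapper slices input_ids[:, -L:] itself).
ref = F.log_softmax(logits[:, :-1].float() / 0.9, dim=-1)
ref = ref.gather(-1, input_ids[:, -L:].unsqueeze(-1)).squeeze(-1)

out = fused_selective_log_softmax(logits, input_ids, temperature=0.9)
torch.testing.assert_close(out, ref, atol=1e-3, rtol=1e-3)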
@@ -0,0 +1,55 @@
1
+ from typing import Any
2
+ from typing import Callable
3
+
4
+ from torch.distributed.fsdp import FullyShardedDataParallel
5
+
6
+
7
+ class _FSDPForwardRedirection:
8
+ """
9
+ Modified based on
10
+ https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648
11
+ Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and
12
+ post-forward can be properly executed around the method call.
13
+ This is needed in cases where we call a submodule of an FSDP module. For instance, when we want to call only
14
+ the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
15
+ GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
16
+ will not work because the first `nn.Embedding` layer is not independently wrapped as an FSDP module (because of
17
+ the transformer-based wrapping policy), and not calling it through the FSDP root module's forward will not all-gather
18
+ its parameters, thus resulting in a "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
19
+ the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
20
+ """
21
+
22
+ def __call__(
23
+ self,
24
+ wrapper_module: FullyShardedDataParallel,
25
+ method: Callable,
26
+ *args: Any,
27
+ **kwargs: Any,
28
+ ):
29
+ """Reroutes a method call through the `wrapper_module`'s `forward` method.
30
+ Args:
31
+ wrapper_module: The module that has `original_module` wrapped.
32
+ original_module: The module that was wrapped inside `wrapper_module`.
33
+ method: The method that should be called on the `original_module` after inputs get
34
+ redirected through the `wrapper_module`'s `forward` method.
35
+ *args: The positional arguments to `method`. They will get passed to a patched
36
+ `forward` method instead.
37
+ **kwargs: The keyword arguments to `method`. They will get passed to a patched
38
+ `forward` method instead.
39
+ """
40
+ assert isinstance(wrapper_module, FullyShardedDataParallel)
41
+ original_module = wrapper_module._fsdp_wrapped_module
42
+ original_forward = original_module.forward
43
+
44
+ def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
45
+ # Unpatch ourselves immediately before calling `method`,
46
+ # because it may itself want to call the real `forward`
47
+ original_module.forward = original_forward # type: ignore[method-assign]
48
+ # Call the actual method e.g. `.training_step(...)`
49
+ out = method(*_args, **_kwargs)
50
+ return out
51
+
52
+ # Patch the original_module's forward so we can redirect the arguments back to the real method
53
+ original_module.forward = wrapped_forward # type: ignore[method-assign]
54
+ wrapper_output = wrapper_module(*args, **kwargs)
55
+ return wrapper_output
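
A hedged usage sketch of the helper above, based on the scenario described in its docstring: calling only the `lm_head` of an FSDP-wrapped causal LM. The attribute names `lm_head` and `_fsdp_wrapped_module` follow the usual transformers/FSDP layout; the function name is illustrative, not part of this module.

import torch
from torch.distributed.fsdp import FullyShardedDataParallel
from liger_kernel.transformers.fsdp import _FSDPForwardRedirection

def lm_head_logits(fsdp_model: FullyShardedDataParallel, hidden_states: torch.Tensor) -> torch.Tensor:
    # Calling fsdp_model._fsdp_wrapped_module.lm_head(hidden_states) directly can raise
    # "RuntimeError: 'weight' must be 2-D" because the flattened FSDP parameters were never
    # all-gathered. Redirecting through the wrapper's forward runs the root pre-/post-forward
    # hooks, so the lm_head weight is materialized around the call.
    lm_head = fsdp_model._fsdp_wrapped_module.lm_head
    return _FSDPForwardRedirection()(fsdp_model, lm_head, hidden_states)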
@@ -0,0 +1,98 @@
1
+ from liger_kernel.ops.grpo_loss import GrpoLossFunction
2
+
3
+
4
+ def triton_grpo_loss(
5
+ logits,
6
+ old_logp,
7
+ ref_logp,
8
+ completion_ids,
9
+ advantages,
10
+ completion_mask=None,
11
+ temperature=0.9,
12
+ beta=0.04,
13
+ eps_low=0.2,
14
+ eps_high=0.4,
15
+ inplace=True,
16
+ ):
17
+ assert logits is not None and completion_ids is not None and advantages is not None, (
18
+ "must provide logits、completion_ids and advantages"
19
+ )
20
+
21
+ return GrpoLossFunction.apply(
22
+ logits,
23
+ old_logp,
24
+ ref_logp,
25
+ completion_ids,
26
+ advantages,
27
+ completion_mask,
28
+ temperature,
29
+ beta,
30
+ eps_low,
31
+ eps_high,
32
+ inplace,
33
+ )
34
+
35
+
36
+ # This is a demo of how to use grpo_loss in GRPOTrainer. The TRL version must be 0.16.
37
+ """
38
+ import torch
39
+ import trl
40
+ assert trl.__version__.startswith("0.16"), "please pip install trl==0.16"
41
+ from trl.extras.profiling import profiling_decorator
42
+
43
+ @profiling_decorator
44
+ def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep):
45
+ # We add 1 to `logits_to_keep` because the last logit of the sequence is later excluded
46
+ logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
47
+ return fused_selective_log_softmax(logits, input_ids, self.temperature, mask=attention_mask)
48
+
49
+ @profiling_decorator
50
+ def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
51
+ if return_outputs:
52
+ raise ValueError("The GRPOTrainer does not support returning outputs")
53
+ # Compute the per-token log probabilities for the model
54
+
55
+ prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
56
+ completion_ids, completion_mask = inputs["completion_ids"], inputs["completion_mask"]
57
+ input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
58
+ attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
59
+ logits_to_keep = completion_ids.size(1) # we only need to compute the logits for the completion tokens
60
+ logits = model(input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1).logits
61
+
62
+ ref_per_token_logps = inputs["ref_per_token_logps"]
63
+ advantages = inputs["advantages"]
64
+ old_per_token_logps = inputs["old_per_token_logps"]
65
+
66
+
67
+ per_token_loss, per_token_kl, is_clipped = triton_grpo_loss(logits,
68
+ old_per_token_logps,
69
+ ref_per_token_logps,
70
+ completion_ids,
71
+ advantages,
72
+ completion_mask,
73
+ self.temperature,
74
+ self.beta,
75
+ self.epsilon_low,
76
+ self.epsilon_high,)
77
+ loss = (per_token_loss * completion_mask).sum() / completion_mask.sum()
78
+
79
+ # Log the metrics
80
+ mode = "eval" if self.control.should_evaluate else "train"
81
+
82
+ if self.beta != 0.0:
83
+ mean_kl = (per_token_kl * completion_mask).sum() / completion_mask.sum()
84
+ self._metrics[mode]["kl"].append(self.accelerator.gather_for_metrics(mean_kl).mean().item())
85
+
86
+ clip_ratio = (is_clipped * completion_mask).sum() / completion_mask.sum()
87
+ self._metrics[mode]["clip_ratio"].append(self.accelerator.gather_for_metrics(clip_ratio).mean().item())
88
+ return loss
89
+
90
+ trl.GRPOTrainer._get_per_token_logps = _get_per_token_logps
91
+ trl.GRPOTrainer.compute_loss = compute_loss
92
+ trigger = None
93
+ """
94
+
95
+ # Add this line as the first line of grpo.py in open-r1:
96
+ """
97
+ from liger_kernel.transformers.grpo_loss import trigger
98
+ """
@@ -7,16 +7,22 @@ from typing import Union
7
7
  import torch
8
8
  import torch.nn.functional as F
9
9
 
10
+ from torch.distributed.fsdp import FullyShardedDataParallel
10
11
  from torch.nn import CrossEntropyLoss
11
12
  from transformers.modeling_outputs import CausalLMOutputWithPast
12
13
  from transformers.utils.deprecation import deprecate_kwarg
13
14
 
15
+ from liger_kernel.transformers.fsdp import _FSDPForwardRedirection
14
16
  from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss
15
17
  from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
18
+ from liger_kernel.utils import PEFT_AVAILABLE
16
19
 
17
20
  if TYPE_CHECKING:
18
21
  from transformers.cache_utils import Cache
19
22
 
23
+ if PEFT_AVAILABLE:
24
+ from peft.utils.other import ModulesToSaveWrapper
25
+
20
26
 
21
27
  def lce_forward_deprecated(
22
28
  self,
@@ -213,12 +219,12 @@ def lce_forward(
213
219
  loss = None
214
220
  # if in training mode, don't materialize logits
215
221
  if self.training and (labels is not None or shift_labels is not None):
216
- loss = LigerForCausalLMLoss(
222
+ loss = lce_maybe_trainable_lm_head(
223
+ self,
217
224
  hidden_states=kept_hidden_states,
218
- lm_head_weight=self.lm_head.weight,
225
+ hidden_size=self.config.hidden_size,
219
226
  labels=labels,
220
227
  shift_labels=shift_labels,
221
- hidden_size=self.config.hidden_size,
222
228
  **loss_kwargs,
223
229
  )
224
230
 
@@ -243,3 +249,50 @@ def lce_forward(
243
249
  hidden_states=outputs.hidden_states,
244
250
  attentions=outputs.attentions,
245
251
  )
252
+
253
+
254
+ def lce_maybe_trainable_lm_head(self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
255
+ lm_head = self.lm_head
256
+
257
+ # Unwrap the module if lm_head has been added as a trainable module in the PEFT LoRA configuration,
258
+ # i.e. listed in the modules_to_save field of LoraConfig, so the lm_head weights are read
259
+ # from the unwrapped module.
260
+ # See https://huggingface.co/docs/peft/package_reference/lora for reference.
261
+ if PEFT_AVAILABLE and isinstance(lm_head, ModulesToSaveWrapper):
262
+ lm_head = lm_head.modules_to_save.default
263
+
264
+ # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA,
265
+ # reading the lm_head module weights and calling the kernel must be done within the FSDP forward pass
266
+ # so the module's entire parameters are summoned and kept in memory during the kernel execution.
267
+ if isinstance(lm_head, FullyShardedDataParallel):
268
+ return _FSDPForwardRedirection()(
269
+ lm_head,
270
+ _liger_for_causal_lm_loss,
271
+ lm_head.module,
272
+ hidden_states,
273
+ hidden_size,
274
+ labels,
275
+ shift_labels,
276
+ **loss_kwargs,
277
+ )
278
+
279
+ # FSDP is not used so we can read the lm_head weights and call the kernel directly
280
+ return _liger_for_causal_lm_loss(
281
+ lm_head=lm_head,
282
+ hidden_states=hidden_states,
283
+ hidden_size=hidden_size,
284
+ labels=labels,
285
+ shift_labels=shift_labels,
286
+ **loss_kwargs,
287
+ )
288
+
289
+
290
+ def _liger_for_causal_lm_loss(lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs):
291
+ return LigerForCausalLMLoss(
292
+ hidden_states=hidden_states,
293
+ lm_head_weight=lm_head.weight,
294
+ labels=labels,
295
+ hidden_size=hidden_size,
296
+ shift_labels=shift_labels,
297
+ **loss_kwargs,
298
+ )
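
For context, a hedged sketch of the PEFT setup under which `lm_head` reaches `lce_maybe_trainable_lm_head` wrapped in a `ModulesToSaveWrapper`; the checkpoint name and LoRA hyperparameters are illustrative only.

from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")  # illustrative checkpoint
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    # Listing lm_head in modules_to_save makes PEFT replace it with a ModulesToSaveWrapper,
    # so its full weights are trained and saved; lce_maybe_trainable_lm_head unwraps it via
    # `.modules_to_save.default` before reading the weight for the fused loss kernel.
    modules_to_save=["lm_head"],
)
peft_model = get_peft_model(model, lora_config)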
@@ -1,5 +1,3 @@
1
- from typing import Any
2
- from typing import Callable
3
1
  from typing import Dict
4
2
  from typing import List
5
3
  from typing import Literal
@@ -13,57 +11,7 @@ from torch.distributed.fsdp import FullyShardedDataParallel
13
11
  from trl.trainer import ORPOTrainer
14
12
 
15
13
  from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
16
-
17
-
18
- class _FSDPForwardRedirection:
19
- """
20
- Modified based on
21
- https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648
22
- Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and
23
- post-forward can be properly executed around the method call.
24
- This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
25
- the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
26
- GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
27
- will not work because the first `nn.Embedding` layer is not independently wrapped as a FSDP module (because of
28
- the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather
29
- its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
30
- the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
31
- """
32
-
33
- def __call__(
34
- self,
35
- wrapper_module: FullyShardedDataParallel,
36
- method: Callable,
37
- *args: Any,
38
- **kwargs: Any,
39
- ):
40
- """Reroutes a method call through the `wrapper_module`'s `forward` method.
41
- Args:
42
- wrapper_module: The module that has `original_module` wrapped.
43
- original_module: The module that was wrapped inside `wrapper_module`.
44
- method_name: The name of the method that should be called on the `original_module` after inputs get
45
- redirected through the `wrapper_module`'s `forward` method.
46
- *args: The positional arguments to the method `method_name`. They will get passed to a patched
47
- `forward` method instead.
48
- **kwargs: The keyword arguments to the method `method_name`. They will get passed to a patched
49
- `forward` method instead.
50
- """
51
- assert isinstance(wrapper_module, FullyShardedDataParallel)
52
- original_module = wrapper_module._fsdp_wrapped_module
53
- original_forward = original_module.forward
54
-
55
- def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
56
- # Unpatch ourselves immediately before calling the method `method_name`
57
- # because itself may want to call the real `forward`
58
- original_module.forward = original_forward # type: ignore[method-assign]
59
- # Call the actual method e.g. `.training_step(...)`
60
- out = method(*_args, **_kwargs)
61
- return out
62
-
63
- # Patch the original_module's forward so we can redirect the arguments back to the real method
64
- original_module.forward = wrapped_forward # type: ignore[method-assign]
65
- wrapper_output = wrapper_module(*args, **kwargs)
66
- return wrapper_output
14
+ from liger_kernel.transformers.fsdp import _FSDPForwardRedirection
67
15
 
68
16
 
69
17
  class LigerORPOTrainer(ORPOTrainer):
liger_kernel/utils.py CHANGED
@@ -1,6 +1,17 @@
1
+ try:
2
+ import peft # noqa: F401
3
+
4
+ PEFT_AVAILABLE = True
5
+ except ImportError:
6
+ PEFT_AVAILABLE = False
7
+
1
8
  import torch
2
9
 
3
10
 
11
+ def is_peft_available():
12
+ return PEFT_AVAILABLE
13
+
14
+
4
15
  def infer_device():
5
16
  """
6
17
  Get current device name based on available devices
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: liger_kernel_nightly
3
- Version: 0.5.9.dev20250519011716
3
+ Version: 0.5.9.dev20250519025610
4
4
  Summary: Efficient Triton kernels for LLM Training
5
5
  License: BSD 2-CLAUSE LICENSE
6
6
  Copyright 2024 LinkedIn Corporation
@@ -1,6 +1,6 @@
1
1
  liger_kernel/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
2
  liger_kernel/env_report.py,sha256=uhdEC8OydxoZlb7B6YYcAaBF3crGFdIck-4cxaW4NJY,1728
3
- liger_kernel/utils.py,sha256=178Hn8uD-VauDT6FjqMyXLbKLod8ObIpaTtapHwfEK0,1861
3
+ liger_kernel/utils.py,sha256=BQleeZWHSZPNuPcYcoZTOp1kcNEZONZilPP5-AmjgWI,2024
4
4
  liger_kernel/chunked_loss/README.md,sha256=0FmkFC3hKBqyoDT5uTlIYmrvRkF-EOCR1y-EBU1LpWU,2248
5
5
  liger_kernel/chunked_loss/__init__.py,sha256=ATu-xX5Fc49Cr6yBOGBRNTo593ZrU5ZCsIuvoIbJWw4,603
6
6
  liger_kernel/chunked_loss/cpo_loss.py,sha256=Gzz1eU4kgcbdubFVRy55e8A1Cr-r45UgNicXwZIjmBU,5454
@@ -22,6 +22,7 @@ liger_kernel/ops/fused_linear_cross_entropy.py,sha256=5fbGhN85n3zf0uIdJ7PYHWIRzT
22
22
  liger_kernel/ops/fused_linear_jsd.py,sha256=CSoprxb-YcJy-YUKiTcYkxN8sb9h2kdk_iHuncvSV5c,9683
23
23
  liger_kernel/ops/geglu.py,sha256=axGvCIvlBzuluoAIrWTsp2iZM4BFKNInkPov8YVvH9E,4126
24
24
  liger_kernel/ops/group_norm.py,sha256=qD4D4lSjSgVtO52EBNLC2iTseALRgPgqXE50U2woggk,10837
25
+ liger_kernel/ops/grpo_loss.py,sha256=anRnv7k1-AV3pCC6_TqP0GMg78YYUfRAJrbpx6PVhl0,9448
25
26
  liger_kernel/ops/jsd.py,sha256=onHp5T3MbvJaVz5Vup7Ww6EQp_HTaZeayTjJk6FgQMY,7042
26
27
  liger_kernel/ops/kl_div.py,sha256=ZjGdDLKWksHT9dZ0xF_TDgAkj5cuMTwwT5tr9E-_24o,8734
27
28
  liger_kernel/ops/layer_norm.py,sha256=vWCyOm-F2GMAilB-ozJcFeUQQLCJoTE_uiXq-_0uYuI,8356
@@ -38,12 +39,14 @@ liger_kernel/transformers/__init__.py,sha256=0KX0rxyy0E_uNWVE0PSTzEVzKqc5KdFHtvd
38
39
  liger_kernel/transformers/auto_model.py,sha256=0qCTRZt280Bj_LcFdzo9hlaR-BWNazawXOGgoCZjgEg,1545
39
40
  liger_kernel/transformers/cross_entropy.py,sha256=z3KTWQnFxr_IZaVjtYt0ZNEWQdDdYThN35xWkHlDGH0,1683
40
41
  liger_kernel/transformers/dyt.py,sha256=i-4GPaMrl-jab9TVI5qN0-H9qycn_mCbV82ozU4nbmU,723
42
+ liger_kernel/transformers/fsdp.py,sha256=CUiyjTmjkjY7pLXQv8ly9rnzgXw6529csd9pvtJNMYc,3096
41
43
  liger_kernel/transformers/functional.py,sha256=2YBfvtdU1GRZuRpJhHgJXeGYa1RvmO6-qQvrKQrLJK4,5259
42
44
  liger_kernel/transformers/fused_linear_cross_entropy.py,sha256=O8Sg5BT81nTaY9fSGoOY9dOD9ekibwwiuXhdUHaxntQ,1742
43
45
  liger_kernel/transformers/fused_linear_jsd.py,sha256=bZ4otCvWBuOnA5XdQL-FzZVItJlDt-ht9e_pG7PG93E,3999
44
46
  liger_kernel/transformers/geglu.py,sha256=mrgqzIUVd6lN7fkDKLkw5YaESDxDtFgbot430WwPVOQ,1107
45
47
  liger_kernel/transformers/gema3_rms.py,sha256=LTmZOXe6WEnv6ZroW-kU1TE2B36-z5v8OLmKr3XEVFo,353
46
48
  liger_kernel/transformers/group_norm.py,sha256=6qMAWOprr4SzP0YhNVNGQIBpM5aUHplUD2VuGJrMBz0,2173
49
+ liger_kernel/transformers/grpo_loss.py,sha256=uAkUNKSnUGEOqa82L9w2e6AI1kcmG8K45-QxyaT8zhM,3897
47
50
  liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
48
51
  liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
49
52
  liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
@@ -61,7 +64,7 @@ liger_kernel/transformers/model/gemma.py,sha256=gi5fVeFPryoYy0_T3rzU2wm7v_xiJnLC
61
64
  liger_kernel/transformers/model/gemma2.py,sha256=61uH9JSZM6cPDoGHr2kNUVq2O4A3XIy2Qea36XhkkPQ,10761
62
65
  liger_kernel/transformers/model/gemma3.py,sha256=e-o7rcOJAJMZDJBB-blkLz5ildWjuDneSkakqwrADBc,15630
63
66
  liger_kernel/transformers/model/glm4.py,sha256=yYbQEcSrSTMleNTpwJosMhBf4VC9-79EyC__utmOSFg,5031
64
- liger_kernel/transformers/model/llama.py,sha256=pkkoKip94p3hNWA11cIVvTdNqCRB8FgR039pZWLqNeA,10181
67
+ liger_kernel/transformers/model/llama.py,sha256=ALVgzpD_YRYE7-6npb0KkjSBwrhCsgk_4lbaymOyRVw,12226
65
68
  liger_kernel/transformers/model/llava.py,sha256=RjLVnpHtOClc1jJkkPSqke7fcgWC3Jjh1rrGyvh5kb8,17008
66
69
  liger_kernel/transformers/model/loss_utils.py,sha256=WWAMdiONPaXpIvxyOim_0igLrYh0yyOok5Q9_L9xvZw,1787
67
70
  liger_kernel/transformers/model/mistral.py,sha256=0lt1Jq37zWjxLZF-Vuj9jUyIEnWlMuT7PB5xB42KXBs,5313
@@ -76,12 +79,12 @@ liger_kernel/transformers/model/qwen2_vl.py,sha256=q3AMpxFfwHjaMu9Q3jpwpMPRzrE-e
76
79
  liger_kernel/transformers/model/qwen3.py,sha256=u_0cCRwr1jcwMkSknbBVb9my1OepCGU718uxKhNUOVM,4657
77
80
  liger_kernel/transformers/model/qwen3_moe.py,sha256=lIWGunVtNP-d7VfRvEGY820howzecb10g6ZeWRgsfl8,5463
78
81
  liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7HHWHwku25A-GYL0WU,193
79
- liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
82
+ liger_kernel/transformers/trainer/orpo_trainer.py,sha256=tX0h63aOFe3rNqTmk6JpMf75UPo981yzEa6TghnjS0Q,5370
80
83
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
81
84
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
82
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
83
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/METADATA,sha256=JJ5XcqsRjwW1nB2hH580FLzHY9i3mC_aEZj9mDNX6Gg,23970
84
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
85
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
86
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
87
- liger_kernel_nightly-0.5.9.dev20250519011716.dist-info/RECORD,,
85
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
86
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/METADATA,sha256=y96ZmoWt54lwSvXqmZylo4V_wUHZ2dD2Xb29tV0jvLA,23970
87
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
88
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
89
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
90
+ liger_kernel_nightly-0.5.9.dev20250519025610.dist-info/RECORD,,