liger-kernel 0.4.1__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. liger_kernel/__init__.py +0 -0
  2. liger_kernel/chunked_loss/__init__.py +4 -0
  3. liger_kernel/chunked_loss/cpo_loss.py +107 -0
  4. liger_kernel/chunked_loss/dpo_loss.py +135 -0
  5. liger_kernel/chunked_loss/functional.py +9 -0
  6. liger_kernel/chunked_loss/fused_linear_distillation.py +252 -0
  7. liger_kernel/chunked_loss/fused_linear_preference.py +386 -0
  8. liger_kernel/chunked_loss/orpo_loss.py +113 -0
  9. liger_kernel/chunked_loss/simpo_loss.py +115 -0
  10. liger_kernel/env_report.py +22 -0
  11. liger_kernel/ops/cross_entropy.py +17 -10
  12. liger_kernel/ops/fused_linear_cross_entropy.py +1 -11
  13. liger_kernel/ops/fused_linear_jsd.py +1 -1
  14. liger_kernel/ops/jsd.py +19 -10
  15. liger_kernel/ops/layer_norm.py +6 -1
  16. liger_kernel/ops/qwen2vl_mrope.py +238 -0
  17. liger_kernel/ops/rms_norm.py +6 -1
  18. liger_kernel/ops/utils.py +5 -2
  19. liger_kernel/transformers/__init__.py +1 -0
  20. liger_kernel/transformers/functional.py +128 -11
  21. liger_kernel/transformers/fused_linear_jsd.py +1 -4
  22. liger_kernel/transformers/jsd.py +1 -4
  23. liger_kernel/transformers/model/qwen2_vl.py +43 -17
  24. liger_kernel/transformers/monkey_patch.py +11 -6
  25. liger_kernel/transformers/orpo_trainer.py +171 -0
  26. liger_kernel/transformers/qwen2vl_mrope.py +20 -0
  27. liger_kernel/utils.py +13 -0
  28. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/METADATA +80 -123
  29. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/RECORD +33 -20
  30. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/WHEEL +1 -1
  31. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/LICENSE +0 -0
  32. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/NOTICE +0 -0
  33. {liger_kernel-0.4.1.dist-info → liger_kernel-0.5.0.dist-info}/top_level.txt +0 -0
liger_kernel/transformers/model/qwen2_vl.py CHANGED
@@ -1,7 +1,9 @@
  from typing import List, Optional, Tuple, Union

  import torch
+ from packaging import version
  from torch.nn import CrossEntropyLoss
+ from transformers import __version__ as transformers_version
  from transformers.models.qwen2_vl.modeling_qwen2_vl import (
      _CONFIG_FOR_DOC,
      QWEN2_VL_INPUTS_DOCSTRING,
@@ -80,8 +82,6 @@ def lce_forward(
      >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
      "The image shows a street scene with a red stop sign in the foreground. In the background, there is a large red gate with Chinese characters ..."
      ```"""
-     # FIXME: The code is outdated and not compatible with transformer >= 4.46.1
-
      output_attentions = (
          output_attentions
          if output_attentions is not None
@@ -100,27 +100,53 @@ def lce_forward(
          inputs_embeds = self.model.embed_tokens(input_ids)
          if pixel_values is not None:
              pixel_values = pixel_values.type(self.visual.get_dtype())
-             image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw).to(
-                 inputs_embeds.device
+             image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
+             n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
+             n_image_features = image_embeds.shape[0]
+             if n_image_tokens != n_image_features:
+                 raise ValueError(
+                     f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+                 )
+             image_mask = (
+                 (input_ids == self.config.image_token_id)
+                 .unsqueeze(-1)
+                 .expand_as(inputs_embeds)
+                 .to(inputs_embeds.device)
              )
-             image_mask = input_ids == self.config.image_token_id
-             if self.training:
-                 inputs_embeds = inputs_embeds.clone()
-             inputs_embeds[image_mask] = image_embeds
+             image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+             inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
+
          if pixel_values_videos is not None:
              pixel_values_videos = pixel_values_videos.type(self.visual.get_dtype())
-             video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw).to(
-                 inputs_embeds.device
+             video_embeds = self.visual(pixel_values_videos, grid_thw=video_grid_thw)
+             n_video_tokens = (input_ids == self.config.video_token_id).sum().item()
+             n_video_features = video_embeds.shape[0]
+             if n_video_tokens != n_video_features:
+                 raise ValueError(
+                     f"Video features and video tokens do not match: tokens: {n_video_tokens}, features {n_video_features}"
+                 )
+             video_mask = (
+                 (input_ids == self.config.video_token_id)
+                 .unsqueeze(-1)
+                 .expand_as(inputs_embeds)
+                 .to(inputs_embeds.device)
              )
-             video_mask = input_ids == self.config.video_token_id
-             inputs_embeds[video_mask] = video_embeds
+             video_embeds = video_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
+             inputs_embeds = inputs_embeds.masked_scatter(video_mask, video_embeds)
+
          if attention_mask is not None:
              attention_mask = attention_mask.to(inputs_embeds.device)
-     # The code is copied from https://github.com/huggingface/transformers/pull/33487
-     if position_ids is None and input_ids is not None:
-         position_ids, _ = self.get_rope_index(
-             input_ids, image_grid_thw, video_grid_thw, attention_mask
-         )
+
+     if version.parse(transformers_version) > version.parse("4.46.2"):
+         # NOTE: this bug fix for qwen2-vl is not applied until transformers 4.47.0
+         # https://github.com/huggingface/transformers/issues/33401
+         # While correct, this breaks equivalence with past versions of Qwen2-VL from
+         # transformers and leads to failed tests or users noticing differences in results.
+         # TODO: remove above conditional when liger drops support for transformers<4.47.0
+         if position_ids is None and input_ids is not None:
+             position_ids, _ = self.get_rope_index(
+                 input_ids, image_grid_thw, video_grid_thw, attention_mask
+             )

      outputs = self.model(
          input_ids=None,
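The embedding-placement change above replaces in-place boolean-index assignment with an out-of-place `masked_scatter`. A minimal standalone PyTorch sketch of the same pattern (toy shapes and a made-up `image_token_id`, not taken from the package):

```python
import torch

# Scatter per-image embedding rows into the token sequence wherever the
# image placeholder token appears. All sizes below are illustrative.
image_token_id = 7
hidden_size = 4

input_ids = torch.tensor([[1, 7, 7, 2]])          # (bsz=1, seq_len=4), two image tokens
inputs_embeds = torch.zeros(1, 4, hidden_size)    # (bsz, seq_len, hidden)
image_embeds = torch.arange(2 * hidden_size, dtype=torch.float32).reshape(2, hidden_size)

# Old approach: in-place boolean-index assignment (needed a .clone() during training).
old = inputs_embeds.clone()
old[input_ids == image_token_id] = image_embeds

# New approach: expand the mask to the embedding dim, then out-of-place masked_scatter.
image_mask = (input_ids == image_token_id).unsqueeze(-1).expand_as(inputs_embeds)
new = inputs_embeds.masked_scatter(image_mask, image_embeds)

assert torch.equal(old, new)
```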
liger_kernel/transformers/monkey_patch.py CHANGED
@@ -36,6 +36,7 @@ from liger_kernel.transformers.model.qwen2 import lce_forward as qwen2_lce_forwa
  from liger_kernel.transformers.model.qwen2 import (
      lce_forward_deprecated as qwen2_lce_forward_deprecated,
  )
+ from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb
  from liger_kernel.transformers.rms_norm import LigerRMSNorm
  from liger_kernel.transformers.rope import liger_rotary_pos_emb
  from liger_kernel.transformers.swiglu import (
@@ -56,12 +57,15 @@ def _bind_method_to_module(module, method_name: str, new_method: Callable):
      module.__dict__[method_name] = new_method.__get__(module, module.__class__)


- def _patch_rms_norm_module(module, offset=0.0, eps=1e-6, casting_mode="llama"):
+ def _patch_rms_norm_module(
+     module, offset=0.0, eps=1e-6, casting_mode="llama", in_place=True
+ ):
      module.offset = offset
      module.casting_mode = casting_mode
      module.variance_epsilon = (
          getattr(module, "variance_epsilon", None) or getattr(module, "eps", None) or eps
      )
+     module.in_place = in_place
      _bind_method_to_module(module, "forward", LigerRMSNorm.forward)
      _bind_method_to_module(module, "extra_repr", LigerRMSNorm.extra_repr)

@@ -510,7 +514,7 @@ def apply_liger_kernel_to_gemma2(
          LigerRMSNorm, offset=1.0, casting_mode="gemma", init_fn="zeros", in_place=False
      )
      _patch_rms_norm_module_for_gemma2 = partial(
-         _patch_rms_norm_module, offset=1.0, casting_mode="gemma"
+         _patch_rms_norm_module, offset=1.0, casting_mode="gemma", in_place=False
      )

      if rope:
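The `_patch_rms_norm_module` helper patched above relies on `_bind_method_to_module`, which attaches a plain function to a single module instance through the descriptor protocol, so only that instance's `forward` is swapped. A small standalone sketch of the binding trick (`nn.LayerNorm` and the toy forward are purely illustrative):

```python
import torch
import torch.nn as nn


def patched_forward(self, x):
    # Toy replacement; the real patch binds LigerRMSNorm.forward instead.
    return x * self.weight


norm = nn.LayerNorm(4)
# function.__get__(instance, cls) produces a method bound to this one instance;
# storing it in the instance __dict__ shadows the class-level forward.
norm.__dict__["forward"] = patched_forward.__get__(norm, norm.__class__)

x = torch.ones(2, 4)
print(norm(x).shape)              # patched instance uses the new forward
print(nn.LayerNorm(4)(x).shape)   # other instances remain untouched
```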
@@ -607,9 +611,7 @@ def apply_liger_kernel_to_qwen2(
          logger.warning(TRANSFORMER_DEPRECATION_WARNING)
          modeling_qwen2.CrossEntropyLoss = LigerCrossEntropyLoss

-     # import pdb; pdb.set_trace()
      if fused_linear_cross_entropy:
-
          if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
              modeling_qwen2.Qwen2ForCausalLM.forward = qwen2_lce_forward
          else:  # if version < 4.46.1
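The `transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION)` branch above is a version-gated dispatch on the installed transformers release. A hedged sketch of the pattern; the `"4.46.1"` threshold is inferred from the `# if version < 4.46.1` comment and may not match the exact constant in the package:

```python
from packaging import version

import transformers

# Assumed threshold for illustration only.
SUPPORTED_TRANSFORMER_VERSION = "4.46.1"

if version.parse(transformers.__version__) >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
    print("patch with the current lce_forward")
else:
    print("patch with lce_forward_deprecated")
```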
@@ -641,6 +643,7 @@ def apply_liger_kernel_to_qwen2(


  def apply_liger_kernel_to_qwen2_vl(
+     rope: bool = True,
      cross_entropy: bool = False,
      fused_linear_cross_entropy: bool = True,
      rms_norm: bool = True,
@@ -675,8 +678,10 @@ def apply_liger_kernel_to_qwen2_vl(
          lce_forward as qwen2_vl_lce_forward,
      )

-     # TODO: Support Qwen2-VL's multimodal RoPE implementation
-
+     if rope:
+         modeling_qwen2_vl.apply_multimodal_rotary_pos_emb = (
+             liger_multimodal_rotary_pos_emb
+         )
      if rms_norm:
          # https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen2_vl/modeling_qwen2_vl.py#L439
          modeling_qwen2_vl.Qwen2RMSNorm = LigerRMSNorm
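A minimal usage sketch (not part of the diff) for the new `rope` flag: patch Qwen2-VL before instantiating the model so the Liger multimodal RoPE and RMSNorm replacements take effect. The model id and dtype are illustrative assumptions; only flags visible in the diff above are passed:

```python
import torch
from transformers import Qwen2VLForConditionalGeneration

from liger_kernel.transformers import apply_liger_kernel_to_qwen2_vl

# Apply the monkey patch first, then load the model as usual.
apply_liger_kernel_to_qwen2_vl(
    rope=True,                        # new in 0.5.0: Liger multimodal RoPE
    rms_norm=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,
)

model = Qwen2VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch.bfloat16
)
```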
liger_kernel/transformers/orpo_trainer.py ADDED
@@ -0,0 +1,171 @@
+ from typing import Any, Callable, Dict, List, Literal, Tuple, Union
+
+ import torch
+ import torch.nn as nn
+ from torch.distributed.fsdp import FullyShardedDataParallel
+ from trl.trainer import ORPOTrainer
+
+ from liger_kernel.chunked_loss import LigerFusedLinearORPOLoss
+
+
+ class _FSDPForwardRedirection:
+     """
+     Modified based on
+     https://github.com/Lightning-AI/pytorch-lightning/blob/d3f9c83d6efa4f1def36aa6c199600946cdb9117/src/lightning/pytorch/strategies/strategy.py#L601-L648
+     Redirect a method call through FullyShardedDataParallel.forward so that the FSDP module's root pre-forward and
+     post-forward can be properly executed around the method call.
+     This is needed in cases where we call a submodule of a FSDP module. For instance, when we want to call only
+     the `LlamaModel` part out of a FSDP-wrapped `LlamaForCausalLM` to get the hidden states without involving
+     GPU-memory-heavy `lm_head` and cross entropy computation, doing this directly (i.e. `model.model.forward()`)
+     will not work because the first `nn.Emebedding` layer is not independently wrapped as a FSDP module (because of
+     the transformer-based wrapping policy), and not calling it through FSDP root module forward will not all-gather
+     its parameter, thus resulting in "RuntimeError: 'weight' must be 2-D" error. Similarly, if we want to call just
+     the `lm_head` part of a model, we need this trick too to properly get its params all-gathered.
+     """
+
+     def __call__(
+         self,
+         wrapper_module: FullyShardedDataParallel,
+         method: Callable,
+         *args: Any,
+         **kwargs: Any,
+     ):
+         """Reroutes a method call through the `wrapper_module`'s `forward` method.
+         Args:
+             wrapper_module: The module that has `original_module` wrapped.
+             original_module: The module that was wrapped inside `wrapper_module`.
+             method_name: The name of the method that should be called on the `original_module` after inputs get
+                 redirected through the `wrapper_module`'s `forward` method.
+             *args: The positional arguments to the method `method_name`. They will get passed to a patched
+                 `forward` method instead.
+             **kwargs: The keyword arguments to the method `method_name`. They will get passed to a patched
+                 `forward` method instead.
+         """
+         assert isinstance(wrapper_module, FullyShardedDataParallel)
+         original_module = wrapper_module._fsdp_wrapped_module
+         original_forward = original_module.forward
+
+         def wrapped_forward(*_args: Any, **_kwargs: Any) -> Any:
+             # Unpatch ourselves immediately before calling the method `method_name`
+             # because itself may want to call the real `forward`
+             original_module.forward = original_forward  # type: ignore[method-assign]
+             # Call the actual method e.g. `.training_step(...)`
+             out = method(*_args, **_kwargs)
+             return out
+
+         # Patch the original_module's forward so we can redirect the arguments back to the real method
+         original_module.forward = wrapped_forward  # type: ignore[method-assign]
+         wrapper_output = wrapper_module(*args, **kwargs)
+         return wrapper_output
+
+
+ class LigerORPOTrainer(ORPOTrainer):
+     def concatenated_forward(
+         self, model: nn.Module, batch: Dict[str, Union[List, torch.LongTensor]]
+     ) -> Tuple[
+         torch.FloatTensor, torch.FloatTensor, torch.FloatTensor, torch.FloatTensor
+     ]:
+         """
+         Run the given model on the given batch of inputs, concatenating the chosen and rejected inputs together.
+         We do this to avoid doing two forward passes, because it's faster for FSDP.
+         """
+         concatenated_batch = self.concatenated_inputs(
+             batch,
+             is_encoder_decoder=self.is_encoder_decoder,
+             label_pad_token_id=self.label_pad_token_id,
+             padding_value=self.padding_value,
+             device=self.accelerator.device,
+         )
+         # if self.accelerator.is_main_process:
+         #     import pdb; pdb.set_trace()
+         # torch.distributed.barrier()
+         model_kwargs = (
+             {
+                 "decoder_input_ids": self._shift_right(
+                     concatenated_batch["concatenated_labels"]
+                 ),
+             }
+             if self.is_encoder_decoder
+             else {}
+         )
+
+         if self.aux_loss_enabled:
+             model_kwargs["output_router_logits"] = True
+
+         if isinstance(model, FullyShardedDataParallel):
+             outputs = _FSDPForwardRedirection()(
+                 model,
+                 model._fsdp_wrapped_module.model,
+                 concatenated_batch["concatenated_input_ids"],
+                 attention_mask=concatenated_batch["concatenated_attention_mask"],
+                 use_cache=False,
+                 **model_kwargs,
+             )
+         else:
+             if isinstance(model, torch.nn.DataParallel):
+                 model = model.module
+             outputs = model.model(
+                 concatenated_batch["concatenated_input_ids"],
+                 attention_mask=concatenated_batch["concatenated_attention_mask"],
+                 use_cache=False,
+                 **model_kwargs,
+             )
+
+         orpo_loss_fn = LigerFusedLinearORPOLoss(
+             ignore_index=self.label_pad_token_id, beta=self.beta
+         )
+
+         def orpo_partial(lm_head, last_hidden_state, concatenated_labels):
+             return orpo_loss_fn(
+                 lm_head.weight, last_hidden_state, concatenated_labels, lm_head.bias
+             )
+
+         orpo_loss, aux_outputs = _FSDPForwardRedirection()(
+             model,
+             orpo_partial,
+             model.lm_head,
+             outputs.last_hidden_state,
+             concatenated_batch["concatenated_labels"],
+         )
+         return orpo_loss, aux_outputs
+
+     def get_batch_loss_metrics(
+         self,
+         model,
+         batch: Dict[str, Union[List, torch.LongTensor]],
+         train_eval: Literal["train", "eval"] = "train",
+     ):
+         """Compute the ORPO loss and other metrics for the given batch of inputs for train or test."""
+         metrics = {}
+         loss, aux_outputs = self.concatenated_forward(model, batch)
+         (
+             policy_chosen_logps,
+             policy_rejected_logps,
+             policy_chosen_logits,
+             policy_rejected_logits,
+             policy_nll_loss,
+         ) = aux_outputs[:5]
+
+         # return loss, metrics
+         chosen_rewards, rejected_rewards, log_odds_ratio, log_odds_chosen = aux_outputs[
+             5:
+         ]
+
+         reward_accuracies = (chosen_rewards > rejected_rewards).float()
+
+         prefix = "eval_" if train_eval == "eval" else ""
+         metrics[f"{prefix}rewards/chosen"] = chosen_rewards.mean()
+         metrics[f"{prefix}rewards/rejected"] = rejected_rewards.mean()
+         metrics[f"{prefix}rewards/accuracies"] = reward_accuracies.mean()
+         metrics[f"{prefix}rewards/margins"] = (chosen_rewards - rejected_rewards).mean()
+         metrics[f"{prefix}logps/rejected"] = policy_rejected_logps.detach().mean()
+         metrics[f"{prefix}logps/chosen"] = policy_chosen_logps.detach().mean()
+         metrics[f"{prefix}logits/rejected"] = policy_rejected_logits.detach().mean()
+         metrics[f"{prefix}logits/chosen"] = policy_chosen_logits.detach().mean()
+         metrics[f"{prefix}nll_loss"] = policy_nll_loss.detach().mean()
+         metrics[f"{prefix}log_odds_ratio"] = log_odds_ratio
+         metrics[f"{prefix}log_odds_chosen"] = log_odds_chosen
+         for k, v in metrics.items():
+             metrics[k] = v.item()
+
+         return loss, metrics
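The `_FSDPForwardRedirection` docstring above describes routing an arbitrary method call through the wrapper's `forward` so its pre- and post-forward hooks still run. A toy, non-FSDP sketch of the same redirection mechanics on a plain wrapper module (`Wrapper` and `redirect` are made-up names for illustration, not part of the package):

```python
import torch
import torch.nn as nn


class Wrapper(nn.Module):
    """Stands in for FullyShardedDataParallel: forward delegates to the inner module."""

    def __init__(self, inner: nn.Module):
        super().__init__()
        self.inner = inner

    def forward(self, *args, **kwargs):
        return self.inner(*args, **kwargs)


def redirect(wrapper: Wrapper, method, *args, **kwargs):
    inner = wrapper.inner
    original_forward = inner.forward

    def wrapped_forward(*a, **kw):
        inner.forward = original_forward  # restore before running the real target
        return method(*a, **kw)

    # Temporarily swap the inner forward, then call through the wrapper so its
    # own forward (and any hooks) run around the redirected method.
    inner.forward = wrapped_forward
    return wrapper(*args, **kwargs)


inner = nn.Linear(4, 4)
wrapper = Wrapper(inner)
x = torch.randn(2, 4)

# Call only a piece of the wrapped module (here a weight-only matmul) via the wrapper.
out = redirect(wrapper, lambda t: t @ inner.weight.T, x)
print(out.shape)  # torch.Size([2, 4])
```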
liger_kernel/transformers/qwen2vl_mrope.py ADDED
@@ -0,0 +1,20 @@
+ from liger_kernel.ops.qwen2vl_mrope import LigerQwen2VLMRopeFunction
+
+
+ def liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section, unsqueeze_dim=1):
+     """
+     Applies Multimodal Rotary Positional Embedding (M-RoPE) operation to query and key states.
+
+     Args:
+         q (torch.Tensor): The query tensor of shape (bsz, n_q_head, seq_len, head_dim).
+         k (torch.Tensor): The key tensor of shape (bsz, n_kv_head, seq_len, head_dim).
+         cos (torch.Tensor): The cosine tensor of shape (3, 1, seq_len, head_dim).
+         sin (torch.Tensor): The sine tensor of shape (3, 1, seq_len, head_dim).
+         mrope_section (List[int]): The multimodal rope section for channel dimension of temporal, height and width in rope calculation.
+         unsqueeze_dim (int, optional): The dimension to unsqueeze. Defaults to 1.
+
+     Returns:
+         Tuple[torch.Tensor, torch.Tensor]: The query and key tensors after applying the M-RoPE operation.
+     """
+
+     return LigerQwen2VLMRopeFunction.apply(q, k, cos, sin, mrope_section, unsqueeze_dim)
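A hypothetical shape-level usage sketch for `liger_multimodal_rotary_pos_emb` (assumes a CUDA-capable GPU, since the underlying op is a Triton kernel; the sizes and the temporal/height/width split in `mrope_section` are illustrative, chosen so the sections sum to `head_dim // 2` as in Qwen2-VL):

```python
import torch

from liger_kernel.transformers.qwen2vl_mrope import liger_multimodal_rotary_pos_emb

bsz, n_q_head, n_kv_head, seq_len, head_dim = 1, 4, 2, 8, 64
mrope_section = [8, 12, 12]  # 8 + 12 + 12 == head_dim // 2

q = torch.randn(bsz, n_q_head, seq_len, head_dim, device="cuda")
k = torch.randn(bsz, n_kv_head, seq_len, head_dim, device="cuda")
cos = torch.randn(3, 1, seq_len, head_dim, device="cuda")
sin = torch.randn(3, 1, seq_len, head_dim, device="cuda")

q_rot, k_rot = liger_multimodal_rotary_pos_emb(q, k, cos, sin, mrope_section)
print(q_rot.shape, k_rot.shape)  # same shapes as q and k
```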
liger_kernel/utils.py ADDED
@@ -0,0 +1,13 @@
+ import torch
+
+
+ def infer_device():
+     """
+     Get current device name based on available devices
+     """
+     if torch.cuda.is_available():
+         return "cuda"
+     elif torch.xpu.is_available():
+         return "xpu"
+     else:
+         return "cpu"
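A minimal usage sketch for the new `infer_device` helper (illustrative only): pick the best available accelerator and allocate a tensor there.

```python
import torch

from liger_kernel.utils import infer_device

device = infer_device()  # "cuda", "xpu", or "cpu"
x = torch.ones(2, 2, device=device)
print(device, x.device)
```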