liger-kernel-nightly 0.5.5.dev20250318183047__py3-none-any.whl → 0.5.5.dev20250320214749__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of liger-kernel-nightly might be problematic.

@@ -21,6 +21,190 @@ from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinea
  logger = logging.get_logger(__name__)


+ @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def lce_forward_deprecated(
+     self,
+     input_ids: torch.LongTensor = None,
+     pixel_values: torch.FloatTensor = None,
+     attention_mask: Optional[torch.Tensor] = None,
+     position_ids: Optional[torch.LongTensor] = None,
+     past_key_values: Optional[Union[List[torch.FloatTensor], Cache]] = None,
+     token_type_ids: Optional[torch.LongTensor] = None,
+     cache_position: Optional[torch.LongTensor] = None,
+     inputs_embeds: Optional[torch.FloatTensor] = None,
+     labels: Optional[torch.LongTensor] = None,
+     use_cache: Optional[bool] = None,
+     output_attentions: Optional[bool] = None,
+     output_hidden_states: Optional[bool] = None,
+     return_dict: Optional[bool] = None,
+ ) -> Union[Tuple, PaliGemmaCausalLMOutputWithPast]:
+     r"""
+     Args:
+         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+     Returns:
+
+     Example:
+
+     ```python
+     >>> from PIL import Image
+     >>> import requests
+     >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+     >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/PaliGemma-test-224px-hf")
+     >>> processor = AutoProcessor.from_pretrained("google/PaliGemma-test-224px-hf")
+
+     >>> prompt = "answer en Where is the cow standing?"
+     >>> url = "https://huggingface.co/gv-hf/PaliGemma-test-224px-hf/resolve/main/cow_beach_1.png"
+     >>> image = Image.open(requests.get(url, stream=True).raw)
+
+     >>> inputs = processor(text=prompt, images=image, return_tensors="pt")
+
+     >>> # Generate
+     >>> generate_ids = model.generate(**inputs, max_length=30)
+     >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+     "answer en Where is the cow standing?\nbeach"
+     ```"""
+
+     if (input_ids is None) ^ (inputs_embeds is not None):
+         raise ValueError(
+             "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+         )
+
+     output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+     output_hidden_states = (
+         output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+     )
+     return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+     # the attention mask is turned 4d after, we keep track of the original one
+     input_attention_mask = attention_mask
+
+     if inputs_embeds is None:
+         # 1. Extra the input embeddings
+         inputs_embeds = self.get_input_embeddings()(input_ids)
+
+         # 2. Merge text and images
+         if pixel_values is not None and input_ids.shape[1] != 1:
+             image_outputs = self.vision_tower(pixel_values.to(inputs_embeds.dtype))
+             selected_image_feature = image_outputs.last_hidden_state
+             image_features = self.multi_modal_projector(selected_image_feature)
+
+             if cache_position is None:
+                 cache_position = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device)
+             inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
+                 image_features, inputs_embeds, input_ids, attention_mask, labels, token_type_ids, cache_position
+             )
+
+         else:
+             # In case input_ids.shape[1] == 1 & pixel_values==None & past_key_values != None, we are in the case of
+             # generation with cache
+             if past_key_values is not None and pixel_values is not None and input_ids.shape[1] == 1:
+                 # Retrieve the first layer to inspect the logits and mask out the hidden states
+                 # that are set to 0
+                 # TODO @molbap this will only work for dynamic cache.
+                 first_layer_past_key_value = past_key_values[0][0][:, :, :, 0]
+
+                 # Sum all dimensions of head_dim (-2) to avoid random errors such as: https://github.com/huggingface/transformers/pull/28032#issuecomment-1863691941
+                 batch_index, non_attended_tokens = torch.where(first_layer_past_key_value.float().sum(-2) == 0)
+
+                 # Get the target length
+                 target_seqlen = cache_position[-1] + 1
+                 extended_attention_mask = torch.ones(
+                     (attention_mask.shape[0], target_seqlen - attention_mask.shape[1] + 1),
+                     dtype=attention_mask.dtype,
+                     device=attention_mask.device,
+                 )
+                 # Filter out only the tokens that can be un-attended, this can happen
+                 # if one uses PaliGemma+ Fused modules where the cache on the
+                 # first iteration is already big enough, or if one passes custom cache
+                 valid_indices = non_attended_tokens < extended_attention_mask.size(-1)
+                 new_batch_index = batch_index[valid_indices]
+                 new_non_attended_tokens = non_attended_tokens[valid_indices]
+
+                 # Zero-out the places where we don't need to attend
+                 extended_attention_mask[new_batch_index, new_non_attended_tokens] = 0
+
+                 attention_mask = torch.cat((attention_mask, extended_attention_mask), dim=1)
+                 position_ids = torch.sum(attention_mask, dim=1).unsqueeze(-1) - 1
+
+     attention_mask = attention_mask.to(inputs_embeds.dtype)
+     outputs = self.language_model.model(
+         attention_mask=attention_mask,
+         position_ids=position_ids,
+         past_key_values=past_key_values,
+         inputs_embeds=inputs_embeds,
+         use_cache=use_cache,
+         output_attentions=output_attentions,
+         output_hidden_states=output_hidden_states,
+         return_dict=return_dict,
+         cache_position=cache_position,
+     )
+
+     hidden_states = outputs[0]
+
+     loss = None
+     logits = None
+
+     if self.training and (labels is not None):
+         shift_hidden_states = hidden_states[..., :-1, :]
+         shift_labels = labels[..., 1:]
+
+         hidden_device = shift_hidden_states.device
+
+         if attention_mask is not None:
+             # we use the input attention mask to shift the hidden_states and labels, because it is 2D.
+             # we also crop attn mask in case it is longer, which happens in PrefixTuning with peft
+             shift_attention_mask = attention_mask[:, -shift_hidden_states.shape[1] :].to(hidden_device)
+             shift_hidden_states = shift_hidden_states[shift_attention_mask.to(hidden_device) != 0].contiguous()
+             shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+         else:
+             shift_hidden_states = shift_hidden_states.contiguous()
+             shift_labels = shift_labels.contiguous()
+
+         # Flatten hidden state
+         shift_hidden_states = shift_hidden_states.view(-1, self.config.text_config.hidden_size)
+         shift_labels = shift_labels.view(-1).to(hidden_device)
+
+         lce = LigerFusedLinearCrossEntropyLoss()
+         loss = lce(self.language_model.lm_head.weight, shift_hidden_states, shift_labels)
+
+     else:
+         logits = self.language_model.lm_head(hidden_states)
+         if labels is not None:
+             shift_logits = logits[..., :-1, :]
+             shift_labels = labels[..., 1:]
+             if input_attention_mask is not None:
+                 # we use the input attention mask to shift the logits and labels, because it is 2D.
+                 shift_attention_mask = input_attention_mask[..., 1:]
+                 shift_logits = shift_logits[shift_attention_mask.to(logits.device) != 0].contiguous()
+                 shift_labels = shift_labels[shift_attention_mask.to(shift_labels.device) != 0].contiguous()
+             else:
+                 shift_logits = shift_logits.contiguous()
+                 shift_labels = shift_labels.contiguous()
+             # Flatten the tokens
+             loss_fct = CrossEntropyLoss()
+
+             flat_logits = shift_logits.view(-1, self.config.vocab_size)
+             flat_labels = shift_labels.view(-1).to(shift_logits.device)
+             loss = loss_fct(flat_logits, flat_labels)
+     if not return_dict:
+         output = (logits,) + outputs[1:]
+         return (loss,) + output if loss is not None else output
+
+     return PaliGemmaCausalLMOutputWithPast(
+         loss=loss,
+         logits=logits,
+         past_key_values=outputs.past_key_values,
+         hidden_states=outputs.hidden_states,
+         attentions=outputs.attentions,
+     )
+
+
  @deprecate_kwarg("num_logits_to_keep", version="4.50", new_name="logits_to_keep")
  @add_start_docstrings_to_model_forward(PALIGEMMA_INPUTS_DOCSTRING)
  @replace_return_docstrings(output_type=PaliGemmaCausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
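
The deprecated forward above avoids materializing the full logits tensor during training: it hands the `lm_head` weight and the flattened, shifted hidden states directly to `LigerFusedLinearCrossEntropyLoss`, which fuses the output projection with the cross-entropy loss. Below is a minimal standalone sketch of that call pattern (not part of the diff); the shapes are made up, and a CUDA device is assumed since Liger's Triton kernels run on GPU.

```python
import torch

from liger_kernel.transformers.fused_linear_cross_entropy import LigerFusedLinearCrossEntropyLoss

# Illustrative sizes only; in the patched forward they come from the model config.
num_tokens, hidden_size, vocab_size = 32, 64, 1000

# Flattened, shifted hidden states and labels, mirroring shift_hidden_states / shift_labels above.
hidden_states = torch.randn(num_tokens, hidden_size, device="cuda")
labels = torch.randint(0, vocab_size, (num_tokens,), device="cuda")
lm_head_weight = torch.randn(vocab_size, hidden_size, device="cuda", requires_grad=True)

# Same argument order as in the patched forward: lm_head weight, hidden states, labels.
lce = LigerFusedLinearCrossEntropyLoss()
loss = lce(lm_head_weight, hidden_states, labels)
loss.backward()
```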
@@ -631,6 +631,7 @@ def apply_liger_kernel_to_paligemma(

      # PaliGemma submodules are ['vision_tower', 'multi_modal_projector', 'language_model']

+     from transformers.models.gemma.modeling_gemma import GemmaForCausalLM
      from transformers.models.gemma2.modeling_gemma2 import Gemma2ForCausalLM
      from transformers.models.paligemma import modeling_paligemma
      from transformers.models.paligemma.modeling_paligemma import PaliGemmaForConditionalGeneration
@@ -639,6 +640,7 @@ def apply_liger_kernel_to_paligemma(
      from transformers.models.siglip.modeling_siglip import SiglipVisionModel

      from liger_kernel.transformers.model.paligemma import lce_forward
+     from liger_kernel.transformers.model.paligemma import lce_forward_deprecated

      # The vision_tower is a SiglipVisionModel
      if layer_norm:
@@ -647,13 +649,22 @@ def apply_liger_kernel_to_paligemma(
      # SiglipMLP is standard FFN so LigerGEGLUMLP is not compatible
      # The multi_modal_projector is Linear, nothing to do

-     # The language_model is Gemma2ForCausalLM
-     apply_liger_kernel_to_gemma2(rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, geglu=geglu)
+     # The language_model is GemmaForCausalLM or Gemma2ForCausalLM
+     apply_liger_kernel_to_gemma(
+         rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+     )
+     apply_liger_kernel_to_gemma2(
+         rope=rope, cross_entropy=False, fused_linear_cross_entropy=False, rms_norm=rms_norm, geglu=geglu
+     )
      # Handle loss function
      if cross_entropy:
          modeling_paligemma.nn.CrossEntropyLoss = LigerCrossEntropyLoss
      if fused_linear_cross_entropy:
-         modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+         if transformer_version >= version.parse(SUPPORTED_TRANSFORMER_VERSION):
+             modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward
+         else:  # if version < 4.46.1
+             logger.warning(TRANSFORMER_DEPRECATION_WARNING)
+             modeling_paligemma.PaliGemmaForConditionalGeneration.forward = lce_forward_deprecated

      if model is not None:
          # The model instance already exists, so we need to additionally patch the
@@ -672,16 +683,31 @@ def apply_liger_kernel_to_paligemma(
                  _patch_layer_norm_module(layer.layer_norm1)
                  _patch_layer_norm_module(layer.layer_norm2)

-         language_model: Gemma2ForCausalLM = model.language_model
-
-         apply_liger_kernel_to_gemma2(
-             rope=rope,
-             cross_entropy=False,
-             fused_linear_cross_entropy=False,
-             rms_norm=rms_norm,
-             geglu=geglu,
-             model=language_model,
-         )
+         language_model = model.language_model
+
+         if isinstance(language_model, GemmaForCausalLM):
+             apply_liger_kernel_to_gemma(
+                 rope=rope,
+                 cross_entropy=False,
+                 fused_linear_cross_entropy=False,
+                 rms_norm=rms_norm,
+                 geglu=geglu,
+                 model=language_model,
+             )
+
+         elif isinstance(language_model, Gemma2ForCausalLM):
+             apply_liger_kernel_to_gemma2(
+                 rope=rope,
+                 cross_entropy=False,
+                 fused_linear_cross_entropy=False,
+                 rms_norm=rms_norm,
+                 geglu=geglu,
+                 model=language_model,
+             )
+         else:
+             raise TypeError(
+                 "The language_model of a PaliGemma model must be either GemmaForCausalLM or Gemma2ForCausalLM."
+             )


  def apply_liger_kernel_to_qwen2(
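
Taken together, the monkey-patch changes dispatch PaliGemma's language model to either the Gemma or the Gemma2 Liger patch and fall back to `lce_forward_deprecated` on transformers releases older than the supported version. The sketch below (not part of the diff) shows how the entry point might be invoked, using the keyword names visible in the hunks; the flag values and the checkpoint id are illustrative assumptions, not defaults documented here.

```python
import torch
from transformers import PaliGemmaForConditionalGeneration

from liger_kernel.transformers import apply_liger_kernel_to_paligemma

# Patch the transformers PaliGemma classes in place; the flags mirror the
# parameters that appear in the diff (rope, cross_entropy, fused_linear_cross_entropy,
# rms_norm, geglu). Values chosen here are illustrative.
apply_liger_kernel_to_paligemma(
    rope=True,
    cross_entropy=False,
    fused_linear_cross_entropy=True,
    rms_norm=True,
    geglu=True,
)

# Illustrative checkpoint id; any PaliGemma checkpoint whose language model is
# GemmaForCausalLM or Gemma2ForCausalLM matches the isinstance dispatch added above.
model = PaliGemmaForConditionalGeneration.from_pretrained(
    "google/paligemma-3b-pt-224",
    torch_dtype=torch.bfloat16,
)

# An already-instantiated model can instead be patched by passing `model=model`,
# which is the code path the isinstance dispatch in the diff extends.
```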
@@ -1,6 +1,6 @@
  Metadata-Version: 2.1
  Name: liger_kernel_nightly
- Version: 0.5.5.dev20250318183047
+ Version: 0.5.5.dev20250320214749
  Summary: Efficient Triton kernels for LLM Training
  License: BSD 2-CLAUSE LICENSE
  Copyright 2024 LinkedIn Corporation
@@ -43,7 +43,7 @@ liger_kernel/transformers/group_norm.py,sha256=6qMAWOprr4SzP0YhNVNGQIBpM5aUHplUD
  liger_kernel/transformers/jsd.py,sha256=DGqRnxIZxsvxo0_tbbxX3b-sDbDjC_yKufyRIHCcScY,2979
  liger_kernel/transformers/kl_div.py,sha256=WLffFbh1EExD2Eb1F7lN11fo9JJC-0751WJjZAF1Fj8,409
  liger_kernel/transformers/layer_norm.py,sha256=c9pk3PEasOKYR0rhe5e5nNrnYKVCEW4VC8S6LpCq9EQ,906
- liger_kernel/transformers/monkey_patch.py,sha256=1Vzt_8UUMgO4t1ui7fNkKMcDfnWoCZfe9iyqeYSbe1w,50851
+ liger_kernel/transformers/monkey_patch.py,sha256=qRCgchODu6AuO8la6uAnrDEA-sSP9ADt8IOp4kl-Dd0,52053
  liger_kernel/transformers/qwen2vl_mrope.py,sha256=5EwSqrMdsL9MYspeBMXBsNJKvH0MOmRrtJXAJlnnlOI,1047
  liger_kernel/transformers/rms_norm.py,sha256=GqCEJuGt0YdqqlMcToE0Wp4A8YFquDa4UUSyH2uFW2A,1191
  liger_kernel/transformers/rope.py,sha256=ZTrTORSAyfcFIKjk6XEeYmk4ROH7xXED9L4g2NFntlE,999
@@ -60,7 +60,7 @@ liger_kernel/transformers/model/mistral.py,sha256=o7tyl1sPWPfZwwrBLRlryHlSI8I55v
  liger_kernel/transformers/model/mixtral.py,sha256=T0ITv2-PkR8VErVOVUizoS4EzjmARyR7GFh0tXDB_i4,11089
  liger_kernel/transformers/model/mllama.py,sha256=RCKtwnGOMFYIbtt1zUQ15Cyv4eNpHkTWcgkmG2EEs2I,10804
  liger_kernel/transformers/model/olmo2.py,sha256=5M8kczp4D-jvbjcV7cKATIJGF34xd-Rs-PPdKZWSIlY,4685
- liger_kernel/transformers/model/paligemma.py,sha256=C_Pb1qqxZl0J0fyXlwp1jTwNXckK9xuoSLHXy3rkWsE,10298
+ liger_kernel/transformers/model/paligemma.py,sha256=GNReT6tVZt3ON6aaa9ovg8mnu1hYocSx9OhgC7b-_28,19191
  liger_kernel/transformers/model/phi3.py,sha256=NmU2DuU1Huwha6K7YSsJCnvQfUovTTGlsfBZhbx0UoI,9951
  liger_kernel/transformers/model/qwen2.py,sha256=t7NotBHoebsPqNSxwaf9DXTg8jxgB5BdunSGqYOE0hQ,9240
  liger_kernel/transformers/model/qwen2_5_vl.py,sha256=70BnHZjx6eQWTwi3zc5SMwxTeOOA4Tbdkfy6IYRcTaM,9289
@@ -69,9 +69,9 @@ liger_kernel/transformers/trainer/__init__.py,sha256=p7yQfklV8-467qSz_ZMimkbDF7H
  liger_kernel/transformers/trainer/orpo_trainer.py,sha256=pdekW7l6Qg_aqa5SYKYlSWUF8m3lkOFvFLcIMEHrz9s,8338
  liger_kernel/triton/__init__.py,sha256=qCiCamzCRv6lpV8IqpAc9YMdNKC7GKurClWceQPnlis,92
  liger_kernel/triton/monkey_patch.py,sha256=Rd0hUHAzDkFfHvnX7-PBaNK5EKnZhtfM_h-fgQH9HPY,1568
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/METADATA,sha256=iXbBoxaUi6eIZIh18U5BHGauA2Ol0b_GcVuZKfWtnxE,22832
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
- liger_kernel_nightly-0.5.5.dev20250318183047.dist-info/RECORD,,
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/LICENSE,sha256=OhzLDHJ0to4a8sodVLELZiCFylZ1NAAYLs-HrjPy0ag,1312
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/METADATA,sha256=WqbzHO3j_NRFdVkkvIfjevIYWO1ojp9D4NAV6hkIRV4,22832
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/NOTICE,sha256=njwnoPZLh9AN8SJQzxvCGLHi-8X__AvWRze6joNXIY8,2066
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/WHEEL,sha256=iAkIy5fosb7FzIOwONchHf19Qu7_1wCWyFNR5gu9nU0,91
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/top_level.txt,sha256=2eghu4hA3LnkM7ElW92tQ8zegWKgSbeo-k-aGe1YnvY,13
+ liger_kernel_nightly-0.5.5.dev20250320214749.dist-info/RECORD,,