transformers 5.0.0rc1-py3-none-any.whl → 5.0.0rc2-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries, and is provided for informational purposes only.
Files changed (671)
  1. transformers/__init__.py +20 -1
  2. transformers/activations.py +1 -1
  3. transformers/audio_utils.py +0 -1
  4. transformers/cache_utils.py +17 -15
  5. transformers/configuration_utils.py +114 -70
  6. transformers/conversion_mapping.py +68 -5
  7. transformers/core_model_loading.py +201 -35
  8. transformers/dependency_versions_table.py +1 -1
  9. transformers/feature_extraction_utils.py +54 -22
  10. transformers/generation/candidate_generator.py +79 -31
  11. transformers/generation/configuration_utils.py +162 -122
  12. transformers/generation/continuous_batching/cache.py +47 -18
  13. transformers/generation/continuous_batching/cache_manager.py +131 -34
  14. transformers/generation/continuous_batching/continuous_api.py +101 -64
  15. transformers/generation/continuous_batching/requests.py +28 -1
  16. transformers/generation/continuous_batching/scheduler.py +11 -4
  17. transformers/generation/stopping_criteria.py +1 -1
  18. transformers/generation/utils.py +108 -110
  19. transformers/generation/watermarking.py +8 -5
  20. transformers/image_processing_base.py +2 -12
  21. transformers/image_processing_utils_fast.py +15 -4
  22. transformers/initialization.py +37 -0
  23. transformers/integrations/__init__.py +12 -0
  24. transformers/integrations/accelerate.py +44 -111
  25. transformers/integrations/aqlm.py +3 -5
  26. transformers/integrations/awq.py +2 -5
  27. transformers/integrations/bitnet.py +5 -8
  28. transformers/integrations/bitsandbytes.py +16 -15
  29. transformers/integrations/deepspeed.py +18 -3
  30. transformers/integrations/eetq.py +3 -5
  31. transformers/integrations/fbgemm_fp8.py +1 -1
  32. transformers/integrations/finegrained_fp8.py +6 -16
  33. transformers/integrations/flash_attention.py +2 -2
  34. transformers/integrations/higgs.py +2 -5
  35. transformers/integrations/hub_kernels.py +23 -5
  36. transformers/integrations/integration_utils.py +35 -0
  37. transformers/integrations/mistral.py +12 -0
  38. transformers/integrations/moe.py +240 -0
  39. transformers/integrations/mxfp4.py +4 -10
  40. transformers/integrations/peft.py +5 -0
  41. transformers/integrations/quanto.py +5 -2
  42. transformers/integrations/spqr.py +3 -5
  43. transformers/integrations/tensor_parallel.py +167 -221
  44. transformers/integrations/vptq.py +3 -5
  45. transformers/modeling_gguf_pytorch_utils.py +66 -19
  46. transformers/modeling_rope_utils.py +78 -81
  47. transformers/modeling_utils.py +583 -503
  48. transformers/models/__init__.py +19 -0
  49. transformers/models/afmoe/modeling_afmoe.py +7 -16
  50. transformers/models/afmoe/modular_afmoe.py +5 -13
  51. transformers/models/aimv2/modeling_aimv2.py +4 -0
  52. transformers/models/aimv2/modular_aimv2.py +4 -0
  53. transformers/models/albert/modeling_albert.py +3 -0
  54. transformers/models/align/modeling_align.py +12 -6
  55. transformers/models/altclip/modeling_altclip.py +7 -3
  56. transformers/models/apertus/modeling_apertus.py +4 -2
  57. transformers/models/apertus/modular_apertus.py +4 -1
  58. transformers/models/arcee/modeling_arcee.py +1 -1
  59. transformers/models/aria/modeling_aria.py +8 -4
  60. transformers/models/aria/modular_aria.py +7 -3
  61. transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
  62. transformers/models/auto/auto_factory.py +1 -1
  63. transformers/models/auto/configuration_auto.py +27 -0
  64. transformers/models/auto/feature_extraction_auto.py +7 -3
  65. transformers/models/auto/image_processing_auto.py +4 -2
  66. transformers/models/auto/modeling_auto.py +31 -0
  67. transformers/models/auto/processing_auto.py +4 -0
  68. transformers/models/auto/tokenization_auto.py +132 -153
  69. transformers/models/auto/video_processing_auto.py +5 -2
  70. transformers/models/aya_vision/modeling_aya_vision.py +7 -3
  71. transformers/models/bamba/modeling_bamba.py +18 -19
  72. transformers/models/bamba/modular_bamba.py +17 -16
  73. transformers/models/bark/modeling_bark.py +9 -0
  74. transformers/models/bart/configuration_bart.py +0 -1
  75. transformers/models/bart/modeling_bart.py +7 -0
  76. transformers/models/beit/image_processing_beit_fast.py +0 -1
  77. transformers/models/bert/modeling_bert.py +3 -0
  78. transformers/models/bert_generation/modeling_bert_generation.py +2 -0
  79. transformers/models/big_bird/modeling_big_bird.py +3 -0
  80. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
  81. transformers/models/bit/modeling_bit.py +5 -1
  82. transformers/models/bitnet/modeling_bitnet.py +1 -1
  83. transformers/models/blenderbot/modeling_blenderbot.py +7 -0
  84. transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
  85. transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
  86. transformers/models/blip/modeling_blip.py +2 -0
  87. transformers/models/blip/modeling_blip_text.py +8 -0
  88. transformers/models/blip_2/modeling_blip_2.py +2 -0
  89. transformers/models/bloom/modeling_bloom.py +13 -44
  90. transformers/models/blt/modeling_blt.py +162 -2
  91. transformers/models/blt/modular_blt.py +168 -3
  92. transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
  93. transformers/models/bridgetower/modeling_bridgetower.py +6 -0
  94. transformers/models/bros/modeling_bros.py +8 -0
  95. transformers/models/camembert/modeling_camembert.py +109 -106
  96. transformers/models/canine/modeling_canine.py +6 -0
  97. transformers/models/canine/tokenization_canine.py +2 -0
  98. transformers/models/chameleon/modeling_chameleon.py +9 -4
  99. transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
  100. transformers/models/clap/feature_extraction_clap.py +2 -2
  101. transformers/models/clap/modeling_clap.py +25 -15
  102. transformers/models/clip/modeling_clip.py +2 -0
  103. transformers/models/clipseg/modeling_clipseg.py +4 -0
  104. transformers/models/clvp/modeling_clvp.py +14 -3
  105. transformers/models/code_llama/tokenization_code_llama.py +1 -1
  106. transformers/models/codegen/modeling_codegen.py +13 -4
  107. transformers/models/cohere/modeling_cohere.py +1 -1
  108. transformers/models/cohere2/modeling_cohere2.py +1 -1
  109. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
  110. transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
  111. transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
  112. transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
  113. transformers/models/convbert/modeling_convbert.py +3 -0
  114. transformers/models/convnext/image_processing_convnext.py +2 -2
  115. transformers/models/convnext/image_processing_convnext_fast.py +9 -13
  116. transformers/models/csm/generation_csm.py +19 -22
  117. transformers/models/csm/modeling_csm.py +3 -1
  118. transformers/models/csm/modular_csm.py +2 -0
  119. transformers/models/ctrl/modeling_ctrl.py +14 -2
  120. transformers/models/cvt/modeling_cvt.py +5 -1
  121. transformers/models/cwm/modeling_cwm.py +1 -1
  122. transformers/models/d_fine/configuration_d_fine.py +3 -4
  123. transformers/models/d_fine/modeling_d_fine.py +46 -39
  124. transformers/models/d_fine/modular_d_fine.py +15 -4
  125. transformers/models/dab_detr/configuration_dab_detr.py +2 -2
  126. transformers/models/dab_detr/modeling_dab_detr.py +1 -1
  127. transformers/models/dac/modeling_dac.py +4 -4
  128. transformers/models/data2vec/modeling_data2vec_text.py +7 -0
  129. transformers/models/data2vec/modular_data2vec_text.py +7 -0
  130. transformers/models/dbrx/configuration_dbrx.py +9 -1
  131. transformers/models/dbrx/modeling_dbrx.py +1 -1
  132. transformers/models/deberta/modeling_deberta.py +2 -0
  133. transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
  134. transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
  135. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
  136. transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
  137. transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
  138. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
  139. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
  140. transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
  141. transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
  142. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
  143. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
  144. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
  145. transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
  146. transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
  147. transformers/models/depth_anything/configuration_depth_anything.py +2 -3
  148. transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
  149. transformers/models/detr/configuration_detr.py +1 -1
  150. transformers/models/detr/modeling_detr.py +8 -1
  151. transformers/models/dia/generation_dia.py +3 -10
  152. transformers/models/dia/modeling_dia.py +12 -1
  153. transformers/models/dia/modular_dia.py +11 -0
  154. transformers/models/dia/processing_dia.py +1 -1
  155. transformers/models/diffllama/modeling_diffllama.py +3 -3
  156. transformers/models/diffllama/modular_diffllama.py +2 -2
  157. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
  158. transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
  159. transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
  160. transformers/models/distilbert/modeling_distilbert.py +11 -9
  161. transformers/models/doge/modeling_doge.py +1 -1
  162. transformers/models/donut/image_processing_donut_fast.py +0 -1
  163. transformers/models/donut/modeling_donut_swin.py +16 -12
  164. transformers/models/dots1/modeling_dots1.py +14 -5
  165. transformers/models/dpt/configuration_dpt.py +1 -1
  166. transformers/models/dpt/image_processing_dpt_fast.py +1 -2
  167. transformers/models/dpt/modular_dpt.py +1 -2
  168. transformers/models/edgetam/configuration_edgetam.py +1 -1
  169. transformers/models/edgetam/modeling_edgetam.py +5 -2
  170. transformers/models/edgetam/modular_edgetam.py +15 -14
  171. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
  172. transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
  173. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
  174. transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
  175. transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
  176. transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
  177. transformers/models/efficientnet/modeling_efficientnet.py +5 -1
  178. transformers/models/electra/modeling_electra.py +7 -0
  179. transformers/models/emu3/modeling_emu3.py +8 -2
  180. transformers/models/emu3/modular_emu3.py +7 -1
  181. transformers/models/encodec/modeling_encodec.py +14 -0
  182. transformers/models/eomt/image_processing_eomt_fast.py +46 -14
  183. transformers/models/eomt/modeling_eomt.py +7 -0
  184. transformers/models/eomt/modular_eomt.py +7 -0
  185. transformers/models/ernie/modeling_ernie.py +6 -0
  186. transformers/models/ernie/modular_ernie.py +6 -0
  187. transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
  188. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
  189. transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
  190. transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
  191. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
  192. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
  193. transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
  194. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
  195. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
  196. transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
  197. transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
  198. transformers/models/esm/modeling_esm.py +6 -0
  199. transformers/models/esm/modeling_esmfold.py +6 -1
  200. transformers/models/evolla/modeling_evolla.py +9 -1
  201. transformers/models/evolla/modular_evolla.py +8 -0
  202. transformers/models/exaone4/modeling_exaone4.py +1 -1
  203. transformers/models/falcon/modeling_falcon.py +3 -3
  204. transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
  205. transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
  206. transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
  207. transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
  208. transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
  209. transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
  210. transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
  211. transformers/models/flaubert/modeling_flaubert.py +14 -15
  212. transformers/models/flava/image_processing_flava_fast.py +0 -2
  213. transformers/models/flava/modeling_flava.py +4 -1
  214. transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
  215. transformers/models/florence2/modeling_florence2.py +20 -3
  216. transformers/models/florence2/modular_florence2.py +13 -0
  217. transformers/models/fnet/modeling_fnet.py +7 -0
  218. transformers/models/fuyu/image_processing_fuyu.py +1 -1
  219. transformers/models/fuyu/modeling_fuyu.py +3 -1
  220. transformers/models/fuyu/processing_fuyu.py +16 -0
  221. transformers/models/gemma/modeling_gemma.py +10 -12
  222. transformers/models/gemma/modular_gemma.py +9 -11
  223. transformers/models/gemma2/modeling_gemma2.py +1 -1
  224. transformers/models/gemma2/modular_gemma2.py +1 -1
  225. transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
  226. transformers/models/gemma3/modeling_gemma3.py +28 -7
  227. transformers/models/gemma3/modular_gemma3.py +26 -6
  228. transformers/models/gemma3n/configuration_gemma3n.py +3 -0
  229. transformers/models/gemma3n/modeling_gemma3n.py +47 -9
  230. transformers/models/gemma3n/modular_gemma3n.py +51 -9
  231. transformers/models/git/modeling_git.py +181 -126
  232. transformers/models/glm/modeling_glm.py +1 -1
  233. transformers/models/glm4/modeling_glm4.py +1 -1
  234. transformers/models/glm46v/image_processing_glm46v.py +0 -4
  235. transformers/models/glm46v/modeling_glm46v.py +3 -1
  236. transformers/models/glm46v/modular_glm46v.py +3 -0
  237. transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
  238. transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
  239. transformers/models/glm4v/image_processing_glm4v.py +0 -4
  240. transformers/models/glm4v/modeling_glm4v.py +15 -5
  241. transformers/models/glm4v/modular_glm4v.py +11 -3
  242. transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
  243. transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
  244. transformers/models/glmasr/__init__.py +30 -0
  245. transformers/models/glmasr/configuration_glmasr.py +197 -0
  246. transformers/models/glmasr/modeling_glmasr.py +512 -0
  247. transformers/models/glmasr/modular_glmasr.py +433 -0
  248. transformers/models/glmasr/processing_glmasr.py +332 -0
  249. transformers/models/glpn/image_processing_glpn_fast.py +0 -1
  250. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
  251. transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
  252. transformers/models/gpt2/modeling_gpt2.py +8 -5
  253. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
  254. transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
  255. transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
  256. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
  257. transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
  258. transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
  259. transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
  260. transformers/models/gptj/modeling_gptj.py +15 -6
  261. transformers/models/granite/modeling_granite.py +1 -1
  262. transformers/models/granite_speech/modeling_granite_speech.py +15 -1
  263. transformers/models/granitemoe/modeling_granitemoe.py +2 -3
  264. transformers/models/granitemoe/modular_granitemoe.py +1 -2
  265. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
  266. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
  267. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
  268. transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
  269. transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
  270. transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
  271. transformers/models/groupvit/modeling_groupvit.py +6 -1
  272. transformers/models/helium/modeling_helium.py +1 -1
  273. transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
  274. transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
  275. transformers/models/hubert/modeling_hubert.py +4 -0
  276. transformers/models/hubert/modular_hubert.py +4 -0
  277. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
  278. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
  279. transformers/models/hunyuan_v1_moe/__init__.py +1 -1
  280. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
  281. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
  282. transformers/models/ibert/modeling_ibert.py +16 -0
  283. transformers/models/idefics/modeling_idefics.py +10 -0
  284. transformers/models/idefics2/modeling_idefics2.py +7 -1
  285. transformers/models/idefics3/modeling_idefics3.py +5 -1
  286. transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
  287. transformers/models/imagegpt/modeling_imagegpt.py +9 -2
  288. transformers/models/instructblip/modeling_instructblip.py +2 -0
  289. transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
  290. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
  291. transformers/models/internvl/modeling_internvl.py +11 -8
  292. transformers/models/internvl/modular_internvl.py +5 -9
  293. transformers/models/internvl/video_processing_internvl.py +0 -1
  294. transformers/models/jais2/__init__.py +27 -0
  295. transformers/models/jais2/configuration_jais2.py +152 -0
  296. transformers/models/jais2/modeling_jais2.py +486 -0
  297. transformers/models/jais2/modular_jais2.py +196 -0
  298. transformers/models/jamba/modeling_jamba.py +24 -19
  299. transformers/models/jamba/modular_jamba.py +17 -17
  300. transformers/models/janus/image_processing_janus_fast.py +0 -1
  301. transformers/models/janus/modeling_janus.py +15 -7
  302. transformers/models/janus/modular_janus.py +16 -7
  303. transformers/models/jetmoe/modeling_jetmoe.py +2 -2
  304. transformers/models/jetmoe/modular_jetmoe.py +1 -0
  305. transformers/models/kosmos2/modeling_kosmos2.py +14 -2
  306. transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
  307. transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
  308. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
  309. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
  310. transformers/models/lasr/configuration_lasr.py +4 -0
  311. transformers/models/lasr/modeling_lasr.py +3 -2
  312. transformers/models/lasr/modular_lasr.py +8 -1
  313. transformers/models/lasr/processing_lasr.py +0 -2
  314. transformers/models/layoutlm/modeling_layoutlm.py +5 -3
  315. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
  316. transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
  317. transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
  318. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
  319. transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
  320. transformers/models/led/modeling_led.py +6 -0
  321. transformers/models/levit/modeling_levit.py +18 -0
  322. transformers/models/lfm2/modeling_lfm2.py +1 -1
  323. transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
  324. transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
  325. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  326. transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
  327. transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
  328. transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
  329. transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
  330. transformers/models/lilt/modeling_lilt.py +19 -15
  331. transformers/models/llama/modeling_llama.py +1 -1
  332. transformers/models/llama4/image_processing_llama4_fast.py +1 -2
  333. transformers/models/llama4/modeling_llama4.py +8 -4
  334. transformers/models/llava/image_processing_llava_fast.py +0 -1
  335. transformers/models/llava/modeling_llava.py +12 -7
  336. transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
  337. transformers/models/llava_next/modeling_llava_next.py +7 -3
  338. transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
  339. transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
  340. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
  341. transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
  342. transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
  343. transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
  344. transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
  345. transformers/models/longt5/modeling_longt5.py +0 -4
  346. transformers/models/m2m_100/modeling_m2m_100.py +10 -0
  347. transformers/models/mamba/modeling_mamba.py +2 -1
  348. transformers/models/mamba2/modeling_mamba2.py +24 -23
  349. transformers/models/marian/configuration_marian.py +1 -1
  350. transformers/models/marian/modeling_marian.py +3 -0
  351. transformers/models/markuplm/modeling_markuplm.py +5 -8
  352. transformers/models/mask2former/configuration_mask2former.py +3 -3
  353. transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
  354. transformers/models/mask2former/modeling_mask2former.py +9 -0
  355. transformers/models/maskformer/configuration_maskformer.py +3 -3
  356. transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
  357. transformers/models/maskformer/modeling_maskformer.py +9 -1
  358. transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
  359. transformers/models/mbart/configuration_mbart.py +1 -0
  360. transformers/models/mbart/modeling_mbart.py +7 -0
  361. transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
  362. transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
  363. transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
  364. transformers/models/mimi/modeling_mimi.py +25 -4
  365. transformers/models/minimax/modeling_minimax.py +16 -3
  366. transformers/models/minimax/modular_minimax.py +12 -1
  367. transformers/models/ministral/modeling_ministral.py +1 -1
  368. transformers/models/ministral3/modeling_ministral3.py +1 -1
  369. transformers/models/mistral/modeling_mistral.py +1 -1
  370. transformers/models/mistral3/modeling_mistral3.py +10 -4
  371. transformers/models/mistral3/modular_mistral3.py +3 -1
  372. transformers/models/mixtral/modeling_mixtral.py +12 -4
  373. transformers/models/mixtral/modular_mixtral.py +6 -2
  374. transformers/models/mlcd/modeling_mlcd.py +6 -0
  375. transformers/models/mlcd/modular_mlcd.py +4 -0
  376. transformers/models/mllama/modeling_mllama.py +13 -2
  377. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
  378. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
  379. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
  380. transformers/models/mobilebert/modeling_mobilebert.py +2 -0
  381. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
  382. transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
  383. transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
  384. transformers/models/mobilevit/modeling_mobilevit.py +4 -0
  385. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
  386. transformers/models/modernbert/modeling_modernbert.py +12 -1
  387. transformers/models/modernbert/modular_modernbert.py +12 -1
  388. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
  389. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
  390. transformers/models/moonshine/modeling_moonshine.py +1 -1
  391. transformers/models/moshi/modeling_moshi.py +21 -51
  392. transformers/models/mpnet/modeling_mpnet.py +2 -0
  393. transformers/models/mra/modeling_mra.py +4 -1
  394. transformers/models/mt5/configuration_mt5.py +2 -3
  395. transformers/models/mt5/modeling_mt5.py +0 -10
  396. transformers/models/musicgen/modeling_musicgen.py +5 -9
  397. transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
  398. transformers/models/mvp/modeling_mvp.py +7 -0
  399. transformers/models/nanochat/modeling_nanochat.py +1 -1
  400. transformers/models/nemotron/modeling_nemotron.py +3 -3
  401. transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
  402. transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
  403. transformers/models/nougat/image_processing_nougat_fast.py +0 -1
  404. transformers/models/nougat/tokenization_nougat.py +11 -16
  405. transformers/models/nystromformer/modeling_nystromformer.py +7 -0
  406. transformers/models/olmo/modeling_olmo.py +1 -1
  407. transformers/models/olmo2/modeling_olmo2.py +1 -1
  408. transformers/models/olmo3/modeling_olmo3.py +1 -1
  409. transformers/models/olmoe/modeling_olmoe.py +12 -4
  410. transformers/models/olmoe/modular_olmoe.py +4 -2
  411. transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
  412. transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
  413. transformers/models/oneformer/configuration_oneformer.py +3 -3
  414. transformers/models/oneformer/modeling_oneformer.py +7 -38
  415. transformers/models/openai/modeling_openai.py +12 -0
  416. transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
  417. transformers/models/ovis2/modeling_ovis2.py +15 -3
  418. transformers/models/ovis2/modular_ovis2.py +8 -0
  419. transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
  420. transformers/models/owlv2/modeling_owlv2.py +7 -3
  421. transformers/models/owlv2/modular_owlv2.py +0 -2
  422. transformers/models/owlvit/modeling_owlvit.py +7 -3
  423. transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
  424. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
  425. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
  426. transformers/models/paligemma/modeling_paligemma.py +25 -17
  427. transformers/models/parakeet/modeling_parakeet.py +5 -0
  428. transformers/models/parakeet/modular_parakeet.py +5 -0
  429. transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
  430. transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
  431. transformers/models/patchtst/modeling_patchtst.py +5 -4
  432. transformers/models/pe_audio/__init__.py +30 -0
  433. transformers/models/pe_audio/configuration_pe_audio.py +206 -0
  434. transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
  435. transformers/models/pe_audio/modeling_pe_audio.py +820 -0
  436. transformers/models/pe_audio/modular_pe_audio.py +299 -0
  437. transformers/models/pe_audio/processing_pe_audio.py +24 -0
  438. transformers/models/pe_audio_video/__init__.py +29 -0
  439. transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
  440. transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
  441. transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
  442. transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
  443. transformers/models/pe_video/__init__.py +30 -0
  444. transformers/models/pe_video/configuration_pe_video.py +211 -0
  445. transformers/models/pe_video/modeling_pe_video.py +636 -0
  446. transformers/models/pe_video/modular_pe_video.py +219 -0
  447. transformers/models/pe_video/processing_pe_video.py +10 -0
  448. transformers/models/pe_video/video_processing_pe_video.py +66 -0
  449. transformers/models/pegasus/configuration_pegasus.py +1 -0
  450. transformers/models/pegasus/modeling_pegasus.py +3 -0
  451. transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
  452. transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
  453. transformers/models/perceiver/modeling_perceiver.py +5 -1
  454. transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
  455. transformers/models/perception_lm/modeling_perception_lm.py +7 -3
  456. transformers/models/perception_lm/modular_perception_lm.py +7 -3
  457. transformers/models/persimmon/modeling_persimmon.py +1 -1
  458. transformers/models/phi/modeling_phi.py +1 -1
  459. transformers/models/phi3/modeling_phi3.py +1 -1
  460. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
  461. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
  462. transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
  463. transformers/models/phimoe/modeling_phimoe.py +12 -4
  464. transformers/models/phimoe/modular_phimoe.py +1 -1
  465. transformers/models/pix2struct/processing_pix2struct.py +0 -4
  466. transformers/models/pixio/__init__.py +30 -0
  467. transformers/models/pixio/configuration_pixio.py +151 -0
  468. transformers/models/pixio/modeling_pixio.py +507 -0
  469. transformers/models/pixio/modular_pixio.py +404 -0
  470. transformers/models/pixtral/modeling_pixtral.py +1 -1
  471. transformers/models/pixtral/processing_pixtral.py +3 -1
  472. transformers/models/plbart/configuration_plbart.py +1 -0
  473. transformers/models/plbart/modeling_plbart.py +7 -0
  474. transformers/models/plbart/modular_plbart.py +6 -0
  475. transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
  476. transformers/models/poolformer/modeling_poolformer.py +11 -1
  477. transformers/models/pop2piano/configuration_pop2piano.py +0 -1
  478. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
  479. transformers/models/prophetnet/modeling_prophetnet.py +2 -1
  480. transformers/models/qwen2/modeling_qwen2.py +1 -1
  481. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
  482. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
  483. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
  484. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
  485. transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
  486. transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
  487. transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
  488. transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
  489. transformers/models/qwen3/modeling_qwen3.py +1 -1
  490. transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
  491. transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
  492. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
  493. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
  494. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
  495. transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
  496. transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
  497. transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
  498. transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
  499. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
  500. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
  501. transformers/models/rag/configuration_rag.py +0 -8
  502. transformers/models/rag/modeling_rag.py +7 -9
  503. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
  504. transformers/models/reformer/modeling_reformer.py +9 -1
  505. transformers/models/regnet/modeling_regnet.py +4 -0
  506. transformers/models/rembert/modeling_rembert.py +7 -1
  507. transformers/models/resnet/modeling_resnet.py +8 -3
  508. transformers/models/roberta/modeling_roberta.py +3 -0
  509. transformers/models/roberta/modular_roberta.py +3 -0
  510. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
  511. transformers/models/roc_bert/modeling_roc_bert.py +3 -0
  512. transformers/models/rt_detr/configuration_rt_detr.py +1 -1
  513. transformers/models/rt_detr/modeling_rt_detr.py +4 -0
  514. transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
  515. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
  516. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
  517. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
  518. transformers/models/rwkv/modeling_rwkv.py +1 -1
  519. transformers/models/sam/configuration_sam.py +1 -0
  520. transformers/models/sam/image_processing_sam_fast.py +0 -1
  521. transformers/models/sam/modeling_sam.py +4 -1
  522. transformers/models/sam2/configuration_sam2.py +1 -1
  523. transformers/models/sam2/modeling_sam2.py +5 -1
  524. transformers/models/sam2/modular_sam2.py +5 -1
  525. transformers/models/sam2_video/modeling_sam2_video.py +51 -43
  526. transformers/models/sam2_video/modular_sam2_video.py +31 -18
  527. transformers/models/sam3/configuration_sam3.py +21 -1
  528. transformers/models/sam3/modeling_sam3.py +23 -0
  529. transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
  530. transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
  531. transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
  532. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
  533. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
  534. transformers/models/sam3_video/configuration_sam3_video.py +14 -0
  535. transformers/models/sam3_video/modeling_sam3_video.py +3 -3
  536. transformers/models/sam3_video/processing_sam3_video.py +1 -1
  537. transformers/models/sam_hq/configuration_sam_hq.py +1 -0
  538. transformers/models/sam_hq/modeling_sam_hq.py +26 -23
  539. transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
  540. transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
  541. transformers/models/seed_oss/modeling_seed_oss.py +1 -1
  542. transformers/models/segformer/image_processing_segformer_fast.py +0 -1
  543. transformers/models/segformer/modeling_segformer.py +2 -2
  544. transformers/models/segformer/modular_segformer.py +0 -1
  545. transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
  546. transformers/models/siglip/modeling_siglip.py +24 -2
  547. transformers/models/siglip2/modeling_siglip2.py +63 -41
  548. transformers/models/smollm3/modeling_smollm3.py +1 -1
  549. transformers/models/smolvlm/modeling_smolvlm.py +5 -1
  550. transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
  551. transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
  552. transformers/models/speecht5/modeling_speecht5.py +28 -0
  553. transformers/models/splinter/modeling_splinter.py +9 -3
  554. transformers/models/squeezebert/modeling_squeezebert.py +2 -0
  555. transformers/models/stablelm/modeling_stablelm.py +1 -1
  556. transformers/models/starcoder2/modeling_starcoder2.py +1 -1
  557. transformers/models/superglue/image_processing_superglue_fast.py +1 -2
  558. transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
  559. transformers/models/swiftformer/modeling_swiftformer.py +4 -0
  560. transformers/models/swin/modeling_swin.py +16 -12
  561. transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
  562. transformers/models/swin2sr/modeling_swin2sr.py +49 -33
  563. transformers/models/swinv2/modeling_swinv2.py +41 -33
  564. transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
  565. transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
  566. transformers/models/t5/configuration_t5.py +7 -1
  567. transformers/models/t5/modeling_t5.py +1 -7
  568. transformers/models/t5gemma/modeling_t5gemma.py +1 -1
  569. transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
  570. transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
  571. transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
  572. transformers/models/table_transformer/configuration_table_transformer.py +1 -1
  573. transformers/models/table_transformer/modeling_table_transformer.py +1 -1
  574. transformers/models/textnet/image_processing_textnet_fast.py +0 -1
  575. transformers/models/timesfm/modeling_timesfm.py +12 -0
  576. transformers/models/timesfm/modular_timesfm.py +12 -0
  577. transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
  578. transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
  579. transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
  580. transformers/models/trocr/modeling_trocr.py +1 -2
  581. transformers/models/tvp/configuration_tvp.py +5 -1
  582. transformers/models/tvp/modeling_tvp.py +4 -4
  583. transformers/models/udop/configuration_udop.py +1 -0
  584. transformers/models/udop/modeling_udop.py +3 -7
  585. transformers/models/umt5/configuration_umt5.py +2 -2
  586. transformers/models/umt5/modeling_umt5.py +0 -6
  587. transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
  588. transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
  589. transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
  590. transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
  591. transformers/models/video_llava/modeling_video_llava.py +7 -3
  592. transformers/models/vilt/configuration_vilt.py +2 -2
  593. transformers/models/vilt/modeling_vilt.py +7 -0
  594. transformers/models/vipllava/modeling_vipllava.py +7 -3
  595. transformers/models/visual_bert/modeling_visual_bert.py +2 -0
  596. transformers/models/vitmatte/configuration_vitmatte.py +1 -1
  597. transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
  598. transformers/models/vitmatte/modeling_vitmatte.py +4 -0
  599. transformers/models/vitpose/configuration_vitpose.py +1 -1
  600. transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
  601. transformers/models/voxtral/modeling_voxtral.py +2 -2
  602. transformers/models/voxtral/modular_voxtral.py +2 -2
  603. transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
  604. transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
  605. transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
  606. transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
  607. transformers/models/whisper/generation_whisper.py +1 -0
  608. transformers/models/whisper/modeling_whisper.py +5 -3
  609. transformers/models/x_clip/modeling_x_clip.py +2 -0
  610. transformers/models/xcodec/modeling_xcodec.py +5 -0
  611. transformers/models/xglm/modeling_xglm.py +10 -0
  612. transformers/models/xlm/modeling_xlm.py +13 -14
  613. transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
  614. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
  615. transformers/models/xlnet/modeling_xlnet.py +3 -1
  616. transformers/models/xmod/modeling_xmod.py +3 -0
  617. transformers/models/yoso/modeling_yoso.py +4 -1
  618. transformers/models/zamba/modeling_zamba.py +2 -1
  619. transformers/models/zamba2/modeling_zamba2.py +3 -2
  620. transformers/models/zoedepth/configuration_zoedepth.py +1 -1
  621. transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
  622. transformers/models/zoedepth/modeling_zoedepth.py +7 -0
  623. transformers/pipelines/__init__.py +9 -6
  624. transformers/pipelines/automatic_speech_recognition.py +20 -12
  625. transformers/pipelines/base.py +1 -1
  626. transformers/pipelines/document_question_answering.py +1 -1
  627. transformers/pipelines/question_answering.py +1 -1
  628. transformers/pipelines/text_to_audio.py +2 -2
  629. transformers/processing_utils.py +127 -56
  630. transformers/quantizers/auto.py +2 -4
  631. transformers/quantizers/base.py +9 -64
  632. transformers/quantizers/quantizer_aqlm.py +1 -18
  633. transformers/quantizers/quantizer_auto_round.py +1 -10
  634. transformers/quantizers/quantizer_awq.py +3 -8
  635. transformers/quantizers/quantizer_bitnet.py +1 -6
  636. transformers/quantizers/quantizer_bnb_4bit.py +9 -49
  637. transformers/quantizers/quantizer_bnb_8bit.py +9 -19
  638. transformers/quantizers/quantizer_compressed_tensors.py +1 -4
  639. transformers/quantizers/quantizer_eetq.py +2 -12
  640. transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
  641. transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
  642. transformers/quantizers/quantizer_fp_quant.py +4 -4
  643. transformers/quantizers/quantizer_gptq.py +1 -4
  644. transformers/quantizers/quantizer_higgs.py +2 -6
  645. transformers/quantizers/quantizer_mxfp4.py +2 -28
  646. transformers/quantizers/quantizer_quanto.py +14 -14
  647. transformers/quantizers/quantizer_spqr.py +3 -8
  648. transformers/quantizers/quantizer_torchao.py +28 -124
  649. transformers/quantizers/quantizer_vptq.py +1 -10
  650. transformers/testing_utils.py +28 -12
  651. transformers/tokenization_mistral_common.py +3 -2
  652. transformers/tokenization_utils_base.py +3 -2
  653. transformers/tokenization_utils_tokenizers.py +25 -2
  654. transformers/trainer.py +24 -2
  655. transformers/trainer_callback.py +8 -0
  656. transformers/trainer_seq2seq.py +4 -0
  657. transformers/training_args.py +8 -10
  658. transformers/utils/__init__.py +4 -0
  659. transformers/utils/attention_visualizer.py +4 -4
  660. transformers/utils/auto_docstring.py +34 -25
  661. transformers/utils/generic.py +20 -0
  662. transformers/utils/import_utils.py +51 -9
  663. transformers/utils/kernel_config.py +71 -18
  664. transformers/utils/quantization_config.py +8 -8
  665. transformers/video_processing_utils.py +16 -12
  666. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
  667. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
  668. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
  669. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
  670. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
  671. {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
@@ -91,6 +91,7 @@ class EvollaSaProtRotaryEmbedding(nn.Module):
 
     def __init__(self, dim: int):
         super().__init__()
+        self.dim = dim
         # Generate and save the inverse frequency buffer (non trainable)
         inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
         self.register_buffer("inv_freq", inv_freq)
@@ -203,12 +204,19 @@ class EvollaSaProtPreTrainedModel(PreTrainedModel):
         ],
     }
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, EvollaSaProtRotaryEmbedding):
+            inv_freq = 1.0 / (10000 ** (torch.arange(0, module.dim, 2, dtype=torch.int64).float() / module.dim))
+            init.copy_(module.inv_freq, inv_freq)
+
 
 class EvollaSaProtProteinEncoder(EvollaSaProtPreTrainedModel):
     def __init__(self, config: SaProtConfig):
         super().__init__(config)
         self.embeddings = EvollaSaProtEmbeddings(config)
         self.encoder = EvollaSaProtEncoder(config)
+        self.post_init()
 
     def get_input_embeddings(self):
         return self.embeddings.word_embeddings
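Note: the two Evolla hunks above follow a pattern that recurs throughout this diff: anything a module builds in __init__ must also be reproducible from _init_weights, so the rotary module now records dim and the protein encoder calls post_init(). A minimal sketch of the idea, with illustrative names (ToyRotary, _make_inv_freq, reinit are not from the diff); transformers' own init.copy_ additionally handles buffers still on the meta device, which plain Tensor.copy_ does not:

    import torch
    from torch import nn

    class ToyRotary(nn.Module):
        def __init__(self, dim: int):
            super().__init__()
            self.dim = dim  # stored so the buffer below can be rebuilt later
            self.register_buffer("inv_freq", self._make_inv_freq(dim))

        @staticmethod
        def _make_inv_freq(dim: int) -> torch.Tensor:
            return 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))

        @torch.no_grad()
        def reinit(self):
            # stand-in for the _init_weights branch above: recompute from the
            # stored dim and copy into the existing buffer in place
            self.inv_freq.copy_(self._make_inv_freq(self.dim))

    rot = ToyRotary(dim=8)
    rot.reinit()  # idempotent: recomputes exactly the values it was built with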
@@ -86,7 +86,7 @@ class Exaone4RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
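Note: this one-line change (repeated verbatim for FalconRotaryEmbedding and FalconH1RotaryEmbedding below) turns original_inv_freq from a plain Python attribute into a registered buffer. Buffers move and cast with the module while plain attributes do not, and the .clone() gives the second buffer its own storage instead of aliasing inv_freq; persistent=False keeps both out of the state dict, so checkpoints are unchanged. A small self-contained check of both properties:

    import torch
    from torch import nn

    class Demo(nn.Module):
        def __init__(self):
            super().__init__()
            t = torch.arange(4.0)
            self.register_buffer("inv_freq", t, persistent=False)
            self.register_buffer("original_inv_freq", t.clone(), persistent=False)

    m = Demo()
    print(m.inv_freq.data_ptr() == m.original_inv_freq.data_ptr())  # False -- no aliasing
    m = m.to(torch.float64)
    print(m.original_inv_freq.dtype)  # torch.float64 -- buffers are cast with the module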
@@ -122,7 +122,7 @@ class FalconRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -521,8 +521,8 @@ class FalconFlashAttention2(FalconAttention):
                 else torch.get_autocast_gpu_dtype()
             )
             # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
+            elif hasattr(self.config, "quantization_config"):
+                target_dtype = self.config.dtype
             else:
                 target_dtype = self.query_key_value.weight.dtype
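Note: rc2 stops relying on the private _pre_quantization_dtype attribute; quantized models are now detected via the public quantization_config, and the compute dtype is read from config.dtype. A hedged sketch of the resulting selection order as a standalone function (the function name is illustrative; the branch logic mirrors the hunk above):

    import torch

    def pick_flash_attn_dtype(config, weight: torch.Tensor) -> torch.dtype:
        # autocast dtype wins; quantized weights may be int8/uint8, so fall
        # back to the config-level dtype; otherwise trust the weight itself
        if torch.is_autocast_enabled():
            return torch.get_autocast_gpu_dtype()
        if hasattr(config, "quantization_config"):
            return config.dtype
        return weight.dtype

The same hasattr(config, "quantization_config") test replaces the old check in the two FalconMamba mixer hunks further down.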
@@ -241,7 +241,7 @@ class FalconH1RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -1187,26 +1187,6 @@ class FalconH1DecoderLayer(GradientCheckpointingLayer):
         return outputs
 
 
-@auto_docstring
-class FalconH1PreTrainedModel(PreTrainedModel):
-    config: FalconH1Config
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["FalconH1DecoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn = True
-    _supports_sdpa = True
-    _is_stateful = True
-
-    @torch.no_grad()
-    def _init_weights(self, module):
-        super()._init_weights(module)
-        if isinstance(module, FalconH1Mixer):
-            init.ones_(module.dt_bias)
-            init.copy_(module.A_log, torch.log(torch.arange(1, module.num_heads + 1)))
-            init.ones_(module.D)
-
-
 def compute_mup_vector(config):
     """
     Computes the MuP vector based on model configuration.
@@ -1244,6 +1224,30 @@ def compute_mup_vector(config):
     return mup_vector
 
 
+@auto_docstring
+class FalconH1PreTrainedModel(PreTrainedModel):
+    config: FalconH1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["FalconH1DecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _is_stateful = True
+
+    @torch.no_grad()
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, FalconH1Mixer):
+            init.ones_(module.dt_bias)
+            init.copy_(module.A_log, torch.log(torch.arange(1, module.num_heads + 1)))
+            init.ones_(module.D)
+        elif isinstance(module, FalconH1Model):
+            mup_vector = compute_mup_vector(module.config)
+            for layer in module.layers:
+                init.copy_(layer.mamba.mup_vector, mup_vector)
+
+
 @auto_docstring
 # Adapted from transformers.models.jamba.modeling_jamba.JambaModel
 class FalconH1Model(FalconH1PreTrainedModel):
@@ -1269,7 +1273,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
         # Compute the MuP vector once and register it for all layers
         mup_vector = compute_mup_vector(config)
         for layer in self.layers:
-            layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False)
+            layer.mamba.register_buffer("mup_vector", mup_vector.clone(), persistent=False)
 
         # Initialize weights and apply final processing
         self.post_init()
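Note: two related FalconH1 changes land above. FalconH1PreTrainedModel moves below compute_mup_vector and gains an _init_weights branch that fills every layer's mup_vector, and the buffer registration now clones the shared tensor. Without the clone, all layers would alias one storage and a single in-place init.copy_ would write through to every layer. A self-contained demonstration of that hazard:

    import torch
    from torch import nn

    vec = torch.ones(3)
    a, b = nn.Module(), nn.Module()
    a.register_buffer("v", vec)  # the same tensor object on both modules
    b.register_buffer("v", vec)
    a.v.mul_(2)
    print(b.v)  # tensor([2., 2., 2.]) -- the in-place write leaked into b

    vec2 = torch.ones(3)
    c, d = nn.Module(), nn.Module()
    c.register_buffer("v", vec2.clone())  # clone() gives each module its own copy
    d.register_buffer("v", vec2.clone())
    c.v.mul_(2)
    print(d.v)  # tensor([1., 1., 1.])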
@@ -1591,6 +1595,7 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
@@ -1628,7 +1633,7 @@ class FalconH1ForCausalLM(FalconH1PreTrainedModel, GenerationMixin):
             position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and empty_past_kv:
+        if inputs_embeds is not None and is_first_iteration:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
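Note: prepare_inputs_for_generation previously inferred "first generation step" from an empty cache (empty_past_kv). That heuristic breaks when generate() resumes from a pre-filled cache such as a cached system prompt, so rc2 threads an explicit is_first_iteration flag through instead; the same change appears in the modular FalconH1 file, FalconMamba, and FastVlm below. A toy reproduction of the decision this hunk changes (names illustrative, not library code):

    def select_inputs(input_ids, inputs_embeds, cache_len, is_first_iteration):
        # old: `inputs_embeds is not None and cache_len == 0` wrongly picks
        # input_ids when step 0 starts from a non-empty, pre-filled cache
        # new: trust the explicit flag, cached prefix or not
        if inputs_embeds is not None and is_first_iteration:
            return {"inputs_embeds": inputs_embeds}
        return {"input_ids": input_ids}

    print(select_inputs("ids", "embeds", cache_len=128, is_first_iteration=True))
    # {'inputs_embeds': 'embeds'} -- the old empty-cache check would pick input_ids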
@@ -928,6 +928,10 @@ class FalconH1PreTrainedModel(PreTrainedModel):
             init.ones_(module.dt_bias)
             init.copy_(module.A_log, torch.log(torch.arange(1, module.num_heads + 1)))
             init.ones_(module.D)
+        elif isinstance(module, FalconH1Model):
+            mup_vector = compute_mup_vector(module.config)
+            for layer in module.layers:
+                init.copy_(layer.mamba.mup_vector, mup_vector)
 
 
 def compute_mup_vector(config):
@@ -992,7 +996,7 @@ class FalconH1Model(FalconH1PreTrainedModel):
         # Compute the MuP vector once and register it for all layers
         mup_vector = compute_mup_vector(config)
         for layer in self.layers:
-            layer.mamba.register_buffer("mup_vector", mup_vector, persistent=False)
+            layer.mamba.register_buffer("mup_vector", mup_vector.clone(), persistent=False)
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1298,6 +1302,7 @@ class FalconH1ForCausalLM(LlamaForCausalLM):
         cache_position=None,
         position_ids=None,
         use_cache=True,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- has a unique cache type, `FalconHybridMambaAttentionDynamicCache`
@@ -1335,7 +1340,7 @@ class FalconH1ForCausalLM(LlamaForCausalLM):
             position_ids = position_ids[:, -input_ids.shape[1] :]
 
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
-        if inputs_embeds is not None and empty_past_kv:
+        if inputs_embeds is not None and is_first_iteration:
             model_inputs = {"inputs_embeds": inputs_embeds}
         else:
             model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
@@ -31,7 +31,7 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...configuration_utils import PreTrainedConfig
 from ...generation import GenerationMixin
-from ...integrations.hub_kernels import lazy_load_kernel
+from ...integrations import lazy_load_kernel
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_utils import PreTrainedModel
 from ...utils import ModelOutput, auto_docstring, logging
@@ -345,7 +345,7 @@ class FalconMambaMixer(nn.Module):
 
         # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
         # at the price of a small overhead.
-        if hasattr(self.config, "_pre_quantization_dtype"):
+        if hasattr(self.config, "quantization_config"):
             discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
         else:
             discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
@@ -613,6 +613,9 @@ class FalconMambaPreTrainedModel(PreTrainedModel):
             init.ones_(module.weight)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight, std=std)
+        if isinstance(module, FalconMambaMixer):
+            init.ones_(module.b_c_rms)
+            init.ones_(module.dt_rms)
 
 
 @dataclass
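Note: the FalconMamba hunks extend the same initialization contract seen in the Evolla and FalconH1 changes: _init_weights must cover every tensor a module owns, because parameters missing from a checkpoint are materialized through it. Here the b_c_rms and dt_rms RMS-norm scales are set to ones, like other norm weights. A toy mirror of the structure, using torch.nn.init in place of transformers' initialization helpers:

    import torch
    from torch import nn

    class ToyMixer(nn.Module):
        def __init__(self, hidden: int):
            super().__init__()
            # RMS-norm style scales, analogous to b_c_rms / dt_rms in the hunk
            self.b_c_rms = nn.Parameter(torch.empty(hidden))
            self.dt_rms = nn.Parameter(torch.empty(hidden))

    def _init_weights(module):
        # any tensor skipped here would keep whatever bytes torch.empty left behind
        if isinstance(module, ToyMixer):
            nn.init.ones_(module.b_c_rms)
            nn.init.ones_(module.dt_rms)

    model = ToyMixer(4)
    model.apply(_init_weights)
    print(model.b_c_rms)  # tensor([1., 1., 1., 1.], requires_grad=True)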
@@ -811,6 +814,7 @@ class FalconMambaForCausalLM(FalconMambaPreTrainedModel, GenerationMixin):
         cache_params: Optional[FalconMambaCache] = None,
         cache_position: Optional[torch.LongTensor] = None,
         attention_mask: Optional[torch.LongTensor] = None,
+        is_first_iteration: Optional[bool] = False,
         **kwargs,
     ):
         # Overwritten -- uses `cache_params` as opposed to `past_key_values`
@@ -19,6 +19,7 @@ from typing import Optional
 import torch
 from torch import nn
 
+from ... import initialization as init
 from ...utils import auto_docstring, logging
 from ...utils.import_utils import is_mambapy_available, is_torchdynamo_compiling
 from ..mamba.configuration_mamba import MambaConfig
@@ -357,7 +358,7 @@ class FalconMambaMixer(MambaMixer):
 
         # In case the model has been quantized, we need a hack to properly call the `nn.Linear` module
         # at the price of a small overhead.
-        if hasattr(self.config, "_pre_quantization_dtype"):
+        if hasattr(self.config, "quantization_config"):
             discrete_time_step = (self.dt_proj(time_step) - self.dt_proj.bias).transpose(1, 2)
         else:
             discrete_time_step = self.dt_proj.weight @ time_step.transpose(1, 2)
@@ -529,7 +530,11 @@ class FalconMambaBlock(MambaBlock):
 
 @auto_docstring
 class FalconMambaPreTrainedModel(MambaPreTrainedModel):
-    pass
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, FalconMambaMixer):
+            init.ones_(module.b_c_rms)
+            init.ones_(module.dt_rms)
 
 
 class FalconMambaOutput(MambaOutput):
@@ -430,6 +430,7 @@ class FastVlmForConditionalGeneration(FastVlmPreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -441,12 +442,15 @@ class FastVlmForConditionalGeneration(FastVlmPreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
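Note: with the flag available, FastVlm gates pixel_values on the first iteration of the current generate() call rather than on cache_position[0] == 0, which mislabeled continue-from-cache prefills as decode steps. The new condition, extracted here as a runnable predicate for clarity:

    def should_pass_pixel_values(is_first_iteration: bool, kwargs: dict) -> bool:
        # images are forwarded on the first iteration (plain prefill or
        # continue-from-cache); with caching disabled nothing is ever merged
        # and stored, so they must be passed on every step
        return is_first_iteration or not kwargs.get("use_cache", True)

    print(should_pass_pixel_values(True, {"use_cache": True}))    # True  -- first step
    print(should_pass_pixel_values(False, {"use_cache": True}))   # False -- decode steps
    print(should_pass_pixel_values(False, {"use_cache": False}))  # True  -- no cache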
@@ -727,19 +727,20 @@ class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
         self.embed_dim = config.hidden_size
         self.input_scale = math.sqrt(self.embed_dim)
         self.dropout = nn.Dropout(p=module_config["positional_dropout_rate"])
-        self.pos_enc = None
         self.max_len = 5000
-        self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len))
+        self.register_buffer(
+            "pos_enc", self.extend_pos_enc(torch.tensor(0.0).expand(1, self.max_len)), persistent=False
+        )
 
-    def extend_pos_enc(self, x):
+    def extend_pos_enc(self, x, pos_enc=None):
         """Reset the positional encodings."""
-        if self.pos_enc is not None:
+        if pos_enc is not None:
             # self.pos_enc contains both positive and negative parts
             # the length of self.pos_enc is 2 * input_len - 1
-            if self.pos_enc.size(1) >= x.size(1) * 2 - 1:
-                if self.pos_enc.dtype != x.dtype or self.pos_enc.device != x.device:
-                    self.pos_enc = self.pos_enc.to(dtype=x.dtype, device=x.device)
-                return
+            if pos_enc.size(1) >= x.size(1) * 2 - 1:
+                if pos_enc.dtype != x.dtype or pos_enc.device != x.device:
+                    pos_enc = pos_enc.to(dtype=x.dtype, device=x.device)
+                return pos_enc
         # Suppose `i` means the position of the query vector and `j` means the
         # position of the key vector. We use positive relative positions when keys
         # are to the left (i>j) and negative relative positions otherwise (i<j).
@@ -760,7 +761,7 @@ class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
         pos_enc_positive = torch.flip(pos_enc_positive, [0]).unsqueeze(0)
         pos_enc_negative = pos_enc_negative[1:].unsqueeze(0)
         pos_enc = torch.cat([pos_enc_positive, pos_enc_negative], dim=1)
-        self.pos_enc = pos_enc.to(device=x.device, dtype=x.dtype)
+        return pos_enc.to(device=x.device, dtype=x.dtype)
 
     def forward(self, feature_representation):
         """
@@ -771,7 +772,7 @@ class FastSpeech2ConformerRelPositionalEncoding(nn.Module):
         Returns:
             `torch.Tensor`: Encoded tensor (batch_size, time, `*`).
         """
-        self.extend_pos_enc(feature_representation)
+        self.pos_enc = self.extend_pos_enc(feature_representation, self.pos_enc)
         hidden_states = feature_representation * self.input_scale
         center_idx = self.pos_enc.size(1) // 2
         pos_emb = self.pos_enc[:, center_idx - hidden_states.size(1) + 1 : center_idx + hidden_states.size(1)]
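Making `pos_enc` a registered, non-persistent buffer means it follows the module across devices and dtypes (and can be re-filled by `_init_weights`), while `extend_pos_enc` becomes a pure function returning the possibly regrown table. A minimal sketch of the same pattern, with placeholder table contents:

```python
import torch
from torch import nn

class RelPosEnc(nn.Module):
    def __init__(self, max_len=16):
        super().__init__()
        # Non-persistent: moves with .to(), is not written to checkpoints
        self.register_buffer("table", self._build(max_len), persistent=False)

    @staticmethod
    def _build(length):
        # Placeholder content; the real module builds sinusoidal encodings
        return torch.arange(2 * length - 1, dtype=torch.float32).unsqueeze(0)

    def forward(self, x):
        needed = 2 * x.size(1) - 1
        if self.table.size(1) < needed:  # regrow only when the input is longer
            self.table = self._build(x.size(1)).to(x.device, x.dtype)
        return self.table[:, :needed]
```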
@@ -1010,6 +1011,10 @@ class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)
         elif isinstance(module, nn.Embedding):
             init.normal_(module.weight)
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it loses the flag
@@ -1018,6 +1023,8 @@ class FastSpeech2ConformerPreTrainedModel(PreTrainedModel):
         elif isinstance(module, FastSpeech2ConformerAttention):
             init.xavier_uniform_(module.pos_bias_u)
             init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, FastSpeech2ConformerRelPositionalEncoding):
+            init.copy_(module.pos_enc, module.extend_pos_enc(torch.tensor(0.0).expand(1, module.max_len)))
 
     def _set_gradient_checkpointing(self, module, value=False):
         if isinstance(module, FastSpeech2ConformerEncoder):
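The new `running_mean` branch is needed because BatchNorm running statistics are buffers, not parameters, so generic weight initialization does not touch them (and `nn.LayerNorm` has no such buffers, hence the `getattr` guard). The equivalent plain-PyTorch behavior for a single `nn.BatchNorm1d`:

```python
import torch
from torch import nn

bn = nn.BatchNorm1d(8)
nn.init.zeros_(bn.bias)
nn.init.ones_(bn.weight)
if bn.running_mean is not None:   # None when track_running_stats=False
    bn.running_mean.zero_()
    bn.running_var.fill_(1.0)
    bn.num_batches_tracked.zero_()
```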
@@ -1410,6 +1417,12 @@ class FastSpeech2ConformerHifiGan(PreTrainedModel):
         # Initialize weights and apply final processing
         self.post_init()
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, FastSpeech2ConformerHifiGan):
+            init.zeros_(module.mean)
+            init.ones_(module.scale)
+
     def apply_weight_norm(self):
         weight_norm = nn.utils.weight_norm
         if hasattr(nn.utils.parametrizations, "weight_norm"):
@@ -79,6 +79,7 @@ class FastSpeech2ConformerTokenizer(PreTrainedTokenizer):
             unk_token=unk_token,
             pad_token=pad_token,
             should_strip_spaces=should_strip_spaces,
+            special_tokens_pattern="none",
             **kwargs,
         )
 
@@ -660,9 +660,6 @@ class FlaubertPreTrainedModel(PreTrainedModel):
     config: FlaubertConfig
     base_model_prefix = "transformer"
 
-    def __init__(self, *inputs, **kwargs):
-        super().__init__(*inputs, **kwargs)
-
     @property
     def dummy_inputs(self):
         inputs_list = torch.tensor([[7, 6, 0, 0, 1], [1, 2, 3, 0, 0], [0, 0, 0, 4, 5]])
@@ -690,15 +687,17 @@ class FlaubertPreTrainedModel(PreTrainedModel):
         if isinstance(module, nn.LayerNorm):
             init.zeros_(module.bias)
             init.ones_(module.weight)
-        if isinstance(module, FlaubertModel) and self.config.sinusoidal_embeddings:
-            init.copy_(
-                module.position_embeddings.weight,
-                create_sinusoidal_embeddings(
-                    self.config.max_position_embeddings,
-                    self.config.emb_dim,
-                    out=torch.empty_like(module.position_embeddings.weight),
-                ),
-            )
+        if isinstance(module, FlaubertModel):
+            if self.config.sinusoidal_embeddings:
+                init.copy_(
+                    module.position_embeddings.weight,
+                    create_sinusoidal_embeddings(
+                        self.config.max_position_embeddings,
+                        self.config.emb_dim,
+                        out=torch.empty_like(module.position_embeddings.weight),
+                    ),
+                )
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
 
 
 @auto_docstring
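For reference, a hedged sketch of what a `create_sinusoidal_embeddings`-style helper computes: the standard fixed sin/cos position table (the library's exact helper has a different signature; an even `dim` is assumed here):

```python
import torch

def sinusoidal_table(n_pos, dim):
    # table[p, 2i] = sin(p / 10000^(2i/dim)), table[p, 2i+1] = cos(p / 10000^(2i/dim))
    position = torch.arange(n_pos, dtype=torch.float32).unsqueeze(1)
    div_term = torch.pow(10000.0, torch.arange(0, dim, 2, dtype=torch.float32) / dim)
    table = torch.empty(n_pos, dim)
    table[:, 0::2] = torch.sin(position / div_term)
    table[:, 1::2] = torch.cos(position / div_term)
    return table
```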
@@ -760,15 +759,15 @@ class FlaubertModel(FlaubertPreTrainedModel):
             self.ffns.append(TransformerFFN(self.dim, self.hidden_dim, self.dim, config=config))
             self.layer_norm2.append(nn.LayerNorm(self.dim, eps=config.layer_norm_eps))
 
-        # Initialize weights and apply final processing
-        self.post_init()
-
         self.layerdrop = getattr(config, "layerdrop", 0.0)
         self.pre_norm = getattr(config, "pre_norm", False)
         self.register_buffer(
             "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
         )
 
+        # Initialize weights and apply final processing
+        self.post_init()
+
     # Copied from transformers.models.xlm.modeling_xlm.XLMModel.get_input_embeddings
     def get_input_embeddings(self):
         return self.embeddings
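Moving `post_init()` below the `register_buffer` call matters because the init pass walks the module and fills parameters and buffers; anything registered after it runs would be missed (notably under meta-device loading, where buffers start out empty). A toy illustration of the ordering, with a local `_init_all` standing in for `PreTrainedModel.post_init()`:

```python
import torch
from torch import nn

class TinyModel(nn.Module):
    def __init__(self, max_positions=512):
        super().__init__()
        self.embeddings = nn.Embedding(max_positions, 32)
        # Buffers must be registered BEFORE the init pass...
        self.register_buffer(
            "position_ids", torch.arange(max_positions).expand((1, -1)), persistent=False
        )
        self._init_all()  # ...so the init pass sees and fills them

    def _init_all(self):
        nn.init.normal_(self.embeddings.weight, std=0.02)
        self.position_ids.copy_(torch.arange(self.position_ids.shape[-1]).expand((1, -1)))
```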
@@ -306,7 +306,6 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
 
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
 
         return processed_images
 
@@ -397,7 +396,6 @@ class FlavaImageProcessorFast(BaseImageProcessorFast):
                 mask_group_max_aspect_ratio=mask_group_max_aspect_ratio,
             )
             masks = [mask_generator() for _ in range(len(images))]
-            masks = torch.stack(masks, dim=0) if return_tensors else masks
             data["bool_masked_pos"] = masks
 
         return BatchFeature(data=data, tensor_type=return_tensors)
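Both removed `torch.stack` calls were redundant: constructing `BatchFeature` with a `tensor_type` already runs the tensor conversion, which is what dropping the explicit stacking relies on. A hedged usage sketch:

```python
# Assumes BatchFeature's "pt" conversion handles lists of same-shaped tensors,
# which is the behavior the removal of the explicit torch.stack depends on.
import torch
from transformers import BatchFeature

masks = [torch.zeros(4, 4), torch.ones(4, 4)]
features = BatchFeature(data={"bool_masked_pos": masks}, tensor_type="pt")
# features["bool_masked_pos"] is expected to come back as a single (2, 4, 4) tensor
```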
@@ -677,6 +677,9 @@ class FlavaPreTrainedModel(PreTrainedModel):
             init.zeros_(module.position_embeddings)
             if module.mask_token is not None:
                 init.zeros_(module.mask_token)
+        elif isinstance(module, FlavaTextEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
         elif isinstance(module, FlavaMultimodalModel):
             if module.use_cls_token:
                 init.zeros_(module.cls_token)
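This `position_ids`/`token_type_ids` branch recurs across several models in this release (Flaubert above, FNet below, Flava here). These are non-persistent buffers, so they are absent from checkpoints and, under low-memory loading, are presumably materialized empty; `_init_weights` rebuilds their deterministic contents. What the two `init` calls compute, in plain PyTorch:

```python
import torch

max_len = 512
position_ids = torch.empty(1, max_len, dtype=torch.long)    # stand-in for the buffer
position_ids.copy_(torch.arange(max_len).expand((1, -1)))   # 0, 1, 2, ... per row

token_type_ids = torch.empty(1, max_len, dtype=torch.long)  # stand-in
token_type_ids.zero_()                                      # segment ids default to 0
```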
@@ -1107,7 +1110,7 @@ class FlavaModel(FlavaPreTrainedModel):
         output_hidden_states: bool = True,
         return_dict: Optional[bool] = None,
         **kwargs,
-    ) -> Union[tuple, FlavaOutput]:
+    ) -> Union[tuple, FlavaModelOutput]:
         r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, image_num_patches + text_seq_len)`):
             Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
@@ -30,14 +30,14 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_flex_olmo import FlexOlmoConfig
 
@@ -80,7 +80,7 @@ class FlexOlmoRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
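Registering `original_inv_freq` as its own cloned, non-persistent buffer does two things: the pristine copy cannot be clobbered when dynamic RoPE scaling mutates `inv_freq`, and as a buffer it follows the module across devices. A small sketch of the failure the clone prevents:

```python
import torch
from torch import nn

class Rope(nn.Module):
    def __init__(self, dim=8):
        super().__init__()
        inv_freq = 1.0 / (10000.0 ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        # clone() breaks the aliasing; a plain attribute would also be skipped by .to()
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    def rescale(self, factor):
        self.inv_freq.div_(factor)  # dynamic scaling mutates inv_freq in place

    def reset(self):
        self.inv_freq.copy_(self.original_inv_freq)  # still intact thanks to the clone
```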
@@ -293,6 +293,7 @@ class FlexOlmoAttention(nn.Module):
         return attn_output, attn_weights
 
 
+@use_experts_implementation
 class FlexOlmoExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
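`use_experts_implementation` is imported from `transformers.integrations` above; its internals are not shown in this diff. As a rough mental model only, such a class decorator can wrap `forward` and dispatch to a selected experts kernel. Everything below is hypothetical, including the `_experts_impl` selector attribute:

```python
import torch
from torch import nn

def use_experts_implementation(cls):
    eager_forward = cls.forward  # keep the original implementation

    def forward(self, *args, **kwargs):
        impl = getattr(self, "_experts_impl", "eager")  # hypothetical selector
        if impl == "eager":
            return eager_forward(self, *args, **kwargs)
        raise NotImplementedError(f"unknown experts implementation: {impl}")

    cls.forward = forward
    return cls

@use_experts_implementation
class ToyExperts(nn.Module):
    def forward(self, x):
        return x
```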
@@ -421,7 +422,9 @@ class FlexOlmoPreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph = False  # MoE models don't work with torch.compile (`torch.where(condition)` not supported)
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(nn.Linear, layer_name="mlp.gate", index=0),
@@ -26,6 +26,7 @@ from typing import Any, Optional, Union
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...generation import GenerationMixin
@@ -629,6 +630,18 @@ class Florence2PreTrainedModel(PreTrainedModel):
     _supports_attention_backend = False
     config_class = Florence2Config
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, Florence2VisionPositionalEmbeddingCosine1D):
+            pos_idx_to_embed = torch.empty((module.max_seq_len, module.embed_dim))
+            sine, cosine = module.get_sinusoid_embeddings(
+                max_positions=module.max_seq_len,
+                embed_dim=module.embed_dim,
+            )
+            pos_idx_to_embed[:, 0::2] = sine
+            pos_idx_to_embed[:, 1::2] = cosine
+            init.copy_(module.pos_idx_to_embed, pos_idx_to_embed)
+
 
 @auto_docstring(
     custom_intro="""
@@ -937,6 +950,7 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -948,12 +962,15 @@ class Florence2ForConditionalGeneration(Florence2PreTrainedModel, GenerationMixin):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] == 0:
-            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
-            # Otherwise we need pixel values to be passed to model
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsequent iterations, they are already merged with text and cached
+            # NOTE: the first iteration doesn't have to be prefill; it can be the first
+            # iteration with a question and a cached system prompt (continuing generation from cache)
             model_inputs["pixel_values"] = pixel_values
 
         return model_inputs
@@ -22,6 +22,7 @@ import numpy as np
 import torch.nn as nn
 import torch.nn.functional as F
 
+from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache
 from ...configuration_utils import PreTrainedConfig
@@ -1500,6 +1501,18 @@ class Florence2PreTrainedModel(LlavaPreTrainedModel):
 
     _supports_attention_backend = False
 
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, Florence2VisionPositionalEmbeddingCosine1D):
+            pos_idx_to_embed = torch.empty((module.max_seq_len, module.embed_dim))
+            sine, cosine = module.get_sinusoid_embeddings(
+                max_positions=module.max_seq_len,
+                embed_dim=module.embed_dim,
+            )
+            pos_idx_to_embed[:, 0::2] = sine
+            pos_idx_to_embed[:, 1::2] = cosine
+            init.copy_(module.pos_idx_to_embed, pos_idx_to_embed)
+
 
 @auto_docstring(
     custom_intro="""
@@ -23,6 +23,7 @@ import torch
 from torch import nn
 from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
 
+from ... import initialization as init
 from ...utils import auto_docstring, is_scipy_available
 
 
@@ -374,6 +375,12 @@ class FNetPreTrainedModel(PreTrainedModel):
     base_model_prefix = "fnet"
     supports_gradient_checkpointing = True
 
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, FNetEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
+
 
 @dataclass
 @auto_docstring(
@@ -94,7 +94,7 @@ class FuyuBatchFeature(BatchFeature):
     The outputs dictionary from the processors contains a mix of tensors and lists of tensors.
     """
 
-    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None):
+    def convert_to_tensors(self, tensor_type: Optional[Union[str, TensorType]] = None, **kwargs):
         """
         Convert the inner content to tensors.
 
@@ -359,6 +359,7 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
         image_patches=None,
         image_patches_indices=None,
         cache_position=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -371,10 +372,11 @@ class FuyuForCausalLM(FuyuPreTrainedModel, GenerationMixin):
             image_patches=image_patches,
             image_patches_indices=image_patches_indices,
             cache_position=cache_position,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if cache_position[0] != 0:
+        if not is_first_iteration and kwargs.get("use_cache", True):
             # set image_patches and image_patches_indices to `None` for decoding stage
             model_inputs["image_patches_indices"] = None
             model_inputs["image_patches"] = None