transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in that public registry.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0

transformers/models/pe_video/modular_pe_video.py
@@ -0,0 +1,219 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from typing import Any, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_outputs import BaseModelOutputWithPooling, MaskedLMOutput
+from ...utils import ModelOutput, auto_docstring, can_return_tuple
+from ...utils.generic import check_model_inputs
+from ..auto import AutoModel, AutoModelForImageClassification
+from ..pe_audio_video.modeling_pe_audio_video import (
+    PeAudioVideoContrastiveHead,
+    PeAudioVideoEncoder,
+    PeAudioVideoEncoderPatchEmbedder,
+    PeAudioVideoPreTrainedModel,
+)
+from .configuration_pe_video import PeVideoConfig, PeVideoEncoderConfig
+
+
+# TODO: not sure about the typing for text_model_output
+@dataclass
+# @auto_docstring
+class PeVideoOutput(ModelOutput):
+    loss: Optional[torch.FloatTensor] = None
+    logits_video_text: Optional[torch.FloatTensor] = None
+    text_video_embeds: Optional[torch.FloatTensor] = None
+    video_embeds: Optional[torch.FloatTensor] = None
+    text_outputs: BaseModelOutputWithPooling = None
+    video_outputs: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_outputs", "video_outputs"] else getattr(self, k).to_tuple() for k in self.keys()
+        )
+
+
+class PeVideoContrastiveHead(PeAudioVideoContrastiveHead): ...
+
+
+class PeVideoEncoderPatchEmbedder(PeAudioVideoEncoderPatchEmbedder): ...
+
+
+class PeVideoEncoderEmbedder(nn.Module):
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__()
+        self.vision_model = AutoModelForImageClassification.from_config(config.vision_config)
+        self.proj = nn.Linear(config.vision_config.num_labels, config.hidden_size, bias=False)
+        self.data_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = pixel_values_videos.shape
+
+        pixel_values_videos = pixel_values_videos.view(-1, *input_shape[2:])
+        vision_encoder_outputs = self.vision_model(pixel_values_videos)
+
+        logits = vision_encoder_outputs.logits.view(*input_shape[:2], -1)
+        logits = F.normalize(logits, dim=-1)
+
+        vision_features = self.proj(logits)
+        inputs_embeds = self.data_proj(vision_features)
+
+        return inputs_embeds, padding_mask
+
+
+class PeVideoPreTrainedModel(PeAudioVideoPreTrainedModel):
+    base_model_prefix = "video_model"
+    main_input_name = "pixel_values_videos"
+
+
+@auto_docstring(
+    custom_intro="""
+    The PeVideo Encoder model.
+    """
+)
+class PeVideoEncoder(PeAudioVideoEncoder):
+    base_model_prefix = "video_model.video_encoder"
+    main_input_name = "pixel_values_videos"
+
+    def __init__(self, config: PeVideoEncoderConfig):
+        super().__init__(config)
+        self.embedder = PeVideoEncoderEmbedder(config)
+
+    @can_return_tuple
+    @check_model_inputs
+    def forward(
+        self,
+        pixel_values_videos: torch.Tensor,
+        padding_mask_videos: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        inputs_embeds, padding_mask = self.embedder(pixel_values_videos, padding_mask=padding_mask_videos)
+        inputs_embeds, attention_mask = self.patch_embedder(inputs_embeds, padding_mask=padding_mask)
+
+        if attention_mask is not None:
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        position_ids = torch.arange(inputs_embeds.shape[1], device=inputs_embeds.device).unsqueeze(0)
+        position_embeddings = self.rotary_emb(inputs_embeds, position_ids)
+
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        hidden_states = self.output(hidden_states)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=hidden_states[:, 1:],
+            pooler_output=hidden_states[:, 0],
+        )
+
+
+class PeVideoModel(PeVideoPreTrainedModel):
+    main_input_name = "input_ids"
+
+    def __init__(self, config: PeVideoConfig):
+        super().__init__(config)
+        self.text_model = AutoModel.from_config(config.text_config)
+        self.video_encoder = PeVideoEncoder(config.video_config)
+
+        self.text_video_head = PeVideoContrastiveHead(config.text_config.hidden_size, config.text_config.hidden_size)
+        self.video_head = PeVideoContrastiveHead(config.video_config.hidden_size, config.text_config.hidden_size)
+
+        self.text_video_logit_scale = nn.Parameter(torch.zeros(1))
+        self.text_video_logit_bias = nn.Parameter(torch.zeros(1))
+
+        self.post_init()
+
+    def get_text_features(self, input_ids, attention_mask=None):
+        # TODO: should it be named feature or embeds
+        text_outputs: MaskedLMOutput = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=True,
+        )
+
+        text_features = text_outputs.last_hidden_state
+        text_features = self.text_video_head(text_features)
+        return text_features
+
+    def get_video_features(self, pixel_values_videos, padding_mask_videos=None):
+        # TODO: should it be named feature or embeds
+        video_outputs: BaseModelOutputWithPooling = self.video_encoder(
+            pixel_values_videos=pixel_values_videos,
+            padding_mask_videos=padding_mask_videos,
+            return_dict=True,
+        )
+        video_features = self.video_head(video_outputs.pooler_output)
+        return video_features
+
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        pixel_values_videos: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask_videos: Optional[torch.Tensor] = None,
+        return_loss: Optional[bool] = None,
+        **kwargs,
+    ) -> PeVideoOutput:
+        video_outputs: BaseModelOutputWithPooling = self.video_encoder(
+            pixel_values_videos=pixel_values_videos, padding_mask_videos=padding_mask_videos, **kwargs
+        )
+        kwargs["output_hidden_states"] = True
+        text_outputs: MaskedLMOutput = self.text_model(input_ids=input_ids, attention_mask=attention_mask, **kwargs)
+
+        video_embeds = video_outputs.pooler_output
+        video_embeds = self.video_head(video_embeds)
+
+        text_video_embeds = text_outputs.hidden_states[-1][:, 0]
+        text_video_embeds = self.text_video_head(text_video_embeds)
+
+        logits_video_text = video_embeds @ text_video_embeds.T
+        logits_video_text = logits_video_text * self.text_video_logit_scale + self.text_video_logit_bias
+
+        loss = None
+        if return_loss:
+            labels = torch.eye(logits_video_text.shape[0], device=logits_video_text.device)
+            loss = -F.logsigmoid(labels * logits_video_text).sum() / logits_video_text.shape[0]
+
+        return PeVideoOutput(
+            logits_video_text=logits_video_text,
+            text_video_embeds=text_video_embeds,
+            video_embeds=video_embeds,
+            text_outputs=text_outputs,
+            video_outputs=video_outputs,
+            loss=loss,
+        )
+
+
+__all__ = [
+    "PeVideoEncoder",
+    "PeVideoModel",
+]
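
A minimal, self-contained sketch (not part of the diff; names and values illustrative) of the sigmoid-style video-text contrastive scoring that PeVideoModel.forward implements above, with random tensors standing in for the text and video head outputs:

# Sketch only: reproduces the logit/loss computation from PeVideoModel.forward.
import torch
import torch.nn.functional as F

batch, dim = 4, 64
video_embeds = torch.randn(batch, dim)       # stand-in for self.video_head(pooler_output)
text_video_embeds = torch.randn(batch, dim)  # stand-in for self.text_video_head(CLS state)
logit_scale = torch.zeros(1)                 # learned scalar, zero-initialized in __init__
logit_bias = torch.zeros(1)                  # learned scalar, zero-initialized in __init__

logits_video_text = video_embeds @ text_video_embeds.T
logits_video_text = logits_video_text * logit_scale + logit_bias

# as in the return_loss branch: matching video/text pairs lie on the diagonal
labels = torch.eye(batch)
loss = -F.logsigmoid(labels * logits_video_text).sum() / batch
print(logits_video_text.shape, loss.item())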

transformers/models/pe_video/processing_pe_video.py
@@ -0,0 +1,10 @@
+from ...processing_utils import ProcessorMixin
+
+
+class PeVideoProcessor(ProcessorMixin):
+    attributes = ["video_processor", "tokenizer"]
+    video_processor_class = "PeVideoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+
+__all__ = ["PeVideoProcessor"]

transformers/models/pe_video/video_processing_pe_video.py
@@ -0,0 +1,66 @@
+# coding=utf-8
+# Copyright 2025 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional, Union
+
+import torch
+
+from ...image_processing_utils import BatchFeature
+from ...image_utils import PILImageResampling
+from ...processing_utils import Unpack, VideosKwargs
+from ...video_processing_utils import BaseVideoProcessor, VideoMetadata
+from ...video_utils import VideoInput
+
+
+class PeVideoVideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BILINEAR
+
+    def sample_frames(
+        self,
+        metadata: VideoMetadata,
+        num_frames: Optional[int] = None,
+        fps: Optional[Union[int, float]] = None,
+        **kwargs,
+    ):
+        if num_frames:
+            total_frames = metadata.total_num_frames
+            num_frames = num_frames if num_frames is not None else self.num_frames
+            assert num_frames is not None, "`num_frames` must be specified if `fixed_len_video == True`"
+            frame_idxs = [int(i * (total_frames - 1) / (num_frames - 1)) for i in range(num_frames)]
+            return torch.tensor(frame_idxs)
+        else:
+            return super().sample_frames(metadata, num_frames, fps, **kwargs)
+
+    def _preprocess(
+        self,
+        videos: VideoInput,
+        **kwargs: Unpack[VideosKwargs],
+    ) -> BatchFeature:
+        # Always set `return_tensors` to `None` since it won't pad variable length videos
+        # We'll handle this after we call the parent' method
+        return_tensors = kwargs.pop("return_tensors", None)
+        result = super()._preprocess(videos, **kwargs)
+        pixels = result.pixel_values_videos
+        data = {"pixel_values_videos": pixels}
+        if return_tensors:
+            lengths = torch.tensor([video.size(0) for video in pixels])
+            pixels = torch.nn.utils.rnn.pad_sequence(pixels, batch_first=True, padding_value=0.0)
+            data["pixel_values_videos"] = pixels
+            if lengths.unique().size(0) > 1:
+                mask = torch.arange(lengths.max())[None] < lengths[:, None]
+                data["padding_mask_videos"] = mask
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["PeVideoVideoProcessor"]
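
A short sketch (not part of the diff) of the padding behaviour in PeVideoVideoProcessor._preprocess above: variable-length videos are padded to the longest clip and a boolean padding_mask_videos is emitted only when clip lengths actually differ.

# Sketch only: the pad_sequence + mask logic used by _preprocess.
import torch

videos = [torch.randn(3, 3, 224, 224), torch.randn(5, 3, 224, 224)]  # 3-frame and 5-frame clips
lengths = torch.tensor([v.size(0) for v in videos])

pixels = torch.nn.utils.rnn.pad_sequence(videos, batch_first=True, padding_value=0.0)
mask = torch.arange(lengths.max())[None] < lengths[:, None]

print(pixels.shape)  # torch.Size([2, 5, 3, 224, 224])
print(mask)          # [[True, True, True, False, False], [True, True, True, True, True]]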

transformers/models/pegasus/configuration_pegasus.py
@@ -143,6 +143,7 @@ class PegasusConfig(PreTrainedConfig):
         self.use_cache = use_cache
         self.num_hidden_layers = encoder_layers
         self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+
         super().__init__(
             pad_token_id=pad_token_id,
             eos_token_id=eos_token_id,

transformers/models/pegasus/modeling_pegasus.py
@@ -443,6 +443,8 @@ class PegasusPreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, PegasusSinusoidalPositionalEmbedding):
             init.copy_(module.weight, module.create_weight())
+        elif isinstance(module, PegasusForConditionalGeneration):
+            init.zeros_(module.final_logits_bias)


 class PegasusEncoder(PegasusPreTrainedModel):
@@ -1220,6 +1222,7 @@ class PegasusDecoderWrapper(PegasusPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PegasusDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)

transformers/models/pegasus_x/modeling_pegasus_x.py
@@ -1476,6 +1476,7 @@ class PegasusXDecoderWrapper(PegasusXPreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.decoder = PegasusXDecoder(config)
+        self.post_init()

     def forward(self, *args, **kwargs):
         return self.decoder(*args, **kwargs)

transformers/models/perceiver/image_processing_perceiver_fast.py
@@ -113,7 +113,6 @@ class PerceiverImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images

         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images

         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)


transformers/models/perceiver/modeling_perceiver.py
@@ -551,9 +551,13 @@ class PerceiverPreTrainedModel(PreTrainedModel):
             # Here we need the check explicitly, as we slice the weight in the `zeros_` call, so it looses the flag
             if module.padding_idx is not None and not getattr(module.weight, "_is_hf_initialized", False):
                 init.zeros_(module.weight[module.padding_idx])
-        elif isinstance(module, nn.LayerNorm):
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm2d)):
             init.zeros_(module.bias)
             init.ones_(module.weight)
+            if getattr(module, "running_mean", None) is not None:
+                init.zeros_(module.running_mean)
+                init.ones_(module.running_var)
+                init.zeros_(module.num_batches_tracked)


 @auto_docstring(
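
For clarity, an equivalent standalone sketch (not part of the diff) of what the extended branch does for a BatchNorm2d module: reset the affine parameters and, when tracked, the running statistics.

# Sketch only: mirrors the init.zeros_/init.ones_ calls added above.
import torch
import torch.nn as nn

bn = nn.BatchNorm2d(8)
with torch.no_grad():
    bn.bias.zero_()
    bn.weight.fill_(1.0)
    if bn.running_mean is not None:  # only present when track_running_stats=True
        bn.running_mean.zero_()
        bn.running_var.fill_(1.0)
        bn.num_batches_tracked.zero_()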
@@ -307,7 +307,6 @@ class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
             processed_images_grouped[shape] = stacked_images
         processed_images = reorder_images(processed_images_grouped, grouped_images_index)
         processed_images = [p[None] if p.ndim == 3 else p for p in processed_images]  # add tiles dimension if needed
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
 
 
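Both fast image processors above stop stacking the tensors themselves before building the output. The change appears to rely on BatchFeature converting a list of same-shaped tensors when a tensor_type is requested (feature_extraction_utils.py also changes in this release); a small sketch of that usage, under that assumption:

    import torch
    from transformers import BatchFeature

    # Stand-ins for the per-image tensors a fast processor would produce.
    processed_images = [torch.rand(3, 224, 224) for _ in range(3)]

    # Assumption: with tensor_type="pt" the BatchFeature turns the list into a
    # single batched tensor; with tensor_type=None the plain list is kept.
    features = BatchFeature(data={"pixel_values": processed_images}, tensor_type="pt")
    print(type(features["pixel_values"]), features["pixel_values"].shape)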
@@ -451,6 +451,7 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -462,12 +463,15 @@ class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, Generati
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
         return model_inputs
@@ -293,6 +293,7 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
         attention_mask=None,
         cache_position=None,
         logits_to_keep=None,
+        is_first_iteration=False,
         **kwargs,
     ):
         # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
@@ -304,12 +305,15 @@ class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
             attention_mask=attention_mask,
             cache_position=cache_position,
             logits_to_keep=logits_to_keep,
+            is_first_iteration=is_first_iteration,
             **kwargs,
         )
 
-        if
-        #
-        #
+        if is_first_iteration or not kwargs.get("use_cache", True):
+            # Pixel values are used only in the first iteration if available
+            # In subsquent iterations, they are already merged with text and cached
+            # NOTE: first iteration doesn't have to be prefill, it can be the first
+            # iteration with a question and cached system prompt (continue generate from cache)
             model_inputs["pixel_values"] = pixel_values
             model_inputs["pixel_values_videos"] = pixel_values_videos
         return model_inputs
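The modeling and modular hunks above thread a new is_first_iteration flag through prepare_inputs_for_generation so that pixel values are only forwarded on the first step, or when caching is off. A rough standalone sketch of just the gating logic; everything other than the is_first_iteration/use_cache names is a placeholder:

    def prepare_inputs_for_generation_sketch(
        input_ids,
        pixel_values=None,
        pixel_values_videos=None,
        is_first_iteration=False,
        **kwargs,
    ):
        # Placeholder for the inputs the parent class would normally assemble.
        model_inputs = {"input_ids": input_ids}

        # Vision features only need to be encoded once: on the first iteration
        # (not necessarily prefill -- it may be the first call on top of a cached
        # system prompt). Afterwards they already live in the KV cache, so they
        # are dropped unless caching is disabled.
        if is_first_iteration or not kwargs.get("use_cache", True):
            model_inputs["pixel_values"] = pixel_values
            model_inputs["pixel_values_videos"] = pixel_values_videos
        return model_inputs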
@@ -77,7 +77,7 @@ class PersimmonRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     # Ignore copy
@@ -49,7 +49,7 @@ class PhiRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -83,7 +83,7 @@ class Phi3RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
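Several rotary embedding classes in this diff switch original_inv_freq from a plain attribute to a non-persistent buffer holding a clone of inv_freq. A minimal sketch of the pattern, assuming (as the surrounding code suggests) that dynamic RoPE variants later overwrite inv_freq and restore it from the untouched copy:

    import torch
    from torch import nn

    class RotarySketch(nn.Module):
        def __init__(self, dim: int = 64, base: float = 10000.0):
            super().__init__()
            inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
            # persistent=False keeps both buffers out of the state_dict, while
            # they still follow .to(device)/.to(dtype) like any registered buffer.
            self.register_buffer("inv_freq", inv_freq, persistent=False)
            # A cloned copy, so runtime rescaling of inv_freq cannot alias it
            # and the original frequencies can be restored exactly.
            self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)

    module = RotarySketch()
    module.inv_freq /= 2.0                               # e.g. a dynamic rescale
    module.inv_freq = module.original_inv_freq.clone()   # restore the original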
@@ -881,6 +881,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 def unfold_tensor(tensor, max_seq_len):
@@ -1459,7 +1462,7 @@ class Phi4MultimodalRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -1123,6 +1123,9 @@ class Phi4MultimodalAudioPreTrainedModel(PreTrainedModel):
         if isinstance(module, Phi4MultimodalAudioGluPointWiseConv):
             init.zeros_(module.b1)
             init.zeros_(module.b2)
+        elif isinstance(module, Phi4MultimodalAudioMeanVarianceNormLayer):
+            init.zeros_(module.global_mean)
+            init.ones_(module.global_invstd)
 
 
 class Phi4MultimodalAudioModel(Phi4MultimodalAudioPreTrainedModel):
@@ -30,14 +30,19 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import
+from ...integrations import (
+    use_experts_implementation,
+    use_kernel_forward_from_hub,
+    use_kernel_func_from_hub,
+    use_kernelized_func,
+)
 from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
 from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_phimoe import PhimoeConfig
 
@@ -59,7 +64,7 @@ class PhimoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -327,6 +332,7 @@ class PhimoeMultiplier(torch.autograd.Function):
         )
 
 
+@use_experts_implementation
 class PhimoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -617,7 +623,9 @@ class PhimoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(PhimoeTopKRouter, layer_name="mlp.router", index=0),
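The hunk above makes _can_compile_fullgraph depend on is_grouped_mm_available() instead of a fixed value, so fullgraph torch.compile support is only advertised when the grouped-matmul path used by the experts implementation exists. A hedged sketch of that general pattern; the capability probe below is purely illustrative and is not the actual implementation of is_grouped_mm_available():

    import torch

    def grouped_mm_supported() -> bool:
        # Illustrative probe only: check whether this torch build exposes a
        # grouped matmul op. The real helper may test more than this.
        return hasattr(torch, "_grouped_mm")

    class MoEModelSketch:
        # Evaluated once at class-definition time, so the flag reflects the
        # environment the library was imported into.
        _can_compile_fullgraph = grouped_mm_supported()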
@@ -52,7 +52,7 @@ class PhimoeRotaryEmbedding(MixtralRotaryEmbedding):
         inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq = self.inv_freq
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     def forward(self, x, position_ids=None, layer_type=None):
         if layer_type is not None:
@@ -61,10 +61,6 @@ class Pix2StructProcessor(ProcessorMixin):
         An instance of ['T5Tokenizer`]. The tokenizer is a required input.
     """
 
-    attributes = ["image_processor", "tokenizer"]
-    image_processor_class = "Pix2StructImageProcessor"
-    tokenizer_class = ("T5Tokenizer",)
-
     def __init__(self, image_processor, tokenizer):
         tokenizer.return_token_type_ids = False
         super().__init__(image_processor, tokenizer)
@@ -0,0 +1,30 @@
+# coding=utf-8
+# Copyright 2025 Meta AI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Pixio model configuration"""
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pixio import *
+    from .modeling_pixio import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
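The new pixio __init__.py above follows the lazy-import layout used across the models package: type checkers see the real star imports, while at runtime the module is replaced by a _LazyModule that resolves attributes on first access. A rough generic stand-in for that behaviour using PEP 562 module-level __getattr__; the name-to-submodule mapping below is an assumption for illustration and this is not transformers' actual _LazyModule implementation:

    # lazy_pkg/__init__.py -- generic sketch of lazy attribute resolution
    import importlib
    from typing import TYPE_CHECKING

    # Assumed mapping from public names to the submodules that define them.
    _SUBMODULE_FOR_NAME = {"PixioConfig": ".configuration_pixio"}

    if TYPE_CHECKING:
        # Resolved statically only; never executed at runtime.
        from .configuration_pixio import PixioConfig

    def __getattr__(name):
        # Import the defining submodule the first time the name is touched,
        # keeping "import lazy_pkg" itself cheap.
        if name in _SUBMODULE_FOR_NAME:
            module = importlib.import_module(_SUBMODULE_FOR_NAME[name], __name__)
            return getattr(module, name)
        raise AttributeError(f"module {__name__!r} has no attribute {name!r}")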