transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/models/encodec/modeling_encodec.py

```diff
@@ -474,6 +474,20 @@ class EncodecPreTrainedModel(PreTrainedAudioTokenizerBase):
                     init.xavier_uniform_(param)
                 elif "bias" in name:
                     init.constant_(param, 0.0)
+        elif isinstance(module, EncodecConv1d):
+            kernel_size = module.conv.kernel_size[0]
+            stride = torch.tensor(module.conv.stride[0], dtype=torch.int64)
+            dilation = module.conv.dilation[0]
+            # Effective kernel size with dilations.
+            kernel_size = torch.tensor((kernel_size - 1) * dilation + 1, dtype=torch.int64)
+            init.copy_(module.stride, stride)
+            init.copy_(module.kernel_size, kernel_size)
+            init.copy_(module.padding_total, kernel_size - stride)
+        elif isinstance(module, EncodecEuclideanCodebook):
+            init.copy_(module.inited, torch.Tensor([True]))
+            init.zeros_(module.cluster_size)
+            init.zeros_(module.embed)
+            init.zeros_(module.embed_avg)
 
 
 @auto_docstring(
```
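The new `EncodecConv1d` branch only precomputes convolution geometry into buffers. A minimal sketch of the same arithmetic on plain integers, with made-up sizes (kernel 7, stride 2, dilation 3), to show what the buffers end up holding:

```python
# Made-up convolution geometry; mirrors the buffer math in the hunk above.
kernel_size, stride, dilation = 7, 2, 3

# Effective receptive field of a dilated kernel.
effective_kernel_size = (kernel_size - 1) * dilation + 1  # 19

# Total padding the padding logic has to distribute around the input.
padding_total = effective_kernel_size - stride  # 17

print(effective_kernel_size, padding_total)
```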
transformers/models/eomt/image_processing_eomt_fast.py

```diff
@@ -44,12 +44,43 @@ from ...utils import (
 from .image_processing_eomt import (
     EomtImageProcessorKwargs,
     compute_segments,
-    convert_segmentation_map_to_binary_masks,
     get_size_with_aspect_ratio,
     remove_low_and_no_objects,
 )
 
 
+# Adapted from transformers.models.maskformer.image_processing_maskformer_fast.convert_segmentation_map_to_binary_masks_fast
+def convert_segmentation_map_to_binary_masks_fast(
+    segmentation_map: "torch.Tensor",
+    instance_id_to_semantic_id: Optional[dict[int, int]] = None,
+    ignore_index: Optional[int] = None,
+):
+    if ignore_index is not None:
+        segmentation_map = torch.where(segmentation_map == 0, ignore_index, segmentation_map - 1)
+
+    all_labels = torch.unique(segmentation_map)
+
+    if ignore_index is not None:
+        all_labels = all_labels[all_labels != ignore_index]  # drop background label if applicable
+
+    binary_masks = [(segmentation_map == i) for i in all_labels]
+    if binary_masks:
+        binary_masks = torch.stack(binary_masks, dim=0)
+    else:
+        binary_masks = torch.zeros((0, *segmentation_map.shape), device=segmentation_map.device)
+
+    # Convert instance ids to class ids
+    if instance_id_to_semantic_id is not None:
+        labels = torch.zeros(all_labels.shape[0], device=segmentation_map.device)
+
+        for i, label in enumerate(all_labels):
+            class_id = instance_id_to_semantic_id[(label.item() + 1 if ignore_index is not None else label.item())]
+            labels[i] = class_id - 1 if ignore_index is not None else class_id
+    else:
+        labels = all_labels
+    return binary_masks.float(), labels.long()
+
+
 def get_target_size(size_dict: dict[str, int]) -> tuple[int, int]:
     """Returns the height and width from a size dict."""
     target_height = size_dict["shortest_edge"]
```
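A minimal usage sketch of the new torch-native helper on a toy semantic map, assuming it stays importable at the module path shown above (and that the fast image-processor dependencies are installed):

```python
import torch
from transformers.models.eomt.image_processing_eomt_fast import (
    convert_segmentation_map_to_binary_masks_fast,
)

# Toy 2x2 map with two semantic labels and no ignore_index.
segmentation_map = torch.tensor([[0, 0], [3, 3]])
masks, labels = convert_segmentation_map_to_binary_masks_fast(segmentation_map)

print(masks.shape)  # torch.Size([2, 2, 2]) -- one float binary mask per unique label
print(labels)       # tensor([0, 3])
```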
transformers/models/eomt/image_processing_eomt_fast.py

```diff
@@ -162,8 +193,7 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
         )
         ignore_index = kwargs.pop("ignore_index", None)
         images_kwargs = kwargs.copy()
-
-        outputs = BatchFeature({"pixel_values": processed_images})
+        outputs = self._preprocess(images, **images_kwargs)
 
         if segmentation_maps is not None:
             processed_segmentation_maps = self._prepare_image_like_inputs(
@@ -183,9 +213,9 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
                 }
             )
 
-            processed_segmentation_maps
+            processed_segmentation_maps = self._preprocess(
                 images=processed_segmentation_maps, **segmentation_maps_kwargs
-            )
+            ).pixel_values
             processed_segmentation_maps = processed_segmentation_maps.squeeze(1).to(torch.int64)
             # Convert to list of binary masks and labels
             mask_labels, class_labels = [], []
@@ -195,21 +225,21 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
                 else:
                     instance_id = instance_id_to_semantic_id
                 # Use instance2class_id mapping per image
-                masks, classes =
+                masks, classes = convert_segmentation_map_to_binary_masks_fast(
                     segmentation_map,
                     instance_id,
                     ignore_index=ignore_index,
                 )
 
-                mask_labels.append(
-                class_labels.append(
+                mask_labels.append(masks)
+                class_labels.append(classes)
 
             # we cannot batch them since they don't share a common class size
             outputs["mask_labels"] = mask_labels
             outputs["class_labels"] = class_labels
 
-        if patch_offsets:
-            outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in patch_offsets]
+        if outputs.patch_offsets:
+            outputs["patch_offsets"] = [torch.tensor(offsets) for offsets in outputs.patch_offsets]
 
         return outputs
 
@@ -274,11 +304,13 @@ class EomtImageProcessorFast(BaseImageProcessorFast):
                 stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
             )
             processed_images_grouped[shape] = stacked_images
-
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
 
-
-
-
+        return BatchFeature(
+            data={"pixel_values": processed_images, "patch_offsets": patch_offsets},
+            tensor_type=return_tensors,
+            skip_tensor_conversion=["patch_offsets"],
+        )
 
     def merge_image_patches(
         self,
```
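`_preprocess` now returns a `BatchFeature` directly, keeping `patch_offsets` out of tensor conversion. A hedged sketch of what the `skip_tensor_conversion` argument used above is expected to do (the keyword is taken from this hunk; its exact semantics are assumed here, not checked against the rc2 implementation):

```python
import torch
from transformers import BatchFeature

# Assumption: keys listed in skip_tensor_conversion are left as plain Python
# objects even when tensor_type="pt" converts the rest of the dict.
features = BatchFeature(
    data={"pixel_values": torch.rand(1, 3, 8, 8), "patch_offsets": [[0, 0], [1, 0]]},
    tensor_type="pt",
    skip_tensor_conversion=["patch_offsets"],
)
print(type(features["pixel_values"]))   # torch.Tensor
print(type(features["patch_offsets"]))  # expected: plain list, untouched by conversion
```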
transformers/models/eomt/modeling_eomt.py

```diff
@@ -1020,6 +1020,13 @@ class EomtPreTrainedModel(PreTrainedModel):
         elif isinstance(module, EomtEmbeddings):
             init.trunc_normal_(module.cls_token, mean=0.0, std=std)
             init.zeros_(module.register_tokens)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, EomtLoss):
+            empty_weight = torch.ones(module.num_labels + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)
+        elif isinstance(module, EomtForUniversalSegmentation):
+            init.ones_(module.attn_mask_probs)
 
 
 @auto_docstring(
```

transformers/models/eomt/modular_eomt.py

```diff
@@ -425,6 +425,13 @@ class EomtPreTrainedModel(PreTrainedModel):
         elif isinstance(module, EomtEmbeddings):
             init.trunc_normal_(module.cls_token, mean=0.0, std=std)
             init.zeros_(module.register_tokens)
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+        elif isinstance(module, EomtLoss):
+            empty_weight = torch.ones(module.num_labels + 1)
+            empty_weight[-1] = module.eos_coef
+            init.copy_(module.empty_weight, empty_weight)
+        elif isinstance(module, EomtForUniversalSegmentation):
+            init.ones_(module.attn_mask_probs)
 
 
 @auto_docstring(
```
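A tiny worked example of the `empty_weight` buffer initialized for `EomtLoss` above, with made-up values (`num_labels=3`, `eos_coef=0.1`): every real class keeps weight 1 and the trailing "no object" class is down-weighted.

```python
import torch

# Made-up values; mirrors the EomtLoss initialization in the hunks above.
num_labels, eos_coef = 3, 0.1
empty_weight = torch.ones(num_labels + 1)
empty_weight[-1] = eos_coef
print(empty_weight)  # tensor([1.0000, 1.0000, 1.0000, 0.1000])
```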
transformers/models/ernie/modeling_ernie.py

```diff
@@ -113,6 +113,9 @@ class ErnieEmbeddings(nn.Module):
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        # .to is better than using _no_split_modules on ErnieEmbeddings as it's the first module and >1/2 the model size
+        inputs_embeds = inputs_embeds.to(token_type_embeddings.device)
         embeddings = inputs_embeds + token_type_embeddings
 
         position_embeddings = self.position_embeddings(position_ids)
@@ -553,6 +556,9 @@ class ErniePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, ErnieLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, ErnieEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 @auto_docstring(
```

transformers/models/ernie/modular_ernie.py

```diff
@@ -107,6 +107,9 @@ class ErnieEmbeddings(BertEmbeddings):
         if inputs_embeds is None:
             inputs_embeds = self.word_embeddings(input_ids)
         token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        # .to is better than using _no_split_modules on ErnieEmbeddings as it's the first module and >1/2 the model size
+        inputs_embeds = inputs_embeds.to(token_type_embeddings.device)
         embeddings = inputs_embeds + token_type_embeddings
 
         position_embeddings = self.position_embeddings(position_ids)
@@ -169,6 +172,9 @@ class ErniePreTrainedModel(PreTrainedModel):
         super()._init_weights(module)
         if isinstance(module, ErnieLMPredictionHead):
             init.zeros_(module.bias)
+        elif isinstance(module, ErnieEmbeddings):
+            init.copy_(module.position_ids, torch.arange(module.position_ids.shape[-1]).expand((1, -1)))
+            init.zeros_(module.token_type_ids)
 
 
 class ErnieModel(BertModel):
```

transformers/models/ernie4_5/modeling_ernie4_5.py

```diff
@@ -56,7 +56,7 @@ class Ernie4_5RotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
```
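For context, a minimal sketch (plain PyTorch, not Ernie code) of what switching to `register_buffer(..., persistent=False)` buys over a bare attribute: the buffer follows `.to()` / device moves and shows up in `named_buffers()`, while staying out of the state dict.

```python
import torch
from torch import nn

class RotaryDemo(nn.Module):
    def __init__(self):
        super().__init__()
        inv_freq = torch.arange(1, 5, dtype=torch.float32)
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
        self.plain_copy = inv_freq.clone()  # bare attribute: ignored by .to() and named_buffers()

m = RotaryDemo()
print([name for name, _ in m.named_buffers()])  # ['inv_freq', 'original_inv_freq']
print("original_inv_freq" in m.state_dict())    # False (persistent=False)
```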
transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py

```diff
@@ -29,14 +29,14 @@ from ... import initialization as init
 from ...activations import ACT2FN
 from ...cache_utils import Cache, DynamicCache
 from ...generation import GenerationMixin
-from ...integrations import use_kernel_forward_from_hub, use_kernelized_func
+from ...integrations import use_experts_implementation, use_kernel_forward_from_hub, use_kernelized_func
 from ...masking_utils import create_causal_mask
 from ...modeling_layers import GradientCheckpointingLayer
 from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast
 from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
 from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
 from ...processing_utils import Unpack
-from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, is_grouped_mm_available
 from ...utils.generic import OutputRecorder, check_model_inputs, maybe_autocast
 from .configuration_ernie4_5_moe import Ernie4_5_MoeConfig
 
@@ -96,7 +96,7 @@ class Ernie4_5_MoeRotaryEmbedding(nn.Module):
         inv_freq, self.attention_scaling = rope_init_fn(self.config, device)
 
         self.register_buffer("inv_freq", inv_freq, persistent=False)
-        self.original_inv_freq =
+        self.register_buffer("original_inv_freq", inv_freq.clone(), persistent=False)
 
     @staticmethod
     def compute_default_rope_parameters(
@@ -317,6 +317,7 @@ class Ernie4_5_MoeStatics(nn.Module):
         return hidden_states + self.e_score_correction_bias.squeeze()
 
 
+@use_experts_implementation
 class Ernie4_5_MoeExperts(nn.Module):
     """Collection of expert weights stored as 3D tensors."""
 
@@ -372,15 +373,15 @@ class Ernie4_5_MoeTopKRouter(nn.Module):
         )
 
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            router_logits = F.linear(hidden_states.float(), self.weight)
-
-
-
-
+            router_logits = F.linear(hidden_states.float(), self.weight.float())
+            routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+            _, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)
+            routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
+            routing_weights = routing_weights / torch.clamp(
+                routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
             )
-
-
-        return router_logits, router_scores, router_indices
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        return router_logits, selected_experts, routing_weights
 
 
 class Ernie4_5_MoeSparseMoeBlock(nn.Module):
```
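A small numeric sketch of the rewritten routing path above (softmax over expert logits, top-k selection, gather, clamp-normalized weights), using made-up logits and leaving out the `moe_statics` bias correction applied before `topk`:

```python
import torch
import torch.nn.functional as F

# Made-up router logits: 2 tokens, 4 experts; assumed top_k=2 and norm_min=1e-12.
router_logits = torch.tensor([[2.0, 1.0, 0.5, -1.0],
                              [0.0, 3.0, 0.0, 1.0]])
top_k, norm_min = 2, 1e-12

routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
_, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
routing_weights = routing_weights / torch.clamp(routing_weights.sum(dim=-1, keepdim=True), min=norm_min)

print(selected_experts)  # expert ids chosen per token
print(routing_weights)   # per-token weights over the selected experts, summing to 1
```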
transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py

```diff
@@ -403,7 +404,7 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
         if self.shared_experts is not None:
             shared_output = self.shared_experts(hidden_states)
 
-        _,
+        _, top_k_index, top_k_weights = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, top_k_index, top_k_weights)
 
         if self.shared_experts is not None:
@@ -476,7 +477,9 @@ class Ernie4_5_MoePreTrainedModel(PreTrainedModel):
     _supports_flash_attn = True
     _supports_sdpa = True
     _supports_flex_attn = True
-    _can_compile_fullgraph =
+    _can_compile_fullgraph = (
+        is_grouped_mm_available()
+    )  # https://huggingface.co/docs/transformers/experts_interface#torchcompile
     _supports_attention_backend = True
     _can_record_outputs = {
         "router_logits": OutputRecorder(Ernie4_5_MoeTopKRouter, index=0),
```

transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py

```diff
@@ -104,32 +104,6 @@ class Ernie4_5_MoeExperts(MixtralExperts):
         self.num_experts = config.moe_num_experts
         self.intermediate_dim = config.moe_intermediate_size
 
-    def forward(
-        self,
-        hidden_states: torch.Tensor,
-        top_k_index: torch.Tensor,
-        top_k_weights: torch.Tensor,
-    ) -> torch.Tensor:
-        final_hidden_states = torch.zeros_like(hidden_states)
-        with torch.no_grad():
-            expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
-            expert_mask = expert_mask.permute(2, 1, 0)
-            expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
-
-        for expert_idx in expert_hit:
-            expert_idx = expert_idx[0]
-            if expert_idx == self.num_experts:
-                continue
-            top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
-            current_state = hidden_states[token_idx]
-            gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
-            current_hidden_states = self.act_fn(gate) * up
-            current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
-            current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
-            final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
-
-        return final_hidden_states
-
 
 class Ernie4_5_MoeTopKRouter(nn.Module):
     def __init__(self, config):
@@ -147,15 +121,15 @@ class Ernie4_5_MoeTopKRouter(nn.Module):
         )
 
         with maybe_autocast(device_type=device_type, enabled=False):  # Force float32
-            router_logits = F.linear(hidden_states.float(), self.weight)
-
-
-
-
+            router_logits = F.linear(hidden_states.float(), self.weight.float())
+            routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+            _, selected_experts = torch.topk(self.moe_statics(routing_weights), self.top_k, dim=-1)
+            routing_weights = torch.gather(routing_weights, dim=-1, index=selected_experts)
+            routing_weights = routing_weights / torch.clamp(
+                routing_weights.sum(dim=-1, keepdim=True), min=self.norm_min
            )
-
-
-        return router_logits, router_scores, router_indices
+        routing_weights = routing_weights.to(hidden_states.dtype)
+        return router_logits, selected_experts, routing_weights
 
 
 class Ernie4_5_MoeSparseMoeBlock(nn.Module):
@@ -178,7 +152,7 @@ class Ernie4_5_MoeSparseMoeBlock(nn.Module):
         if self.shared_experts is not None:
             shared_output = self.shared_experts(hidden_states)
 
-        _,
+        _, top_k_index, top_k_weights = self.gate(hidden_states)
         final_hidden_states = self.experts(hidden_states, top_k_index, top_k_weights)
 
         if self.shared_experts is not None:
```

transformers/models/ernie4_5_vl_moe/__init__.py (new file)

```diff
@@ -0,0 +1,31 @@
+# Copyright 2025 The Qwen Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_ernie4_5_vl_moe import *
+    from .image_processing_ernie4_5_vl_moe import *
+    from .image_processing_ernie4_5_vl_moe_fast import *
+    from .modeling_ernie4_5_vl_moe import *
+    from .processing_ernie4_5_vl_moe import *
+    from .video_processing_ernie4_5_vl_moe import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
```