PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/integrations/integration_utils.py CHANGED Viewed

@@ -940,6 +940,8 @@ class TrackioCallback(TrainerCallback):
     ```
     """
+    SPACE_URL = "https://huggingface.co/spaces/{space_id}"
     def __init__(self):
         has_trackio = is_trackio_available()
         if not has_trackio:
@@ -1058,6 +1060,39 @@ class TrackioCallback(TrainerCallback):
             metrics = rewrite_logs(metrics)
             self._trackio.log(metrics)
+    def on_push_begin(self, args, state, control, model, **kwargs):
+        """
+        Handle the `on_push_begin` event: sync the current Trackio project and reference it from the model.
+
+        Nothing happens unless this is the world-zero process, `self._trackio` is set and Trackio reports a
+        current project. With `trackio<0.13.0` a warning is emitted and the method returns without syncing.
+        Otherwise the Space id is read from Trackio's context variables (or obtained from `trackio.sync` when
+        unset), a "Visualize in Trackio" badge is appended to `modelcard.AUTOGENERATED_TRAINER_COMMENT` and
+        the Trackio tags are added to `model.model_tags`.
+
+        Args:
+            args: Unused by this handler.
+            state: Only `state.is_world_process_zero` is read.
+            control: Unused by this handler.
+            model: Object whose `model_tags` attribute is extended or created.
+        """
+        if not state.is_world_process_zero or self._trackio is None:
+            return
+        if (current_project := self._trackio.context_vars.current_project.get()) is None:
+            return
+        trackio_version = packaging.version.parse(self._trackio.__version__)
+        if trackio_version < packaging.version.parse("0.13.0"):
+            # The guard is a strict `<`, so the message must not claim that 0.13.0 itself is affected.
+            warnings.warn(
+                "The version of `trackio` that is installed is <0.13.0, so "
+                "the local Trackio project will not be pushed to Hugging Face. Run "
+                "`pip install --upgrade trackio` to fix this."
+            )
+            return
+        space_id = self._trackio.context_vars.current_space_id.get()
+        if space_id is None:
+            # No Space recorded in the context: `sync` is expected to return the id of the Space it used.
+            space_id = self._trackio.sync(current_project, force=True)
+        space_url = self.SPACE_URL.format(space_id=space_id)
+        badge_markdown = (
+            f'<a href="{space_url}" target="_blank"><img src="https://raw.githubusercontent.com/gradio-app/trackio/refs/heads/main/trackio/assets/badge.png" alt="Visualize in Trackio"'
+            ' title="Visualize in Trackio" style="height: 40px;"/></a>'
+        )
+        # The trainer comment is a module-level string, so only append the badge once.
+        if badge_markdown not in modelcard.AUTOGENERATED_TRAINER_COMMENT:
+            modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"
+        trackio_tags = ["trackio", f"trackio:{space_url}"]
+        if getattr(model, "model_tags", None) is not None:
+            # Existing tags are kept; the Trackio tags are only added if "trackio" is not present yet.
+            if "trackio" not in model.model_tags:
+                model.model_tags.extend(trackio_tags)
+        else:
+            model.model_tags = trackio_tags
 class CometCallback(TrainerCallback):
     """

transformers/integrations/mistral.py CHANGED Viewed

@@ -77,6 +77,7 @@ def convert_tekken_tokenizer(tokenizer_file: str):
     """Convert a "tekken" tokenizer to a fast Tokenizer."""
     # Tekken format -- need to use the Converter
+    from mistral_common.tokens.tokenizers.base import SpecialTokens
     from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
     # Load directly using their lib
@@ -106,4 +107,15 @@ def convert_tekken_tokenizer(tokenizer_file: str):
     # Post-process
     tokenizer.add_special_tokens({"additional_special_tokens": all_special})
+    MAP_SPECAL = {
+        "bos_token": SpecialTokens.bos.value,
+        "eos_token": SpecialTokens.eos.value,
+        "pad_token": SpecialTokens.pad.value,
+        "unk_token": SpecialTokens.unk.value,
+    }
+    for special_key, special_token in MAP_SPECAL.items():
+        if special_token in all_special:
+            tokenizer.add_special_tokens({special_key: special_token})
     return tokenizer

transformers/integrations/moe.py ADDED Viewed

@@ -0,0 +1,240 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import wraps
+from ..utils.generic import GeneralInterface
+from ..utils.import_utils import is_torch_available
+if is_torch_available():
+    import torch
+# Example of an experts class with its eager (per-expert loop) matmul implementation:
+# class Experts(nn.Module):
+#     """Collection of expert weights stored as 3D tensors."""
+#     def __init__(self, config):
+#         super().__init__()
+#         self.num_experts = config.n_routed_experts
+#         self.hidden_dim = config.hidden_size
+#         self.intermediate_dim = config.moe_intermediate_size
+#         self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim))
+#         self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim))
+#         self.act_fn = ACT2FN[config.hidden_act]
+#     def forward(
+#         self,
+#         hidden_states: torch.Tensor,
+#         top_k_index: torch.Tensor,
+#         top_k_weights: torch.Tensor,
+#     ) -> torch.Tensor:
+#         final_hidden_states = torch.zeros_like(hidden_states)
+#         with torch.no_grad():
+#             expert_mask = torch.nn.functional.one_hot(top_k_index, num_classes=self.num_experts)
+#             expert_mask = expert_mask.permute(2, 1, 0)
+#             expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+#         for expert_idx in expert_hit:
+#             expert_idx = expert_idx[0]
+#             if expert_idx == self.num_experts:
+#                 continue
+#             top_k_pos, token_idx = torch.where(expert_mask[expert_idx])
+#             current_state = hidden_states[token_idx]
+#             gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1)
+#             current_hidden_states = self.act_fn(gate) * up
+#             current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx])
+#             current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None]
+#             final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype))
+#         return final_hidden_states
+def batched_mm_experts_forward(
+    self: "torch.nn.Module",
+    hidden_states: "torch.Tensor",
+    top_k_index: "torch.Tensor",
+    top_k_weights: "torch.Tensor",
+) -> "torch.Tensor":
+    """
+    Experts forward pass that gathers one weight matrix per (token, expert) pair and applies `torch.bmm`.
+
+    Annotations are quoted because `torch` is only imported when it is available; unquoted they would be
+    evaluated when the function is defined and fail without torch.
+
+    Args:
+        self: Experts module exposing `gate_up_proj`, `down_proj` and `act_fn`, and optionally
+            `gate_up_proj_bias` / `down_proj_bias`. The first dimension of `gate_up_proj` is the number of experts.
+        hidden_states: Token states of shape `(num_tokens, hidden_dim)`.
+        top_k_index: Selected expert ids; its last dimension is `num_top_k`.
+        top_k_weights: Routing weights of shape `(num_tokens, num_top_k)` or `(num_tokens, num_experts)`.
+
+    Returns:
+        Tensor shaped like `hidden_states` holding the weighted sum of the selected experts' outputs.
+
+    Raises:
+        ValueError: If `top_k_weights` has neither of the two supported shapes.
+    """
+    device = hidden_states.device
+    num_top_k = top_k_index.size(-1)
+    num_tokens = hidden_states.size(0)
+    num_experts = self.gate_up_proj.size(0)
+    final_hidden_states = torch.zeros_like(hidden_states)
+    # Flatten top_k_index to get expert_ids per selected sample
+    expert_ids = top_k_index.reshape(-1)
+    token_idx = torch.arange(num_tokens, device=device).unsqueeze(1).expand(-1, num_top_k).reshape(-1)
+    # Resolve routing weights per selected sample, allowing top_k_weights to be either:
+    # - (num_tokens, num_top_k) Qwen2MoE style
+    # - (num_tokens, num_experts) DeepseekV2 style
+    # When num_top_k == num_experts both shapes coincide and the first branch is taken.
+    if top_k_weights.shape == (num_tokens, num_top_k):
+        sample_weights = top_k_weights.reshape(-1)  # (S,)
+    elif top_k_weights.shape == (num_tokens, num_experts):
+        sample_weights = top_k_weights[token_idx, expert_ids]  # (S,)
+    else:
+        raise ValueError(
+            f"top_k_weights has an invalid/unsupported shape. It should be either (num_tokens, num_top_k)({num_tokens}, {num_top_k}) "
+            f"or (num_tokens, num_experts)({num_tokens}, {num_experts}), but got {top_k_weights.shape}."
+        )
+    # Get current hidden states for selected samples
+    current_hidden_states = hidden_states[token_idx]  # (S, hidden_dim)
+    # Select projection matrices for selected experts
+    selected_gate_up = self.gate_up_proj[expert_ids]  # (S, 2 * intermediate_dim, hidden_dim)
+    selected_down = self.down_proj[expert_ids]  # (S, hidden_dim, intermediate_dim)
+    # --- Up projection per expert (batched) ---
+    # (S, 2 * intermediate_dim, hidden_dim) @ (S, hidden_dim, 1) -> (S, 2 * intermediate_dim)
+    gate_up_out = torch.bmm(selected_gate_up, current_hidden_states.unsqueeze(-1)).squeeze(-1)
+    if hasattr(self, "gate_up_proj_bias") and self.gate_up_proj_bias is not None:
+        gate_up_out = gate_up_out + self.gate_up_proj_bias[expert_ids]
+    # Split into gate and up components
+    gate, up = gate_up_out.chunk(2, dim=-1)  # both have shape (S, intermediate_dim)
+    # Apply activation
+    hidden_after_activation = self.act_fn(gate) * up  # (S, intermediate_dim)
+    # --- Down projection per expert (batched) ---
+    out_per_sample = torch.bmm(selected_down, hidden_after_activation.unsqueeze(-1)).squeeze(-1)
+    if hasattr(self, "down_proj_bias") and self.down_proj_bias is not None:
+        out_per_sample = out_per_sample + self.down_proj_bias[expert_ids]
+    # Apply routing weights
+    out_per_sample = out_per_sample * sample_weights.unsqueeze(-1)  # (S, hidden_dim)
+    # Accumulate results back to the final_hidden_states using original token indices
+    final_hidden_states.index_add_(0, token_idx, out_per_sample.to(final_hidden_states.dtype))
+    return final_hidden_states
+def grouped_mm_experts_forward(
+    self: "torch.nn.Module",
+    hidden_states: "torch.Tensor",
+    top_k_index: "torch.Tensor",
+    top_k_weights: "torch.Tensor",
+) -> "torch.Tensor":
+    """
+    Experts forward pass that sorts the (token, expert) pairs by expert and applies `torch._grouped_mm`.
+
+    Annotations are quoted because `torch` is only imported when it is available; unquoted they would be
+    evaluated when the function is defined and fail without torch.
+
+    Args:
+        self: Experts module exposing `gate_up_proj`, `down_proj` and `act_fn`, and optionally
+            `gate_up_proj_bias` / `down_proj_bias`. The first dimension of `gate_up_proj` is the number of experts.
+        hidden_states: Token states; the first dimension is `num_tokens`.
+        top_k_index: Selected expert ids; its last dimension is `num_top_k`.
+        top_k_weights: Routing weights of shape `(num_tokens, num_top_k)` or `(num_tokens, num_experts)`.
+
+    Returns:
+        Tensor shaped like `hidden_states` holding the weighted sum of the selected experts' outputs.
+
+    Raises:
+        ImportError: If the installed torch does not provide `torch._grouped_mm`.
+        ValueError: If `top_k_weights` has neither of the two supported shapes.
+    """
+    if not hasattr(torch, "_grouped_mm"):
+        raise ImportError(
+            "torch._grouped_mm is not available. Please make sure you are using a PyTorch version that includes it (2.9+)."
+        )
+    device = hidden_states.device
+    num_top_k = top_k_index.size(-1)
+    num_tokens = hidden_states.size(0)
+    num_experts = self.gate_up_proj.size(0)
+    final_hidden_states = torch.zeros_like(hidden_states)
+    # Flatten top_k_index to get expert_ids per selected sample
+    expert_ids = top_k_index.reshape(-1)
+    token_idx = torch.arange(num_tokens, device=device).unsqueeze(1).expand(-1, num_top_k).reshape(-1)
+    # Get permutation to group by expert; `inv_perm` undoes it after the grouped matmuls
+    perm = torch.argsort(expert_ids, stable=True)
+    inv_perm = torch.argsort(perm, stable=True)
+    # Resolve routing weights per selected sample, allowing top_k_weights to be either:
+    # - (num_tokens, num_top_k) Qwen2MoE style
+    # - (num_tokens, num_experts) DeepseekV2 style
+    # When num_top_k == num_experts both shapes coincide and the first branch is taken.
+    if top_k_weights.shape == (num_tokens, num_top_k):
+        sample_weights = top_k_weights.reshape(-1)  # (S,)
+    elif top_k_weights.shape == (num_tokens, num_experts):
+        sample_weights = top_k_weights[token_idx, expert_ids]  # (S,)
+    else:
+        raise ValueError(
+            f"top_k_weights has an invalid/unsupported shape. It should be either (num_tokens, num_top_k)({num_tokens}, {num_top_k}) "
+            f"or (num_tokens, num_experts)({num_tokens}, {num_experts}), but got {top_k_weights.shape}."
+        )
+    # Get current hidden states for selected samples
+    current_hidden_states = hidden_states[token_idx]  # (S, hidden_dim)
+    # Group by expert for grouped_mm
+    expert_ids_g = expert_ids[perm]
+    sample_weights_g = sample_weights[perm]
+    current_states_g = current_hidden_states[perm]
+    # Compute offsets for grouped_mm
+    # using histc instead of bincount to avoid cuda graph issues
+    # (grouped_mm_experts_forward still fails with cuda graphs but because of _grouped_mm internals)
+    num_tokens_per_expert = torch.histc(expert_ids_g.float(), bins=num_experts, min=0, max=num_experts - 1)
+    # Cumulative per-expert counts, passed to `_grouped_mm` as `offs` to delimit each expert's rows
+    offsets = torch.cumsum(num_tokens_per_expert, dim=0, dtype=torch.int32)
+    # --- Up projection per expert (grouped_mm) ---
+    gate_up_out = torch._grouped_mm(current_states_g, self.gate_up_proj.transpose(-2, -1), offs=offsets)
+    if hasattr(self, "gate_up_proj_bias") and self.gate_up_proj_bias is not None:
+        # we should be able to pass bias to the grouped_mm call, but it's still not fully supported
+        gate_up_out = gate_up_out + self.gate_up_proj_bias[expert_ids_g]
+    # Split into gate and up components
+    gate, up = gate_up_out.chunk(2, dim=-1)  # both have shape (S, intermediate_dim)
+    # Apply activation
+    hidden_after_activation = self.act_fn(gate) * up  # (S, intermediate_dim)
+    # --- Down projection per expert (grouped_mm) ---
+    out_per_sample_g = torch._grouped_mm(hidden_after_activation, self.down_proj.transpose(-2, -1), offs=offsets)
+    if hasattr(self, "down_proj_bias") and self.down_proj_bias is not None:
+        # we should be able to pass bias to the grouped_mm call, but it's still not fully supported
+        out_per_sample_g = out_per_sample_g + self.down_proj_bias[expert_ids_g]
+    # Apply routing weights
+    out_per_sample_g = out_per_sample_g * sample_weights_g.unsqueeze(-1)
+    # Restore original order
+    out_per_sample = out_per_sample_g[inv_perm]
+    # Accumulate results back to the final_hidden_states using original token indices
+    final_hidden_states.index_add_(0, token_idx, out_per_sample.to(final_hidden_states.dtype))
+    return final_hidden_states
+class ExpertsInterface(GeneralInterface):
+    """Interface for registering custom experts implementations."""
+    # Built-in implementations, keyed by the name read from `config._experts_implementation`.
+    # "eager" is deliberately absent: `use_experts_implementation` falls back to the class's own forward for it.
+    _global_mapping = {
+        "batched_mm": batched_mm_experts_forward,
+        "grouped_mm": grouped_mm_experts_forward,
+    }
+# Module-level registry instance; `use_experts_implementation` indexes it by implementation name.
+ALL_EXPERTS_FUNCTIONS = ExpertsInterface()
+def use_experts_implementation(experts_class: "type[torch.nn.Module]") -> "type[torch.nn.Module]":
+    """
+    Class decorator that makes an experts class dispatch its forward on `config._experts_implementation`.
+
+    The class is patched in place: `__init__` additionally stores `config` on the instance, and `forward`
+    calls the original forward when the implementation is `"eager"`, otherwise the function registered
+    under that name in `ALL_EXPERTS_FUNCTIONS`.
+
+    Annotations are quoted because `torch` is only imported when it is available; unquoted they would be
+    evaluated when the function is defined and fail without torch.
+
+    Args:
+        experts_class: Class whose `__init__` takes `config` as its first argument after `self`.
+
+    Returns:
+        The same class object, with `__init__` and `forward` replaced.
+    """
+    original_init = experts_class.__init__
+    original_forward = experts_class.forward
+    @wraps(original_init)
+    def __init__(self, config, *args, **kwargs):
+        original_init(self, config, *args, **kwargs)
+        # Keep the config on the instance so `forward` can read the selected implementation at call time.
+        self.config = config
+    @wraps(original_forward)
+    def forward(self, *args, **kwargs):
+        experts_forward = original_forward
+        # Resolved on every call, so changing the config value after construction takes effect.
+        if self.config._experts_implementation != "eager":
+            experts_forward = ALL_EXPERTS_FUNCTIONS[self.config._experts_implementation]
+        return experts_forward(self, *args, **kwargs)
+    experts_class.__init__ = __init__
+    experts_class.forward = forward
+    return experts_class

transformers/integrations/mxfp4.py CHANGED Viewed

@@ -12,22 +12,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from ..utils import is_accelerate_available, is_torch_available, is_torch_xpu_available, logging
+from ..utils import is_torch_available, is_torch_xpu_available, logging
 if is_torch_available():
     import torch
     from torch import nn
+from contextlib import contextmanager
 from typing import Optional
 from ..core_model_loading import ConversionOps
-if is_accelerate_available():
-    from accelerate import init_empty_weights
-from contextlib import contextmanager
 from ..quantizers.quantizers_utils import get_module_from_name, should_convert_module
@@ -610,7 +604,7 @@ def replace_with_mxfp4_linear(model, quantization_config=None, modules_to_not_co
     if quantization_config.dequantize:
         return model
-    from kernels import get_kernel
+    from .hub_kernels import get_kernel
     global triton_kernels_hub
     triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
@@ -620,7 +614,7 @@ def replace_with_mxfp4_linear(model, quantization_config=None, modules_to_not_co
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
         if module.__class__.__name__ == "GptOssExperts" and not quantization_config.dequantize:
-            with init_empty_weights():
+            with torch.device("meta"):
                 model.set_submodule(module_name, Mxfp4GptOssExperts(model.config))
                 has_been_replaced = True
         if module.__class__.__name__ == "GptOssMLP" and not quantization_config.dequantize:

transformers/integrations/peft.py CHANGED Viewed

@@ -84,6 +84,7 @@ class PeftAdapterMixin:
         low_cpu_mem_usage: bool = False,
         is_trainable: bool = False,
         hotswap: bool | Literal["auto"] = "auto",
+        local_files_only: bool = False,
         adapter_kwargs: dict[str, Any] | None = None,
     ) -> None:
         """
@@ -243,6 +244,7 @@ class PeftAdapterMixin:
             adapter_config_file = find_adapter_config_file(
                 peft_model_id,
                 token=token,
+                local_files_only=local_files_only,
                 **adapter_kwargs,
             )
@@ -255,6 +257,7 @@ class PeftAdapterMixin:
             peft_config = PeftConfig.from_pretrained(
                 peft_model_id,
                 token=token,
+                local_files_only=local_files_only,
                 **adapter_kwargs,
             )
             peft_config.inference_mode = not is_trainable
@@ -268,6 +271,8 @@ class PeftAdapterMixin:
             self._hf_peft_config_loaded = True
         if peft_model_id is not None:
+            if "local_files_only" not in adapter_kwargs:
+                adapter_kwargs["local_files_only"] = local_files_only
             adapter_state_dict = load_peft_weights(peft_model_id, token=token, device=device, **adapter_kwargs)
         # We need to pre-process the state dict to remove unneeded prefixes - for backward compatibility

transformers/integrations/quanto.py CHANGED Viewed

@@ -43,6 +43,10 @@ class QuantoQuantize(ConversionOps):
         _load_parameter_into_model(model, full_layer_name, value)
         module, _ = get_module_from_name(model, full_layer_name)
+        # Need to set those to a specific value, otherwise they will remain on meta device ...
+        module.input_scale = torch.ones(module.input_scale.shape)
+        module.output_scale = torch.ones(module.output_scale.shape)
+        # quantize
         module.freeze()
         module.weight.requires_grad = False
         module._is_hf_initialized = True
@@ -73,7 +77,6 @@ def replace_with_quanto_layers(
             A list of modules to not convert. If a module name is in the list (e.g. `lm_head`), it will not be
             converted.
     """
-    from accelerate import init_empty_weights
     from optimum.quanto import QLayerNorm, QLinear, qfloat8, qint2, qint4, qint8
     w_mapping = {"float8": qfloat8, "int8": qint8, "int4": qint4, "int2": qint2}
@@ -83,7 +86,7 @@ def replace_with_quanto_layers(
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             new_module = None
             if isinstance(module, nn.Linear):
                 new_module = QLinear(

transformers/integrations/spqr.py CHANGED Viewed

@@ -14,13 +14,11 @@
 "SpQR (Sparse-Quantized Representation) integration file"
 from ..quantizers.quantizers_utils import should_convert_module
-from ..utils import is_accelerate_available, is_spqr_available, is_torch_available, logging
+from ..utils import is_spqr_available, is_torch_available, logging
-if is_accelerate_available():
-    from accelerate import init_empty_weights
 if is_torch_available():
+    import torch
     import torch.nn as nn
 logger = logging.get_logger(__name__)
@@ -47,7 +45,7 @@ def replace_with_spqr_linear(model, modules_to_not_convert: list[str] | None = N
     for module_name, module in model.named_modules():
         if not should_convert_module(module_name, modules_to_not_convert):
             continue
-        with init_empty_weights():
+        with torch.device("meta"):
             if isinstance(module, nn.Linear):
                 shapes = quantization_config.shapes

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl