PyPI - transformers - Versions diffs - 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl - Mend

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (671) hide show

transformers/generation/utils.py CHANGED Viewed

@@ -19,12 +19,12 @@ import inspect
 import os
 import warnings
 from collections.abc import Callable
+from contextlib import contextmanager
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Optional, Union
 import torch
 import torch.distributed as dist
-from packaging import version
 from torch import nn
 from ..cache_utils import (
@@ -407,6 +407,9 @@ class GenerationMixin(ContinuousMixin):
                     **repo_loading_kwargs,
                 )
             except OSError:
+                # `self` already has a generation config created from model config, but model config will
+                # not contain any generation-specific params. These are popped at config's `__init__`.
+                # Thus we have to load from `config.json` and create a generation config from it (for BART)
                 logger.info(
                     "Generation config file not found, using a generation config created from the model config."
                 )
@@ -418,6 +421,7 @@ class GenerationMixin(ContinuousMixin):
                     _from_model_config=True,
                     **repo_loading_kwargs,
                 )
             # Load custom generate function if `pretrained_model_name_or_path` defines it (and override `generate`)
             if hasattr(self, "load_custom_generate") and trust_remote_code:
                 try:
@@ -593,6 +597,7 @@ class GenerationMixin(ContinuousMixin):
         attention_mask: torch.LongTensor | None = None,
         inputs_embeds: torch.FloatTensor | None = None,
         cache_position: torch.LongTensor | None = None,
+        is_first_iteration: bool | None = False,
         **kwargs,
     ):
         """
@@ -628,7 +633,7 @@ class GenerationMixin(ContinuousMixin):
         input_ids_key = "decoder_input_ids" if self.config.is_encoder_decoder else "input_ids"
         # if `inputs_embeds` are passed, we only want to use them in the 1st generation step for every prompt.
         if not self.config.is_encoder_decoder:
-            if inputs_embeds is not None and len(cache_position) == inputs_embeds.shape[1]:
+            if inputs_embeds is not None and is_first_iteration:
                 model_inputs[input_ids_key] = None
                 model_inputs["inputs_embeds"] = inputs_embeds
             else:
@@ -708,6 +713,7 @@ class GenerationMixin(ContinuousMixin):
                     past_key_values=past_key_values,
                     position_ids=position_ids,
                     token_type_ids=token_type_ids,
+                    is_first_iteration=is_first_iteration,
                 )
             else:
                 attention_mask = causal_mask_creation_function(
@@ -1300,7 +1306,7 @@ class GenerationMixin(ContinuousMixin):
         if generation_config.do_sample:
             # In beam methods, we need to keep at least one non-eos token to explore continuations that might have a
             # better score (i.e. keep len(list(generation_config._eos_token_tensor)) + 1)
-            if generation_config.num_beams > 1:
+            if generation_config.num_beams is not None and generation_config.num_beams > 1:
                 if isinstance(generation_config._eos_token_tensor, list):
                     min_tokens_to_keep = len(generation_config._eos_token_tensor) + 1
                 elif isinstance(generation_config._eos_token_tensor, torch.Tensor):
@@ -1722,8 +1728,8 @@ class GenerationMixin(ContinuousMixin):
                 )
             generation_config.max_length = generation_config.max_new_tokens + input_ids_length
-        # if both `inputs_embeds` and `input_ids` are passed, we do not correct the length
-        # otherwise we need total length [inputs-embeds-len + new-tokens-len] to not go beyond indicated `max_length``
+        # If both `inputs_embeds` and `input_ids` are passed, we correct length with `inputs_tensor.shape`
+        # We need to get max_length = inputs_embeds_len + max_new_tokens
         elif (
             model_input_name == "inputs_embeds"
             and input_ids_length != inputs_tensor.shape[1]
@@ -1731,11 +1737,10 @@ class GenerationMixin(ContinuousMixin):
         ):
             generation_config.max_length -= inputs_tensor.shape[1]
         elif has_default_max_length:  # by default let's always generate 20 new tokens
-            if generation_config.max_length == GenerationConfig().max_length:
-                generation_config.max_length = generation_config.max_length + input_ids_length
-                max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
-                if max_position_embeddings is not None:
-                    generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
+            generation_config.max_length = generation_config.max_length + input_ids_length
+            max_position_embeddings = getattr(self.config, "max_position_embeddings", None)
+            if max_position_embeddings is not None:
+                generation_config.max_length = min(generation_config.max_length, max_position_embeddings)
         # same for min length
         if generation_config.min_new_tokens is not None:
@@ -1760,7 +1765,6 @@ class GenerationMixin(ContinuousMixin):
     def _prepare_generation_config(
         self,
         generation_config: GenerationConfig | None,
-        use_model_defaults: bool | None = None,
         **kwargs: Any,
     ) -> tuple[GenerationConfig, dict]:
         """
@@ -1768,93 +1772,57 @@ class GenerationMixin(ContinuousMixin):
         function handles retrocompatibility with respect to configuration files.
         """
         # parameterization priority:
-        # kwargs > non-global default values in `generation_config` > `model.generation_config` > GenerationConfig()
+        # user-defined kwargs or `generation_config` > `self.generation_config` > global default values
+        # TODO: (raushan) doesn't make sense to allow kwargs and `generation_config`. Should be mutually exclusive!
         # TODO (joao): per-model generation config classes.
-        using_model_generation_config = False
         if generation_config is None:
-            # legacy: users may modify the model configuration to control generation. To trigger this legacy behavior,
-            # the following conditions must be met
-            # 1) the generation config must have been created from the model config (`_from_model_config` field);
-            # 2) the generation config must have seen no modification since its creation (the hash is the same);
-            # 3) there are non-default generation parameters in the model config.
-            # 4) the user must have set new generation parameters in the model config.
-            if (
-                self.generation_config._from_model_config  # 1)
-                and self.generation_config._original_object_hash == hash(self.generation_config)  # 2)
-                and len(self.config._get_non_default_generation_parameters()) > 0  # 3)
-            ):
-                new_generation_config = GenerationConfig.from_model_config(self.config)
-                if new_generation_config != self.generation_config:  # 4)
-                    raise ValueError(
-                        "You have modified the pretrained model configuration to control generation."
-                        " This strategy to control generation is not supported anymore. "
-                        " Please use and modify the model generation configuration (see"
-                        " https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
-                    )
-            generation_config = self.generation_config
-            using_model_generation_config = True
-            # Related to #40039: prior to this PR, models with sliding window attention were forced to have
-            # `cache_implementation="hybrid"` (the static sliding window cache). For these models, we now want to use
-            # the dynamic sliding window cache by default, so we UNSET `cache_implementation` if it is a default value.
-            # (if we're inside this branch, then it is because we're using default values from the Hub)
-            if generation_config.cache_implementation == "hybrid":
-                generation_config.cache_implementation = None
+            # Users may modify `model.config` to control generation. This is a legacy behavior and is not supported anymore
+            if len(self.config._get_generation_parameters()) > 0:
+                raise ValueError(
+                    "You have modified the pretrained model configuration to control generation "
+                    f"We detected the following values set - {self.config._get_generation_parameters()}. "
+                    "This strategy to control generation is not supported anymore. Please use and modify `model.generation_config` "
+                    "(see https://huggingface.co/docs/transformers/generation_strategies#default-text-generation-configuration )",
+                )
+            generation_config = GenerationConfig()
         # `torch.export.export` usually raises an exception if it is called
         # with ``strict=True``. deepcopy can only be processed if ``strict=False``.
         generation_config = copy.deepcopy(generation_config)
-        if not using_model_generation_config:
-            # If `generation_config` is provided:
-            # - `use_model_defaults`: let's fallback ALL default values to the model's generation config
-            # - otherwise: legacy behavior, let's just make sure we have the tokens defined
-            model_base_version = version.parse(version.parse(self.generation_config.transformers_version).base_version)
-            if use_model_defaults is True or (
-                use_model_defaults is None and model_base_version >= version.parse("4.50.0")
-            ):
-                modified_values = {}
-                global_default_generation_config = GenerationConfig()
-                model_generation_config = self.generation_config
-                # we iterate over the model's generation config: it may hold custom keys, which we'll want to copy
-                for key, model_gen_config_value in model_generation_config.__dict__.items():
-                    if key.startswith("_") or key == "transformers_version":  # metadata
-                        continue
-                    # Don't set `cache_implementation = 'hybrid'` from the model defaults, see #40135
-                    if key == "cache_implementation" and model_generation_config.cache_implementation == "hybrid":
-                        continue
-                    global_default_value = getattr(global_default_generation_config, key, None)
-                    custom_gen_config_value = getattr(generation_config, key, None)
-                    if (
-                        custom_gen_config_value == global_default_value
-                        and model_gen_config_value != global_default_value
-                    ):
-                        modified_values[key] = model_gen_config_value
-                        setattr(generation_config, key, model_gen_config_value)
-                # edge case: we may set `temperature=0.0` and `do_sample=False`, but the model defaults to
-                # `do_sample=True`
-                if generation_config.temperature == 0.0:
-                    generation_config.do_sample = False
-                if use_model_defaults is None and len(modified_values) > 0:
-                    logger.warning_once(
-                        f"`generation_config` default values have been modified to match model-specific defaults: "
-                        f"{modified_values}. If this is not desired, please set these values explicitly."
-                    )
-            else:
-                if generation_config.bos_token_id is None:
-                    generation_config.bos_token_id = self.generation_config.bos_token_id
-                if generation_config.eos_token_id is None:
-                    generation_config.eos_token_id = self.generation_config.eos_token_id
-                if generation_config.pad_token_id is None:
-                    generation_config.pad_token_id = self.generation_config.pad_token_id
-                if generation_config.decoder_start_token_id is None:
-                    generation_config.decoder_start_token_id = self.generation_config.decoder_start_token_id
-        # Finally, apply any passed kwargs
+        # First set values from the loaded `self.generation_config`, then set default values (BC)
+        # Do not update any values that aren't `None`, i.e. if set by users explicitly and passed
+        # to `generate()`. Thus the `defaults_only=True` is used
+        global_defaults = self.generation_config._get_default_generation_params()
+        generation_config.update(**self.generation_config.to_dict(), defaults_only=True)
+        generation_config.update(**global_defaults, defaults_only=True)
+        # Due to some values being boolean and not `None`, we need additional logic to overwrite
+        # them explicitly (`defaults_only=False`) on the condition that it's only a previous
+        # default value
+        default_generation_config = GenerationConfig()
+        generation_config.update(
+            **{
+                k: v
+                for k, v in self.generation_config.to_dict().items()
+                if isinstance(v, bool)
+                and hasattr(default_generation_config, k)
+                and getattr(generation_config, k, None) == getattr(default_generation_config, k)
+            }
+        )
+        # Finally, if there are any kwargs, update config with it -> highest priority at the end
         model_kwargs = generation_config.update(**kwargs)
-        # And keep in model_kwargs variable output controls
+        # Related to #40039: prior to this PR, models with sliding window attention were forced to have
+        # `cache_implementation="hybrid"` (the static sliding window cache). For these models, we now want to use
+        # the dynamic sliding window cache by default, so we UNSET `cache_implementation` if it is a default value.
+        # (if we're inside this branch, then it is because we're using default values from the Hub)
+        if generation_config.cache_implementation == "hybrid":
+            generation_config.cache_implementation = None
+        # Finally keep output_xxx args in `model_kwargs` so it can be passed to `forward`
         output_attentions = generation_config.output_attentions
         output_hidden_states = generation_config.output_hidden_states
         model_kwargs.update({"output_attentions": output_attentions} if output_attentions else {})
@@ -2211,8 +2179,10 @@ class GenerationMixin(ContinuousMixin):
                 "will be skipped."
             )
-            # Finally: if we can compile, disable tokenizers parallelism and check for FA2 + static cache
+        if can_compile:
+            # Finally: if we can compile, disable tokenizers parallelism
             os.environ["TOKENIZERS_PARALLELISM"] = "0"
             # If we use FA2 and a static cache, we cannot compile with fullgraph
             if self.config._attn_implementation == "flash_attention_2":
                 # only raise warning if the user passed an explicit compile-config
@@ -2225,6 +2195,22 @@ class GenerationMixin(ContinuousMixin):
         return can_compile
+    @contextmanager
+    def _optimize_model_for_decode(self):
+        original_experts_implementation = self.config._experts_implementation
+        if original_experts_implementation == "grouped_mm":
+            logger.info_once(
+                "We will be switching to 'batched_mm' for the decoding stage as it is much more performant than 'grouped_mm' on smaller inputs. "
+                "If you experience any issues with this, please open an issue on the Hugging Face Transformers GitHub repository.",
+            )
+            self.set_experts_implementation("batched_mm")
+        try:
+            yield
+        finally:
+            if original_experts_implementation == "grouped_mm":
+                self.set_experts_implementation(original_experts_implementation)
     def _get_deprecated_gen_repo(
         self,
         generation_mode: GenerationMode,
@@ -2294,7 +2280,6 @@ class GenerationMixin(ContinuousMixin):
         streamer: Optional["BaseStreamer"] = None,
         negative_prompt_ids: torch.Tensor | None = None,
         negative_prompt_attention_mask: torch.Tensor | None = None,
-        use_model_defaults: bool | None = None,
         custom_generate: str | Callable | None = None,
         **kwargs,
     ) -> GenerateOutput | torch.LongTensor:
@@ -2360,11 +2345,6 @@ class GenerationMixin(ContinuousMixin):
                 size. This is an experimental feature, subject to breaking API changes in future versions.
             negative_prompt_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                 Attention_mask for `negative_prompt_ids`.
-            use_model_defaults (`bool`, *optional*):
-                When it is `True`, unset parameters in `generation_config` will be set to the model-specific default
-                generation configuration (`model.generation_config`), as opposed to the global defaults
-                (`GenerationConfig()`). If unset, models saved starting from `v4.50` will consider this flag to be
-                `True`.
             custom_generate (`str` or `Callable`, *optional*):
                 One of the following:
                 - `str` (Hugging Face Hub repository name): runs the custom `generate` function defined at
@@ -2474,7 +2454,7 @@ class GenerationMixin(ContinuousMixin):
             # switch to CB
             outputs = self.generate_batch(
                 inputs=inputs,
-                generation_config=self._prepare_generation_config(generation_config, use_model_defaults, **kwargs)[0],
+                generation_config=self._prepare_generation_config(generation_config, **kwargs)[0],
                 **kwargs,
             )
             sequences = [
@@ -2495,9 +2475,15 @@ class GenerationMixin(ContinuousMixin):
             streamer,
         )
-        generation_config, model_kwargs = self._prepare_generation_config(
-            generation_config, use_model_defaults, **kwargs
+        # Check length values before updating the config with defaults. We'll use it later to define the final min/max length (# 6)
+        has_default_max_length = kwargs.get("max_length") is None and (
+            generation_config is None or generation_config.max_length is None
         )
+        has_default_min_length = kwargs.get("min_length") is None and (
+            generation_config is None or generation_config.min_length is None
+        )
+        generation_config, model_kwargs = self._prepare_generation_config(generation_config, **kwargs)
         generation_mode = generation_config.get_generation_mode(assistant_model)
         if isinstance(custom_generate, Callable):
             decoding_method = custom_generate
@@ -2523,7 +2509,6 @@ class GenerationMixin(ContinuousMixin):
                 assistant_model=assistant_model,
                 negative_prompt_ids=negative_prompt_ids,
                 negative_prompt_attention_mask=negative_prompt_attention_mask,
-                use_model_defaults=use_model_defaults,
                 custom_generate=deprecated_mode_repo,
                 trust_remote_code=trust_remote_code,
                 **generation_mode_kwargs,
@@ -2614,8 +2599,6 @@ class GenerationMixin(ContinuousMixin):
         # 6. Prepare `max_length` depending on other stopping criteria.
         input_ids_length = input_ids.shape[1]
-        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
-        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
         generation_config = self._prepare_generated_length(
             generation_config=generation_config,
             has_default_max_length=has_default_max_length,
@@ -2873,13 +2856,20 @@ class GenerationMixin(ContinuousMixin):
             else self.__call__
         )
-        prefill_consumed = False
-        outputs = self._prefill(input_ids, generation_config, model_kwargs)
+        # Assisted generation completes the prefill stage in candidate generator so that
+        # we don't have several `prefill` calls in one generation loop. Skip `_prefill` for assistants
+        if not generation_config.is_assistant:
+            outputs = self._prefill(input_ids, generation_config, model_kwargs)
+            prefill_consumed = False
+        else:
+            model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
+            prefill_consumed = True
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
             if prefill_consumed:
                 model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
-                outputs = model_forward(**model_inputs, return_dict=True)
+                with self._optimize_model_for_decode():
+                    outputs = model_forward(**model_inputs, return_dict=True)
             prefill_consumed = True
             model_kwargs = self._update_model_kwargs_for_generation(
                 outputs,
@@ -3351,9 +3341,15 @@ class GenerationMixin(ContinuousMixin):
         )
         beam_indices = running_beam_indices.detach().clone()
-        prefill_consumed = False
         flat_running_sequences = input_ids
-        model_outputs = self._prefill(input_ids, generation_config, model_kwargs)
+        # Assisted generation completes the prefill stage in candidate generator so that
+        # we don't have several `prefill` calls in one generation loop. Skip `_prefill` for assistants
+        if not generation_config.is_assistant:
+            model_outputs = self._prefill(input_ids, generation_config, model_kwargs)
+            prefill_consumed = False
+        else:
+            model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
+            prefill_consumed = True
         # 4. run the generation loop
         while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
@@ -3659,7 +3655,7 @@ class GenerationMixin(ContinuousMixin):
             cur_len = input_ids.shape[1]
             #  1. Fetch candidate sequences from a `CandidateGenerator` and move to the correct device
-            candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids)
+            candidate_input_ids, candidate_logits = candidate_generator.get_candidates(input_ids, is_first_iteration)
             candidate_input_ids = candidate_input_ids.to(self.device)
             if candidate_logits is not None:
                 candidate_logits = candidate_logits.to(self.device)
@@ -3686,7 +3682,9 @@ class GenerationMixin(ContinuousMixin):
                     dim=0,
                 )
-            model_inputs = self.prepare_inputs_for_generation(candidate_input_ids, **candidate_kwargs)
+            model_inputs = self.prepare_inputs_for_generation(
+                candidate_input_ids, is_first_iteration=is_first_iteration, **candidate_kwargs
+            )
             if "logits_to_keep" in model_inputs:
                 model_inputs["logits_to_keep"] = candidate_length + 1
@@ -3849,7 +3847,7 @@ class GenerationMixin(ContinuousMixin):
     def _prefill(self, input_ids: torch.LongTensor, generation_config: GenerationConfig, model_kwargs):
         if generation_config.prefill_chunk_size is None:
             model_kwargs = self._get_initial_cache_position(input_ids.shape[1], input_ids.device, model_kwargs)
-            model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
+            model_inputs = self.prepare_inputs_for_generation(input_ids, is_first_iteration=True, **model_kwargs)
             return self(**model_inputs, return_dict=True)
         else:  # Chunked prefill
             # Even if we are not compiling the forward, flex is always compiled when used. With chunked prefill, we may

transformers/generation/watermarking.py CHANGED Viewed

@@ -16,7 +16,7 @@
 import collections
 from dataclasses import dataclass
 from functools import lru_cache
-from typing import Any
+from typing import TYPE_CHECKING, Any, Optional, Union
 import numpy as np
 import torch
@@ -24,12 +24,15 @@ from torch import nn
 from torch.nn import BCELoss
 from .. import initialization as init
+from ..configuration_utils import PreTrainedConfig
 from ..modeling_utils import PreTrainedModel
 from ..utils import ModelOutput, logging
-from .configuration_utils import PreTrainedConfig, WatermarkingConfig
 from .logits_process import SynthIDTextWatermarkLogitsProcessor, WatermarkLogitsProcessor
+if TYPE_CHECKING:
+    from .configuration_utils import WatermarkingConfig
 logger = logging.get_logger(__name__)
@@ -120,13 +123,13 @@ class WatermarkDetector:
     def __init__(
         self,
-        model_config: PreTrainedConfig,
+        model_config: "PreTrainedConfig",
         device: str,
-        watermarking_config: WatermarkingConfig | dict,
+        watermarking_config: Optional[Union["WatermarkingConfig", dict]],
         ignore_repeated_ngrams: bool = False,
         max_cache_size: int = 128,
     ):
-        if isinstance(watermarking_config, WatermarkingConfig):
+        if not isinstance(watermarking_config, dict):
             watermarking_config = watermarking_config.to_dict()
         self.bos_token_id = (

transformers/image_processing_base.py CHANGED Viewed

@@ -71,8 +71,8 @@ class ImageProcessingMixin(PushToHubMixin):
         # This key was saved while we still used `XXXFeatureExtractor` for image processing. Now we use
         # `XXXImageProcessor`, this attribute and its value are misleading.
         kwargs.pop("feature_extractor_type", None)
-        # Pop "processor_class" as it should be saved as private attribute
-        self._processor_class = kwargs.pop("processor_class", None)
+        # Pop "processor_class", should not be saved with image processing config anymore
+        kwargs.pop("processor_class", None)
         # Additional attributes without default values
         for key, value in kwargs.items():
             try:
@@ -81,10 +81,6 @@ class ImageProcessingMixin(PushToHubMixin):
                 logger.error(f"Can't set {key} with value {value} for {self}")
                 raise err
-    def _set_processor_class(self, processor_class: str):
-        """Sets processor class as an attribute."""
-        self._processor_class = processor_class
     @classmethod
     def from_pretrained(
         cls: type[ImageProcessorType],
@@ -428,12 +424,6 @@ class ImageProcessingMixin(PushToHubMixin):
             if isinstance(value, np.ndarray):
                 dictionary[key] = value.tolist()
-        # make sure private name "_processor_class" is correctly
-        # saved as "processor_class"
-        _processor_class = dictionary.pop("_processor_class", None)
-        if _processor_class is not None:
-            dictionary["processor_class"] = _processor_class
         return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
     def to_json_file(self, json_file_path: Union[str, os.PathLike]):

transformers/image_processing_utils_fast.py CHANGED Viewed

@@ -932,11 +932,22 @@ class BaseImageProcessorFast(BaseImageProcessor):
         if do_pad:
             processed_images = self.pad(processed_images, pad_size=pad_size, disable_grouping=disable_grouping)
-        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
         return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
     def to_dict(self):
         encoder_dict = super().to_dict()
-        encoder_dict.pop("_valid_processor_keys", None)
-        encoder_dict.pop("_valid_kwargs_names", None)
-        return encoder_dict
+        # Filter out None values that are class defaults, but preserve explicitly set None values
+        filtered_dict = {}
+        for key, value in encoder_dict.items():
+            if value is None:
+                class_default = getattr(type(self), key, "NOT_FOUND")
+                # Keep None if user explicitly set it (class default is non-None)
+                if class_default != "NOT_FOUND" and class_default is not None:
+                    filtered_dict[key] = value
+            else:
+                filtered_dict[key] = value
+        filtered_dict.pop("_valid_processor_keys", None)
+        filtered_dict.pop("_valid_kwargs_names", None)
+        return filtered_dict

transformers/initialization.py CHANGED Viewed

@@ -206,3 +206,40 @@ def guard_torch_init_functions():
         for module, functions in originals.items():
             for func_name, func in functions.items():
                 setattr(module, func_name, func)
+@contextmanager
+def no_init_weights():
+    """
+    Disable weight initialization both at the torch-level, and at the transformers-level (`init_weights`).
+    This is used to speed-up initializing an empty model with deepspeed, as we do not initialize the model on meta device
+    with deepspeed, but we still don't need to run expensive weight initializations as we are loading params afterwards.
+    """
+    from .modeling_utils import PreTrainedModel
+    def empty_func(*args, **kwargs):
+        pass
+    originals = defaultdict(dict)
+    try:
+        # Replace all torch funcs by empty ones
+        for module_name in TORCH_MODULES_TO_PATCH:
+            if module_name in sys.modules:
+                module = sys.modules[module_name]
+                for func_name in TORCH_INIT_FUNCTIONS.keys():
+                    if hasattr(module, func_name):
+                        originals[module][func_name] = getattr(module, func_name)
+                        setattr(module, func_name, empty_func)
+        # Also patch our own `init_weights`
+        original_init_weights = PreTrainedModel.init_weights
+        PreTrainedModel.init_weights = empty_func
+        yield
+    finally:
+        # Set back the original torch functions on all modules
+        for module, functions in originals.items():
+            for func_name, func in functions.items():
+                setattr(module, func_name, func)
+        # Set back `init_weights`
+        PreTrainedModel.init_weights = original_init_weights

transformers/integrations/__init__.py CHANGED Viewed

@@ -69,6 +69,7 @@ _import_structure = {
     "hqq": ["prepare_for_hqq_linear"],
     "hub_kernels": [
         "LayerRepository",
+        "lazy_load_kernel",
         "register_kernel_mapping",
         "replace_kernel_forward_from_hub",
         "use_kernel_forward_from_hub",
@@ -116,6 +117,11 @@ _import_structure = {
         "run_hp_search_ray",
         "run_hp_search_wandb",
     ],
+    "moe": [
+        "batched_mm_experts_forward",
+        "grouped_mm_experts_forward",
+        "use_experts_implementation",
+    ],
     "mxfp4": [
         "Mxfp4GptOssExperts",
         "convert_moe_packed_tensors",
@@ -211,6 +217,7 @@ if TYPE_CHECKING:
     from .hqq import prepare_for_hqq_linear
     from .hub_kernels import (
         LayerRepository,
+        lazy_load_kernel,
         register_kernel_mapping,
         replace_kernel_forward_from_hub,
         use_kernel_forward_from_hub,
@@ -258,6 +265,11 @@ if TYPE_CHECKING:
         run_hp_search_ray,
         run_hp_search_wandb,
     )
+    from .moe import (
+        batched_mm_experts_forward,
+        grouped_mm_experts_forward,
+        use_experts_implementation,
+    )
     from .mxfp4 import (
         Mxfp4GptOssExperts,
         dequantize,

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl

transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl