transformers-5.0.0rc1-py3-none-any.whl → transformers-5.0.0rc2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/core_model_loading.py:

```diff
@@ -16,6 +16,7 @@
 
 from __future__ import annotations
 
+import math
 import os
 import re
 from abc import abstractmethod
@@ -25,11 +26,12 @@ from concurrent.futures import Future, ThreadPoolExecutor
 from contextlib import contextmanager
 from copy import deepcopy
 from dataclasses import dataclass, field
+from itertools import chain
 from typing import TYPE_CHECKING, Any, Optional, Union
 
 import torch
 
-from .integrations.accelerate import offload_weight
+from .integrations.accelerate import get_device, offload_weight
 from .integrations.tensor_parallel import ALL_PARALLEL_STYLES
 from .utils import is_env_variable_true, is_torch_greater_or_equal, logging
 
```
```diff
@@ -278,6 +280,166 @@ class PermuteForRope(ConversionOps):
         return output
 
 
+class ErnieFuseAndSplitTextVisionExperts(ConversionOps):
+    r"""
+    Special operation that splits a module list over all keys and fuses over the number of original modules.
+
+    Example with 2 original modules "Gate" and "Up" with 2 target keys "Text" and "Vision":
+
+             ModuleList 1                     ModuleList 2
+             [   Gate   ]                     [    Up    ]
+              |        |                       |        |
+      [Gate_Text]  [Gate_Vision]       [Up_Text]  [Up_Vision]
+            \            \              /            /
+             \            \            /            /
+              \            /          \            /
+               \          /            \          /
+            [GateUp_Text]            [GateUp_Vision]
+
+    The splits are equal and are defined by the amount of target keys.
+    The final fusions are defined by the amount of original module lists.
+    """
+
+    def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
+        self.stack_dim = stack_dim
+        self.concat_dim = concat_dim
+
+    def split_list_into_chunks(self, tensor_list: list[torch.Tensor], chunks: int = 2):
+        split_size = math.ceil(len(tensor_list) / chunks)  # best effort split size
+        return [tensor_list[i * split_size : (i + 1) * split_size] for i in range(chunks)]
+
+    @torch.no_grad()
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        config,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        valid_keys = input_dict.keys()
+        split_and_fused = defaultdict(list)
+        for key in source_patterns:
+            if key not in valid_keys:
+                raise ValueError(
+                    f"Expected pattern {key} in collected tensors but only found tensors for: {valid_keys}"
+                )
+
+            tensors = input_dict.get(key, [])
+            split_tensor_lists = self.split_list_into_chunks(tensors, chunks=len(target_patterns))
+            stacked_tensors = (torch.stack(tensor_group, dim=self.stack_dim) for tensor_group in split_tensor_lists)
+            for idx, tensor_group in enumerate(stacked_tensors):
+                split_and_fused[target_patterns[idx]].append(tensor_group)
+
+        for k, v in split_and_fused.items():
+            split_and_fused[k] = torch.cat(v, dim=self.concat_dim)
+
+        return split_and_fused
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        return ErnieSplitAndDecoupleTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
+
+
+class ErnieSplitAndDecoupleTextVisionExperts(ConversionOps):
+    r"""
+    Special operation that splits a fused module list over all original modules and
+    then decouples them into a mixed module list each over all keys.
+
+    Example with 2 original modules "Gate" and "Up" with 2 target keys "Text" and "Vision":
+
+            [GateUp_Text]            [GateUp_Vision]
+               /          \            /          \
+              /            \          /            \
+             /            /          \              \
+            /            /            \              \
+      [Gate_Text]  [Gate_Vision]       [Up_Text]  [Up_Vision]
+              |        |                       |        |
+             [   Gate   ]                     [    Up    ]
+             ModuleList 1                     ModuleList 2
+
+    The splits are equal and are defined by the amount of original module lists.
+    The final decoupled module lists are defined by the amount of keys.
+    """
+
+    def __init__(self, stack_dim: int = 0, concat_dim: int = 1):
+        self.stack_dim = stack_dim
+        self.concat_dim = concat_dim
+
+    @torch.no_grad()
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        config,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        fused_modules = len(target_patterns)
+        valid_keys = input_dict.keys()
+        split_tensors = []
+        for key in source_patterns:
+            if key not in valid_keys:
+                raise ValueError(
+                    f"Expected pattern {key} in collected tensors but only found tensors for: {valid_keys}"
+                )
+
+            # Assuming that we get single sized lists here to index with 0
+            split_tensors.append(input_dict[key][0].chunk(fused_modules, dim=self.concat_dim))
+
+        decoupled = {}
+        for idx, key in enumerate(target_patterns):
+            tensor_groups = [
+                list(torch.unbind(tensor_group[idx], dim=self.stack_dim)) for tensor_group in split_tensors
+            ]
+            tensor_list = list(chain.from_iterable(tensor_groups))
+            targets = [key.replace("*", f"{i}") for i in range(len(tensor_list))]
+            decoupled |= dict(zip(targets, tensor_list))
+
+        return decoupled
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        return ErnieFuseAndSplitTextVisionExperts(stack_dim=self.stack_dim, concat_dim=self.concat_dim)
+
+
+class Transpose(ConversionOps):
+    """
+    Transposes the given tensor along dim0 and dim1.
+    """
+
+    def __init__(self, dim0: int = 0, dim1: int = 1):
+        self.dim0 = dim0
+        self.dim1 = dim1
+
+    @torch.no_grad()
+    def convert(
+        self,
+        input_dict: dict[str, list[torch.Tensor]],
+        source_patterns: list[str],
+        target_patterns: list[str],
+        config,
+        **kwargs,
+    ) -> dict[str, list[torch.Tensor]]:
+        if len(input_dict) != len(target_patterns):
+            raise ValueError(
+                f"Transpose conversion can only happen on each key ({len(input_dict)}) "
+                f"and should match exact one target ({len(target_patterns)})."
+            )
+
+        output: dict[str, list[torch.Tensor]] = {}
+        for key, target_pattern in zip(input_dict.keys(), target_patterns):
+            tensor = input_dict.get(key, [])
+            if len(tensor) != 1:
+                raise ValueError(f"Transpose conversion requires exactly one tensor, found {len(tensor)}.")
+            output[target_pattern] = torch.transpose(tensor[0], dim0=self.dim0, dim1=self.dim1).contiguous()
+        return output
+
+    @property
+    def reverse_op(self) -> ConversionOps:
+        return Transpose(dim0=self.dim1, dim1=self.dim0)
+
+
 @dataclass(slots=True)
 class WeightTransform:
     source_patterns: Union[str, list[str]] = field(init=True)
```
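To make the fuse-and-split operation above concrete, here is a minimal standalone sketch of what `ErnieFuseAndSplitTextVisionExperts.convert` computes for two source projections and two target keys. It reimplements the flow with plain torch, outside the `ConversionOps` machinery; the key names and tensor shapes are invented for the example.

```python
import math
from collections import defaultdict

import torch


def fuse_and_split(input_dict, source_patterns, target_patterns, stack_dim=0, concat_dim=1):
    # Same flow as the op above: split each source's expert list evenly across
    # the target keys, stack each split, then fuse the sources along concat_dim.
    split_and_fused = defaultdict(list)
    for key in source_patterns:
        tensors = input_dict[key]
        split_size = math.ceil(len(tensors) / len(target_patterns))
        chunks = [tensors[i * split_size : (i + 1) * split_size] for i in range(len(target_patterns))]
        for idx, group in enumerate(chunks):
            split_and_fused[target_patterns[idx]].append(torch.stack(group, dim=stack_dim))
    return {k: torch.cat(v, dim=concat_dim) for k, v in split_and_fused.items()}


# 4 experts per projection: the first half becomes "text", the second half "vision".
experts = {
    "gate": [torch.randn(8, 16) for _ in range(4)],
    "up": [torch.randn(8, 16) for _ in range(4)],
}
fused = fuse_and_split(experts, ["gate", "up"], ["text", "vision"])
print(fused["text"].shape)  # torch.Size([2, 16, 16]): 2 stacked experts, gate/up fused on dim 1
```

`ErnieSplitAndDecoupleTextVisionExperts` is the exact inverse: chunk along `concat_dim`, unbind along `stack_dim`, and reassemble the per-expert lists.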
```diff
@@ -302,8 +464,11 @@ class WeightTransform:
         for i, pattern in enumerate(self.target_patterns):
             # Some mapping contains `^` to notify start of string when matching -> remove it during reverse mapping
             pattern = pattern.removeprefix("^")
-            #
-            pattern =
+            # Some mapping contains `$` to notify end of string when matching -> remove it during reverse mapping
+            pattern = pattern.removesuffix("$")
+            # Remove negative lookahead/behind if any. This is ugly but needed for reverse mapping of
+            # Qwen2.5, Sam3, Ernie4.5 VL MoE!
+            pattern = re.sub(r"\(\?.+\)", "", pattern)
             # Allow capturing groups in patterns, i.e. to add/remove a prefix to all keys (e.g. timm_wrapper, sam3)
             if r"(.+)" in pattern:
                 pattern = pattern.replace(r"(.+)", r"\1")
```
```diff
@@ -338,19 +503,19 @@ class WeightTransform:
         match_object = self.compiled_sources.search(source_key)
         if match_object is None:
             return source_key, None
+
         # Find the source that produced the match (it's the first group that matched, as the search stops after first branch match)
         matching_group_name = next(name for name, val in match_object.groupdict().items() if val is not None)
         source_pattern_that_matched = self.source_patterns[int(matching_group_name[1:])]
         # If we matched, we always replace with the first target pattern, in case we have several (one to many transform)
         replacement = self.target_patterns[0]
-        #
+        # Allow capturing groups in patterns, i.e. to add a prefix to all keys (e.g. timm_wrapper, sam3)
         if r"\1" in replacement:
             # The index of the internal group we need to replace is the index of the matched named group as it comes
             # inside that matched named group
             replaced_group_idx = self.compiled_sources.groupindex[matching_group_name] + 1
             replacement = replacement.replace(r"\1", match_object.group(replaced_group_idx))
         renamed_key = source_key.replace(match_object.group(0), replacement)
-
         return renamed_key, source_pattern_that_matched
 
     def reverse_transform(self) -> WeightTransform:
```
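The pattern cleanup in the two hunks above is plain regex-on-regex manipulation: strip the `^`/`$` anchors and any lookahead group so a forward-mapping pattern can be reused for reverse mapping. A small sketch with a hypothetical target pattern (the pattern string is invented, not one of the library's real mappings):

```python
import re

# Hypothetical target pattern with anchors and a negative lookahead:
pattern = r"^model\.layers\.(?!.*vision).+\.gate_proj$"

pattern = pattern.removeprefix("^")  # drop the start-of-string anchor
pattern = pattern.removesuffix("$")  # drop the end-of-string anchor
pattern = re.sub(r"\(\?.+\)", "", pattern)  # strip the lookahead/lookbehind group

print(pattern)  # model\.layers\..+\.gate_proj
```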
```diff
@@ -437,6 +602,13 @@ class WeightRenaming(WeightTransform):
         return collected_tensors, conversion_errors
 
 
+# List of classes that are known to be able to use m:n
+_INTERNAL_MANY_TO_MANY_CONVERSIONS = (
+    ErnieFuseAndSplitTextVisionExperts,
+    ErnieSplitAndDecoupleTextVisionExperts,
+)
+
+
 @dataclass(slots=True)
 class WeightConverter(WeightTransform):
     operations: list[ConversionOps] = field(default_factory=list, repr=False)
```
```diff
@@ -444,9 +616,11 @@ class WeightConverter(WeightTransform):
     def __post_init__(self):
         WeightTransform.__post_init__(self)
         if bool(len(self.source_patterns) - 1) + bool(len(self.target_patterns) - 1) >= 2:
-            raise ValueError(
-                f"source keys={self.source_patterns}, target_patterns={self.target_patterns} but you can only have one to many, one to one or many to one."
-            )
+            # We allow many-to-many only if we use an internal operation that can handle it
+            if not any(isinstance(op, _INTERNAL_MANY_TO_MANY_CONVERSIONS) for op in self.operations):
+                raise ValueError(
+                    f"source keys={self.source_patterns}, target_patterns={self.target_patterns} but you can only have one to many, one to one or many to one."
+                )
         if not self.operations:
             raise ValueError("WeightConverter requires at least one operation.")
 
```
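The relaxed `__post_init__` rule above can be restated outside the dataclass: a converter may be one-to-one, one-to-many, or many-to-one, and many-to-many is accepted only when one of the listed internal operations is attached. A standalone sketch of that check (the function name is invented):

```python
def is_valid_converter(source_patterns, target_patterns, has_many_to_many_op):
    # Mirrors the rule above: N:M mappings need an op that knows how to handle them.
    many_to_many = len(source_patterns) > 1 and len(target_patterns) > 1
    return (not many_to_many) or has_many_to_many_op


print(is_valid_converter(["a"], ["b", "c"], False))       # True: one-to-many is always fine
print(is_valid_converter(["a", "b"], ["c", "d"], False))  # False: many-to-many without an internal op
print(is_valid_converter(["a", "b"], ["c", "d"], True))   # True: e.g. ErnieFuseAndSplitTextVisionExperts
```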
```diff
@@ -538,13 +712,13 @@ def spawn_materialize(
 
 
 def spawn_tp_materialize(
-    thread_pool: ThreadPoolExecutor | None, tensor: torch.Tensor, sharding_method, tensor_idx, dtype=None
+    thread_pool: ThreadPoolExecutor | None, tensor: torch.Tensor, sharding_method, tensor_idx, device=None, dtype=None
 ) -> Future | Callable:
     """Materialize and shard a tensor (according to the TP-plan) from file asynchronously if `thread_pool` is provided, or
     return a Callable that will load the tensor synchronously when called."""
 
     def _job():
-        return sharding_method.shard_tensor(tensor, tensor_idx=tensor_idx, dtype=dtype)
+        return sharding_method.shard_tensor(tensor, tensor_idx=tensor_idx, device=device, dtype=dtype)
 
     if thread_pool is not None:
         return thread_pool.submit(_job)
```
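`spawn_materialize` and `spawn_tp_materialize` share a small concurrency idiom: the same zero-argument job is either submitted to a pool (returning a `Future`) or handed back as a callable for synchronous execution. A generic, self-contained sketch of the pattern (names are illustrative, not the library's API):

```python
from concurrent.futures import Future, ThreadPoolExecutor
from typing import Callable, Optional, Union


def spawn(thread_pool: Optional[ThreadPoolExecutor], load_fn: Callable[[], object]) -> Union[Future, Callable]:
    if thread_pool is not None:
        # Asynchronous path: schedule now, caller resolves with .result().
        return thread_pool.submit(load_fn)
    # Synchronous path: hand the callable back; the caller invokes it itself.
    return load_fn


with ThreadPoolExecutor(max_workers=4) as pool:
    job = spawn(pool, lambda: sum(range(10)))
    print(job.result())  # 45, computed on a worker thread

job = spawn(None, lambda: sum(range(10)))
print(job())  # 45, computed synchronously at call time
```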
```diff
@@ -622,20 +796,17 @@ def set_param_for_module(
     if ref is None:
         unexpected_keys.add(target_name)
     else:
-        use_dtensor = hasattr(distributed_operation, "use_dtensor") and distributed_operation.use_dtensor
         if not isinstance(param_value, torch.nn.Parameter):
             if distributed_operation is not None:
-                param_value = DTensor.from_local(
-                    param_value,
-                    distributed_operation.device_mesh,
-                    getattr(distributed_operation, "shard", Replicate()),
-                    run_check=False,
-                    shape=ref.size(),
-                    stride=ref.stride(),
-                )
-                if not use_dtensor:
-                    # we convert to local
-                    param_value = param_value.to_local()
+                if getattr(distributed_operation, "use_dtensor", False):
+                    param_value = DTensor.from_local(
+                        param_value,
+                        distributed_operation.device_mesh,
+                        getattr(distributed_operation, "shard", Replicate()),
+                        run_check=False,
+                        shape=ref.size(),
+                        stride=ref.stride(),
+                    )
         if param_name not in module_obj._buffers:
             param_value = torch.nn.Parameter(param_value, requires_grad=param_value.is_floating_point())
 
```
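The replacement above only builds a `DTensor` when the parallel style asks for one, via `DTensor.from_local(..., run_check=False, shape=..., stride=...)`, which wraps an already-sharded local tensor without any collective communication. A single-process sketch of that call, assuming a recent PyTorch where `torch.distributed.tensor` is public; the one-device gloo mesh exists only to make the snippet self-contained:

```python
import torch
import torch.distributed as dist
from torch.distributed.device_mesh import init_device_mesh
from torch.distributed.tensor import DTensor, Shard

# Single-process bootstrap so the sketch runs standalone (world_size=1).
dist.init_process_group("gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
mesh = init_device_mesh("cpu", (1,))

local_shard = torch.randn(4, 8)  # this rank's shard of the global weight
param = DTensor.from_local(
    local_shard,
    mesh,
    [Shard(0)],       # sharded along dim 0 across the mesh
    run_check=False,  # trust the caller: skip cross-rank metadata verification
    shape=(4, 8),     # global shape (equals the local one when world_size=1)
    stride=(8, 1),
)
print(param.shape)  # torch.Size([4, 8])
dist.destroy_process_group()
```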
```diff
@@ -725,6 +896,7 @@ def convert_and_load_state_dict_in_model(
     device_mesh: torch.distributed.device_mesh.DeviceMesh | None = None,
     disk_offload_index: dict | None = None,
     disk_offload_folder: str | None = None,
+    offload_buffers: bool = False,
 ):
     r"""
     We build a mapping from the keys obtained by renaming each of the checkpoint keys according to the weight_mapping rules.
```
```diff
@@ -815,15 +987,12 @@ def convert_and_load_state_dict_in_model(
     prefix = model.base_model_prefix
     tp_plan = tp_plan or {}
     device_map = device_map or {"": "cpu"}
-    # Here, we first sort by number of submodules, then length of the full string, to make sure to match correctly
-    device_map_regex = re.compile(
-        "|".join(rf"({k})" for k in sorted(device_map.keys(), key=lambda x: (x.count("."), len(x)), reverse=True))
-    )
     dtype_plan = dtype_plan or {}
     weight_mapping = weight_mapping or []
     meta_model_state_dict = model.state_dict()
-
+    model_buffers = {k for k, _ in model.named_buffers()}
 
+    missing_keys = set(meta_model_state_dict.keys())
     conversion_errors = {}
     mismatch_keys = set()
     unexpected_keys = set()
```
```diff
@@ -897,7 +1066,7 @@ def convert_and_load_state_dict_in_model(
                 if getattr(mapping, "distributed_operation", None) is None:
                     tp_layer = ALL_PARALLEL_STYLES[model.tp_plan[matched_tp_pattern]].__class__
                     mapping.distributed_operation = tp_layer(
-                        device_mesh=device_mesh, rank=
+                        device_mesh=device_mesh, rank=device_mesh.get_local_rank(), empty_param=empty_param.clone()
                     )
                 shard_index = len(mapping.collected_tensors.get(original_key, []))
                 future_or_tensor = spawn_tp_materialize(
```
@@ -905,14 +1074,12 @@ def convert_and_load_state_dict_in_model(
                 tensor,
                 mapping.distributed_operation,
                 shard_index,
+                device_map[""],
                 _dtype,
             )
 
             if future_or_tensor is None:
-
-                param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
-                # If disk, we need to materialize on cpu first
-                param_device = "cpu" if param_device == "disk" else param_device
+                param_device = get_device(device_map, renamed_key, valid_torch_device=True)
                 future_or_tensor = spawn_materialize(thread_pool, tensor, param_device, _dtype)
 
             mapping.add_tensor(renamed_key, original_key, source_pattern, future_or_tensor)
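The ad-hoc regex matching against `device_map` (removed above) is replaced by a `get_device` helper whose implementation is not shown in this diff. The sketch below is an assumption, not the library's code; it only captures the behavior the call sites rely on: resolve a parameter name by longest matching module prefix, and remap "disk" to "cpu" when a real torch device is required for materialization.

def get_device_sketch(device_map: dict, param_name: str, valid_torch_device: bool = False):
    # Prefer the most specific entry: more submodule dots first, then longer names.
    for key in sorted(device_map, key=lambda k: (k.count("."), len(k)), reverse=True):
        if key == "" or param_name == key or param_name.startswith(key + "."):
            device = device_map[key]
            break
    else:
        device = device_map.get("", "cpu")
    # Disk-offloaded params must be materialized on CPU first.
    if valid_torch_device and device == "disk":
        device = "cpu"
    return device

assert get_device_sketch({"": "cpu", "model.layers.0": "disk"}, "model.layers.0.q_proj.weight", True) == "cpu"
assert get_device_sketch({"": "cpu", "model.layers.0": "disk"}, "model.layers.1.q_proj.weight") == "cpu"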
@@ -941,10 +1108,9 @@ def convert_and_load_state_dict_in_model(
             )
             for target_name, param in realized_value.items():
                 param = param[0] if isinstance(param, list) else param
-
-                param_device = device_map[device_match.group()] if device_match else device_map.get("", "cpu")
+                param_device = get_device(device_map, target_name)
                 # Offloading support
-                if param_device == "disk":
+                if param_device == "disk" and (target_name not in model_buffers or offload_buffers):
                     disk_offload_index = offload_and_maybe_resave_param(
                         target_name, param, missing_keys, disk_offload_folder, disk_offload_index, mapping
                     )
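The extra condition means a registered buffer mapped to "disk" is only written out when the new `offload_buffers` flag is set; otherwise it stays in memory, since buffers are typically small and needed eagerly. A toy illustration of the guard (the buffer name is hypothetical):

model_buffers = {"model.rotary_emb.inv_freq"}  # hypothetical buffer name
offload_buffers = False

def should_offload(target_name: str, param_device: str) -> bool:
    # Mirrors the new guard: buffers are only offloaded when explicitly requested.
    return param_device == "disk" and (target_name not in model_buffers or offload_buffers)

assert not should_offload("model.rotary_emb.inv_freq", "disk")      # buffer kept in memory
assert should_offload("model.layers.0.mlp.up_proj.weight", "disk")  # weight goes to disk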
transformers/dependency_versions_table.py
@@ -75,7 +75,7 @@ deps = {
     "tensorboard": "tensorboard",
     "timeout-decorator": "timeout-decorator",
     "tiktoken": "tiktoken",
-    "timm": "timm
+    "timm": "timm>=1.0.23",
     "tokenizers": "tokenizers>=0.22.0,<=0.23.0",
     "torch": "torch>=2.2",
     "torchaudio": "torchaudio",
transformers/feature_extraction_utils.py
@@ -30,6 +30,7 @@ from .utils import (
     PROCESSOR_NAME,
     PushToHubMixin,
     TensorType,
+    _is_tensor_or_array_like,
     copy_func,
     is_numpy_array,
     is_torch_available,
@@ -67,11 +68,18 @@ class BatchFeature(UserDict):
         tensor_type (`Union[None, str, TensorType]`, *optional*):
             You can give a tensor_type here to convert the lists of integers in PyTorch/Numpy Tensors at
             initialization.
+        skip_tensor_conversion (`list[str]` or `set[str]`, *optional*):
+            List or set of keys that should NOT be converted to tensors, even when `tensor_type` is specified.
     """
 
-    def __init__(
+    def __init__(
+        self,
+        data: Optional[dict[str, Any]] = None,
+        tensor_type: Union[None, str, TensorType] = None,
+        skip_tensor_conversion: Optional[Union[list[str], set[str]]] = None,
+    ):
         super().__init__(data)
-        self.convert_to_tensors(tensor_type=tensor_type)
+        self.convert_to_tensors(tensor_type=tensor_type, skip_tensor_conversion=skip_tensor_conversion)
 
     def __getitem__(self, item: str) -> Any:
         """
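With the expanded signature, callers can keep selected keys as plain Python objects while everything else is converted. A small usage sketch (key names invented for the example):

import numpy as np
from transformers import BatchFeature

features = BatchFeature(
    data={"input_values": [[0.1, 0.2], [0.3, 0.4]], "raw_lengths": [2, 2]},
    tensor_type="np",
    skip_tensor_conversion=["raw_lengths"],
)
assert isinstance(features["input_values"], np.ndarray)
assert isinstance(features["raw_lengths"], list)  # explicitly skipped, left as-is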
@@ -110,6 +118,14 @@ class BatchFeature(UserDict):
             import torch
 
             def as_tensor(value):
+                if torch.is_tensor(value):
+                    return value
+
+                # stack list of tensors if tensor_type is PyTorch (torch.tensor() does not support lists of tensors)
+                if isinstance(value, (list, tuple)) and len(value) > 0 and torch.is_tensor(value[0]):
+                    return torch.stack(value)
+
+                # convert list of numpy arrays to numpy array (stack) if tensor_type is Numpy
                 if isinstance(value, (list, tuple)) and len(value) > 0:
                     if isinstance(value[0], np.ndarray):
                         value = np.array(value)
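The new early branches matter because `torch.tensor()` cannot consume a list of tensors; stacking is the correct batching operation there. For instance:

import torch

frames = [torch.randn(3, 224, 224) for _ in range(4)]
batch = torch.stack(frames)  # shape: (4, 3, 224, 224)
# torch.tensor(frames) would raise:
# ValueError: only one element tensors can be converted to Python scalars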
@@ -138,7 +154,11 @@ class BatchFeature(UserDict):
             is_tensor = is_numpy_array
         return is_tensor, as_tensor
 
-    def convert_to_tensors(
+    def convert_to_tensors(
+        self,
+        tensor_type: Optional[Union[str, TensorType]] = None,
+        skip_tensor_conversion: Optional[Union[list[str], set[str]]] = None,
+    ):
         """
         Convert the inner content to tensors.
 
@@ -146,6 +166,13 @@ class BatchFeature(UserDict):
             tensor_type (`str` or [`~utils.TensorType`], *optional*):
                 The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
                 `None`, no modification is done.
+            skip_tensor_conversion (`list[str]` or `set[str]`, *optional*):
+                List or set of keys that should NOT be converted to tensors, even when `tensor_type` is specified.
+
+                Note:
+                Values that don't have an array-like structure (e.g., strings, dicts, lists of strings) are
+                automatically skipped and won't be converted to tensors. Ragged arrays (lists of arrays with
+                different lengths) are still attempted, though they may raise errors during conversion.
         """
         if tensor_type is None:
             return self
@@ -154,18 +181,30 @@ class BatchFeature(UserDict):
 
         # Do the tensor conversion in batch
        for key, value in self.items():
+            # Skip keys explicitly marked for no conversion
+            if skip_tensor_conversion and key in skip_tensor_conversion:
+                continue
+
+            # Skip values that are not array-like
+            if not _is_tensor_or_array_like(value):
+                continue
+
             try:
                 if not is_tensor(value):
                     tensor = as_tensor(value)
-
                     self[key] = tensor
-            except
+            except Exception as e:
                 if key == "overflowing_values":
-                    raise ValueError(
+                    raise ValueError(
+                        f"Unable to create tensor for '{key}' with overflowing values of different lengths. "
+                        f"Original error: {str(e)}"
+                    ) from e
                 raise ValueError(
-                    "Unable to
-                    "
-
+                    f"Unable to convert output '{key}' (type: {type(value).__name__}) to tensor: {str(e)}\n"
+                    f"You can try:\n"
+                    f"  1. Use padding=True to ensure all outputs have the same shape\n"
+                    f"  2. Set return_tensors=None to return Python objects instead of tensors"
+                ) from e
 
         return self
 
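The reworded errors now surface the original exception and suggest concrete fixes. A minimal reproduction of the ragged case (behavior depends on the numpy version; recent numpy refuses inhomogeneous arrays, which triggers the new message):

from transformers import BatchFeature

ragged = BatchFeature({"input_ids": [[1, 2, 3], [4, 5]]})  # rows of unequal length
try:
    ragged.convert_to_tensors(tensor_type="np")
except ValueError as err:
    print(err)  # names 'input_ids' and suggests padding=True or return_tensors=None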
@@ -204,12 +243,15 @@ class BatchFeature(UserDict):
 
         # We cast only floating point tensors to avoid issues with tokenizers casting `LongTensor` to `FloatTensor`
         def maybe_to(v):
-            # check if v is a floating point
+            # check if v is a floating point tensor
            if isinstance(v, torch.Tensor) and torch.is_floating_point(v):
                 # cast and send to device
                 return v.to(*args, **kwargs)
             elif isinstance(v, torch.Tensor) and device is not None:
                 return v.to(device=device, non_blocking=non_blocking)
+            # recursively handle lists and tuples
+            elif isinstance(v, (list, tuple)):
+                return type(v)(maybe_to(item) for item in v)
             else:
                 return v
 
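Because `maybe_to` now recurses into lists and tuples, `BatchFeature.to()` reaches tensors that were deliberately left unstacked, for example per-crop pixel values of different sizes. A sketch of the effect (the key name is illustrative):

import torch
from transformers import BatchFeature

feat = BatchFeature({"pixel_values": [torch.randn(3, 8, 8), torch.randn(3, 16, 16)]})
feat = feat.to(torch.float16)  # previously, tensors inside the list were left untouched
assert all(t.dtype == torch.float16 for t in feat["pixel_values"])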
@@ -227,8 +269,8 @@ class FeatureExtractionMixin(PushToHubMixin):
 
     def __init__(self, **kwargs):
         """Set elements of `kwargs` as attributes."""
-        # Pop "processor_class"
-
+        # Pop "processor_class", it should not be saved in feature extractor config
+        kwargs.pop("processor_class", None)
         # Additional attributes without default values
         for key, value in kwargs.items():
             try:
@@ -237,10 +279,6 @@ class FeatureExtractionMixin(PushToHubMixin):
             logger.error(f"Can't set {key} with value {value} for {self}")
             raise err
 
-    def _set_processor_class(self, processor_class: str):
-        """Sets processor class as an attribute."""
-        self._processor_class = processor_class
-
     @classmethod
     def from_pretrained(
         cls: type[SpecificFeatureExtractorType],
@@ -584,12 +622,6 @@ class FeatureExtractionMixin(PushToHubMixin):
         if isinstance(value, np.ndarray):
             dictionary[key] = value.tolist()
 
-        # make sure private name "_processor_class" is correctly
-        # saved as "processor_class"
-        _processor_class = dictionary.pop("_processor_class", None)
-        if _processor_class is not None:
-            dictionary["processor_class"] = _processor_class
-
         return json.dumps(dictionary, indent=2, sort_keys=True) + "\n"
 
     def to_json_file(self, json_file_path: Union[str, os.PathLike]):