PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/image_processing_utils_fast.py CHANGED Viewed

@@ -60,10 +60,9 @@ if is_torch_available():
     import torch
 if is_torchvision_available():
-    from torchvision.transforms.v2 import functional as F
+    import torchvision.transforms.v2.functional as tvF
     from .image_utils import pil_torch_interpolation_mapping
 else:
     pil_torch_interpolation_mapping = None
@@ -82,7 +81,7 @@ def validate_fast_preprocess_arguments(
     crop_size: SizeDict | None = None,
     do_resize: bool | None = None,
     size: SizeDict | None = None,
-    interpolation: Optional["F.InterpolationMode"] = None,
+    interpolation: Optional["tvF.InterpolationMode"] = None,
     return_tensors: str | TensorType | None = None,
     data_format: ChannelDimension = ChannelDimension.FIRST,
 ):
@@ -398,7 +397,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
                 )
             if image_size != pad_size:
                 padding = (0, 0, padding_width, padding_height)
-                stacked_images = F.pad(stacked_images, padding, fill=fill_value, padding_mode=padding_mode)
+                stacked_images = tvF.pad(stacked_images, padding, fill=fill_value, padding_mode=padding_mode)
             processed_images_grouped[shape] = stacked_images
             if return_mask:
@@ -418,7 +417,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         self,
         image: "torch.Tensor",
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
         **kwargs,
     ) -> "torch.Tensor":
@@ -438,7 +437,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         Returns:
             `torch.Tensor`: The resized image.
         """
-        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        interpolation = interpolation if interpolation is not None else tvF.InterpolationMode.BILINEAR
         if size.shortest_edge and size.longest_edge:
             # Resize the image so that the shortest edge or the longest edge is of the given size
             # while maintaining the aspect ratio of the original image.
@@ -468,23 +467,23 @@ class BaseImageProcessorFast(BaseImageProcessor):
         # TODO: remove this once the bug is fixed (detected with torch==2.7.0+git1fee196, torchvision==0.22.0+9eb57cd)
         if is_torchdynamo_compiling() and is_rocm_platform():
             return self.compile_friendly_resize(image, new_size, interpolation, antialias)
-        return F.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+        return tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
     @staticmethod
     def compile_friendly_resize(
         image: "torch.Tensor",
         new_size: tuple[int, int],
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         antialias: bool = True,
     ) -> "torch.Tensor":
         """
-        A wrapper around `F.resize` so that it is compatible with torch.compile when the image is a uint8 tensor.
+        A wrapper around `tvF.resize` so that it is compatible with torch.compile when the image is a uint8 tensor.
         """
         if image.dtype == torch.uint8:
             # 256 is used on purpose instead of 255 to avoid numerical differences
             # see https://github.com/huggingface/transformers/pull/38540#discussion_r2127165652
             image = image.float() / 256
-            image = F.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+            image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
             image = image * 256
             # torch.where is used on purpose instead of torch.clamp to avoid bug in torch.compile
             # see https://github.com/huggingface/transformers/pull/38540#discussion_r2126888471
@@ -492,7 +491,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
             image = torch.where(image < 0, 0, image)
             image = image.round().to(torch.uint8)
         else:
-            image = F.resize(image, new_size, interpolation=interpolation, antialias=antialias)
+            image = tvF.resize(image, new_size, interpolation=interpolation, antialias=antialias)
         return image
     def rescale(
@@ -536,7 +535,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         Returns:
             `torch.Tensor`: The normalized image.
         """
-        return F.normalize(image, mean, std)
+        return tvF.normalize(image, mean, std)
     @lru_cache(maxsize=10)
     def _fuse_mean_std_and_rescale_factor(
@@ -615,14 +614,14 @@ class BaseImageProcessorFast(BaseImageProcessor):
                 (crop_width - image_width + 1) // 2 if crop_width > image_width else 0,
                 (crop_height - image_height + 1) // 2 if crop_height > image_height else 0,
             ]
-            image = F.pad(image, padding_ltrb, fill=0)  # PIL uses fill value 0
+            image = tvF.pad(image, padding_ltrb, fill=0)  # PIL uses fill value 0
             image_height, image_width = image.shape[-2:]
             if crop_width == image_width and crop_height == image_height:
                 return image
         crop_top = int((image_height - crop_height) / 2.0)
         crop_left = int((image_width - crop_width) / 2.0)
-        return F.crop(image, crop_top, crop_left, crop_height, crop_width)
+        return tvF.crop(image, crop_top, crop_left, crop_height, crop_width)
     def convert_to_rgb(
         self,
@@ -687,9 +686,9 @@ class BaseImageProcessorFast(BaseImageProcessor):
             image = self.convert_to_rgb(image)
         if image_type == ImageType.PIL:
-            image = F.pil_to_tensor(image)
+            image = tvF.pil_to_tensor(image)
         elif image_type == ImageType.NUMPY:
-            # not using F.to_tensor as it doesn't handle (C, H, W) numpy arrays
+            # not using tvF.to_tensor as it doesn't handle (C, H, W) numpy arrays
             image = torch.from_numpy(image).contiguous()
         # If the image is 2D, we need to unsqueeze it to add a channel dimension for processing
@@ -813,7 +812,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         size: SizeDict | None = None,
         do_center_crop: bool | None = None,
         crop_size: SizeDict | None = None,
-        interpolation: Optional["F.InterpolationMode"] = None,
+        interpolation: Optional["tvF.InterpolationMode"] = None,
         return_tensors: str | TensorType | None = None,
         data_format: ChannelDimension | None = None,
         **kwargs,
@@ -892,7 +891,7 @@ class BaseImageProcessorFast(BaseImageProcessor):
         images: list["torch.Tensor"],
         do_resize: bool,
         size: SizeDict,
-        interpolation: Optional["F.InterpolationMode"],
+        interpolation: Optional["tvF.InterpolationMode"],
         do_center_crop: bool,
         crop_size: SizeDict,
         do_rescale: bool,

transformers/image_transforms.py CHANGED Viewed

@@ -863,31 +863,43 @@ def _group_images_by_shape(nested_images, *paired_inputs, is_nested: bool = Fals
                 paired_grouped_values[paired_index][shape].append(paired_value)
             grouped_images_index[key] = (shape, len(grouped_images[shape]) - 1)
+    # Store structure size for nested inputs to handle empty sublists during reconstruction
+    if is_nested:
+        grouped_images_index["_num_sublists"] = len(normalized_images)
     return grouped_images, *paired_grouped_values, grouped_images_index
 def _reconstruct_nested_structure(indices, processed_images):
     """Helper function to reconstruct a single level nested structure."""
-    # Find the maximum outer index
-    max_outer_idx = max(idx[0] for idx in indices)
-    # Create the outer list
-    result = [None] * (max_outer_idx + 1)
+    # Get the number of sublists (handles empty sublists like in [[], [image]])
+    num_sublists = indices.pop("_num_sublists", None)
     # Group indices by outer index
     nested_indices = defaultdict(list)
     for i, j in indices:
         nested_indices[i].append(j)
+    # Determine the number of outer sublists
+    if num_sublists is not None:
+        max_outer_idx = num_sublists - 1
+    elif nested_indices:
+        max_outer_idx = max(nested_indices.keys())
+    else:
+        return []
+    # Create the result structure
+    result = []
     for i in range(max_outer_idx + 1):
-        if i in nested_indices:
+        if i not in nested_indices:
+            result.append([])
+        else:
             inner_max_idx = max(nested_indices[i])
             inner_list = [None] * (inner_max_idx + 1)
-            for j in range(inner_max_idx + 1):
-                if (i, j) in indices:
-                    shape, idx = indices[(i, j)]
-                    inner_list[j] = processed_images[shape][idx]
-            result[i] = inner_list
+            for j in nested_indices[i]:
+                shape, idx = indices[(i, j)]
+                inner_list[j] = processed_images[shape][idx]
+            result.append(inner_list)
     return result
@@ -908,6 +920,21 @@ def _iterate_items(items, is_nested: bool):
             yield i, item
+def _get_device_from_images(images, is_nested: bool) -> "torch.device":
+    """
+    Get the device from the first non-empty element in a (potentially nested) list of images.
+    Handles cases like `images = [[], [image]]` where the first sublist may be empty.
+    """
+    if is_nested:
+        for row in images:
+            if isinstance(row, torch.Tensor):
+                return row.device
+            if isinstance(row, list) and len(row) > 0:
+                return row[0].device
+    return images[0].device
 def group_images_by_shape(
     images: Union[list["torch.Tensor"], "torch.Tensor"],
     *paired_inputs,
@@ -945,17 +972,21 @@ def group_images_by_shape(
     """
     # If disable grouping is not explicitly provided, we favor disabling it if the images are on CPU, and enabling it otherwise.
     if disable_grouping is None:
-        device = images[0][0].device if is_nested else images[0].device
+        device = _get_device_from_images(images, is_nested)
         disable_grouping = device == "cpu"
     if disable_grouping:
+        grouped_images_index = {key: (key, 0) for key, _ in _iterate_items(images, is_nested)}
+        if is_nested:
+            grouped_images_index["_num_sublists"] = len(images)
         return (
             {key: img.unsqueeze(0) for key, img in _iterate_items(images, is_nested)},
             *[
                 {key: item.unsqueeze(0) for key, item in _iterate_items(paired_list, is_nested)}
                 for paired_list in paired_inputs
             ],
-            {key: (key, 0) for key, _ in _iterate_items(images, is_nested)},
+            grouped_images_index,
         )
     # Handle single level nested structure

transformers/image_utils.py CHANGED Viewed

@@ -86,11 +86,6 @@ class AnnotationFormat(ExplicitEnum):
     COCO_PANOPTIC = "coco_panoptic"
-class AnnotionFormat(ExplicitEnum):
-    COCO_DETECTION = AnnotationFormat.COCO_DETECTION.value
-    COCO_PANOPTIC = AnnotationFormat.COCO_PANOPTIC.value
 AnnotationType = dict[str, int | str | list[dict]]

transformers/initialization.py CHANGED Viewed

@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import math
 import sys
 from collections import defaultdict
 from contextlib import contextmanager
@@ -162,6 +163,40 @@ def copy_(tensor: torch.Tensor, other: torch.Tensor) -> torch.Tensor:
     return tensor
+def _variance_scaling(tensor, mode="fan_in", distribution="normal"):
+    fan_in, fan_out = torch.nn.init._calculate_fan_in_and_fan_out(tensor)
+    if mode == "fan_in":
+        denom = fan_in
+    elif mode == "fan_out":
+        denom = fan_out
+    elif mode == "fan_avg":
+        denom = (fan_in + fan_out) / 2
+    variance = 1.0 / denom
+    if distribution == "truncated_normal":
+        trunc_normal_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
+    elif distribution == "normal":
+        normal_(tensor, std=math.sqrt(variance))
+    elif distribution == "uniform":
+        bound = math.sqrt(3 * variance)
+        uniform_(tensor, -bound, bound)
+    else:
+        raise ValueError(f"invalid distribution {distribution}")
+def lecun_normal_(tensor):
+    if not getattr(tensor, "_is_hf_initialized", False):
+        _variance_scaling(tensor, mode="fan_in", distribution="truncated_normal")
+    return tensor
+def default_flax_embed_init_(tensor):
+    if not getattr(tensor, "_is_hf_initialized", False):
+        _variance_scaling(tensor, mode="fan_in", distribution="normal")
+    return tensor
 # Here, we need to check several modules imported, and hot patch all of them, as sometimes torch does
 # something like `from torch.nn.init import xavier_uniform_` in their internals (e.g in torch.nn.modules.activations,
 # where MultiHeadAttention lives), so the function name is binded at import time and just doing
@@ -243,3 +278,25 @@ def no_init_weights():
                 setattr(module, func_name, func)
         # Set back `init_weights`
         PreTrainedModel.init_weights = original_init_weights
+@contextmanager
+def no_tie_weights():
+    """
+    Disable weight tying during loading with `from_pretrained`. This is needed as we want to have access to ALL
+    weights in the state_dict during `from_pretrained`, and otherwise tying them would remove them from it, as it's
+    called in `post_init` when instantiating.
+    """
+    from .modeling_utils import PreTrainedModel
+    def empty_func(*args, **kwargs):
+        pass
+    try:
+        original_tie_weights = PreTrainedModel.tie_weights
+        PreTrainedModel.tie_weights = empty_func
+        yield
+    finally:
+        # Set back the original
+        PreTrainedModel.tie_weights = original_tie_weights

transformers/integrations/__init__.py CHANGED Viewed

@@ -20,7 +20,6 @@ _import_structure = {
     "aqlm": ["replace_with_aqlm_linear"],
     "awq": [
         "post_init_awq_exllama_modules",
-        "post_init_awq_ipex_modules",
         "replace_quantization_scales",
         "replace_with_awq_linear",
     ],
@@ -148,17 +147,11 @@ else:
         "convert_and_export_with_cache",
     ]
-try:
-    if not is_torch_greater_or_equal("2.3"):
-        raise OptionalDependencyNotAvailable()
-except OptionalDependencyNotAvailable:
-    pass
-else:
-    _import_structure["tensor_parallel"] = [
-        "shard_and_distribute_module",
-        "ALL_PARALLEL_STYLES",
-        "translate_to_torch_parallel_style",
-    ]
+_import_structure["tensor_parallel"] = [
+    "shard_and_distribute_module",
+    "ALL_PARALLEL_STYLES",
+    "translate_to_torch_parallel_style",
+]
 try:
     if not is_torch_greater_or_equal("2.5"):
         raise OptionalDependencyNotAvailable()
@@ -173,7 +166,6 @@ if TYPE_CHECKING:
     from .aqlm import replace_with_aqlm_linear
     from .awq import (
         post_init_awq_exllama_modules,
-        post_init_awq_ipex_modules,
         replace_quantization_scales,
         replace_with_awq_linear,
     )
@@ -291,17 +283,11 @@ if TYPE_CHECKING:
     else:
         from .executorch import TorchExportableModuleWithStaticCache, convert_and_export_with_cache
-    try:
-        if not is_torch_greater_or_equal("2.3"):
-            raise OptionalDependencyNotAvailable()
-    except OptionalDependencyNotAvailable:
-        pass
-    else:
-        from .tensor_parallel import (
-            ALL_PARALLEL_STYLES,
-            shard_and_distribute_module,
-            translate_to_torch_parallel_style,
-        )
+    from .tensor_parallel import (
+        ALL_PARALLEL_STYLES,
+        shard_and_distribute_module,
+        translate_to_torch_parallel_style,
+    )
     try:
         if not is_torch_greater_or_equal("2.5"):

transformers/integrations/accelerate.py CHANGED Viewed

@@ -44,7 +44,7 @@ if is_torch_available():
 if is_accelerate_available():
     from accelerate import dispatch_model
     from accelerate.utils import get_max_memory
-    from accelerate.utils.modeling import clean_device_map, get_max_layer_size, get_module_size_with_ties
+    from accelerate.utils.modeling import clean_device_map, get_max_layer_size
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel
@@ -54,6 +54,42 @@ if TYPE_CHECKING:
 logger = logging.get_logger(__name__)
+def get_module_size_with_ties(
+    tied_params,
+    module_size,
+    module_sizes,
+    modules_to_treat,
+) -> tuple[int, list[str], list[nn.Module]]:
+    """
+    Calculate the total size of a module, including its tied parameters.
+    Args:
+        tied_params (`List[str]`): The list of tied parameters.
+        module_size (`int`): The size of the module without tied parameters.
+        module_sizes (`Dict[str, int]`): A dictionary mapping each layer name to its size.
+        modules_to_treat (`List[Tuple[str, nn.Module]]`): The list of named modules to treat.
+    Returns:
+        `Tuple[int, List[str], List[nn.Module]]`: The total size of the module, the names of the tied modules, and the
+        tied modules.
+    """
+    if len(tied_params) < 1:
+        return module_size, [], []
+    tied_module_names = []
+    tied_modules = []
+    for tied_param in tied_params:
+        tied_module_index = [i for i, (n, _) in enumerate(modules_to_treat) if tied_param.startswith(n + ".")][0]
+        tied_module_names.append(modules_to_treat[tied_module_index][0])
+        tied_modules.append(modules_to_treat[tied_module_index][1])
+    module_size_with_ties = module_size
+    for tied_param, tied_module_name in zip(tied_params, tied_module_names):
+        module_size_with_ties += module_sizes[tied_module_name] - module_sizes[tied_param]
+    return module_size_with_ties, tied_module_names, tied_modules
 def check_and_set_device_map(device_map: "torch.device | int | str | dict | None") -> dict | str | None:
     from ..modeling_utils import get_torch_context_manager_or_global_device
@@ -163,7 +199,7 @@ def compute_module_total_buffer_size(model: nn.Module, hf_quantizer: "HfQuantize
 def get_balanced_memory(
     model: "PreTrainedModel",
     max_memory: dict[int | str, int | str] | None = None,
-    no_split_module_classes: list[str] | None = None,
+    no_split_module_classes: set[str] | None = None,
     hf_quantizer: "HfQuantizer | None" = None,
     low_zero: bool = False,
 ):
@@ -183,8 +219,8 @@ def get_balanced_memory(
         max_memory (`Dict`, *optional*):
             A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
             Example: `max_memory={0: "1GB"}`.
-        no_split_module_classes (`List[str]`, *optional*):
-            A list of layer class names that should never be split across device (for instance any layer that has a
+        no_split_module_classes (`set[str]`, *optional*):
+            A set of layer class names that should never be split across device (for instance any layer that has a
             residual connection).
         hf_quantizer (`HfQuantizer`, *optional*):
             A quantizer for the model.
@@ -227,7 +263,7 @@ def get_balanced_memory(
     # - the mean of the layer sizes
     if no_split_module_classes is None:
         no_split_module_classes = []
-    elif not isinstance(no_split_module_classes, (list, tuple)):
+    elif not isinstance(no_split_module_classes, (list, tuple, set)):
         no_split_module_classes = [no_split_module_classes]
     # Identify the size of the no_split_block modules
@@ -275,7 +311,7 @@ def _get_device_map(
     Otherwise, we check for any device inconsistencies in the device_map.
     """
     if isinstance(device_map, str):
-        no_split_modules = model._get_no_split_modules(device_map)
+        no_split_modules = model._no_split_modules
         if device_map != "sequential":
             inferred_max_memory = get_balanced_memory(
@@ -490,7 +526,7 @@ def load_offloaded_parameter(model: "PreTrainedModel", param_name: str) -> torch
 def _init_infer_auto_device_map(
     model: nn.Module,
     max_memory: dict[int | str, int | str] | None = None,
-    no_split_module_classes: list[str] | None = None,
+    no_split_module_classes: set[str] | None = None,
     tied_parameters: list[list[str]] | None = None,
     hf_quantizer: "HfQuantizer | None" = None,
 ) -> tuple[
@@ -509,7 +545,7 @@ def _init_infer_auto_device_map(
     max_memory = get_max_memory(max_memory)
     if no_split_module_classes is None:
         no_split_module_classes = []
-    elif not isinstance(no_split_module_classes, (list, tuple)):
+    elif not isinstance(no_split_module_classes, (list, tuple, set)):
         no_split_module_classes = [no_split_module_classes]
     devices = list(max_memory.keys())
@@ -560,7 +596,7 @@ def _init_infer_auto_device_map(
 def infer_auto_device_map(
     model: nn.Module,
     max_memory: dict[int | str, int | str] | None = None,
-    no_split_module_classes: list[str] | None = None,
+    no_split_module_classes: set[str] | None = None,
     verbose: bool = False,
     clean_result: bool = True,
     offload_buffers: bool = False,
@@ -590,8 +626,8 @@ def infer_auto_device_map(
         max_memory (`Dict`, *optional*):
             A dictionary device identifier to maximum memory. Will default to the maximum memory available if unset.
             Example: `max_memory={0: "1GB"}`.
-        no_split_module_classes (`List[str]`, *optional*):
-            A list of layer class names that should never be split across device (for instance any layer that has a
+        no_split_module_classes (`set[str]`, *optional*):
+            A set of layer class names that should never be split across device (for instance any layer that has a
             residual connection).
         verbose (`bool`, *optional*, defaults to `False`):
             Whether or not to provide debugging statements as the function builds the device_map.

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl