transformers 5.0.0rc1__py3-none-any.whl → 5.0.0rc2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +20 -1
- transformers/activations.py +1 -1
- transformers/audio_utils.py +0 -1
- transformers/cache_utils.py +17 -15
- transformers/configuration_utils.py +114 -70
- transformers/conversion_mapping.py +68 -5
- transformers/core_model_loading.py +201 -35
- transformers/dependency_versions_table.py +1 -1
- transformers/feature_extraction_utils.py +54 -22
- transformers/generation/candidate_generator.py +79 -31
- transformers/generation/configuration_utils.py +162 -122
- transformers/generation/continuous_batching/cache.py +47 -18
- transformers/generation/continuous_batching/cache_manager.py +131 -34
- transformers/generation/continuous_batching/continuous_api.py +101 -64
- transformers/generation/continuous_batching/requests.py +28 -1
- transformers/generation/continuous_batching/scheduler.py +11 -4
- transformers/generation/stopping_criteria.py +1 -1
- transformers/generation/utils.py +108 -110
- transformers/generation/watermarking.py +8 -5
- transformers/image_processing_base.py +2 -12
- transformers/image_processing_utils_fast.py +15 -4
- transformers/initialization.py +37 -0
- transformers/integrations/__init__.py +12 -0
- transformers/integrations/accelerate.py +44 -111
- transformers/integrations/aqlm.py +3 -5
- transformers/integrations/awq.py +2 -5
- transformers/integrations/bitnet.py +5 -8
- transformers/integrations/bitsandbytes.py +16 -15
- transformers/integrations/deepspeed.py +18 -3
- transformers/integrations/eetq.py +3 -5
- transformers/integrations/fbgemm_fp8.py +1 -1
- transformers/integrations/finegrained_fp8.py +6 -16
- transformers/integrations/flash_attention.py +2 -2
- transformers/integrations/higgs.py +2 -5
- transformers/integrations/hub_kernels.py +23 -5
- transformers/integrations/integration_utils.py +35 -0
- transformers/integrations/mistral.py +12 -0
- transformers/integrations/moe.py +240 -0
- transformers/integrations/mxfp4.py +4 -10
- transformers/integrations/peft.py +5 -0
- transformers/integrations/quanto.py +5 -2
- transformers/integrations/spqr.py +3 -5
- transformers/integrations/tensor_parallel.py +167 -221
- transformers/integrations/vptq.py +3 -5
- transformers/modeling_gguf_pytorch_utils.py +66 -19
- transformers/modeling_rope_utils.py +78 -81
- transformers/modeling_utils.py +583 -503
- transformers/models/__init__.py +19 -0
- transformers/models/afmoe/modeling_afmoe.py +7 -16
- transformers/models/afmoe/modular_afmoe.py +5 -13
- transformers/models/aimv2/modeling_aimv2.py +4 -0
- transformers/models/aimv2/modular_aimv2.py +4 -0
- transformers/models/albert/modeling_albert.py +3 -0
- transformers/models/align/modeling_align.py +12 -6
- transformers/models/altclip/modeling_altclip.py +7 -3
- transformers/models/apertus/modeling_apertus.py +4 -2
- transformers/models/apertus/modular_apertus.py +4 -1
- transformers/models/arcee/modeling_arcee.py +1 -1
- transformers/models/aria/modeling_aria.py +8 -4
- transformers/models/aria/modular_aria.py +7 -3
- transformers/models/audioflamingo3/processing_audioflamingo3.py +27 -22
- transformers/models/auto/auto_factory.py +1 -1
- transformers/models/auto/configuration_auto.py +27 -0
- transformers/models/auto/feature_extraction_auto.py +7 -3
- transformers/models/auto/image_processing_auto.py +4 -2
- transformers/models/auto/modeling_auto.py +31 -0
- transformers/models/auto/processing_auto.py +4 -0
- transformers/models/auto/tokenization_auto.py +132 -153
- transformers/models/auto/video_processing_auto.py +5 -2
- transformers/models/aya_vision/modeling_aya_vision.py +7 -3
- transformers/models/bamba/modeling_bamba.py +18 -19
- transformers/models/bamba/modular_bamba.py +17 -16
- transformers/models/bark/modeling_bark.py +9 -0
- transformers/models/bart/configuration_bart.py +0 -1
- transformers/models/bart/modeling_bart.py +7 -0
- transformers/models/beit/image_processing_beit_fast.py +0 -1
- transformers/models/bert/modeling_bert.py +3 -0
- transformers/models/bert_generation/modeling_bert_generation.py +2 -0
- transformers/models/big_bird/modeling_big_bird.py +3 -0
- transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +7 -0
- transformers/models/bit/modeling_bit.py +5 -1
- transformers/models/bitnet/modeling_bitnet.py +1 -1
- transformers/models/blenderbot/modeling_blenderbot.py +7 -0
- transformers/models/blenderbot/tokenization_blenderbot.py +6 -7
- transformers/models/blenderbot_small/modeling_blenderbot_small.py +7 -0
- transformers/models/blip/modeling_blip.py +2 -0
- transformers/models/blip/modeling_blip_text.py +8 -0
- transformers/models/blip_2/modeling_blip_2.py +2 -0
- transformers/models/bloom/modeling_bloom.py +13 -44
- transformers/models/blt/modeling_blt.py +162 -2
- transformers/models/blt/modular_blt.py +168 -3
- transformers/models/bridgetower/image_processing_bridgetower_fast.py +0 -2
- transformers/models/bridgetower/modeling_bridgetower.py +6 -0
- transformers/models/bros/modeling_bros.py +8 -0
- transformers/models/camembert/modeling_camembert.py +109 -106
- transformers/models/canine/modeling_canine.py +6 -0
- transformers/models/canine/tokenization_canine.py +2 -0
- transformers/models/chameleon/modeling_chameleon.py +9 -4
- transformers/models/chinese_clip/modeling_chinese_clip.py +6 -3
- transformers/models/clap/feature_extraction_clap.py +2 -2
- transformers/models/clap/modeling_clap.py +25 -15
- transformers/models/clip/modeling_clip.py +2 -0
- transformers/models/clipseg/modeling_clipseg.py +4 -0
- transformers/models/clvp/modeling_clvp.py +14 -3
- transformers/models/code_llama/tokenization_code_llama.py +1 -1
- transformers/models/codegen/modeling_codegen.py +13 -4
- transformers/models/cohere/modeling_cohere.py +1 -1
- transformers/models/cohere2/modeling_cohere2.py +1 -1
- transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +0 -1
- transformers/models/cohere2_vision/modeling_cohere2_vision.py +7 -3
- transformers/models/conditional_detr/configuration_conditional_detr.py +1 -1
- transformers/models/conditional_detr/modeling_conditional_detr.py +4 -1
- transformers/models/convbert/modeling_convbert.py +3 -0
- transformers/models/convnext/image_processing_convnext.py +2 -2
- transformers/models/convnext/image_processing_convnext_fast.py +9 -13
- transformers/models/csm/generation_csm.py +19 -22
- transformers/models/csm/modeling_csm.py +3 -1
- transformers/models/csm/modular_csm.py +2 -0
- transformers/models/ctrl/modeling_ctrl.py +14 -2
- transformers/models/cvt/modeling_cvt.py +5 -1
- transformers/models/cwm/modeling_cwm.py +1 -1
- transformers/models/d_fine/configuration_d_fine.py +3 -4
- transformers/models/d_fine/modeling_d_fine.py +46 -39
- transformers/models/d_fine/modular_d_fine.py +15 -4
- transformers/models/dab_detr/configuration_dab_detr.py +2 -2
- transformers/models/dab_detr/modeling_dab_detr.py +1 -1
- transformers/models/dac/modeling_dac.py +4 -4
- transformers/models/data2vec/modeling_data2vec_text.py +7 -0
- transformers/models/data2vec/modular_data2vec_text.py +7 -0
- transformers/models/dbrx/configuration_dbrx.py +9 -1
- transformers/models/dbrx/modeling_dbrx.py +1 -1
- transformers/models/deberta/modeling_deberta.py +2 -0
- transformers/models/deberta_v2/modeling_deberta_v2.py +2 -0
- transformers/models/decision_transformer/modeling_decision_transformer.py +8 -5
- transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -4
- transformers/models/deepseek_v2/modular_deepseek_v2.py +4 -2
- transformers/models/deepseek_v3/modeling_deepseek_v3.py +9 -5
- transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -2
- transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +0 -1
- transformers/models/deepseek_vl/modeling_deepseek_vl.py +9 -5
- transformers/models/deepseek_vl/modular_deepseek_vl.py +3 -0
- transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +0 -4
- transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +9 -5
- transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +9 -9
- transformers/models/deformable_detr/configuration_deformable_detr.py +2 -2
- transformers/models/deformable_detr/modeling_deformable_detr.py +1 -1
- transformers/models/depth_anything/configuration_depth_anything.py +2 -3
- transformers/models/depth_pro/image_processing_depth_pro_fast.py +0 -1
- transformers/models/detr/configuration_detr.py +1 -1
- transformers/models/detr/modeling_detr.py +8 -1
- transformers/models/dia/generation_dia.py +3 -10
- transformers/models/dia/modeling_dia.py +12 -1
- transformers/models/dia/modular_dia.py +11 -0
- transformers/models/dia/processing_dia.py +1 -1
- transformers/models/diffllama/modeling_diffllama.py +3 -3
- transformers/models/diffllama/modular_diffllama.py +2 -2
- transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +0 -1
- transformers/models/dinov3_vit/modeling_dinov3_vit.py +3 -0
- transformers/models/dinov3_vit/modular_dinov3_vit.py +3 -0
- transformers/models/distilbert/modeling_distilbert.py +11 -9
- transformers/models/doge/modeling_doge.py +1 -1
- transformers/models/donut/image_processing_donut_fast.py +0 -1
- transformers/models/donut/modeling_donut_swin.py +16 -12
- transformers/models/dots1/modeling_dots1.py +14 -5
- transformers/models/dpt/configuration_dpt.py +1 -1
- transformers/models/dpt/image_processing_dpt_fast.py +1 -2
- transformers/models/dpt/modular_dpt.py +1 -2
- transformers/models/edgetam/configuration_edgetam.py +1 -1
- transformers/models/edgetam/modeling_edgetam.py +5 -2
- transformers/models/edgetam/modular_edgetam.py +15 -14
- transformers/models/edgetam_video/modeling_edgetam_video.py +55 -43
- transformers/models/edgetam_video/modular_edgetam_video.py +13 -19
- transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +1 -2
- transformers/models/efficientloftr/modeling_efficientloftr.py +14 -1
- transformers/models/efficientnet/image_processing_efficientnet.py +5 -6
- transformers/models/efficientnet/image_processing_efficientnet_fast.py +1 -2
- transformers/models/efficientnet/modeling_efficientnet.py +5 -1
- transformers/models/electra/modeling_electra.py +7 -0
- transformers/models/emu3/modeling_emu3.py +8 -2
- transformers/models/emu3/modular_emu3.py +7 -1
- transformers/models/encodec/modeling_encodec.py +14 -0
- transformers/models/eomt/image_processing_eomt_fast.py +46 -14
- transformers/models/eomt/modeling_eomt.py +7 -0
- transformers/models/eomt/modular_eomt.py +7 -0
- transformers/models/ernie/modeling_ernie.py +6 -0
- transformers/models/ernie/modular_ernie.py +6 -0
- transformers/models/ernie4_5/modeling_ernie4_5.py +1 -1
- transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +16 -13
- transformers/models/ernie4_5_moe/modular_ernie4_5_moe.py +9 -35
- transformers/models/ernie4_5_vl_moe/__init__.py +31 -0
- transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +330 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe.py +456 -0
- transformers/models/ernie4_5_vl_moe/image_processing_ernie4_5_vl_moe_fast.py +232 -0
- transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +1898 -0
- transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +1904 -0
- transformers/models/ernie4_5_vl_moe/processing_ernie4_5_vl_moe.py +251 -0
- transformers/models/ernie4_5_vl_moe/video_processing_ernie4_5_vl_moe.py +594 -0
- transformers/models/esm/modeling_esm.py +6 -0
- transformers/models/esm/modeling_esmfold.py +6 -1
- transformers/models/evolla/modeling_evolla.py +9 -1
- transformers/models/evolla/modular_evolla.py +8 -0
- transformers/models/exaone4/modeling_exaone4.py +1 -1
- transformers/models/falcon/modeling_falcon.py +3 -3
- transformers/models/falcon_h1/modeling_falcon_h1.py +28 -23
- transformers/models/falcon_h1/modular_falcon_h1.py +7 -2
- transformers/models/falcon_mamba/modeling_falcon_mamba.py +6 -2
- transformers/models/falcon_mamba/modular_falcon_mamba.py +7 -2
- transformers/models/fast_vlm/modeling_fast_vlm.py +7 -3
- transformers/models/fastspeech2_conformer/modeling_fastspeech2_conformer.py +23 -10
- transformers/models/fastspeech2_conformer/tokenization_fastspeech2_conformer.py +1 -0
- transformers/models/flaubert/modeling_flaubert.py +14 -15
- transformers/models/flava/image_processing_flava_fast.py +0 -2
- transformers/models/flava/modeling_flava.py +4 -1
- transformers/models/flex_olmo/modeling_flex_olmo.py +7 -4
- transformers/models/florence2/modeling_florence2.py +20 -3
- transformers/models/florence2/modular_florence2.py +13 -0
- transformers/models/fnet/modeling_fnet.py +7 -0
- transformers/models/fuyu/image_processing_fuyu.py +1 -1
- transformers/models/fuyu/modeling_fuyu.py +3 -1
- transformers/models/fuyu/processing_fuyu.py +16 -0
- transformers/models/gemma/modeling_gemma.py +10 -12
- transformers/models/gemma/modular_gemma.py +9 -11
- transformers/models/gemma2/modeling_gemma2.py +1 -1
- transformers/models/gemma2/modular_gemma2.py +1 -1
- transformers/models/gemma3/image_processing_gemma3_fast.py +0 -1
- transformers/models/gemma3/modeling_gemma3.py +28 -7
- transformers/models/gemma3/modular_gemma3.py +26 -6
- transformers/models/gemma3n/configuration_gemma3n.py +3 -0
- transformers/models/gemma3n/modeling_gemma3n.py +47 -9
- transformers/models/gemma3n/modular_gemma3n.py +51 -9
- transformers/models/git/modeling_git.py +181 -126
- transformers/models/glm/modeling_glm.py +1 -1
- transformers/models/glm4/modeling_glm4.py +1 -1
- transformers/models/glm46v/image_processing_glm46v.py +0 -4
- transformers/models/glm46v/modeling_glm46v.py +3 -1
- transformers/models/glm46v/modular_glm46v.py +3 -0
- transformers/models/glm4_moe/modeling_glm4_moe.py +9 -5
- transformers/models/glm4_moe/modular_glm4_moe.py +1 -1
- transformers/models/glm4v/image_processing_glm4v.py +0 -4
- transformers/models/glm4v/modeling_glm4v.py +15 -5
- transformers/models/glm4v/modular_glm4v.py +11 -3
- transformers/models/glm4v_moe/modeling_glm4v_moe.py +39 -23
- transformers/models/glm4v_moe/modular_glm4v_moe.py +12 -0
- transformers/models/glmasr/__init__.py +30 -0
- transformers/models/glmasr/configuration_glmasr.py +197 -0
- transformers/models/glmasr/modeling_glmasr.py +512 -0
- transformers/models/glmasr/modular_glmasr.py +433 -0
- transformers/models/glmasr/processing_glmasr.py +332 -0
- transformers/models/glpn/image_processing_glpn_fast.py +0 -1
- transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +0 -1
- transformers/models/got_ocr2/modeling_got_ocr2.py +8 -3
- transformers/models/gpt2/modeling_gpt2.py +8 -5
- transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +3 -8
- transformers/models/gpt_neo/modeling_gpt_neo.py +15 -3
- transformers/models/gpt_neox/modeling_gpt_neox.py +1 -1
- transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +1 -1
- transformers/models/gpt_oss/configuration_gpt_oss.py +17 -0
- transformers/models/gpt_oss/modeling_gpt_oss.py +6 -9
- transformers/models/gpt_oss/modular_gpt_oss.py +5 -7
- transformers/models/gptj/modeling_gptj.py +15 -6
- transformers/models/granite/modeling_granite.py +1 -1
- transformers/models/granite_speech/modeling_granite_speech.py +15 -1
- transformers/models/granitemoe/modeling_granitemoe.py +2 -3
- transformers/models/granitemoe/modular_granitemoe.py +1 -2
- transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +4 -0
- transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +33 -23
- transformers/models/granitemoehybrid/modular_granitemoehybrid.py +12 -2
- transformers/models/granitemoeshared/modeling_granitemoeshared.py +2 -3
- transformers/models/grounding_dino/configuration_grounding_dino.py +2 -3
- transformers/models/grounding_dino/modeling_grounding_dino.py +4 -4
- transformers/models/groupvit/modeling_groupvit.py +6 -1
- transformers/models/helium/modeling_helium.py +1 -1
- transformers/models/hgnet_v2/modeling_hgnet_v2.py +10 -0
- transformers/models/hgnet_v2/modular_hgnet_v2.py +10 -0
- transformers/models/hubert/modeling_hubert.py +4 -0
- transformers/models/hubert/modular_hubert.py +4 -0
- transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +1 -1
- transformers/models/hunyuan_v1_moe/__init__.py +1 -1
- transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +12 -4
- transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +4 -2
- transformers/models/ibert/modeling_ibert.py +16 -0
- transformers/models/idefics/modeling_idefics.py +10 -0
- transformers/models/idefics2/modeling_idefics2.py +7 -1
- transformers/models/idefics3/modeling_idefics3.py +5 -1
- transformers/models/imagegpt/image_processing_imagegpt_fast.py +1 -5
- transformers/models/imagegpt/modeling_imagegpt.py +9 -2
- transformers/models/instructblip/modeling_instructblip.py +2 -0
- transformers/models/instructblipvideo/modeling_instructblipvideo.py +52 -50
- transformers/models/instructblipvideo/video_processing_instructblipvideo.py +0 -1
- transformers/models/internvl/modeling_internvl.py +11 -8
- transformers/models/internvl/modular_internvl.py +5 -9
- transformers/models/internvl/video_processing_internvl.py +0 -1
- transformers/models/jais2/__init__.py +27 -0
- transformers/models/jais2/configuration_jais2.py +152 -0
- transformers/models/jais2/modeling_jais2.py +486 -0
- transformers/models/jais2/modular_jais2.py +196 -0
- transformers/models/jamba/modeling_jamba.py +24 -19
- transformers/models/jamba/modular_jamba.py +17 -17
- transformers/models/janus/image_processing_janus_fast.py +0 -1
- transformers/models/janus/modeling_janus.py +15 -7
- transformers/models/janus/modular_janus.py +16 -7
- transformers/models/jetmoe/modeling_jetmoe.py +2 -2
- transformers/models/jetmoe/modular_jetmoe.py +1 -0
- transformers/models/kosmos2/modeling_kosmos2.py +14 -2
- transformers/models/kosmos2_5/image_processing_kosmos2_5_fast.py +2 -2
- transformers/models/kosmos2_5/modeling_kosmos2_5.py +10 -1
- transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +9 -3
- transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +9 -1
- transformers/models/lasr/configuration_lasr.py +4 -0
- transformers/models/lasr/modeling_lasr.py +3 -2
- transformers/models/lasr/modular_lasr.py +8 -1
- transformers/models/lasr/processing_lasr.py +0 -2
- transformers/models/layoutlm/modeling_layoutlm.py +5 -3
- transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +0 -1
- transformers/models/layoutlmv2/modeling_layoutlmv2.py +12 -0
- transformers/models/layoutlmv2/tokenization_layoutlmv2.py +1 -0
- transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +0 -1
- transformers/models/layoutlmv3/modeling_layoutlmv3.py +29 -5
- transformers/models/led/modeling_led.py +6 -0
- transformers/models/levit/modeling_levit.py +18 -0
- transformers/models/lfm2/modeling_lfm2.py +1 -1
- transformers/models/lfm2_moe/modeling_lfm2_moe.py +14 -4
- transformers/models/lfm2_moe/modular_lfm2_moe.py +5 -28
- transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
- transformers/models/lfm2_vl/modeling_lfm2_vl.py +11 -5
- transformers/models/lfm2_vl/modular_lfm2_vl.py +4 -2
- transformers/models/lfm2_vl/processing_lfm2_vl.py +82 -42
- transformers/models/lightglue/image_processing_lightglue_fast.py +1 -2
- transformers/models/lilt/modeling_lilt.py +19 -15
- transformers/models/llama/modeling_llama.py +1 -1
- transformers/models/llama4/image_processing_llama4_fast.py +1 -2
- transformers/models/llama4/modeling_llama4.py +8 -4
- transformers/models/llava/image_processing_llava_fast.py +0 -1
- transformers/models/llava/modeling_llava.py +12 -7
- transformers/models/llava_next/image_processing_llava_next_fast.py +0 -1
- transformers/models/llava_next/modeling_llava_next.py +7 -3
- transformers/models/llava_next_video/modeling_llava_next_video.py +7 -3
- transformers/models/llava_next_video/modular_llava_next_video.py +7 -3
- transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +0 -1
- transformers/models/llava_onevision/modeling_llava_onevision.py +7 -3
- transformers/models/llava_onevision/modular_llava_onevision.py +7 -4
- transformers/models/longcat_flash/modeling_longcat_flash.py +2 -1
- transformers/models/longcat_flash/modular_longcat_flash.py +1 -0
- transformers/models/longt5/modeling_longt5.py +0 -4
- transformers/models/m2m_100/modeling_m2m_100.py +10 -0
- transformers/models/mamba/modeling_mamba.py +2 -1
- transformers/models/mamba2/modeling_mamba2.py +24 -23
- transformers/models/marian/configuration_marian.py +1 -1
- transformers/models/marian/modeling_marian.py +3 -0
- transformers/models/markuplm/modeling_markuplm.py +5 -8
- transformers/models/mask2former/configuration_mask2former.py +3 -3
- transformers/models/mask2former/image_processing_mask2former_fast.py +1 -4
- transformers/models/mask2former/modeling_mask2former.py +9 -0
- transformers/models/maskformer/configuration_maskformer.py +3 -3
- transformers/models/maskformer/image_processing_maskformer_fast.py +1 -4
- transformers/models/maskformer/modeling_maskformer.py +9 -1
- transformers/models/maskformer/modeling_maskformer_swin.py +19 -15
- transformers/models/mbart/configuration_mbart.py +1 -0
- transformers/models/mbart/modeling_mbart.py +7 -0
- transformers/models/megatron_bert/modeling_megatron_bert.py +2 -0
- transformers/models/metaclip_2/modeling_metaclip_2.py +2 -0
- transformers/models/metaclip_2/modular_metaclip_2.py +2 -0
- transformers/models/mimi/modeling_mimi.py +25 -4
- transformers/models/minimax/modeling_minimax.py +16 -3
- transformers/models/minimax/modular_minimax.py +12 -1
- transformers/models/ministral/modeling_ministral.py +1 -1
- transformers/models/ministral3/modeling_ministral3.py +1 -1
- transformers/models/mistral/modeling_mistral.py +1 -1
- transformers/models/mistral3/modeling_mistral3.py +10 -4
- transformers/models/mistral3/modular_mistral3.py +3 -1
- transformers/models/mixtral/modeling_mixtral.py +12 -4
- transformers/models/mixtral/modular_mixtral.py +6 -2
- transformers/models/mlcd/modeling_mlcd.py +6 -0
- transformers/models/mlcd/modular_mlcd.py +4 -0
- transformers/models/mllama/modeling_mllama.py +13 -2
- transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +1 -2
- transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +4 -4
- transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +1 -2
- transformers/models/mobilebert/modeling_mobilebert.py +2 -0
- transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +0 -1
- transformers/models/mobilevit/image_processing_mobilevit.py +5 -5
- transformers/models/mobilevit/image_processing_mobilevit_fast.py +1 -2
- transformers/models/mobilevit/modeling_mobilevit.py +4 -0
- transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -0
- transformers/models/modernbert/modeling_modernbert.py +12 -1
- transformers/models/modernbert/modular_modernbert.py +12 -1
- transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -1
- transformers/models/modernbert_decoder/modular_modernbert_decoder.py +9 -1
- transformers/models/moonshine/modeling_moonshine.py +1 -1
- transformers/models/moshi/modeling_moshi.py +21 -51
- transformers/models/mpnet/modeling_mpnet.py +2 -0
- transformers/models/mra/modeling_mra.py +4 -1
- transformers/models/mt5/configuration_mt5.py +2 -3
- transformers/models/mt5/modeling_mt5.py +0 -10
- transformers/models/musicgen/modeling_musicgen.py +5 -9
- transformers/models/musicgen_melody/modeling_musicgen_melody.py +4 -0
- transformers/models/mvp/modeling_mvp.py +7 -0
- transformers/models/nanochat/modeling_nanochat.py +1 -1
- transformers/models/nemotron/modeling_nemotron.py +3 -3
- transformers/models/nllb_moe/configuration_nllb_moe.py +1 -0
- transformers/models/nllb_moe/modeling_nllb_moe.py +10 -0
- transformers/models/nougat/image_processing_nougat_fast.py +0 -1
- transformers/models/nougat/tokenization_nougat.py +11 -16
- transformers/models/nystromformer/modeling_nystromformer.py +7 -0
- transformers/models/olmo/modeling_olmo.py +1 -1
- transformers/models/olmo2/modeling_olmo2.py +1 -1
- transformers/models/olmo3/modeling_olmo3.py +1 -1
- transformers/models/olmoe/modeling_olmoe.py +12 -4
- transformers/models/olmoe/modular_olmoe.py +4 -2
- transformers/models/omdet_turbo/configuration_omdet_turbo.py +2 -2
- transformers/models/omdet_turbo/modeling_omdet_turbo.py +4 -0
- transformers/models/oneformer/configuration_oneformer.py +3 -3
- transformers/models/oneformer/modeling_oneformer.py +7 -38
- transformers/models/openai/modeling_openai.py +12 -0
- transformers/models/ovis2/image_processing_ovis2_fast.py +0 -1
- transformers/models/ovis2/modeling_ovis2.py +15 -3
- transformers/models/ovis2/modular_ovis2.py +8 -0
- transformers/models/owlv2/image_processing_owlv2_fast.py +0 -2
- transformers/models/owlv2/modeling_owlv2.py +7 -3
- transformers/models/owlv2/modular_owlv2.py +0 -2
- transformers/models/owlvit/modeling_owlvit.py +7 -3
- transformers/models/paddleocr_vl/image_processing_paddleocr_vl.py +3 -2
- transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +28 -14
- transformers/models/paddleocr_vl/modular_paddleocr_vl.py +22 -12
- transformers/models/paligemma/modeling_paligemma.py +25 -17
- transformers/models/parakeet/modeling_parakeet.py +5 -0
- transformers/models/parakeet/modular_parakeet.py +5 -0
- transformers/models/parakeet/{tokenization_parakeet_fast.py → tokenization_parakeet.py} +3 -3
- transformers/models/patchtsmixer/modeling_patchtsmixer.py +4 -0
- transformers/models/patchtst/modeling_patchtst.py +5 -4
- transformers/models/pe_audio/__init__.py +30 -0
- transformers/models/pe_audio/configuration_pe_audio.py +206 -0
- transformers/models/pe_audio/feature_extraction_pe_audio.py +162 -0
- transformers/models/pe_audio/modeling_pe_audio.py +820 -0
- transformers/models/pe_audio/modular_pe_audio.py +299 -0
- transformers/models/pe_audio/processing_pe_audio.py +24 -0
- transformers/models/pe_audio_video/__init__.py +29 -0
- transformers/models/pe_audio_video/configuration_pe_audio_video.py +225 -0
- transformers/models/pe_audio_video/modeling_pe_audio_video.py +972 -0
- transformers/models/pe_audio_video/modular_pe_audio_video.py +764 -0
- transformers/models/pe_audio_video/processing_pe_audio_video.py +25 -0
- transformers/models/pe_video/__init__.py +30 -0
- transformers/models/pe_video/configuration_pe_video.py +211 -0
- transformers/models/pe_video/modeling_pe_video.py +636 -0
- transformers/models/pe_video/modular_pe_video.py +219 -0
- transformers/models/pe_video/processing_pe_video.py +10 -0
- transformers/models/pe_video/video_processing_pe_video.py +66 -0
- transformers/models/pegasus/configuration_pegasus.py +1 -0
- transformers/models/pegasus/modeling_pegasus.py +3 -0
- transformers/models/pegasus_x/modeling_pegasus_x.py +1 -0
- transformers/models/perceiver/image_processing_perceiver_fast.py +0 -1
- transformers/models/perceiver/modeling_perceiver.py +5 -1
- transformers/models/perception_lm/image_processing_perception_lm_fast.py +0 -1
- transformers/models/perception_lm/modeling_perception_lm.py +7 -3
- transformers/models/perception_lm/modular_perception_lm.py +7 -3
- transformers/models/persimmon/modeling_persimmon.py +1 -1
- transformers/models/phi/modeling_phi.py +1 -1
- transformers/models/phi3/modeling_phi3.py +1 -1
- transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +4 -1
- transformers/models/phi4_multimodal/modular_phi4_multimodal.py +3 -0
- transformers/models/phi4_multimodal/processing_phi4_multimodal.py +0 -2
- transformers/models/phimoe/modeling_phimoe.py +12 -4
- transformers/models/phimoe/modular_phimoe.py +1 -1
- transformers/models/pix2struct/processing_pix2struct.py +0 -4
- transformers/models/pixio/__init__.py +30 -0
- transformers/models/pixio/configuration_pixio.py +151 -0
- transformers/models/pixio/modeling_pixio.py +507 -0
- transformers/models/pixio/modular_pixio.py +404 -0
- transformers/models/pixtral/modeling_pixtral.py +1 -1
- transformers/models/pixtral/processing_pixtral.py +3 -1
- transformers/models/plbart/configuration_plbart.py +1 -0
- transformers/models/plbart/modeling_plbart.py +7 -0
- transformers/models/plbart/modular_plbart.py +6 -0
- transformers/models/poolformer/image_processing_poolformer_fast.py +0 -1
- transformers/models/poolformer/modeling_poolformer.py +11 -1
- transformers/models/pop2piano/configuration_pop2piano.py +0 -1
- transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +2 -3
- transformers/models/prophetnet/modeling_prophetnet.py +2 -1
- transformers/models/qwen2/modeling_qwen2.py +1 -1
- transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +104 -64
- transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +58 -18
- transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +18 -5
- transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +26 -22
- transformers/models/qwen2_audio/modeling_qwen2_audio.py +2 -2
- transformers/models/qwen2_moe/modeling_qwen2_moe.py +12 -4
- transformers/models/qwen2_vl/image_processing_qwen2_vl.py +3 -2
- transformers/models/qwen2_vl/modeling_qwen2_vl.py +17 -4
- transformers/models/qwen3/modeling_qwen3.py +1 -1
- transformers/models/qwen3_moe/modeling_qwen3_moe.py +12 -4
- transformers/models/qwen3_next/modeling_qwen3_next.py +4 -6
- transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +4 -0
- transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +92 -46
- transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +48 -4
- transformers/models/qwen3_vl/configuration_qwen3_vl.py +5 -5
- transformers/models/qwen3_vl/modeling_qwen3_vl.py +17 -4
- transformers/models/qwen3_vl/modular_qwen3_vl.py +21 -10
- transformers/models/qwen3_vl/processing_qwen3_vl.py +3 -3
- transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +94 -112
- transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +32 -81
- transformers/models/rag/configuration_rag.py +0 -8
- transformers/models/rag/modeling_rag.py +7 -9
- transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +3 -2
- transformers/models/reformer/modeling_reformer.py +9 -1
- transformers/models/regnet/modeling_regnet.py +4 -0
- transformers/models/rembert/modeling_rembert.py +7 -1
- transformers/models/resnet/modeling_resnet.py +8 -3
- transformers/models/roberta/modeling_roberta.py +3 -0
- transformers/models/roberta/modular_roberta.py +3 -0
- transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +3 -0
- transformers/models/roc_bert/modeling_roc_bert.py +3 -0
- transformers/models/rt_detr/configuration_rt_detr.py +1 -1
- transformers/models/rt_detr/modeling_rt_detr.py +4 -0
- transformers/models/rt_detr/modeling_rt_detr_resnet.py +8 -3
- transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +2 -3
- transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +7 -0
- transformers/models/rt_detr_v2/modular_rt_detr_v2.py +8 -3
- transformers/models/rwkv/modeling_rwkv.py +1 -1
- transformers/models/sam/configuration_sam.py +1 -0
- transformers/models/sam/image_processing_sam_fast.py +0 -1
- transformers/models/sam/modeling_sam.py +4 -1
- transformers/models/sam2/configuration_sam2.py +1 -1
- transformers/models/sam2/modeling_sam2.py +5 -1
- transformers/models/sam2/modular_sam2.py +5 -1
- transformers/models/sam2_video/modeling_sam2_video.py +51 -43
- transformers/models/sam2_video/modular_sam2_video.py +31 -18
- transformers/models/sam3/configuration_sam3.py +21 -1
- transformers/models/sam3/modeling_sam3.py +23 -0
- transformers/models/sam3_tracker/modeling_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker/modular_sam3_tracker.py +2 -0
- transformers/models/sam3_tracker_video/configuration_sam3_tracker_video.py +25 -0
- transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +26 -15
- transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +25 -2
- transformers/models/sam3_video/configuration_sam3_video.py +14 -0
- transformers/models/sam3_video/modeling_sam3_video.py +3 -3
- transformers/models/sam3_video/processing_sam3_video.py +1 -1
- transformers/models/sam_hq/configuration_sam_hq.py +1 -0
- transformers/models/sam_hq/modeling_sam_hq.py +26 -23
- transformers/models/seamless_m4t/modeling_seamless_m4t.py +27 -11
- transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py +6 -0
- transformers/models/seed_oss/modeling_seed_oss.py +1 -1
- transformers/models/segformer/image_processing_segformer_fast.py +0 -1
- transformers/models/segformer/modeling_segformer.py +2 -2
- transformers/models/segformer/modular_segformer.py +0 -1
- transformers/models/shieldgemma2/modeling_shieldgemma2.py +1 -0
- transformers/models/siglip/modeling_siglip.py +24 -2
- transformers/models/siglip2/modeling_siglip2.py +63 -41
- transformers/models/smollm3/modeling_smollm3.py +1 -1
- transformers/models/smolvlm/modeling_smolvlm.py +5 -1
- transformers/models/smolvlm/video_processing_smolvlm.py +0 -1
- transformers/models/speech_to_text/modeling_speech_to_text.py +10 -0
- transformers/models/speecht5/modeling_speecht5.py +28 -0
- transformers/models/splinter/modeling_splinter.py +9 -3
- transformers/models/squeezebert/modeling_squeezebert.py +2 -0
- transformers/models/stablelm/modeling_stablelm.py +1 -1
- transformers/models/starcoder2/modeling_starcoder2.py +1 -1
- transformers/models/superglue/image_processing_superglue_fast.py +1 -2
- transformers/models/superpoint/image_processing_superpoint_fast.py +1 -2
- transformers/models/swiftformer/modeling_swiftformer.py +4 -0
- transformers/models/swin/modeling_swin.py +16 -12
- transformers/models/swin2sr/image_processing_swin2sr_fast.py +0 -1
- transformers/models/swin2sr/modeling_swin2sr.py +49 -33
- transformers/models/swinv2/modeling_swinv2.py +41 -33
- transformers/models/switch_transformers/modeling_switch_transformers.py +2 -8
- transformers/models/switch_transformers/modular_switch_transformers.py +2 -8
- transformers/models/t5/configuration_t5.py +7 -1
- transformers/models/t5/modeling_t5.py +1 -7
- transformers/models/t5gemma/modeling_t5gemma.py +1 -1
- transformers/models/t5gemma2/configuration_t5gemma2.py +6 -42
- transformers/models/t5gemma2/modeling_t5gemma2.py +13 -4
- transformers/models/t5gemma2/modular_t5gemma2.py +289 -4
- transformers/models/table_transformer/configuration_table_transformer.py +1 -1
- transformers/models/table_transformer/modeling_table_transformer.py +1 -1
- transformers/models/textnet/image_processing_textnet_fast.py +0 -1
- transformers/models/timesfm/modeling_timesfm.py +12 -0
- transformers/models/timesfm/modular_timesfm.py +12 -0
- transformers/models/timm_backbone/modeling_timm_backbone.py +13 -9
- transformers/models/timm_wrapper/configuration_timm_wrapper.py +3 -0
- transformers/models/timm_wrapper/modeling_timm_wrapper.py +19 -13
- transformers/models/trocr/modeling_trocr.py +1 -2
- transformers/models/tvp/configuration_tvp.py +5 -1
- transformers/models/tvp/modeling_tvp.py +4 -4
- transformers/models/udop/configuration_udop.py +1 -0
- transformers/models/udop/modeling_udop.py +3 -7
- transformers/models/umt5/configuration_umt5.py +2 -2
- transformers/models/umt5/modeling_umt5.py +0 -6
- transformers/models/vaultgemma/modeling_vaultgemma.py +1 -1
- transformers/models/video_llama_3/image_processing_video_llama_3.py +3 -2
- transformers/models/video_llama_3/modeling_video_llama_3.py +12 -1
- transformers/models/video_llama_3/modular_video_llama_3.py +10 -1
- transformers/models/video_llava/modeling_video_llava.py +7 -3
- transformers/models/vilt/configuration_vilt.py +2 -2
- transformers/models/vilt/modeling_vilt.py +7 -0
- transformers/models/vipllava/modeling_vipllava.py +7 -3
- transformers/models/visual_bert/modeling_visual_bert.py +2 -0
- transformers/models/vitmatte/configuration_vitmatte.py +1 -1
- transformers/models/vitmatte/image_processing_vitmatte_fast.py +0 -1
- transformers/models/vitmatte/modeling_vitmatte.py +4 -0
- transformers/models/vitpose/configuration_vitpose.py +1 -1
- transformers/models/vitpose/image_processing_vitpose_fast.py +0 -1
- transformers/models/voxtral/modeling_voxtral.py +2 -2
- transformers/models/voxtral/modular_voxtral.py +2 -2
- transformers/models/wav2vec2_bert/modeling_wav2vec2_bert.py +16 -10
- transformers/models/wav2vec2_bert/modular_wav2vec2_bert.py +7 -0
- transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py +21 -11
- transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py +21 -11
- transformers/models/whisper/generation_whisper.py +1 -0
- transformers/models/whisper/modeling_whisper.py +5 -3
- transformers/models/x_clip/modeling_x_clip.py +2 -0
- transformers/models/xcodec/modeling_xcodec.py +5 -0
- transformers/models/xglm/modeling_xglm.py +10 -0
- transformers/models/xlm/modeling_xlm.py +13 -14
- transformers/models/xlm_roberta/modeling_xlm_roberta.py +109 -106
- transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +3 -0
- transformers/models/xlnet/modeling_xlnet.py +3 -1
- transformers/models/xmod/modeling_xmod.py +3 -0
- transformers/models/yoso/modeling_yoso.py +4 -1
- transformers/models/zamba/modeling_zamba.py +2 -1
- transformers/models/zamba2/modeling_zamba2.py +3 -2
- transformers/models/zoedepth/configuration_zoedepth.py +1 -1
- transformers/models/zoedepth/image_processing_zoedepth_fast.py +1 -3
- transformers/models/zoedepth/modeling_zoedepth.py +7 -0
- transformers/pipelines/__init__.py +9 -6
- transformers/pipelines/automatic_speech_recognition.py +20 -12
- transformers/pipelines/base.py +1 -1
- transformers/pipelines/document_question_answering.py +1 -1
- transformers/pipelines/question_answering.py +1 -1
- transformers/pipelines/text_to_audio.py +2 -2
- transformers/processing_utils.py +127 -56
- transformers/quantizers/auto.py +2 -4
- transformers/quantizers/base.py +9 -64
- transformers/quantizers/quantizer_aqlm.py +1 -18
- transformers/quantizers/quantizer_auto_round.py +1 -10
- transformers/quantizers/quantizer_awq.py +3 -8
- transformers/quantizers/quantizer_bitnet.py +1 -6
- transformers/quantizers/quantizer_bnb_4bit.py +9 -49
- transformers/quantizers/quantizer_bnb_8bit.py +9 -19
- transformers/quantizers/quantizer_compressed_tensors.py +1 -4
- transformers/quantizers/quantizer_eetq.py +2 -12
- transformers/quantizers/quantizer_fbgemm_fp8.py +5 -14
- transformers/quantizers/quantizer_finegrained_fp8.py +15 -10
- transformers/quantizers/quantizer_fp_quant.py +4 -4
- transformers/quantizers/quantizer_gptq.py +1 -4
- transformers/quantizers/quantizer_higgs.py +2 -6
- transformers/quantizers/quantizer_mxfp4.py +2 -28
- transformers/quantizers/quantizer_quanto.py +14 -14
- transformers/quantizers/quantizer_spqr.py +3 -8
- transformers/quantizers/quantizer_torchao.py +28 -124
- transformers/quantizers/quantizer_vptq.py +1 -10
- transformers/testing_utils.py +28 -12
- transformers/tokenization_mistral_common.py +3 -2
- transformers/tokenization_utils_base.py +3 -2
- transformers/tokenization_utils_tokenizers.py +25 -2
- transformers/trainer.py +24 -2
- transformers/trainer_callback.py +8 -0
- transformers/trainer_seq2seq.py +4 -0
- transformers/training_args.py +8 -10
- transformers/utils/__init__.py +4 -0
- transformers/utils/attention_visualizer.py +4 -4
- transformers/utils/auto_docstring.py +34 -25
- transformers/utils/generic.py +20 -0
- transformers/utils/import_utils.py +51 -9
- transformers/utils/kernel_config.py +71 -18
- transformers/utils/quantization_config.py +8 -8
- transformers/video_processing_utils.py +16 -12
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/METADATA +5 -6
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/RECORD +671 -632
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/WHEEL +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/entry_points.txt +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/licenses/LICENSE +0 -0
- {transformers-5.0.0rc1.dist-info → transformers-5.0.0rc2.dist-info}/top_level.txt +0 -0
transformers/pipelines/automatic_speech_recognition.py
CHANGED
@@ -198,11 +198,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             self.type = "seq2seq_whisper"
         elif model.__class__.__name__ in MODEL_FOR_SPEECH_SEQ_2_SEQ_MAPPING_NAMES.values():
             self.type = "seq2seq"
-        elif (
-            feature_extractor._processor_class
-            and feature_extractor._processor_class.endswith("WithLM")
-            and decoder is not None
-        ):
+        elif decoder is not None:
             self.decoder = decoder
             self.type = "ctc_with_lm"
         else:
@@ -350,6 +346,20 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):

         return preprocess_params, forward_params, postprocess_params

+    @property
+    def _align_to(self):
+        """Sample stride per output."""
+        # XXX: Carefully, this variable will not exist in `seq2seq` setting.
+        # Currently chunking is not possible at this level for `seq2seq` so
+        # it's ok.
+        align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
+        if self.model.config.model_type == "lasr_ctc":
+            # TODO: find a standard for that but not easy because input length -> mel length depends on the feature extractor
+            # specific way of doing it
+            # means the model take mel features as input, we align according to the hop length
+            align_to *= self.feature_extractor.hop_length
+        return align_to
+
     def preprocess(self, inputs, chunk_length_s=0, stride_length_s=None):
         if isinstance(inputs, str):
             if inputs.startswith("http://") or inputs.startswith("https://"):
@@ -444,10 +454,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             if isinstance(stride_length_s, (int, float)):
                 stride_length_s = [stride_length_s, stride_length_s]

-
-            # Currently chunking is not possible at this level for `seq2seq` so
-            # it's ok.
-            align_to = getattr(self.model.config, "inputs_to_logits_ratio", 1)
+            align_to = self._align_to
             chunk_len = int(round(chunk_length_s * self.feature_extractor.sampling_rate / align_to) * align_to)
             stride_left = int(round(stride_length_s[0] * self.feature_extractor.sampling_rate / align_to) * align_to)
             stride_right = int(round(stride_length_s[1] * self.feature_extractor.sampling_rate / align_to) * align_to)
@@ -567,7 +574,7 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
             # Send stride to `postprocess`.
             # it needs to be handled there where
             # the pieces are to be concatenated.
-            ratio = 1 / self.
+            ratio = 1 / self._align_to
             if isinstance(stride, tuple):
                 out["stride"] = rescale_stride([stride], ratio)[0]
             else:
@@ -650,11 +657,12 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):

         if return_timestamps and self.type not in {"seq2seq", "seq2seq_whisper"}:
             chunks = []
+            align_to = self._align_to
             for item in offsets:
-                start = item["start_offset"] *
+                start = item["start_offset"] * align_to
                 start /= self.feature_extractor.sampling_rate

-                stop = item["end_offset"] *
+                stop = item["end_offset"] * align_to
                 stop /= self.feature_extractor.sampling_rate

                 chunks.append({"text": item[return_timestamps], "timestamp": (start, stop)})
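
Note on the hunks above (not part of the diff): the new `_align_to` property is the number of input audio samples represented by one model output step — `inputs_to_logits_ratio`, additionally multiplied by the feature extractor's `hop_length` for mel-input models such as `lasr_ctc` — and the same ratio now drives chunk sizing, stride rescaling, and timestamp conversion. A minimal, self-contained sketch of the timestamp conversion; the function name and the numbers are invented for illustration:

def offsets_to_seconds(offsets, align_to, sampling_rate):
    # Convert model-output offsets (counted in output frames) into (start, stop) seconds,
    # mirroring the postprocess logic shown in the diff above.
    chunks = []
    for item in offsets:
        start = item["start_offset"] * align_to / sampling_rate
        stop = item["end_offset"] * align_to / sampling_rate
        chunks.append({"text": item["text"], "timestamp": (start, stop)})
    return chunks

# Hypothetical model emitting one output frame per 320 input samples at 16 kHz:
print(offsets_to_seconds([{"text": "hello", "start_offset": 10, "end_offset": 35}], 320, 16_000))
# -> [{'text': 'hello', 'timestamp': (0.2, 0.7)}]
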
transformers/pipelines/base.py
CHANGED
@@ -884,7 +884,7 @@ class Pipeline(_ScikitCompat, PushToHubMixin):
         # NOTE: _prepare_generation_config creates a deep copy of the generation config before updating it,
         # and returns all kwargs that were not used to update the generation config
         prepared_generation_config, kwargs = self.model._prepare_generation_config(
-            generation_config=default_pipeline_generation_config,
+            generation_config=default_pipeline_generation_config, **kwargs
         )
         self.generation_config = prepared_generation_config
         # if the `max_new_tokens` is set to the pipeline default, but `max_length` is set to a non-default
transformers/pipelines/document_question_answering.py
CHANGED
@@ -201,7 +201,7 @@ class DocumentQuestionAnsweringPipeline(ChunkPipeline):
             postprocess_params["top_k"] = top_k
         if max_answer_len is not None:
             if max_answer_len < 1:
-                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
+                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
             postprocess_params["max_answer_len"] = max_answer_len
         if handle_impossible_answer is not None:
             postprocess_params["handle_impossible_answer"] = handle_impossible_answer

transformers/pipelines/question_answering.py
CHANGED
@@ -328,7 +328,7 @@ class QuestionAnsweringPipeline(ChunkPipeline):
             postprocess_params["top_k"] = top_k
         if max_answer_len is not None:
             if max_answer_len < 1:
-                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len}")
+                raise ValueError(f"max_answer_len parameter should be >= 1 (got {max_answer_len})")
             postprocess_params["max_answer_len"] = max_answer_len
         if handle_impossible_answer is not None:
             postprocess_params["handle_impossible_answer"] = handle_impossible_answer
transformers/pipelines/text_to_audio.py
CHANGED
@@ -117,8 +117,8 @@ class TextToAudioPipeline(Pipeline):
             else vocoder
         )

-        if self.model.config.model_type in ["musicgen"]:
-            # MusicGen expect to use
+        if self.model.config.model_type in ["musicgen", "speecht5"]:
+            # MusicGen and SpeechT5 expect to use their tokenizer instead
             self.processor = None

         self.sampling_rate = sampling_rate
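
Note (not part of the diff): with this change the pipeline also drops its processor for SpeechT5, like MusicGen, and tokenizes text directly. A hedged usage sketch of the text-to-audio pipeline under that setup; the checkpoint name and the zero speaker embedding are placeholders for illustration, not values taken from this release:

import torch
from transformers import pipeline

pipe = pipeline("text-to-audio", model="microsoft/speecht5_tts")
# SpeechT5 expects a (1, 512) speaker embedding; a zero vector is used here only as a placeholder.
speaker_embeddings = torch.zeros(1, 512)
out = pipe("Hello, world!", forward_params={"speaker_embeddings": speaker_embeddings})
print(out["sampling_rate"], out["audio"].shape)
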
transformers/processing_utils.py
CHANGED
|
@@ -129,6 +129,26 @@ MODALITY_TO_BASE_CLASS_MAPPING = {
|
|
|
129
129
|
"video_processor": "BaseVideoProcessor",
|
|
130
130
|
}
|
|
131
131
|
|
|
132
|
+
|
|
133
|
+
def _get_modality_for_attribute(attribute_name: str) -> str:
|
|
134
|
+
"""
|
|
135
|
+
Get the canonical modality type for a given attribute name.
|
|
136
|
+
|
|
137
|
+
For example:
|
|
138
|
+
- "image_processor" -> "image_processor"
|
|
139
|
+
- "encoder_image_processor" -> "image_processor"
|
|
140
|
+
- "text_tokenizer" -> "tokenizer"
|
|
141
|
+
- "my_feature_extractor" -> "feature_extractor"
|
|
142
|
+
"""
|
|
143
|
+
for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys():
|
|
144
|
+
if modality in attribute_name:
|
|
145
|
+
return modality
|
|
146
|
+
raise ValueError(
|
|
147
|
+
f"Cannot determine modality for attribute '{attribute_name}'. "
|
|
148
|
+
f"Attribute name must contain one of: {list(MODALITY_TO_AUTOPROCESSOR_MAPPING.keys())}"
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
|
|
132
152
|
if sys.version_info >= (3, 11):
|
|
133
153
|
Unpack = typing.Unpack
|
|
134
154
|
else:
|
|
@@ -663,8 +683,10 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
663
683
|
mismatch between expected and actual class, an error is raise. Otherwise, the proper retrieved class
|
|
664
684
|
is returned.
|
|
665
685
|
"""
|
|
666
|
-
|
|
667
|
-
|
|
686
|
+
# If the exact attribute name is not in the mapping, use its canonical modality
|
|
687
|
+
# (e.g., "encoder_tokenizer" -> "tokenizer")
|
|
688
|
+
if argument_name not in MODALITY_TO_BASE_CLASS_MAPPING:
|
|
689
|
+
argument_name = _get_modality_for_attribute(argument_name)
|
|
668
690
|
class_name = MODALITY_TO_BASE_CLASS_MAPPING.get(argument_name)
|
|
669
691
|
if isinstance(class_name, tuple):
|
|
670
692
|
proper_class = tuple(self.get_possibly_dynamic_module(n) for n in class_name if n is not None)
|
|
@@ -695,24 +717,17 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
695
717
|
# extra attributes to be kept
|
|
696
718
|
attrs_to_save += ["auto_map"]
|
|
697
719
|
|
|
720
|
+
# Remove tokenizers from output - they have their own vocab files and are saved separately.
|
|
721
|
+
# All other sub-processors (image_processor, feature_extractor, etc.) are kept in processor_config.json.
|
|
698
722
|
for attribute in self.__class__.get_attributes():
|
|
699
|
-
if
|
|
700
|
-
|
|
723
|
+
if attribute in output:
|
|
724
|
+
modality = _get_modality_for_attribute(attribute)
|
|
725
|
+
if modality == "tokenizer":
|
|
726
|
+
del output[attribute]
|
|
701
727
|
|
|
702
728
|
if "chat_template" in output:
|
|
703
729
|
del output["chat_template"]
|
|
704
730
|
|
|
705
|
-
def save_public_processor_class(dictionary):
|
|
706
|
-
# make sure private name "_processor_class" is correctly
|
|
707
|
-
# saved as "processor_class"
|
|
708
|
-
_processor_class = dictionary.pop("_processor_class", None)
|
|
709
|
-
if _processor_class is not None:
|
|
710
|
-
dictionary["processor_class"] = _processor_class
|
|
711
|
-
for value in dictionary.values():
|
|
712
|
-
if isinstance(value, dict):
|
|
713
|
-
save_public_processor_class(value)
|
|
714
|
-
return dictionary
|
|
715
|
-
|
|
716
731
|
def cast_array_to_list(dictionary):
|
|
717
732
|
"""
|
|
718
733
|
Numpy arrays are not serialiazable but can be in pre-processing dicts.
|
|
@@ -743,7 +758,6 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
743
758
|
)
|
|
744
759
|
}
|
|
745
760
|
output = cast_array_to_list(output)
|
|
746
|
-
output = save_public_processor_class(output)
|
|
747
761
|
output["processor_class"] = self.__class__.__name__
|
|
748
762
|
|
|
749
763
|
return output
|
|
@@ -816,16 +830,17 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
816
830
|
|
|
817
831
|
for attribute_name in self.get_attributes():
|
|
818
832
|
attribute = getattr(self, attribute_name)
|
|
819
|
-
if hasattr(attribute, "_set_processor_class"):
|
|
820
|
-
attribute._set_processor_class(self.__class__.__name__)
|
|
821
833
|
|
|
822
|
-
|
|
823
|
-
|
|
824
|
-
|
|
825
|
-
|
|
826
|
-
|
|
827
|
-
|
|
828
|
-
|
|
834
|
+
modality = _get_modality_for_attribute(attribute_name)
|
|
835
|
+
is_primary = attribute_name == modality
|
|
836
|
+
if modality == "tokenizer":
|
|
837
|
+
attribute._set_processor_class(self.__class__.__name__)
|
|
838
|
+
# Save the tokenizer in its own vocab file. The other attributes are saved as part of `processor_config.json`
|
|
839
|
+
if is_primary:
|
|
840
|
+
attribute.save_pretrained(save_directory)
|
|
841
|
+
else:
|
|
842
|
+
# if a model has multiple tokenizers, save the additional tokenizers in their own folders.
|
|
843
|
+
attribute.save_pretrained(os.path.join(save_directory, attribute_name))
|
|
829
844
|
elif attribute._auto_class is not None:
|
|
830
845
|
custom_object_save(attribute, save_directory, config=attribute)
|
|
831
846
|
|
|
@@ -1393,9 +1408,10 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
1393
1408
|
if token is not None:
|
|
1394
1409
|
kwargs["token"] = token
|
|
1395
1410
|
|
|
1396
|
-
|
|
1397
|
-
processor_dict,
|
|
1398
|
-
|
|
1411
|
+
# Get processor_dict first so we can use it to instantiate non-tokenizer sub-processors
|
|
1412
|
+
processor_dict, instantiation_kwargs = cls.get_processor_dict(pretrained_model_name_or_path, **kwargs)
|
|
1413
|
+
args = cls._get_arguments_from_pretrained(pretrained_model_name_or_path, processor_dict, **kwargs)
|
|
1414
|
+
return cls.from_args_and_dict(args, processor_dict, **instantiation_kwargs)
|
|
1399
1415
|
|
|
1400
1416
|
@classmethod
|
|
1401
1417
|
def get_attributes(cls):
|
|
@@ -1405,7 +1421,7 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
1405
1421
|
# don't treat audio_tokenizer as an attribute
|
|
1406
1422
|
if sub_processor_type == "audio_tokenizer":
|
|
1407
1423
|
continue
|
|
1408
|
-
if
|
|
1424
|
+
if any(modality in sub_processor_type for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
|
|
1409
1425
|
attributes.append(sub_processor_type)
|
|
1410
1426
|
|
|
1411
1427
|
# Legacy processors may not override `__init__` and instead expose modality
|
|
@@ -1419,7 +1435,7 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
1419
1435
|
inferred_attribute = attribute_name[: -len("_class")]
|
|
1420
1436
|
if inferred_attribute == "audio_tokenizer":
|
|
1421
1437
|
continue
|
|
1422
|
-
if
|
|
1438
|
+
if any(modality in inferred_attribute for modality in MODALITY_TO_AUTOPROCESSOR_MAPPING.keys()):
|
|
1423
1439
|
attributes.append(inferred_attribute)
|
|
1424
1440
|
|
|
1425
1441
|
return attributes
|
|
@@ -1447,49 +1463,104 @@ class ProcessorMixin(PushToHubMixin):
|
|
|
1447
1463
|
cls._auto_class = auto_class
|
|
1448
1464
|
|
|
1449
1465
|
@classmethod
|
|
1450
|
-
def
|
|
1466
|
+
def _load_tokenizer_from_pretrained(
|
|
1467
|
+
+        cls, sub_processor_type, pretrained_model_name_or_path, subfolder="", **kwargs
+    ):
+        auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
+        is_primary = sub_processor_type == "tokenizer"
+
+        if is_primary:
+            # Primary tokenizer: load from root
+            tokenizer = auto_processor_class.from_pretrained(
+                pretrained_model_name_or_path, subfolder=subfolder, **kwargs
+            )
+        else:
+            # Additional tokenizer: load from subfolder (e.g., "decoder_tokenizer")
+            tokenizer_subfolder = os.path.join(subfolder, sub_processor_type) if subfolder else sub_processor_type
+            tokenizer = auto_processor_class.from_pretrained(
+                pretrained_model_name_or_path, subfolder=tokenizer_subfolder, **kwargs
+            )
+        return tokenizer
+
+    @classmethod
+    def _get_arguments_from_pretrained(cls, pretrained_model_name_or_path, processor_dict=None, **kwargs):
         """
         Identify and instantiate the subcomponents of Processor classes, such as image processors, tokenizers,
         and feature extractors. This method inspects the processor's `__init__` signature to identify parameters
         that correspond to known modality types (image_processor, tokenizer, feature_extractor, etc.) or contain
-
-
-
+        modality names in their attribute name.
+
+        For tokenizers: Uses the appropriate Auto class (AutoTokenizer) to load via `.from_pretrained()`.
+        Additional tokenizers (e.g., "decoder_tokenizer") are loaded from subfolders.
+
+        For other sub-processors (image_processor, feature_extractor, etc.): Primary ones are loaded via
+        Auto class. Additional ones are instantiated from the config stored in processor_config.json
+        (passed as processor_dict).
+
+        Args:
+            pretrained_model_name_or_path: Path or model id to load from.
+            processor_dict: Optional dict containing processor config (from processor_config.json).
+                Required when loading additional non-tokenizer sub-processors.
         """
         args = []
+        processor_dict = processor_dict if processor_dict is not None else {}
+        # Remove subfolder from kwargs to avoid duplicate keyword arguments
+        subfolder = kwargs.pop("subfolder", "")
+
         # get args from processor init signature
         sub_processors = cls.get_attributes()
         for sub_processor_type in sub_processors:
-
-
-
-                tokenizer = TokenizersBackend.from_pretrained(pretrained_model_name_or_path, **kwargs)
-                if "token_type_ids" in tokenizer.model_input_names:
-                    tokenizer.model_input_names.remove("token_type_ids")
-                args.append(tokenizer)
-            elif "PixtralProcessor" in cls.__name__ and "tokenizer" in sub_processor_type:
-                from tokenizers import pre_tokenizers
+            modality = _get_modality_for_attribute(sub_processor_type)
+            is_primary = sub_processor_type == modality

-
+            if (
+                "tokenizer" in sub_processor_type
+            ):  # This is only necessary for the checkpoing in test_procesing_mistral3.py which has no config.json and
+                # the tokenizer_config.json references LlamaTokenizerFast. TODO: update the config on the hub.
+                if "PixtralProcessor" in cls.__name__:
+                    from .tokenization_utils_tokenizers import TokenizersBackend

-
-
-
-
+                    tokenizer = TokenizersBackend.from_pretrained(pretrained_model_name_or_path, **kwargs)
+                else:
+                    tokenizer = cls._load_tokenizer_from_pretrained(
+                        sub_processor_type, pretrained_model_name_or_path, subfolder=subfolder, **kwargs
+                    )
                 args.append(tokenizer)
-            elif
+            elif is_primary:
+                # Primary non-tokenizer sub-processor: load via Auto class
                 auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING[sub_processor_type]
-                sub_processor = auto_processor_class.from_pretrained(pretrained_model_name_or_path, **kwargs)
-                args.append(sub_processor)
-            elif "tokenizer" in sub_processor_type:
-                # Special case: tokenizer-like parameters not in the mapping (e.g., "protein_tokenizer")
-                # Load using AutoTokenizer with subfolder
-                auto_processor_class = MODALITY_TO_AUTOPROCESSOR_MAPPING["tokenizer"]
                 sub_processor = auto_processor_class.from_pretrained(
-                    pretrained_model_name_or_path, subfolder=
+                    pretrained_model_name_or_path, subfolder=subfolder, **kwargs
                 )
                 args.append(sub_processor)

+            elif sub_processor_type in processor_dict:
+                # Additional non-tokenizer sub-processor: instantiate from config in processor_dict
+                sub_processor_config = processor_dict[sub_processor_type]
+                if isinstance(sub_processor_config, dict):
+                    # Determine the class to instantiate
+                    # Image processors have 'image_processor_type', feature extractors have 'feature_extractor_type'
+                    type_key = f"{modality}_type"
+                    class_name = sub_processor_config.get(type_key)
+                    if class_name is None:
+                        raise ValueError(
+                            f"Cannot instantiate {sub_processor_type}: missing '{type_key}' in config. "
+                            f"Config keys: {list(sub_processor_config.keys())}"
+                        )
+                    processor_class = cls.get_possibly_dynamic_module(class_name)
+                    sub_processor = processor_class(**sub_processor_config)
+                    args.append(sub_processor)
+                else:
+                    raise ValueError(
+                        f"Expected dict for {sub_processor_type} in processor_config.json, "
+                        f"got {type(sub_processor_config)}"
+                    )
+            else:
+                raise ValueError(
+                    f"Cannot find config for {sub_processor_type} in processor_config.json. "
+                    f"Available keys: {list(processor_dict.keys())}"
+                )
+
         return args

     @staticmethod
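Taken together, the new `_load_tokenizer_from_pretrained` helper and the rewritten `_get_arguments_from_pretrained` mean a processor can carry several tokenizers: the primary one is resolved from the repository root, any additional tokenizer attribute (for example `decoder_tokenizer`) is resolved from a subfolder of the same name, and additional non-tokenizer sub-processors are rebuilt from `processor_config.json`. The sketch below illustrates the resulting user-facing behaviour; the repo id `my-org/my-model` and its layout are placeholders for illustration, not values taken from this diff.

    from transformers import AutoProcessor, AutoTokenizer

    # Assumed (hypothetical) repo layout:
    #   my-org/my-model/
    #     tokenizer.json           <- primary tokenizer, resolved from the repo root
    #     decoder_tokenizer/       <- additional tokenizer, resolved from its subfolder
    #       tokenizer.json
    #     processor_config.json    <- configs for additional non-tokenizer sub-processors

    # What the helper does for the primary tokenizer attribute...
    primary = AutoTokenizer.from_pretrained("my-org/my-model")
    # ...and for an additional tokenizer attribute named "decoder_tokenizer".
    decoder = AutoTokenizer.from_pretrained("my-org/my-model", subfolder="decoder_tokenizer")

    # A processor class whose __init__ declares both attributes gets them wired up automatically:
    processor = AutoProcessor.from_pretrained("my-org/my-model")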
transformers/quantizers/auto.py CHANGED

@@ -302,7 +302,7 @@ def register_quantizer(name: str):
     return register_quantizer_fn


-def get_hf_quantizer(config, quantization_config, dtype, device_map, weights_only, user_agent):
+def get_hf_quantizer(config, quantization_config, device_map, weights_only, user_agent):
     pre_quantized = hasattr(config, "quantization_config")
     if pre_quantized and not AutoHfQuantizer.supports_quant_method(config.quantization_config):
         pre_quantized = False
@@ -324,11 +324,9 @@ def get_hf_quantizer(config, quantization_config, dtype, device_map, weights_onl

     if hf_quantizer is not None:
         hf_quantizer.validate_environment(
-            dtype=dtype,
             device_map=device_map,
             weights_only=weights_only,
         )
-        dtype = hf_quantizer.update_dtype(dtype)
         device_map = hf_quantizer.update_device_map(device_map)
         config = hf_quantizer.update_tp_plan(config)
         config = hf_quantizer.update_ep_plan(config)
@@ -337,4 +335,4 @@ def get_hf_quantizer(config, quantization_config, dtype, device_map, weights_onl
     if not getattr(hf_quantizer.quantization_config, "dequantize", False):
         quant_method = hf_quantizer.quantization_config.quant_method
         user_agent["quant"] = getattr(quant_method, "value", quant_method)
-    return hf_quantizer, config,
+    return hf_quantizer, config, device_map
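The dtype plumbing has moved out of `get_hf_quantizer`: it no longer receives a `dtype`, no longer passes `dtype=dtype` to `validate_environment`, no longer calls `update_dtype`, and now returns a three-element tuple. A minimal sketch of an internal call site under the new signature; the argument variables are placeholders standing in for whatever the caller has in scope, not additional API introduced by this diff.

    # Hypothetical call site illustrating the new argument list and return value.
    hf_quantizer, config, device_map = get_hf_quantizer(
        config,               # model config, possibly carrying config.quantization_config
        quantization_config,  # user-supplied quantization config, or None
        device_map,           # may be rewritten by hf_quantizer.update_device_map
        weights_only,         # forwarded to hf_quantizer.validate_environment
        user_agent,           # dict; gains a "quant" entry unless dequantize is requested
    )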
transformers/quantizers/base.py CHANGED

@@ -31,16 +31,6 @@ else:
 logger = logging.get_logger(__file__)


-def _assign_original_dtype(module, original_dtype):
-    # not very nice in a recursive function but it avoids a circular import
-    from ..modeling_utils import PreTrainedModel
-
-    for child in module.children():
-        if isinstance(child, PreTrainedModel):
-            child.config._pre_quantization_dtype = original_dtype
-        _assign_original_dtype(child, original_dtype)
-
-
 def get_keys_to_not_convert(model) -> list:
     r"""
     Function to automatically detect keys to not convert for usage like quantization. For example for CausalLM modules
@@ -118,33 +108,7 @@ class HfQuantizer(ABC):
         """
         return device_map

-    def adjust_target_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        """
-        Override this method if you want to adjust the `target_dtype` variable used in `from_pretrained`
-        to compute the device_map in case the device_map is a `str`. E.g. for bitsandbytes we force-set `target_dtype`
-        to `torch.int8` and for 4-bit we pass a custom enum `accelerate.CustomDtype.int4`.
-
-        Args:
-            dtype (`torch.dtype`, *optional*):
-                The dtype that is used to compute the device_map.
-        """
-        return dtype
-
     def param_element_size(self, model: "PreTrainedModel", param_name: str, param: "torch.Tensor") -> float:
-        "Return the element size (in bytes) for `param_name`."
-
-        if self.param_needs_quantization(model, param_name):
-            from accelerate.utils import CustomDtype
-
-            mapping = {
-                torch.int8: 1,
-                CustomDtype.INT4: 0.5,
-                CustomDtype.FP8: 1,
-                CustomDtype.INT2: 0.25,
-            }
-            # The value passed is actually not used when the method is overridden
-            if (custom_dtype := self.adjust_target_dtype(torch.float16)) in mapping:
-                return mapping[custom_dtype]
         return param.element_size()

     def adjust_max_memory(self, max_memory: dict[str, int | str]) -> dict[str, int | str]:
@@ -176,7 +140,7 @@ class HfQuantizer(ABC):
     def _process_model_before_weight_loading(self, model, **kwargs):
         return model

-    def preprocess_model(self, model: "PreTrainedModel",
+    def preprocess_model(self, model: "PreTrainedModel", dtype=None, **kwargs):
         """
         Setting model attributes and/or converting model before weights loading. At this point
         the model should be initialized on the meta device so you can freely manipulate the skeleton
@@ -194,14 +158,6 @@ class HfQuantizer(ABC):
         self._convert_model_for_quantization(model)
         self._process_model_before_weight_loading(model, **kwargs)

-        # We store the original dtype for quantized models as we cannot easily retrieve it
-        # once the weights have been quantized
-        # Note that once you have loaded a quantized model, you can't change its dtype so this will
-        # remain a single source of truth
-        original_dtype = dtype if dtype is not None else torch.get_default_dtype()
-        config._pre_quantization_dtype = original_dtype
-        _assign_original_dtype(model, original_dtype)
-
     def _process_model_after_weight_loading(self, model: "PreTrainedModel", **kwargs):
         return model

@@ -231,34 +187,25 @@ class HfQuantizer(ABC):
         del model.hf_quantizer
         if hasattr(model.config, "quantization_config"):
             del model.config.quantization_config
-        if hasattr(model.config, "_pre_quantization_dtype"):
-            del model.config._pre_quantization_dtype
         if hasattr(model, "quantization_method"):
             del model.quantization_method
         model.is_quantized = False

-    def dequantize(self, model):
+    def dequantize(self, model, dtype=None):
         """
         Potentially dequantize the model to retrieve the original model, with some loss in accuracy / performance.
         Note not all quantization schemes support this.
         """
-
+        if dtype is None:
+            # using the same dtype we used to load the model. If we don't do that, we might have issues with modules we didn't quantize.
+            # or we need to upcast everything to the same dtype
+            dtype = model.config.dtype
+        model = self._dequantize(model, dtype=dtype)
         self.remove_quantization_config(model)

         return model

-    def
-        """
-        The factor to be used in `caching_allocator_warmup` to get the number of bytes to pre-allocate to warm up accelerator.
-        A factor of 2 means we allocate all bytes in the empty model (since we allocate in fp16), a factor of 4 means
-        we allocate half the memory of the weights residing in the empty model, etc...
-        """
-        # By default we return 4, i.e. half the model size (this corresponds to the case where the model is not
-        # really pre-processed, i.e. we do not have the info that weights are going to be 8 bits before actual
-        # weight loading)
-        return 4
-
-    def _dequantize(self, model):
+    def _dequantize(self, model, dtype=None):
         raise NotImplementedError(
             f"{self.quantization_config.quant_method} has no implementation of `dequantize`, please raise an issue on GitHub."
         )
@@ -313,15 +260,13 @@
     def is_trainable(self): ...

     def _convert_model_for_quantization(self, model):
-        from accelerate import init_empty_weights
-
         for name, module in model.named_modules():
             module_class_name = module.__class__.__name__
             if module_class_name in MODULES_TO_PATCH_FOR_QUANTIZATION and (
                 self.quantization_config.quant_method
                 in MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name]["quantization_methods"]
             ):
-                with init_empty_weights():
+                with torch.device("meta"):
                     parent_module, name = get_module_from_name(model, name)
                     parent_module._modules[name] = MODULES_TO_PATCH_FOR_QUANTIZATION[module_class_name]["module_name"](
                         model.config.get_text_config()
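On the `HfQuantizer` base class the net effect is that dtype bookkeeping (`adjust_target_dtype`, `_pre_quantization_dtype`, the warm-up factor helper) disappears, while `dequantize` gains an optional `dtype` that falls back to the dtype the model was loaded with. A minimal usage sketch, assuming `quantizer` is an `HfQuantizer` subclass whose backend actually implements `_dequantize` (not all schemes do):

    import torch

    # Default: reuse the dtype recorded on the model config (model.config.dtype).
    model = quantizer.dequantize(model)

    # Or request an explicit dtype for the dequantized weights.
    model = quantizer.dequantize(model, dtype=torch.bfloat16)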
transformers/quantizers/quantizer_aqlm.py CHANGED

@@ -23,13 +23,10 @@ if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel

 from ..integrations import replace_with_aqlm_linear
-from ..utils import is_accelerate_available, is_aqlm_available, is_torch_available, logging
+from ..utils import is_accelerate_available, is_aqlm_available, logging
 from ..utils.quantization_config import QuantizationConfigMixin


-if is_torch_available():
-    import torch
-
 logger = logging.get_logger(__name__)


@@ -50,20 +47,6 @@ class AqlmHfQuantizer(HfQuantizer):
         if not is_aqlm_available():
             raise ImportError("Using `aqlm` quantization requires AQLM: `pip install aqlm[gpu,cpu]`")

-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            if torch.cuda.is_available():
-                dtype = torch.float16
-                logger.info(
-                    "CUDA available. Assuming AQLM inference on GPU and loading the model in `torch.float16`. To overwrite it, set `dtype` manually."
-                )
-            else:
-                dtype = torch.float32
-                logger.info(
-                    "CUDA is unavailable. Assuming AQLM inference on CPU and loading the model in `torch.float32`. To overwrite it, set `dtype` manually."
-                )
-        return dtype
-
     def _process_model_before_weight_loading(
         self,
         model: "PreTrainedModel",
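Since AQLM's automatic dtype default (float16 when CUDA is available, float32 otherwise) is removed along with `update_dtype`, loading code that relied on that implicit choice may want to pass `dtype` explicitly. A sketch only; the checkpoint id is a placeholder for any AQLM-quantized model, not a value taken from this diff.

    import torch
    from transformers import AutoModelForCausalLM

    # "some-org/some-aqlm-model" is a stand-in for an AQLM-quantized checkpoint.
    model = AutoModelForCausalLM.from_pretrained(
        "some-org/some-aqlm-model",
        dtype=torch.float16,  # previously picked automatically when dtype was None
        device_map="auto",
    )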
transformers/quantizers/quantizer_auto_round.py CHANGED

@@ -19,13 +19,10 @@ from .base import HfQuantizer
 if TYPE_CHECKING:
     from ..modeling_utils import PreTrainedModel

-from ..utils import is_auto_round_available, is_torch_available, logging
+from ..utils import is_auto_round_available, logging
 from ..utils.quantization_config import QuantizationConfigMixin


-if is_torch_available():
-    import torch
-
 logger = logging.get_logger(__name__)


@@ -47,12 +44,6 @@ class AutoRoundQuantizer(HfQuantizer):
             "Loading an AutoRound quantized model requires auto-round library (`pip install 'auto-round>=0.5'`)"
         )

-    def update_dtype(self, dtype: "torch.dtype") -> "torch.dtype":
-        if dtype is None:
-            dtype = torch.bfloat16
-            logger.info("Loading the model in `torch.bfloat16`. To overwrite it, set `dtype` manually.")
-        return dtype
-
     def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
         if model.__class__.main_input_name != "input_ids":
             logger.warning("AutoRound offers only limited support for models that are not strictly text-based.")
transformers/quantizers/quantizer_awq.py CHANGED

@@ -53,10 +53,7 @@ class AwqQuantizer(HfQuantizer):
             raise ImportError("Loading an AWQ quantized model requires accelerate (`pip install accelerate`)")

     def update_dtype(self, dtype):
-        if dtype is None:
-            dtype = torch.float16
-            logger.info("Loading the model in `torch.float16`. To overwrite it, set `dtype` manually.")
-        elif dtype == torch.bfloat16 and (torch.cuda.is_available() or torch.xpu.is_available()):
+        if dtype == torch.bfloat16 and (torch.cuda.is_available() or torch.xpu.is_available()):
             logger.warning(
                 "`torch.bfloat16` is not supported for AWQ CUDA/XPU kernels yet. Casting to `torch.float16`."
             )
@@ -65,13 +62,11 @@ class AwqQuantizer(HfQuantizer):
             logger.warning("We suggest you to set `dtype=torch.float16` for better efficiency on CUDA/XPU with AWQ.")
         return dtype

-    def _process_model_before_weight_loading(
-        self, model: "PreTrainedModel", keep_in_fp32_modules: list[str] | None = None, **kwargs
-    ):
+    def _process_model_before_weight_loading(self, model: "PreTrainedModel", **kwargs):
         from ..integrations import replace_quantization_scales, replace_with_awq_linear

         self.modules_to_not_convert = self.get_modules_to_not_convert(
-            model, self.quantization_config.modules_to_not_convert,
+            model, self.quantization_config.modules_to_not_convert, model._keep_in_fp32_modules, add_default_skips=True
         )

         model = replace_with_awq_linear(