transformers 5.0.0rc3-py3-none-any.whl → 5.1.0-py3-none-any.whl

This diff covers publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Files changed (1021)
  1. transformers/__init__.py +4 -11
  2. transformers/activations.py +2 -2
  3. transformers/backbone_utils.py +326 -0
  4. transformers/cache_utils.py +11 -2
  5. transformers/cli/serve.py +11 -8
  6. transformers/configuration_utils.py +1 -69
  7. transformers/conversion_mapping.py +146 -26
  8. transformers/convert_slow_tokenizer.py +6 -4
  9. transformers/core_model_loading.py +207 -118
  10. transformers/dependency_versions_check.py +0 -1
  11. transformers/dependency_versions_table.py +7 -8
  12. transformers/file_utils.py +0 -2
  13. transformers/generation/candidate_generator.py +1 -2
  14. transformers/generation/continuous_batching/cache.py +40 -38
  15. transformers/generation/continuous_batching/cache_manager.py +3 -16
  16. transformers/generation/continuous_batching/continuous_api.py +94 -406
  17. transformers/generation/continuous_batching/input_ouputs.py +464 -0
  18. transformers/generation/continuous_batching/requests.py +54 -17
  19. transformers/generation/continuous_batching/scheduler.py +77 -95
  20. transformers/generation/logits_process.py +10 -5
  21. transformers/generation/stopping_criteria.py +1 -2
  22. transformers/generation/utils.py +75 -95
  23. transformers/image_processing_utils.py +0 -3
  24. transformers/image_processing_utils_fast.py +17 -18
  25. transformers/image_transforms.py +44 -13
  26. transformers/image_utils.py +0 -5
  27. transformers/initialization.py +57 -0
  28. transformers/integrations/__init__.py +10 -24
  29. transformers/integrations/accelerate.py +47 -11
  30. transformers/integrations/deepspeed.py +145 -3
  31. transformers/integrations/executorch.py +2 -6
  32. transformers/integrations/finegrained_fp8.py +142 -7
  33. transformers/integrations/flash_attention.py +2 -7
  34. transformers/integrations/hub_kernels.py +18 -7
  35. transformers/integrations/moe.py +226 -106
  36. transformers/integrations/mxfp4.py +47 -34
  37. transformers/integrations/peft.py +488 -176
  38. transformers/integrations/tensor_parallel.py +641 -581
  39. transformers/masking_utils.py +153 -9
  40. transformers/modeling_flash_attention_utils.py +1 -2
  41. transformers/modeling_utils.py +359 -358
  42. transformers/models/__init__.py +6 -0
  43. transformers/models/afmoe/configuration_afmoe.py +14 -4
  44. transformers/models/afmoe/modeling_afmoe.py +8 -8
  45. transformers/models/afmoe/modular_afmoe.py +7 -7
  46. transformers/models/aimv2/configuration_aimv2.py +2 -7
  47. transformers/models/aimv2/modeling_aimv2.py +26 -24
  48. transformers/models/aimv2/modular_aimv2.py +8 -12
  49. transformers/models/albert/configuration_albert.py +8 -1
  50. transformers/models/albert/modeling_albert.py +3 -3
  51. transformers/models/align/configuration_align.py +8 -5
  52. transformers/models/align/modeling_align.py +22 -24
  53. transformers/models/altclip/configuration_altclip.py +4 -6
  54. transformers/models/altclip/modeling_altclip.py +30 -26
  55. transformers/models/apertus/configuration_apertus.py +5 -7
  56. transformers/models/apertus/modeling_apertus.py +4 -4
  57. transformers/models/apertus/modular_apertus.py +8 -10
  58. transformers/models/arcee/configuration_arcee.py +5 -7
  59. transformers/models/arcee/modeling_arcee.py +4 -4
  60. transformers/models/aria/configuration_aria.py +11 -21
  61. transformers/models/aria/modeling_aria.py +39 -36
  62. transformers/models/aria/modular_aria.py +33 -39
  63. transformers/models/audio_spectrogram_transformer/modeling_audio_spectrogram_transformer.py +3 -3
  64. transformers/models/audioflamingo3/modeling_audioflamingo3.py +39 -30
  65. transformers/models/audioflamingo3/modular_audioflamingo3.py +41 -27
  66. transformers/models/auto/auto_factory.py +8 -6
  67. transformers/models/auto/configuration_auto.py +22 -0
  68. transformers/models/auto/image_processing_auto.py +17 -13
  69. transformers/models/auto/modeling_auto.py +15 -0
  70. transformers/models/auto/processing_auto.py +9 -18
  71. transformers/models/auto/tokenization_auto.py +17 -15
  72. transformers/models/autoformer/modeling_autoformer.py +2 -1
  73. transformers/models/aya_vision/configuration_aya_vision.py +4 -0
  74. transformers/models/aya_vision/modeling_aya_vision.py +29 -62
  75. transformers/models/aya_vision/modular_aya_vision.py +20 -45
  76. transformers/models/bamba/configuration_bamba.py +17 -7
  77. transformers/models/bamba/modeling_bamba.py +23 -55
  78. transformers/models/bamba/modular_bamba.py +19 -54
  79. transformers/models/bark/configuration_bark.py +2 -1
  80. transformers/models/bark/modeling_bark.py +24 -10
  81. transformers/models/bart/configuration_bart.py +9 -4
  82. transformers/models/bart/modeling_bart.py +9 -12
  83. transformers/models/beit/configuration_beit.py +2 -4
  84. transformers/models/beit/image_processing_beit_fast.py +3 -3
  85. transformers/models/beit/modeling_beit.py +14 -9
  86. transformers/models/bert/configuration_bert.py +12 -1
  87. transformers/models/bert/modeling_bert.py +6 -30
  88. transformers/models/bert_generation/configuration_bert_generation.py +17 -1
  89. transformers/models/bert_generation/modeling_bert_generation.py +6 -6
  90. transformers/models/big_bird/configuration_big_bird.py +12 -8
  91. transformers/models/big_bird/modeling_big_bird.py +0 -15
  92. transformers/models/bigbird_pegasus/configuration_bigbird_pegasus.py +9 -8
  93. transformers/models/bigbird_pegasus/modeling_bigbird_pegasus.py +9 -7
  94. transformers/models/biogpt/configuration_biogpt.py +8 -1
  95. transformers/models/biogpt/modeling_biogpt.py +4 -8
  96. transformers/models/biogpt/modular_biogpt.py +1 -5
  97. transformers/models/bit/configuration_bit.py +2 -4
  98. transformers/models/bit/modeling_bit.py +6 -5
  99. transformers/models/bitnet/configuration_bitnet.py +5 -7
  100. transformers/models/bitnet/modeling_bitnet.py +3 -4
  101. transformers/models/bitnet/modular_bitnet.py +3 -4
  102. transformers/models/blenderbot/configuration_blenderbot.py +8 -4
  103. transformers/models/blenderbot/modeling_blenderbot.py +4 -4
  104. transformers/models/blenderbot_small/configuration_blenderbot_small.py +8 -4
  105. transformers/models/blenderbot_small/modeling_blenderbot_small.py +4 -4
  106. transformers/models/blip/configuration_blip.py +9 -9
  107. transformers/models/blip/modeling_blip.py +55 -37
  108. transformers/models/blip_2/configuration_blip_2.py +2 -1
  109. transformers/models/blip_2/modeling_blip_2.py +81 -56
  110. transformers/models/bloom/configuration_bloom.py +5 -1
  111. transformers/models/bloom/modeling_bloom.py +2 -1
  112. transformers/models/blt/configuration_blt.py +23 -12
  113. transformers/models/blt/modeling_blt.py +20 -14
  114. transformers/models/blt/modular_blt.py +70 -10
  115. transformers/models/bridgetower/configuration_bridgetower.py +7 -1
  116. transformers/models/bridgetower/image_processing_bridgetower_fast.py +6 -6
  117. transformers/models/bridgetower/modeling_bridgetower.py +29 -15
  118. transformers/models/bros/configuration_bros.py +24 -17
  119. transformers/models/camembert/configuration_camembert.py +8 -1
  120. transformers/models/camembert/modeling_camembert.py +6 -6
  121. transformers/models/canine/configuration_canine.py +4 -1
  122. transformers/models/chameleon/configuration_chameleon.py +5 -7
  123. transformers/models/chameleon/image_processing_chameleon_fast.py +5 -5
  124. transformers/models/chameleon/modeling_chameleon.py +82 -36
  125. transformers/models/chinese_clip/configuration_chinese_clip.py +10 -7
  126. transformers/models/chinese_clip/modeling_chinese_clip.py +28 -29
  127. transformers/models/clap/configuration_clap.py +4 -8
  128. transformers/models/clap/modeling_clap.py +21 -22
  129. transformers/models/clip/configuration_clip.py +4 -1
  130. transformers/models/clip/image_processing_clip_fast.py +9 -0
  131. transformers/models/clip/modeling_clip.py +25 -22
  132. transformers/models/clipseg/configuration_clipseg.py +4 -1
  133. transformers/models/clipseg/modeling_clipseg.py +27 -25
  134. transformers/models/clipseg/processing_clipseg.py +11 -3
  135. transformers/models/clvp/configuration_clvp.py +14 -2
  136. transformers/models/clvp/modeling_clvp.py +19 -30
  137. transformers/models/codegen/configuration_codegen.py +4 -3
  138. transformers/models/codegen/modeling_codegen.py +2 -1
  139. transformers/models/cohere/configuration_cohere.py +5 -7
  140. transformers/models/cohere/modeling_cohere.py +4 -4
  141. transformers/models/cohere/modular_cohere.py +3 -3
  142. transformers/models/cohere2/configuration_cohere2.py +6 -8
  143. transformers/models/cohere2/modeling_cohere2.py +4 -4
  144. transformers/models/cohere2/modular_cohere2.py +9 -11
  145. transformers/models/cohere2_vision/configuration_cohere2_vision.py +5 -1
  146. transformers/models/cohere2_vision/image_processing_cohere2_vision_fast.py +3 -3
  147. transformers/models/cohere2_vision/modeling_cohere2_vision.py +24 -25
  148. transformers/models/cohere2_vision/modular_cohere2_vision.py +20 -20
  149. transformers/models/colqwen2/modeling_colqwen2.py +7 -6
  150. transformers/models/colqwen2/modular_colqwen2.py +7 -6
  151. transformers/models/conditional_detr/configuration_conditional_detr.py +19 -46
  152. transformers/models/conditional_detr/image_processing_conditional_detr.py +3 -4
  153. transformers/models/conditional_detr/image_processing_conditional_detr_fast.py +28 -14
  154. transformers/models/conditional_detr/modeling_conditional_detr.py +794 -942
  155. transformers/models/conditional_detr/modular_conditional_detr.py +901 -3
  156. transformers/models/convbert/configuration_convbert.py +11 -7
  157. transformers/models/convnext/configuration_convnext.py +2 -4
  158. transformers/models/convnext/image_processing_convnext_fast.py +2 -2
  159. transformers/models/convnext/modeling_convnext.py +7 -6
  160. transformers/models/convnextv2/configuration_convnextv2.py +2 -4
  161. transformers/models/convnextv2/modeling_convnextv2.py +7 -6
  162. transformers/models/cpmant/configuration_cpmant.py +4 -0
  163. transformers/models/csm/configuration_csm.py +9 -15
  164. transformers/models/csm/modeling_csm.py +3 -3
  165. transformers/models/ctrl/configuration_ctrl.py +16 -0
  166. transformers/models/ctrl/modeling_ctrl.py +13 -25
  167. transformers/models/cwm/configuration_cwm.py +5 -7
  168. transformers/models/cwm/modeling_cwm.py +4 -4
  169. transformers/models/d_fine/configuration_d_fine.py +10 -56
  170. transformers/models/d_fine/modeling_d_fine.py +728 -868
  171. transformers/models/d_fine/modular_d_fine.py +335 -412
  172. transformers/models/dab_detr/configuration_dab_detr.py +22 -48
  173. transformers/models/dab_detr/modeling_dab_detr.py +11 -7
  174. transformers/models/dac/modeling_dac.py +1 -1
  175. transformers/models/data2vec/configuration_data2vec_audio.py +4 -1
  176. transformers/models/data2vec/configuration_data2vec_text.py +11 -2
  177. transformers/models/data2vec/modeling_data2vec_audio.py +3 -3
  178. transformers/models/data2vec/modeling_data2vec_text.py +6 -6
  179. transformers/models/data2vec/modeling_data2vec_vision.py +4 -2
  180. transformers/models/dbrx/configuration_dbrx.py +11 -3
  181. transformers/models/dbrx/modeling_dbrx.py +6 -6
  182. transformers/models/dbrx/modular_dbrx.py +6 -6
  183. transformers/models/deberta/configuration_deberta.py +6 -0
  184. transformers/models/deberta_v2/configuration_deberta_v2.py +6 -0
  185. transformers/models/decision_transformer/configuration_decision_transformer.py +3 -1
  186. transformers/models/decision_transformer/modeling_decision_transformer.py +3 -3
  187. transformers/models/deepseek_v2/configuration_deepseek_v2.py +7 -10
  188. transformers/models/deepseek_v2/modeling_deepseek_v2.py +7 -8
  189. transformers/models/deepseek_v2/modular_deepseek_v2.py +8 -10
  190. transformers/models/deepseek_v3/configuration_deepseek_v3.py +7 -10
  191. transformers/models/deepseek_v3/modeling_deepseek_v3.py +7 -7
  192. transformers/models/deepseek_v3/modular_deepseek_v3.py +6 -5
  193. transformers/models/deepseek_vl/configuration_deepseek_vl.py +4 -0
  194. transformers/models/deepseek_vl/image_processing_deepseek_vl.py +2 -2
  195. transformers/models/deepseek_vl/image_processing_deepseek_vl_fast.py +5 -5
  196. transformers/models/deepseek_vl/modeling_deepseek_vl.py +17 -12
  197. transformers/models/deepseek_vl/modular_deepseek_vl.py +4 -0
  198. transformers/models/deepseek_vl_hybrid/configuration_deepseek_vl_hybrid.py +4 -0
  199. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid.py +2 -2
  200. transformers/models/deepseek_vl_hybrid/image_processing_deepseek_vl_hybrid_fast.py +6 -6
  201. transformers/models/deepseek_vl_hybrid/modeling_deepseek_vl_hybrid.py +68 -24
  202. transformers/models/deepseek_vl_hybrid/modular_deepseek_vl_hybrid.py +70 -19
  203. transformers/models/deformable_detr/configuration_deformable_detr.py +22 -45
  204. transformers/models/deformable_detr/image_processing_deformable_detr_fast.py +25 -11
  205. transformers/models/deformable_detr/modeling_deformable_detr.py +410 -607
  206. transformers/models/deformable_detr/modular_deformable_detr.py +1385 -3
  207. transformers/models/deit/modeling_deit.py +11 -7
  208. transformers/models/depth_anything/configuration_depth_anything.py +12 -42
  209. transformers/models/depth_anything/modeling_depth_anything.py +5 -3
  210. transformers/models/depth_pro/image_processing_depth_pro_fast.py +2 -2
  211. transformers/models/depth_pro/modeling_depth_pro.py +8 -4
  212. transformers/models/detr/configuration_detr.py +18 -49
  213. transformers/models/detr/image_processing_detr_fast.py +11 -11
  214. transformers/models/detr/modeling_detr.py +695 -734
  215. transformers/models/dia/configuration_dia.py +4 -7
  216. transformers/models/dia/generation_dia.py +8 -17
  217. transformers/models/dia/modeling_dia.py +7 -7
  218. transformers/models/dia/modular_dia.py +4 -4
  219. transformers/models/diffllama/configuration_diffllama.py +5 -7
  220. transformers/models/diffllama/modeling_diffllama.py +3 -8
  221. transformers/models/diffllama/modular_diffllama.py +2 -7
  222. transformers/models/dinat/configuration_dinat.py +2 -4
  223. transformers/models/dinat/modeling_dinat.py +7 -6
  224. transformers/models/dinov2/configuration_dinov2.py +2 -4
  225. transformers/models/dinov2/modeling_dinov2.py +9 -8
  226. transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py +2 -4
  227. transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py +9 -8
  228. transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py +6 -7
  229. transformers/models/dinov3_convnext/configuration_dinov3_convnext.py +2 -4
  230. transformers/models/dinov3_convnext/modeling_dinov3_convnext.py +2 -3
  231. transformers/models/dinov3_vit/configuration_dinov3_vit.py +2 -4
  232. transformers/models/dinov3_vit/image_processing_dinov3_vit_fast.py +2 -2
  233. transformers/models/dinov3_vit/modeling_dinov3_vit.py +5 -6
  234. transformers/models/dinov3_vit/modular_dinov3_vit.py +5 -6
  235. transformers/models/distilbert/configuration_distilbert.py +8 -1
  236. transformers/models/distilbert/modeling_distilbert.py +3 -3
  237. transformers/models/doge/configuration_doge.py +17 -7
  238. transformers/models/doge/modeling_doge.py +4 -4
  239. transformers/models/doge/modular_doge.py +20 -10
  240. transformers/models/donut/image_processing_donut_fast.py +4 -4
  241. transformers/models/dots1/configuration_dots1.py +16 -7
  242. transformers/models/dots1/modeling_dots1.py +4 -4
  243. transformers/models/dpr/configuration_dpr.py +19 -1
  244. transformers/models/dpt/configuration_dpt.py +23 -65
  245. transformers/models/dpt/image_processing_dpt_fast.py +5 -5
  246. transformers/models/dpt/modeling_dpt.py +19 -15
  247. transformers/models/dpt/modular_dpt.py +4 -4
  248. transformers/models/edgetam/configuration_edgetam.py +1 -1
  249. transformers/models/edgetam/modeling_edgetam.py +53 -53
  250. transformers/models/edgetam/modular_edgetam.py +5 -7
  251. transformers/models/edgetam_video/modeling_edgetam_video.py +55 -56
  252. transformers/models/edgetam_video/modular_edgetam_video.py +9 -9
  253. transformers/models/efficientloftr/image_processing_efficientloftr_fast.py +4 -3
  254. transformers/models/efficientloftr/modeling_efficientloftr.py +19 -9
  255. transformers/models/efficientnet/image_processing_efficientnet_fast.py +2 -2
  256. transformers/models/electra/configuration_electra.py +13 -2
  257. transformers/models/electra/modeling_electra.py +6 -6
  258. transformers/models/emu3/configuration_emu3.py +12 -10
  259. transformers/models/emu3/modeling_emu3.py +84 -47
  260. transformers/models/emu3/modular_emu3.py +77 -39
  261. transformers/models/encoder_decoder/configuration_encoder_decoder.py +12 -1
  262. transformers/models/encoder_decoder/modeling_encoder_decoder.py +20 -24
  263. transformers/models/eomt/configuration_eomt.py +12 -13
  264. transformers/models/eomt/image_processing_eomt_fast.py +3 -3
  265. transformers/models/eomt/modeling_eomt.py +3 -3
  266. transformers/models/eomt/modular_eomt.py +17 -17
  267. transformers/models/eomt_dinov3/__init__.py +28 -0
  268. transformers/models/eomt_dinov3/configuration_eomt_dinov3.py +204 -0
  269. transformers/models/eomt_dinov3/modeling_eomt_dinov3.py +1376 -0
  270. transformers/models/eomt_dinov3/modular_eomt_dinov3.py +454 -0
  271. transformers/models/ernie/configuration_ernie.py +24 -2
  272. transformers/models/ernie/modeling_ernie.py +6 -30
  273. transformers/models/ernie4_5/configuration_ernie4_5.py +5 -7
  274. transformers/models/ernie4_5/modeling_ernie4_5.py +4 -4
  275. transformers/models/ernie4_5_moe/configuration_ernie4_5_moe.py +7 -10
  276. transformers/models/ernie4_5_moe/modeling_ernie4_5_moe.py +4 -4
  277. transformers/models/ernie4_5_vl_moe/configuration_ernie4_5_vl_moe.py +17 -6
  278. transformers/models/ernie4_5_vl_moe/modeling_ernie4_5_vl_moe.py +229 -188
  279. transformers/models/ernie4_5_vl_moe/modular_ernie4_5_vl_moe.py +79 -55
  280. transformers/models/esm/configuration_esm.py +9 -11
  281. transformers/models/esm/modeling_esm.py +3 -3
  282. transformers/models/esm/modeling_esmfold.py +1 -6
  283. transformers/models/esm/openfold_utils/protein.py +2 -3
  284. transformers/models/evolla/configuration_evolla.py +21 -8
  285. transformers/models/evolla/modeling_evolla.py +11 -7
  286. transformers/models/evolla/modular_evolla.py +5 -1
  287. transformers/models/exaone4/configuration_exaone4.py +8 -5
  288. transformers/models/exaone4/modeling_exaone4.py +4 -4
  289. transformers/models/exaone4/modular_exaone4.py +11 -8
  290. transformers/models/exaone_moe/__init__.py +27 -0
  291. transformers/models/exaone_moe/configuration_exaone_moe.py +235 -0
  292. transformers/models/exaone_moe/modeling_exaone_moe.py +665 -0
  293. transformers/models/exaone_moe/modular_exaone_moe.py +373 -0
  294. transformers/models/falcon/configuration_falcon.py +9 -1
  295. transformers/models/falcon/modeling_falcon.py +3 -8
  296. transformers/models/falcon_h1/configuration_falcon_h1.py +17 -8
  297. transformers/models/falcon_h1/modeling_falcon_h1.py +22 -54
  298. transformers/models/falcon_h1/modular_falcon_h1.py +21 -52
  299. transformers/models/falcon_mamba/configuration_falcon_mamba.py +5 -1
  300. transformers/models/falcon_mamba/modeling_falcon_mamba.py +18 -26
  301. transformers/models/falcon_mamba/modular_falcon_mamba.py +4 -0
  302. transformers/models/fast_vlm/configuration_fast_vlm.py +10 -1
  303. transformers/models/fast_vlm/modeling_fast_vlm.py +37 -64
  304. transformers/models/fast_vlm/modular_fast_vlm.py +146 -35
  305. transformers/models/fastspeech2_conformer/configuration_fastspeech2_conformer.py +0 -1
  306. transformers/models/flaubert/configuration_flaubert.py +10 -4
  307. transformers/models/flaubert/modeling_flaubert.py +1 -1
  308. transformers/models/flava/configuration_flava.py +4 -3
  309. transformers/models/flava/image_processing_flava_fast.py +4 -4
  310. transformers/models/flava/modeling_flava.py +36 -28
  311. transformers/models/flex_olmo/configuration_flex_olmo.py +11 -14
  312. transformers/models/flex_olmo/modeling_flex_olmo.py +4 -4
  313. transformers/models/flex_olmo/modular_flex_olmo.py +11 -14
  314. transformers/models/florence2/configuration_florence2.py +4 -0
  315. transformers/models/florence2/modeling_florence2.py +57 -32
  316. transformers/models/florence2/modular_florence2.py +48 -26
  317. transformers/models/fnet/configuration_fnet.py +6 -1
  318. transformers/models/focalnet/configuration_focalnet.py +2 -4
  319. transformers/models/focalnet/modeling_focalnet.py +10 -7
  320. transformers/models/fsmt/configuration_fsmt.py +12 -16
  321. transformers/models/funnel/configuration_funnel.py +8 -0
  322. transformers/models/fuyu/configuration_fuyu.py +5 -8
  323. transformers/models/fuyu/image_processing_fuyu_fast.py +5 -4
  324. transformers/models/fuyu/modeling_fuyu.py +24 -23
  325. transformers/models/gemma/configuration_gemma.py +5 -7
  326. transformers/models/gemma/modeling_gemma.py +4 -4
  327. transformers/models/gemma/modular_gemma.py +5 -7
  328. transformers/models/gemma2/configuration_gemma2.py +5 -7
  329. transformers/models/gemma2/modeling_gemma2.py +4 -4
  330. transformers/models/gemma2/modular_gemma2.py +8 -10
  331. transformers/models/gemma3/configuration_gemma3.py +28 -22
  332. transformers/models/gemma3/image_processing_gemma3_fast.py +2 -2
  333. transformers/models/gemma3/modeling_gemma3.py +37 -33
  334. transformers/models/gemma3/modular_gemma3.py +46 -42
  335. transformers/models/gemma3n/configuration_gemma3n.py +35 -22
  336. transformers/models/gemma3n/modeling_gemma3n.py +86 -58
  337. transformers/models/gemma3n/modular_gemma3n.py +112 -75
  338. transformers/models/git/configuration_git.py +5 -7
  339. transformers/models/git/modeling_git.py +31 -41
  340. transformers/models/glm/configuration_glm.py +7 -9
  341. transformers/models/glm/modeling_glm.py +4 -4
  342. transformers/models/glm4/configuration_glm4.py +7 -9
  343. transformers/models/glm4/modeling_glm4.py +4 -4
  344. transformers/models/glm46v/configuration_glm46v.py +4 -0
  345. transformers/models/glm46v/image_processing_glm46v.py +5 -2
  346. transformers/models/glm46v/image_processing_glm46v_fast.py +2 -2
  347. transformers/models/glm46v/modeling_glm46v.py +91 -46
  348. transformers/models/glm46v/modular_glm46v.py +4 -0
  349. transformers/models/glm4_moe/configuration_glm4_moe.py +17 -7
  350. transformers/models/glm4_moe/modeling_glm4_moe.py +4 -4
  351. transformers/models/glm4_moe/modular_glm4_moe.py +17 -7
  352. transformers/models/glm4_moe_lite/configuration_glm4_moe_lite.py +8 -10
  353. transformers/models/glm4_moe_lite/modeling_glm4_moe_lite.py +7 -7
  354. transformers/models/glm4_moe_lite/modular_glm4_moe_lite.py +8 -10
  355. transformers/models/glm4v/configuration_glm4v.py +12 -8
  356. transformers/models/glm4v/image_processing_glm4v.py +5 -2
  357. transformers/models/glm4v/image_processing_glm4v_fast.py +2 -2
  358. transformers/models/glm4v/modeling_glm4v.py +120 -63
  359. transformers/models/glm4v/modular_glm4v.py +82 -50
  360. transformers/models/glm4v_moe/configuration_glm4v_moe.py +18 -6
  361. transformers/models/glm4v_moe/modeling_glm4v_moe.py +115 -63
  362. transformers/models/glm4v_moe/modular_glm4v_moe.py +23 -12
  363. transformers/models/glm_image/configuration_glm_image.py +26 -20
  364. transformers/models/glm_image/image_processing_glm_image.py +1 -1
  365. transformers/models/glm_image/image_processing_glm_image_fast.py +5 -7
  366. transformers/models/glm_image/modeling_glm_image.py +337 -236
  367. transformers/models/glm_image/modular_glm_image.py +415 -255
  368. transformers/models/glm_image/processing_glm_image.py +65 -17
  369. transformers/{pipelines/deprecated → models/glm_ocr}/__init__.py +15 -2
  370. transformers/models/glm_ocr/configuration_glm_ocr.py +312 -0
  371. transformers/models/glm_ocr/modeling_glm_ocr.py +1633 -0
  372. transformers/models/glm_ocr/modular_glm_ocr.py +428 -0
  373. transformers/models/glmasr/modeling_glmasr.py +34 -28
  374. transformers/models/glmasr/modular_glmasr.py +23 -11
  375. transformers/models/glpn/image_processing_glpn_fast.py +3 -3
  376. transformers/models/glpn/modeling_glpn.py +4 -2
  377. transformers/models/got_ocr2/configuration_got_ocr2.py +6 -6
  378. transformers/models/got_ocr2/image_processing_got_ocr2_fast.py +3 -3
  379. transformers/models/got_ocr2/modeling_got_ocr2.py +31 -37
  380. transformers/models/got_ocr2/modular_got_ocr2.py +30 -19
  381. transformers/models/gpt2/configuration_gpt2.py +13 -1
  382. transformers/models/gpt2/modeling_gpt2.py +5 -5
  383. transformers/models/gpt_bigcode/configuration_gpt_bigcode.py +7 -1
  384. transformers/models/gpt_bigcode/modeling_gpt_bigcode.py +5 -4
  385. transformers/models/gpt_neo/configuration_gpt_neo.py +9 -1
  386. transformers/models/gpt_neo/modeling_gpt_neo.py +3 -7
  387. transformers/models/gpt_neox/configuration_gpt_neox.py +8 -3
  388. transformers/models/gpt_neox/modeling_gpt_neox.py +4 -4
  389. transformers/models/gpt_neox/modular_gpt_neox.py +4 -4
  390. transformers/models/gpt_neox_japanese/configuration_gpt_neox_japanese.py +9 -1
  391. transformers/models/gpt_neox_japanese/modeling_gpt_neox_japanese.py +2 -2
  392. transformers/models/gpt_oss/configuration_gpt_oss.py +10 -6
  393. transformers/models/gpt_oss/modeling_gpt_oss.py +46 -79
  394. transformers/models/gpt_oss/modular_gpt_oss.py +45 -78
  395. transformers/models/gptj/configuration_gptj.py +4 -4
  396. transformers/models/gptj/modeling_gptj.py +3 -7
  397. transformers/models/granite/configuration_granite.py +5 -7
  398. transformers/models/granite/modeling_granite.py +4 -4
  399. transformers/models/granite_speech/modeling_granite_speech.py +63 -37
  400. transformers/models/granitemoe/configuration_granitemoe.py +5 -7
  401. transformers/models/granitemoe/modeling_granitemoe.py +4 -4
  402. transformers/models/granitemoehybrid/configuration_granitemoehybrid.py +17 -7
  403. transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +22 -54
  404. transformers/models/granitemoehybrid/modular_granitemoehybrid.py +39 -45
  405. transformers/models/granitemoeshared/configuration_granitemoeshared.py +6 -7
  406. transformers/models/granitemoeshared/modeling_granitemoeshared.py +4 -4
  407. transformers/models/grounding_dino/configuration_grounding_dino.py +10 -45
  408. transformers/models/grounding_dino/image_processing_grounding_dino_fast.py +11 -11
  409. transformers/models/grounding_dino/modeling_grounding_dino.py +68 -86
  410. transformers/models/groupvit/configuration_groupvit.py +4 -1
  411. transformers/models/groupvit/modeling_groupvit.py +29 -22
  412. transformers/models/helium/configuration_helium.py +5 -7
  413. transformers/models/helium/modeling_helium.py +4 -4
  414. transformers/models/hgnet_v2/configuration_hgnet_v2.py +2 -4
  415. transformers/models/hgnet_v2/modeling_hgnet_v2.py +6 -5
  416. transformers/models/hgnet_v2/modular_hgnet_v2.py +7 -8
  417. transformers/models/hiera/configuration_hiera.py +2 -4
  418. transformers/models/hiera/modeling_hiera.py +11 -8
  419. transformers/models/hubert/configuration_hubert.py +4 -1
  420. transformers/models/hubert/modeling_hubert.py +7 -4
  421. transformers/models/hunyuan_v1_dense/configuration_hunyuan_v1_dense.py +5 -7
  422. transformers/models/hunyuan_v1_dense/modeling_hunyuan_v1_dense.py +28 -4
  423. transformers/models/hunyuan_v1_dense/modular_hunyuan_v1_dense.py +28 -6
  424. transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py +6 -8
  425. transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py +22 -9
  426. transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py +22 -8
  427. transformers/models/ibert/configuration_ibert.py +4 -1
  428. transformers/models/idefics/configuration_idefics.py +5 -7
  429. transformers/models/idefics/modeling_idefics.py +3 -4
  430. transformers/models/idefics/vision.py +5 -4
  431. transformers/models/idefics2/configuration_idefics2.py +1 -2
  432. transformers/models/idefics2/image_processing_idefics2_fast.py +1 -0
  433. transformers/models/idefics2/modeling_idefics2.py +72 -50
  434. transformers/models/idefics3/configuration_idefics3.py +1 -3
  435. transformers/models/idefics3/image_processing_idefics3_fast.py +29 -3
  436. transformers/models/idefics3/modeling_idefics3.py +63 -40
  437. transformers/models/ijepa/modeling_ijepa.py +3 -3
  438. transformers/models/imagegpt/configuration_imagegpt.py +9 -1
  439. transformers/models/imagegpt/image_processing_imagegpt_fast.py +2 -2
  440. transformers/models/imagegpt/modeling_imagegpt.py +8 -4
  441. transformers/models/informer/modeling_informer.py +3 -3
  442. transformers/models/instructblip/configuration_instructblip.py +2 -1
  443. transformers/models/instructblip/modeling_instructblip.py +65 -39
  444. transformers/models/instructblipvideo/configuration_instructblipvideo.py +2 -1
  445. transformers/models/instructblipvideo/modeling_instructblipvideo.py +60 -57
  446. transformers/models/instructblipvideo/modular_instructblipvideo.py +43 -32
  447. transformers/models/instructblipvideo/video_processing_instructblipvideo.py +2 -2
  448. transformers/models/internvl/configuration_internvl.py +5 -0
  449. transformers/models/internvl/modeling_internvl.py +35 -55
  450. transformers/models/internvl/modular_internvl.py +26 -38
  451. transformers/models/internvl/video_processing_internvl.py +2 -2
  452. transformers/models/jais2/configuration_jais2.py +5 -7
  453. transformers/models/jais2/modeling_jais2.py +4 -4
  454. transformers/models/jamba/configuration_jamba.py +5 -7
  455. transformers/models/jamba/modeling_jamba.py +4 -4
  456. transformers/models/jamba/modular_jamba.py +3 -3
  457. transformers/models/janus/image_processing_janus.py +2 -2
  458. transformers/models/janus/image_processing_janus_fast.py +8 -8
  459. transformers/models/janus/modeling_janus.py +63 -146
  460. transformers/models/janus/modular_janus.py +62 -20
  461. transformers/models/jetmoe/configuration_jetmoe.py +6 -4
  462. transformers/models/jetmoe/modeling_jetmoe.py +3 -3
  463. transformers/models/jetmoe/modular_jetmoe.py +3 -3
  464. transformers/models/kosmos2/configuration_kosmos2.py +10 -8
  465. transformers/models/kosmos2/modeling_kosmos2.py +56 -34
  466. transformers/models/kosmos2_5/configuration_kosmos2_5.py +8 -8
  467. transformers/models/kosmos2_5/modeling_kosmos2_5.py +54 -63
  468. transformers/models/kyutai_speech_to_text/configuration_kyutai_speech_to_text.py +8 -3
  469. transformers/models/kyutai_speech_to_text/modeling_kyutai_speech_to_text.py +44 -40
  470. transformers/models/kyutai_speech_to_text/modular_kyutai_speech_to_text.py +1 -1
  471. transformers/models/lasr/configuration_lasr.py +2 -4
  472. transformers/models/lasr/modeling_lasr.py +3 -3
  473. transformers/models/lasr/modular_lasr.py +3 -3
  474. transformers/models/layoutlm/configuration_layoutlm.py +14 -1
  475. transformers/models/layoutlm/modeling_layoutlm.py +3 -3
  476. transformers/models/layoutlmv2/configuration_layoutlmv2.py +14 -16
  477. transformers/models/layoutlmv2/image_processing_layoutlmv2_fast.py +2 -2
  478. transformers/models/layoutlmv3/configuration_layoutlmv3.py +16 -18
  479. transformers/models/layoutlmv3/image_processing_layoutlmv3_fast.py +2 -2
  480. transformers/models/layoutxlm/configuration_layoutxlm.py +14 -16
  481. transformers/models/led/configuration_led.py +7 -8
  482. transformers/models/levit/image_processing_levit_fast.py +4 -4
  483. transformers/models/lfm2/configuration_lfm2.py +5 -7
  484. transformers/models/lfm2/modeling_lfm2.py +4 -4
  485. transformers/models/lfm2/modular_lfm2.py +3 -3
  486. transformers/models/lfm2_moe/configuration_lfm2_moe.py +5 -7
  487. transformers/models/lfm2_moe/modeling_lfm2_moe.py +4 -4
  488. transformers/models/lfm2_vl/configuration_lfm2_vl.py +4 -0
  489. transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py +9 -15
  490. transformers/models/lfm2_vl/modeling_lfm2_vl.py +42 -28
  491. transformers/models/lfm2_vl/modular_lfm2_vl.py +42 -27
  492. transformers/models/lightglue/image_processing_lightglue_fast.py +4 -3
  493. transformers/models/lightglue/modeling_lightglue.py +3 -3
  494. transformers/models/lightglue/modular_lightglue.py +3 -3
  495. transformers/models/lighton_ocr/modeling_lighton_ocr.py +31 -28
  496. transformers/models/lighton_ocr/modular_lighton_ocr.py +19 -18
  497. transformers/models/lilt/configuration_lilt.py +6 -1
  498. transformers/models/llama/configuration_llama.py +5 -7
  499. transformers/models/llama/modeling_llama.py +4 -4
  500. transformers/models/llama4/configuration_llama4.py +67 -47
  501. transformers/models/llama4/image_processing_llama4_fast.py +3 -3
  502. transformers/models/llama4/modeling_llama4.py +46 -44
  503. transformers/models/llava/configuration_llava.py +10 -0
  504. transformers/models/llava/image_processing_llava_fast.py +3 -3
  505. transformers/models/llava/modeling_llava.py +38 -65
  506. transformers/models/llava_next/configuration_llava_next.py +2 -1
  507. transformers/models/llava_next/image_processing_llava_next_fast.py +6 -6
  508. transformers/models/llava_next/modeling_llava_next.py +61 -60
  509. transformers/models/llava_next_video/configuration_llava_next_video.py +10 -6
  510. transformers/models/llava_next_video/modeling_llava_next_video.py +115 -100
  511. transformers/models/llava_next_video/modular_llava_next_video.py +110 -101
  512. transformers/models/llava_onevision/configuration_llava_onevision.py +10 -6
  513. transformers/models/llava_onevision/image_processing_llava_onevision_fast.py +8 -7
  514. transformers/models/llava_onevision/modeling_llava_onevision.py +111 -105
  515. transformers/models/llava_onevision/modular_llava_onevision.py +106 -101
  516. transformers/models/longcat_flash/configuration_longcat_flash.py +7 -10
  517. transformers/models/longcat_flash/modeling_longcat_flash.py +7 -7
  518. transformers/models/longcat_flash/modular_longcat_flash.py +6 -5
  519. transformers/models/longformer/configuration_longformer.py +4 -1
  520. transformers/models/longt5/configuration_longt5.py +9 -6
  521. transformers/models/longt5/modeling_longt5.py +2 -1
  522. transformers/models/luke/configuration_luke.py +8 -1
  523. transformers/models/lw_detr/configuration_lw_detr.py +19 -31
  524. transformers/models/lw_detr/modeling_lw_detr.py +43 -44
  525. transformers/models/lw_detr/modular_lw_detr.py +36 -38
  526. transformers/models/lxmert/configuration_lxmert.py +16 -0
  527. transformers/models/m2m_100/configuration_m2m_100.py +7 -8
  528. transformers/models/m2m_100/modeling_m2m_100.py +3 -3
  529. transformers/models/mamba/configuration_mamba.py +5 -2
  530. transformers/models/mamba/modeling_mamba.py +18 -26
  531. transformers/models/mamba2/configuration_mamba2.py +5 -7
  532. transformers/models/mamba2/modeling_mamba2.py +22 -33
  533. transformers/models/marian/configuration_marian.py +10 -4
  534. transformers/models/marian/modeling_marian.py +4 -4
  535. transformers/models/markuplm/configuration_markuplm.py +4 -6
  536. transformers/models/markuplm/modeling_markuplm.py +3 -3
  537. transformers/models/mask2former/configuration_mask2former.py +12 -47
  538. transformers/models/mask2former/image_processing_mask2former_fast.py +8 -8
  539. transformers/models/mask2former/modeling_mask2former.py +18 -12
  540. transformers/models/maskformer/configuration_maskformer.py +14 -45
  541. transformers/models/maskformer/configuration_maskformer_swin.py +2 -4
  542. transformers/models/maskformer/image_processing_maskformer_fast.py +8 -8
  543. transformers/models/maskformer/modeling_maskformer.py +15 -9
  544. transformers/models/maskformer/modeling_maskformer_swin.py +2 -3
  545. transformers/models/mbart/configuration_mbart.py +9 -4
  546. transformers/models/mbart/modeling_mbart.py +9 -6
  547. transformers/models/megatron_bert/configuration_megatron_bert.py +13 -2
  548. transformers/models/megatron_bert/modeling_megatron_bert.py +0 -15
  549. transformers/models/metaclip_2/configuration_metaclip_2.py +4 -1
  550. transformers/models/metaclip_2/modeling_metaclip_2.py +49 -42
  551. transformers/models/metaclip_2/modular_metaclip_2.py +41 -25
  552. transformers/models/mgp_str/modeling_mgp_str.py +4 -2
  553. transformers/models/mimi/configuration_mimi.py +4 -0
  554. transformers/models/mimi/modeling_mimi.py +40 -36
  555. transformers/models/minimax/configuration_minimax.py +8 -11
  556. transformers/models/minimax/modeling_minimax.py +5 -5
  557. transformers/models/minimax/modular_minimax.py +9 -12
  558. transformers/models/minimax_m2/configuration_minimax_m2.py +8 -31
  559. transformers/models/minimax_m2/modeling_minimax_m2.py +4 -4
  560. transformers/models/minimax_m2/modular_minimax_m2.py +8 -31
  561. transformers/models/ministral/configuration_ministral.py +5 -7
  562. transformers/models/ministral/modeling_ministral.py +4 -4
  563. transformers/models/ministral/modular_ministral.py +5 -8
  564. transformers/models/ministral3/configuration_ministral3.py +4 -4
  565. transformers/models/ministral3/modeling_ministral3.py +4 -4
  566. transformers/models/ministral3/modular_ministral3.py +3 -3
  567. transformers/models/mistral/configuration_mistral.py +5 -7
  568. transformers/models/mistral/modeling_mistral.py +4 -4
  569. transformers/models/mistral/modular_mistral.py +3 -3
  570. transformers/models/mistral3/configuration_mistral3.py +4 -0
  571. transformers/models/mistral3/modeling_mistral3.py +36 -40
  572. transformers/models/mistral3/modular_mistral3.py +31 -32
  573. transformers/models/mixtral/configuration_mixtral.py +8 -11
  574. transformers/models/mixtral/modeling_mixtral.py +4 -4
  575. transformers/models/mlcd/modeling_mlcd.py +7 -5
  576. transformers/models/mlcd/modular_mlcd.py +7 -5
  577. transformers/models/mllama/configuration_mllama.py +5 -7
  578. transformers/models/mllama/image_processing_mllama_fast.py +6 -5
  579. transformers/models/mllama/modeling_mllama.py +19 -19
  580. transformers/models/mm_grounding_dino/configuration_mm_grounding_dino.py +10 -45
  581. transformers/models/mm_grounding_dino/modeling_mm_grounding_dino.py +66 -84
  582. transformers/models/mm_grounding_dino/modular_mm_grounding_dino.py +10 -45
  583. transformers/models/mobilebert/configuration_mobilebert.py +4 -1
  584. transformers/models/mobilebert/modeling_mobilebert.py +3 -3
  585. transformers/models/mobilenet_v2/image_processing_mobilenet_v2_fast.py +4 -4
  586. transformers/models/mobilenet_v2/modeling_mobilenet_v2.py +4 -2
  587. transformers/models/mobilevit/image_processing_mobilevit_fast.py +4 -4
  588. transformers/models/mobilevit/modeling_mobilevit.py +4 -2
  589. transformers/models/mobilevitv2/modeling_mobilevitv2.py +4 -2
  590. transformers/models/modernbert/configuration_modernbert.py +46 -21
  591. transformers/models/modernbert/modeling_modernbert.py +146 -899
  592. transformers/models/modernbert/modular_modernbert.py +185 -908
  593. transformers/models/modernbert_decoder/configuration_modernbert_decoder.py +21 -13
  594. transformers/models/modernbert_decoder/modeling_modernbert_decoder.py +9 -17
  595. transformers/models/modernbert_decoder/modular_modernbert_decoder.py +24 -23
  596. transformers/models/moonshine/configuration_moonshine.py +12 -7
  597. transformers/models/moonshine/modeling_moonshine.py +7 -7
  598. transformers/models/moonshine/modular_moonshine.py +19 -13
  599. transformers/models/moshi/configuration_moshi.py +28 -2
  600. transformers/models/moshi/modeling_moshi.py +4 -9
  601. transformers/models/mpnet/configuration_mpnet.py +6 -1
  602. transformers/models/mpt/configuration_mpt.py +16 -0
  603. transformers/models/mra/configuration_mra.py +8 -1
  604. transformers/models/mt5/configuration_mt5.py +9 -5
  605. transformers/models/mt5/modeling_mt5.py +5 -8
  606. transformers/models/musicgen/configuration_musicgen.py +12 -7
  607. transformers/models/musicgen/modeling_musicgen.py +6 -5
  608. transformers/models/musicgen_melody/configuration_musicgen_melody.py +15 -7
  609. transformers/models/musicgen_melody/modeling_musicgen_melody.py +7 -17
  610. transformers/models/mvp/configuration_mvp.py +8 -4
  611. transformers/models/mvp/modeling_mvp.py +6 -4
  612. transformers/models/nanochat/configuration_nanochat.py +5 -7
  613. transformers/models/nanochat/modeling_nanochat.py +4 -4
  614. transformers/models/nanochat/modular_nanochat.py +4 -4
  615. transformers/models/nemotron/configuration_nemotron.py +5 -7
  616. transformers/models/nemotron/modeling_nemotron.py +4 -14
  617. transformers/models/nllb/tokenization_nllb.py +7 -5
  618. transformers/models/nllb_moe/configuration_nllb_moe.py +7 -9
  619. transformers/models/nllb_moe/modeling_nllb_moe.py +3 -3
  620. transformers/models/nougat/image_processing_nougat_fast.py +8 -8
  621. transformers/models/nystromformer/configuration_nystromformer.py +8 -1
  622. transformers/models/olmo/configuration_olmo.py +5 -7
  623. transformers/models/olmo/modeling_olmo.py +4 -4
  624. transformers/models/olmo/modular_olmo.py +3 -3
  625. transformers/models/olmo2/configuration_olmo2.py +9 -11
  626. transformers/models/olmo2/modeling_olmo2.py +4 -4
  627. transformers/models/olmo2/modular_olmo2.py +7 -7
  628. transformers/models/olmo3/configuration_olmo3.py +10 -11
  629. transformers/models/olmo3/modeling_olmo3.py +4 -4
  630. transformers/models/olmo3/modular_olmo3.py +13 -14
  631. transformers/models/olmoe/configuration_olmoe.py +5 -7
  632. transformers/models/olmoe/modeling_olmoe.py +4 -4
  633. transformers/models/olmoe/modular_olmoe.py +3 -3
  634. transformers/models/omdet_turbo/configuration_omdet_turbo.py +14 -49
  635. transformers/models/omdet_turbo/modeling_omdet_turbo.py +22 -18
  636. transformers/models/oneformer/configuration_oneformer.py +9 -46
  637. transformers/models/oneformer/image_processing_oneformer_fast.py +8 -8
  638. transformers/models/oneformer/modeling_oneformer.py +14 -9
  639. transformers/models/openai/configuration_openai.py +16 -0
  640. transformers/models/opt/configuration_opt.py +6 -6
  641. transformers/models/opt/modeling_opt.py +5 -5
  642. transformers/models/ovis2/configuration_ovis2.py +4 -0
  643. transformers/models/ovis2/image_processing_ovis2_fast.py +3 -3
  644. transformers/models/ovis2/modeling_ovis2.py +58 -99
  645. transformers/models/ovis2/modular_ovis2.py +52 -13
  646. transformers/models/owlv2/configuration_owlv2.py +4 -1
  647. transformers/models/owlv2/image_processing_owlv2_fast.py +5 -5
  648. transformers/models/owlv2/modeling_owlv2.py +40 -27
  649. transformers/models/owlv2/modular_owlv2.py +5 -5
  650. transformers/models/owlvit/configuration_owlvit.py +4 -1
  651. transformers/models/owlvit/modeling_owlvit.py +40 -27
  652. transformers/models/paddleocr_vl/configuration_paddleocr_vl.py +9 -10
  653. transformers/models/paddleocr_vl/modeling_paddleocr_vl.py +88 -87
  654. transformers/models/paddleocr_vl/modular_paddleocr_vl.py +82 -53
  655. transformers/models/paligemma/configuration_paligemma.py +4 -0
  656. transformers/models/paligemma/modeling_paligemma.py +30 -26
  657. transformers/models/parakeet/configuration_parakeet.py +2 -4
  658. transformers/models/parakeet/modeling_parakeet.py +3 -3
  659. transformers/models/parakeet/modular_parakeet.py +3 -3
  660. transformers/models/patchtsmixer/modeling_patchtsmixer.py +3 -3
  661. transformers/models/patchtst/modeling_patchtst.py +3 -3
  662. transformers/models/pe_audio/modeling_pe_audio.py +4 -4
  663. transformers/models/pe_audio/modular_pe_audio.py +1 -1
  664. transformers/models/pe_audio_video/modeling_pe_audio_video.py +4 -4
  665. transformers/models/pe_audio_video/modular_pe_audio_video.py +4 -4
  666. transformers/models/pe_video/modeling_pe_video.py +36 -24
  667. transformers/models/pe_video/modular_pe_video.py +36 -23
  668. transformers/models/pegasus/configuration_pegasus.py +8 -5
  669. transformers/models/pegasus/modeling_pegasus.py +4 -4
  670. transformers/models/pegasus_x/configuration_pegasus_x.py +5 -3
  671. transformers/models/pegasus_x/modeling_pegasus_x.py +3 -3
  672. transformers/models/perceiver/image_processing_perceiver_fast.py +2 -2
  673. transformers/models/perceiver/modeling_perceiver.py +17 -9
  674. transformers/models/perception_lm/modeling_perception_lm.py +26 -27
  675. transformers/models/perception_lm/modular_perception_lm.py +27 -25
  676. transformers/models/persimmon/configuration_persimmon.py +5 -7
  677. transformers/models/persimmon/modeling_persimmon.py +5 -5
  678. transformers/models/phi/configuration_phi.py +8 -6
  679. transformers/models/phi/modeling_phi.py +4 -4
  680. transformers/models/phi/modular_phi.py +3 -3
  681. transformers/models/phi3/configuration_phi3.py +9 -11
  682. transformers/models/phi3/modeling_phi3.py +4 -4
  683. transformers/models/phi3/modular_phi3.py +3 -3
  684. transformers/models/phi4_multimodal/configuration_phi4_multimodal.py +11 -13
  685. transformers/models/phi4_multimodal/image_processing_phi4_multimodal_fast.py +4 -4
  686. transformers/models/phi4_multimodal/modeling_phi4_multimodal.py +46 -61
  687. transformers/models/phi4_multimodal/modular_phi4_multimodal.py +44 -30
  688. transformers/models/phimoe/configuration_phimoe.py +5 -7
  689. transformers/models/phimoe/modeling_phimoe.py +15 -39
  690. transformers/models/phimoe/modular_phimoe.py +12 -7
  691. transformers/models/pix2struct/configuration_pix2struct.py +12 -9
  692. transformers/models/pix2struct/image_processing_pix2struct_fast.py +5 -5
  693. transformers/models/pix2struct/modeling_pix2struct.py +14 -7
  694. transformers/models/pixio/configuration_pixio.py +2 -4
  695. transformers/models/pixio/modeling_pixio.py +9 -8
  696. transformers/models/pixio/modular_pixio.py +4 -2
  697. transformers/models/pixtral/image_processing_pixtral_fast.py +5 -5
  698. transformers/models/pixtral/modeling_pixtral.py +9 -12
  699. transformers/models/plbart/configuration_plbart.py +8 -5
  700. transformers/models/plbart/modeling_plbart.py +9 -7
  701. transformers/models/plbart/modular_plbart.py +1 -1
  702. transformers/models/poolformer/image_processing_poolformer_fast.py +7 -7
  703. transformers/models/pop2piano/configuration_pop2piano.py +7 -6
  704. transformers/models/pop2piano/modeling_pop2piano.py +2 -1
  705. transformers/models/pp_doclayout_v3/__init__.py +30 -0
  706. transformers/models/pp_doclayout_v3/configuration_pp_doclayout_v3.py +277 -0
  707. transformers/models/pp_doclayout_v3/image_processing_pp_doclayout_v3_fast.py +305 -0
  708. transformers/models/pp_doclayout_v3/modeling_pp_doclayout_v3.py +2083 -0
  709. transformers/models/pp_doclayout_v3/modular_pp_doclayout_v3.py +1549 -0
  710. transformers/models/prompt_depth_anything/configuration_prompt_depth_anything.py +12 -46
  711. transformers/models/prompt_depth_anything/image_processing_prompt_depth_anything_fast.py +6 -6
  712. transformers/models/prompt_depth_anything/modeling_prompt_depth_anything.py +8 -6
  713. transformers/models/prompt_depth_anything/modular_prompt_depth_anything.py +12 -10
  714. transformers/models/prophetnet/configuration_prophetnet.py +11 -10
  715. transformers/models/prophetnet/modeling_prophetnet.py +12 -23
  716. transformers/models/pvt/image_processing_pvt.py +7 -7
  717. transformers/models/pvt/image_processing_pvt_fast.py +1 -1
  718. transformers/models/pvt_v2/configuration_pvt_v2.py +2 -4
  719. transformers/models/pvt_v2/modeling_pvt_v2.py +6 -5
  720. transformers/models/qwen2/configuration_qwen2.py +14 -4
  721. transformers/models/qwen2/modeling_qwen2.py +4 -4
  722. transformers/models/qwen2/modular_qwen2.py +3 -3
  723. transformers/models/qwen2/tokenization_qwen2.py +0 -4
  724. transformers/models/qwen2_5_omni/configuration_qwen2_5_omni.py +17 -5
  725. transformers/models/qwen2_5_omni/modeling_qwen2_5_omni.py +108 -88
  726. transformers/models/qwen2_5_omni/modular_qwen2_5_omni.py +115 -87
  727. transformers/models/qwen2_5_vl/configuration_qwen2_5_vl.py +7 -10
  728. transformers/models/qwen2_5_vl/modeling_qwen2_5_vl.py +98 -53
  729. transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +18 -6
  730. transformers/models/qwen2_audio/modeling_qwen2_audio.py +12 -12
  731. transformers/models/qwen2_moe/configuration_qwen2_moe.py +14 -4
  732. transformers/models/qwen2_moe/modeling_qwen2_moe.py +4 -4
  733. transformers/models/qwen2_moe/modular_qwen2_moe.py +3 -3
  734. transformers/models/qwen2_vl/configuration_qwen2_vl.py +7 -10
  735. transformers/models/qwen2_vl/image_processing_qwen2_vl_fast.py +4 -6
  736. transformers/models/qwen2_vl/modeling_qwen2_vl.py +97 -53
  737. transformers/models/qwen2_vl/video_processing_qwen2_vl.py +4 -6
  738. transformers/models/qwen3/configuration_qwen3.py +15 -5
  739. transformers/models/qwen3/modeling_qwen3.py +4 -4
  740. transformers/models/qwen3/modular_qwen3.py +3 -3
  741. transformers/models/qwen3_moe/configuration_qwen3_moe.py +20 -7
  742. transformers/models/qwen3_moe/modeling_qwen3_moe.py +4 -4
  743. transformers/models/qwen3_next/configuration_qwen3_next.py +16 -4
  744. transformers/models/qwen3_next/modeling_qwen3_next.py +5 -5
  745. transformers/models/qwen3_next/modular_qwen3_next.py +4 -4
  746. transformers/models/qwen3_omni_moe/configuration_qwen3_omni_moe.py +55 -19
  747. transformers/models/qwen3_omni_moe/modeling_qwen3_omni_moe.py +161 -98
  748. transformers/models/qwen3_omni_moe/modular_qwen3_omni_moe.py +107 -34
  749. transformers/models/qwen3_vl/configuration_qwen3_vl.py +7 -6
  750. transformers/models/qwen3_vl/modeling_qwen3_vl.py +115 -49
  751. transformers/models/qwen3_vl/modular_qwen3_vl.py +88 -37
  752. transformers/models/qwen3_vl_moe/configuration_qwen3_vl_moe.py +7 -6
  753. transformers/models/qwen3_vl_moe/modeling_qwen3_vl_moe.py +173 -99
  754. transformers/models/qwen3_vl_moe/modular_qwen3_vl_moe.py +23 -7
  755. transformers/models/rag/configuration_rag.py +6 -6
  756. transformers/models/rag/modeling_rag.py +3 -3
  757. transformers/models/rag/retrieval_rag.py +1 -1
  758. transformers/models/recurrent_gemma/configuration_recurrent_gemma.py +8 -6
  759. transformers/models/recurrent_gemma/modeling_recurrent_gemma.py +4 -5
  760. transformers/models/reformer/configuration_reformer.py +7 -7
  761. transformers/models/rembert/configuration_rembert.py +8 -1
  762. transformers/models/rembert/modeling_rembert.py +0 -22
  763. transformers/models/resnet/configuration_resnet.py +2 -4
  764. transformers/models/resnet/modeling_resnet.py +6 -5
  765. transformers/models/roberta/configuration_roberta.py +11 -2
  766. transformers/models/roberta/modeling_roberta.py +6 -6
  767. transformers/models/roberta_prelayernorm/configuration_roberta_prelayernorm.py +11 -2
  768. transformers/models/roberta_prelayernorm/modeling_roberta_prelayernorm.py +6 -6
  769. transformers/models/roc_bert/configuration_roc_bert.py +8 -1
  770. transformers/models/roc_bert/modeling_roc_bert.py +6 -41
  771. transformers/models/roformer/configuration_roformer.py +13 -2
  772. transformers/models/roformer/modeling_roformer.py +0 -14
  773. transformers/models/rt_detr/configuration_rt_detr.py +8 -49
  774. transformers/models/rt_detr/configuration_rt_detr_resnet.py +2 -4
  775. transformers/models/rt_detr/image_processing_rt_detr_fast.py +24 -11
  776. transformers/models/rt_detr/modeling_rt_detr.py +578 -737
  777. transformers/models/rt_detr/modeling_rt_detr_resnet.py +2 -3
  778. transformers/models/rt_detr/modular_rt_detr.py +1508 -6
  779. transformers/models/rt_detr_v2/configuration_rt_detr_v2.py +12 -57
  780. transformers/models/rt_detr_v2/modeling_rt_detr_v2.py +318 -453
  781. transformers/models/rt_detr_v2/modular_rt_detr_v2.py +25 -66
  782. transformers/models/rwkv/configuration_rwkv.py +2 -3
  783. transformers/models/rwkv/modeling_rwkv.py +0 -23
  784. transformers/models/sam/configuration_sam.py +2 -0
  785. transformers/models/sam/image_processing_sam_fast.py +4 -4
  786. transformers/models/sam/modeling_sam.py +13 -8
  787. transformers/models/sam/processing_sam.py +3 -3
  788. transformers/models/sam2/configuration_sam2.py +1 -1
  789. transformers/models/sam2/modeling_sam2.py +56 -52
  790. transformers/models/sam2/modular_sam2.py +47 -55
  791. transformers/models/sam2_video/modeling_sam2_video.py +50 -51
  792. transformers/models/sam2_video/modular_sam2_video.py +12 -10
  793. transformers/models/sam3/modeling_sam3.py +43 -47
  794. transformers/models/sam3/processing_sam3.py +8 -4
  795. transformers/models/sam3_tracker/configuration_sam3_tracker.py +1 -2
  796. transformers/models/sam3_tracker/modeling_sam3_tracker.py +50 -49
  797. transformers/models/sam3_tracker/modular_sam3_tracker.py +0 -1
  798. transformers/models/sam3_tracker/processing_sam3_tracker.py +0 -1
  799. transformers/models/sam3_tracker_video/modeling_sam3_tracker_video.py +50 -49
  800. transformers/models/sam3_tracker_video/modular_sam3_tracker_video.py +10 -22
  801. transformers/models/sam3_video/modeling_sam3_video.py +27 -14
  802. transformers/models/sam_hq/configuration_sam_hq.py +2 -0
  803. transformers/models/sam_hq/modeling_sam_hq.py +13 -9
  804. transformers/models/sam_hq/modular_sam_hq.py +6 -6
  805. transformers/models/sam_hq/processing_sam_hq.py +7 -6
  806. transformers/models/seamless_m4t/configuration_seamless_m4t.py +8 -9
  807. transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py +8 -9
  808. transformers/models/seed_oss/configuration_seed_oss.py +7 -9
  809. transformers/models/seed_oss/modeling_seed_oss.py +4 -4
  810. transformers/models/seed_oss/modular_seed_oss.py +3 -3
  811. transformers/models/segformer/image_processing_segformer_fast.py +4 -4
  812. transformers/models/segformer/modeling_segformer.py +4 -2
  813. transformers/models/segformer/modular_segformer.py +3 -3
  814. transformers/models/seggpt/modeling_seggpt.py +20 -8
  815. transformers/models/sew/configuration_sew.py +4 -1
  816. transformers/models/sew/modeling_sew.py +9 -5
  817. transformers/models/sew/modular_sew.py +2 -1
  818. transformers/models/sew_d/configuration_sew_d.py +4 -1
  819. transformers/models/sew_d/modeling_sew_d.py +4 -1
  820. transformers/models/shieldgemma2/modeling_shieldgemma2.py +4 -4
  821. transformers/models/siglip/configuration_siglip.py +4 -1
  822. transformers/models/siglip/modeling_siglip.py +27 -71
  823. transformers/models/siglip2/__init__.py +1 -0
  824. transformers/models/siglip2/configuration_siglip2.py +4 -2
  825. transformers/models/siglip2/image_processing_siglip2_fast.py +2 -2
  826. transformers/models/siglip2/modeling_siglip2.py +37 -78
  827. transformers/models/siglip2/modular_siglip2.py +74 -25
  828. transformers/models/siglip2/tokenization_siglip2.py +95 -0
  829. transformers/models/smollm3/configuration_smollm3.py +6 -6
  830. transformers/models/smollm3/modeling_smollm3.py +4 -4
  831. transformers/models/smollm3/modular_smollm3.py +9 -9
  832. transformers/models/smolvlm/configuration_smolvlm.py +1 -3
  833. transformers/models/smolvlm/image_processing_smolvlm_fast.py +29 -3
  834. transformers/models/smolvlm/modeling_smolvlm.py +75 -46
  835. transformers/models/smolvlm/modular_smolvlm.py +36 -23
  836. transformers/models/smolvlm/video_processing_smolvlm.py +9 -9
  837. transformers/models/solar_open/__init__.py +27 -0
  838. transformers/models/solar_open/configuration_solar_open.py +184 -0
  839. transformers/models/solar_open/modeling_solar_open.py +642 -0
  840. transformers/models/solar_open/modular_solar_open.py +224 -0
  841. transformers/models/speech_encoder_decoder/modeling_speech_encoder_decoder.py +6 -4
  842. transformers/models/speech_to_text/configuration_speech_to_text.py +9 -8
  843. transformers/models/speech_to_text/modeling_speech_to_text.py +3 -3
  844. transformers/models/speecht5/configuration_speecht5.py +7 -8
  845. transformers/models/splinter/configuration_splinter.py +6 -6
  846. transformers/models/splinter/modeling_splinter.py +8 -3
  847. transformers/models/squeezebert/configuration_squeezebert.py +14 -1
  848. transformers/models/stablelm/configuration_stablelm.py +8 -6
  849. transformers/models/stablelm/modeling_stablelm.py +5 -5
  850. transformers/models/starcoder2/configuration_starcoder2.py +11 -5
  851. transformers/models/starcoder2/modeling_starcoder2.py +5 -5
  852. transformers/models/starcoder2/modular_starcoder2.py +4 -4
  853. transformers/models/superglue/configuration_superglue.py +4 -0
  854. transformers/models/superglue/image_processing_superglue_fast.py +4 -3
  855. transformers/models/superglue/modeling_superglue.py +9 -4
  856. transformers/models/superpoint/image_processing_superpoint_fast.py +3 -4
  857. transformers/models/superpoint/modeling_superpoint.py +4 -2
  858. transformers/models/swin/configuration_swin.py +2 -4
  859. transformers/models/swin/modeling_swin.py +11 -8
  860. transformers/models/swin2sr/image_processing_swin2sr_fast.py +2 -2
  861. transformers/models/swin2sr/modeling_swin2sr.py +4 -2
  862. transformers/models/swinv2/configuration_swinv2.py +2 -4
  863. transformers/models/swinv2/modeling_swinv2.py +10 -7
  864. transformers/models/switch_transformers/configuration_switch_transformers.py +11 -6
  865. transformers/models/switch_transformers/modeling_switch_transformers.py +3 -3
  866. transformers/models/switch_transformers/modular_switch_transformers.py +3 -3
  867. transformers/models/t5/configuration_t5.py +9 -8
  868. transformers/models/t5/modeling_t5.py +5 -8
  869. transformers/models/t5gemma/configuration_t5gemma.py +10 -25
  870. transformers/models/t5gemma/modeling_t5gemma.py +9 -9
  871. transformers/models/t5gemma/modular_t5gemma.py +11 -24
  872. transformers/models/t5gemma2/configuration_t5gemma2.py +35 -48
  873. transformers/models/t5gemma2/modeling_t5gemma2.py +143 -100
  874. transformers/models/t5gemma2/modular_t5gemma2.py +152 -136
  875. transformers/models/table_transformer/configuration_table_transformer.py +18 -49
  876. transformers/models/table_transformer/modeling_table_transformer.py +27 -53
  877. transformers/models/tapas/configuration_tapas.py +12 -1
  878. transformers/models/tapas/modeling_tapas.py +1 -1
  879. transformers/models/tapas/tokenization_tapas.py +1 -0
  880. transformers/models/textnet/configuration_textnet.py +4 -6
  881. transformers/models/textnet/image_processing_textnet_fast.py +3 -3
  882. transformers/models/textnet/modeling_textnet.py +15 -14
  883. transformers/models/time_series_transformer/modeling_time_series_transformer.py +3 -3
  884. transformers/models/timesfm/modeling_timesfm.py +5 -6
  885. transformers/models/timesfm/modular_timesfm.py +5 -6
  886. transformers/models/timm_backbone/configuration_timm_backbone.py +33 -7
  887. transformers/models/timm_backbone/modeling_timm_backbone.py +21 -24
  888. transformers/models/timm_wrapper/modeling_timm_wrapper.py +9 -4
  889. transformers/models/trocr/configuration_trocr.py +11 -7
  890. transformers/models/trocr/modeling_trocr.py +4 -2
  891. transformers/models/tvp/configuration_tvp.py +10 -35
  892. transformers/models/tvp/image_processing_tvp_fast.py +6 -5
  893. transformers/models/tvp/modeling_tvp.py +1 -1
  894. transformers/models/udop/configuration_udop.py +16 -7
  895. transformers/models/udop/modeling_udop.py +10 -6
  896. transformers/models/umt5/configuration_umt5.py +8 -6
  897. transformers/models/umt5/modeling_umt5.py +7 -3
  898. transformers/models/unispeech/configuration_unispeech.py +4 -1
  899. transformers/models/unispeech/modeling_unispeech.py +7 -4
  900. transformers/models/unispeech_sat/configuration_unispeech_sat.py +4 -1
  901. transformers/models/unispeech_sat/modeling_unispeech_sat.py +7 -4
  902. transformers/models/upernet/configuration_upernet.py +8 -35
  903. transformers/models/upernet/modeling_upernet.py +1 -1
  904. transformers/models/vaultgemma/configuration_vaultgemma.py +5 -7
  905. transformers/models/vaultgemma/modeling_vaultgemma.py +4 -4
  906. transformers/models/video_llama_3/configuration_video_llama_3.py +4 -0
  907. transformers/models/video_llama_3/image_processing_video_llama_3_fast.py +4 -6
  908. transformers/models/video_llama_3/modeling_video_llama_3.py +85 -48
  909. transformers/models/video_llama_3/modular_video_llama_3.py +56 -43
  910. transformers/models/video_llama_3/video_processing_video_llama_3.py +29 -8
  911. transformers/models/video_llava/configuration_video_llava.py +4 -0
  912. transformers/models/video_llava/modeling_video_llava.py +87 -89
  913. transformers/models/videomae/modeling_videomae.py +4 -5
  914. transformers/models/vilt/configuration_vilt.py +4 -1
  915. transformers/models/vilt/image_processing_vilt_fast.py +6 -6
  916. transformers/models/vilt/modeling_vilt.py +27 -12
  917. transformers/models/vipllava/configuration_vipllava.py +4 -0
  918. transformers/models/vipllava/modeling_vipllava.py +57 -31
  919. transformers/models/vipllava/modular_vipllava.py +50 -24
  920. transformers/models/vision_encoder_decoder/modeling_vision_encoder_decoder.py +10 -6
  921. transformers/models/vision_text_dual_encoder/modeling_vision_text_dual_encoder.py +27 -20
  922. transformers/models/visual_bert/configuration_visual_bert.py +6 -1
  923. transformers/models/vit/configuration_vit.py +2 -2
  924. transformers/models/vit/modeling_vit.py +7 -5
  925. transformers/models/vit_mae/modeling_vit_mae.py +11 -7
  926. transformers/models/vit_msn/modeling_vit_msn.py +11 -7
  927. transformers/models/vitdet/configuration_vitdet.py +2 -4
  928. transformers/models/vitdet/modeling_vitdet.py +2 -3
  929. transformers/models/vitmatte/configuration_vitmatte.py +6 -35
  930. transformers/models/vitmatte/image_processing_vitmatte_fast.py +2 -2
  931. transformers/models/vitmatte/modeling_vitmatte.py +1 -1
  932. transformers/models/vitpose/configuration_vitpose.py +6 -43
  933. transformers/models/vitpose/modeling_vitpose.py +5 -3
  934. transformers/models/vitpose_backbone/configuration_vitpose_backbone.py +2 -4
  935. transformers/models/vitpose_backbone/modeling_vitpose_backbone.py +5 -6
  936. transformers/models/vits/configuration_vits.py +4 -0
  937. transformers/models/vits/modeling_vits.py +9 -7
  938. transformers/models/vivit/modeling_vivit.py +4 -4
  939. transformers/models/vjepa2/modeling_vjepa2.py +9 -9
  940. transformers/models/voxtral/configuration_voxtral.py +0 -1
  941. transformers/models/voxtral/modeling_voxtral.py +25 -24
  942. transformers/models/voxtral/modular_voxtral.py +26 -20
  943. transformers/models/wav2vec2/configuration_wav2vec2.py +4 -1
  944. transformers/models/wav2vec2/modeling_wav2vec2.py +7 -4
  945. transformers/models/wav2vec2_bert/configuration_wav2vec2_bert.py +4 -1
  946. transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py +4 -1
  947. transformers/models/wavlm/configuration_wavlm.py +4 -1
  948. transformers/models/wavlm/modeling_wavlm.py +4 -1
  949. transformers/models/whisper/configuration_whisper.py +6 -4
  950. transformers/models/whisper/generation_whisper.py +0 -1
  951. transformers/models/whisper/modeling_whisper.py +3 -3
  952. transformers/models/x_clip/configuration_x_clip.py +4 -1
  953. transformers/models/x_clip/modeling_x_clip.py +26 -27
  954. transformers/models/xglm/configuration_xglm.py +9 -7
  955. transformers/models/xlm/configuration_xlm.py +10 -7
  956. transformers/models/xlm/modeling_xlm.py +1 -1
  957. transformers/models/xlm_roberta/configuration_xlm_roberta.py +11 -2
  958. transformers/models/xlm_roberta/modeling_xlm_roberta.py +6 -6
  959. transformers/models/xlm_roberta_xl/configuration_xlm_roberta_xl.py +10 -1
  960. transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py +6 -6
  961. transformers/models/xlnet/configuration_xlnet.py +3 -1
  962. transformers/models/xlstm/configuration_xlstm.py +5 -7
  963. transformers/models/xlstm/modeling_xlstm.py +0 -32
  964. transformers/models/xmod/configuration_xmod.py +11 -2
  965. transformers/models/xmod/modeling_xmod.py +13 -16
  966. transformers/models/yolos/image_processing_yolos_fast.py +25 -28
  967. transformers/models/yolos/modeling_yolos.py +7 -7
  968. transformers/models/yolos/modular_yolos.py +16 -16
  969. transformers/models/yoso/configuration_yoso.py +8 -1
  970. transformers/models/youtu/__init__.py +27 -0
  971. transformers/models/youtu/configuration_youtu.py +194 -0
  972. transformers/models/youtu/modeling_youtu.py +619 -0
  973. transformers/models/youtu/modular_youtu.py +254 -0
  974. transformers/models/zamba/configuration_zamba.py +5 -7
  975. transformers/models/zamba/modeling_zamba.py +25 -56
  976. transformers/models/zamba2/configuration_zamba2.py +8 -13
  977. transformers/models/zamba2/modeling_zamba2.py +53 -78
  978. transformers/models/zamba2/modular_zamba2.py +36 -29
  979. transformers/models/zoedepth/configuration_zoedepth.py +17 -40
  980. transformers/models/zoedepth/image_processing_zoedepth_fast.py +9 -9
  981. transformers/models/zoedepth/modeling_zoedepth.py +5 -3
  982. transformers/pipelines/__init__.py +1 -61
  983. transformers/pipelines/any_to_any.py +1 -1
  984. transformers/pipelines/automatic_speech_recognition.py +0 -2
  985. transformers/pipelines/base.py +1 -1
  986. transformers/pipelines/image_text_to_text.py +1 -1
  987. transformers/pipelines/text_to_audio.py +5 -1
  988. transformers/processing_utils.py +35 -44
  989. transformers/pytorch_utils.py +2 -26
  990. transformers/quantizers/quantizer_compressed_tensors.py +7 -5
  991. transformers/quantizers/quantizer_fbgemm_fp8.py +20 -23
  992. transformers/quantizers/quantizer_finegrained_fp8.py +14 -20
  993. transformers/quantizers/quantizer_mxfp4.py +1 -1
  994. transformers/quantizers/quantizer_torchao.py +0 -16
  995. transformers/safetensors_conversion.py +11 -4
  996. transformers/testing_utils.py +3 -28
  997. transformers/tokenization_mistral_common.py +9 -0
  998. transformers/tokenization_python.py +6 -4
  999. transformers/tokenization_utils_base.py +119 -219
  1000. transformers/tokenization_utils_tokenizers.py +31 -2
  1001. transformers/trainer.py +25 -33
  1002. transformers/trainer_seq2seq.py +1 -1
  1003. transformers/training_args.py +411 -417
  1004. transformers/utils/__init__.py +1 -4
  1005. transformers/utils/auto_docstring.py +15 -18
  1006. transformers/utils/backbone_utils.py +13 -373
  1007. transformers/utils/doc.py +4 -36
  1008. transformers/utils/generic.py +69 -33
  1009. transformers/utils/import_utils.py +72 -75
  1010. transformers/utils/loading_report.py +133 -105
  1011. transformers/utils/quantization_config.py +0 -21
  1012. transformers/video_processing_utils.py +5 -5
  1013. transformers/video_utils.py +3 -1
  1014. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/METADATA +118 -237
  1015. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/RECORD +1019 -994
  1016. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/WHEEL +1 -1
  1017. transformers/pipelines/deprecated/text2text_generation.py +0 -408
  1018. transformers/pipelines/image_to_text.py +0 -189
  1019. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/entry_points.txt +0 -0
  1020. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/licenses/LICENSE +0 -0
  1021. {transformers-5.0.0rc3.dist-info → transformers-5.1.0.dist-info}/top_level.txt +0 -0

transformers/pipelines/__init__.py

@@ -59,7 +59,6 @@ from .base import (
     get_default_model_and_revision,
     load_model,
 )
-from .deprecated import SummarizationPipeline, Text2TextGenerationPipeline, TranslationPipeline
 from .depth_estimation import DepthEstimationPipeline
 from .document_question_answering import DocumentQuestionAnsweringPipeline
 from .feature_extraction import FeatureExtractionPipeline
@@ -69,7 +68,6 @@ from .image_feature_extraction import ImageFeatureExtractionPipeline
 from .image_segmentation import ImageSegmentationPipeline
 from .image_text_to_text import ImageTextToTextPipeline
 from .image_to_image import ImageToImagePipeline
-from .image_to_text import ImageToTextPipeline
 from .keypoint_matching import KeypointMatchingPipeline
 from .mask_generation import MaskGenerationPipeline
 from .object_detection import ObjectDetectionPipeline
@@ -207,29 +205,6 @@ SUPPORTED_TASKS = {
         "default": {"model": ("distilbert/distilroberta-base", "fb53ab8")},
         "type": "text",
     },
-    "summarization": {
-        "impl": SummarizationPipeline,
-        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
-        "default": {"model": ("sshleifer/distilbart-cnn-12-6", "a4f8f3e")},
-        "type": "text",
-    },
-    # This task is a special case as it's parametrized by SRC, TGT languages.
-    "translation": {
-        "impl": TranslationPipeline,
-        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
-        "default": {
-            ("en", "fr"): {"model": ("google-t5/t5-base", "a9723ea")},
-            ("en", "de"): {"model": ("google-t5/t5-base", "a9723ea")},
-            ("en", "ro"): {"model": ("google-t5/t5-base", "a9723ea")},
-        },
-        "type": "text",
-    },
-    "text2text-generation": {
-        "impl": Text2TextGenerationPipeline,
-        "pt": (AutoModelForSeq2SeqLM,) if is_torch_available() else (),
-        "default": {"model": ("google-t5/t5-base", "a9723ea")},
-        "type": "text",
-    },
     "text-generation": {
         "impl": TextGenerationPipeline,
         "pt": (AutoModelForCausalLM,) if is_torch_available() else (),
@@ -275,12 +250,6 @@ SUPPORTED_TASKS = {
         "default": {"model": ("facebook/detr-resnet-50-panoptic", "d53b52a")},
         "type": "multimodal",
     },
-    "image-to-text": {
-        "impl": ImageToTextPipeline,
-        "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
-        "default": {"model": ("ydshieh/vit-gpt2-coco-en", "e460201")},
-        "type": "multimodal",
-    },
     "image-text-to-text": {
         "impl": ImageTextToTextPipeline,
         "pt": (AutoModelForImageTextToText,) if is_torch_available() else (),
@@ -388,20 +357,15 @@ def check_task(task: str) -> tuple[str, dict, Any]:
             - `"image-classification"`
             - `"image-feature-extraction"`
             - `"image-segmentation"`
-            - `"image-to-text"`
             - `"image-to-image"`
             - `"keypoint-matching"`
             - `"object-detection"`
             - `"question-answering"`
-            - `"summarization"`
             - `"table-question-answering"`
-            - `"text2text-generation"`
             - `"text-classification"` (alias `"sentiment-analysis"` available)
             - `"text-generation"`
             - `"text-to-audio"` (alias `"text-to-speech"` available)
             - `"token-classification"` (alias `"ner"` available)
-            - `"translation"`
-            - `"translation_xx_to_yy"`
             - `"video-classification"`
             - `"visual-question-answering"` (alias `"vqa"` available)
             - `"zero-shot-classification"`
@@ -410,8 +374,7 @@ def check_task(task: str) -> tuple[str, dict, Any]:

     Returns:
         (normalized_task: `str`, task_defaults: `dict`, task_options: (`tuple`, None)) The normalized task name
-        (removed alias and options). The actual dictionary required to initialize the pipeline and some extra task
-        options for parametrized tasks like "translation_xx_to_yy"
+        (removed alias and options).


     """
@@ -469,8 +432,6 @@ def pipeline(task: Literal["image-text-to-text"], model: str | PreTrainedModel |
 @overload
 def pipeline(task: Literal["image-to-image"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> ImageToImagePipeline: ...
 @overload
-def pipeline(task: Literal["image-to-text"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> ImageToTextPipeline: ...
-@overload
 def pipeline(task: Literal["keypoint-matching"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> KeypointMatchingPipeline: ...
 @overload
 def pipeline(task: Literal["mask-generation"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> MaskGenerationPipeline: ...
@@ -479,8 +440,6 @@ def pipeline(task: Literal["object-detection"], model: str | PreTrainedModel | N
 @overload
 def pipeline(task: Literal["question-answering"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> QuestionAnsweringPipeline: ...
 @overload
-def pipeline(task: Literal["summarization"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> SummarizationPipeline: ...
-@overload
 def pipeline(task: Literal["table-question-answering"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> TableQuestionAnsweringPipeline: ...
 @overload
 def pipeline(task: Literal["text-classification"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> TextClassificationPipeline: ...
@@ -489,12 +448,8 @@ def pipeline(task: Literal["text-generation"], model: str | PreTrainedModel | No
 @overload
 def pipeline(task: Literal["text-to-audio"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> TextToAudioPipeline: ...
 @overload
-def pipeline(task: Literal["text2text-generation"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> Text2TextGenerationPipeline: ...
-@overload
 def pipeline(task: Literal["token-classification"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> TokenClassificationPipeline: ...
 @overload
-def pipeline(task: Literal["translation"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> TranslationPipeline: ...
-@overload
 def pipeline(task: Literal["video-classification"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> VideoClassificationPipeline: ...
 @overload
 def pipeline(task: Literal["visual-question-answering"], model: str | PreTrainedModel | None = None, config: str | PreTrainedConfig | None = None, tokenizer: str | PreTrainedTokenizer | PreTrainedTokenizerFast | None = None, feature_extractor: str | PreTrainedFeatureExtractor | None = None, image_processor: str | BaseImageProcessor | None = None, processor: str | ProcessorMixin | None = None, revision: str | None = None, use_fast: bool = True, token: str | bool | None = None, device: int | str | torch.device | None = None, device_map: str | dict[str, int | str] | None = None, dtype: str | torch.dtype | None = "auto", trust_remote_code: bool | None = None, model_kwargs: dict[str, Any] | None = None, pipeline_class: Any | None = None, **kwargs: Any) -> VisualQuestionAnsweringPipeline: ...
@@ -565,21 +520,16 @@ def pipeline(
             - `"image-segmentation"`: will return a [`ImageSegmentationPipeline`].
             - `"image-text-to-text"`: will return a [`ImageTextToTextPipeline`].
             - `"image-to-image"`: will return a [`ImageToImagePipeline`].
-            - `"image-to-text"`: will return a [`ImageToTextPipeline`].
             - `"keypoint-matching"`: will return a [`KeypointMatchingPipeline`].
             - `"mask-generation"`: will return a [`MaskGenerationPipeline`].
             - `"object-detection"`: will return a [`ObjectDetectionPipeline`].
             - `"question-answering"`: will return a [`QuestionAnsweringPipeline`].
-            - `"summarization"`: will return a [`SummarizationPipeline`].
             - `"table-question-answering"`: will return a [`TableQuestionAnsweringPipeline`].
-            - `"text2text-generation"`: will return a [`Text2TextGenerationPipeline`].
             - `"text-classification"` (alias `"sentiment-analysis"` available): will return a
               [`TextClassificationPipeline`].
             - `"text-generation"`: will return a [`TextGenerationPipeline`]:.
             - `"text-to-audio"` (alias `"text-to-speech"` available): will return a [`TextToAudioPipeline`]:.
             - `"token-classification"` (alias `"ner"` available): will return a [`TokenClassificationPipeline`].
-            - `"translation"`: will return a [`TranslationPipeline`].
-            - `"translation_xx_to_yy"`: will return a [`TranslationPipeline`].
             - `"video-classification"`: will return a [`VideoClassificationPipeline`].
             - `"visual-question-answering"`: will return a [`VisualQuestionAnsweringPipeline`].
             - `"zero-shot-classification"`: will return a [`ZeroShotClassificationPipeline`].
@@ -1057,16 +1007,6 @@ def pipeline(
     else:
         processor = None

-    if task == "translation" and model.config.task_specific_params:
-        for key in model.config.task_specific_params:
-            if key.startswith("translation"):
-                task = key
-                warnings.warn(
-                    f'"translation" task was used, instead of "translation_XX_to_YY", defaulting to "{task}"',
-                    UserWarning,
-                )
-                break
-
     if tokenizer is not None:
         kwargs["tokenizer"] = tokenizer

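Note: taken together, the hunks above remove "summarization", "translation", "translation_xx_to_yy", "text2text-generation", and "image-to-text" from the task registry, overloads, and docs, along with the automatic "translation" -> "translation_xx_to_yy" remapping. A minimal sketch of the observable change (model id and exception handling are only illustrative):

    from transformers import pipeline

    # Still-registered tasks behave as before:
    generator = pipeline("text-generation", model="distilbert/distilgpt2")

    # The removed tasks no longer resolve to a pipeline class:
    try:
        pipeline("summarization")
    except Exception as err:  # raised as an unknown-task error; exact type not pinned down here
        print("unknown task:", err)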

transformers/pipelines/any_to_any.py

@@ -386,7 +386,7 @@ class AnyToAnyPipeline(Pipeline):
        text = inputs.pop("text")

        # Feature extractor do not load audio files and expect a decode array
-        if "audio" in inputs and hasattr(self.processor, "feature_extractor"):
+        if inputs.get("audio", None) is not None and hasattr(self.processor, "feature_extractor"):
            inputs["audio"] = self.processor.feature_extractor.fetch_audio(inputs["audio"])

        # If batched text inputs, we set padding to True unless specified otherwise
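Note: this change is behavioral, not cosmetic. A caller passing an explicit `audio=None` used to satisfy the old membership test and reach `fetch_audio(None)`. A small illustration:

    inputs = {"audio": None}

    print("audio" in inputs)                       # True  -> old check tried to fetch a None audio
    print(inputs.get("audio", None) is not None)   # False -> new check skips the feature extractor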

transformers/pipelines/automatic_speech_recognition.py

@@ -480,10 +480,8 @@ class AutomaticSpeechRecognitionPipeline(ChunkPipeline):
                inputs,
                sampling_rate=self.feature_extractor.sampling_rate,
                return_tensors="pt",
-                return_token_timestamps=True,
                return_attention_mask=True,
            )
-            extra["num_frames"] = processed.pop("num_frames")
        else:
            processed = self.feature_extractor(
                inputs,

transformers/pipelines/base.py

@@ -901,7 +901,7 @@ class Pipeline(_ScikitCompat, PushToHubMixin):
        # Update the generation config with task specific params if they exist.
        # NOTE: 1. `prefix` is pipeline-specific and doesn't exist in the generation config.
        # 2. `task_specific_params` is a legacy feature and should be removed in a future version.
-        task_specific_params = self.model.config.task_specific_params
+        task_specific_params = getattr(self.model.config, "task_specific_params", None)
        if task_specific_params is not None and task in task_specific_params:
            this_task_params = task_specific_params.get(task)
            if "prefix" in this_task_params:

transformers/pipelines/image_text_to_text.py

@@ -334,7 +334,7 @@ class ImageTextToTextPipeline(Pipeline):
                return_tensors="pt",
                tokenize=True,
                return_dict=True,
-            )
+            ).to(dtype=self.dtype)
            model_inputs["text"] = inputs
            return model_inputs
        # In case we only have text inputs
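Note: `.to(dtype=self.dtype)` casts the processor output to the pipeline's dtype. On a `BatchFeature`, passing only `dtype` converts floating-point tensors and leaves integer tensors alone, so `input_ids` survive while `pixel_values` match the model (a sketch, assuming the processor output is a `BatchFeature`):

    import torch
    from transformers import BatchFeature

    batch = BatchFeature({"input_ids": torch.tensor([[1, 2]]), "pixel_values": torch.rand(1, 3, 2, 2)})
    batch = batch.to(dtype=torch.float16)
    print(batch["input_ids"].dtype, batch["pixel_values"].dtype)  # torch.int64 torch.float16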

transformers/pipelines/text_to_audio.py

@@ -152,8 +152,12 @@ class TextToAudioPipeline(Pipeline):

        if self.model.config.model_type == "bark":
            # bark Tokenizer is called with BarkProcessor which uses those kwargs
+            # Check if generation_config has semantic_config (BarkGenerationConfig) or use default
+            max_length = 256
+            if hasattr(self.generation_config, "semantic_config"):
+                max_length = getattr(self.generation_config.semantic_config, "max_input_semantic_length", 256)
            new_kwargs = {
-                "max_length": self.generation_config.semantic_config.get("max_input_semantic_length", 256),
+                "max_length": max_length,
                "add_special_tokens": False,
                "return_attention_mask": True,
                "return_token_type_ids": False,

transformers/processing_utils.py

@@ -508,22 +508,7 @@ class TokenizerChatTemplateKwargs(TypedDict, total=False):
    return_assistant_tokens_mask: bool | None = False


-class ChatTemplateLoadKwargs(TypedDict, total=False):
-    """
-    Keyword arguments used to load multimodal data in processor chat templates.
-
-    num_frames (`int`, *optional*):
-        Number of frames to sample uniformly. If not passed, the whole video is loaded.
-    load_audio_from_video (`bool`, *optional*):
-        Whether to use the audio track of input video. If `True` the audio track will be loaded and passed to the
-        processor. This flag has no effect if the model doesn't support audio modality.
-    """
-
-    sampling_rate: int | None = 16_000
-    load_audio_from_video: bool | None = False
-
-
-class ProcessorChatTemplateKwargs(ChatTemplateLoadKwargs, TokenizerChatTemplateKwargs, total=False):
+class ProcessorChatTemplateKwargs(TokenizerChatTemplateKwargs, total=False):
    """
    Keyword arguments for processor's `apply_chat_template`.

@@ -531,15 +516,18 @@ class ProcessorChatTemplateKwargs(ChatTemplateLoadKwargs, TokenizerChatTemplateK
            Whether to tokenize the output or not.
        return_dict (`bool`, defaults to `False`):
            Whether to return a dictionary with named outputs. Has no effect if tokenize is `False`.
+        load_audio_from_video (`bool`, *optional*, defaults to `False`):
+            Whether to use the audio track of input video. If `True` the audio track will be loaded and passed to the
+            processor. This flag has no effect if the model doesn't support audio modality.
    """

    tokenize: bool | None = False
    return_dict: bool | None = False
+    load_audio_from_video: bool | None = False


class AllKwargsForChatTemplate(TypedDict, total=False):
    processor_kwargs: ProcessingKwargs
-    mm_load_kwargs: ChatTemplateLoadKwargs
    template_kwargs: ProcessorChatTemplateKwargs

@@ -1233,7 +1221,8 @@ class ProcessorMixin(PushToHubMixin):

        """
        # holding a copy to avoid mutating user-provided arguments
-        kwargs = kwargs.copy()
+        # Use deepcopy to also copy nested dicts (like videos_kwargs) that will be modified via pop()
+        kwargs = copy.deepcopy(kwargs)

        # Initialize dictionaries
        output_kwargs = {
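Note: `dict.copy()` is shallow, so nested kwarg dicts stayed shared with the caller and were mutated by later `pop()` calls; `copy.deepcopy` severs that link. A minimal illustration:

    import copy

    user_kwargs = {"videos_kwargs": {"fps": 2}}

    shallow = user_kwargs.copy()
    shallow["videos_kwargs"].pop("fps")
    print(user_kwargs)   # {'videos_kwargs': {}} -- the caller's nested dict was mutated

    user_kwargs = {"videos_kwargs": {"fps": 2}}
    deep = copy.deepcopy(user_kwargs)
    deep["videos_kwargs"].pop("fps")
    print(user_kwargs)   # {'videos_kwargs': {'fps': 2}} -- the caller is unaffected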
@@ -1520,7 +1509,9 @@ class ProcessorMixin(PushToHubMixin):
        if "PixtralProcessor" in cls.__name__:
            from .tokenization_utils_tokenizers import TokenizersBackend

-            tokenizer = TokenizersBackend.from_pretrained(pretrained_model_name_or_path, **kwargs)
+            tokenizer = TokenizersBackend.from_pretrained(
+                pretrained_model_name_or_path, subfolder=subfolder, **kwargs
+            )
        else:
            tokenizer = cls._load_tokenizer_from_pretrained(
                sub_processor_type, pretrained_model_name_or_path, subfolder=subfolder, **kwargs
@@ -1708,22 +1699,25 @@ class ProcessorMixin(PushToHubMixin):
        else:
            kwargs["return_offsets_mapping"] = True  # force offset mapping so we can infer token boundaries

-        # Fill sets of kwargs that should be used by different parts of template
-        processed_kwargs = {
-            "mm_load_kwargs": {},
-            "template_kwargs": {},
-        }
-
-        for kwarg_type in processed_kwargs:
-            for key in AllKwargsForChatTemplate.__annotations__[kwarg_type].__annotations__:
-                kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__[kwarg_type]
-                default_value = getattr(kwarg_type_defaults, key, None)
-                value = kwargs.pop(key, default_value)
-                if value is not None and not isinstance(value, dict):
-                    processed_kwargs[kwarg_type][key] = value
+        # Fill sets of kwargs that should be used by jinja template, filtering out kwargs used in `processor.__call__`
+        # NOTE: we don't only filter but also set the default values here. Without default values, we can remove it
+        template_kwargs = {}
+        for key in AllKwargsForChatTemplate.__annotations__["template_kwargs"].__annotations__:
+            kwarg_type_defaults = AllKwargsForChatTemplate.__annotations__["template_kwargs"]
+            default_value = getattr(kwarg_type_defaults, key, None)
+            value = kwargs.pop(key, default_value)
+            if value is not None and not isinstance(value, dict):
+                template_kwargs[key] = value

        # Pass unprocessed custom kwargs
-        processed_kwargs["template_kwargs"].update(kwargs)
+        template_kwargs.update(kwargs)
+
+        # Set the sampling rate to load the audio files if user hasn't already passed with `kwargs`
+        if "sampling_rate" not in template_kwargs:
+            if hasattr(self, "feature_extractor") and hasattr(self.feature_extractor, "sampling_rate"):
+                template_kwargs["sampling_rate"] = self.feature_extractor.sampling_rate
+            else:
+                template_kwargs["sampling_rate"] = 16_000

        if isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple)) or hasattr(conversation[0], "content")
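Note: the filtering loop relies on two runtime properties of these `TypedDict` subclasses: `__annotations__` lists the declared keys, and the class-level assignments double as defaults reachable via `getattr`. A self-contained sketch of the same trick (names are stand-ins, not the real classes):

    from typing import TypedDict

    class TemplateKwargs(TypedDict, total=False):  # stand-in for ProcessorChatTemplateKwargs
        tokenize: bool | None = False
        return_dict: bool | None = False

    user_kwargs = {"tokenize": True, "fps": 2}  # "fps" belongs to processor __call__, not the template
    template_kwargs = {}
    for key in TemplateKwargs.__annotations__:        # declared template keys
        default = getattr(TemplateKwargs, key, None)  # class-level assignment doubles as the default
        value = user_kwargs.pop(key, default)
        if value is not None and not isinstance(value, dict):
            template_kwargs[key] = value
    print(template_kwargs)  # {'tokenize': True, 'return_dict': False}
    print(user_kwargs)      # {'fps': 2} stays behind for processor __call__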
@@ -1734,9 +1728,8 @@
            is_batched = False
            conversations = [conversation]

-        tokenize = processed_kwargs["template_kwargs"].pop("tokenize", False)
-        return_dict = processed_kwargs["template_kwargs"].pop("return_dict", True)
-        mm_load_kwargs = processed_kwargs["mm_load_kwargs"]
+        tokenize = template_kwargs.pop("tokenize", False)
+        return_dict = template_kwargs.pop("return_dict", True)

        if tokenize:
            batch_images, batch_videos = [], []
@@ -1767,12 +1760,12 @@
                    videos.extend(video_fnames)

                # Audio models do not accept nested list of audios (yet!) so we construct a flat input audio list
-                if not mm_load_kwargs["load_audio_from_video"]:
+                if not template_kwargs["load_audio_from_video"]:
                    for fname in audio_fnames:
-                        batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
+                        batch_audios.append(load_audio(fname, sampling_rate=template_kwargs["sampling_rate"]))
                else:
                    for fname in video_fnames:
-                        batch_audios.append(load_audio(fname, sampling_rate=mm_load_kwargs["sampling_rate"]))
+                        batch_audios.append(load_audio(fname, sampling_rate=template_kwargs["sampling_rate"]))

                # Currently all processors can accept nested list of batches, but not flat list of visuals
                # So we'll make a batched list of images and let the processor handle it
@@ -1783,14 +1776,12 @@
            if hasattr(self, "tokenizer") and hasattr(self.tokenizer, "special_tokens_map"):
                special_tokens = self.tokenizer.special_tokens_map
                # Filter out tokens that conflict with template kwargs
-                special_tokens_map = {
-                    k: v for k, v in special_tokens.items() if k not in processed_kwargs["template_kwargs"]
-                }
+                special_tokens_map = {k: v for k, v in special_tokens.items() if k not in template_kwargs}

            prompt, generation_indices = render_jinja_template(
                conversations=conversations,
                chat_template=chat_template,
-                **processed_kwargs["template_kwargs"],  # different flags such as `return_assistant_mask`
+                **template_kwargs,  # different flags such as `return_assistant_mask`
                **special_tokens_map,  # tokenizer special tokens are used by some templates
            )

@@ -1826,7 +1817,7 @@
            )

        if return_dict:
-            if processed_kwargs["template_kwargs"].get("return_assistant_tokens_mask", False):
+            if template_kwargs.get("return_assistant_tokens_mask", False):
                assistant_masks = []
                offset_mapping = out.pop("offset_mapping")
                input_ids = out["input_ids"]

transformers/pytorch_utils.py

@@ -35,10 +35,10 @@ logger = logging.get_logger(__name__)

is_torch_greater_or_equal_than_2_8 = is_torch_greater_or_equal("2.8", accept_dev=True)
is_torch_greater_or_equal_than_2_6 = is_torch_greater_or_equal("2.6", accept_dev=True)
-is_torch_greater_or_equal_than_2_4 = is_torch_greater_or_equal("2.4", accept_dev=True)
-is_torch_greater_or_equal_than_2_3 = is_torch_greater_or_equal("2.3", accept_dev=True)

# For backwards compatibility (e.g. some remote codes on Hub using those variables).
+is_torch_greater_or_equal_than_2_4 = is_torch_greater_or_equal("2.4", accept_dev=True)
+is_torch_greater_or_equal_than_2_3 = is_torch_greater_or_equal("2.3", accept_dev=True)
is_torch_greater_or_equal_than_2_2 = is_torch_greater_or_equal("2.2", accept_dev=True)
is_torch_greater_or_equal_than_2_1 = is_torch_greater_or_equal("2.1", accept_dev=True)
is_torch_greater_or_equal_than_2_0 = is_torch_greater_or_equal("2.0", accept_dev=True)
@@ -238,30 +238,6 @@ def id_tensor_storage(tensor: torch.Tensor) -> tuple[torch.device, int, int]:
    return tensor.device, unique_id, storage_size(tensor)


-def isin_mps_friendly(elements: torch.Tensor, test_elements: torch.Tensor | int) -> torch.Tensor:
-    """
-    Same as `torch.isin` without flags, but MPS-friendly. We can remove this function when we stop supporting
-    torch <= 2.3. See https://github.com/pytorch/pytorch/issues/77764#issuecomment-2067838075
-
-    Args:
-        elements (`torch.Tensor`): Input elements
-        test_elements (`torch.Tensor` or `int`): The elements to check against.
-
-    Returns:
-        `torch.Tensor`: A boolean tensor of the same shape as `elements` that is True for `elements` in `test_elements`
-        and False otherwise
-    """
-
-    if elements.device.type == "mps" and not is_torch_greater_or_equal_than_2_4:
-        test_elements = torch.tensor(test_elements)
-        if test_elements.ndim == 0:
-            test_elements = test_elements.unsqueeze(0)
-        return elements.tile(test_elements.shape[0], 1).eq(test_elements.unsqueeze(1)).sum(dim=0).bool().squeeze()
-    else:
-        # Note: don't use named arguments in `torch.isin`, see https://github.com/pytorch/pytorch/issues/126045
-        return torch.isin(elements, test_elements)
-
-
@wraps(lru_cache)
def compile_compatible_method_lru_cache(*lru_args, **lru_kwargs):
    """

transformers/quantizers/quantizer_compressed_tensors.py

@@ -84,13 +84,15 @@ class CompressedTensorsHfQuantizer(HfQuantizer):
        ) or self.quantization_config.is_sparsification_compressed:
            self.compressor.decompress_model(model=model)

+    # NOTE: TP plan override for compressed tensors removed - unsupported styles were used.
+    # TODO: Implement proper TP support for compressed tensors quantization
    def update_tp_plan(self, config):
        additional_plan = {
-            "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
+            "layers.*.feed_forward.experts.*.gate_proj.weight": "colwise",
+            "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.experts.*.up_proj.weight": "colwise",
+            "layers.*.feed_forward.experts.*.up_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.experts.*.down_proj.weight": "rowwise",
        }
        if config.get_text_config() is not None and config.get_text_config().base_model_tp_plan is not None:
            config.get_text_config().base_model_tp_plan.update(additional_plan)
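Note: this hunk, and the fbgemm and fine-grained FP8 hunks below, swap the `local_*` sharding styles for plain "colwise"/"rowwise". As a rough mental model only (not the actual transformers implementation): for an nn.Linear weight of shape (out_features, in_features), colwise shards the output dimension and rowwise the input dimension:

    import torch

    world_size, rank = 2, 0
    weight = torch.randn(8, 4)  # hypothetical projection weight (out_features, in_features)

    colwise_shard = torch.chunk(weight, world_size, dim=0)[rank]  # each rank owns a slice of outputs
    rowwise_shard = torch.chunk(weight, world_size, dim=1)[rank]  # each rank owns a slice of inputs
    print(colwise_shard.shape, rowwise_shard.shape)  # torch.Size([4, 4]) torch.Size([8, 2])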

transformers/quantizers/quantizer_fbgemm_fp8.py

@@ -133,14 +133,13 @@ class FbgemmFp8HfQuantizer(HfQuantizer):
            # We are using a different tp plan with local_colwise and local_rowwise for the attention because fbgemm operations cannot be parallelized
            # With local_colwise and local_rowwise, all the operations are done locally, and we add a gather operation to gather the results instead of
            # using dtensors
-            "layers.*.self_attn.q_proj.weight": "local_colwise",
-            "layers.*.self_attn.q_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.k_proj.weight": "local_colwise",
-            "layers.*.self_attn.k_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.v_proj.weight": "local_colwise",
-            "layers.*.self_attn.v_proj.weight_scale": "local_colwise",
-            "layers.*.self_attn.o_proj.weight": "local_rowwise",
-            "layers.*.self_attn": "gather",
+            "layers.*.self_attn.q_proj.weight": "colwise",
+            "layers.*.self_attn.q_proj.weight_scale": "colwise",
+            "layers.*.self_attn.k_proj.weight": "colwise",
+            "layers.*.self_attn.k_proj.weight_scale": "colwise",
+            "layers.*.self_attn.v_proj.weight": "colwise",
+            "layers.*.self_attn.v_proj.weight_scale": "colwise",
+            "layers.*.self_attn.o_proj.weight": "rowwise",
            # We keep the same sequence_parallel plan for layernorms
            "layers.*.input_layernorm.weight": "sequence_parallel",
            "layers.*.post_attention_layernorm.weight": "sequence_parallel",
@@ -148,23 +147,21 @@
            # We keep the same local_colwise and local_rowwise plan for the feed forward shared expert
            # We also add scales for the shared expert, for local_colwise the scale is also local_colwise
            # For local_rowwise the scale is replicated, so we don't need to add it
-            "layers.*.feed_forward.shared_expert.gate_proj.weight": "local_colwise",
-            "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.shared_expert.up_proj.weight": "local_colwise",
-            "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.shared_expert.down_proj.weight": "local_rowwise",
-            "layers.*.feed_forward.experts": "local",
-            "layers.*.feed_forward": "gather",
-            "layers.*.feed_forward.experts.*.gate_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight": "local_colwise",
-            "layers.*.feed_forward.experts.*.up_proj.weight_scale": "local_colwise",
-            "layers.*.feed_forward.experts.*.down_proj.weight": "local_rowwise",
+            "layers.*.feed_forward.shared_expert.gate_proj.weight": "colwise",
+            "layers.*.feed_forward.shared_expert.gate_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.shared_expert.up_proj.weight": "colwise",
+            "layers.*.feed_forward.shared_expert.up_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.shared_expert.down_proj.weight": "rowwise",
+            "layers.*.feed_forward.experts.*.gate_proj.weight": "colwise",
+            "layers.*.feed_forward.experts.*.gate_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.experts.*.up_proj.weight": "colwise",
+            "layers.*.feed_forward.experts.*.up_proj.weight_scale": "colwise",
+            "layers.*.feed_forward.experts.*.down_proj.weight": "rowwise",
            # For Fused implementation we use local_packed_rowwise for the gate_up_proj, and the same for the packed scales
            # We use local_colwise for the down_proj, and the scales are replicated so we don't add them
-            "layers.*.feed_forward.experts.gate_up_proj": "local_packed_rowwise",
-            "layers.*.feed_forward.experts.gate_up_proj_scale": "local_packed_rowwise",
-            "layers.*.feed_forward.experts.down_proj": "local_colwise",
+            "layers.*.feed_forward.experts.gate_up_proj": "packed_rowwise",
+            "layers.*.feed_forward.experts.gate_up_proj_scale": "packed_rowwise",
+            "layers.*.feed_forward.experts.down_proj": "colwise",
        }
        if config.get_text_config() is not None:
            config.get_text_config().base_model_tp_plan = text_plan

transformers/quantizers/quantizer_finegrained_fp8.py

@@ -110,29 +110,23 @@ class FineGrainedFP8HfQuantizer(HfQuantizer):
            pre_quantized=self.pre_quantized,
        )

-    # NOTE: TP is applied before quantization so this is only to add hooks.
-    # Quantization is incompatible with DTensors, so we have to anyway have
-    # gathers! But it should be model independent -> figure out where to put
-    # the gather and that's it.
    def update_tp_plan(self, config):
        if "Qwen3" in config.__class__.__name__:
            text_plan = {
-                "layers.*.self_attn.q_proj.weight": "local_colwise",
-                "layers.*.self_attn.q_proj.weight_scale_inv": "local_colwise",
-                "layers.*.self_attn.k_proj.weight": "local_colwise",
-                "layers.*.self_attn.k_proj.weight_scale_inv": "local_colwise",
-                "layers.*.self_attn.v_proj.weight": "local_colwise",
-                "layers.*.self_attn.v_proj.weight_scale_inv": "local_colwise",
-                "layers.*.self_attn.o_proj.weight": "local_rowwise",
-                "layers.*.self_attn.o_proj.weight_scale_inv": "local_rowwise",
-                "layers.*.self_attn": "gather",
-                "layers.*.mlp.gate_proj.weight": "local_colwise",
-                "layers.*.mlp.gate_proj.weight_scale_inv": "local_colwise",
-                "layers.*.mlp.up_proj.weight": "local_colwise",
-                "layers.*.mlp.up_proj.weight_scale_inv": "local_colwise",
-                "layers.*.mlp.down_proj.weight": "local_rowwise",
-                "layers.*.mlp.down_proj.weight_scale_inv": "local_rowwise",
-                "layers.*.mlp": "gather",
+                "layers.*.self_attn.q_proj.weight": "colwise",
+                "layers.*.self_attn.q_proj.weight_scale_inv": "colwise",
+                "layers.*.self_attn.k_proj.weight": "colwise",
+                "layers.*.self_attn.k_proj.weight_scale_inv": "colwise",
+                "layers.*.self_attn.v_proj.weight": "colwise",
+                "layers.*.self_attn.v_proj.weight_scale_inv": "colwise",
+                "layers.*.self_attn.o_proj.weight": "rowwise",
+                "layers.*.self_attn.o_proj.weight_scale_inv": "rowwise",
+                "layers.*.mlp.gate_proj.weight": "colwise",
+                "layers.*.mlp.gate_proj.weight_scale_inv": "colwise",
+                "layers.*.mlp.up_proj.weight": "colwise",
+                "layers.*.mlp.up_proj.weight_scale_inv": "colwise",
+                "layers.*.mlp.down_proj.weight": "rowwise",
+                "layers.*.mlp.down_proj.weight_scale_inv": "rowwise",
            }

            config.base_model_tp_plan = text_plan

transformers/quantizers/quantizer_mxfp4.py

@@ -55,7 +55,7 @@ class Mxfp4HfQuantizer(HfQuantizer):
        try:
            from ..integrations.hub_kernels import get_kernel

-            self.triton_kernels_hub = get_kernel("kernels-community/triton_kernels")
+            self.triton_kernels_hub = get_kernel("kernels-community/gpt-oss-triton-kernels")
        except ImportError:
            raise ImportError("kernels package is required for MXFP4 quantization")
        return self.triton_kernels_hub
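Note: only the Hub repo id changes here; the lazy fetch pattern stays the same. Assuming `hub_kernels` re-exports `get_kernel` from the standalone `kernels` package, the equivalent standalone call would be:

    from kernels import get_kernel  # assumed re-export; the kernels package must be installed

    triton_kernels_hub = get_kernel("kernels-community/gpt-oss-triton-kernels")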

transformers/quantizers/quantizer_torchao.py

@@ -181,9 +181,6 @@ class TorchAoHfQuantizer(HfQuantizer):
        self.set_metadata(checkpoint_files)

    def param_needs_quantization(self, model: "PreTrainedModel", param_name: str, **kwargs) -> bool:
-        if self.quantization_config.quant_type == "autoquant":
-            return False
-
        # check if the param_name is not in self.modules_to_not_convert
        if not should_convert_module(param_name, self.modules_to_not_convert):
            return False
@@ -213,19 +210,6 @@
        return isinstance(module, tuple(_QUANTIZABLE)) and tensor_name == "weight"

    def _process_model_after_weight_loading(self, model, **kwargs):
-        """No process required for torchao quantized model"""
-        if self.quantization_config.quant_type == "autoquant":
-            from torchao import autoquant
-            from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST
-
-            model = torch.compile(model, mode="max-autotune")
-            model = autoquant(
-                model,
-                qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST,
-                set_inductor_config=False,
-                **self.quantization_config.quant_type_kwargs,
-            )
-            return model
        return

    def is_serializable(self) -> bool:
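Note: with this special case gone, transformers no longer compiles and autoquantizes the model on load. Under the assumption that the same behavior is still wanted, the deleted logic can be reproduced on a loaded model with torchao directly (a sketch based on the removed lines, not a current transformers API; the model here is a stand-in):

    import torch
    import torch.nn as nn
    from torchao import autoquant
    from torchao.quantization import ALL_AUTOQUANT_CLASS_LIST

    model = nn.Sequential(nn.Linear(16, 16), nn.ReLU(), nn.Linear(16, 16))  # stand-in for a loaded model
    model = torch.compile(model, mode="max-autotune")
    model = autoquant(model, qtensor_class_list=ALL_AUTOQUANT_CLASS_LIST, set_inductor_config=False)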