PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/__init__.py CHANGED Viewed

@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).
-__version__ = "5.0.0rc3"
+__version__ = "5.1.0"
 import importlib
 import sys
@@ -147,7 +147,6 @@ _import_structure = {
         "ImageSegmentationPipeline",
         "ImageTextToTextPipeline",
         "ImageToImagePipeline",
-        "ImageToTextPipeline",
         "JsonPipelineDataFormat",
         "KeypointMatchingPipeline",
         "MaskGenerationPipeline",
@@ -157,14 +156,11 @@ _import_structure = {
         "Pipeline",
         "PipelineDataFormat",
         "QuestionAnsweringPipeline",
-        "SummarizationPipeline",
         "TableQuestionAnsweringPipeline",
-        "Text2TextGenerationPipeline",
         "TextClassificationPipeline",
         "TextGenerationPipeline",
         "TextToAudioPipeline",
         "TokenClassificationPipeline",
-        "TranslationPipeline",
         "VideoClassificationPipeline",
         "VisualQuestionAnsweringPipeline",
         "ZeroShotAudioClassificationPipeline",
@@ -443,6 +439,7 @@ else:
     _import_structure["modeling_flash_attention_utils"] = []
     _import_structure["modeling_layers"] = ["GradientCheckpointingLayer"]
     _import_structure["modeling_outputs"] = []
+    _import_structure["backbone_utils"] = ["BackboneConfigMixin", "BackboneMixin"]
     _import_structure["modeling_rope_utils"] = ["ROPE_INIT_FUNCTIONS", "dynamic_rope_update", "RopeParameters"]
     _import_structure["modeling_utils"] = ["PreTrainedModel", "AttentionInterface"]
     _import_structure["masking_utils"] = ["AttentionMaskInterface"]
@@ -471,6 +468,8 @@ else:
 # Direct imports for type-checking
 if TYPE_CHECKING:
     # All modeling imports
+    # Models
+    from .backbone_utils import BackboneConfigMixin, BackboneMixin
     from .cache_utils import Cache as Cache
     from .cache_utils import DynamicCache as DynamicCache
     from .cache_utils import DynamicLayer as DynamicLayer
@@ -613,8 +612,6 @@ if TYPE_CHECKING:
     from .integrations.executorch import convert_and_export_with_cache as convert_and_export_with_cache
     from .masking_utils import AttentionMaskInterface as AttentionMaskInterface
     from .model_debugging_utils import model_addition_debugger_context as model_addition_debugger_context
-    # Models
     from .modeling_layers import GradientCheckpointingLayer as GradientCheckpointingLayer
     from .modeling_rope_utils import ROPE_INIT_FUNCTIONS as ROPE_INIT_FUNCTIONS
     from .modeling_rope_utils import RopeParameters as RopeParameters
@@ -659,7 +656,6 @@ if TYPE_CHECKING:
     from .pipelines import ImageSegmentationPipeline as ImageSegmentationPipeline
     from .pipelines import ImageTextToTextPipeline as ImageTextToTextPipeline
     from .pipelines import ImageToImagePipeline as ImageToImagePipeline
-    from .pipelines import ImageToTextPipeline as ImageToTextPipeline
     from .pipelines import JsonPipelineDataFormat as JsonPipelineDataFormat
     from .pipelines import KeypointMatchingPipeline as KeypointMatchingPipeline
     from .pipelines import MaskGenerationPipeline as MaskGenerationPipeline
@@ -669,14 +665,11 @@ if TYPE_CHECKING:
     from .pipelines import Pipeline as Pipeline
     from .pipelines import PipelineDataFormat as PipelineDataFormat
     from .pipelines import QuestionAnsweringPipeline as QuestionAnsweringPipeline
-    from .pipelines import SummarizationPipeline as SummarizationPipeline
     from .pipelines import TableQuestionAnsweringPipeline as TableQuestionAnsweringPipeline
-    from .pipelines import Text2TextGenerationPipeline as Text2TextGenerationPipeline
     from .pipelines import TextClassificationPipeline as TextClassificationPipeline
     from .pipelines import TextGenerationPipeline as TextGenerationPipeline
     from .pipelines import TextToAudioPipeline as TextToAudioPipeline
     from .pipelines import TokenClassificationPipeline as TokenClassificationPipeline
-    from .pipelines import TranslationPipeline as TranslationPipeline
     from .pipelines import VideoClassificationPipeline as VideoClassificationPipeline
     from .pipelines import VisualQuestionAnsweringPipeline as VisualQuestionAnsweringPipeline
     from .pipelines import ZeroShotAudioClassificationPipeline as ZeroShotAudioClassificationPipeline

transformers/activations.py CHANGED Viewed

@@ -247,8 +247,8 @@ class XIELUActivation(nn.Module):
         self.register_buffer("eps", torch.tensor(eps, dtype=dtype))
         self.with_vector_loads = with_vector_loads
         # Temporary until xIELU CUDA fully implemented
-        self._beta_scalar = float(self.beta.detach().cpu().float().item())
-        self._eps_scalar = float(self.eps.detach().cpu().float().item())
+        self._beta_scalar = float(beta)
+        self._eps_scalar = float(eps)
         self._xielu_cuda_obj = None
         try:

transformers/backbone_utils.py ADDED Viewed

@@ -0,0 +1,326 @@
+# Copyright 2026 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Collection of utils to be used by backbones and their components."""
+import enum
+import inspect
+from huggingface_hub import repo_exists
+from .utils import logging
+logger = logging.get_logger(__name__)
+class BackboneType(enum.Enum):
+    TIMM = "timm"
+    TRANSFORMERS = "transformers"
+class BackboneConfigMixin:
+    """
+    A Mixin to support handling the `out_features` and `out_indices` attributes for the backbone configurations.
+    """
+    def set_output_features_output_indices(
+        self,
+        out_features: list | None,
+        out_indices: list | None,
+    ):
+        """
+        Sets output indices and features to new values and aligns them with the given `stage_names`.
+        If one of the inputs is not given, find the corresponding `out_features` or `out_indices`
+        for the given `stage_names`.
+        Args:
+            out_features (`list[str]`, *optional*):
+                The names of the features for the backbone to output. Defaults to `config._out_features` if not provided.
+            out_indices (`list[int]` or `tuple[int]`, *optional*):
+                The indices of the features for the backbone to output. Defaults to `config._out_indices` if not provided.
+        """
+        self._out_features = out_features
+        self._out_indices = list(out_indices) if isinstance(out_indices, tuple) else out_indices
+        # First verify that the out_features and out_indices are valid
+        self.verify_out_features_out_indices()
+        # Align output features with indices
+        out_features, out_indices = self._out_features, self._out_indices
+        if out_indices is None and out_features is None:
+            out_indices = [len(self.stage_names) - 1]
+            out_features = [self.stage_names[-1]]
+        elif out_indices is None and out_features is not None:
+            out_indices = [self.stage_names.index(layer) for layer in out_features]
+        elif out_features is None and out_indices is not None:
+            out_features = [self.stage_names[idx] for idx in out_indices]
+        # Update values and verify that the aligned out_features and out_indices are valid
+        self._out_features, self._out_indices = out_features, out_indices
+        self.verify_out_features_out_indices()
+    def verify_out_features_out_indices(self):
+        """
+        Verify that out_indices and out_features are valid for the given stage_names.
+        """
+        if self.stage_names is None:
+            raise ValueError("Stage_names must be set for transformers backbones")
+        if self._out_features is not None:
+            if not isinstance(self._out_features, (list,)):
+                raise ValueError(f"out_features must be a list got {type(self._out_features)}")
+            if any(feat not in self.stage_names for feat in self._out_features):
+                raise ValueError(
+                    f"out_features must be a subset of stage_names: {self.stage_names} got {self._out_features}"
+                )
+            if len(self._out_features) != len(set(self._out_features)):
+                raise ValueError(f"out_features must not contain any duplicates, got {self._out_features}")
+            if self._out_features != (
+                sorted_feats := [feat for feat in self.stage_names if feat in self._out_features]
+            ):
+                raise ValueError(
+                    f"out_features must be in the same order as stage_names, expected {sorted_feats} got {self._out_features}"
+                )
+        if self._out_indices is not None:
+            if not isinstance(self._out_indices, list):
+                raise ValueError(f"out_indices must be a list, got {type(self._out_indices)}")
+            # Convert negative indices to their positive equivalent: [-1,] -> [len(stage_names) - 1,]
+            positive_indices = tuple(idx % len(self.stage_names) if idx < 0 else idx for idx in self._out_indices)
+            if any(idx for idx in positive_indices if idx not in range(len(self.stage_names))):
+                raise ValueError(
+                    f"out_indices must be valid indices for stage_names {self.stage_names}, got {self._out_indices}"
+                )
+            if len(positive_indices) != len(set(positive_indices)):
+                msg = f"out_indices must not contain any duplicates, got {self._out_indices}"
+                msg += f"(equivalent to {positive_indices}))" if positive_indices != self._out_indices else ""
+                raise ValueError(msg)
+            if positive_indices != tuple(sorted(positive_indices)):
+                sorted_negative = [
+                    idx for _, idx in sorted(zip(positive_indices, self._out_indices), key=lambda x: x[0])
+                ]
+                raise ValueError(
+                    f"out_indices must be in the same order as stage_names, expected {sorted_negative} got {self._out_indices}"
+                )
+        if self._out_features is not None and self._out_indices is not None:
+            if len(self._out_features) != len(self._out_indices):
+                raise ValueError("out_features and out_indices should have the same length if both are set")
+            if self._out_features != [self.stage_names[idx] for idx in self._out_indices]:
+                raise ValueError("out_features and out_indices should correspond to the same stages if both are set")
+    @property
+    def out_features(self):
+        return self._out_features
+    @out_features.setter
+    def out_features(self, out_features: list[str]):
+        """
+        Set the out_features attribute. This will also update the out_indices attribute to match the new out_features.
+        """
+        self.set_output_features_output_indices(out_features=out_features, out_indices=None)
+    @property
+    def out_indices(self):
+        return self._out_indices
+    @out_indices.setter
+    def out_indices(self, out_indices: tuple[int, ...] | list[int]):
+        """
+        Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices.
+        """
+        out_indices = list(out_indices) if out_indices is not None else out_indices
+        self.set_output_features_output_indices(out_features=None, out_indices=out_indices)
+    def to_dict(self):
+        """
+        Serializes this instance to a Python dictionary. Override the default `to_dict()` from `PreTrainedConfig` to
+        include the `out_features` and `out_indices` attributes.
+        """
+        output = super().to_dict()
+        output["out_features"] = output.pop("_out_features", None)
+        output["out_indices"] = output.pop("_out_indices", None)
+        return output
+class BackboneMixin:
+    backbone_type: BackboneType | None = None
+    # Attribute to indicate if the backbone has attention and can return attention outputs.
+    # Should be set to `False` for conv-based models to be able to run `forward_with_filtered_kwargs`
+    has_attentions: bool = True
+    def __init__(self, *args, **kwargs) -> None:
+        """
+        Method to initialize the backbone. This method is called by the constructor of the base class after the
+        pretrained model weights have been loaded.
+        """
+        super().__init__(*args, **kwargs)
+        timm_backbone = kwargs.pop("timm_backbone", None)
+        if timm_backbone is not None:
+            self.backbone_type = BackboneType.TIMM
+        else:
+            self.backbone_type = BackboneType.TRANSFORMERS
+        if self.backbone_type == BackboneType.TIMM:
+            self._init_timm_backbone(backbone=timm_backbone)
+        elif self.backbone_type == BackboneType.TRANSFORMERS:
+            self._init_transformers_backbone()
+        else:
+            raise ValueError(f"backbone_type {self.backbone_type} not supported.")
+    def _init_timm_backbone(self, backbone) -> None:
+        """
+        Initialize the backbone model from timm. The backbone must already be loaded to backbone
+        """
+        # These will disagree with the defaults for the transformers models e.g. for resnet50
+        # the transformer model has out_features = ['stem', 'stage1', 'stage2', 'stage3', 'stage4']
+        # the timm model has out_features = ['act', 'layer1', 'layer2', 'layer3', 'layer4']
+        self.stage_names = [stage["module"] for stage in backbone.feature_info.info]
+        self.num_features = [stage["num_chs"] for stage in backbone.feature_info.info]
+        self.config._out_indices = list(backbone.feature_info.out_indices)
+        self.config._out_features = backbone.feature_info.module_name()
+        self.config.stage_names = self.stage_names
+        # We verify the out indices and out features are valid
+        self.config.verify_out_features_out_indices()
+    def _init_transformers_backbone(self) -> None:
+        self.stage_names = self.config.stage_names
+        self.config.verify_out_features_out_indices()
+        # Number of channels for each stage. This is set in the transformer backbone model init
+        self.num_features = None
+    @property
+    def out_features(self):
+        return self.config._out_features
+    @out_features.setter
+    def out_features(self, out_features: list[str]):
+        """
+        Set the out_features attribute. This will also update the out_indices attribute to match the new out_features.
+        """
+        self.config.out_features = out_features
+    @property
+    def out_indices(self):
+        return self.config._out_indices
+    @out_indices.setter
+    def out_indices(self, out_indices: tuple[int] | list[int]):
+        """
+        Set the out_indices attribute. This will also update the out_features attribute to match the new out_indices.
+        """
+        self.config.out_indices = out_indices
+    @property
+    def out_feature_channels(self):
+        # the current backbones will output the number of channels for each stage
+        # even if that stage is not in the out_features list.
+        return {stage: self.num_features[i] for i, stage in enumerate(self.stage_names)}
+    @property
+    def channels(self):
+        return [self.out_feature_channels[name] for name in self.out_features]
+    def forward_with_filtered_kwargs(self, *args, **kwargs):
+        if not self.has_attentions:
+            kwargs.pop("output_attentions", None)
+        if self.backbone_type == BackboneType.TIMM:
+            signature = dict(inspect.signature(self.forward).parameters)
+            kwargs = {k: v for k, v in kwargs.items() if k in signature}
+        return self(*args, **kwargs)
+    def forward(
+        self,
+        pixel_values,
+        output_hidden_states: bool | None = None,
+        output_attentions: bool | None = None,
+        return_dict: bool | None = None,
+    ):
+        raise NotImplementedError("This method should be implemented by the derived class.")
+def consolidate_backbone_kwargs_to_config(
+    backbone_config,
+    default_backbone: str | None = None,
+    default_config_type: str | None = None,
+    default_config_kwargs: dict | None = None,
+    timm_default_kwargs: dict | None = None,
+    **kwargs,
+):
+    # Lazy import to avoid circular import issues. Can be imported properly
+    # after deleting ref to `BackboneMixin` in `utils/backbone_utils.py`
+    from .configuration_utils import PreTrainedConfig
+    from .models.auto import CONFIG_MAPPING
+    use_timm_backbone = kwargs.pop("use_timm_backbone", True)
+    backbone_kwargs = kwargs.pop("backbone_kwargs", {})
+    backbone = kwargs.pop("backbone") if kwargs.get("backbone") is not None else default_backbone
+    kwargs.pop("use_pretrained_backbone", None)
+    # Init timm backbone with hardcoded values for BC. If everything is set to `None` and there is
+    # a default timm config, we use it to init the backbone.
+    if (
+        timm_default_kwargs is not None
+        and use_timm_backbone
+        and backbone is not None
+        and backbone_config is None
+        and not backbone_kwargs
+    ):
+        backbone_config = CONFIG_MAPPING["timm_backbone"](backbone=backbone, **timm_default_kwargs)
+    elif backbone is not None and backbone_config is None:
+        if repo_exists(backbone):
+            config_dict, _ = PreTrainedConfig.get_config_dict(backbone)
+            config_class = CONFIG_MAPPING[config_dict["model_type"]]
+            config_dict.update(backbone_kwargs)
+            backbone_config = config_class(**config_dict)
+        else:
+            backbone_config = CONFIG_MAPPING["timm_backbone"](backbone=backbone, **backbone_kwargs)
+    elif backbone_config is None and default_config_type is not None:
+        logger.info(
+            f"`backbone_config` is `None`. Initializing the config with the default `{default_config_type}` vision config."
+        )
+        default_config_kwargs = default_config_kwargs or {}
+        backbone_config = CONFIG_MAPPING[default_config_type](**default_config_kwargs)
+    elif isinstance(backbone_config, dict):
+        backbone_model_type = backbone_config.get("model_type")
+        config_class = CONFIG_MAPPING[backbone_model_type]
+        backbone_config = config_class.from_dict(backbone_config)
+    return backbone_config, kwargs
+def load_backbone(config):
+    """
+    Loads the backbone model from a config object.
+    If the config is from the backbone model itself, then we return a backbone model with randomly initialized
+    weights.
+    If the config is from the parent model of the backbone model itself, then we load the pretrained backbone weights
+    if specified.
+    """
+    from transformers import AutoBackbone
+    backbone_config = getattr(config, "backbone_config", None)
+    if backbone_config is None:
+        backbone = AutoBackbone.from_config(config=config)
+    else:
+        backbone = AutoBackbone.from_config(config=backbone_config)
+    return backbone

transformers/cache_utils.py CHANGED Viewed

@@ -7,6 +7,7 @@ import torch
 from .configuration_utils import PreTrainedConfig
 from .utils import (
     is_hqq_available,
+    is_optimum_quanto_available,
     is_quanto_greater,
     is_torch_greater_or_equal,
     is_torchdynamo_compiling,
@@ -584,7 +585,12 @@ class QuantoQuantizedLayer(QuantizedLayer):
         )
         # We need to import quanto here to avoid circular imports due to optimum/quanto/models/transformers_models.py
-        if is_quanto_greater("0.2.5", accept_dev=True):
+        if not is_optimum_quanto_available():
+            raise ImportError(
+                "You need to install optimum-quanto in order to use KV cache quantization with optimum-quanto "
+                "backend. Please install it via  with `pip install optimum-quanto`"
+            )
+        elif is_quanto_greater("0.2.5", accept_dev=True):
             from optimum.quanto import MaxOptimizer, qint2, qint4
         else:
             raise ImportError(
@@ -634,7 +640,10 @@ class HQQQuantizedLayer(QuantizedLayer):
         )
         if not is_hqq_available():
-            raise ImportError("You need to install `hqq` to use `HQQQuantizedLayer`")
+            raise ImportError(
+                "You need to install `HQQ` in order to use KV cache quantization with HQQ backend. "
+                "Please install it via  with `pip install hqq`"
+            )
         if self.nbits not in [1, 2, 3, 4, 8]:
             raise ValueError(

transformers/cli/serve.py CHANGED Viewed

@@ -839,6 +839,17 @@ class Serve:
                 for result in self.running_continuous_batching_manager.request_id_iter(request_id):
                     n_tokens_generated += 1
+                    # Always yield the token content (even for the final FINISHED token)
+                    if result.generated_tokens:
+                        token_id = result.generated_tokens[-1]
+                        yield self.build_chat_completion_chunk(
+                            request_id=request_id,
+                            content=token_id,
+                            model=model_id_and_revision,
+                            decode_stream=decode_stream,
+                            tokenizer=tokenizer,
+                        )
                     if result.status == RequestStatus.FINISHED:
                         generated_all_tokens = n_tokens_generated >= generation_config.max_new_tokens
@@ -855,14 +866,6 @@ class Serve:
                             model=model_id_and_revision,
                         )
                         break
-                    else:
-                        yield self.build_chat_completion_chunk(
-                            request_id=request_id,
-                            content=result.generated_tokens[-1],
-                            model=model_id_and_revision,
-                            decode_stream=decode_stream,
-                            tokenizer=tokenizer,
-                        )
             except Exception as e:
                 logger.error(str(e))

transformers/configuration_utils.py CHANGED Viewed

@@ -114,16 +114,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
             Whether or not the model should return a [`~transformers.utils.ModelOutput`] instead of a plain tuple.
         is_encoder_decoder (`bool`, *optional*, defaults to `False`):
             Whether the model is used as an encoder/decoder or not.
-        is_decoder (`bool`, *optional*, defaults to `False`):
-            Whether to only use the decoder in an encoder-decoder architecture, otherwise it has no effect on
-            decoder-only or encoder-only architectures.
-        cross_attention_hidden_size (`bool`, *optional*):
-            The hidden size of the cross-attention layer in case the model is used as a decoder in an encoder-decoder
-            setting and the cross-attention hidden dimension differs from `self.config.hidden_size`.
-        add_cross_attention (`bool`, *optional*, defaults to `False`):
-            Whether cross-attention layers should be added to the model. Note, this option is only relevant for models
-            that can be used as decoder models within the [`EncoderDecoderModel`] class, which consists of all models
-            in `AUTO_MODELS_FOR_CAUSAL_LM`.
         chunk_size_feed_forward (`int`, *optional*, defaults to `0`):
             The chunk size of all feed forward layers in the residual attention blocks. A chunk size of `0` means that
             the feed forward layer is not chunked. A chunk size of n means that the feed forward layer processes `n` <
@@ -134,43 +124,18 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         architectures (`list[str]`, *optional*):
             Model architectures that can be used with the model pretrained weights.
-        finetuning_task (`str`, *optional*):
-            Name of the task used to fine-tune the model.
         id2label (`dict[int, str]`, *optional*):
             A map from index (for instance prediction index, or target index) to label.
         label2id (`dict[str, int]`, *optional*):
             A map from label to index for the model.
         num_labels (`int`, *optional*):
             Number of labels to use in the last layer added to the model, typically for a classification task.
-        task_specific_params (`dict[str, Any]`, *optional*):
-            Additional keyword arguments to store for the current task.
         problem_type (`str`, *optional*):
             Problem type for `XxxForSequenceClassification` models. Can be one of `"regression"`,
             `"single_label_classification"` or `"multi_label_classification"`.
-        > Parameters linked to the tokenizer
-        tokenizer_class (`str`, *optional*):
-            The name of the associated tokenizer class to use (if none is set, will use the tokenizer associated to the
-            model by default).
-        prefix (`str`, *optional*):
-            A specific prompt that should be added at the beginning of each text before calling the model.
-        bos_token_id (`int`, *optional*):
-            The id of the _beginning-of-stream_ token.
-        pad_token_id (`int`, *optional*):
-            The id of the _padding_ token.
-        eos_token_id (`int`, *optional*):
-            The id of the _end-of-stream_ token.
-        decoder_start_token_id (`int`, *optional*):
-            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token.
-        sep_token_id (`int`, *optional*):
-            The id of the _separation_ token.
         > PyTorch specific parameters
-        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
-            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
-            model has a output word embedding layer.
         dtype (`str`, *optional*):
             The `dtype` of the weights. This attribute can be used to initialize the model to a non-default `dtype`
             (which is normally `float32`) and thus allow for optimal storage allocation. For example, if the saved
@@ -207,28 +172,14 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         return_dict: bool = True,
         dtype: Union[str, "torch.dtype"] | None = None,
         # Common arguments
-        tie_word_embeddings: bool = True,
         chunk_size_feed_forward: int = 0,
         is_encoder_decoder: bool = False,
-        is_decoder: bool = False,
-        cross_attention_hidden_size: int | None = None,
-        add_cross_attention: bool = False,
         # Fine-tuning task arguments
         architectures: list[str] | None = None,
-        finetuning_task: str | None = None,
         id2label: dict[int, str] | None = None,
         label2id: dict[str, int] | None = None,
         num_labels: int | None = None,
-        task_specific_params: dict[str, Any] | None = None,
         problem_type: str | None = None,
-        # Tokenizer kwargs
-        tokenizer_class: str | None = None,
-        prefix: str | None = None,
-        bos_token_id: int | None = None,
-        pad_token_id: int | None = None,
-        eos_token_id: int | None = None,
-        sep_token_id: int | None = None,
-        decoder_start_token_id: int | None = None,
         **kwargs,
     ):
         # Validation for some arguments
@@ -276,25 +227,15 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         self._output_attentions = output_attentions  # has public property
         # Less common kwargs, only used by some models
-        if "tie_encoder_decoder" in kwargs:
-            tie_encoder_decoder = kwargs.pop("tie_encoder_decoder")
-            tie_word_embeddings = tie_encoder_decoder or tie_word_embeddings
-        self.tie_word_embeddings = tie_word_embeddings
         self.chunk_size_feed_forward = chunk_size_feed_forward
         # Encoder-decoder models attributes
         self.is_encoder_decoder = is_encoder_decoder
-        self.is_decoder = is_decoder  # used in encoder-decoder models to differentiate encoder from decoder
-        self.cross_attention_hidden_size = cross_attention_hidden_size
-        self.add_cross_attention = add_cross_attention
         # Fine-tuning task attributes
         self.architectures = architectures
-        self.finetuning_task = finetuning_task
         self.id2label = id2label
         self.label2id = label2id
-        self.task_specific_params = task_specific_params
         self.problem_type = problem_type
         if self.id2label is None:
@@ -303,15 +244,6 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
             # Keys are always strings in JSON so convert ids to int here.
             self.id2label = {int(key): value for key, value in self.id2label.items()}
-        # Tokenizer attributes
-        self.tokenizer_class = tokenizer_class
-        self.prefix = prefix
-        self.bos_token_id = bos_token_id
-        self.pad_token_id = pad_token_id
-        self.eos_token_id = eos_token_id
-        self.sep_token_id = sep_token_id
-        self.decoder_start_token_id = decoder_start_token_id
         # Parameters for sequence generation saved in the config are popped instead of loading them.
         for parameter_name in GenerationConfig._get_default_generation_params().keys():
             kwargs.pop(parameter_name, None)
@@ -321,7 +253,7 @@ class PreTrainedConfig(PushToHubMixin, RotaryEmbeddingConfigMixin):
         self._commit_hash = kwargs.pop("_commit_hash", None)
         # Attention implementation to use, if relevant (it sets it recursively on sub-configs)
-        self._attn_implementation = kwargs.pop("attn_implementation", None)
+        self._attn_implementation: str | None = kwargs.pop("attn_implementation", None)
         # Experts implementation to use, if relevant (it sets it recursively on sub-configs)
         self._experts_implementation = kwargs.pop("experts_implementation", None)

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl