PyPI - transformers - Versions diffs - 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl - Mend

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1021) hide show

transformers/safetensors_conversion.py CHANGED Viewed

@@ -28,7 +28,8 @@ def spawn_conversion(token: str, private: bool, model_id: str):
     def start(_sse_connection):
         for line in _sse_connection.iter_lines():
-            line = line.decode()
+            if not isinstance(line, str):
+                line = line.decode()
             if line.startswith("event:"):
                 status = line[7:]
                 logger.debug(f"Safetensors conversion status: {status}")
@@ -83,7 +84,13 @@ def get_conversion_pr_reference(api: HfApi, model_id: str, **kwargs):
     return sha
-def auto_conversion(pretrained_model_name_or_path: str, ignore_errors_during_conversion=False, **cached_file_kwargs):
+def auto_conversion(
+    pretrained_model_name_or_path: str,
+    ignore_errors_during_conversion: bool = False,
+    safe_weights_name: str = "model.safetensors",
+    safe_weights_index_name: str = "model.safetensors.index.json",
+    **cached_file_kwargs,
+):
     try:
         api = HfApi(token=cached_file_kwargs.get("token"), headers={"user-agent": http_user_agent()})
         sha = get_conversion_pr_reference(api, pretrained_model_name_or_path, **cached_file_kwargs)
@@ -97,11 +104,11 @@ def auto_conversion(pretrained_model_name_or_path: str, ignore_errors_during_con
         # description.
         sharded = api.file_exists(
             pretrained_model_name_or_path,
-            "model.safetensors.index.json",
+            safe_weights_index_name,
             revision=sha,
             token=cached_file_kwargs.get("token"),
         )
-        filename = "model.safetensors.index.json" if sharded else "model.safetensors"
+        filename = safe_weights_index_name if sharded else safe_weights_name
         resolved_archive_file = cached_file(pretrained_model_name_or_path, filename, **cached_file_kwargs)
         return resolved_archive_file, sha, sharded

transformers/testing_utils.py CHANGED Viewed

@@ -48,7 +48,6 @@ from unittest import mock
 from unittest.mock import patch
 import httpx
-import urllib3
 from huggingface_hub import create_repo, delete_repo
 from packaging import version
@@ -97,7 +96,6 @@ from .utils import (
     is_flute_available,
     is_fp_quant_available,
     is_fsdp_available,
-    is_ftfy_available,
     is_g2p_en_available,
     is_galore_torch_available,
     is_gguf_available,
@@ -106,7 +104,6 @@ from .utils import (
     is_hadamard_available,
     is_hqq_available,
     is_huggingface_hub_greater_or_equal,
-    is_ipex_available,
     is_jinja_available,
     is_jmespath_available,
     is_jumanpp_available,
@@ -678,21 +675,6 @@ def require_torchcodec(test_case):
     return unittest.skipUnless(is_torchcodec_available(), "test requires Torchcodec")(test_case)
-def require_intel_extension_for_pytorch(test_case):
-    """
-    Decorator marking a test that requires Intel Extension for PyTorch.
-    These tests are skipped when Intel Extension for PyTorch isn't installed or it does not match current PyTorch
-    version.
-    """
-    return unittest.skipUnless(
-        is_ipex_available(),
-        "test requires Intel Extension for PyTorch to be installed and match current PyTorch version, see"
-        " https://github.com/intel/intel-extension-for-pytorch",
-    )(test_case)
 def require_torchaudio(test_case):
     """
     Decorator marking a test that requires torchaudio. These tests are skipped when torchaudio isn't installed.
@@ -767,13 +749,6 @@ def require_vision(test_case):
     return unittest.skipUnless(is_vision_available(), "test requires vision")(test_case)
-def require_ftfy(test_case):
-    """
-    Decorator marking a test that requires ftfy. These tests are skipped when ftfy isn't installed.
-    """
-    return unittest.skipUnless(is_ftfy_available(), "test requires ftfy")(test_case)
 def require_spacy(test_case):
     """
     Decorator marking a test that requires SpaCy. These tests are skipped when SpaCy isn't installed.
@@ -903,9 +878,7 @@ def require_torch_xpu(test_case):
     """
     Decorator marking a test that requires XPU (in PyTorch).
-    These tests are skipped when XPU backend is not available. XPU backend might be available either via stock
-    PyTorch (>=2.4) or via Intel Extension for PyTorch. In the latter case, if IPEX is installed, its version
-    must match match current PyTorch version.
+    These tests are skipped when XPU backend is not available.
     """
     return unittest.skipUnless(is_torch_xpu_available(), "test requires XPU device")(test_case)
@@ -2515,6 +2488,8 @@ class RequestCounter:
             return wrap
+        import urllib3
         self.patcher = patch.object(
             urllib3.connectionpool.log, "debug", side_effect=patched_with_thread_info(urllib3.connectionpool.log.debug)
         )

transformers/tokenization_mistral_common.py CHANGED Viewed

@@ -268,6 +268,15 @@ class MistralCommonBackend(PreTrainedTokenizerBase):
         if kwargs and not set(kwargs.keys()).issubset(_VALID_INIT_KWARGS):
             raise ValueError(f"Kwargs {list(kwargs.keys())} are not supported to init `MistralCommonBackend`.")
+        self.init_kwargs = {
+            "tokenizer_path": tokenizer_path,
+            "mode": mode,
+            "model_max_length": model_max_length,
+            "padding_side": padding_side,
+            "truncation_side": truncation_side,
+            "model_input_names": model_input_names,
+            "clean_up_tokenization_spaces": clean_up_tokenization_spaces,
+        }
         self._tokenizer_path = Path(tokenizer_path)
         self._mode = self._get_validation_mode(mode)

transformers/tokenization_python.py CHANGED Viewed

@@ -415,6 +415,9 @@ class PythonBackend(PreTrainedTokenizerBase):
         self.tokens_trie = Trie()
+        # Initialize total_vocab_size early to avoid issues if get_vocab() is called early (custom tokenizers)
+        self.total_vocab_size = 0
         # 2. init `_added_tokens_decoder` if child class did not
         if not hasattr(self, "_added_tokens_decoder"):
             self._added_tokens_decoder: dict[int, AddedToken] = {}
@@ -439,9 +442,6 @@ class PythonBackend(PreTrainedTokenizerBase):
         # 7. init the parent class
         super().__init__(**kwargs)
-        if self._added_tokens_decoder:
-            self._update_total_vocab_size()
         # 4. If some of the special tokens are not part of the vocab, we add them, at the end.
         # V5: the order of addition follows self.SPECIAL_TOKENS_ATTRIBUTES, then extra special tokens
         # Note: _add_tokens will automatically skip tokens that are already in the base vocab
@@ -449,7 +449,6 @@ class PythonBackend(PreTrainedTokenizerBase):
             [token for token in self.all_special_tokens if token not in self._added_tokens_encoder],
             special_tokens=True,
         )
-        self._update_total_vocab_size()
     @property
     def is_fast(self) -> bool:
@@ -501,6 +500,9 @@ class PythonBackend(PreTrainedTokenizerBase):
         """
         Size of the full vocabulary with the added tokens.
         """
+        # Lazy evaluation: compute if not already set (e.g., during initialization)
+        if self.total_vocab_size == 0:
+            self._update_total_vocab_size()
         return self.total_vocab_size
     def _update_total_vocab_size(self):

transformers/tokenization_utils_base.py CHANGED Viewed

@@ -990,14 +990,13 @@ class PreTrainedTokenizerBase(PushToHubMixin):
             if hasattr(self, key) and callable(getattr(self, key)):
                 raise AttributeError(f"{key} conflicts with the method {key} in {self.__class__.__name__}")
+        # V5: Convert deprecated additional_special_tokens to extra_special_tokens before storing init_kwargs
+        if "additional_special_tokens" in kwargs and "extra_special_tokens" not in kwargs:
+            kwargs["extra_special_tokens"] = kwargs.pop("additional_special_tokens")
         self.init_kwargs = copy.deepcopy(kwargs)
         self.name_or_path = kwargs.pop("name_or_path", "")
         self._processor_class = kwargs.pop("processor_class", None)
-        # Store additional_special_tokens in init_kwargs before conversion for backward compatibility
-        additional_special_tokens_value = kwargs.pop("additional_special_tokens", None)
-        if "additional_special_tokens" not in self.init_kwargs:
-            self.init_kwargs["additional_special_tokens"] = additional_special_tokens_value
-        kwargs.setdefault("extra_special_tokens", additional_special_tokens_value)
         self._pad_token_type_id = 0
         self.verbose = kwargs.pop("verbose", False)
@@ -1025,21 +1024,15 @@ class PreTrainedTokenizerBase(PushToHubMixin):
                 else:
                     raise TypeError(f"Special token {key} has to be either str or AddedToken but got: {type(value)}")
             elif key == "extra_special_tokens":
-                # V5: Support extra_special_tokens in __init__
                 value = kwargs.pop(key)
                 if value is None:
                     continue
-                # If dict: treat as model specific named special tokens (attributes)
                 if isinstance(value, dict):
                     self._set_model_specific_special_tokens(special_tokens=value)
-                else:
-                    if not isinstance(value, (list, tuple)) or not all(
-                        isinstance(t, (str, AddedToken)) for t in value
-                    ):
-                        raise TypeError(
-                            "extra_special_tokens must be a list/tuple of str or AddedToken, or a dict mapping names to tokens"
-                        )
+                elif isinstance(value, (list, tuple)):
                     self._extra_special_tokens = list(value)
+                else:
+                    raise TypeError("extra_special_tokens must be a list/tuple of tokens or a dict of named tokens")
             elif (
                 key.endswith("_token")
                 and key not in self.SPECIAL_TOKENS_ATTRIBUTES
@@ -1163,8 +1156,10 @@ class PreTrainedTokenizerBase(PushToHubMixin):
         # V5: Allowed keys are SPECIAL_TOKENS_ATTRIBUTES + "extra_special_tokens"
         # Backward compatibility: convert "additional_special_tokens" to "extra_special_tokens"
         special_tokens_dict = dict(special_tokens_dict)
-        if "additional_special_tokens" in special_tokens_dict and "extra_special_tokens" not in special_tokens_dict:
-            special_tokens_dict["extra_special_tokens"] = special_tokens_dict.pop("additional_special_tokens")
+        if "additional_special_tokens" in special_tokens_dict:
+            special_tokens_dict.setdefault(
+                "extra_special_tokens", special_tokens_dict.pop("additional_special_tokens")
+            )
         allowed_keys = set(self.SPECIAL_TOKENS_ATTRIBUTES) | {"extra_special_tokens"}
         tokens_to_add = []
@@ -1251,81 +1246,50 @@ class PreTrainedTokenizerBase(PushToHubMixin):
         return self._pad_token_type_id
     def __setattr__(self, key, value):
-        key_without_id = key
-        key_is_special_id = key.endswith("_id") or key.endswith("_ids")
-        if key_is_special_id:
-            key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4]
+        # Handle _id/_ids suffix (eg. bos_token_id -> bos_token)
+        key_without_id = key.removesuffix("_ids").removesuffix("_id") if key.endswith(("_id", "_ids")) else key
+        # Named special tokens (bos_token, eos_token, etc.)
+        if key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES:
+            if key != key_without_id and value is not None:
+                value = self.convert_ids_to_tokens(value)
+            if value is not None and not isinstance(value, (str, AddedToken)):
+                raise ValueError(f"Cannot set a non-string value as the {key_without_id}")
+            self._special_tokens_map[key_without_id] = value
+            return
-        # Check if this is a named special token
-        if (
-            self.__dict__.get("_special_tokens_map", None) is not None
-            and key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES
-        ):
-            if key_is_special_id:
-                if value is not None:
-                    value = self.convert_ids_to_tokens(value)
-                key = key_without_id
-            if not isinstance(value, (str, AddedToken)) and value is not None:
-                raise ValueError(f"Cannot set a non-string value as the {key}")
-            self._special_tokens_map[key] = value
-        # Check if this is extra_special_tokens or extra_special_tokens_ids
-        elif self.__dict__.get("_extra_special_tokens", None) is not None and key_without_id == "extra_special_tokens":
-            if key_is_special_id:
-                if value is not None:
-                    value = [self.convert_ids_to_tokens(val) for val in value]
-                key = key_without_id
+        # Extra special tokens: model-specific special tokens without standard names (eg. <mask_1>)
+        if key_without_id == "extra_special_tokens":
+            if key != key_without_id and value is not None and isinstance(value, (list, tuple)):
+                value = [self.convert_ids_to_tokens(v) for v in value]
+            if not isinstance(value, (list, tuple)) and value is not None:
+                raise ValueError(f"extra_special_tokens must be a list or tuple, got {type(value)}")
+            self._extra_special_tokens = [] if value is None else list(value)
+            return
-            if key == "extra_special_tokens":
-                if value is None:
-                    self._extra_special_tokens = []
-                elif isinstance(value, dict):
-                    # Dict is treated as model-specific special tokens (such as multimodal tokens)
-                    self._set_model_specific_special_tokens(special_tokens=value)
-                elif isinstance(value, (list, tuple)):
-                    self._extra_special_tokens = list(value)
-                else:
-                    raise ValueError(f"extra_special_tokens must be a list, tuple, or dict, got {type(value)}")
-        else:
-            super().__setattr__(key, value)
+        super().__setattr__(key, value)
     def __getattr__(self, key):
-        key_without_id = key
-        key_is_special_id = key.endswith("_id") or key.endswith("_ids")
-        if key_is_special_id:
-            key_without_id = key[:-3] if not key.endswith("_ids") else key[:-4]
-        # Check if this is a named special token
-        if (
-            self.__dict__.get("_special_tokens_map", None) is not None
-            and key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES
-        ):
-            _special_tokens_map = self.__dict__["_special_tokens_map"]
-            if not key_is_special_id:
-                if _special_tokens_map[key_without_id] is None:
-                    if self.verbose:
-                        logger.error(f"Using {key}, but it is not set yet.")
-                    return None
-                value = _special_tokens_map[key_without_id]
-                return str(value)
-            else:
-                attr_as_tokens = getattr(self, key_without_id)
-                return self.convert_tokens_to_ids(attr_as_tokens) if attr_as_tokens is not None else None
-        # Check if this is extra_special_tokens or extra_special_tokens_ids
-        elif key_without_id == "extra_special_tokens":
-            if self.__dict__.get("_extra_special_tokens", None) is not None:
-                if not key_is_special_id:
-                    return [str(tok) for tok in self.__dict__["_extra_special_tokens"]]
-                else:
-                    # extra_special_tokens_ids
-                    tokens = self.__dict__["_extra_special_tokens"]
-                    return self.convert_tokens_to_ids([str(tok) for tok in tokens]) if tokens else []
+        # Handle _id/_ids suffix (eg. bos_token_id -> bos_token)
+        key_without_id = key.removesuffix("_ids").removesuffix("_id") if key.endswith(("_id", "_ids")) else key
+        # Named special tokens (bos_token, eos_token, etc.)
+        if key_without_id in self.SPECIAL_TOKENS_ATTRIBUTES:
+            token_value = self._special_tokens_map.get(key_without_id)
+            if token_value is None:
+                if self.verbose:
+                    logger.error(f"Using {key}, but it is not set yet.")
+                return None
+            return self.convert_tokens_to_ids(str(token_value)) if key != key_without_id else str(token_value)
+        # Extra special tokens
+        if key_without_id == "extra_special_tokens":
+            tokens = [str(tok) for tok in self._extra_special_tokens]
+            return self.convert_tokens_to_ids(tokens) if key != key_without_id else tokens
         if key not in self.__dict__:
             raise AttributeError(f"{self.__class__.__name__} has no attribute {key}")
-        else:
-            return super().__getattr__(key)
+        return super().__getattr__(key)
     def get_special_tokens_mask(
         self, token_ids_0: list[int], token_ids_1: list[int] | None = None, already_has_special_tokens: bool = False
@@ -1607,6 +1571,7 @@ class PreTrainedTokenizerBase(PushToHubMixin):
         pretrained_model_name_or_path = str(pretrained_model_name_or_path)
         vocab_files = {}
+        additional_files_names = {}
         init_configuration = {}
         is_local = os.path.isdir(pretrained_model_name_or_path)
@@ -1648,29 +1613,26 @@ class PreTrainedTokenizerBase(PushToHubMixin):
             # Check for versioned tokenizer files
             if "tokenizer_file" in vocab_files:
                 fast_tokenizer_file = FULL_TOKENIZER_FILE
-                try:
-                    resolved_config_file = cached_file(
-                        pretrained_model_name_or_path,
-                        TOKENIZER_CONFIG_FILE,
-                        cache_dir=cache_dir,
-                        force_download=force_download,
-                        proxies=proxies,
-                        token=token,
-                        revision=revision,
-                        local_files_only=local_files_only,
-                        subfolder=subfolder,
-                        user_agent=user_agent,
-                        _raise_exceptions_for_missing_entries=False,
-                        _commit_hash=commit_hash,
-                    )
-                    if resolved_config_file is not None:
-                        with open(resolved_config_file, encoding="utf-8") as reader:
-                            tokenizer_config = json.load(reader)
-                            if "fast_tokenizer_files" in tokenizer_config:
-                                fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
-                        commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
-                except Exception:
-                    pass
+                resolved_config_file = cached_file(
+                    pretrained_model_name_or_path,
+                    TOKENIZER_CONFIG_FILE,
+                    cache_dir=cache_dir,
+                    force_download=force_download,
+                    proxies=proxies,
+                    token=token,
+                    revision=revision,
+                    local_files_only=local_files_only,
+                    subfolder=subfolder,
+                    user_agent=user_agent,
+                    _raise_exceptions_for_missing_entries=False,
+                    _commit_hash=commit_hash,
+                )
+                if resolved_config_file is not None:
+                    with open(resolved_config_file, encoding="utf-8") as reader:
+                        tokenizer_config = json.load(reader)
+                        if "fast_tokenizer_files" in tokenizer_config:
+                            fast_tokenizer_file = get_fast_tokenizer_file(tokenizer_config["fast_tokenizer_files"])
+                    commit_hash = extract_commit_hash(resolved_config_file, commit_hash)
                 vocab_files["tokenizer_file"] = fast_tokenizer_file
             # This block looks for any extra chat template files
@@ -1819,52 +1781,25 @@ class PreTrainedTokenizerBase(PushToHubMixin):
                 if isinstance(init_kwargs["auto_map"], (tuple, list)):
                     init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
-        # Preserve extra_special_tokens from tokenizer_config.json before updating with kwargs
-        # extra_special_tokens should be a list (user-defined extra tokens)
-        extra_special_tokens_from_config = init_kwargs.get("extra_special_tokens")
-        if isinstance(extra_special_tokens_from_config, (list, tuple)):
-            extra_special_tokens_from_config = list(extra_special_tokens_from_config)
-        else:
-            extra_special_tokens_from_config = None
         # Update with newly provided kwargs
         init_kwargs.update(kwargs)
-        # V5: Backward compatibility - convert old "additional_special_tokens" to "extra_special_tokens"
-        if "additional_special_tokens" in init_kwargs and "extra_special_tokens" not in init_kwargs:
-            init_kwargs["extra_special_tokens"] = init_kwargs.pop("additional_special_tokens")
-        # Restore extra_special_tokens from config if kwargs overwrote it or it's missing
-        elif extra_special_tokens_from_config is not None:
-            if "extra_special_tokens" not in init_kwargs or not isinstance(
-                init_kwargs.get("extra_special_tokens"), (list, tuple)
-            ):
-                init_kwargs["extra_special_tokens"] = extra_special_tokens_from_config
-        # V5: Get model-specific special tokens from config (saved as individual keys in special_tokens_map)
-        # These need to be grouped as extra_special_tokens dict so __init__ can save them to attributes
-        if "extra_special_tokens" not in init_kwargs or not isinstance(init_kwargs.get("extra_special_tokens"), dict):
-            default_attrs = set(cls.SPECIAL_TOKENS_ATTRIBUTES)
-            model_specific_tokens = {
-                key: init_kwargs.pop(key)
-                for key in list(init_kwargs.keys())
-                if key not in default_attrs
-                and key.endswith("_token")
-                and isinstance(init_kwargs[key], (str, AddedToken))
-            }
-            if model_specific_tokens:
-                # If extra_special_tokens is already a list, we need to preserve it
-                if "extra_special_tokens" in init_kwargs and isinstance(
-                    init_kwargs["extra_special_tokens"], (list, tuple)
-                ):
-                    # Keep the list as is, but also add model-specific tokens as a separate dict
-                    # Convert to model_specific_special_tokens so __init__ handles it
-                    init_kwargs["model_specific_special_tokens"] = model_specific_tokens
-                else:
-                    init_kwargs["extra_special_tokens"] = model_specific_tokens
-        elif isinstance(init_kwargs.get("extra_special_tokens"), dict):
-            # If extra_special_tokens is already a dict, convert it to model_specific_special_tokens
-            # so __init__ handles it properly
-            init_kwargs["model_specific_special_tokens"] = init_kwargs.pop("extra_special_tokens")
+        # V5: Convert deprecated additional_special_tokens to extra_special_tokens
+        if "additional_special_tokens" in init_kwargs:
+            init_kwargs.setdefault("extra_special_tokens", init_kwargs.pop("additional_special_tokens"))
+        # V5: Collect model-specific tokens (custom *_token keys not in standard attributes)
+        default_attrs = set(cls.SPECIAL_TOKENS_ATTRIBUTES)
+        model_specific_tokens = {
+            key: init_kwargs.pop(key)
+            for key in list(init_kwargs.keys())
+            if key not in default_attrs and key.endswith("_token") and isinstance(init_kwargs[key], (str, AddedToken))
+        }
+        # If extra_special_tokens is a dict, merge it into model_specific_tokens
+        if isinstance(init_kwargs.get("extra_special_tokens"), dict):
+            model_specific_tokens.update(init_kwargs.pop("extra_special_tokens"))
+        if model_specific_tokens:
+            init_kwargs["model_specific_special_tokens"] = model_specific_tokens
         # Merge resolved_vocab_files arguments in init_kwargs.
         added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
@@ -1893,82 +1828,45 @@ class PreTrainedTokenizerBase(PushToHubMixin):
                         f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
                     )
         else:
-            # begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
+            # Legacy: read special_tokens_map.json and merge into init_kwargs
             if special_tokens_map_file is not None:
-                with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
-                    special_tokens_map = json.load(special_tokens_map_handle)
-                    # Preserve extra_special_tokens from tokenizer_config.json before processing special_tokens_map
-                    extra_special_tokens_before_map = init_kwargs.get("extra_special_tokens")
-                    if isinstance(extra_special_tokens_before_map, (list, tuple)):
-                        extra_special_tokens_before_map = list(extra_special_tokens_before_map)
-                    else:
-                        extra_special_tokens_before_map = None
-                    for key, value in special_tokens_map.items():
-                        if key in kwargs and kwargs[key]:
-                            # This value has already been redefined by the kwargs
-                            # We keep this new value and ignore the one stored in the special_tokens_map_file
-                            continue
-                        # V5: Convert dict-format tokens to AddedToken
-                        if isinstance(value, dict):
-                            value["special"] = True
-                            value = AddedToken(**value)
-                        elif key == "extra_special_tokens":
-                            # Handle extra_special_tokens from special_tokens_map.json
-                            if isinstance(value, dict):
-                                # Dict format for model-specific tokens - keep as is
-                                init_kwargs[key] = value
-                                continue
-                            elif isinstance(value, list):
-                                # List format - merge with existing if present
-                                existing = init_kwargs.pop("extra_special_tokens", []) or []
-                                if not isinstance(existing, (list, tuple)):
-                                    existing = []
-                                for token in value:
-                                    if isinstance(token, dict):
-                                        token = AddedToken(**token, special=True)
-                                    if token not in existing:
-                                        existing.append(token)
-                                init_kwargs[key] = existing
-                                continue
-                        init_kwargs[key] = value
-                    # Restore extra_special_tokens from tokenizer_config.json if not in special_tokens_map.json
-                    if (
-                        "extra_special_tokens" not in special_tokens_map
-                        and extra_special_tokens_before_map is not None
-                    ):
-                        if "extra_special_tokens" not in init_kwargs or not isinstance(
-                            init_kwargs.get("extra_special_tokens"), (list, tuple)
-                        ):
-                            init_kwargs["extra_special_tokens"] = extra_special_tokens_before_map
-                    # Convert extra_special_tokens dict to model_specific_special_tokens if it's a dict
-                    if isinstance(init_kwargs.get("extra_special_tokens"), dict):
-                        init_kwargs["model_specific_special_tokens"] = init_kwargs.pop("extra_special_tokens")
+                with open(special_tokens_map_file, encoding="utf-8") as f:
+                    special_tokens_map = json.load(f)
+                for key, value in special_tokens_map.items():
+                    if key in kwargs and kwargs[key]:
+                        continue  # User-provided kwargs take precedence
+                    if isinstance(value, dict) and key != "extra_special_tokens":
+                        value = AddedToken(**value, special=True)
+                    elif key == "extra_special_tokens" and isinstance(value, list):
+                        # Merge list tokens, converting dicts to AddedToken
+                        existing = list(init_kwargs.get("extra_special_tokens") or [])
+                        for tok in value:
+                            tok = AddedToken(**tok, special=True) if isinstance(tok, dict) else tok
+                            if tok not in existing:
+                                existing.append(tok)
+                        value = existing
+                    init_kwargs[key] = value
+                # Convert dict extra_special_tokens to model_specific_special_tokens
+                if isinstance(init_kwargs.get("extra_special_tokens"), dict):
+                    init_kwargs.setdefault("model_specific_special_tokens", {}).update(
+                        init_kwargs.pop("extra_special_tokens")
+                    )
             # slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
             # this is for legacy purpose. We don't add the tokens after init for efficiency.
             if added_tokens_file is not None:
-                special_tokens = []
                 # V5: Check both named and extra special tokens
-                for key in cls.SPECIAL_TOKENS_ATTRIBUTES:
-                    if key in init_kwargs and init_kwargs[key] is not None:
-                        special_tokens.append(str(init_kwargs[key]))
+                special_tokens = {str(init_kwargs[k]) for k in cls.SPECIAL_TOKENS_ATTRIBUTES if init_kwargs.get(k)}
+                special_tokens.update(str(t) for t in (init_kwargs.get("extra_special_tokens") or []))
-                # Handle extra_special_tokens
-                if "extra_special_tokens" in init_kwargs and init_kwargs["extra_special_tokens"] is not None:
-                    special_tokens += [str(token) for token in init_kwargs["extra_special_tokens"]]
-                with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
-                    added_tok_encoder = json.load(added_tokens_handle)
+                with open(added_tokens_file, encoding="utf-8") as f:
+                    added_tok_encoder = json.load(f)
                 for str_token, index in added_tok_encoder.items():
-                    # if index not in added_tokens_decoder and str_token not in added_tokens_map:
-                    special = str_token in special_tokens
+                    is_special = str_token in special_tokens
                     added_tokens_decoder[index] = AddedToken(
-                        str_token, rstrip=False, lstrip=False, normalized=not special, special=special
+                        str_token, rstrip=False, lstrip=False, normalized=not is_special, special=is_special
                     )
-                    added_tokens_map[str(token)] = added_tokens_decoder[index]
+                    added_tokens_map[str_token] = added_tokens_decoder[index]
             # allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
             # if `tokenizer_config.json` is `None`
@@ -3450,7 +3348,8 @@ def find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
             ):
                 return candidate
         except Exception:
-            pass
+            # TODO: tighten to OSError / ProxyError
+            continue
     subfolder = kwargs.get("subfolder", "")
     local_files_only = kwargs.get("local_files_only", False)
@@ -3480,8 +3379,9 @@ def find_sentencepiece_model_file(pretrained_model_name_or_path, **kwargs):
             for entry in entries:
                 if entry.path.endswith(".model"):
                     return entry.path if not subfolder else entry.path.removeprefix(f"{subfolder}/")
-        except Exception:
-            pass
+        except Exception as e:
+            # TODO: tighten exception class
+            logger.debug(f"Could not list Hub repository files: {e}")
     return None

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl

transformers 5.0.0rc3__py3-none-any.whl → 5.1.0__py3-none-any.whl