transformers 4.57.2__py3-none-any.whl → 4.57.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +1 -1
- transformers/tokenization_utils_base.py +83 -32
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/METADATA +1 -1
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/RECORD +8 -8
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/WHEEL +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/entry_points.txt +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/licenses/LICENSE +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/top_level.txt +0 -0
transformers/__init__.py
CHANGED
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.57.2"
+__version__ = "4.57.3"

 from pathlib import Path
 from typing import TYPE_CHECKING
transformers/tokenization_utils_base.py
CHANGED
@@ -2046,12 +2046,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             template = template.removesuffix(".jinja")
             vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

+        remote_files = []
         if not is_local and not local_files_only:
             try:
                 remote_files = list_repo_files(pretrained_model_name_or_path)
             except Exception:
                 remote_files = []
-        else:
+        elif pretrained_model_name_or_path and os.path.isdir(pretrained_model_name_or_path):
             remote_files = os.listdir(pretrained_model_name_or_path)

         if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
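The hunk above makes the file listing defensive: `remote_files` now starts as an empty list, and `os.listdir` is only called when the identifier is an existing local directory, so other identifiers simply fall through with an empty listing. A minimal standalone sketch of the resulting behavior (the function name and parameters are illustrative, not part of the library API):

import os

from huggingface_hub import list_repo_files


def candidate_files(name_or_path: str, is_local: bool, local_files_only: bool) -> list:
    # Mirrors the 4.57.3 logic: default to an empty listing.
    remote_files = []
    if not is_local and not local_files_only:
        try:
            remote_files = list_repo_files(name_or_path)
        except Exception:
            remote_files = []
    elif name_or_path and os.path.isdir(name_or_path):
        # Only touch the filesystem when the identifier really is a directory.
        remote_files = os.listdir(name_or_path)
    return remote_files

In the library itself this logic runs inside `from_pretrained`, feeding the `tokenizer_file` existence check shown on the last context line of the hunk.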
@@ -2385,57 +2386,108 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         except NotImplementedError:
             vocab_size = 0

+        # Optionally patches mistral tokenizers with wrong regex
         if (
             vocab_size > 100000
             and hasattr(tokenizer, "_tokenizer")
             and getattr(tokenizer._tokenizer, "pre_tokenizer", None) is not None
         ):
-
+            tokenizer = cls._patch_mistral_regex(
+                tokenizer,
+                pretrained_model_name_or_path,
+                token=token,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+                _commit_hash=_commit_hash,
+                _is_local=_is_local,
+                init_kwargs=init_kwargs,
+                fix_mistral_regex=kwargs.get("fix_mistral_regex"),
+            )

-
-                model = model_info(model_id)
-                if model.tags is not None:
-                    if re.search("base_model:.*mistralai", "".join(model.tags)):
-                        return True
-                return False
+        return tokenizer

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @classmethod
+    def _patch_mistral_regex(
+        cls,
+        tokenizer,
+        pretrained_model_name_or_path,
+        token=None,
+        cache_dir=None,
+        local_files_only=False,
+        _commit_hash=None,
+        _is_local=False,
+        init_kwargs=None,
+        fix_mistral_regex=None,
+    ):
+        """
+        Patches mistral related tokenizers with incorrect regex if detected
+        1) Local file with an associated config saved next to it
+            >> Model type one of the mistral models (on older versions)
+        2) Remote models on the hub from official mistral models
+            >> Tags including `base_model:.*mistralai`
+        """
+        from huggingface_hub import model_info

-
-
+        def is_base_mistral(model_id: str) -> bool:
+            model = model_info(model_id)
+            if model.tags is not None:
+                if re.search("base_model:.*mistralai", "".join(model.tags)):
+                    return True
+            return False
+
+        if _is_local or is_base_mistral(pretrained_model_name_or_path):
+            _config_file = cached_file(
+                pretrained_model_name_or_path,
+                "config.json",
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+                _commit_hash=_commit_hash,
+            )
+
+            # Detected using a (local) mistral tokenizer
+            mistral_config_detected = False
+            if _config_file is not None:
+                with open(_config_file, encoding="utf-8") as f:
+                    _config = json.load(f)
+                transformers_version = _config.get("transformers_version")
+                transformers_model_type = _config.get("model_type")
+
+                # Detect if we can skip the mistral fix by
+                # a) having a non-mistral tokenizer
+                # b) fixed version of transformers
+                if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
+                    if (
+                        _is_local
+                        and transformers_model_type is not None
+                        and transformers_model_type
+                        not in [
                             "mistral",
                             "mistral3",
-                            "
+                            "voxtral",
                             "ministral",
                             "pixtral",
-                        ]
-
+                        ]
+                    ):
+                        return tokenizer
+                elif transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
+                    return tokenizer

+                mistral_config_detected = True
+
+            if mistral_config_detected or (not _is_local and is_base_mistral(pretrained_model_name_or_path)):
                 # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
-                if "fix_mistral_regex" in init_kwargs:
+                if init_kwargs and "fix_mistral_regex" in init_kwargs:
                     setattr(tokenizer, "fix_mistral_regex", init_kwargs["fix_mistral_regex"])

-                fix_mistral_regex = kwargs.get("fix_mistral_regex")  # not init kwargs
                 # only warn if its not explicitly passed
                 if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
                     setattr(tokenizer, "fix_mistral_regex", False)
                     logger.warning(
                         f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
-                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.
+                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e."
                         " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
                     )
                 elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
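The warning added above asks users to opt in to the corrected regex explicitly. A minimal usage sketch (the model id is taken from the URL in the warning; passing `fix_mistral_regex=True` at load time is exactly what the warning recommends):

from transformers import AutoTokenizer

# Opt in to the corrected pre-tokenizer regex; the flag name and model id
# come from the warning emitted in the diff above.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    fix_mistral_regex=True,
)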
@@ -2448,7 +2500,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                     ),
                     behavior="isolated",
                 )
-
         return tokenizer

     @staticmethod
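For reference, the skip logic in `_patch_mistral_regex` keys off the `transformers_version` and `model_type` recorded in `config.json`. Below is a standalone sketch of that gate; the helper name, its parameters, and the boolean return are illustrative only (the real method returns the tokenizer, patched or not):

from typing import Optional

from packaging import version

MISTRAL_MODEL_TYPES = ["mistral", "mistral3", "voxtral", "ministral", "pixtral"]


def should_consider_patch(
    transformers_version: Optional[str],
    model_type: Optional[str],
    is_local: bool,
) -> bool:
    # Configs saved by transformers >= 5.0.0 are left alone.
    if transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
        return False
    # Local checkpoints saved by <= 4.57.2 with a non-mistral model_type are skipped.
    if (
        transformers_version
        and version.parse(transformers_version) <= version.parse("4.57.2")
        and is_local
        and model_type is not None
        and model_type not in MISTRAL_MODEL_TYPES
    ):
        return False
    return True

Anything that falls through is treated as a mistral-style checkpoint and proceeds to the warning or regex replacement shown in the hunks above.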
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: transformers
-Version: 4.57.2
+Version: 4.57.3
 Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
 Home-page: https://github.com/huggingface/transformers
 Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-transformers/__init__.py,sha256=
+transformers/__init__.py,sha256=frB0PthD8kwdTouNr0sYJJy4FAO0VxpwwAvgpMCme9Q,47000
 transformers/activations.py,sha256=PdWoGx5eDFNxJW8A7-wZ31IlVCAxhzfbHgNDCpjPQmQ,13109
 transformers/activations_tf.py,sha256=TGmah3loMs_pERwxpjWb5-AUeHLoBAyDxFYWVuLC7FU,4729
 transformers/audio_utils.py,sha256=wDhFAweo28mpXu2OQTdw80gU-jgFgSHKny7ujdDfqVg,54284
@@ -50,7 +50,7 @@ transformers/tf_utils.py,sha256=uiS6uSPmB_ZUaxbV-vMkGy1roDTtY3ujpIgkwuskGmc,1139
 transformers/time_series_utils.py,sha256=fhc___L7NHqLzQ2lvrojW0yGkXJUTVqHGEAt5VDRqNA,7493
 transformers/tokenization_mistral_common.py,sha256=U-f69MiL63qob6z6MjGPh0TaN3J26vqioOzvZNElZYk,91444
 transformers/tokenization_utils.py,sha256=38xCQUA32GXSJWglnuwS3RDKpzdbLXoncHt6UCoI74A,47780
-transformers/tokenization_utils_base.py,sha256=
+transformers/tokenization_utils_base.py,sha256=2dRqMBK69GNt1SEwQ6W5g9l1ysPNmdyM7gdLip_uK5E,217765
 transformers/tokenization_utils_fast.py,sha256=VY5FTuaFDpDovF0XguNYeN_aHam_35l64RP6f_dxoPM,41383
 transformers/trainer.py,sha256=w-rI-ii9pjMP1N7hMef-fZvcD6NdnelE_OUsuXs3m6s,279527
 transformers/trainer_callback.py,sha256=YkfU5q-2K7G2RcmdaDLnajZOdSXiaCNKSsmJAot8hN8,33631
@@ -2238,9 +2238,9 @@ transformers/utils/quantization_config.py,sha256=MK8CU9pBIqA8TXWMraDfrM3YndtyW39
 transformers/utils/sentencepiece_model_pb2.py,sha256=WcMZRm2-571XwxSfo-6FZih9fDy_Zl5mMwqrDrC1Dlg,50663
 transformers/utils/sentencepiece_model_pb2_new.py,sha256=ahaV--amhGIL3nXFCTHqezqxuGXm8SHr_C3Zvj7KbAY,6598
 transformers/utils/versions.py,sha256=C-Tqr4qGSHH64ygIBCSo8gA6azz7Dbzh8zdc_yjMkX8,4337
-transformers-4.57.2.dist-info/licenses/LICENSE,sha256=
-transformers-4.57.2.dist-info/METADATA,sha256=
-transformers-4.57.2.dist-info/WHEEL,sha256=
-transformers-4.57.2.dist-info/entry_points.txt,sha256=
-transformers-4.57.2.dist-info/top_level.txt,sha256=
-transformers-4.57.2.dist-info/RECORD,,
+transformers-4.57.3.dist-info/licenses/LICENSE,sha256=d_1HEN757DwPYiWADgI18VpCWr1KiwNVkSf814JhIEk,11418
+transformers-4.57.3.dist-info/METADATA,sha256=q3v1SoZQphG24EWvK2gZk_JhMC9-5NyohXcS3gc3xXc,43991
+transformers-4.57.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+transformers-4.57.3.dist-info/entry_points.txt,sha256=Zra3dVQyt6Q3fU_suoD3gF81JV3WeV8gH66vzoev408,144
+transformers-4.57.3.dist-info/top_level.txt,sha256=GLBaeTo_CSdhnHvbxQ0kzpEHdlLuA_33foIogaWxntI,13
+transformers-4.57.3.dist-info/RECORD,,
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/WHEEL
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/entry_points.txt
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/licenses/LICENSE
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/top_level.txt
File without changes