transformers 4.57.2__py3-none-any.whl → 4.57.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- transformers/__init__.py +1 -1
- transformers/tokenization_utils_base.py +83 -32
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/METADATA +1 -1
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/RECORD +8 -8
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/WHEEL +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/entry_points.txt +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/licenses/LICENSE +0 -0
- {transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/top_level.txt +0 -0
transformers/__init__.py
CHANGED
@@ -18,7 +18,7 @@
 # to defer the actual importing for when the objects are requested. This way `import transformers` provides the names
 # in the namespace without actually importing anything (and especially none of the backends).

-__version__ = "4.57.2"
+__version__ = "4.57.3"

 from pathlib import Path
 from typing import TYPE_CHECKING
transformers/tokenization_utils_base.py
CHANGED
@@ -2046,12 +2046,13 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
             template = template.removesuffix(".jinja")
             vocab_files[f"chat_template_{template}"] = f"{CHAT_TEMPLATE_DIR}/{template}.jinja"

+        remote_files = []
         if not is_local and not local_files_only:
             try:
                 remote_files = list_repo_files(pretrained_model_name_or_path)
             except Exception:
                 remote_files = []
-        else:
+        elif pretrained_model_name_or_path and os.path.isdir(pretrained_model_name_or_path):
             remote_files = os.listdir(pretrained_model_name_or_path)

         if "tokenizer_file" in vocab_files and not re.search(vocab_files["tokenizer_file"], "".join(remote_files)):
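The hunk above makes the file listing defensive: `remote_files` now starts as an empty list, and `os.listdir` is only called when the identifier is an existing local directory, so other identifiers simply fall through with an empty listing. A minimal standalone sketch of the resulting behavior (the function name and parameters are illustrative, not part of the library API):

import os

from huggingface_hub import list_repo_files


def candidate_files(name_or_path: str, is_local: bool, local_files_only: bool) -> list:
    # Mirrors the 4.57.3 logic: default to an empty listing.
    remote_files = []
    if not is_local and not local_files_only:
        try:
            remote_files = list_repo_files(name_or_path)
        except Exception:
            remote_files = []
    elif name_or_path and os.path.isdir(name_or_path):
        # Only touch the filesystem when the identifier really is a directory.
        remote_files = os.listdir(name_or_path)
    return remote_files

In the library itself this logic runs inside `from_pretrained`, feeding the `tokenizer_file` existence check shown on the last context line of the hunk.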
@@ -2385,57 +2386,108 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
         except NotImplementedError:
             vocab_size = 0

+        # Optionally patches mistral tokenizers with wrong regex
         if (
             vocab_size > 100000
             and hasattr(tokenizer, "_tokenizer")
             and getattr(tokenizer._tokenizer, "pre_tokenizer", None) is not None
         ):
-
+            tokenizer = cls._patch_mistral_regex(
+                tokenizer,
+                pretrained_model_name_or_path,
+                token=token,
+                cache_dir=cache_dir,
+                local_files_only=local_files_only,
+                _commit_hash=_commit_hash,
+                _is_local=_is_local,
+                init_kwargs=init_kwargs,
+                fix_mistral_regex=kwargs.get("fix_mistral_regex"),
+            )

-
-                model = model_info(model_id)
-                if model.tags is not None:
-                    if re.search("base_model:.*mistralai", "".join(model.tags)):
-                        return True
-                return False
+        return tokenizer

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @classmethod
+    def _patch_mistral_regex(
+        cls,
+        tokenizer,
+        pretrained_model_name_or_path,
+        token=None,
+        cache_dir=None,
+        local_files_only=False,
+        _commit_hash=None,
+        _is_local=False,
+        init_kwargs=None,
+        fix_mistral_regex=None,
+    ):
+        """
+        Patches mistral related tokenizers with incorrect regex if detected
+        1) Local file with an associated config saved next to it
+            >> Model type one of the mistral models (on older versions)
+        2) Remote models on the hub from official mistral models
+            >> Tags including `base_model:.*mistralai`
+        """
+        from huggingface_hub import model_info

-
-
+        def is_base_mistral(model_id: str) -> bool:
+            model = model_info(model_id)
+            if model.tags is not None:
+                if re.search("base_model:.*mistralai", "".join(model.tags)):
+                    return True
+            return False
+
+        if _is_local or is_base_mistral(pretrained_model_name_or_path):
+            _config_file = cached_file(
+                pretrained_model_name_or_path,
+                "config.json",
+                cache_dir=cache_dir,
+                token=token,
+                local_files_only=local_files_only,
+                _raise_exceptions_for_missing_entries=False,
+                _raise_exceptions_for_connection_errors=False,
+                _commit_hash=_commit_hash,
+            )
+
+            # Detected using a (local) mistral tokenizer
+            mistral_config_detected = False
+            if _config_file is not None:
+                with open(_config_file, encoding="utf-8") as f:
+                    _config = json.load(f)
+                transformers_version = _config.get("transformers_version")
+                transformers_model_type = _config.get("model_type")
+
+                # Detect if we can skip the mistral fix by
+                # a) having a non-mistral tokenizer
+                # b) fixed version of transformers
+                if transformers_version and version.parse(transformers_version) <= version.parse("4.57.2"):
+                    if (
+                        _is_local
+                        and transformers_model_type is not None
+                        and transformers_model_type
+                        not in [
                             "mistral",
                             "mistral3",
-                            "
+                            "voxtral",
                             "ministral",
                             "pixtral",
-                        ]
-
+                        ]
+                    ):
+                        return tokenizer
+                elif transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
+                    return tokenizer

+                mistral_config_detected = True
+
+            if mistral_config_detected or (not _is_local and is_base_mistral(pretrained_model_name_or_path)):
                 # Expose the `fix_mistral_regex` flag on the tokenizer when provided, even if no correction is applied.
-                if "fix_mistral_regex" in init_kwargs:
+                if init_kwargs and "fix_mistral_regex" in init_kwargs:
                     setattr(tokenizer, "fix_mistral_regex", init_kwargs["fix_mistral_regex"])

-                fix_mistral_regex = kwargs.get("fix_mistral_regex")  # not init kwargs
                 # only warn if its not explicitly passed
                 if fix_mistral_regex is None and not getattr(tokenizer, "fix_mistral_regex", False):
                     setattr(tokenizer, "fix_mistral_regex", False)
                     logger.warning(
                         f"The tokenizer you are loading from '{pretrained_model_name_or_path}'"
-                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e.
+                        f" with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e."
                         " This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue."
                     )
                 elif fix_mistral_regex is True or getattr(tokenizer, "fix_mistral_regex", False):
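The warning added above asks users to opt in to the corrected regex explicitly. A minimal usage sketch (the model id is taken from the URL in the warning; passing `fix_mistral_regex=True` at load time is exactly what the warning recommends):

from transformers import AutoTokenizer

# Opt in to the corrected pre-tokenizer regex; the flag name and model id
# come from the warning emitted in the diff above.
tokenizer = AutoTokenizer.from_pretrained(
    "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
    fix_mistral_regex=True,
)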
@@ -2448,7 +2500,6 @@ class PreTrainedTokenizerBase(SpecialTokensMixin, PushToHubMixin):
                     ),
                     behavior="isolated",
                 )
-
         return tokenizer

     @staticmethod
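For reference, the skip logic in `_patch_mistral_regex` keys off the `transformers_version` and `model_type` recorded in `config.json`. Below is a standalone sketch of that gate; the helper name, its parameters, and the boolean return are illustrative only (the real method returns the tokenizer, patched or not):

from typing import Optional

from packaging import version

MISTRAL_MODEL_TYPES = ["mistral", "mistral3", "voxtral", "ministral", "pixtral"]


def should_consider_patch(
    transformers_version: Optional[str],
    model_type: Optional[str],
    is_local: bool,
) -> bool:
    # Configs saved by transformers >= 5.0.0 are left alone.
    if transformers_version and version.parse(transformers_version) >= version.parse("5.0.0"):
        return False
    # Local checkpoints saved by <= 4.57.2 with a non-mistral model_type are skipped.
    if (
        transformers_version
        and version.parse(transformers_version) <= version.parse("4.57.2")
        and is_local
        and model_type is not None
        and model_type not in MISTRAL_MODEL_TYPES
    ):
        return False
    return True

Anything that falls through is treated as a mistral-style checkpoint and proceeds to the warning or regex replacement shown in the hunks above.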
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: transformers
-Version: 4.57.2
+Version: 4.57.3
 Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
 Home-page: https://github.com/huggingface/transformers
 Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-transformers/__init__.py,sha256=
+transformers/__init__.py,sha256=frB0PthD8kwdTouNr0sYJJy4FAO0VxpwwAvgpMCme9Q,47000
 transformers/activations.py,sha256=PdWoGx5eDFNxJW8A7-wZ31IlVCAxhzfbHgNDCpjPQmQ,13109
 transformers/activations_tf.py,sha256=TGmah3loMs_pERwxpjWb5-AUeHLoBAyDxFYWVuLC7FU,4729
 transformers/audio_utils.py,sha256=wDhFAweo28mpXu2OQTdw80gU-jgFgSHKny7ujdDfqVg,54284
@@ -50,7 +50,7 @@ transformers/tf_utils.py,sha256=uiS6uSPmB_ZUaxbV-vMkGy1roDTtY3ujpIgkwuskGmc,1139
 transformers/time_series_utils.py,sha256=fhc___L7NHqLzQ2lvrojW0yGkXJUTVqHGEAt5VDRqNA,7493
 transformers/tokenization_mistral_common.py,sha256=U-f69MiL63qob6z6MjGPh0TaN3J26vqioOzvZNElZYk,91444
 transformers/tokenization_utils.py,sha256=38xCQUA32GXSJWglnuwS3RDKpzdbLXoncHt6UCoI74A,47780
-transformers/tokenization_utils_base.py,sha256=
+transformers/tokenization_utils_base.py,sha256=2dRqMBK69GNt1SEwQ6W5g9l1ysPNmdyM7gdLip_uK5E,217765
 transformers/tokenization_utils_fast.py,sha256=VY5FTuaFDpDovF0XguNYeN_aHam_35l64RP6f_dxoPM,41383
 transformers/trainer.py,sha256=w-rI-ii9pjMP1N7hMef-fZvcD6NdnelE_OUsuXs3m6s,279527
 transformers/trainer_callback.py,sha256=YkfU5q-2K7G2RcmdaDLnajZOdSXiaCNKSsmJAot8hN8,33631
@@ -2238,9 +2238,9 @@ transformers/utils/quantization_config.py,sha256=MK8CU9pBIqA8TXWMraDfrM3YndtyW39
 transformers/utils/sentencepiece_model_pb2.py,sha256=WcMZRm2-571XwxSfo-6FZih9fDy_Zl5mMwqrDrC1Dlg,50663
 transformers/utils/sentencepiece_model_pb2_new.py,sha256=ahaV--amhGIL3nXFCTHqezqxuGXm8SHr_C3Zvj7KbAY,6598
 transformers/utils/versions.py,sha256=C-Tqr4qGSHH64ygIBCSo8gA6azz7Dbzh8zdc_yjMkX8,4337
-transformers-4.57.2.dist-info/licenses/LICENSE,sha256=
-transformers-4.57.2.dist-info/METADATA,sha256=
-transformers-4.57.2.dist-info/WHEEL,sha256=
-transformers-4.57.2.dist-info/entry_points.txt,sha256=
-transformers-4.57.2.dist-info/top_level.txt,sha256=
-transformers-4.57.2.dist-info/RECORD,,
+transformers-4.57.3.dist-info/licenses/LICENSE,sha256=d_1HEN757DwPYiWADgI18VpCWr1KiwNVkSf814JhIEk,11418
+transformers-4.57.3.dist-info/METADATA,sha256=q3v1SoZQphG24EWvK2gZk_JhMC9-5NyohXcS3gc3xXc,43991
+transformers-4.57.3.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+transformers-4.57.3.dist-info/entry_points.txt,sha256=Zra3dVQyt6Q3fU_suoD3gF81JV3WeV8gH66vzoev408,144
+transformers-4.57.3.dist-info/top_level.txt,sha256=GLBaeTo_CSdhnHvbxQ0kzpEHdlLuA_33foIogaWxntI,13
+transformers-4.57.3.dist-info/RECORD,,
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/WHEEL
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/entry_points.txt
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/licenses/LICENSE
File without changes
{transformers-4.57.2.dist-info → transformers-4.57.3.dist-info}/top_level.txt
File without changes