huggingface-hub 0.12.1__py3-none-any.whl → 0.13.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +165 -127
- huggingface_hub/_commit_api.py +25 -51
- huggingface_hub/_login.py +4 -13
- huggingface_hub/_snapshot_download.py +45 -23
- huggingface_hub/_space_api.py +7 -0
- huggingface_hub/commands/delete_cache.py +13 -39
- huggingface_hub/commands/env.py +1 -3
- huggingface_hub/commands/huggingface_cli.py +1 -3
- huggingface_hub/commands/lfs.py +4 -8
- huggingface_hub/commands/scan_cache.py +5 -16
- huggingface_hub/commands/user.py +27 -45
- huggingface_hub/community.py +4 -4
- huggingface_hub/constants.py +22 -19
- huggingface_hub/fastai_utils.py +14 -23
- huggingface_hub/file_download.py +166 -108
- huggingface_hub/hf_api.py +500 -255
- huggingface_hub/hub_mixin.py +181 -176
- huggingface_hub/inference_api.py +4 -10
- huggingface_hub/keras_mixin.py +39 -71
- huggingface_hub/lfs.py +8 -24
- huggingface_hub/repocard.py +33 -48
- huggingface_hub/repocard_data.py +141 -30
- huggingface_hub/repository.py +41 -112
- huggingface_hub/templates/modelcard_template.md +39 -34
- huggingface_hub/utils/__init__.py +1 -0
- huggingface_hub/utils/_cache_assets.py +1 -4
- huggingface_hub/utils/_cache_manager.py +17 -39
- huggingface_hub/utils/_deprecation.py +8 -12
- huggingface_hub/utils/_errors.py +10 -57
- huggingface_hub/utils/_fixes.py +2 -6
- huggingface_hub/utils/_git_credential.py +5 -16
- huggingface_hub/utils/_headers.py +22 -11
- huggingface_hub/utils/_http.py +1 -4
- huggingface_hub/utils/_paths.py +5 -12
- huggingface_hub/utils/_runtime.py +2 -1
- huggingface_hub/utils/_telemetry.py +120 -0
- huggingface_hub/utils/_validators.py +5 -13
- huggingface_hub/utils/endpoint_helpers.py +1 -3
- huggingface_hub/utils/logging.py +10 -8
- {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/METADATA +7 -14
- huggingface_hub-0.13.0rc0.dist-info/RECORD +56 -0
- huggingface_hub/py.typed +0 -0
- huggingface_hub-0.12.1.dist-info/RECORD +0 -56
- {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/top_level.txt +0 -0
huggingface_hub/fastai_utils.py
CHANGED
````diff
@@ -144,16 +144,11 @@ def _check_fastai_fastcore_pyproject_versions(
     # If the package is specified but not the version (e.g. "fastai" instead of "fastai=2.4"), the default versions are the highest.
     fastai_packages = [pck for pck in package_versions if pck.startswith("fastai")]
     if len(fastai_packages) == 0:
-        logger.warning(
-            "The repository does not have a fastai version specified in the"
-            " `pyproject.toml`."
-        )
+        logger.warning("The repository does not have a fastai version specified in the `pyproject.toml`.")
     # fastai_version is an empty string if not specified
     else:
         fastai_version = str(fastai_packages[0]).partition("=")[2]
-        if fastai_version != "" and version.Version(fastai_version) < version.Version(
-            fastai_min_version
-        ):
+        if fastai_version != "" and version.Version(fastai_version) < version.Version(fastai_min_version):
             raise ImportError(
                 "`from_pretrained_fastai` requires"
                 f" fastai>={fastai_min_version} version but the model to load uses"
@@ -162,16 +157,11 @@ def _check_fastai_fastcore_pyproject_versions(
 
     fastcore_packages = [pck for pck in package_versions if pck.startswith("fastcore")]
     if len(fastcore_packages) == 0:
-        logger.warning(
-            "The repository does not have a fastcore version specified in the"
-            " `pyproject.toml`."
-        )
+        logger.warning("The repository does not have a fastcore version specified in the `pyproject.toml`.")
     # fastcore_version is an empty string if not specified
     else:
         fastcore_version = str(fastcore_packages[0]).partition("=")[2]
-        if fastcore_version != "" and version.Version(
-            fastcore_version
-        ) < version.Version(fastcore_min_version):
+        if fastcore_version != "" and version.Version(fastcore_version) < version.Version(fastcore_min_version):
             raise ImportError(
                 "`from_pretrained_fastai` requires"
                 f" fastcore>={fastcore_min_version} version, but you are using fastcore"
@@ -281,9 +271,7 @@ def _save_pretrained_fastai(
     # if the user provides config then we update it with the fastai and fastcore versions in CONFIG_TEMPLATE.
     if config is not None:
         if not isinstance(config, dict):
-            raise RuntimeError(
-                f"Provided config should be a dict. Got: '{type(config)}'"
-            )
+            raise RuntimeError(f"Provided config should be a dict. Got: '{type(config)}'")
         path = os.path.join(save_directory, CONFIG_NAME)
         with open(path, "w") as f:
             json.dump(config, f)
@@ -365,13 +353,15 @@ def push_to_hub_fastai(
     create_pr: Optional[bool] = None,
     allow_patterns: Optional[Union[List[str], str]] = None,
     ignore_patterns: Optional[Union[List[str], str]] = None,
+    delete_patterns: Optional[Union[List[str], str]] = None,
     api_endpoint: Optional[str] = None,
 ):
     """
     Upload learner checkpoint files to the Hub.
 
-    Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be
-    pushed to the hub. See [`upload_folder`] reference for more details.
+    Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
+    `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
+    details.
 
     Args:
         learner (`Learner`):
@@ -399,6 +389,9 @@ def push_to_hub_fastai(
             If provided, only files matching at least one pattern are pushed.
         ignore_patterns (`List[str]` or `str`, *optional*):
             If provided, files matching any of the patterns are not pushed.
+        delete_patterns (`List[str]` or `str`, *optional*):
+            If provided, remote files matching any of the patterns will be deleted from the repo.
+
     Returns:
         The url of the commit of your model in the given repository.
 
@@ -413,9 +406,7 @@ def push_to_hub_fastai(
     """
     _check_fastai_fastcore_versions()
     api = HfApi(endpoint=api_endpoint)
-    api.create_repo(
-        repo_id=repo_id, repo_type="model", token=token, private=private, exist_ok=True
-    )
+    repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id
 
     # Push the files to the repo in a single commit
     with SoftTemporaryDirectory() as tmp:
@@ -423,7 +414,6 @@ def push_to_hub_fastai(
         _save_pretrained_fastai(learner, saved_path, config=config)
         return api.upload_folder(
             repo_id=repo_id,
-            repo_type="model",
             token=token,
             folder_path=saved_path,
             commit_message=commit_message,
@@ -431,4 +421,5 @@ def push_to_hub_fastai(
             create_pr=create_pr,
             allow_patterns=allow_patterns,
             ignore_patterns=ignore_patterns,
+            delete_patterns=delete_patterns,
         )
````
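Taken together, the fastai changes are mostly long-line reformatting plus one functional addition: `push_to_hub_fastai` gains a `delete_patterns` argument that is forwarded to `upload_folder`, and it now reuses the normalized `repo_id` returned by `create_repo`. A minimal usage sketch, assuming an already-trained fastai `Learner` (the repo id and patterns below are hypothetical, not from the diff):

```python
from huggingface_hub import push_to_hub_fastai


def push(learner) -> str:
    """Push a trained fastai Learner to the Hub; returns the commit URL."""
    return push_to_hub_fastai(
        learner=learner,
        repo_id="user/fastai-demo",   # hypothetical repo id
        commit_message="Update model",
        ignore_patterns=["*.tmp"],    # skip local scratch files
        delete_patterns=["*.onnx"],   # new in 0.13.0rc0: delete stale remote files in the same commit
    )
```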
huggingface_hub/file_download.py
CHANGED
````diff
@@ -19,9 +19,10 @@ from urllib.parse import quote, urlparse
 
 import requests
 from filelock import FileLock
-from huggingface_hub import constants
 from requests.exceptions import ConnectTimeout, ProxyError
 
+from huggingface_hub import constants
+
 from . import __version__  # noqa: F401 # for backward compatibility
 from .constants import (
     DEFAULT_REVISION,
@@ -36,38 +37,41 @@ from .constants import (
     REPO_TYPES,
     REPO_TYPES_URL_PREFIXES,
 )
-from .utils import get_fastai_version  # noqa: F401 # for backward compatibility
-from .utils import get_fastcore_version  # noqa: F401 # for backward compatibility
-from .utils import get_graphviz_version  # noqa: F401 # for backward compatibility
-from .utils import get_jinja_version  # noqa: F401 # for backward compatibility
-from .utils import get_pydot_version  # noqa: F401 # for backward compatibility
-from .utils import get_tf_version  # noqa: F401 # for backward compatibility
-from .utils import get_torch_version  # noqa: F401 # for backward compatibility
-from .utils import is_fastai_available  # noqa: F401 # for backward compatibility
-from .utils import is_fastcore_available  # noqa: F401 # for backward compatibility
-from .utils import is_graphviz_available  # noqa: F401 # for backward compatibility
-from .utils import is_jinja_available  # noqa: F401 # for backward compatibility
-from .utils import is_pydot_available  # noqa: F401 # for backward compatibility
-from .utils import is_tf_available  # noqa: F401 # for backward compatibility
-from .utils import is_torch_available  # noqa: F401 # for backward compatibility
 from .utils import (
     EntryNotFoundError,
     LocalEntryNotFoundError,
     SoftTemporaryDirectory,
     build_hf_headers,
+    get_fastai_version,  # noqa: F401 # for backward compatibility
+    get_fastcore_version,  # noqa: F401 # for backward compatibility
+    get_graphviz_version,  # noqa: F401 # for backward compatibility
+    get_jinja_version,  # noqa: F401 # for backward compatibility
+    get_pydot_version,  # noqa: F401 # for backward compatibility
+    get_tf_version,  # noqa: F401 # for backward compatibility
+    get_torch_version,  # noqa: F401 # for backward compatibility
     hf_raise_for_status,
     http_backoff,
+    is_fastai_available,  # noqa: F401 # for backward compatibility
+    is_fastcore_available,  # noqa: F401 # for backward compatibility
+    is_graphviz_available,  # noqa: F401 # for backward compatibility
+    is_jinja_available,  # noqa: F401 # for backward compatibility
+    is_pydot_available,  # noqa: F401 # for backward compatibility
+    is_tf_available,  # noqa: F401 # for backward compatibility
+    is_torch_available,  # noqa: F401 # for backward compatibility
     logging,
     tqdm,
     validate_hf_hub_args,
 )
 from .utils._headers import _http_user_agent
 from .utils._runtime import _PY_VERSION  # noqa: F401 # for backward compatibility
-from .utils._typing import HTTP_METHOD_T
+from .utils._typing import HTTP_METHOD_T, Literal
 
 
 logger = logging.get_logger(__name__)
 
+# Regex to get filename from a "Content-Disposition" header for CDN-served files
+HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
+
 
 _are_symlinks_supported_in_dir: Dict[str, bool] = {}
 
@@ -185,8 +189,8 @@ def hf_hub_url(
         subfolder (`str`, *optional*):
             An optional value corresponding to a folder inside the repo.
         repo_type (`str`, *optional*):
-            Set to `"dataset"` or `"space"` if uploading to a dataset or space,
-            `None` or `"model"` if uploading to a model. Default is `None`.
+            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
+            `None` or `"model"` if downloading from a model. Default is `None`.
         revision (`str`, *optional*):
             An optional Git revision id which can be a branch name, a tag, or a
             commit hash.
@@ -347,9 +351,7 @@ def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
     HF_HUB_OFFLINE is True."""
     if constants.HF_HUB_OFFLINE:
         raise OfflineModeIsEnabled(
-            "Offline mode is enabled."
-            if msg is None
-            else "Offline mode is enabled. " + str(msg)
+            "Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
         )
 
 
@@ -515,16 +517,22 @@ def http_get(
 
     displayed_name = url
     content_disposition = r.headers.get("Content-Disposition")
-    if content_disposition is not None and "filename=" in content_disposition:
-        # Means file is on CDN
-        displayed_name = content_disposition.split("filename=")[-1]
+    if content_disposition is not None:
+        match = HEADER_FILENAME_PATTERN.search(content_disposition)
+        if match is not None:
+            # Means file is on CDN
+            displayed_name = match.groupdict()["filename"]
+
+    # Truncate filename if too long to display
+    if len(displayed_name) > 22:
+        displayed_name = f"(…){displayed_name[-20:]}"
 
     progress = tqdm(
         unit="B",
         unit_scale=True,
         total=total,
         initial=resume_size,
-        desc=f"Downloading
+        desc=f"Downloading {displayed_name}",
         disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
     )
     for chunk in r.iter_content(chunk_size=10 * 1024 * 1024):
````
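The `HEADER_FILENAME_PATTERN` introduced above drives the nicer progress-bar label in `http_get`: instead of splitting the `Content-Disposition` header on `"filename="`, a named group extracts the quoted filename. A quick self-contained check (the regex is verbatim from the diff; the sample header value is a hypothetical CDN response):

```python
import re

# Verbatim from the new file_download.py
HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')

# Hypothetical CDN response header
header = 'attachment; filename="pytorch_model.bin"; filename*=UTF-8\'\'pytorch_model.bin'

match = HEADER_FILENAME_PATTERN.search(header)
assert match is not None
assert match.groupdict()["filename"] == "pytorch_model.bin"
# Names longer than 22 characters are then shortened to f"(…){name[-20:]}" for display.
```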
````diff
@@ -627,8 +635,10 @@ def cached_download(
     """
     if not legacy_cache_layout:
         warnings.warn(
-            "`cached_download` is the legacy way to download files from the HF hub,"
-            " please consider upgrading to `hf_hub_download`",
+            (
+                "`cached_download` is the legacy way to download files from the HF hub,"
+                " please consider upgrading to `hf_hub_download`"
+            ),
             FutureWarning,
         )
 
@@ -666,8 +676,7 @@ def cached_download(
     # If we don't have any of those, raise an error.
     if etag is None:
         raise OSError(
-            "Distant resource does not have an ETag, we won't be able to"
-            " reliably ensure reproducibility."
+            "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
         )
     # In case of a redirect, save an extra redirect on the request.get call,
     # and ensure we download the exact atomic version even if it changed
@@ -688,9 +697,7 @@ def cached_download(
         # etag is None
         pass
 
-    filename = (
-        force_filename if force_filename is not None else url_to_filename(url, etag)
-    )
+    filename = force_filename if force_filename is not None else url_to_filename(url, etag)
 
     # get cache path to put the file
     cache_path = os.path.join(cache_dir, filename)
@@ -703,16 +710,10 @@ def cached_download(
     else:
         matching_files = [
             file
-            for file in fnmatch.filter(
-                os.listdir(cache_dir), filename.split(".")[0] + ".*"
-            )
+            for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
             if not file.endswith(".json") and not file.endswith(".lock")
         ]
-        if (
-            len(matching_files) > 0
-            and not force_download
-            and force_filename is None
-        ):
+        if len(matching_files) > 0 and not force_download and force_filename is None:
             return os.path.join(cache_dir, matching_files[-1])
         else:
             # If files cannot be found and local_files_only=True,
````
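These `cached_download` hunks are formatting-only, but the `FutureWarning` they reformat is worth acting on: new code should call `hf_hub_download` instead of `cached_download`. A sketch of the migration (the repo and filename are just examples):

```python
from huggingface_hub import hf_hub_download

# Old: cached_download(hf_hub_url("gpt2", "config.json"))  -> now emits a FutureWarning
# New: resolves through the commit-hash-based cache layout
config_path = hf_hub_download(repo_id="gpt2", filename="config.json")
print(config_path)
```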
````diff
@@ -844,11 +845,19 @@ def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> Non
     except OSError:
         pass
 
-
-
-
+    try:
+        _support_symlinks = are_symlinks_supported(
+            os.path.dirname(os.path.commonpath([os.path.realpath(src), os.path.realpath(dst)]))
+        )
+    except PermissionError:
+        # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
+        # by the user via `local_dir`. Let's test symlink support there)
+        _support_symlinks = are_symlinks_supported(os.path.dirname(dst))
+
+    if _support_symlinks:
+        logger.info(f"Creating pointer from {src} to {dst}")
         try:
-            os.symlink(relative_src, dst)
+            os.symlink(src, dst)
         except FileExistsError:
             if os.path.islink(dst) and os.path.realpath(dst) == os.path.realpath(src):
                 # `dst` already exists and is a symlink to the `src` blob. It is most
@@ -861,14 +870,14 @@ def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> Non
                 # blob file. Raise exception.
                 raise
     elif new_blob:
+        logger.info(f"Symlink not supported. Moving file from {src} to {dst}")
         os.replace(src, dst)
     else:
+        logger.info(f"Symlink not supported. Copying file from {src} to {dst}")
         shutil.copyfile(src, dst)
 
 
-def _cache_commit_hash_for_specific_revision(
-    storage_folder: str, revision: str, commit_hash: str
-) -> None:
+def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
     """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
 
     Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
@@ -906,6 +915,8 @@ def hf_hub_download(
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
     user_agent: Union[Dict, str, None] = None,
     force_download: bool = False,
     force_filename: Optional[str] = None,
@@ -928,6 +939,21 @@ def hf_hub_download(
     that have been resolved at that particular commit. Each filename is a symlink to the blob
     at that particular commit.
 
+    If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure
+    how you want to move those files:
+      - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob
+        files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal
+        is to be able to manually edit and save small files without corrupting the cache while saving disk space for
+        binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD`
+        environment variable.
+      - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`.
+        This is optimal in term of disk usage but files must not be manually edited.
+      - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the
+        local dir. This means disk usage is not optimized.
+      - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the
+        files are downloaded and directly placed under `local_dir`. This means if you need to download them again later,
+        they will be re-downloaded entirely.
+
     ```
     [ 96] .
     └── [ 160] models--julien-c--EsperBERTo-small
@@ -954,8 +980,8 @@ def hf_hub_download(
         subfolder (`str`, *optional*):
             An optional value corresponding to a folder inside the model repo.
         repo_type (`str`, *optional*):
-            Set to `"dataset"` or `"space"` if uploading to a dataset or space,
-            `None` or `"model"` if uploading to a model. Default is `None`.
+            Set to `"dataset"` or `"space"` if downloading from a dataset or space,
+            `None` or `"model"` if downloading from a model. Default is `None`.
         revision (`str`, *optional*):
             An optional Git revision id which can be a branch name, a tag, or a
             commit hash.
@@ -965,6 +991,14 @@ def hf_hub_download(
             The version of the library.
         cache_dir (`str`, `Path`, *optional*):
             Path to the folder where cached files are stored.
+        local_dir (`str` or `Path`, *optional*):
+            If provided, the downloaded file will be placed under this directory, either as a symlink (default) or
+            a regular file (see description for more details).
+        local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`):
+            To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either
+            duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be
+            created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if
+            already exists) or downloaded from the Hub and not cached. See description for more details.
         user_agent (`dict`, `str`, *optional*):
             The user-agent info in the form of a dictionary or a string.
         force_download (`bool`, *optional*, defaults to `False`):
@@ -1019,8 +1053,10 @@ def hf_hub_download(
     """
     if force_filename is not None:
         warnings.warn(
-            "The `force_filename` parameter is deprecated as a new caching system, "
-            "which keeps the filenames as they are on the Hub, is now in place.",
+            (
+                "The `force_filename` parameter is deprecated as a new caching system, "
+                "which keeps the filenames as they are on the Hub, is now in place."
+            ),
             FutureWarning,
         )
         legacy_cache_layout = True
@@ -1056,6 +1092,8 @@ def hf_hub_download(
         revision = DEFAULT_REVISION
     if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
+    if isinstance(local_dir, Path):
+        local_dir = str(local_dir)
 
     if subfolder == "":
         subfolder = None
@@ -1066,14 +1104,9 @@ def hf_hub_download(
     if repo_type is None:
         repo_type = "model"
     if repo_type not in REPO_TYPES:
-        raise ValueError(
-            f"Invalid repo type: {repo_type}. Accepted repo types are:"
-            f" {str(REPO_TYPES)}"
-        )
+        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
 
-    storage_folder = os.path.join(
-        cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)
-    )
+    storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
     os.makedirs(storage_folder, exist_ok=True)
 
     # cross platform transcription of filename, to be used as a local file path.
@@ -1082,10 +1115,10 @@ def hf_hub_download(
     # if user provides a commit_hash and they already have the file on disk,
     # shortcut everything.
     if REGEX_COMMIT_HASH.match(revision):
-        pointer_path = os.path.join(
-            storage_folder, "snapshots", revision, relative_filename
-        )
+        pointer_path = os.path.join(storage_folder, "snapshots", revision, relative_filename)
         if os.path.exists(pointer_path):
+            if local_dir is not None:
+                return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
             return pointer_path
 
     url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision)
@@ -1111,30 +1144,18 @@ def hf_hub_download(
         )
     except EntryNotFoundError as http_error:
         # Cache the non-existence of the file and raise
-        commit_hash = http_error.response.headers.get(
-            HUGGINGFACE_HEADER_X_REPO_COMMIT
-        )
+        commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
         if commit_hash is not None and not legacy_cache_layout:
-            no_exist_file_path = (
-                Path(storage_folder)
-                / ".no_exist"
-                / commit_hash
-                / relative_filename
-            )
+            no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
             no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
             no_exist_file_path.touch()
-            _cache_commit_hash_for_specific_revision(
-                storage_folder, revision, commit_hash
-            )
+            _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
         raise
 
     # Commit hash must exist
     commit_hash = metadata.commit_hash
     if commit_hash is None:
-        raise OSError(
-            "Distant resource does not seem to be on huggingface.co (missing"
-            " commit header)."
-        )
+        raise OSError("Distant resource does not seem to be on huggingface.co (missing commit header).")
 
     # Etag must exist
     etag = metadata.etag
@@ -1143,8 +1164,7 @@ def hf_hub_download(
     # If we don't have any of those, raise an error.
     if etag is None:
         raise OSError(
-            "Distant resource does not have an ETag, we won't be able to"
-            " reliably ensure reproducibility."
+            "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
         )
 
     # In case of a redirect, save an extra redirect on the request.get call,
@@ -1175,8 +1195,7 @@ def hf_hub_download(
     # In those cases, we cannot force download.
     if force_download:
         raise ValueError(
-            "We have no connection or you passed local_files_only, so"
-            " force_download is not an accepted option."
+            "We have no connection or you passed local_files_only, so force_download is not an accepted option."
         )
 
     # Try to get "commit_hash" from "revision"
@@ -1191,14 +1210,15 @@ def hf_hub_download(
 
     # Return pointer file if exists
     if commit_hash is not None:
-        pointer_path = os.path.join(
-            storage_folder, "snapshots", commit_hash, relative_filename
-        )
+        pointer_path = os.path.join(storage_folder, "snapshots", commit_hash, relative_filename)
         if os.path.exists(pointer_path):
+            if local_dir is not None:
+                return _to_local_dir(
+                    pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
+                )
             return pointer_path
 
-    # If we couldn't find an appropriate file on disk,
-    # raise an error.
+    # If we couldn't find an appropriate file on disk, raise an error.
     # If files cannot be found and local_files_only=True,
     # the models might've been found if local_files_only=False
     # Notify the user about that
@@ -1219,9 +1239,7 @@ def hf_hub_download(
     assert etag is not None, "etag must have been retrieved from server"
     assert commit_hash is not None, "commit_hash must have been retrieved from server"
     blob_path = os.path.join(storage_folder, "blobs", etag)
-    pointer_path = os.path.join(
-        storage_folder, "snapshots", commit_hash, relative_filename
-    )
+    pointer_path = os.path.join(storage_folder, "snapshots", commit_hash, relative_filename)
 
     os.makedirs(os.path.dirname(blob_path), exist_ok=True)
     os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
@@ -1231,13 +1249,17 @@ def hf_hub_download(
     _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
 
     if os.path.exists(pointer_path) and not force_download:
+        if local_dir is not None:
+            return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
         return pointer_path
 
     if os.path.exists(blob_path) and not force_download:
         # we have the blob already, but not the pointer
-
-        _create_relative_symlink(blob_path, pointer_path, new_blob=False)
-        return pointer_path
+        if local_dir is not None:  # to local dir
+            return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+        else:  # or in snapshot cache
+            _create_relative_symlink(blob_path, pointer_path, new_blob=False)
+            return pointer_path
 
     # Prevent parallel downloads of the same file with a lock.
     lock_path = blob_path + ".lock"
@@ -1288,11 +1310,31 @@ def hf_hub_download(
             headers=headers,
         )
 
-        logger.info(f"Storing {url} in cache at {blob_path}")
-        _chmod_and_replace(temp_file.name, blob_path)
-
-
-        _create_relative_symlink(blob_path, pointer_path, new_blob=True)
+        if local_dir is None:
+            logger.info(f"Storing {url} in cache at {blob_path}")
+            _chmod_and_replace(temp_file.name, blob_path)
+            _create_relative_symlink(blob_path, pointer_path, new_blob=True)
+        else:
+            local_dir_filepath = os.path.join(local_dir, relative_filename)
+            os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+
+            # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
+            # In both cases, blob file is cached.
+            is_big_file = os.stat(temp_file.name).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+            if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
+                logger.info(f"Storing {url} in cache at {blob_path}")
+                _chmod_and_replace(temp_file.name, blob_path)
+                logger.info("Create symlink to local dir")
+                _create_relative_symlink(blob_path, local_dir_filepath, new_blob=False)
+            elif local_dir_use_symlinks == "auto" and not is_big_file:
+                logger.info(f"Storing {url} in cache at {blob_path}")
+                _chmod_and_replace(temp_file.name, blob_path)
+                logger.info("Duplicate in local dir (small file and use_symlink set to 'auto')")
+                shutil.copyfile(blob_path, local_dir_filepath)
+            else:
+                logger.info(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
+                _chmod_and_replace(temp_file.name, local_dir_filepath)
+                pointer_path = local_dir_filepath  # for return value
 
     try:
         os.remove(lock_path)
@@ -1357,10 +1399,7 @@ def try_to_load_from_cache(
     if repo_type is None:
         repo_type = "model"
     if repo_type not in REPO_TYPES:
-        raise ValueError(
-            f"Invalid repo type: {repo_type}. Accepted repo types are:"
-            f" {str(REPO_TYPES)}"
-        )
+        raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
     if cache_dir is None:
         cache_dir = HUGGINGFACE_HUB_CACHE
 
@@ -1376,9 +1415,9 @@ def try_to_load_from_cache(
 
     # Resolve refs (for instance to convert main to the associated commit sha)
     if os.path.isdir(refs_dir):
-
-        if
-        with open(
+        revision_file = os.path.join(refs_dir, revision)
+        if os.path.isfile(revision_file):
+            with open(revision_file) as f:
                 revision = f.read()
 
     # Check if file is cached as "no_exist"
@@ -1453,10 +1492,7 @@ def get_hf_file_metadata(
         # Do not use directly `url`, as `_request_wrapper` might have followed relative
         # redirects.
         location=r.headers.get("Location") or r.request.url,  # type: ignore
-        size=_int_or_none(
-            r.headers.get(HUGGINGFACE_HEADER_X_LINKED_SIZE)
-            or r.headers.get("Content-Length")
-        ),
+        size=_int_or_none(r.headers.get(HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")),
     )
 
 
@@ -1490,3 +1526,25 @@ def _chmod_and_replace(src: str, dst: str) -> None:
         tmp_file.unlink()
 
     os.replace(src, dst)
+
+
+def _to_local_dir(
+    path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]]
+) -> str:
+    """Place a file in a local dir (different than cache_dir).
+
+    Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size.
+    """
+    local_dir_filepath = os.path.join(local_dir, relative_filename)
+    os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
+    real_blob_path = os.path.realpath(path)
+
+    # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
+    if use_symlinks == "auto":
+        use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
+
+    if use_symlinks:
+        _create_relative_symlink(real_blob_path, local_dir_filepath, new_blob=False)
+    else:
+        shutil.copyfile(real_blob_path, local_dir_filepath)
+    return local_dir_filepath
````