huggingface-hub 0.12.1__py3-none-any.whl → 0.13.0rc0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. huggingface_hub/__init__.py +165 -127
  2. huggingface_hub/_commit_api.py +25 -51
  3. huggingface_hub/_login.py +4 -13
  4. huggingface_hub/_snapshot_download.py +45 -23
  5. huggingface_hub/_space_api.py +7 -0
  6. huggingface_hub/commands/delete_cache.py +13 -39
  7. huggingface_hub/commands/env.py +1 -3
  8. huggingface_hub/commands/huggingface_cli.py +1 -3
  9. huggingface_hub/commands/lfs.py +4 -8
  10. huggingface_hub/commands/scan_cache.py +5 -16
  11. huggingface_hub/commands/user.py +27 -45
  12. huggingface_hub/community.py +4 -4
  13. huggingface_hub/constants.py +22 -19
  14. huggingface_hub/fastai_utils.py +14 -23
  15. huggingface_hub/file_download.py +166 -108
  16. huggingface_hub/hf_api.py +500 -255
  17. huggingface_hub/hub_mixin.py +181 -176
  18. huggingface_hub/inference_api.py +4 -10
  19. huggingface_hub/keras_mixin.py +39 -71
  20. huggingface_hub/lfs.py +8 -24
  21. huggingface_hub/repocard.py +33 -48
  22. huggingface_hub/repocard_data.py +141 -30
  23. huggingface_hub/repository.py +41 -112
  24. huggingface_hub/templates/modelcard_template.md +39 -34
  25. huggingface_hub/utils/__init__.py +1 -0
  26. huggingface_hub/utils/_cache_assets.py +1 -4
  27. huggingface_hub/utils/_cache_manager.py +17 -39
  28. huggingface_hub/utils/_deprecation.py +8 -12
  29. huggingface_hub/utils/_errors.py +10 -57
  30. huggingface_hub/utils/_fixes.py +2 -6
  31. huggingface_hub/utils/_git_credential.py +5 -16
  32. huggingface_hub/utils/_headers.py +22 -11
  33. huggingface_hub/utils/_http.py +1 -4
  34. huggingface_hub/utils/_paths.py +5 -12
  35. huggingface_hub/utils/_runtime.py +2 -1
  36. huggingface_hub/utils/_telemetry.py +120 -0
  37. huggingface_hub/utils/_validators.py +5 -13
  38. huggingface_hub/utils/endpoint_helpers.py +1 -3
  39. huggingface_hub/utils/logging.py +10 -8
  40. {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/METADATA +7 -14
  41. huggingface_hub-0.13.0rc0.dist-info/RECORD +56 -0
  42. huggingface_hub/py.typed +0 -0
  43. huggingface_hub-0.12.1.dist-info/RECORD +0 -56
  44. {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/LICENSE +0 -0
  45. {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/WHEEL +0 -0
  46. {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/entry_points.txt +0 -0
  47. {huggingface_hub-0.12.1.dist-info → huggingface_hub-0.13.0rc0.dist-info}/top_level.txt +0 -0
@@ -144,16 +144,11 @@ def _check_fastai_fastcore_pyproject_versions(
144
144
  # If the package is specified but not the version (e.g. "fastai" instead of "fastai=2.4"), the default versions are the highest.
145
145
  fastai_packages = [pck for pck in package_versions if pck.startswith("fastai")]
146
146
  if len(fastai_packages) == 0:
147
- logger.warning(
148
- "The repository does not have a fastai version specified in the"
149
- " `pyproject.toml`."
150
- )
147
+ logger.warning("The repository does not have a fastai version specified in the `pyproject.toml`.")
151
148
  # fastai_version is an empty string if not specified
152
149
  else:
153
150
  fastai_version = str(fastai_packages[0]).partition("=")[2]
154
- if fastai_version != "" and version.Version(fastai_version) < version.Version(
155
- fastai_min_version
156
- ):
151
+ if fastai_version != "" and version.Version(fastai_version) < version.Version(fastai_min_version):
157
152
  raise ImportError(
158
153
  "`from_pretrained_fastai` requires"
159
154
  f" fastai>={fastai_min_version} version but the model to load uses"
@@ -162,16 +157,11 @@ def _check_fastai_fastcore_pyproject_versions(
162
157
 
163
158
  fastcore_packages = [pck for pck in package_versions if pck.startswith("fastcore")]
164
159
  if len(fastcore_packages) == 0:
165
- logger.warning(
166
- "The repository does not have a fastcore version specified in the"
167
- " `pyproject.toml`."
168
- )
160
+ logger.warning("The repository does not have a fastcore version specified in the `pyproject.toml`.")
169
161
  # fastcore_version is an empty string if not specified
170
162
  else:
171
163
  fastcore_version = str(fastcore_packages[0]).partition("=")[2]
172
- if fastcore_version != "" and version.Version(
173
- fastcore_version
174
- ) < version.Version(fastcore_min_version):
164
+ if fastcore_version != "" and version.Version(fastcore_version) < version.Version(fastcore_min_version):
175
165
  raise ImportError(
176
166
  "`from_pretrained_fastai` requires"
177
167
  f" fastcore>={fastcore_min_version} version, but you are using fastcore"
@@ -281,9 +271,7 @@ def _save_pretrained_fastai(
281
271
  # if the user provides config then we update it with the fastai and fastcore versions in CONFIG_TEMPLATE.
282
272
  if config is not None:
283
273
  if not isinstance(config, dict):
284
- raise RuntimeError(
285
- f"Provided config should be a dict. Got: '{type(config)}'"
286
- )
274
+ raise RuntimeError(f"Provided config should be a dict. Got: '{type(config)}'")
287
275
  path = os.path.join(save_directory, CONFIG_NAME)
288
276
  with open(path, "w") as f:
289
277
  json.dump(config, f)
@@ -365,13 +353,15 @@ def push_to_hub_fastai(
365
353
  create_pr: Optional[bool] = None,
366
354
  allow_patterns: Optional[Union[List[str], str]] = None,
367
355
  ignore_patterns: Optional[Union[List[str], str]] = None,
356
+ delete_patterns: Optional[Union[List[str], str]] = None,
368
357
  api_endpoint: Optional[str] = None,
369
358
  ):
370
359
  """
371
360
  Upload learner checkpoint files to the Hub.
372
361
 
373
- Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be
374
- pushed to the hub. See [`upload_folder`] reference for more details.
362
+ Use `allow_patterns` and `ignore_patterns` to precisely filter which files should be pushed to the hub. Use
363
+ `delete_patterns` to delete existing remote files in the same commit. See [`upload_folder`] reference for more
364
+ details.
375
365
 
376
366
  Args:
377
367
  learner (`Learner`):
@@ -399,6 +389,9 @@ def push_to_hub_fastai(
399
389
  If provided, only files matching at least one pattern are pushed.
400
390
  ignore_patterns (`List[str]` or `str`, *optional*):
401
391
  If provided, files matching any of the patterns are not pushed.
392
+ delete_patterns (`List[str]` or `str`, *optional*):
393
+ If provided, remote files matching any of the patterns will be deleted from the repo.
394
+
402
395
  Returns:
403
396
  The url of the commit of your model in the given repository.
404
397
 
@@ -413,9 +406,7 @@ def push_to_hub_fastai(
413
406
  """
414
407
  _check_fastai_fastcore_versions()
415
408
  api = HfApi(endpoint=api_endpoint)
416
- api.create_repo(
417
- repo_id=repo_id, repo_type="model", token=token, private=private, exist_ok=True
418
- )
409
+ repo_id = api.create_repo(repo_id=repo_id, token=token, private=private, exist_ok=True).repo_id
419
410
 
420
411
  # Push the files to the repo in a single commit
421
412
  with SoftTemporaryDirectory() as tmp:
@@ -423,7 +414,6 @@ def push_to_hub_fastai(
423
414
  _save_pretrained_fastai(learner, saved_path, config=config)
424
415
  return api.upload_folder(
425
416
  repo_id=repo_id,
426
- repo_type="model",
427
417
  token=token,
428
418
  folder_path=saved_path,
429
419
  commit_message=commit_message,
@@ -431,4 +421,5 @@ def push_to_hub_fastai(
431
421
  create_pr=create_pr,
432
422
  allow_patterns=allow_patterns,
433
423
  ignore_patterns=ignore_patterns,
424
+ delete_patterns=delete_patterns,
434
425
  )
@@ -19,9 +19,10 @@ from urllib.parse import quote, urlparse
19
19
 
20
20
  import requests
21
21
  from filelock import FileLock
22
- from huggingface_hub import constants
23
22
  from requests.exceptions import ConnectTimeout, ProxyError
24
23
 
24
+ from huggingface_hub import constants
25
+
25
26
  from . import __version__ # noqa: F401 # for backward compatibility
26
27
  from .constants import (
27
28
  DEFAULT_REVISION,
@@ -36,38 +37,41 @@ from .constants import (
36
37
  REPO_TYPES,
37
38
  REPO_TYPES_URL_PREFIXES,
38
39
  )
39
- from .utils import get_fastai_version # noqa: F401 # for backward compatibility
40
- from .utils import get_fastcore_version # noqa: F401 # for backward compatibility
41
- from .utils import get_graphviz_version # noqa: F401 # for backward compatibility
42
- from .utils import get_jinja_version # noqa: F401 # for backward compatibility
43
- from .utils import get_pydot_version # noqa: F401 # for backward compatibility
44
- from .utils import get_tf_version # noqa: F401 # for backward compatibility
45
- from .utils import get_torch_version # noqa: F401 # for backward compatibility
46
- from .utils import is_fastai_available # noqa: F401 # for backward compatibility
47
- from .utils import is_fastcore_available # noqa: F401 # for backward compatibility
48
- from .utils import is_graphviz_available # noqa: F401 # for backward compatibility
49
- from .utils import is_jinja_available # noqa: F401 # for backward compatibility
50
- from .utils import is_pydot_available # noqa: F401 # for backward compatibility
51
- from .utils import is_tf_available # noqa: F401 # for backward compatibility
52
- from .utils import is_torch_available # noqa: F401 # for backward compatibility
53
40
  from .utils import (
54
41
  EntryNotFoundError,
55
42
  LocalEntryNotFoundError,
56
43
  SoftTemporaryDirectory,
57
44
  build_hf_headers,
45
+ get_fastai_version, # noqa: F401 # for backward compatibility
46
+ get_fastcore_version, # noqa: F401 # for backward compatibility
47
+ get_graphviz_version, # noqa: F401 # for backward compatibility
48
+ get_jinja_version, # noqa: F401 # for backward compatibility
49
+ get_pydot_version, # noqa: F401 # for backward compatibility
50
+ get_tf_version, # noqa: F401 # for backward compatibility
51
+ get_torch_version, # noqa: F401 # for backward compatibility
58
52
  hf_raise_for_status,
59
53
  http_backoff,
54
+ is_fastai_available, # noqa: F401 # for backward compatibility
55
+ is_fastcore_available, # noqa: F401 # for backward compatibility
56
+ is_graphviz_available, # noqa: F401 # for backward compatibility
57
+ is_jinja_available, # noqa: F401 # for backward compatibility
58
+ is_pydot_available, # noqa: F401 # for backward compatibility
59
+ is_tf_available, # noqa: F401 # for backward compatibility
60
+ is_torch_available, # noqa: F401 # for backward compatibility
60
61
  logging,
61
62
  tqdm,
62
63
  validate_hf_hub_args,
63
64
  )
64
65
  from .utils._headers import _http_user_agent
65
66
  from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility
66
- from .utils._typing import HTTP_METHOD_T
67
+ from .utils._typing import HTTP_METHOD_T, Literal
67
68
 
68
69
 
69
70
  logger = logging.get_logger(__name__)
70
71
 
72
+ # Regex to get filename from a "Content-Disposition" header for CDN-served files
73
+ HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
74
+
71
75
 
72
76
  _are_symlinks_supported_in_dir: Dict[str, bool] = {}
73
77
 
@@ -185,8 +189,8 @@ def hf_hub_url(
185
189
  subfolder (`str`, *optional*):
186
190
  An optional value corresponding to a folder inside the repo.
187
191
  repo_type (`str`, *optional*):
188
- Set to `"dataset"` or `"space"` if uploading to a dataset or space,
189
- `None` or `"model"` if uploading to a model. Default is `None`.
192
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
193
+ `None` or `"model"` if downloading from a model. Default is `None`.
190
194
  revision (`str`, *optional*):
191
195
  An optional Git revision id which can be a branch name, a tag, or a
192
196
  commit hash.
@@ -347,9 +351,7 @@ def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
347
351
  HF_HUB_OFFLINE is True."""
348
352
  if constants.HF_HUB_OFFLINE:
349
353
  raise OfflineModeIsEnabled(
350
- "Offline mode is enabled."
351
- if msg is None
352
- else "Offline mode is enabled. " + str(msg)
354
+ "Offline mode is enabled." if msg is None else "Offline mode is enabled. " + str(msg)
353
355
  )
354
356
 
355
357
 
@@ -515,16 +517,22 @@ def http_get(
515
517
 
516
518
  displayed_name = url
517
519
  content_disposition = r.headers.get("Content-Disposition")
518
- if content_disposition is not None and "filename=" in content_disposition:
519
- # Means file is on CDN
520
- displayed_name = content_disposition.split("filename=")[-1]
520
+ if content_disposition is not None:
521
+ match = HEADER_FILENAME_PATTERN.search(content_disposition)
522
+ if match is not None:
523
+ # Means file is on CDN
524
+ displayed_name = match.groupdict()["filename"]
525
+
526
+ # Truncate filename if too long to display
527
+ if len(displayed_name) > 22:
528
+ displayed_name = f"(…){displayed_name[-20:]}"
521
529
 
522
530
  progress = tqdm(
523
531
  unit="B",
524
532
  unit_scale=True,
525
533
  total=total,
526
534
  initial=resume_size,
527
- desc=f"Downloading (…){displayed_name[-20:]}",
535
+ desc=f"Downloading {displayed_name}",
528
536
  disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
529
537
  )
530
538
  for chunk in r.iter_content(chunk_size=10 * 1024 * 1024):
@@ -627,8 +635,10 @@ def cached_download(
627
635
  """
628
636
  if not legacy_cache_layout:
629
637
  warnings.warn(
630
- "`cached_download` is the legacy way to download files from the HF hub,"
631
- " please consider upgrading to `hf_hub_download`",
638
+ (
639
+ "`cached_download` is the legacy way to download files from the HF hub,"
640
+ " please consider upgrading to `hf_hub_download`"
641
+ ),
632
642
  FutureWarning,
633
643
  )
634
644
 
@@ -666,8 +676,7 @@ def cached_download(
666
676
  # If we don't have any of those, raise an error.
667
677
  if etag is None:
668
678
  raise OSError(
669
- "Distant resource does not have an ETag, we won't be able to"
670
- " reliably ensure reproducibility."
679
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
671
680
  )
672
681
  # In case of a redirect, save an extra redirect on the request.get call,
673
682
  # and ensure we download the exact atomic version even if it changed
@@ -688,9 +697,7 @@ def cached_download(
688
697
  # etag is None
689
698
  pass
690
699
 
691
- filename = (
692
- force_filename if force_filename is not None else url_to_filename(url, etag)
693
- )
700
+ filename = force_filename if force_filename is not None else url_to_filename(url, etag)
694
701
 
695
702
  # get cache path to put the file
696
703
  cache_path = os.path.join(cache_dir, filename)
@@ -703,16 +710,10 @@ def cached_download(
703
710
  else:
704
711
  matching_files = [
705
712
  file
706
- for file in fnmatch.filter(
707
- os.listdir(cache_dir), filename.split(".")[0] + ".*"
708
- )
713
+ for file in fnmatch.filter(os.listdir(cache_dir), filename.split(".")[0] + ".*")
709
714
  if not file.endswith(".json") and not file.endswith(".lock")
710
715
  ]
711
- if (
712
- len(matching_files) > 0
713
- and not force_download
714
- and force_filename is None
715
- ):
716
+ if len(matching_files) > 0 and not force_download and force_filename is None:
716
717
  return os.path.join(cache_dir, matching_files[-1])
717
718
  else:
718
719
  # If files cannot be found and local_files_only=True,
@@ -844,11 +845,19 @@ def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None
844
845
  except OSError:
845
846
  pass
846
847
 
847
- cache_dir = os.path.dirname(os.path.commonpath([src, dst]))
848
- if are_symlinks_supported(cache_dir=cache_dir):
849
- relative_src = os.path.relpath(src, start=os.path.dirname(dst))
848
+ try:
849
+ _support_symlinks = are_symlinks_supported(
850
+ os.path.dirname(os.path.commonpath([os.path.realpath(src), os.path.realpath(dst)]))
851
+ )
852
+ except PermissionError:
853
+ # Permission error means src and dst are not in the same volume (e.g. destination path has been provided
854
+ # by the user via `local_dir`. Let's test symlink support there)
855
+ _support_symlinks = are_symlinks_supported(os.path.dirname(dst))
856
+
857
+ if _support_symlinks:
858
+ logger.info(f"Creating pointer from {src} to {dst}")
850
859
  try:
851
- os.symlink(relative_src, dst)
860
+ os.symlink(src, dst)
852
861
  except FileExistsError:
853
862
  if os.path.islink(dst) and os.path.realpath(dst) == os.path.realpath(src):
854
863
  # `dst` already exists and is a symlink to the `src` blob. It is most
@@ -861,14 +870,14 @@ def _create_relative_symlink(src: str, dst: str, new_blob: bool = False) -> None
861
870
  # blob file. Raise exception.
862
871
  raise
863
872
  elif new_blob:
873
+ logger.info(f"Symlink not supported. Moving file from {src} to {dst}")
864
874
  os.replace(src, dst)
865
875
  else:
876
+ logger.info(f"Symlink not supported. Copying file from {src} to {dst}")
866
877
  shutil.copyfile(src, dst)
867
878
 
868
879
 
869
- def _cache_commit_hash_for_specific_revision(
870
- storage_folder: str, revision: str, commit_hash: str
871
- ) -> None:
880
+ def _cache_commit_hash_for_specific_revision(storage_folder: str, revision: str, commit_hash: str) -> None:
872
881
  """Cache reference between a revision (tag, branch or truncated commit hash) and the corresponding commit hash.
873
882
 
874
883
  Does nothing if `revision` is already a proper `commit_hash` or reference is already cached.
@@ -906,6 +915,8 @@ def hf_hub_download(
906
915
  library_name: Optional[str] = None,
907
916
  library_version: Optional[str] = None,
908
917
  cache_dir: Union[str, Path, None] = None,
918
+ local_dir: Union[str, Path, None] = None,
919
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
909
920
  user_agent: Union[Dict, str, None] = None,
910
921
  force_download: bool = False,
911
922
  force_filename: Optional[str] = None,
@@ -928,6 +939,21 @@ def hf_hub_download(
928
939
  that have been resolved at that particular commit. Each filename is a symlink to the blob
929
940
  at that particular commit.
930
941
 
942
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure
943
+ how you want to move those files:
944
+ - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob
945
+ files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal
946
+ is to be able to manually edit and save small files without corrupting the cache while saving disk space for
947
+ binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD`
948
+ environment variable.
949
+ - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`.
950
+ This is optimal in terms of disk usage but files must not be manually edited.
951
+ - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the
952
+ local dir. This means disk usage is not optimized.
953
+ - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the
954
+ files are downloaded and directly placed under `local_dir`. This means if you need to download them again later,
955
+ they will be re-downloaded entirely.
956
+
931
957
  ```
932
958
  [ 96] .
933
959
  └── [ 160] models--julien-c--EsperBERTo-small
@@ -954,8 +980,8 @@ def hf_hub_download(
954
980
  subfolder (`str`, *optional*):
955
981
  An optional value corresponding to a folder inside the model repo.
956
982
  repo_type (`str`, *optional*):
957
- Set to `"dataset"` or `"space"` if uploading to a dataset or space,
958
- `None` or `"model"` if uploading to a model. Default is `None`.
983
+ Set to `"dataset"` or `"space"` if downloading from a dataset or space,
984
+ `None` or `"model"` if downloading from a model. Default is `None`.
959
985
  revision (`str`, *optional*):
960
986
  An optional Git revision id which can be a branch name, a tag, or a
961
987
  commit hash.
@@ -965,6 +991,14 @@ def hf_hub_download(
965
991
  The version of the library.
966
992
  cache_dir (`str`, `Path`, *optional*):
967
993
  Path to the folder where cached files are stored.
994
+ local_dir (`str` or `Path`, *optional*):
995
+ If provided, the downloaded file will be placed under this directory, either as a symlink (default) or
996
+ a regular file (see description for more details).
997
+ local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`):
998
+ To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either
999
+ duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be
1000
+ created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if
1001
+ already exists) or downloaded from the Hub and not cached. See description for more details.
968
1002
  user_agent (`dict`, `str`, *optional*):
969
1003
  The user-agent info in the form of a dictionary or a string.
970
1004
  force_download (`bool`, *optional*, defaults to `False`):
@@ -1019,8 +1053,10 @@ def hf_hub_download(
1019
1053
  """
1020
1054
  if force_filename is not None:
1021
1055
  warnings.warn(
1022
- "The `force_filename` parameter is deprecated as a new caching system, "
1023
- "which keeps the filenames as they are on the Hub, is now in place.",
1056
+ (
1057
+ "The `force_filename` parameter is deprecated as a new caching system, "
1058
+ "which keeps the filenames as they are on the Hub, is now in place."
1059
+ ),
1024
1060
  FutureWarning,
1025
1061
  )
1026
1062
  legacy_cache_layout = True
@@ -1056,6 +1092,8 @@ def hf_hub_download(
1056
1092
  revision = DEFAULT_REVISION
1057
1093
  if isinstance(cache_dir, Path):
1058
1094
  cache_dir = str(cache_dir)
1095
+ if isinstance(local_dir, Path):
1096
+ local_dir = str(local_dir)
1059
1097
 
1060
1098
  if subfolder == "":
1061
1099
  subfolder = None
@@ -1066,14 +1104,9 @@ def hf_hub_download(
1066
1104
  if repo_type is None:
1067
1105
  repo_type = "model"
1068
1106
  if repo_type not in REPO_TYPES:
1069
- raise ValueError(
1070
- f"Invalid repo type: {repo_type}. Accepted repo types are:"
1071
- f" {str(REPO_TYPES)}"
1072
- )
1107
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
1073
1108
 
1074
- storage_folder = os.path.join(
1075
- cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type)
1076
- )
1109
+ storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
1077
1110
  os.makedirs(storage_folder, exist_ok=True)
1078
1111
 
1079
1112
  # cross platform transcription of filename, to be used as a local file path.
@@ -1082,10 +1115,10 @@ def hf_hub_download(
1082
1115
  # if user provides a commit_hash and they already have the file on disk,
1083
1116
  # shortcut everything.
1084
1117
  if REGEX_COMMIT_HASH.match(revision):
1085
- pointer_path = os.path.join(
1086
- storage_folder, "snapshots", revision, relative_filename
1087
- )
1118
+ pointer_path = os.path.join(storage_folder, "snapshots", revision, relative_filename)
1088
1119
  if os.path.exists(pointer_path):
1120
+ if local_dir is not None:
1121
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1089
1122
  return pointer_path
1090
1123
 
1091
1124
  url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision)
@@ -1111,30 +1144,18 @@ def hf_hub_download(
1111
1144
  )
1112
1145
  except EntryNotFoundError as http_error:
1113
1146
  # Cache the non-existence of the file and raise
1114
- commit_hash = http_error.response.headers.get(
1115
- HUGGINGFACE_HEADER_X_REPO_COMMIT
1116
- )
1147
+ commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
1117
1148
  if commit_hash is not None and not legacy_cache_layout:
1118
- no_exist_file_path = (
1119
- Path(storage_folder)
1120
- / ".no_exist"
1121
- / commit_hash
1122
- / relative_filename
1123
- )
1149
+ no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
1124
1150
  no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
1125
1151
  no_exist_file_path.touch()
1126
- _cache_commit_hash_for_specific_revision(
1127
- storage_folder, revision, commit_hash
1128
- )
1152
+ _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
1129
1153
  raise
1130
1154
 
1131
1155
  # Commit hash must exist
1132
1156
  commit_hash = metadata.commit_hash
1133
1157
  if commit_hash is None:
1134
- raise OSError(
1135
- "Distant resource does not seem to be on huggingface.co (missing"
1136
- " commit header)."
1137
- )
1158
+ raise OSError("Distant resource does not seem to be on huggingface.co (missing commit header).")
1138
1159
 
1139
1160
  # Etag must exist
1140
1161
  etag = metadata.etag
@@ -1143,8 +1164,7 @@ def hf_hub_download(
1143
1164
  # If we don't have any of those, raise an error.
1144
1165
  if etag is None:
1145
1166
  raise OSError(
1146
- "Distant resource does not have an ETag, we won't be able to"
1147
- " reliably ensure reproducibility."
1167
+ "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
1148
1168
  )
1149
1169
 
1150
1170
  # In case of a redirect, save an extra redirect on the request.get call,
@@ -1175,8 +1195,7 @@ def hf_hub_download(
1175
1195
  # In those cases, we cannot force download.
1176
1196
  if force_download:
1177
1197
  raise ValueError(
1178
- "We have no connection or you passed local_files_only, so"
1179
- " force_download is not an accepted option."
1198
+ "We have no connection or you passed local_files_only, so force_download is not an accepted option."
1180
1199
  )
1181
1200
 
1182
1201
  # Try to get "commit_hash" from "revision"
@@ -1191,14 +1210,15 @@ def hf_hub_download(
1191
1210
 
1192
1211
  # Return pointer file if exists
1193
1212
  if commit_hash is not None:
1194
- pointer_path = os.path.join(
1195
- storage_folder, "snapshots", commit_hash, relative_filename
1196
- )
1213
+ pointer_path = os.path.join(storage_folder, "snapshots", commit_hash, relative_filename)
1197
1214
  if os.path.exists(pointer_path):
1215
+ if local_dir is not None:
1216
+ return _to_local_dir(
1217
+ pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
1218
+ )
1198
1219
  return pointer_path
1199
1220
 
1200
- # If we couldn't find an appropriate file on disk,
1201
- # raise an error.
1221
+ # If we couldn't find an appropriate file on disk, raise an error.
1202
1222
  # If files cannot be found and local_files_only=True,
1203
1223
  # the models might've been found if local_files_only=False
1204
1224
  # Notify the user about that
@@ -1219,9 +1239,7 @@ def hf_hub_download(
1219
1239
  assert etag is not None, "etag must have been retrieved from server"
1220
1240
  assert commit_hash is not None, "commit_hash must have been retrieved from server"
1221
1241
  blob_path = os.path.join(storage_folder, "blobs", etag)
1222
- pointer_path = os.path.join(
1223
- storage_folder, "snapshots", commit_hash, relative_filename
1224
- )
1242
+ pointer_path = os.path.join(storage_folder, "snapshots", commit_hash, relative_filename)
1225
1243
 
1226
1244
  os.makedirs(os.path.dirname(blob_path), exist_ok=True)
1227
1245
  os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
@@ -1231,13 +1249,17 @@ def hf_hub_download(
1231
1249
  _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
1232
1250
 
1233
1251
  if os.path.exists(pointer_path) and not force_download:
1252
+ if local_dir is not None:
1253
+ return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1234
1254
  return pointer_path
1235
1255
 
1236
1256
  if os.path.exists(blob_path) and not force_download:
1237
1257
  # we have the blob already, but not the pointer
1238
- logger.info("creating pointer to %s from %s", blob_path, pointer_path)
1239
- _create_relative_symlink(blob_path, pointer_path, new_blob=False)
1240
- return pointer_path
1258
+ if local_dir is not None: # to local dir
1259
+ return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1260
+ else: # or in snapshot cache
1261
+ _create_relative_symlink(blob_path, pointer_path, new_blob=False)
1262
+ return pointer_path
1241
1263
 
1242
1264
  # Prevent parallel downloads of the same file with a lock.
1243
1265
  lock_path = blob_path + ".lock"
@@ -1288,11 +1310,31 @@ def hf_hub_download(
1288
1310
  headers=headers,
1289
1311
  )
1290
1312
 
1291
- logger.info("storing %s in cache at %s", url, blob_path)
1292
- _chmod_and_replace(temp_file.name, blob_path)
1293
-
1294
- logger.info("creating pointer to %s from %s", blob_path, pointer_path)
1295
- _create_relative_symlink(blob_path, pointer_path, new_blob=True)
1313
+ if local_dir is None:
1314
+ logger.info(f"Storing {url} in cache at {blob_path}")
1315
+ _chmod_and_replace(temp_file.name, blob_path)
1316
+ _create_relative_symlink(blob_path, pointer_path, new_blob=True)
1317
+ else:
1318
+ local_dir_filepath = os.path.join(local_dir, relative_filename)
1319
+ os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
1320
+
1321
+ # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
1322
+ # In both cases, blob file is cached.
1323
+ is_big_file = os.stat(temp_file.name).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
1324
+ if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
1325
+ logger.info(f"Storing {url} in cache at {blob_path}")
1326
+ _chmod_and_replace(temp_file.name, blob_path)
1327
+ logger.info("Create symlink to local dir")
1328
+ _create_relative_symlink(blob_path, local_dir_filepath, new_blob=False)
1329
+ elif local_dir_use_symlinks == "auto" and not is_big_file:
1330
+ logger.info(f"Storing {url} in cache at {blob_path}")
1331
+ _chmod_and_replace(temp_file.name, blob_path)
1332
+ logger.info("Duplicate in local dir (small file and use_symlink set to 'auto')")
1333
+ shutil.copyfile(blob_path, local_dir_filepath)
1334
+ else:
1335
+ logger.info(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
1336
+ _chmod_and_replace(temp_file.name, local_dir_filepath)
1337
+ pointer_path = local_dir_filepath # for return value
1296
1338
 
1297
1339
  try:
1298
1340
  os.remove(lock_path)
@@ -1357,10 +1399,7 @@ def try_to_load_from_cache(
1357
1399
  if repo_type is None:
1358
1400
  repo_type = "model"
1359
1401
  if repo_type not in REPO_TYPES:
1360
- raise ValueError(
1361
- f"Invalid repo type: {repo_type}. Accepted repo types are:"
1362
- f" {str(REPO_TYPES)}"
1363
- )
1402
+ raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
1364
1403
  if cache_dir is None:
1365
1404
  cache_dir = HUGGINGFACE_HUB_CACHE
1366
1405
 
@@ -1376,9 +1415,9 @@ def try_to_load_from_cache(
1376
1415
 
1377
1416
  # Resolve refs (for instance to convert main to the associated commit sha)
1378
1417
  if os.path.isdir(refs_dir):
1379
- cached_refs = os.listdir(refs_dir)
1380
- if revision in cached_refs:
1381
- with open(os.path.join(refs_dir, revision)) as f:
1418
+ revision_file = os.path.join(refs_dir, revision)
1419
+ if os.path.isfile(revision_file):
1420
+ with open(revision_file) as f:
1382
1421
  revision = f.read()
1383
1422
 
1384
1423
  # Check if file is cached as "no_exist"
@@ -1453,10 +1492,7 @@ def get_hf_file_metadata(
1453
1492
  # Do not use directly `url`, as `_request_wrapper` might have followed relative
1454
1493
  # redirects.
1455
1494
  location=r.headers.get("Location") or r.request.url, # type: ignore
1456
- size=_int_or_none(
1457
- r.headers.get(HUGGINGFACE_HEADER_X_LINKED_SIZE)
1458
- or r.headers.get("Content-Length")
1459
- ),
1495
+ size=_int_or_none(r.headers.get(HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")),
1460
1496
  )
1461
1497
 
1462
1498
 
@@ -1490,3 +1526,25 @@ def _chmod_and_replace(src: str, dst: str) -> None:
1490
1526
  tmp_file.unlink()
1491
1527
 
1492
1528
  os.replace(src, dst)
1529
+
1530
+
1531
+ def _to_local_dir(
1532
+ path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]]
1533
+ ) -> str:
1534
+ """Place a file in a local dir (different than cache_dir).
1535
+
1536
+ Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size.
1537
+ """
1538
+ local_dir_filepath = os.path.join(local_dir, relative_filename)
1539
+ os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
1540
+ real_blob_path = os.path.realpath(path)
1541
+
1542
+ # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
1543
+ if use_symlinks == "auto":
1544
+ use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
1545
+
1546
+ if use_symlinks:
1547
+ _create_relative_symlink(real_blob_path, local_dir_filepath, new_blob=False)
1548
+ else:
1549
+ shutil.copyfile(real_blob_path, local_dir_filepath)
1550
+ return local_dir_filepath