huggingface-hub 0.22.2__py3-none-any.whl → 0.23.1__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of huggingface-hub might be problematic. Click here for more details.

Files changed (45)
  1. huggingface_hub/__init__.py +51 -19
  2. huggingface_hub/_commit_api.py +9 -8
  3. huggingface_hub/_commit_scheduler.py +2 -2
  4. huggingface_hub/_inference_endpoints.py +10 -17
  5. huggingface_hub/_local_folder.py +229 -0
  6. huggingface_hub/_login.py +4 -3
  7. huggingface_hub/_multi_commits.py +1 -1
  8. huggingface_hub/_snapshot_download.py +16 -38
  9. huggingface_hub/_tensorboard_logger.py +16 -6
  10. huggingface_hub/_webhooks_payload.py +22 -1
  11. huggingface_hub/_webhooks_server.py +24 -20
  12. huggingface_hub/commands/download.py +11 -34
  13. huggingface_hub/commands/huggingface_cli.py +2 -0
  14. huggingface_hub/commands/tag.py +159 -0
  15. huggingface_hub/constants.py +3 -5
  16. huggingface_hub/errors.py +58 -0
  17. huggingface_hub/file_download.py +545 -376
  18. huggingface_hub/hf_api.py +756 -622
  19. huggingface_hub/hf_file_system.py +20 -5
  20. huggingface_hub/hub_mixin.py +127 -43
  21. huggingface_hub/inference/_client.py +402 -183
  22. huggingface_hub/inference/_common.py +19 -29
  23. huggingface_hub/inference/_generated/_async_client.py +402 -184
  24. huggingface_hub/inference/_generated/types/__init__.py +23 -6
  25. huggingface_hub/inference/_generated/types/chat_completion.py +197 -43
  26. huggingface_hub/inference/_generated/types/text_generation.py +57 -79
  27. huggingface_hub/inference/_templating.py +2 -4
  28. huggingface_hub/keras_mixin.py +0 -3
  29. huggingface_hub/lfs.py +9 -1
  30. huggingface_hub/repository.py +1 -0
  31. huggingface_hub/utils/__init__.py +12 -6
  32. huggingface_hub/utils/_fixes.py +1 -0
  33. huggingface_hub/utils/_headers.py +2 -4
  34. huggingface_hub/utils/_http.py +2 -4
  35. huggingface_hub/utils/_paths.py +13 -1
  36. huggingface_hub/utils/_runtime.py +10 -0
  37. huggingface_hub/utils/_safetensors.py +0 -13
  38. huggingface_hub/utils/_validators.py +2 -7
  39. huggingface_hub/utils/tqdm.py +124 -46
  40. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/METADATA +5 -1
  41. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/RECORD +45 -43
  42. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/LICENSE +0 -0
  43. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/WHEEL +0 -0
  44. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/entry_points.txt +0 -0
  45. {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.1.dist-info}/top_level.txt +0 -0
@@ -2,28 +2,27 @@ import copy
2
2
  import errno
3
3
  import fnmatch
4
4
  import inspect
5
- import io
6
5
  import json
7
6
  import os
8
7
  import re
9
8
  import shutil
10
9
  import stat
11
- import tempfile
12
10
  import time
13
11
  import uuid
14
12
  import warnings
15
- from contextlib import contextmanager
16
13
  from dataclasses import dataclass
17
- from functools import partial
18
14
  from pathlib import Path
19
- from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Tuple, Union
15
+ from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
20
16
  from urllib.parse import quote, urlparse
21
17
 
22
18
  import requests
23
19
 
24
- from huggingface_hub import constants
25
-
26
20
  from . import __version__ # noqa: F401 # for backward compatibility
21
+ from ._local_folder import (
22
+ get_local_download_paths,
23
+ read_download_metadata,
24
+ write_download_metadata,
25
+ )
27
26
  from .constants import (
28
27
  DEFAULT_ETAG_TIMEOUT,
29
28
  DEFAULT_REQUEST_TIMEOUT,
@@ -80,13 +79,23 @@ from .utils import (
80
79
  from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility
81
80
  from .utils._typing import HTTP_METHOD_T
82
81
  from .utils.insecure_hashlib import sha256
82
+ from .utils.sha import sha_fileobj
83
83
 
84
84
 
85
85
  logger = logging.get_logger(__name__)
86
86
 
87
+ # Return value when trying to load a file from cache but the file does not exist in the distant repo.
88
+ _CACHED_NO_EXIST = object()
89
+ _CACHED_NO_EXIST_T = Any
90
+
87
91
  # Regex to get filename from a "Content-Disposition" header for CDN-served files
88
92
  HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
89
93
 
94
+ # Regex to check if the revision IS directly a commit_hash
95
+ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
96
+
97
+ # Regex to check if the file etag IS a valid sha256
98
+ REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
90
99
 
91
100
  _are_symlinks_supported_in_dir: Dict[str, bool] = {}
92
101
 
@@ -150,12 +159,6 @@ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
150
159
  return _are_symlinks_supported_in_dir[cache_dir]
151
160
 
152
161
 
153
- # Return value when trying to load a file from cache but the file does not exist in the distant repo.
154
- _CACHED_NO_EXIST = object()
155
- _CACHED_NO_EXIST_T = Any
156
- REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
157
-
158
-
159
162
  @dataclass(frozen=True)
160
163
  class HfFileMetadata:
161
164
  """Data structure containing information about a file versioned on the Hub.
@@ -478,9 +481,9 @@ def http_get(
478
481
 
479
482
  consistency_error_message = (
480
483
  f"Consistency check failed: file should be of size {expected_size} but has size"
481
- f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry download and"
482
- " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
483
- " know by opening an issue on https://github.com/huggingface/huggingface_hub."
484
+ f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry"
485
+ " with `force_download=True`.\nIf the issue persists, please let us know by opening an issue "
486
+ "on https://github.com/huggingface/huggingface_hub."
484
487
  )
485
488
 
486
489
  # Stream file to buffer
@@ -495,6 +498,7 @@ def http_get(
495
498
  disable=True if (logger.getEffectiveLevel() == logging.NOTSET) else None,
496
499
  # ^ set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
497
500
  # see https://github.com/huggingface/huggingface_hub/pull/2000
501
+ name="huggingface_hub.http_get",
498
502
  )
499
503
 
500
504
  if hf_transfer and total is not None and total > 5 * DOWNLOAD_CHUNK_SIZE:
@@ -582,7 +586,7 @@ def cached_download(
582
586
  force_filename: Optional[str] = None,
583
587
  proxies: Optional[Dict] = None,
584
588
  etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
585
- resume_download: bool = False,
589
+ resume_download: Optional[bool] = None,
586
590
  token: Union[bool, str, None] = None,
587
591
  local_files_only: bool = False,
588
592
  legacy_cache_layout: bool = False,
@@ -619,8 +623,6 @@ def cached_download(
619
623
  etag_timeout (`float`, *optional* defaults to `10`):
620
624
  When fetching ETag, how many seconds to wait for the server to send
621
625
  data before giving up which is passed to `requests.request`.
622
- resume_download (`bool`, *optional*, defaults to `False`):
623
- If `True`, resume a previously interrupted download.
624
626
  token (`bool`, `str`, *optional*):
625
627
  A token to be used for the download.
626
628
  - If `True`, the token is read from the HuggingFace config
@@ -671,6 +673,13 @@ def cached_download(
671
673
  " 'hf_hub_download'",
672
674
  FutureWarning,
673
675
  )
676
+ if resume_download is not None:
677
+ warnings.warn(
678
+ "`resume_download` is deprecated and will be removed in version 1.0.0. "
679
+ "Downloads always resume when possible. "
680
+ "If you want to force a new download, use `force_download=True`.",
681
+ FutureWarning,
682
+ )
674
683
 
675
684
  if cache_dir is None:
676
685
  cache_dir = HF_HUB_CACHE
@@ -786,46 +795,16 @@ def cached_download(
786
795
  cache_path = "\\\\?\\" + os.path.abspath(cache_path)
787
796
 
788
797
  with WeakFileLock(lock_path):
789
- # If the download just completed while the lock was activated.
790
- if os.path.exists(cache_path) and not force_download:
791
- # Even if returning early like here, the lock will be released.
792
- return cache_path
793
-
794
- if resume_download:
795
- incomplete_path = cache_path + ".incomplete"
796
-
797
- @contextmanager
798
- def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
799
- with open(incomplete_path, "ab") as f:
800
- yield f
801
-
802
- temp_file_manager = _resumable_file_manager
803
- if os.path.exists(incomplete_path):
804
- resume_size = os.stat(incomplete_path).st_size
805
- else:
806
- resume_size = 0
807
- else:
808
- temp_file_manager = partial( # type: ignore
809
- tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
810
- )
811
- resume_size = 0
812
-
813
- # Download to temporary file, then copy to cache dir once finished.
814
- # Otherwise you get corrupt cache entries if the download gets interrupted.
815
- with temp_file_manager() as temp_file:
816
- logger.info("downloading %s to %s", url, temp_file.name)
817
-
818
- http_get(
819
- url_to_download,
820
- temp_file,
821
- proxies=proxies,
822
- resume_size=resume_size,
823
- headers=headers,
824
- expected_size=expected_size,
825
- )
826
-
827
- logger.info("storing %s in cache at %s", url, cache_path)
828
- _chmod_and_replace(temp_file.name, cache_path)
798
+ _download_to_tmp_and_move(
799
+ incomplete_path=Path(cache_path + ".incomplete"),
800
+ destination_path=Path(cache_path),
801
+ url_to_download=url_to_download,
802
+ proxies=proxies,
803
+ headers=headers,
804
+ expected_size=expected_size,
805
+ filename=filename,
806
+ force_download=force_download,
807
+ )
829
808
 
830
809
  if force_filename is None:
831
810
  logger.info("creating metadata file for %s", cache_path)
@@ -1022,18 +1001,19 @@ def hf_hub_download(
1022
1001
  library_version: Optional[str] = None,
1023
1002
  cache_dir: Union[str, Path, None] = None,
1024
1003
  local_dir: Union[str, Path, None] = None,
1025
- local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
1026
1004
  user_agent: Union[Dict, str, None] = None,
1027
1005
  force_download: bool = False,
1028
- force_filename: Optional[str] = None,
1029
1006
  proxies: Optional[Dict] = None,
1030
1007
  etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
1031
- resume_download: bool = False,
1032
1008
  token: Union[bool, str, None] = None,
1033
1009
  local_files_only: bool = False,
1034
1010
  headers: Optional[Dict[str, str]] = None,
1035
- legacy_cache_layout: bool = False,
1036
1011
  endpoint: Optional[str] = None,
1012
+ # Deprecated args
1013
+ legacy_cache_layout: bool = False,
1014
+ resume_download: Optional[bool] = None,
1015
+ force_filename: Optional[str] = None,
1016
+ local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
1037
1017
  ) -> str:
1038
1018
  """Download a given file if it's not already present in the local cache.
1039
1019
 
@@ -1047,21 +1027,6 @@ def hf_hub_download(
1047
1027
  that have been resolved at that particular commit. Each filename is a symlink to the blob
1048
1028
  at that particular commit.
1049
1029
 
1050
- If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure
1051
- how you want to move those files:
1052
- - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob
1053
- files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal
1054
- is to be able to manually edit and save small files without corrupting the cache while saving disk space for
1055
- binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD`
1056
- environment variable.
1057
- - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`.
1058
- This is optimal in term of disk usage but files must not be manually edited.
1059
- - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the
1060
- local dir. This means disk usage is not optimized.
1061
- - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the
1062
- files are downloaded and directly placed under `local_dir`. This means if you need to download them again later,
1063
- they will be re-downloaded entirely.
1064
-
1065
1030
  ```
1066
1031
  [ 96] .
1067
1032
  └── [ 160] models--julien-c--EsperBERTo-small
@@ -1080,6 +1045,11 @@ def hf_hub_download(
1080
1045
  └── [ 76] pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
1081
1046
  ```
1082
1047
 
1048
+ If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
1049
+ option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir`
1050
+ to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
1051
+ cache-system, it's optimized for regularly pulling the latest version of a repository.
1052
+
1083
1053
  Args:
1084
1054
  repo_id (`str`):
1085
1055
  A user or an organization name and a repo name separated by a `/`.
@@ -1100,13 +1070,7 @@ def hf_hub_download(
1100
1070
  cache_dir (`str`, `Path`, *optional*):
1101
1071
  Path to the folder where cached files are stored.
1102
1072
  local_dir (`str` or `Path`, *optional*):
1103
- If provided, the downloaded file will be placed under this directory, either as a symlink (default) or
1104
- a regular file (see description for more details).
1105
- local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`):
1106
- To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either
1107
- duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be
1108
- created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if
1109
- already exists) or downloaded from the Hub and not cached. See description for more details.
1073
+ If provided, the downloaded file will be placed under this directory.
1110
1074
  user_agent (`dict`, `str`, *optional*):
1111
1075
  The user-agent info in the form of a dictionary or a string.
1112
1076
  force_download (`bool`, *optional*, defaults to `False`):
@@ -1118,8 +1082,6 @@ def hf_hub_download(
1118
1082
  etag_timeout (`float`, *optional*, defaults to `10`):
1119
1083
  When fetching ETag, how many seconds to wait for the server to send
1120
1084
  data before giving up which is passed to `requests.request`.
1121
- resume_download (`bool`, *optional*, defaults to `False`):
1122
- If `True`, resume a previously interrupted download.
1123
1085
  token (`str`, `bool`, *optional*):
1124
1086
  A token to be used for the download.
1125
1087
  - If `True`, the token is read from the HuggingFace config
@@ -1136,30 +1098,24 @@ def hf_hub_download(
1136
1098
  more powerful.
1137
1099
 
1138
1100
  Returns:
1139
- Local path (string) of file or if networking is off, last version of
1140
- file cached on disk.
1141
-
1142
- <Tip>
1143
-
1144
- Raises the following errors:
1101
+ `str`: Local path of file or if networking is off, last version of file cached on disk.
1145
1102
 
1103
+ Raises:
1146
1104
  - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
1147
- if `token=True` and the token cannot be found.
1105
+ if `token=True` and the token cannot be found.
1148
1106
  - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
1149
- if ETag cannot be determined.
1107
+ if ETag cannot be determined.
1150
1108
  - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
1151
- if some parameter value is invalid
1109
+ if some parameter value is invalid
1152
1110
  - [`~utils.RepositoryNotFoundError`]
1153
- If the repository to download from cannot be found. This may be because it doesn't exist,
1154
- or because it is set to `private` and you do not have access.
1111
+ If the repository to download from cannot be found. This may be because it doesn't exist,
1112
+ or because it is set to `private` and you do not have access.
1155
1113
  - [`~utils.RevisionNotFoundError`]
1156
- If the revision to download from cannot be found.
1114
+ If the revision to download from cannot be found.
1157
1115
  - [`~utils.EntryNotFoundError`]
1158
- If the file to download cannot be found.
1116
+ If the file to download cannot be found.
1159
1117
  - [`~utils.LocalEntryNotFoundError`]
1160
- If network is disabled or unavailable and file is not found in cache.
1161
-
1162
- </Tip>
1118
+ If network is disabled or unavailable and file is not found in cache.
1163
1119
  """
1164
1120
  if HF_HUB_ETAG_TIMEOUT != DEFAULT_ETAG_TIMEOUT:
1165
1121
  # Respect environment variable above user value
@@ -1172,6 +1128,13 @@ def hf_hub_download(
1172
1128
  FutureWarning,
1173
1129
  )
1174
1130
  legacy_cache_layout = True
1131
+ if resume_download is not None:
1132
+ warnings.warn(
1133
+ "`resume_download` is deprecated and will be removed in version 1.0.0. "
1134
+ "Downloads always resume when possible. "
1135
+ "If you want to force a new download, use `force_download=True`.",
1136
+ FutureWarning,
1137
+ )
1175
1138
 
1176
1139
  if legacy_cache_layout:
1177
1140
  url = hf_hub_url(
@@ -1193,7 +1156,6 @@ def hf_hub_download(
1193
1156
  force_filename=force_filename,
1194
1157
  proxies=proxies,
1195
1158
  etag_timeout=etag_timeout,
1196
- resume_download=resume_download,
1197
1159
  token=token,
1198
1160
  local_files_only=local_files_only,
1199
1161
  legacy_cache_layout=legacy_cache_layout,
@@ -1207,7 +1169,6 @@ def hf_hub_download(
1207
1169
  cache_dir = str(cache_dir)
1208
1170
  if isinstance(local_dir, Path):
1209
1171
  local_dir = str(local_dir)
1210
- locks_dir = os.path.join(cache_dir, ".locks")
1211
1172
 
1212
1173
  if subfolder == "":
1213
1174
  subfolder = None
@@ -1220,6 +1181,85 @@ def hf_hub_download(
1220
1181
  if repo_type not in REPO_TYPES:
1221
1182
  raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
1222
1183
 
1184
+ headers = build_hf_headers(
1185
+ token=token,
1186
+ library_name=library_name,
1187
+ library_version=library_version,
1188
+ user_agent=user_agent,
1189
+ headers=headers,
1190
+ )
1191
+
1192
+ if local_dir is not None:
1193
+ if local_dir_use_symlinks != "auto":
1194
+ warnings.warn(
1195
+ "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
1196
+ "The process to download files to a local folder has been updated and do "
1197
+ "not rely on symlinks anymore. You only need to pass a destination folder "
1198
+ "as`local_dir`.\n"
1199
+ "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
1200
+ )
1201
+
1202
+ return _hf_hub_download_to_local_dir(
1203
+ # Destination
1204
+ local_dir=local_dir,
1205
+ # File info
1206
+ repo_id=repo_id,
1207
+ repo_type=repo_type,
1208
+ filename=filename,
1209
+ revision=revision,
1210
+ # HTTP info
1211
+ proxies=proxies,
1212
+ etag_timeout=etag_timeout,
1213
+ headers=headers,
1214
+ endpoint=endpoint,
1215
+ # Additional options
1216
+ cache_dir=cache_dir,
1217
+ force_download=force_download,
1218
+ local_files_only=local_files_only,
1219
+ )
1220
+ else:
1221
+ return _hf_hub_download_to_cache_dir(
1222
+ # Destination
1223
+ cache_dir=cache_dir,
1224
+ # File info
1225
+ repo_id=repo_id,
1226
+ filename=filename,
1227
+ repo_type=repo_type,
1228
+ revision=revision,
1229
+ # HTTP info
1230
+ headers=headers,
1231
+ proxies=proxies,
1232
+ etag_timeout=etag_timeout,
1233
+ endpoint=endpoint,
1234
+ # Additional options
1235
+ local_files_only=local_files_only,
1236
+ force_download=force_download,
1237
+ )
1238
+
1239
+
1240
+ def _hf_hub_download_to_cache_dir(
1241
+ *,
1242
+ # Destination
1243
+ cache_dir: str,
1244
+ # File info
1245
+ repo_id: str,
1246
+ filename: str,
1247
+ repo_type: str,
1248
+ revision: str,
1249
+ # HTTP info
1250
+ headers: Dict[str, str],
1251
+ proxies: Optional[Dict],
1252
+ etag_timeout: float,
1253
+ endpoint: Optional[str],
1254
+ # Additional options
1255
+ local_files_only: bool,
1256
+ force_download: bool,
1257
+ ) -> str:
1258
+ """Download a given file to a cache folder, if not already present.
1259
+
1260
+ Method should not be called directly. Please use `hf_hub_download` instead.
1261
+ """
1262
+ locks_dir = os.path.join(cache_dir, ".locks")
1223
1263
  storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
1224
1264
 
1225
1265
  # cross platform transcription of filename, to be used as a local file path.
@@ -1231,207 +1271,82 @@ def hf_hub_download(
1231
1271
  " owner to rename this file."
1232
1272
  )
1233
1273
 
1234
- # if user provides a commit_hash and they already have the file on disk,
1235
- # shortcut everything.
1274
+ # if user provides a commit_hash and they already have the file on disk, shortcut everything.
1236
1275
  if REGEX_COMMIT_HASH.match(revision):
1237
1276
  pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
1238
- if os.path.exists(pointer_path):
1239
- if local_dir is not None:
1240
- return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1277
+ if os.path.exists(pointer_path) and not force_download:
1241
1278
  return pointer_path
1242
1279
 
1243
- url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
1244
-
1245
- headers = build_hf_headers(
1246
- token=token,
1247
- library_name=library_name,
1248
- library_version=library_version,
1249
- user_agent=user_agent,
1280
+ # Try to get metadata (etag, commit_hash, url, size) from the server.
1281
+ # If we can't, a HEAD request error is returned.
1282
+ (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1283
+ repo_id=repo_id,
1284
+ filename=filename,
1285
+ repo_type=repo_type,
1286
+ revision=revision,
1287
+ endpoint=endpoint,
1288
+ proxies=proxies,
1289
+ etag_timeout=etag_timeout,
1250
1290
  headers=headers,
1291
+ local_files_only=local_files_only,
1292
+ storage_folder=storage_folder,
1293
+ relative_filename=relative_filename,
1251
1294
  )
1252
1295
 
1253
- url_to_download = url
1254
- etag = None
1255
- commit_hash = None
1256
- expected_size = None
1257
- head_call_error: Optional[Exception] = None
1258
- if not local_files_only:
1259
- try:
1260
- try:
1261
- metadata = get_hf_file_metadata(
1262
- url=url,
1263
- token=token,
1264
- proxies=proxies,
1265
- timeout=etag_timeout,
1266
- library_name=library_name,
1267
- library_version=library_version,
1268
- user_agent=user_agent,
1269
- )
1270
- except EntryNotFoundError as http_error:
1271
- # Cache the non-existence of the file and raise
1272
- commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
1273
- if commit_hash is not None and not legacy_cache_layout:
1274
- no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
1275
- no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
1276
- no_exist_file_path.touch()
1277
- _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
1278
- raise
1279
-
1280
- # Commit hash must exist
1281
- commit_hash = metadata.commit_hash
1282
- if commit_hash is None:
1283
- raise FileMetadataError(
1284
- "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
1285
- " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
1286
- " and proxy settings and make sure your SSL certificates are updated."
1287
- )
1288
-
1289
- # Etag must exist
1290
- etag = metadata.etag
1291
- # We favor a custom header indicating the etag of the linked resource, and
1292
- # we fallback to the regular etag header.
1293
- # If we don't have any of those, raise an error.
1294
- if etag is None:
1295
- raise FileMetadataError(
1296
- "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
1297
- )
1298
-
1299
- # Expected (uncompressed) size
1300
- expected_size = metadata.size
1301
-
1302
- # In case of a redirect, save an extra redirect on the request.get call,
1303
- # and ensure we download the exact atomic version even if it changed
1304
- # between the HEAD and the GET (unlikely, but hey).
1305
- #
1306
- # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
1307
- # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
1308
- if metadata.location != url:
1309
- url_to_download = metadata.location
1310
- if urlparse(url).netloc != urlparse(url_to_download).netloc:
1311
- # Remove authorization header when downloading a LFS blob
1312
- headers.pop("authorization", None)
1313
- except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
1314
- # Actually raise for those subclasses of ConnectionError
1315
- raise
1316
- except (
1317
- requests.exceptions.ConnectionError,
1318
- requests.exceptions.Timeout,
1319
- OfflineModeIsEnabled,
1320
- ) as error:
1321
- # Otherwise, our Internet connection is down.
1322
- # etag is None
1323
- head_call_error = error
1324
- pass
1325
- except (RevisionNotFoundError, EntryNotFoundError):
1326
- # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
1327
- raise
1328
- except requests.HTTPError as error:
1329
- # Multiple reasons for an http error:
1330
- # - Repository is private and invalid/missing token sent
1331
- # - Repository is gated and invalid/missing token sent
1332
- # - Hub is down (error 500 or 504)
1333
- # => let's switch to 'local_files_only=True' to check if the files are already cached.
1334
- # (if it's not the case, the error will be re-raised)
1335
- head_call_error = error
1336
- pass
1337
- except FileMetadataError as error:
1338
- # Multiple reasons for a FileMetadataError:
1339
- # - Wrong network configuration (proxy, firewall, SSL certificates)
1340
- # - Inconsistency on the Hub
1341
- # => let's switch to 'local_files_only=True' to check if the files are already cached.
1342
- # (if it's not the case, the error will be re-raised)
1343
- head_call_error = error
1344
- pass
1345
-
1346
- assert (
1347
- local_files_only or etag is not None or head_call_error is not None
1348
- ), "etag is empty due to uncovered problems"
1349
-
1350
1296
  # etag can be None for several reasons:
1351
1297
  # 1. we passed local_files_only.
1352
1298
  # 2. we don't have a connection
1353
- # 3. Hub is down (HTTP 500 or 504)
1299
+ # 3. Hub is down (HTTP 500, 503, 504)
1354
1300
  # 4. repo is not found -for example private or gated- and invalid/missing token sent
1355
1301
  # 5. Hub is blocked by a firewall or proxy is not set correctly.
1356
1302
  # => Try to get the last downloaded one from the specified revision.
1357
1303
  #
1358
1304
  # If the specified revision is a commit hash, look inside "snapshots".
1359
1305
  # If the specified revision is a branch or tag, look inside "refs".
1360
- if etag is None:
1361
- # In those cases, we cannot force download.
1362
- if force_download:
1363
- if local_files_only:
1364
- raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
1365
- elif isinstance(head_call_error, OfflineModeIsEnabled):
1366
- raise ValueError(
1367
- "Cannot pass 'force_download=True' when offline mode is enabled."
1368
- ) from head_call_error
1306
+ if head_call_error is not None:
1307
+ # Couldn't make a HEAD call => let's try to find a local file
1308
+ if not force_download:
1309
+ commit_hash = None
1310
+ if REGEX_COMMIT_HASH.match(revision):
1311
+ commit_hash = revision
1369
1312
  else:
1370
- raise ValueError("Force download failed due to the above error.") from head_call_error
1313
+ ref_path = os.path.join(storage_folder, "refs", revision)
1314
+ if os.path.isfile(ref_path):
1315
+ with open(ref_path) as f:
1316
+ commit_hash = f.read()
1371
1317
 
1372
- # Try to get "commit_hash" from "revision"
1373
- commit_hash = None
1374
- if REGEX_COMMIT_HASH.match(revision):
1375
- commit_hash = revision
1376
- else:
1377
- ref_path = os.path.join(storage_folder, "refs", revision)
1378
- if os.path.isfile(ref_path):
1379
- with open(ref_path) as f:
1380
- commit_hash = f.read()
1381
-
1382
- # Return pointer file if exists
1383
- if commit_hash is not None:
1384
- pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
1385
- if os.path.exists(pointer_path):
1386
- if local_dir is not None:
1387
- return _to_local_dir(
1388
- pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
1389
- )
1390
- return pointer_path
1318
+ # Return pointer file if exists
1319
+ if commit_hash is not None:
1320
+ pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
1321
+ if os.path.exists(pointer_path) and not force_download:
1322
+ return pointer_path
1391
1323
 
1392
- # If we couldn't find an appropriate file on disk, raise an error.
1393
- # If files cannot be found and local_files_only=True,
1394
- # the models might've been found if local_files_only=False
1395
- # Notify the user about that
1396
- if local_files_only:
1397
- raise LocalEntryNotFoundError(
1398
- "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
1399
- " hf.co look-ups and downloads online, set 'local_files_only' to False."
1400
- )
1401
- elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
1402
- # Repo not found or gated => let's raise the actual error
1403
- raise head_call_error
1404
- else:
1405
- # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
1406
- raise LocalEntryNotFoundError(
1407
- "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
1408
- " in the local cache. Please check your connection and try again or make sure your Internet connection"
1409
- " is on."
1410
- ) from head_call_error
1411
-
1412
- # From now on, etag and commit_hash are not None.
1324
+ # Otherwise, raise appropriate error
1325
+ _raise_on_head_call_error(head_call_error, force_download, local_files_only)
1326
+
1327
+ # From now on, etag, commit_hash, url and size are not None.
1413
1328
  assert etag is not None, "etag must have been retrieved from server"
1414
1329
  assert commit_hash is not None, "commit_hash must have been retrieved from server"
1330
+ assert url_to_download is not None, "file location must have been retrieved from server"
1331
+ assert expected_size is not None, "expected_size must have been retrieved from server"
1415
1332
  blob_path = os.path.join(storage_folder, "blobs", etag)
1416
1333
  pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
1417
1334
 
1418
1335
  os.makedirs(os.path.dirname(blob_path), exist_ok=True)
1419
1336
  os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
1337
+
1420
1338
  # if passed revision is not identical to commit_hash
1421
1339
  # then revision has to be a branch name or tag name.
1422
1340
  # In that case store a ref.
1423
1341
  _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
1424
1342
 
1425
- if os.path.exists(pointer_path) and not force_download:
1426
- if local_dir is not None:
1427
- return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1428
- return pointer_path
1343
+ # If file already exists, return it (except if force_download=True)
1344
+ if not force_download:
1345
+ if os.path.exists(pointer_path):
1346
+ return pointer_path
1429
1347
 
1430
- if os.path.exists(blob_path) and not force_download:
1431
- # we have the blob already, but not the pointer
1432
- if local_dir is not None: # to local dir
1433
- return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1434
- else: # or in snapshot cache
1348
+ if os.path.exists(blob_path):
1349
+ # we have the blob already, but not the pointer
1435
1350
  _create_symlink(blob_path, pointer_path, new_blob=False)
1436
1351
  return pointer_path
1437
1352
 
@@ -1449,83 +1364,139 @@ def hf_hub_download(
1449
1364
 
1450
1365
  Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
1451
1366
  with WeakFileLock(lock_path):
1452
- # If the download just completed while the lock was activated.
1453
- if os.path.exists(pointer_path) and not force_download:
1454
- # Even if returning early like here, the lock will be released.
1455
- if local_dir is not None:
1456
- return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
1457
- return pointer_path
1458
-
1459
- if resume_download:
1460
- incomplete_path = blob_path + ".incomplete"
1461
-
1462
- @contextmanager
1463
- def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
1464
- with open(incomplete_path, "ab") as f:
1465
- yield f
1367
+ _download_to_tmp_and_move(
1368
+ incomplete_path=Path(blob_path + ".incomplete"),
1369
+ destination_path=Path(blob_path),
1370
+ url_to_download=url_to_download,
1371
+ proxies=proxies,
1372
+ headers=headers,
1373
+ expected_size=expected_size,
1374
+ filename=filename,
1375
+ force_download=force_download,
1376
+ )
1377
+ _create_symlink(blob_path, pointer_path, new_blob=True)
1466
1378
 
1467
- temp_file_manager = _resumable_file_manager
1468
- if os.path.exists(incomplete_path):
1469
- resume_size = os.stat(incomplete_path).st_size
1470
- else:
1471
- resume_size = 0
1472
- else:
1473
- temp_file_manager = partial( # type: ignore
1474
- tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
1475
- )
1476
- resume_size = 0
1379
+ return pointer_path
1477
1380
 
1478
- # Download to temporary file, then copy to cache dir once finished.
1479
- # Otherwise you get corrupt cache entries if the download gets interrupted.
1480
- with temp_file_manager() as temp_file:
1481
- logger.info("downloading %s to %s", url, temp_file.name)
1482
1381
 
1483
- if expected_size is not None: # might be None if HTTP header not set correctly
1484
- # Check tmp path
1485
- _check_disk_space(expected_size, os.path.dirname(temp_file.name))
1382
+ def _hf_hub_download_to_local_dir(
1383
+ *,
1384
+ # Destination
1385
+ local_dir: Union[str, Path],
1386
+ # File info
1387
+ repo_id: str,
1388
+ repo_type: str,
1389
+ filename: str,
1390
+ revision: str,
1391
+ # HTTP info
1392
+ proxies: Optional[Dict],
1393
+ etag_timeout: float,
1394
+ headers: Dict[str, str],
1395
+ endpoint: Optional[str],
1396
+ # Additional options
1397
+ cache_dir: str,
1398
+ force_download: bool,
1399
+ local_files_only: bool,
1400
+ ) -> str:
1401
+ """Download a given file to a local folder, if not already present.
1486
1402
 
1487
- # Check destination
1488
- _check_disk_space(expected_size, os.path.dirname(blob_path))
1489
- if local_dir is not None:
1490
- _check_disk_space(expected_size, local_dir)
1403
+ Method should not be called directly. Please use `hf_hub_download` instead.
1404
+ """
1405
+ local_dir = Path(local_dir)
1406
+ paths = get_local_download_paths(local_dir=local_dir, filename=filename)
1407
+ local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)
1408
+
1409
+ # Local file exists + metadata exists + commit_hash matches => return file
1410
+ if (
1411
+ not force_download
1412
+ and REGEX_COMMIT_HASH.match(revision)
1413
+ and paths.file_path.is_file()
1414
+ and local_metadata is not None
1415
+ and local_metadata.commit_hash == revision
1416
+ ):
1417
+ return str(paths.file_path)
1418
+
1419
+ # Local file doesn't exist or commit_hash doesn't match => we need the etag
1420
+ (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1421
+ repo_id=repo_id,
1422
+ filename=filename,
1423
+ repo_type=repo_type,
1424
+ revision=revision,
1425
+ endpoint=endpoint,
1426
+ proxies=proxies,
1427
+ etag_timeout=etag_timeout,
1428
+ headers=headers,
1429
+ local_files_only=local_files_only,
1430
+ )
1491
1431
 
1492
- http_get(
1493
- url_to_download,
1494
- temp_file,
1495
- proxies=proxies,
1496
- resume_size=resume_size,
1497
- headers=headers,
1498
- expected_size=expected_size,
1499
- displayed_filename=filename,
1432
+ if head_call_error is not None:
1433
+ # No HEAD call but local file exists => default to local file
1434
+ if not force_download and paths.file_path.is_file():
1435
+ logger.warning(
1436
+ f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
1500
1437
  )
1438
+ return str(paths.file_path)
1439
+ # Otherwise => raise
1440
+ _raise_on_head_call_error(head_call_error, force_download, local_files_only)
1501
1441
 
1502
- if local_dir is None:
1503
- logger.debug(f"Storing {url} in cache at {blob_path}")
1504
- _chmod_and_replace(temp_file.name, blob_path)
1505
- _create_symlink(blob_path, pointer_path, new_blob=True)
1506
- else:
1507
- local_dir_filepath = os.path.join(local_dir, relative_filename)
1508
- os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
1509
-
1510
- # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
1511
- # In both cases, blob file is cached.
1512
- is_big_file = os.stat(temp_file.name).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
1513
- if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
1514
- logger.debug(f"Storing {url} in cache at {blob_path}")
1515
- _chmod_and_replace(temp_file.name, blob_path)
1516
- logger.debug("Create symlink to local dir")
1517
- _create_symlink(blob_path, local_dir_filepath, new_blob=False)
1518
- elif local_dir_use_symlinks == "auto" and not is_big_file:
1519
- logger.debug(f"Storing {url} in cache at {blob_path}")
1520
- _chmod_and_replace(temp_file.name, blob_path)
1521
- logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')")
1522
- shutil.copyfile(blob_path, local_dir_filepath)
1523
- else:
1524
- logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
1525
- _chmod_and_replace(temp_file.name, local_dir_filepath)
1526
- pointer_path = local_dir_filepath # for return value
1442
+ # From now on, etag, commit_hash, url and size are not None.
1443
+ assert etag is not None, "etag must have been retrieved from server"
1444
+ assert commit_hash is not None, "commit_hash must have been retrieved from server"
1445
+ assert url_to_download is not None, "file location must have been retrieved from server"
1446
+ assert expected_size is not None, "expected_size must have been retrieved from server"
1447
+
1448
+ # Local file exists => check if it's up-to-date
1449
+ if not force_download and paths.file_path.is_file():
1450
+ # etag matches => update metadata and return file
1451
+ if local_metadata is not None and local_metadata.etag == etag:
1452
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1453
+ return str(paths.file_path)
1454
+
1455
+ # metadata is outdated + etag is a sha256
1456
+ # => means it's an LFS file (large)
1457
+ # => let's compute local hash and compare
1458
+ # => if match, update metadata and return file
1459
+ if local_metadata is None and REGEX_SHA256.match(etag) is not None:
1460
+ with open(paths.file_path, "rb") as f:
1461
+ file_hash = sha_fileobj(f).hex()
1462
+ if file_hash == etag:
1463
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1464
+ return str(paths.file_path)
1465
+
1466
+ # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
1467
+
1468
+ # If we are lucky enough, the file is already in the cache => copy it
1469
+ if not force_download:
1470
+ cached_path = try_to_load_from_cache(
1471
+ repo_id=repo_id,
1472
+ filename=filename,
1473
+ cache_dir=cache_dir,
1474
+ revision=commit_hash,
1475
+ repo_type=repo_type,
1476
+ )
1477
+ if isinstance(cached_path, str):
1478
+ with WeakFileLock(paths.lock_path):
1479
+ paths.file_path.parent.mkdir(parents=True, exist_ok=True)
1480
+ shutil.copyfile(cached_path, paths.file_path)
1481
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1482
+ return str(paths.file_path)
1483
+
1484
+ # Otherwise, let's download the file!
1485
+ with WeakFileLock(paths.lock_path):
1486
+ paths.file_path.unlink(missing_ok=True) # delete outdated file first
1487
+ _download_to_tmp_and_move(
1488
+ incomplete_path=paths.incomplete_path(etag),
1489
+ destination_path=paths.file_path,
1490
+ url_to_download=url_to_download,
1491
+ proxies=proxies,
1492
+ headers=headers,
1493
+ expected_size=expected_size,
1494
+ filename=filename,
1495
+ force_download=force_download,
1496
+ )
1527
1497
 
1528
- return pointer_path
1498
+ write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1499
+ return str(paths.file_path)
1529
1500
 
1530
1501
 
1531
1502
  @validate_hf_hub_args
@@ -1696,6 +1667,233 @@ def get_hf_file_metadata(
1696
1667
  )
1697
1668
 
1698
1669
 
1670
def _get_metadata_or_catch_error(
    *,
    repo_id: str,
    filename: str,
    repo_type: str,
    revision: str,
    endpoint: Optional[str],
    proxies: Optional[Dict],
    etag_timeout: Optional[float],
    headers: Dict[str, str],  # mutated inplace!
    local_files_only: bool,
    relative_filename: Optional[str] = None,  # only used to store `.no_exist` in cache
    storage_folder: Optional[str] = None,  # only used to store `.no_exist` in cache
) -> Union[
    # Either an exception is caught and returned
    Tuple[None, None, None, None, Exception],
    # Or the metadata is returned as
    # `(url_to_download, etag, commit_hash, expected_size, None)`
    Tuple[str, str, str, int, None],
]:
    """Get metadata for a file on the Hub, safely handling network issues.

    The function never hides "hard" errors (SSL/proxy misconfiguration, missing
    revision or entry): those are re-raised. "Soft" errors (connection issues,
    timeouts, offline mode, other HTTP errors, inconsistent metadata) are caught
    and returned so that the caller can fall back to a local file.

    Args:
        repo_id, filename, repo_type, revision, endpoint:
            Forwarded to `hf_hub_url` to build the URL of the file.
        proxies, etag_timeout:
            Forwarded to `get_hf_file_metadata` (as `proxies` and `timeout`).
        headers:
            HTTP headers used for the metadata call. NOTE: mutated inplace! The
            `authorization` header is removed when the server redirects to a
            location hosted on a different domain than the original URL.
        local_files_only:
            If `True`, no request is made and an `OfflineModeIsEnabled` error is
            returned.
        relative_filename, storage_folder:
            Optional. When both are set and the entry does not exist on the Hub,
            an empty file is created under
            `<storage_folder>/.no_exist/<commit_hash>/<relative_filename>` and the
            revision -> commit hash mapping is stored.

    Returns:
        Either `(None, None, None, None, error)` if a soft error has been caught,
        or `(url_to_download, etag, commit_hash, expected_size, None)` on success.

    Raises:
        `requests.exceptions.SSLError`, `requests.exceptions.ProxyError`,
        `RevisionNotFoundError`, `EntryNotFoundError`:
            Re-raised as-is.
        `RuntimeError`:
            If no etag could be determined although no error has been caught.
    """
    if local_files_only:
        return (
            None,
            None,
            None,
            None,
            OfflineModeIsEnabled(
                f"Cannot access file since 'local_files_only=True' has been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
            ),
        )

    url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
    url_to_download: str = url
    etag: Optional[str] = None
    commit_hash: Optional[str] = None
    expected_size: Optional[int] = None
    head_error_call: Optional[Exception] = None

    # Try to get metadata from the server.
    # Do not raise yet if the file is not found or not accessible.
    try:
        try:
            metadata = get_hf_file_metadata(url=url, proxies=proxies, timeout=etag_timeout, headers=headers)
        except EntryNotFoundError as http_error:
            if storage_folder is not None and relative_filename is not None:
                # Cache the non-existence of the file
                commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
                if commit_hash is not None:
                    no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
                    no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
                    no_exist_file_path.touch()
                    _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
            raise

        # Commit hash must exist
        commit_hash = metadata.commit_hash
        if commit_hash is None:
            raise FileMetadataError(
                "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
                " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
                " and proxy settings and make sure your SSL certificates are updated."
            )

        # Etag must exist
        etag = metadata.etag
        if etag is None:
            raise FileMetadataError(
                "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
            )

        # Size must exist
        expected_size = metadata.size
        if expected_size is None:
            raise FileMetadataError("Distant resource does not have a Content-Length.")

        # In case of a redirect, save an extra redirect on the request.get call,
        # and ensure we download the exact atomic version even if it changed
        # between the HEAD and the GET (unlikely, but hey).
        #
        # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
        # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
        if url != metadata.location:
            url_to_download = metadata.location
            if urlparse(url).netloc != urlparse(metadata.location).netloc:
                # Remove authorization header when downloading a LFS blob
                headers.pop("authorization", None)
    except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
        # Actually raise for those subclasses of ConnectionError
        raise
    except (
        requests.exceptions.ConnectionError,
        requests.exceptions.Timeout,
        OfflineModeIsEnabled,
    ) as error:
        # Otherwise, our Internet connection is down.
        head_error_call = error
    except (RevisionNotFoundError, EntryNotFoundError):
        # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
        raise
    except requests.HTTPError as error:
        # Multiple reasons for an http error:
        # - Repository is private and invalid/missing token sent
        # - Repository is gated and invalid/missing token sent
        # - Hub is down (error 500 or 504)
        # => the error is returned so that the caller can check if the files are already cached.
        #    (if it's not the case, the error will be re-raised)
        head_error_call = error
    except FileMetadataError as error:
        # Multiple reasons for a FileMetadataError:
        # - Wrong network configuration (proxy, firewall, SSL certificates)
        # - Inconsistency on the Hub
        # => the error is returned so that the caller can check if the files are already cached.
        #    (if it's not the case, the error will be re-raised)
        head_error_call = error

    if head_error_call is not None:
        # Stick to the documented contract: no partially-filled metadata next to an error.
        return (None, None, None, None, head_error_call)

    # Defensive guard: at this point the metadata call succeeded, so the etag must be set.
    if etag is None:
        raise RuntimeError("etag is empty due to uncovered problems")

    return (url_to_download, etag, commit_hash, expected_size, None)  # type: ignore [return-value]
1801
+
1802
+
1803
+ def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
1804
+ """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
1805
+
1806
+ # No head call => we cannot force download.
1807
+ if force_download:
1808
+ if local_files_only:
1809
+ raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
1810
+ elif isinstance(head_call_error, OfflineModeIsEnabled):
1811
+ raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
1812
+ else:
1813
+ raise ValueError("Force download failed due to the above error.") from head_call_error
1814
+
1815
+ # No head call + couldn't find an appropriate file on disk => raise an error.
1816
+ if local_files_only:
1817
+ raise LocalEntryNotFoundError(
1818
+ "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
1819
+ " hf.co look-ups and downloads online, set 'local_files_only' to False."
1820
+ )
1821
+ elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
1822
+ # Repo not found or gated => let's raise the actual error
1823
+ raise head_call_error
1824
+ else:
1825
+ # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
1826
+ raise LocalEntryNotFoundError(
1827
+ "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
1828
+ " in the local cache. Please check your connection and try again or make sure your Internet connection"
1829
+ " is on."
1830
+ ) from head_call_error
1831
+
1832
+
1833
+ def _download_to_tmp_and_move(
1834
+ incomplete_path: Path,
1835
+ destination_path: Path,
1836
+ url_to_download: str,
1837
+ proxies: Optional[Dict],
1838
+ headers: Dict[str, str],
1839
+ expected_size: Optional[int],
1840
+ filename: str,
1841
+ force_download: bool,
1842
+ ) -> None:
1843
+ """Download content from a URL to a destination path.
1844
+
1845
+ Internal logic:
1846
+ - return early if file is already downloaded
1847
+ - resume download if possible (from incomplete file)
1848
+ - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
1849
+ - check disk space before downloading
1850
+ - download content to a temporary file
1851
+ - set correct permissions on temporary file
1852
+ - move the temporary file to the destination path
1853
+
1854
+ Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
1855
+ """
1856
+ if destination_path.exists() and not force_download:
1857
+ # Do nothing if already exists (except if force_download=True)
1858
+ return
1859
+
1860
+ if incomplete_path.exists() and (force_download or (HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
1861
+ # By default, we will try to resume the download if possible.
1862
+ # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
1863
+ # not resume the download => delete the incomplete file.
1864
+ message = f"Removing incomplete file '{incomplete_path}'"
1865
+ if force_download:
1866
+ message += " (force_download=True)"
1867
+ elif HF_HUB_ENABLE_HF_TRANSFER and not proxies:
1868
+ message += " (hf_transfer=True)"
1869
+ logger.info(message)
1870
+ incomplete_path.unlink(missing_ok=True)
1871
+
1872
+ with incomplete_path.open("ab") as f:
1873
+ resume_size = f.tell()
1874
+ message = f"Downloading '{filename}' to '{incomplete_path}'"
1875
+ if resume_size > 0 and expected_size is not None:
1876
+ message += f" (resume from {resume_size}/{expected_size})"
1877
+ logger.info(message)
1878
+
1879
+ if expected_size is not None: # might be None if HTTP header not set correctly
1880
+ # Check disk space in both tmp and destination path
1881
+ _check_disk_space(expected_size, incomplete_path.parent)
1882
+ _check_disk_space(expected_size, destination_path.parent)
1883
+
1884
+ http_get(
1885
+ url_to_download,
1886
+ f,
1887
+ proxies=proxies,
1888
+ resume_size=resume_size,
1889
+ headers=headers,
1890
+ expected_size=expected_size,
1891
+ )
1892
+
1893
+ logger.info(f"Download complete. Moving file to {destination_path}")
1894
+ _chmod_and_move(incomplete_path, destination_path)
1895
+
1896
+
1699
1897
  def _int_or_none(value: Optional[str]) -> Optional[int]:
1700
1898
  try:
1701
1899
  return int(value) # type: ignore
@@ -1703,7 +1901,7 @@ def _int_or_none(value: Optional[str]) -> Optional[int]:
1703
1901
  return None
1704
1902
 
1705
1903
 
1706
- def _chmod_and_replace(src: str, dst: str) -> None:
1904
+ def _chmod_and_move(src: Path, dst: Path) -> None:
1707
1905
  """Set correct permission before moving a blob from tmp directory to cache dir.
1708
1906
 
1709
1907
  Do not take into account the `umask` from the process as there is no convenient way
@@ -1717,15 +1915,15 @@ def _chmod_and_replace(src: str, dst: str) -> None:
1717
1915
  - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
1718
1916
  """
1719
1917
  # Get umask by creating a temporary file in the cached repo folder.
1720
- tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}"
1918
+ tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
1721
1919
  try:
1722
1920
  tmp_file.touch()
1723
1921
  cache_dir_mode = Path(tmp_file).stat().st_mode
1724
- os.chmod(src, stat.S_IMODE(cache_dir_mode))
1922
+ os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
1725
1923
  finally:
1726
1924
  tmp_file.unlink()
1727
1925
 
1728
- shutil.move(src, dst)
1926
+ shutil.move(str(src), str(dst))
1729
1927
 
1730
1928
 
1731
1929
  def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
@@ -1739,32 +1937,3 @@ def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str
1739
1937
  f" `relative_filename='{relative_filename}'`."
1740
1938
  )
1741
1939
  return pointer_path
1742
-
1743
-
1744
- def _to_local_dir(
1745
- path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]]
1746
- ) -> str:
1747
- """Place a file in a local dir (different than cache_dir).
1748
-
1749
- Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size.
1750
- """
1751
- # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
1752
- local_dir_filepath = os.path.join(local_dir, relative_filename)
1753
- if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents:
1754
- raise ValueError(
1755
- f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local"
1756
- " directory."
1757
- )
1758
-
1759
- os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
1760
- real_blob_path = os.path.realpath(path)
1761
-
1762
- # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
1763
- if use_symlinks == "auto":
1764
- use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
1765
-
1766
- if use_symlinks:
1767
- _create_symlink(real_blob_path, local_dir_filepath, new_blob=False)
1768
- else:
1769
- shutil.copyfile(real_blob_path, local_dir_filepath)
1770
- return local_dir_filepath