huggingface-hub 0.29.0rc2__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +160 -46
- huggingface_hub/_commit_api.py +277 -71
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +33 -22
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +241 -81
- huggingface_hub/_space_api.py +18 -10
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +196 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +15 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +83 -59
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +99 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +606 -346
- huggingface_hub/hf_api.py +2445 -1132
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +61 -66
- huggingface_hub/inference/_client.py +501 -630
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +536 -722
- huggingface_hub/inference/_generated/types/__init__.py +6 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +5 -6
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +77 -31
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +8 -2
- huggingface_hub/inference/_generated/types/image_to_text.py +2 -3
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +11 -11
- huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
- huggingface_hub/inference/_generated/types/text_to_speech.py +1 -2
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +149 -20
- huggingface_hub/inference/_providers/_common.py +160 -37
- huggingface_hub/inference/_providers/black_forest_labs.py +12 -9
- huggingface_hub/inference/_providers/cerebras.py +6 -0
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +32 -0
- huggingface_hub/inference/_providers/fal_ai.py +231 -22
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +22 -1
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +143 -33
- huggingface_hub/inference/_providers/hyperbolic.py +9 -5
- huggingface_hub/inference/_providers/nebius.py +47 -5
- huggingface_hub/inference/_providers/novita.py +48 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +25 -0
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +46 -9
- huggingface_hub/inference/_providers/sambanova.py +37 -1
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +34 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +79 -59
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +27 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +399 -237
- huggingface_hub/utils/_pagination.py +6 -6
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +74 -22
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +13 -11
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +235 -0
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +33 -4
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -82
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -428
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -299
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.29.0rc2.dist-info/RECORD +0 -131
- huggingface_hub-0.29.0rc2.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
huggingface_hub/file_download.py
CHANGED
@@ -1,7 +1,5 @@
-import contextlib
 import copy
 import errno
-import inspect
 import os
 import re
 import shutil
@@ -11,26 +9,20 @@ import uuid
 import warnings
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, BinaryIO,
+from typing import Any, BinaryIO, Literal, NoReturn, Optional, Union, overload
 from urllib.parse import quote, urlparse
 
-import
+import httpx
+from tqdm.auto import tqdm as base_tqdm
 
-from . import
-    __version__,  # noqa: F401 # for backward compatibility
-    constants,
-)
+from . import constants
 from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
-from .constants import (
-    HUGGINGFACE_CO_URL_TEMPLATE,  # noqa: F401 # for backward compatibility
-    HUGGINGFACE_HUB_CACHE,  # noqa: F401 # for backward compatibility
-)
 from .errors import (
-    EntryNotFoundError,
     FileMetadataError,
     GatedRepoError,
     HfHubHTTPError,
     LocalEntryNotFoundError,
+    RemoteEntryNotFoundError,
     RepositoryNotFoundError,
     RevisionNotFoundError,
 )
@@ -38,33 +30,20 @@ from .utils import (
     OfflineModeIsEnabled,
     SoftTemporaryDirectory,
     WeakFileLock,
+    XetFileData,
     build_hf_headers,
-    get_fastai_version,  # noqa: F401 # for backward compatibility
-    get_fastcore_version,  # noqa: F401 # for backward compatibility
-    get_graphviz_version,  # noqa: F401 # for backward compatibility
-    get_jinja_version,  # noqa: F401 # for backward compatibility
-    get_pydot_version,  # noqa: F401 # for backward compatibility
-    get_session,
-    get_tf_version,  # noqa: F401 # for backward compatibility
-    get_torch_version,  # noqa: F401 # for backward compatibility
     hf_raise_for_status,
-    is_fastai_available,  # noqa: F401 # for backward compatibility
-    is_fastcore_available,  # noqa: F401 # for backward compatibility
-    is_graphviz_available,  # noqa: F401 # for backward compatibility
-    is_jinja_available,  # noqa: F401 # for backward compatibility
-    is_pydot_available,  # noqa: F401 # for backward compatibility
-    is_tf_available,  # noqa: F401 # for backward compatibility
-    is_torch_available,  # noqa: F401 # for backward compatibility
     logging,
-
+    parse_xet_file_data_from_response,
+    refresh_xet_connection_info,
     tqdm,
     validate_hf_hub_args,
 )
-from .utils._http import _adjust_range_header
-from .utils._runtime import
+from .utils._http import _adjust_range_header, http_backoff, http_stream_backoff
+from .utils._runtime import is_xet_available
 from .utils._typing import HTTP_METHOD_T
 from .utils.sha import sha_fileobj
-from .utils.tqdm import
+from .utils.tqdm import _get_progress_bar_context
 
 
 logger = logging.get_logger(__name__)
@@ -82,7 +61,7 @@ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
 # Regex to check if the file etag IS a valid sha256
 REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
 
-_are_symlinks_supported_in_dir:
+_are_symlinks_supported_in_dir: dict[str, bool] = {}
 
 
 def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
@@ -160,12 +139,43 @@ class HfFileMetadata:
         size (`size`):
             Size of the file. In case of an LFS file, contains the size of the actual
             LFS file, not the pointer.
+        xet_file_data (`XetFileData`, *optional*):
+            Xet information for the file. This is only set if the file is stored using Xet storage.
     """
 
     commit_hash: Optional[str]
     etag: Optional[str]
     location: str
     size: Optional[int]
+    xet_file_data: Optional[XetFileData]
+
+
+@dataclass
+class DryRunFileInfo:
+    """Information returned when performing a dry run of a file download.
+
+    Returned by [`hf_hub_download`] when `dry_run=True`.
+
+    Args:
+        commit_hash (`str`):
+            The commit_hash related to the file.
+        file_size (`int`):
+            Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
+        filename (`str`):
+            Name of the file in the repo.
+        is_cached (`bool`):
+            Whether the file is already cached locally.
+        will_download (`bool`):
+            Whether the file will be downloaded if `hf_hub_download` is called with `dry_run=False`.
+            In practice, will_download is `True` if the file is not cached or if `force_download=True`.
+    """
+
+    commit_hash: str
+    file_size: int
+    filename: str
+    local_path: str
+    is_cached: bool
+    will_download: bool
 
 
 @validate_hf_hub_args
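The `DryRunFileInfo` dataclass above backs the new `dry_run` flag that this diff adds to `hf_hub_download` further down. A minimal usage sketch; the repo and filename are placeholder values, not taken from this diff:

```python
# Sketch only: assumes huggingface_hub 1.1.3 is installed; "gpt2"/"config.json" are placeholders.
from huggingface_hub import hf_hub_download

info = hf_hub_download("gpt2", "config.json", dry_run=True)
# Nothing is downloaded yet; `info` is a DryRunFileInfo describing what *would* happen.
print(info.filename, info.file_size, info.is_cached, info.will_download)

if info.will_download:
    # The same call without dry_run performs the download and returns the local path.
    local_path = hf_hub_download("gpt2", "config.json")
```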
@@ -210,26 +220,23 @@ def hf_hub_url(
     'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
     ```
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            if stored in git, or its sha256 if stored in git-lfs.
-
-    </Tip>
+    > [!TIP]
+    > Notes:
+    >
+    > Cloudfront is replicated over the globe so downloads are way faster for
+    > the end user (and it also lowers our bandwidth costs).
+    >
+    > Cloudfront aggressively caches files by default (default TTL is 24
+    > hours), however this is not an issue here because we implement a
+    > git-based versioning system on huggingface.co, which means that we store
+    > the files on S3/Cloudfront in a content-addressable way (i.e., the file
+    > name is its hash). Using content-addressable filenames means cache can't
+    > ever be stale.
+    >
+    > In terms of client-side caching from this library, we base our caching
+    > on the objects' entity tag (`ETag`), which is an identifier of a
+    > specific version of a resource [1]_. An object's ETag is: its git-sha1
+    > if stored in git, or its sha256 if stored in git-lfs.
 
     References:
 
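The tip above describes the ETag-based client-side caching model (git-sha1 for regular files, sha256 for LFS files). As an illustrative sketch of that idea only, independent of the library's internals, a cached copy can be validated against the server's ETag with a plain HEAD request:

```python
# Illustrative sketch, not huggingface_hub code; assumes `httpx` is installed
# and `url`/`cached_etag` are values the caller already has.
import httpx

def is_cache_stale(url: str, cached_etag: str) -> bool:
    """Return True when the server advertises a different ETag than the one cached locally."""
    response = httpx.head(url, follow_redirects=True)
    response.raise_for_status()
    remote_etag = response.headers.get("ETag", "").removeprefix("W/").strip('"')
    return remote_etag != cached_etag.strip('"')
```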
@@ -248,7 +255,7 @@ def hf_hub_url(
 
     if revision is None:
         revision = constants.DEFAULT_REVISION
-    url = HUGGINGFACE_CO_URL_TEMPLATE.format(
+    url = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
         repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
     )
     # Update endpoint if provided
@@ -257,63 +264,92 @@ def hf_hub_url(
     return url
 
 
-def
-
-
-
-
+def _httpx_follow_relative_redirects(method: HTTP_METHOD_T, url: str, **httpx_kwargs) -> httpx.Response:
+    """Perform an HTTP request with backoff and follow relative redirects only.
+
+    This is useful to follow a redirection to a renamed repository without following redirection to a CDN.
+
+    A backoff mechanism retries the HTTP call on 5xx errors and network errors.
 
     Args:
         method (`str`):
             HTTP method, such as 'GET' or 'HEAD'.
         url (`str`):
             The URL of the resource to fetch.
-
-
-            kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
-            following redirection to a CDN.
-        **params (`dict`, *optional*):
-            Params to pass to `requests.request`.
+        **httpx_kwargs (`dict`, *optional*):
+            Params to pass to `httpx.request`.
     """
-
-
-    response =
+    while True:
+        # Make the request
+        response = http_backoff(
             method=method,
             url=url,
-
-
+            **httpx_kwargs,
+            follow_redirects=False,
+            retry_on_exceptions=(),
+            retry_on_status_codes=(429,),
         )
+        hf_raise_for_status(response)
 
-        #
-        # This is useful in case of a renamed repository.
+        # Check if response is a relative redirect
         if 300 <= response.status_code <= 399:
             parsed_target = urlparse(response.headers["Location"])
             if parsed_target.netloc == "":
-                #
-
-
-
-
-
-
-                return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
-        return response
-
-    # Perform request and return if status_code is not in the retry list.
-    response = get_session().request(method=method, url=url, **params)
-    hf_raise_for_status(response)
+                # Relative redirect -> update URL and retry
+                url = urlparse(url)._replace(path=parsed_target.path).geturl()
+                continue
+
+        # Break if no relative redirect
+        break
+
     return response
 
 
+def _get_file_length_from_http_response(response: httpx.Response) -> Optional[int]:
+    """
+    Get the length of the file from the HTTP response headers.
+
+    This function extracts the file size from the HTTP response headers, either from the
+    `Content-Range` or `Content-Length` header, if available (in that order).
+
+    Args:
+        response (`httpx.Response`):
+            The HTTP response object.
+
+    Returns:
+        `int` or `None`: The length of the file in bytes, or None if not available.
+    """
+
+    # If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
+    # contain the length of the compressed body, not the uncompressed file size.
+    # And at the start of transmission there's no way to know the uncompressed file size for gzip,
+    # thus we return None in that case.
+    content_encoding = response.headers.get("Content-Encoding", "identity").lower()
+    if content_encoding != "identity":
+        # gzip/br/deflate/zstd etc
+        return None
+
+    content_range = response.headers.get("Content-Range")
+    if content_range is not None:
+        return int(content_range.rsplit("/")[-1])
+
+    content_length = response.headers.get("Content-Length")
+    if content_length is not None:
+        return int(content_length)
+
+    return None
+
+
+@validate_hf_hub_args
 def http_get(
     url: str,
     temp_file: BinaryIO,
     *,
-    proxies: Optional[Dict] = None,
     resume_size: int = 0,
-    headers: Optional[
+    headers: Optional[dict[str, Any]] = None,
     expected_size: Optional[int] = None,
     displayed_filename: Optional[str] = None,
+    tqdm_class: Optional[type[base_tqdm]] = None,
     _nb_retries: int = 5,
     _tqdm_bar: Optional[tqdm] = None,
 ) -> None:
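The new `_get_file_length_from_http_response` helper above prefers `Content-Range` over `Content-Length` and gives up when the body is compressed. The same precedence rule, shown standalone on a plain header dict (a sketch, not the library function):

```python
# Standalone illustration of the header precedence described above.
from typing import Optional

def file_size_from_headers(headers: dict[str, str]) -> Optional[int]:
    if headers.get("Content-Encoding", "identity").lower() != "identity":
        return None  # compressed body: Content-Length is not the file size
    if "Content-Range" in headers:
        # e.g. "bytes 100-1023/1024" -> total size after the final "/"
        return int(headers["Content-Range"].rsplit("/", 1)[-1])
    if "Content-Length" in headers:
        return int(headers["Content-Length"])
    return None

assert file_size_from_headers({"Content-Range": "bytes 0-99/1024"}) == 1024
assert file_size_from_headers({"Content-Length": "512"}) == 512
assert file_size_from_headers({"Content-Encoding": "gzip", "Content-Length": "512"}) is None
```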
@@ -329,8 +365,6 @@ def http_get(
             The URL of the file to download.
         temp_file (`BinaryIO`):
             The file-like object where to save the file.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
         resume_size (`int`, *optional*):
             The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
             positive number, the download will resume at the given position.
@@ -347,137 +381,85 @@ def http_get(
         # If the file is already fully downloaded, we don't need to download it again.
         return
 
-    hf_transfer = None
-    if constants.HF_HUB_ENABLE_HF_TRANSFER:
-        if resume_size != 0:
-            warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
-        elif proxies is not None:
-            warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
-        else:
-            try:
-                import hf_transfer  # type: ignore[no-redef]
-            except ImportError:
-                raise ValueError(
-                    "Fast download using 'hf_transfer' is enabled"
-                    " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
-                    " available in your environment. Try `pip install hf_transfer`."
-                )
-
     initial_headers = headers
     headers = copy.deepcopy(headers) or {}
     if resume_size > 0:
         headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
+    elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
+        # Any files over 50GB will not be available through basic http requests.
+        raise ValueError(
+            "The file is too large to be downloaded using the regular download method. "
+            " Install `hf_xet` with `pip install hf_xet` for xet-powered downloads."
+        )
 
-
-        method="GET",
-
-
-
-
-
-
-
-
-
-        displayed_filename
-
-
-
-
-
-
-
-
-
-        displayed_filename
-
-
-
-
-
-
-
-
-
-
-        unit="B",
-        unit_scale=True,
+    with http_stream_backoff(
+        method="GET",
+        url=url,
+        headers=headers,
+        timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
+        retry_on_exceptions=(),
+        retry_on_status_codes=(429,),
+    ) as response:
+        hf_raise_for_status(response)
+        total: Optional[int] = _get_file_length_from_http_response(response)
+
+        if displayed_filename is None:
+            displayed_filename = url
+            content_disposition = response.headers.get("Content-Disposition")
+            if content_disposition is not None:
+                match = HEADER_FILENAME_PATTERN.search(content_disposition)
+                if match is not None:
+                    # Means file is on CDN
+                    displayed_filename = match.groupdict()["filename"]
+
+        # Truncate filename if too long to display
+        if len(displayed_filename) > 40:
+            displayed_filename = f"(…){displayed_filename[-40:]}"
+
+        consistency_error_message = (
+            f"Consistency check failed: file should be of size {expected_size} but has size"
+            f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
+            " Please retry with `force_download=True`."
+        )
+        progress_cm = _get_progress_bar_context(
+            desc=displayed_filename,
+            log_level=logger.getEffectiveLevel(),
             total=total,
             initial=resume_size,
-            desc=displayed_filename,
-            disable=is_tqdm_disabled(logger.getEffectiveLevel()),
             name="huggingface_hub.http_get",
+            tqdm_class=tqdm_class,
+            _tqdm_bar=_tqdm_bar,
         )
-        if _tqdm_bar is None
-        else contextlib.nullcontext(_tqdm_bar)
-        # ^ `contextlib.nullcontext` mimics a context manager that does nothing
-        # Makes it easier to use the same code path for both cases but in the later
-        # case, the progress bar is not closed when exiting the context manager.
-    )
 
-
-
-            supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
-            if not supports_callback:
-                warnings.warn(
-                    "You are using an outdated version of `hf_transfer`. "
-                    "Consider upgrading to latest version to enable progress bars "
-                    "using `pip install -U hf_transfer`."
-                )
+        with progress_cm as progress:
+            new_resume_size = resume_size
             try:
-
+                for chunk in response.iter_bytes(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+                    if chunk:  # filter out keep-alive new chunks
+                        progress.update(len(chunk))
+                        temp_file.write(chunk)
+                        new_resume_size += len(chunk)
+                        # Some data has been downloaded from the server so we reset the number of retries.
+                        _nb_retries = 5
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+                # a transient error (network outage?). We log a warning message and try to resume the download a few times
+                # before giving up. Tre retry mechanism is basic but should be enough in most cases.
+                if _nb_retries <= 0:
+                    logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+                    raise
+                logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+                time.sleep(1)
+                return http_get(
                     url=url,
-
-
-
-
-
-
-
+                    temp_file=temp_file,
+                    resume_size=new_resume_size,
+                    headers=initial_headers,
+                    expected_size=expected_size,
+                    tqdm_class=tqdm_class,
+                    _nb_retries=_nb_retries - 1,
+                    _tqdm_bar=_tqdm_bar,
                 )
-            except Exception as e:
-                raise RuntimeError(
-                    "An error occurred while downloading using `hf_transfer`. Consider"
-                    " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
-                ) from e
-            if not supports_callback:
-                progress.update(total)
-            if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
-                raise EnvironmentError(
-                    consistency_error_message.format(
-                        actual_size=os.path.getsize(temp_file.name),
-                    )
-                )
-            return
-        new_resume_size = resume_size
-        try:
-            for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
-                if chunk:  # filter out keep-alive new chunks
-                    progress.update(len(chunk))
-                    temp_file.write(chunk)
-                    new_resume_size += len(chunk)
-                    # Some data has been downloaded from the server so we reset the number of retries.
-                    _nb_retries = 5
-        except (requests.ConnectionError, requests.ReadTimeout) as e:
-            # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
-            # a transient error (network outage?). We log a warning message and try to resume the download a few times
-            # before giving up. Tre retry mechanism is basic but should be enough in most cases.
-            if _nb_retries <= 0:
-                logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
-                raise
-            logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
-            time.sleep(1)
-            reset_sessions()  # In case of SSLError it's best to reset the shared requests.Session objects
-            return http_get(
-                url=url,
-                temp_file=temp_file,
-                proxies=proxies,
-                resume_size=new_resume_size,
-                headers=initial_headers,
-                expected_size=expected_size,
-                _nb_retries=_nb_retries - 1,
-                _tqdm_bar=_tqdm_bar,
-            )
 
     if expected_size is not None and expected_size != temp_file.tell():
         raise EnvironmentError(
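The reworked `http_get` above streams the body with httpx and resumes interrupted downloads by re-issuing the request with a `Range` header. A stripped-down sketch of that pattern against plain httpx; `url` and `dest` are placeholders, and the real function adds backoff, progress bars and a final size consistency check:

```python
import os
import httpx

def download_with_resume(url: str, dest: str, chunk_size: int = 1024 * 1024) -> None:
    # Resume from however many bytes are already on disk (a production version
    # should also confirm the server answered 206 Partial Content before appending).
    resume_size = os.path.getsize(dest) if os.path.exists(dest) else 0
    headers = {"Range": f"bytes={resume_size}-"} if resume_size else {}
    with httpx.stream("GET", url, headers=headers, follow_redirects=True) as response:
        response.raise_for_status()
        with open(dest, "ab" if resume_size else "wb") as f:
            for chunk in response.iter_bytes(chunk_size=chunk_size):
                if chunk:
                    f.write(chunk)
```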
@@ -487,6 +469,114 @@ def http_get(
         )
 
 
+def xet_get(
+    *,
+    incomplete_path: Path,
+    xet_file_data: XetFileData,
+    headers: dict[str, str],
+    expected_size: Optional[int] = None,
+    displayed_filename: Optional[str] = None,
+    tqdm_class: Optional[type[base_tqdm]] = None,
+    _tqdm_bar: Optional[tqdm] = None,
+) -> None:
+    """
+    Download a file using Xet storage service.
+
+    Args:
+        incomplete_path (`Path`):
+            The path to the file to download.
+        xet_file_data (`XetFileData`):
+            The file metadata needed to make the request to the xet storage service.
+        headers (`dict[str, str]`):
+            The headers to send to the xet storage service.
+        expected_size (`int`, *optional*):
+            The expected size of the file to download. If set, the download will raise an error if the size of the
+            received content is different from the expected one.
+        displayed_filename (`str`, *optional*):
+            The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
+            not set, the filename is guessed from the URL or the `Content-Disposition` header.
+
+    **How it works:**
+        The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
+        for efficient storage and transfer.
+
+        `hf_xet.download_files` manages downloading files by:
+        - Taking a list of files to download (each with its unique content hash)
+        - Connecting to a storage server (CAS server) that knows how files are chunked
+        - Using authentication to ensure secure access
+        - Providing progress updates during download
+
+        Authentication works by regularly refreshing access tokens through `refresh_xet_connection_info` to maintain a valid
+        connection to the storage server.
+
+        The download process works like this:
+        1. Create a local cache folder at `~/.cache/huggingface/xet/chunk-cache` to store reusable file chunks
+        2. Download files in parallel:
+            2.1. Prepare to write the file to disk
+            2.2. Ask the server "how is this file split into chunks?" using the file's unique hash
+                The server responds with:
+                - Which chunks make up the complete file
+                - Where each chunk can be downloaded from
+            2.3. For each needed chunk:
+                - Checks if we already have it in our local cache
+                - If not, download it from cloud storage (S3)
+                - Save it to cache for future use
+                - Assemble the chunks in order to recreate the original file
+
+    """
+    try:
+        from hf_xet import PyXetDownloadInfo, download_files  # type: ignore[no-redef]
+    except ImportError:
+        raise ValueError(
+            "To use optimized download using Xet storage, you need to install the hf_xet package. "
+            'Try `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.'
+        )
+
+    connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
+
+    def token_refresher() -> tuple[str, int]:
+        connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
+        if connection_info is None:
+            raise ValueError("Failed to refresh token using xet metadata.")
+        return connection_info.access_token, connection_info.expiration_unix_epoch
+
+    xet_download_info = [
+        PyXetDownloadInfo(
+            destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
+        )
+    ]
+
+    if not displayed_filename:
+        displayed_filename = incomplete_path.name
+
+    # Truncate filename if too long to display
+    if len(displayed_filename) > 40:
+        displayed_filename = f"{displayed_filename[:40]}(…)"
+
+    progress_cm = _get_progress_bar_context(
+        desc=displayed_filename,
+        log_level=logger.getEffectiveLevel(),
+        total=expected_size,
+        initial=0,
+        name="huggingface_hub.xet_get",
+        tqdm_class=tqdm_class,
+        _tqdm_bar=_tqdm_bar,
+    )
+
+    with progress_cm as progress:
+
+        def progress_updater(progress_bytes: float):
+            progress.update(progress_bytes)
+
+        download_files(
+            xet_download_info,
+            endpoint=connection_info.endpoint,
+            token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
+            token_refresher=token_refresher,
+            progress_updater=[progress_updater],
+        )
+
+
 def _normalize_etag(etag: Optional[str]) -> Optional[str]:
     """Normalize ETag HTTP header, so it can be used to create nice filepaths.
 
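The `xet_get` docstring above describes chunk-level downloads authenticated with short-lived tokens refreshed through a callback. A generic sketch of that refresher pattern; the names below are illustrative and are not the `hf_xet` API:

```python
import time
from typing import Callable, Optional, Tuple

def make_token_refresher(fetch_token: Callable[[], Tuple[str, int]]) -> Callable[[], Tuple[str, int]]:
    """Wrap a token fetcher so callers always receive a non-expired (token, expiry) pair."""
    cached: Optional[Tuple[str, int]] = None

    def refresher() -> Tuple[str, int]:
        nonlocal cached
        # Refresh slightly before the advertised expiry to avoid races.
        if cached is None or cached[1] <= int(time.time()) + 30:
            cached = fetch_token()
        return cached

    return refresher

# Example with a dummy fetcher that issues one-hour tokens:
refresher = make_token_refresher(lambda: ("dummy-token", int(time.time()) + 3600))
token, expires_at = refresher()
```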
@@ -601,10 +691,10 @@ def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
 
     # Symlinks are not supported => let's move or copy the file.
     if new_blob:
-        logger.
+        logger.debug(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
         shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
     else:
-        logger.
+        logger.debug(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
         shutil.copyfile(abs_src, abs_dst)
 
 
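The hunk above logs and falls back to moving or copying the blob when symlinks are unavailable. A hedged, standalone sketch of the same symlink-with-fallback idea, unrelated to the library's internal `_create_symlink`:

```python
import os
import shutil

def link_or_copy(src: str, dst: str) -> None:
    """Prefer a symlink; fall back to copying when the filesystem forbids symlinks."""
    try:
        os.symlink(os.path.abspath(src), dst)
    except (OSError, NotImplementedError):
        shutil.copyfile(src, dst)
```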
@@ -660,6 +750,78 @@ def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
     pass
 
 
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    tqdm_class: Optional[type[base_tqdm]] = None,
+    dry_run: Literal[False] = False,
+) -> str: ...
+
+
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    tqdm_class: Optional[type[base_tqdm]] = None,
+    dry_run: Literal[True] = True,
+) -> DryRunFileInfo: ...
+
+
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    tqdm_class: Optional[type[base_tqdm]] = None,
+    dry_run: bool = False,
+) -> Union[str, DryRunFileInfo]: ...
+
+
 @validate_hf_hub_args
 def hf_hub_download(
     repo_id: str,
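The three `@overload` declarations above let type checkers resolve the return type from the literal value of `dry_run`. The same pattern on a toy function, as a sketch:

```python
from typing import Literal, Union, overload

@overload
def fetch(name: str, *, dry_run: Literal[False] = False) -> str: ...
@overload
def fetch(name: str, *, dry_run: Literal[True]) -> dict: ...
@overload
def fetch(name: str, *, dry_run: bool = False) -> Union[str, dict]: ...

def fetch(name: str, *, dry_run: bool = False) -> Union[str, dict]:
    if dry_run:
        return {"name": name, "would_download": True}
    return f"/local/path/{name}"

path: str = fetch("config.json")                 # checker narrows the result to str
info: dict = fetch("config.json", dry_run=True)  # checker narrows the result to dict
```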
@@ -672,18 +834,16 @@ def hf_hub_download(
     library_version: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
     local_dir: Union[str, Path, None] = None,
-    user_agent: Union[
+    user_agent: Union[dict, str, None] = None,
     force_download: bool = False,
-    proxies: Optional[Dict] = None,
     etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
-    headers: Optional[
+    headers: Optional[dict[str, str]] = None,
     endpoint: Optional[str] = None,
-
-
-
-) -> str:
+    tqdm_class: Optional[type[base_tqdm]] = None,
+    dry_run: bool = False,
+) -> Union[str, DryRunFileInfo]:
     """Download a given file if it's not already present in the local cache.
 
     The new cache file layout looks like this:
@@ -745,9 +905,6 @@ def hf_hub_download(
         force_download (`bool`, *optional*, defaults to `False`):
             Whether the file should be downloaded even if it already exists in
             the local cache.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         etag_timeout (`float`, *optional*, defaults to `10`):
             When fetching ETag, how many seconds to wait for the server to send
             data before giving up which is passed to `requests.request`.
@@ -761,9 +918,19 @@ def hf_hub_download(
             local cached file if it exists.
         headers (`dict`, *optional*):
             Additional headers to be sent with the request.
+        tqdm_class (`tqdm`, *optional*):
+            If provided, overwrites the default behavior for the progress bar. Passed
+            argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
+            Defaults to the custom HF progress bar that can be disabled by setting
+            `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
+        dry_run (`bool`, *optional*, defaults to `False`):
+            If `True`, perform a dry run without actually downloading the file. Returns a
+            [`DryRunFileInfo`] object containing information about what would be downloaded.
 
     Returns:
-        `str
+        `str` or [`DryRunFileInfo`]:
+            - If `dry_run=False`: Local path of file or if networking is off, last version of file cached on disk.
+            - If `dry_run=True`: A [`DryRunFileInfo`] object containing download information.
 
     Raises:
         [`~utils.RepositoryNotFoundError`]
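The new `tqdm_class` argument documented above accepts any class that inherits from `tqdm.auto.tqdm`. A hedged sketch of passing a custom progress-bar class; the repo and filename are placeholders and the subclass simply silences the live bar:

```python
from tqdm.auto import tqdm as base_tqdm
from huggingface_hub import hf_hub_download

class NoBarTqdm(base_tqdm):
    """Disable live rendering while keeping tqdm's accounting intact."""
    def __init__(self, *args, **kwargs):
        kwargs["disable"] = True
        super().__init__(*args, **kwargs)

path = hf_hub_download("gpt2", "config.json", tqdm_class=NoBarTqdm)
```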
@@ -771,7 +938,7 @@ def hf_hub_download(
             or because it is set to `private` and you do not have access.
         [`~utils.RevisionNotFoundError`]
             If the revision to download from cannot be found.
-        [`~utils.
+        [`~utils.RemoteEntryNotFoundError`]
             If the file to download cannot be found.
         [`~utils.LocalEntryNotFoundError`]
             If network is disabled or unavailable and file is not found in cache.
@@ -787,20 +954,6 @@ def hf_hub_download(
         # Respect environment variable above user value
         etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
 
-    if force_filename is not None:
-        warnings.warn(
-            "The `force_filename` parameter is deprecated as a new caching system, "
-            "which keeps the filenames as they are on the Hub, is now in place.",
-            FutureWarning,
-        )
-    if resume_download is not None:
-        warnings.warn(
-            "`resume_download` is deprecated and will be removed in version 1.0.0. "
-            "Downloads always resume when possible. "
-            "If you want to force a new download, use `force_download=True`.",
-            FutureWarning,
-        )
-
     if cache_dir is None:
         cache_dir = constants.HF_HUB_CACHE
     if revision is None:
@@ -830,15 +983,6 @@ def hf_hub_download(
     )
 
     if local_dir is not None:
-        if local_dir_use_symlinks != "auto":
-            warnings.warn(
-                "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
-                "The process to download files to a local folder has been updated and do "
-                "not rely on symlinks anymore. You only need to pass a destination folder "
-                "as`local_dir`.\n"
-                "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
-            )
-
         return _hf_hub_download_to_local_dir(
             # Destination
             local_dir=local_dir,
@@ -851,12 +995,13 @@ def hf_hub_download(
             endpoint=endpoint,
             etag_timeout=etag_timeout,
             headers=hf_headers,
-            proxies=proxies,
             token=token,
             # Additional options
             cache_dir=cache_dir,
             force_download=force_download,
             local_files_only=local_files_only,
+            tqdm_class=tqdm_class,
+            dry_run=dry_run,
         )
     else:
         return _hf_hub_download_to_cache_dir(
@@ -871,11 +1016,12 @@ def hf_hub_download(
             endpoint=endpoint,
             etag_timeout=etag_timeout,
             headers=hf_headers,
-            proxies=proxies,
             token=token,
             # Additional options
             local_files_only=local_files_only,
             force_download=force_download,
+            tqdm_class=tqdm_class,
+            dry_run=dry_run,
         )
 
 
@@ -891,13 +1037,14 @@ def _hf_hub_download_to_cache_dir(
     # HTTP info
     endpoint: Optional[str],
     etag_timeout: float,
-    headers:
-    proxies: Optional[Dict],
+    headers: dict[str, str],
     token: Optional[Union[bool, str]],
     # Additional options
     local_files_only: bool,
     force_download: bool,
-
+    tqdm_class: Optional[type[base_tqdm]],
+    dry_run: bool,
+) -> Union[str, DryRunFileInfo]:
     """Download a given file to a cache folder, if not already present.
 
     Method should not be called directly. Please use `hf_hub_download` instead.
@@ -905,7 +1052,7 @@ def _hf_hub_download_to_cache_dir(
     locks_dir = os.path.join(cache_dir, ".locks")
     storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
 
-    # cross
+    # cross-platform transcription of filename, to be used as a local file path.
     relative_filename = os.path.join(*filename.split("/"))
     if os.name == "nt":
         if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
@@ -917,18 +1064,27 @@ def _hf_hub_download_to_cache_dir(
     # if user provides a commit_hash and they already have the file on disk, shortcut everything.
     if REGEX_COMMIT_HASH.match(revision):
         pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
-        if os.path.exists(pointer_path)
-
+        if os.path.exists(pointer_path):
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=revision,
+                    file_size=os.path.getsize(pointer_path),
+                    filename=filename,
+                    is_cached=True,
+                    local_path=pointer_path,
+                    will_download=force_download,
+                )
+            if not force_download:
+                return pointer_path
 
     # Try to get metadata (etag, commit_hash, url, size) from the server.
     # If we can't, a HEAD request error is returned.
-    (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
+    (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
         repo_id=repo_id,
         filename=filename,
         repo_type=repo_type,
         revision=revision,
         endpoint=endpoint,
-        proxies=proxies,
         etag_timeout=etag_timeout,
         headers=headers,
         token=token,
@@ -962,8 +1118,18 @@ def _hf_hub_download_to_cache_dir(
     # Return pointer file if exists
     if commit_hash is not None:
         pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
-        if os.path.exists(pointer_path)
-
+        if os.path.exists(pointer_path):
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=os.path.getsize(pointer_path),
+                    filename=filename,
+                    is_cached=True,
+                    local_path=pointer_path,
+                    will_download=force_download,
+                )
+            if not force_download:
+                return pointer_path
 
     # Otherwise, raise appropriate error
     _raise_on_head_call_error(head_call_error, force_download, local_files_only)
@@ -976,6 +1142,17 @@ def _hf_hub_download_to_cache_dir(
     blob_path = os.path.join(storage_folder, "blobs", etag)
     pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
 
+    if dry_run:
+        is_cached = os.path.exists(pointer_path) or os.path.exists(blob_path)
+        return DryRunFileInfo(
+            commit_hash=commit_hash,
+            file_size=expected_size,
+            filename=filename,
+            is_cached=is_cached,
+            local_path=pointer_path,
+            will_download=force_download or not is_cached,
+        )
+
     os.makedirs(os.path.dirname(blob_path), exist_ok=True)
     os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
 
@@ -984,39 +1161,53 @@ def _hf_hub_download_to_cache_dir(
     # In that case store a ref.
     _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
 
-    # If file already exists, return it (except if force_download=True)
-    if not force_download:
-        if os.path.exists(pointer_path):
-            return pointer_path
-
-        if os.path.exists(blob_path):
-            # we have the blob already, but not the pointer
-            _create_symlink(blob_path, pointer_path, new_blob=False)
-            return pointer_path
-
     # Prevent parallel downloads of the same file with a lock.
     # etag could be duplicated across repos,
     lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
 
     # Some Windows versions do not allow for paths longer than 255 characters.
     # In this case, we must specify it as an extended path by using the "\\?\" prefix.
-    if
+    if (
+        os.name == "nt"
+        and len(os.path.abspath(lock_path)) > 255
+        and not os.path.abspath(lock_path).startswith("\\\\?\\")
+    ):
         lock_path = "\\\\?\\" + os.path.abspath(lock_path)
 
-    if
+    if (
+        os.name == "nt"
+        and len(os.path.abspath(blob_path)) > 255
+        and not os.path.abspath(blob_path).startswith("\\\\?\\")
+    ):
         blob_path = "\\\\?\\" + os.path.abspath(blob_path)
 
     Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
+
+    # pointer already exists -> immediate return
+    if not force_download and os.path.exists(pointer_path):
+        return pointer_path
+
+    # Blob exists but pointer must be (safely) created -> take the lock
+    if not force_download and os.path.exists(blob_path):
+        with WeakFileLock(lock_path):
+            if not os.path.exists(pointer_path):
+                _create_symlink(blob_path, pointer_path, new_blob=False)
+            return pointer_path
+
+    # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
+
     with WeakFileLock(lock_path):
         _download_to_tmp_and_move(
             incomplete_path=Path(blob_path + ".incomplete"),
            destination_path=Path(blob_path),
            url_to_download=url_to_download,
-            proxies=proxies,
            headers=headers,
            expected_size=expected_size,
            filename=filename,
            force_download=force_download,
+            etag=etag,
+            xet_file_data=xet_file_data,
+            tqdm_class=tqdm_class,
        )
        if not os.path.exists(pointer_path):
            _create_symlink(blob_path, pointer_path, new_blob=True)
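The hunk above expands the Windows long-path workaround: absolute paths longer than 255 characters get the extended-length prefix so Win32 file APIs accept them. A small standalone sketch of that rule (an illustrative helper, not library code):

```python
import os

def windows_long_path(path: str) -> str:
    """Prefix over-long absolute paths with the Windows extended-length marker when needed."""
    abs_path = os.path.abspath(path)
    if os.name == "nt" and len(abs_path) > 255 and not abs_path.startswith("\\\\?\\"):
        return "\\\\?\\" + abs_path
    return abs_path
```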
@@ -1036,14 +1227,15 @@ def _hf_hub_download_to_local_dir(
|
|
|
1036
1227
|
# HTTP info
|
|
1037
1228
|
endpoint: Optional[str],
|
|
1038
1229
|
etag_timeout: float,
|
|
1039
|
-
headers:
|
|
1040
|
-
proxies: Optional[Dict],
|
|
1230
|
+
headers: dict[str, str],
|
|
1041
1231
|
token: Union[bool, str, None],
|
|
1042
1232
|
# Additional options
|
|
1043
1233
|
cache_dir: str,
|
|
1044
1234
|
force_download: bool,
|
|
1045
1235
|
local_files_only: bool,
|
|
1046
|
-
|
|
1236
|
+
tqdm_class: Optional[type[base_tqdm]],
|
|
1237
|
+
dry_run: bool,
|
|
1238
|
+
) -> Union[str, DryRunFileInfo]:
|
|
1047
1239
|
"""Download a given file to a local folder, if not already present.
|
|
1048
1240
|
|
|
1049
1241
|
Method should not be called directly. Please use `hf_hub_download` instead.
|
|
@@ -1058,22 +1250,31 @@ def _hf_hub_download_to_local_dir(
|
|
|
1058
1250
|
|
|
1059
1251
|
# Local file exists + metadata exists + commit_hash matches => return file
|
|
1060
1252
|
if (
|
|
1061
|
-
|
|
1062
|
-
and REGEX_COMMIT_HASH.match(revision)
|
|
1253
|
+
REGEX_COMMIT_HASH.match(revision)
|
|
1063
1254
|
and paths.file_path.is_file()
|
|
1064
1255
|
and local_metadata is not None
|
|
1065
1256
|
and local_metadata.commit_hash == revision
|
|
1066
1257
|
):
|
|
1067
|
-
|
|
1258
|
+
local_file = str(paths.file_path)
|
|
1259
|
+
if dry_run:
|
|
1260
|
+
return DryRunFileInfo(
|
|
1261
|
+
commit_hash=revision,
|
|
1262
|
+
file_size=os.path.getsize(local_file),
|
|
1263
|
+
filename=filename,
|
|
1264
|
+
is_cached=True,
|
|
1265
|
+
local_path=local_file,
|
|
1266
|
+
will_download=force_download,
|
|
1267
|
+
)
|
|
1268
|
+
if not force_download:
|
|
1269
|
+
return local_file
|
|
1068
1270
|
|
|
1069
1271
|
# Local file doesn't exist or commit_hash doesn't match => we need the etag
|
|
1070
|
-
(url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
|
|
1272
|
+
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
|
|
1071
1273
|
repo_id=repo_id,
|
|
1072
1274
|
filename=filename,
|
|
1073
1275
|
repo_type=repo_type,
|
|
1074
1276
|
revision=revision,
|
|
1075
1277
|
endpoint=endpoint,
|
|
1076
|
-
proxies=proxies,
|
|
1077
1278
|
etag_timeout=etag_timeout,
|
|
1078
1279
|
headers=headers,
|
|
1079
1280
|
token=token,
|
|
@@ -1082,11 +1283,24 @@ def _hf_hub_download_to_local_dir(
|
|
|
1082
1283
|
|
|
1083
1284
|
if head_call_error is not None:
|
|
1084
1285
|
# No HEAD call but local file exists => default to local file
|
|
1085
|
-
if
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1286
|
+
if paths.file_path.is_file():
|
|
1287
|
+
if dry_run or not force_download:
|
|
1288
|
+
logger.warning(
|
|
1289
|
+
f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
|
|
1290
|
+
)
|
|
1291
|
+
local_path = str(paths.file_path)
|
|
1292
|
+
if dry_run and local_metadata is not None:
|
|
1293
|
+
return DryRunFileInfo(
|
|
1294
|
+
commit_hash=local_metadata.commit_hash,
|
|
1295
|
+
file_size=os.path.getsize(local_path),
|
|
1296
|
+
filename=filename,
|
|
1297
|
+
is_cached=True,
|
|
1298
|
+
local_path=local_path,
|
|
1299
|
+
will_download=force_download,
|
|
1300
|
+
)
|
|
1301
|
+
if not force_download:
|
|
1302
|
+
return local_path
|
|
1303
|
+
|
|
1090
1304
|
# Otherwise => raise
|
|
1091
1305
|
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
|
|
1092
1306
|
|
|
@@ -1101,6 +1315,15 @@ def _hf_hub_download_to_local_dir(
     # etag matches => update metadata and return file
     if local_metadata is not None and local_metadata.etag == etag:
         write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+        if dry_run:
+            return DryRunFileInfo(
+                commit_hash=commit_hash,
+                file_size=expected_size,
+                filename=filename,
+                is_cached=True,
+                local_path=str(paths.file_path),
+                will_download=False,
+            )
         return str(paths.file_path)
 
     # metadata is outdated + etag is a sha256
@@ -1112,6 +1335,15 @@ def _hf_hub_download_to_local_dir(
             file_hash = sha_fileobj(f).hex()
         if file_hash == etag:
             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=expected_size,
+                    filename=filename,
+                    is_cached=True,
+                    local_path=str(paths.file_path),
+                    will_download=False,
+                )
             return str(paths.file_path)
 
     # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
@@ -1130,8 +1362,28 @@ def _hf_hub_download_to_local_dir(
                 paths.file_path.parent.mkdir(parents=True, exist_ok=True)
                 shutil.copyfile(cached_path, paths.file_path)
             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=expected_size,
+                    filename=filename,
+                    is_cached=True,
+                    local_path=str(paths.file_path),
+                    will_download=False,
+                )
             return str(paths.file_path)
 
+    if dry_run:
+        is_cached = paths.file_path.is_file()
+        return DryRunFileInfo(
+            commit_hash=commit_hash,
+            file_size=expected_size,
+            filename=filename,
+            is_cached=is_cached,
+            local_path=str(paths.file_path),
+            will_download=force_download or not is_cached,
+        )
+
     # Otherwise, let's download the file!
     with WeakFileLock(paths.lock_path):
         paths.file_path.unlink(missing_ok=True)  # delete outdated file first
@@ -1139,11 +1391,13 @@ def _hf_hub_download_to_local_dir(
             incomplete_path=paths.incomplete_path(etag),
             destination_path=paths.file_path,
             url_to_download=url_to_download,
-            proxies=proxies,
             headers=headers,
             expected_size=expected_size,
             filename=filename,
             force_download=force_download,
+            etag=etag,
+            xet_file_data=xet_file_data,
+            tqdm_class=tqdm_class,
         )
 
     write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
@@ -1247,12 +1501,12 @@ def try_to_load_from_cache(
 def get_hf_file_metadata(
     url: str,
     token: Union[bool, str, None] = None,
-    proxies: Optional[Dict] = None,
     timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
-    user_agent: Union[Dict, str, None] = None,
-    headers: Optional[Dict[str, str]] = None,
+    user_agent: Union[dict, str, None] = None,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
 ) -> HfFileMetadata:
     """Fetch metadata of a file versioned on the Hub for a given url.
 
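With `proxies` dropped from the signature and `endpoint` added, a call against the updated `get_hf_file_metadata` could look like the sketch below. `hf_hub_url` is the existing helper for building file URLs; the repo and filename are placeholders, and proxy configuration is assumed to move to the HTTP-client level rather than per-call arguments:

```python
from huggingface_hub import get_hf_file_metadata, hf_hub_url

url = hf_hub_url(repo_id="gpt2", filename="config.json")  # illustrative repo/file

# `endpoint` is the new optional parameter; it defaults to https://huggingface.co.
metadata = get_hf_file_metadata(url, endpoint="https://huggingface.co")

print(metadata.commit_hash, metadata.etag, metadata.size)
print(metadata.location)       # final location after relative redirects
print(metadata.xet_file_data)  # None unless the file is backed by Xet Storage
```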
@@ -1265,9 +1519,6 @@ def get_hf_file_metadata(
                   folder.
                 - If `False` or `None`, no token is provided.
                 - If a string, it's used as the authentication token.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         timeout (`float`, *optional*, defaults to 10):
             How many seconds to wait for the server to send metadata before giving up.
         library_name (`str`, *optional*):
@@ -1278,6 +1529,8 @@ def get_hf_file_metadata(
             The user-agent info in the form of a dictionary or a string.
         headers (`dict`, *optional*):
             Additional headers to be sent with the request.
+        endpoint (`str`, *optional*):
+            Endpoint of the Hub. Defaults to <https://huggingface.co>.
 
     Returns:
         A [`HfFileMetadata`] object containing metadata such as location, etag, size and
@@ -1293,30 +1546,23 @@ def get_hf_file_metadata(
     hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file
 
     # Retrieve metadata
-    r = _request_wrapper(
-        method="HEAD",
-        url=url,
-        headers=hf_headers,
-        allow_redirects=False,
-        follow_relative_redirects=True,
-        proxies=proxies,
-        timeout=timeout,
-    )
-    hf_raise_for_status(r)
+    response = _httpx_follow_relative_redirects(method="HEAD", url=url, headers=hf_headers, timeout=timeout)
+    hf_raise_for_status(response)
 
     # Return
     return HfFileMetadata(
-        commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
-        # We favor a custom header indicating the etag of the linked resource, and
-        # we fall back to the regular etag header.
-        etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
+        commit_hash=response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+        # We favor a custom header indicating the etag of the linked resource, and we fall back to the regular etag header.
+        etag=_normalize_etag(
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or response.headers.get("ETag")
+        ),
         # Either from response headers (if redirected) or defaults to request url
-        # Do not use directly `url`, as `_request_wrapper` might have followed relative
-        # redirects.
-        location=r.headers.get("Location") or r.request.url,  # type: ignore
+        # Do not use directly `url` as we might have followed relative redirects.
+        location=response.headers.get("Location") or str(response.request.url),  # type: ignore
         size=_int_or_none(
-            r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or response.headers.get("Content-Length")
         ),
+        xet_file_data=parse_xet_file_data_from_response(response, endpoint=endpoint),  # type: ignore
     )
 
 
@@ -1327,19 +1573,18 @@ def _get_metadata_or_catch_error(
     repo_type: str,
     revision: str,
    endpoint: Optional[str],
-    proxies: Optional[Dict],
     etag_timeout: Optional[float],
-    headers: Dict[str, str],  # mutated inplace!
+    headers: dict[str, str],  # mutated inplace!
     token: Union[bool, str, None],
     local_files_only: bool,
     relative_filename: Optional[str] = None,  # only used to store `.no_exists` in cache
     storage_folder: Optional[str] = None,  # only used to store `.no_exists` in cache
 ) -> Union[
     # Either an exception is caught and returned
-    Tuple[None, None, None, None, Exception],
+    tuple[None, None, None, None, None, Exception],
     # Or the metadata is returned as
-    # `(url_to_download, etag, commit_hash, expected_size, None)`
-    Tuple[str, str, str, int, None],
+    # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
+    tuple[str, str, str, int, Optional[XetFileData], None],
 ]:
     """Get metadata for a file on the Hub, safely handling network issues.
 
@@ -1356,6 +1601,7 @@ def _get_metadata_or_catch_error(
             None,
             None,
             None,
+            None,
             OfflineModeIsEnabled(
                 f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
             ),
@@ -1367,6 +1613,7 @@ def _get_metadata_or_catch_error(
     commit_hash: Optional[str] = None
     expected_size: Optional[int] = None
     head_error_call: Optional[Exception] = None
+    xet_file_data: Optional[XetFileData] = None
 
     # Try to get metadata from the server.
     # Do not raise yet if the file is not found or not accessible.
@@ -1374,9 +1621,9 @@ def _get_metadata_or_catch_error(
     try:
         try:
             metadata = get_hf_file_metadata(
-                url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token
+                url=url, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
             )
-        except EntryNotFoundError as http_error:
+        except RemoteEntryNotFoundError as http_error:
             if storage_folder is not None and relative_filename is not None:
                 # Cache the non-existence of the file
                 commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
@@ -1414,32 +1661,30 @@ def _get_metadata_or_catch_error(
         if expected_size is None:
             raise FileMetadataError("Distant resource does not have a Content-Length.")
 
+        xet_file_data = metadata.xet_file_data
+
         # In case of a redirect, save an extra redirect on the request.get call,
         # and ensure we download the exact atomic version even if it changed
         # between the HEAD and the GET (unlikely, but hey).
         #
         # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
         # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
-        if url != metadata.location:
+        if xet_file_data is None and url != metadata.location:
             url_to_download = metadata.location
             if urlparse(url).netloc != urlparse(metadata.location).netloc:
                 # Remove authorization header when downloading a LFS blob
                 headers.pop("authorization", None)
-    except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
-        # Actually raise for those subclasses of ConnectionError
+    except httpx.ProxyError:
+        # Actually raise on proxy error
         raise
-    except (
-        requests.exceptions.ConnectionError,
-        requests.exceptions.Timeout,
-        OfflineModeIsEnabled,
-    ) as error:
+    except (httpx.ConnectError, httpx.TimeoutException, OfflineModeIsEnabled) as error:
         # Otherwise, our Internet connection is down.
         # etag is None
         head_error_call = error
-    except (RevisionNotFoundError, EntryNotFoundError):
+    except (RevisionNotFoundError, RemoteEntryNotFoundError):
         # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
         raise
-    except requests.HTTPError as error:
+    except HfHubHTTPError as error:
         # Multiple reasons for an http error:
         # - Repository is private and invalid/missing token sent
         # - Repository is gated and invalid/missing token sent
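The exception surface in `_get_metadata_or_catch_error` moves from `requests` to `httpx`, and Hub-side failures are funnelled through `HfHubHTTPError`. A rough caller-side sketch under those assumptions (the error import path is assumed to stay under `huggingface_hub.errors`; the repo and file are placeholders):

```python
import httpx

from huggingface_hub import get_hf_file_metadata, hf_hub_url
from huggingface_hub.errors import HfHubHTTPError  # assumed import path

url = hf_hub_url(repo_id="gpt2", filename="config.json")  # illustrative repo/file

try:
    metadata = get_hf_file_metadata(url)
except (httpx.ConnectError, httpx.TimeoutException):
    # Transport-level failures are now httpx exceptions rather than
    # requests.exceptions.ConnectionError / requests.exceptions.Timeout.
    metadata = None
except HfHubHTTPError:
    # Hub-side HTTP errors (private/gated repo, missing token, server error, ...).
    raise
```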
@@ -1458,7 +1703,7 @@ def _get_metadata_or_catch_error(
     if not (local_files_only or etag is not None or head_error_call is not None):
         raise RuntimeError("etag is empty due to uncovered problems")
 
-    return (url_to_download, etag, commit_hash, expected_size, head_error_call)  # type: ignore [return-value]
+    return (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_error_call)  # type: ignore [return-value]
 
 
 def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
@@ -1497,18 +1742,20 @@ def _download_to_tmp_and_move(
     incomplete_path: Path,
     destination_path: Path,
     url_to_download: str,
-    proxies: Optional[Dict],
-    headers: Dict[str, str],
+    headers: dict[str, str],
     expected_size: Optional[int],
     filename: str,
     force_download: bool,
+    etag: Optional[str],
+    xet_file_data: Optional[XetFileData],
+    tqdm_class: Optional[type[base_tqdm]] = None,
 ) -> None:
     """Download content from a URL to a destination path.
 
     Internal logic:
     - return early if file is already downloaded
     - resume download if possible (from incomplete file)
-    - do not resume download if `force_download=True` or `hf_transfer` is enabled
+    - do not resume download if `force_download=True`
     - check disk space before downloading
     - download content to a temporary file
     - set correct permissions on temporary file
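The signature now threads an optional `tqdm_class` down to the transfer layer. A hedged sketch of a custom progress-bar class follows; whether the public download helpers expose the same `tqdm_class` argument is an assumption based on this new parameter, not something the diff shows directly:

```python
from tqdm.auto import tqdm as base_tqdm

from huggingface_hub import hf_hub_download


class QuietTqdm(base_tqdm):
    """Progress bar that refreshes at most once per second (illustrative)."""

    def __init__(self, *args, **kwargs):
        kwargs.setdefault("mininterval", 1.0)
        super().__init__(*args, **kwargs)


# Assumption: the public helper forwards `tqdm_class` down to _download_to_tmp_and_move().
path = hf_hub_download(
    repo_id="gpt2",           # illustrative repo
    filename="config.json",   # illustrative file
    tqdm_class=QuietTqdm,
)
```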
@@ -1520,16 +1767,11 @@ def _download_to_tmp_and_move(
         # Do nothing if already exists (except if force_download=True)
         return
 
-    if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+    if incomplete_path.exists() and force_download:
         # By default, we will try to resume the download if possible.
-        # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+        # However, if the user has set `force_download=True`, then we should
         # not resume the download => delete the incomplete file.
-        message = f"Removing incomplete file '{incomplete_path}'"
-        if force_download:
-            message += " (force_download=True)"
-        elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
-            message += " (hf_transfer=True)"
-        logger.info(message)
+        logger.debug(f"Removing incomplete file '{incomplete_path}' (force_download=True)")
         incomplete_path.unlink(missing_ok=True)
 
     with incomplete_path.open("ab") as f:
@@ -1537,23 +1779,41 @@ def _download_to_tmp_and_move(
         message = f"Downloading '{filename}' to '{incomplete_path}'"
         if resume_size > 0 and expected_size is not None:
             message += f" (resume from {resume_size}/{expected_size})"
-        logger.info(message)
+        logger.debug(message)
 
         if expected_size is not None:  # might be None if HTTP header not set correctly
             # Check disk space in both tmp and destination path
             _check_disk_space(expected_size, incomplete_path.parent)
             _check_disk_space(expected_size, destination_path.parent)
 
-        http_get(
-            url_to_download,
-            f,
-            proxies=proxies,
-            resume_size=resume_size,
-            headers=headers,
-            expected_size=expected_size,
-        )
+        if xet_file_data is not None and is_xet_available():
+            logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
+            xet_get(
+                incomplete_path=incomplete_path,
+                xet_file_data=xet_file_data,
+                headers=headers,
+                expected_size=expected_size,
+                displayed_filename=filename,
+                tqdm_class=tqdm_class,
+            )
+        else:
+            if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
+                logger.warning(
+                    "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
+                    "Falling back to regular HTTP download. "
+                    "For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`"
+                )
+
+            http_get(
+                url_to_download,
+                f,
+                resume_size=resume_size,
+                headers=headers,
+                expected_size=expected_size,
+                tqdm_class=tqdm_class,
+            )
 
-    logger.info(f"Download complete. Moving file to {destination_path}")
+    logger.debug(f"Download complete. Moving file to {destination_path}")
     _chmod_and_move(incomplete_path, destination_path)
 
 
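The download body now prefers Xet Storage when the repo is Xet-enabled and the `hf_xet` package is available, falling back to plain HTTP (with a one-time warning) otherwise. A small sketch of checking for the accelerated path; the `is_xet_available` helper name is taken from the hunk above, and its exact import location is an assumption:

```python
from huggingface_hub import hf_hub_download

try:
    # Assumed import path; the diff only shows the helper being called internally.
    from huggingface_hub.utils import is_xet_available
except ImportError:  # older versions without Xet support
    def is_xet_available() -> bool:
        return False

print("Xet-accelerated downloads available:", is_xet_available())

# The call itself is unchanged: if the repo is Xet-enabled and `hf_xet` is installed,
# bytes come through xet_get(); otherwise the regular http_get() path is used.
# To enable the fast path: `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.
path = hf_hub_download(repo_id="gpt2", filename="config.json")  # illustrative repo/file
```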