huggingface-hub 0.36.0rc0__py3-none-any.whl → 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of huggingface-hub might be problematic.
- huggingface_hub/__init__.py +33 -45
- huggingface_hub/_commit_api.py +39 -43
- huggingface_hub/_commit_scheduler.py +11 -8
- huggingface_hub/_inference_endpoints.py +8 -8
- huggingface_hub/_jobs_api.py +20 -20
- huggingface_hub/_login.py +17 -43
- huggingface_hub/_oauth.py +8 -8
- huggingface_hub/_snapshot_download.py +135 -50
- huggingface_hub/_space_api.py +4 -4
- huggingface_hub/_tensorboard_logger.py +5 -5
- huggingface_hub/_upload_large_folder.py +18 -32
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +2 -2
- huggingface_hub/cli/__init__.py +0 -14
- huggingface_hub/cli/_cli_utils.py +143 -39
- huggingface_hub/cli/auth.py +105 -171
- huggingface_hub/cli/cache.py +594 -361
- huggingface_hub/cli/download.py +120 -112
- huggingface_hub/cli/hf.py +38 -41
- huggingface_hub/cli/jobs.py +689 -1017
- huggingface_hub/cli/lfs.py +120 -143
- huggingface_hub/cli/repo.py +282 -216
- huggingface_hub/cli/repo_files.py +50 -84
- huggingface_hub/cli/system.py +6 -25
- huggingface_hub/cli/upload.py +198 -220
- huggingface_hub/cli/upload_large_folder.py +91 -106
- huggingface_hub/community.py +5 -5
- huggingface_hub/constants.py +17 -52
- huggingface_hub/dataclasses.py +135 -21
- huggingface_hub/errors.py +47 -30
- huggingface_hub/fastai_utils.py +8 -9
- huggingface_hub/file_download.py +351 -303
- huggingface_hub/hf_api.py +398 -570
- huggingface_hub/hf_file_system.py +101 -66
- huggingface_hub/hub_mixin.py +32 -54
- huggingface_hub/inference/_client.py +177 -162
- huggingface_hub/inference/_common.py +38 -54
- huggingface_hub/inference/_generated/_async_client.py +218 -258
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +16 -16
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +4 -4
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +10 -10
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/agent.py +3 -3
- huggingface_hub/inference/_mcp/constants.py +1 -2
- huggingface_hub/inference/_mcp/mcp_client.py +33 -22
- huggingface_hub/inference/_mcp/types.py +10 -10
- huggingface_hub/inference/_mcp/utils.py +4 -4
- huggingface_hub/inference/_providers/__init__.py +12 -4
- huggingface_hub/inference/_providers/_common.py +62 -24
- huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
- huggingface_hub/inference/_providers/cohere.py +3 -3
- huggingface_hub/inference/_providers/fal_ai.py +25 -25
- huggingface_hub/inference/_providers/featherless_ai.py +4 -4
- huggingface_hub/inference/_providers/fireworks_ai.py +3 -3
- huggingface_hub/inference/_providers/hf_inference.py +13 -13
- huggingface_hub/inference/_providers/hyperbolic.py +4 -4
- huggingface_hub/inference/_providers/nebius.py +10 -10
- huggingface_hub/inference/_providers/novita.py +5 -5
- huggingface_hub/inference/_providers/nscale.py +4 -4
- huggingface_hub/inference/_providers/replicate.py +15 -15
- huggingface_hub/inference/_providers/sambanova.py +6 -6
- huggingface_hub/inference/_providers/together.py +7 -7
- huggingface_hub/lfs.py +21 -94
- huggingface_hub/repocard.py +15 -16
- huggingface_hub/repocard_data.py +57 -57
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +9 -9
- huggingface_hub/serialization/_dduf.py +7 -7
- huggingface_hub/serialization/_torch.py +28 -28
- huggingface_hub/utils/__init__.py +11 -6
- huggingface_hub/utils/_auth.py +5 -5
- huggingface_hub/utils/_cache_manager.py +49 -74
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +3 -3
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +3 -3
- huggingface_hub/utils/_headers.py +7 -29
- huggingface_hub/utils/_http.py +371 -208
- huggingface_hub/utils/_pagination.py +4 -4
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +59 -23
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +9 -9
- huggingface_hub/utils/_telemetry.py +3 -3
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -9
- huggingface_hub/utils/_typing.py +3 -3
- huggingface_hub/utils/_validators.py +53 -72
- huggingface_hub/utils/_xet.py +16 -16
- huggingface_hub/utils/_xet_progress_reporting.py +1 -1
- huggingface_hub/utils/insecure_hashlib.py +3 -9
- huggingface_hub/utils/tqdm.py +3 -3
- {huggingface_hub-0.36.0rc0.dist-info → huggingface_hub-1.0.0.dist-info}/METADATA +16 -35
- huggingface_hub-1.0.0.dist-info/RECORD +152 -0
- {huggingface_hub-0.36.0rc0.dist-info → huggingface_hub-1.0.0.dist-info}/entry_points.txt +0 -1
- huggingface_hub/commands/__init__.py +0 -27
- huggingface_hub/commands/delete_cache.py +0 -476
- huggingface_hub/commands/download.py +0 -204
- huggingface_hub/commands/env.py +0 -39
- huggingface_hub/commands/huggingface_cli.py +0 -65
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo.py +0 -151
- huggingface_hub/commands/repo_files.py +0 -132
- huggingface_hub/commands/scan_cache.py +0 -183
- huggingface_hub/commands/tag.py +0 -161
- huggingface_hub/commands/upload.py +0 -318
- huggingface_hub/commands/upload_large_folder.py +0 -131
- huggingface_hub/commands/user.py +0 -208
- huggingface_hub/commands/version.py +0 -40
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -497
- huggingface_hub/repository.py +0 -1471
- huggingface_hub/serialization/_tensorflow.py +0 -92
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.36.0rc0.dist-info/RECORD +0 -170
- {huggingface_hub-0.36.0rc0.dist-info → huggingface_hub-1.0.0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.36.0rc0.dist-info → huggingface_hub-1.0.0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.36.0rc0.dist-info → huggingface_hub-1.0.0.dist-info}/top_level.txt +0 -0
huggingface_hub/file_download.py
CHANGED
@@ -1,6 +1,5 @@
 import copy
 import errno
-import inspect
 import os
 import re
 import shutil
@@ -10,26 +9,19 @@ import uuid
 import warnings
 from dataclasses import dataclass
 from pathlib import Path
-from typing import Any, BinaryIO,
+from typing import Any, BinaryIO, Literal, NoReturn, Optional, Union, overload
 from urllib.parse import quote, urlparse
 
-import
+import httpx
 
-from . import
-    __version__,  # noqa: F401 # for backward compatibility
-    constants,
-)
+from . import constants
 from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
-from .constants import (
-    HUGGINGFACE_CO_URL_TEMPLATE,  # noqa: F401 # for backward compatibility
-    HUGGINGFACE_HUB_CACHE,  # noqa: F401 # for backward compatibility
-)
 from .errors import (
-    EntryNotFoundError,
     FileMetadataError,
     GatedRepoError,
     HfHubHTTPError,
     LocalEntryNotFoundError,
+    RemoteEntryNotFoundError,
     RepositoryNotFoundError,
     RevisionNotFoundError,
 )
@@ -39,30 +31,15 @@ from .utils import (
     WeakFileLock,
     XetFileData,
     build_hf_headers,
-    get_fastai_version,  # noqa: F401 # for backward compatibility
-    get_fastcore_version,  # noqa: F401 # for backward compatibility
-    get_graphviz_version,  # noqa: F401 # for backward compatibility
-    get_jinja_version,  # noqa: F401 # for backward compatibility
-    get_pydot_version,  # noqa: F401 # for backward compatibility
-    get_tf_version,  # noqa: F401 # for backward compatibility
-    get_torch_version,  # noqa: F401 # for backward compatibility
     hf_raise_for_status,
-    is_fastai_available,  # noqa: F401 # for backward compatibility
-    is_fastcore_available,  # noqa: F401 # for backward compatibility
-    is_graphviz_available,  # noqa: F401 # for backward compatibility
-    is_jinja_available,  # noqa: F401 # for backward compatibility
-    is_pydot_available,  # noqa: F401 # for backward compatibility
-    is_tf_available,  # noqa: F401 # for backward compatibility
-    is_torch_available,  # noqa: F401 # for backward compatibility
     logging,
     parse_xet_file_data_from_response,
     refresh_xet_connection_info,
-    reset_sessions,
     tqdm,
     validate_hf_hub_args,
 )
-from .utils._http import _adjust_range_header, http_backoff
-from .utils._runtime import
+from .utils._http import _adjust_range_header, http_backoff, http_stream_backoff
+from .utils._runtime import is_xet_available
 from .utils._typing import HTTP_METHOD_T
 from .utils.sha import sha_fileobj
 from .utils.tqdm import _get_progress_bar_context
@@ -83,7 +60,7 @@ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
 # Regex to check if the file etag IS a valid sha256
 REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
 
-_are_symlinks_supported_in_dir:
+_are_symlinks_supported_in_dir: dict[str, bool] = {}
 
 
 def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
@@ -172,6 +149,34 @@ class HfFileMetadata:
     xet_file_data: Optional[XetFileData]
 
 
+@dataclass
+class DryRunFileInfo:
+    """Information returned when performing a dry run of a file download.
+
+    Returned by [`hf_hub_download`] when `dry_run=True`.
+
+    Args:
+        commit_hash (`str`):
+            The commit_hash related to the file.
+        file_size (`int`):
+            Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
+        filename (`str`):
+            Name of the file in the repo.
+        is_cached (`bool`):
+            Whether the file is already cached locally.
+        will_download (`bool`):
+            Whether the file will be downloaded if `hf_hub_download` is called with `dry_run=False`.
+            In practice, will_download is `True` if the file is not cached or if `force_download=True`.
+    """
+
+    commit_hash: str
+    file_size: int
+    filename: str
+    local_path: str
+    is_cached: bool
+    will_download: bool
+
+
 @validate_hf_hub_args
 def hf_hub_url(
     repo_id: str,
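Note on the new `DryRunFileInfo` dataclass added above: it is the value returned by `hf_hub_download(..., dry_run=True)` (see the overloads and docstring further down in this diff). A hypothetical usage sketch, based only on the fields declared above; the repo and filename are placeholders:

from huggingface_hub import hf_hub_download

# Sketch: inspect what a download would do before doing it for real.
info = hf_hub_download("gpt2", "config.json", dry_run=True)
print(info.filename, info.commit_hash, info.local_path)
print(f"cached: {info.is_cached}, would download: {info.will_download}")
if info.will_download:
    print(f"about {info.file_size / 1e6:.1f} MB would be fetched")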
@@ -249,7 +254,7 @@ def hf_hub_url(
 
     if revision is None:
         revision = constants.DEFAULT_REVISION
-    url = HUGGINGFACE_CO_URL_TEMPLATE.format(
+    url = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
         repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
     )
     # Update endpoint if provided
@@ -258,11 +263,10 @@ def hf_hub_url(
     return url
 
 
-def
-
-
-
-    `allow_redirection=False`.
+def _httpx_follow_relative_redirects(method: HTTP_METHOD_T, url: str, **httpx_kwargs) -> httpx.Response:
+    """Perform an HTTP request with backoff and follow relative redirects only.
+
+    This is useful to follow a redirection to a renamed repository without following redirection to a CDN.
 
     A backoff mechanism retries the HTTP call on 5xx errors and network errors.
 
@@ -271,44 +275,36 @@ def _request_wrapper(
             HTTP method, such as 'GET' or 'HEAD'.
         url (`str`):
             The URL of the resource to fetch.
-
-
-            kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
-            following redirection to a CDN.
-        **params (`dict`, *optional*):
-            Params to pass to `requests.request`.
+        **httpx_kwargs (`dict`, *optional*):
+            Params to pass to `httpx.request`.
     """
-
-
-    response =
+    while True:
+        # Make the request
+        response = http_backoff(
             method=method,
             url=url,
-
-
+            **httpx_kwargs,
+            follow_redirects=False,
+            retry_on_exceptions=(),
+            retry_on_status_codes=(429,),
         )
+        hf_raise_for_status(response)
 
-        #
-        # This is useful in case of a renamed repository.
+        # Check if response is a relative redirect
         if 300 <= response.status_code <= 399:
             parsed_target = urlparse(response.headers["Location"])
             if parsed_target.netloc == "":
-                #
-
-
-
-
-
-
-                return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
-            return response
-
-    # Perform request and return if status_code is not in the retry list.
-    response = http_backoff(method=method, url=url, **params)
-    hf_raise_for_status(response)
+                # Relative redirect -> update URL and retry
+                url = urlparse(url)._replace(path=parsed_target.path).geturl()
+                continue
+
+        # Break if no relative redirect
+        break
+
     return response
 
 
-def _get_file_length_from_http_response(response:
+def _get_file_length_from_http_response(response: httpx.Response) -> Optional[int]:
     """
     Get the length of the file from the HTTP response headers.
 
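The `_httpx_follow_relative_redirects` helper above treats a `Location` header with an empty netloc as a same-host (relative) redirect — e.g. a renamed repository — and deliberately does not follow absolute redirects to a CDN. A standalone sketch of that check using the same stdlib calls (illustrative only, not the package's code):

from urllib.parse import urlparse

def is_relative_redirect(location: str) -> bool:
    # A relative redirect carries only a path, no scheme or host.
    return urlparse(location).netloc == ""

def follow_relative(url: str, location: str) -> str:
    # Keep scheme and host from the original URL, swap in the redirected path.
    return urlparse(url)._replace(path=urlparse(location).path).geturl()

assert is_relative_redirect("/org/renamed-repo/resolve/main/config.json")
assert not is_relative_redirect("https://cdn-lfs.huggingface.co/some/blob")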
@@ -316,7 +312,7 @@ def _get_file_length_from_http_response(response: requests.Response) -> Optional
     `Content-Range` or `Content-Length` header, if available (in that order).
 
     Args:
-        response (`
+        response (`httpx.Response`):
             The HTTP response object.
 
     Returns:
@@ -343,13 +339,13 @@ def _get_file_length_from_http_response(response: requests.Response) -> Optional
     return None
 
 
+@validate_hf_hub_args
 def http_get(
     url: str,
     temp_file: BinaryIO,
     *,
-    proxies: Optional[Dict] = None,
     resume_size: int = 0,
-    headers: Optional[
+    headers: Optional[dict[str, Any]] = None,
     expected_size: Optional[int] = None,
     displayed_filename: Optional[str] = None,
     _nb_retries: int = 5,
@@ -367,8 +363,6 @@ def http_get(
             The URL of the file to download.
         temp_file (`BinaryIO`):
             The file-like object where to save the file.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
         resume_size (`int`, *optional*):
             The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
             positive number, the download will resume at the given position.
@@ -385,138 +379,83 @@ def http_get(
         # If the file is already fully downloaded, we don't need to download it again.
         return
 
-    has_custom_range_header = headers is not None and any(h.lower() == "range" for h in headers)
-    hf_transfer = None
-    if constants.HF_HUB_ENABLE_HF_TRANSFER:
-        if resume_size != 0:
-            warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
-        elif proxies is not None:
-            warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
-        elif has_custom_range_header:
-            warnings.warn("'hf_transfer' ignores custom 'Range' headers; falling back to regular download method")
-        else:
-            try:
-                import hf_transfer  # type: ignore[no-redef]
-            except ImportError:
-                raise ValueError(
-                    "Fast download using 'hf_transfer' is enabled"
-                    " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
-                    " available in your environment. Try `pip install hf_transfer`."
-                )
-
     initial_headers = headers
     headers = copy.deepcopy(headers) or {}
     if resume_size > 0:
         headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
     elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
-        # Any files over 50GB will not be available through basic http
-
-
-
-
-        else:
-            raise ValueError(
-                "The file is too large to be downloaded using the regular download method. Use `hf_transfer` or `hf_xet` instead."
-                " Try `pip install hf_transfer` or `pip install hf_xet`."
-            )
-
-    r = _request_wrapper(
-        method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
-    )
-
-    hf_raise_for_status(r)
-    total: Optional[int] = _get_file_length_from_http_response(r)
-
-    if displayed_filename is None:
-        displayed_filename = url
-        content_disposition = r.headers.get("Content-Disposition")
-        if content_disposition is not None:
-            match = HEADER_FILENAME_PATTERN.search(content_disposition)
-            if match is not None:
-                # Means file is on CDN
-                displayed_filename = match.groupdict()["filename"]
-
-    # Truncate filename if too long to display
-    if len(displayed_filename) > 40:
-        displayed_filename = f"(…){displayed_filename[-40:]}"
+        # Any files over 50GB will not be available through basic http requests.
+        raise ValueError(
+            "The file is too large to be downloaded using the regular download method. "
+            " Install `hf_xet` with `pip install hf_xet` for xet-powered downloads."
+        )
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+    with http_stream_backoff(
+        method="GET",
+        url=url,
+        headers=headers,
+        timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
+        retry_on_exceptions=(),
+        retry_on_status_codes=(429,),
+    ) as response:
+        hf_raise_for_status(response)
+        total: Optional[int] = _get_file_length_from_http_response(response)
+
+        if displayed_filename is None:
+            displayed_filename = url
+            content_disposition = response.headers.get("Content-Disposition")
+            if content_disposition is not None:
+                match = HEADER_FILENAME_PATTERN.search(content_disposition)
+                if match is not None:
+                    # Means file is on CDN
+                    displayed_filename = match.groupdict()["filename"]
+
+        # Truncate filename if too long to display
+        if len(displayed_filename) > 40:
+            displayed_filename = f"(…){displayed_filename[-40:]}"
+
+        consistency_error_message = (
+            f"Consistency check failed: file should be of size {expected_size} but has size"
+            f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
+            " Please retry with `force_download=True`."
+        )
+        progress_cm = _get_progress_bar_context(
+            desc=displayed_filename,
+            log_level=logger.getEffectiveLevel(),
+            total=total,
+            initial=resume_size,
+            name="huggingface_hub.http_get",
+            _tqdm_bar=_tqdm_bar,
+        )
 
-
-
-        supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
-        if not supports_callback:
-            warnings.warn(
-                "You are using an outdated version of `hf_transfer`. "
-                "Consider upgrading to latest version to enable progress bars "
-                "using `pip install -U hf_transfer`."
-            )
+        with progress_cm as progress:
+            new_resume_size = resume_size
            try:
-
+                for chunk in response.iter_bytes(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+                    if chunk:  # filter out keep-alive new chunks
+                        progress.update(len(chunk))
+                        temp_file.write(chunk)
+                        new_resume_size += len(chunk)
+                        # Some data has been downloaded from the server so we reset the number of retries.
+                        _nb_retries = 5
+            except (httpx.ConnectError, httpx.TimeoutException) as e:
+                # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+                # a transient error (network outage?). We log a warning message and try to resume the download a few times
+                # before giving up. Tre retry mechanism is basic but should be enough in most cases.
+                if _nb_retries <= 0:
+                    logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+                    raise
+                logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+                time.sleep(1)
+                return http_get(
                     url=url,
-
-
-                    chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
+                    temp_file=temp_file,
+                    resume_size=new_resume_size,
                     headers=initial_headers,
-
-
-
-                )
-            except Exception as e:
-                raise RuntimeError(
-                    "An error occurred while downloading using `hf_transfer`. Consider"
-                    " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
-                ) from e
-            if not supports_callback:
-                progress.update(total)
-            if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
-                raise EnvironmentError(
-                    consistency_error_message.format(
-                        actual_size=os.path.getsize(temp_file.name),
-                    )
+                    expected_size=expected_size,
+                    _nb_retries=_nb_retries - 1,
+                    _tqdm_bar=_tqdm_bar,
                 )
-            return
-    new_resume_size = resume_size
-    try:
-        for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
-            if chunk:  # filter out keep-alive new chunks
-                progress.update(len(chunk))
-                temp_file.write(chunk)
-                new_resume_size += len(chunk)
-                # Some data has been downloaded from the server so we reset the number of retries.
-                _nb_retries = 5
-    except (requests.ConnectionError, requests.ReadTimeout) as e:
-        # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
-        # a transient error (network outage?). We log a warning message and try to resume the download a few times
-        # before giving up. Tre retry mechanism is basic but should be enough in most cases.
-        if _nb_retries <= 0:
-            logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
-            raise
-        logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
-        time.sleep(1)
-        reset_sessions()  # In case of SSLError it's best to reset the shared requests.Session objects
-        return http_get(
-            url=url,
-            temp_file=temp_file,
-            proxies=proxies,
-            resume_size=new_resume_size,
-            headers=initial_headers,
-            expected_size=expected_size,
-            _nb_retries=_nb_retries - 1,
-            _tqdm_bar=_tqdm_bar,
-        )
 
     if expected_size is not None and expected_size != temp_file.tell():
         raise EnvironmentError(
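The rewritten `http_get` above streams through `http_stream_backoff` and, on a transient httpx error, calls itself again with `resume_size` set to the bytes already written; `_adjust_range_header` turns that offset into an HTTP `Range` header. A minimal sketch of the underlying resume mechanism with plain httpx (illustrative; the URL and file name are placeholders, and this is not the package's internal helper):

import httpx

def range_header(resume_size: int) -> dict:
    # "bytes=N-" asks the server for everything from byte offset N to the end.
    return {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {}

with open("file.bin", "ab") as f:  # append mode keeps the bytes already on disk
    already = f.tell()
    with httpx.stream("GET", "https://example.com/file.bin",
                      headers=range_header(already), follow_redirects=True) as r:
        r.raise_for_status()
        for chunk in r.iter_bytes():
            f.write(chunk)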
@@ -530,7 +469,7 @@ def xet_get(
     *,
     incomplete_path: Path,
     xet_file_data: XetFileData,
-    headers:
+    headers: dict[str, str],
     expected_size: Optional[int] = None,
     displayed_filename: Optional[str] = None,
     _tqdm_bar: Optional[tqdm] = None,
@@ -543,7 +482,7 @@ def xet_get(
             The path to the file to download.
         xet_file_data (`XetFileData`):
             The file metadata needed to make the request to the xet storage service.
-        headers (`
+        headers (`dict[str, str]`):
             The headers to send to the xet storage service.
         expected_size (`int`, *optional*):
             The expected size of the file to download. If set, the download will raise an error if the size of the
@@ -590,7 +529,7 @@ def xet_get(
 
     connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
 
-    def token_refresher() ->
+    def token_refresher() -> tuple[str, int]:
         connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
         if connection_info is None:
             raise ValueError("Failed to refresh token using xet metadata.")
@@ -805,6 +744,75 @@ def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
         pass
 
 
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    dry_run: Literal[False] = False,
+) -> str: ...
+
+
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    dry_run: Literal[True] = True,
+) -> DryRunFileInfo: ...
+
+
+@overload
+def hf_hub_download(
+    repo_id: str,
+    filename: str,
+    *,
+    subfolder: Optional[str] = None,
+    repo_type: Optional[str] = None,
+    revision: Optional[str] = None,
+    library_name: Optional[str] = None,
+    library_version: Optional[str] = None,
+    cache_dir: Union[str, Path, None] = None,
+    local_dir: Union[str, Path, None] = None,
+    user_agent: Union[dict, str, None] = None,
+    force_download: bool = False,
+    etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+    token: Union[bool, str, None] = None,
+    local_files_only: bool = False,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
+    dry_run: bool = False,
+) -> Union[str, DryRunFileInfo]: ...
+
+
 @validate_hf_hub_args
 def hf_hub_download(
     repo_id: str,
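The three `@overload` declarations above exist purely for static typing: a type checker narrows the return type on the `dry_run` literal, so `dry_run=False` (the default) is seen as returning `str` and `dry_run=True` as returning `DryRunFileInfo`. A generic sketch of the same pattern, independent of this package:

from typing import Literal, Union, overload

@overload
def fetch(name: str, *, dry_run: Literal[False] = False) -> str: ...
@overload
def fetch(name: str, *, dry_run: Literal[True]) -> dict: ...
def fetch(name: str, *, dry_run: bool = False) -> Union[str, dict]:
    # Single runtime implementation; the overloads only guide the type checker.
    return {"name": name} if dry_run else f"/cache/{name}"

path: str = fetch("config.json")                  # checker infers str
plan: dict = fetch("config.json", dry_run=True)   # checker infers dict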
@@ -817,18 +825,15 @@ def hf_hub_download(
     library_version: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
     local_dir: Union[str, Path, None] = None,
-    user_agent: Union[
+    user_agent: Union[dict, str, None] = None,
     force_download: bool = False,
-    proxies: Optional[Dict] = None,
     etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
-    headers: Optional[
+    headers: Optional[dict[str, str]] = None,
     endpoint: Optional[str] = None,
-
-
-    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
-) -> str:
+    dry_run: bool = False,
+) -> Union[str, DryRunFileInfo]:
     """Download a given file if it's not already present in the local cache.
 
     The new cache file layout looks like this:
@@ -890,9 +895,6 @@ def hf_hub_download(
         force_download (`bool`, *optional*, defaults to `False`):
             Whether the file should be downloaded even if it already exists in
             the local cache.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         etag_timeout (`float`, *optional*, defaults to `10`):
             When fetching ETag, how many seconds to wait for the server to send
             data before giving up which is passed to `requests.request`.
@@ -906,9 +908,14 @@ def hf_hub_download(
             local cached file if it exists.
         headers (`dict`, *optional*):
             Additional headers to be sent with the request.
+        dry_run (`bool`, *optional*, defaults to `False`):
+            If `True`, perform a dry run without actually downloading the file. Returns a
+            [`DryRunFileInfo`] object containing information about what would be downloaded.
 
     Returns:
-        `str
+        `str` or [`DryRunFileInfo`]:
+            - If `dry_run=False`: Local path of file or if networking is off, last version of file cached on disk.
+            - If `dry_run=True`: A [`DryRunFileInfo`] object containing download information.
 
     Raises:
         [`~utils.RepositoryNotFoundError`]
@@ -916,7 +923,7 @@ def hf_hub_download(
             or because it is set to `private` and you do not have access.
         [`~utils.RevisionNotFoundError`]
             If the revision to download from cannot be found.
-        [`~utils.
+        [`~utils.RemoteEntryNotFoundError`]
             If the file to download cannot be found.
         [`~utils.LocalEntryNotFoundError`]
             If network is disabled or unavailable and file is not found in cache.
@@ -932,20 +939,6 @@ def hf_hub_download(
         # Respect environment variable above user value
         etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
 
-    if force_filename is not None:
-        warnings.warn(
-            "The `force_filename` parameter is deprecated as a new caching system, "
-            "which keeps the filenames as they are on the Hub, is now in place.",
-            FutureWarning,
-        )
-    if resume_download is not None:
-        warnings.warn(
-            "`resume_download` is deprecated and will be removed in version 1.0.0. "
-            "Downloads always resume when possible. "
-            "If you want to force a new download, use `force_download=True`.",
-            FutureWarning,
-        )
-
     if cache_dir is None:
         cache_dir = constants.HF_HUB_CACHE
     if revision is None:
@@ -975,15 +968,6 @@ def hf_hub_download(
     )
 
     if local_dir is not None:
-        if local_dir_use_symlinks != "auto":
-            warnings.warn(
-                "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
-                "The process to download files to a local folder has been updated and do "
-                "not rely on symlinks anymore. You only need to pass a destination folder "
-                "as`local_dir`.\n"
-                "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
-            )
-
         return _hf_hub_download_to_local_dir(
             # Destination
             local_dir=local_dir,
@@ -996,12 +980,12 @@ def hf_hub_download(
             endpoint=endpoint,
             etag_timeout=etag_timeout,
             headers=hf_headers,
-            proxies=proxies,
             token=token,
             # Additional options
             cache_dir=cache_dir,
             force_download=force_download,
             local_files_only=local_files_only,
+            dry_run=dry_run,
         )
     else:
         return _hf_hub_download_to_cache_dir(
@@ -1016,11 +1000,11 @@ def hf_hub_download(
             endpoint=endpoint,
             etag_timeout=etag_timeout,
             headers=hf_headers,
-            proxies=proxies,
             token=token,
             # Additional options
             local_files_only=local_files_only,
             force_download=force_download,
+            dry_run=dry_run,
         )
 
 
@@ -1036,13 +1020,13 @@ def _hf_hub_download_to_cache_dir(
     # HTTP info
     endpoint: Optional[str],
     etag_timeout: float,
-    headers:
-    proxies: Optional[Dict],
+    headers: dict[str, str],
     token: Optional[Union[bool, str]],
     # Additional options
     local_files_only: bool,
     force_download: bool,
-
+    dry_run: bool,
+) -> Union[str, DryRunFileInfo]:
     """Download a given file to a cache folder, if not already present.
 
     Method should not be called directly. Please use `hf_hub_download` instead.
@@ -1062,8 +1046,18 @@ def _hf_hub_download_to_cache_dir(
     # if user provides a commit_hash and they already have the file on disk, shortcut everything.
     if REGEX_COMMIT_HASH.match(revision):
         pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
-        if os.path.exists(pointer_path)
-
+        if os.path.exists(pointer_path):
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=revision,
+                    file_size=os.path.getsize(pointer_path),
+                    filename=filename,
+                    is_cached=True,
+                    local_path=pointer_path,
+                    will_download=force_download,
+                )
+            if not force_download:
+                return pointer_path
 
     # Try to get metadata (etag, commit_hash, url, size) from the server.
     # If we can't, a HEAD request error is returned.
@@ -1073,7 +1067,6 @@ def _hf_hub_download_to_cache_dir(
         repo_type=repo_type,
         revision=revision,
         endpoint=endpoint,
-        proxies=proxies,
         etag_timeout=etag_timeout,
         headers=headers,
         token=token,
@@ -1107,8 +1100,18 @@ def _hf_hub_download_to_cache_dir(
         # Return pointer file if exists
         if commit_hash is not None:
             pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
-            if os.path.exists(pointer_path)
-
+            if os.path.exists(pointer_path):
+                if dry_run:
+                    return DryRunFileInfo(
+                        commit_hash=commit_hash,
+                        file_size=os.path.getsize(pointer_path),
+                        filename=filename,
+                        is_cached=True,
+                        local_path=pointer_path,
+                        will_download=force_download,
+                    )
+                if not force_download:
+                    return pointer_path
 
         # Otherwise, raise appropriate error
         _raise_on_head_call_error(head_call_error, force_download, local_files_only)
@@ -1121,6 +1124,17 @@ def _hf_hub_download_to_cache_dir(
     blob_path = os.path.join(storage_folder, "blobs", etag)
     pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
 
+    if dry_run:
+        is_cached = os.path.exists(pointer_path) or os.path.exists(blob_path)
+        return DryRunFileInfo(
+            commit_hash=commit_hash,
+            file_size=expected_size,
+            filename=filename,
+            is_cached=is_cached,
+            local_path=pointer_path,
+            will_download=force_download or not is_cached,
+        )
+
     os.makedirs(os.path.dirname(blob_path), exist_ok=True)
     os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
 
@@ -1169,7 +1183,6 @@ def _hf_hub_download_to_cache_dir(
         incomplete_path=Path(blob_path + ".incomplete"),
         destination_path=Path(blob_path),
         url_to_download=url_to_download,
-        proxies=proxies,
         headers=headers,
         expected_size=expected_size,
         filename=filename,
@@ -1195,14 +1208,14 @@ def _hf_hub_download_to_local_dir(
     # HTTP info
     endpoint: Optional[str],
     etag_timeout: float,
-    headers:
-    proxies: Optional[Dict],
+    headers: dict[str, str],
    token: Union[bool, str, None],
     # Additional options
     cache_dir: str,
     force_download: bool,
     local_files_only: bool,
-
+    dry_run: bool,
+) -> Union[str, DryRunFileInfo]:
     """Download a given file to a local folder, if not already present.
 
     Method should not be called directly. Please use `hf_hub_download` instead.
@@ -1217,13 +1230,23 @@ def _hf_hub_download_to_local_dir(
 
     # Local file exists + metadata exists + commit_hash matches => return file
     if (
-
-        and REGEX_COMMIT_HASH.match(revision)
+        REGEX_COMMIT_HASH.match(revision)
         and paths.file_path.is_file()
         and local_metadata is not None
         and local_metadata.commit_hash == revision
     ):
-
+        local_file = str(paths.file_path)
+        if dry_run:
+            return DryRunFileInfo(
+                commit_hash=revision,
+                file_size=os.path.getsize(local_file),
+                filename=filename,
+                is_cached=True,
+                local_path=local_file,
+                will_download=force_download,
+            )
+        if not force_download:
+            return local_file
 
     # Local file doesn't exist or commit_hash doesn't match => we need the etag
     (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
@@ -1232,7 +1255,6 @@ def _hf_hub_download_to_local_dir(
         repo_type=repo_type,
         revision=revision,
         endpoint=endpoint,
-        proxies=proxies,
         etag_timeout=etag_timeout,
         headers=headers,
         token=token,
@@ -1241,11 +1263,24 @@ def _hf_hub_download_to_local_dir(
 
     if head_call_error is not None:
         # No HEAD call but local file exists => default to local file
-        if
-
-
-
-
+        if paths.file_path.is_file():
+            if dry_run or not force_download:
+                logger.warning(
+                    f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
+                )
+                local_path = str(paths.file_path)
+                if dry_run and local_metadata is not None:
+                    return DryRunFileInfo(
+                        commit_hash=local_metadata.commit_hash,
+                        file_size=os.path.getsize(local_path),
+                        filename=filename,
+                        is_cached=True,
+                        local_path=local_path,
+                        will_download=force_download,
+                    )
+                if not force_download:
+                    return local_path
+
         # Otherwise => raise
         _raise_on_head_call_error(head_call_error, force_download, local_files_only)
 
@@ -1260,6 +1295,15 @@ def _hf_hub_download_to_local_dir(
         # etag matches => update metadata and return file
         if local_metadata is not None and local_metadata.etag == etag:
             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=expected_size,
+                    filename=filename,
+                    is_cached=True,
+                    local_path=str(paths.file_path),
+                    will_download=False,
+                )
             return str(paths.file_path)
 
         # metadata is outdated + etag is a sha256
@@ -1271,6 +1315,15 @@ def _hf_hub_download_to_local_dir(
                 file_hash = sha_fileobj(f).hex()
             if file_hash == etag:
                 write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+                if dry_run:
+                    return DryRunFileInfo(
+                        commit_hash=commit_hash,
+                        file_size=expected_size,
+                        filename=filename,
+                        is_cached=True,
+                        local_path=str(paths.file_path),
+                        will_download=False,
+                    )
                 return str(paths.file_path)
 
     # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
@@ -1289,8 +1342,28 @@ def _hf_hub_download_to_local_dir(
             paths.file_path.parent.mkdir(parents=True, exist_ok=True)
             shutil.copyfile(cached_path, paths.file_path)
             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=expected_size,
+                    filename=filename,
+                    is_cached=True,
+                    local_path=str(paths.file_path),
+                    will_download=False,
+                )
             return str(paths.file_path)
 
+    if dry_run:
+        is_cached = paths.file_path.is_file()
+        return DryRunFileInfo(
+            commit_hash=commit_hash,
+            file_size=expected_size,
+            filename=filename,
+            is_cached=is_cached,
+            local_path=str(paths.file_path),
+            will_download=force_download or not is_cached,
+        )
+
     # Otherwise, let's download the file!
     with WeakFileLock(paths.lock_path):
         paths.file_path.unlink(missing_ok=True)  # delete outdated file first
@@ -1298,7 +1371,6 @@ def _hf_hub_download_to_local_dir(
             incomplete_path=paths.incomplete_path(etag),
             destination_path=paths.file_path,
             url_to_download=url_to_download,
-            proxies=proxies,
             headers=headers,
             expected_size=expected_size,
             filename=filename,
@@ -1408,12 +1480,11 @@ def try_to_load_from_cache(
 def get_hf_file_metadata(
     url: str,
     token: Union[bool, str, None] = None,
-    proxies: Optional[Dict] = None,
     timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
-    user_agent: Union[
-    headers: Optional[
+    user_agent: Union[dict, str, None] = None,
+    headers: Optional[dict[str, str]] = None,
     endpoint: Optional[str] = None,
 ) -> HfFileMetadata:
     """Fetch metadata of a file versioned on the Hub for a given url.
@@ -1427,9 +1498,6 @@ def get_hf_file_metadata(
             folder.
             - If `False` or `None`, no token is provided.
             - If a string, it's used as the authentication token.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         timeout (`float`, *optional*, defaults to 10):
             How many seconds to wait for the server to send metadata before giving up.
         library_name (`str`, *optional*):
@@ -1457,31 +1525,23 @@ def get_hf_file_metadata(
     hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file
 
     # Retrieve metadata
-
-
-        url=url,
-        headers=hf_headers,
-        allow_redirects=False,
-        follow_relative_redirects=True,
-        proxies=proxies,
-        timeout=timeout,
-    )
-    hf_raise_for_status(r)
+    response = _httpx_follow_relative_redirects(method="HEAD", url=url, headers=hf_headers, timeout=timeout)
+    hf_raise_for_status(response)
 
     # Return
     return HfFileMetadata(
-        commit_hash=
-        # We favor a custom header indicating the etag of the linked resource, and
-
-
+        commit_hash=response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+        # We favor a custom header indicating the etag of the linked resource, and we fallback to the regular etag header.
+        etag=_normalize_etag(
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or response.headers.get("ETag")
+        ),
         # Either from response headers (if redirected) or defaults to request url
-        # Do not use directly `url
-        #
-        location=r.headers.get("Location") or r.request.url,  # type: ignore
+        # Do not use directly `url` as we might have followed relative redirects.
+        location=response.headers.get("Location") or str(response.request.url),  # type: ignore
         size=_int_or_none(
-
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or response.headers.get("Content-Length")
         ),
-        xet_file_data=parse_xet_file_data_from_response(
+        xet_file_data=parse_xet_file_data_from_response(response, endpoint=endpoint),  # type: ignore
     )
 
 
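`get_hf_file_metadata` now issues its HEAD request through the httpx-based redirect helper but keeps the same public shape (commit hash, etag, location, size, xet metadata). A short usage sketch combining it with `hf_hub_url`, both public API; the repo and filename are placeholders, and the printed fields are exactly those populated above:

from huggingface_hub import get_hf_file_metadata, hf_hub_url

# Fetch metadata for a file without downloading it.
url = hf_hub_url(repo_id="gpt2", filename="config.json")
meta = get_hf_file_metadata(url)
print(meta.commit_hash, meta.etag, meta.size)  # size is the real (LFS-resolved) file size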
@@ -1492,19 +1552,18 @@ def _get_metadata_or_catch_error(
     repo_type: str,
     revision: str,
     endpoint: Optional[str],
-    proxies: Optional[Dict],
     etag_timeout: Optional[float],
-    headers:
+    headers: dict[str, str],  # mutated inplace!
     token: Union[bool, str, None],
     local_files_only: bool,
     relative_filename: Optional[str] = None,  # only used to store `.no_exists` in cache
     storage_folder: Optional[str] = None,  # only used to store `.no_exists` in cache
 ) -> Union[
     # Either an exception is caught and returned
-
+    tuple[None, None, None, None, None, Exception],
     # Or the metadata is returned as
     # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
-
+    tuple[str, str, str, int, Optional[XetFileData], None],
 ]:
     """Get metadata for a file on the Hub, safely handling network issues.
 
@@ -1541,9 +1600,9 @@ def _get_metadata_or_catch_error(
         try:
             try:
                 metadata = get_hf_file_metadata(
-                    url=url,
+                    url=url, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
                 )
-            except
+            except RemoteEntryNotFoundError as http_error:
                 if storage_folder is not None and relative_filename is not None:
                     # Cache the non-existence of the file
                     commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
@@ -1594,21 +1653,17 @@ def _get_metadata_or_catch_error(
             if urlparse(url).netloc != urlparse(metadata.location).netloc:
                 # Remove authorization header when downloading a LFS blob
                 headers.pop("authorization", None)
-    except
-        # Actually raise
+    except httpx.ProxyError:
+        # Actually raise on proxy error
         raise
-    except (
-        requests.exceptions.ConnectionError,
-        requests.exceptions.Timeout,
-        OfflineModeIsEnabled,
-    ) as error:
+    except (httpx.ConnectError, httpx.TimeoutException, OfflineModeIsEnabled) as error:
         # Otherwise, our Internet connection is down.
         # etag is None
         head_error_call = error
-    except (RevisionNotFoundError,
+    except (RevisionNotFoundError, RemoteEntryNotFoundError):
         # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
         raise
-    except
+    except HfHubHTTPError as error:
         # Multiple reasons for an http error:
         # - Repository is private and invalid/missing token sent
         # - Repository is gated and invalid/missing token sent
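With the move from `requests` to `httpx`, transient failures now surface as `httpx.ConnectError` / `httpx.TimeoutException`, and a missing file raises `RemoteEntryNotFoundError` (imported from `.errors` at the top of this file) where the old code used `EntryNotFoundError`. A hedged sketch of caller-side handling under the new error types (the repo and filename are placeholders):

import httpx
from huggingface_hub import hf_hub_download
from huggingface_hub.errors import LocalEntryNotFoundError, RemoteEntryNotFoundError

try:
    path = hf_hub_download("gpt2", "does-not-exist.bin")
except RemoteEntryNotFoundError:
    print("file is not in the repo")               # 404 from the Hub
except LocalEntryNotFoundError:
    print("offline and not found in the cache")    # network unavailable, nothing on disk
except (httpx.ConnectError, httpx.TimeoutException):
    print("transient network problem, retry later")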
@@ -1666,8 +1721,7 @@ def _download_to_tmp_and_move(
     incomplete_path: Path,
     destination_path: Path,
     url_to_download: str,
-
-    headers: Dict[str, str],
+    headers: dict[str, str],
     expected_size: Optional[int],
     filename: str,
     force_download: bool,
@@ -1679,7 +1733,7 @@ def _download_to_tmp_and_move(
     Internal logic:
     - return early if file is already downloaded
     - resume download if possible (from incomplete file)
-    - do not resume download if `force_download=True`
+    - do not resume download if `force_download=True`
     - check disk space before downloading
     - download content to a temporary file
     - set correct permissions on temporary file
@@ -1691,16 +1745,11 @@ def _download_to_tmp_and_move(
         # Do nothing if already exists (except if force_download=True)
         return
 
-    if incomplete_path.exists() and
+    if incomplete_path.exists() and force_download:
         # By default, we will try to resume the download if possible.
-        # However, if the user has set `force_download=True
+        # However, if the user has set `force_download=True`, then we should
         # not resume the download => delete the incomplete file.
-
-        if force_download:
-            message += " (force_download=True)"
-        elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
-            message += " (hf_transfer=True)"
-        logger.info(message)
+        logger.info(f"Removing incomplete file '{incomplete_path}' (force_download=True)")
         incomplete_path.unlink(missing_ok=True)
 
     with incomplete_path.open("ab") as f:
@@ -1735,7 +1784,6 @@ def _download_to_tmp_and_move(
             http_get(
                 url_to_download,
                 f,
-                proxies=proxies,
                 resume_size=resume_size,
                 headers=headers,
                 expected_size=expected_size,