huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +145 -46
- huggingface_hub/_commit_api.py +168 -119
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +15 -12
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +239 -80
- huggingface_hub/_space_api.py +5 -5
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +172 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +13 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +38 -53
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +80 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +435 -351
- huggingface_hub/hf_api.py +2050 -1124
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +43 -63
- huggingface_hub/inference/_client.py +347 -434
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +397 -541
- huggingface_hub/inference/_generated/types/__init__.py +5 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +10 -10
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +82 -7
- huggingface_hub/inference/_providers/_common.py +129 -27
- huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
- huggingface_hub/inference/_providers/cerebras.py +1 -1
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +20 -3
- huggingface_hub/inference/_providers/fal_ai.py +183 -56
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +69 -30
- huggingface_hub/inference/_providers/hyperbolic.py +4 -4
- huggingface_hub/inference/_providers/nebius.py +33 -5
- huggingface_hub/inference/_providers/novita.py +5 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +3 -1
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +31 -13
- huggingface_hub/inference/_providers/sambanova.py +18 -4
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +20 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +57 -57
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +19 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +398 -239
- huggingface_hub/utils/_pagination.py +4 -4
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +61 -24
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +9 -9
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +64 -17
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +5 -4
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -474
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -314
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
- huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
huggingface_hub/file_download.py
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
1
|
import copy
|
|
2
2
|
import errno
|
|
3
|
-
import inspect
|
|
4
3
|
import os
|
|
5
4
|
import re
|
|
6
5
|
import shutil
|
|
@@ -10,26 +9,20 @@ import uuid
|
|
|
10
9
|
import warnings
|
|
11
10
|
from dataclasses import dataclass
|
|
12
11
|
from pathlib import Path
|
|
13
|
-
from typing import Any, BinaryIO,
|
|
12
|
+
from typing import Any, BinaryIO, Literal, NoReturn, Optional, Union, overload
|
|
14
13
|
from urllib.parse import quote, urlparse
|
|
15
14
|
|
|
16
|
-
import
|
|
15
|
+
import httpx
|
|
16
|
+
from tqdm.auto import tqdm as base_tqdm
|
|
17
17
|
|
|
18
|
-
from . import
|
|
19
|
-
__version__, # noqa: F401 # for backward compatibility
|
|
20
|
-
constants,
|
|
21
|
-
)
|
|
18
|
+
from . import constants
|
|
22
19
|
from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
|
|
23
|
-
from .constants import (
|
|
24
|
-
HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401 # for backward compatibility
|
|
25
|
-
HUGGINGFACE_HUB_CACHE, # noqa: F401 # for backward compatibility
|
|
26
|
-
)
|
|
27
20
|
from .errors import (
|
|
28
|
-
EntryNotFoundError,
|
|
29
21
|
FileMetadataError,
|
|
30
22
|
GatedRepoError,
|
|
31
23
|
HfHubHTTPError,
|
|
32
24
|
LocalEntryNotFoundError,
|
|
25
|
+
RemoteEntryNotFoundError,
|
|
33
26
|
RepositoryNotFoundError,
|
|
34
27
|
RevisionNotFoundError,
|
|
35
28
|
)
|
|
@@ -39,30 +32,15 @@ from .utils import (
|
|
|
39
32
|
WeakFileLock,
|
|
40
33
|
XetFileData,
|
|
41
34
|
build_hf_headers,
|
|
42
|
-
get_fastai_version, # noqa: F401 # for backward compatibility
|
|
43
|
-
get_fastcore_version, # noqa: F401 # for backward compatibility
|
|
44
|
-
get_graphviz_version, # noqa: F401 # for backward compatibility
|
|
45
|
-
get_jinja_version, # noqa: F401 # for backward compatibility
|
|
46
|
-
get_pydot_version, # noqa: F401 # for backward compatibility
|
|
47
|
-
get_tf_version, # noqa: F401 # for backward compatibility
|
|
48
|
-
get_torch_version, # noqa: F401 # for backward compatibility
|
|
49
35
|
hf_raise_for_status,
|
|
50
|
-
is_fastai_available, # noqa: F401 # for backward compatibility
|
|
51
|
-
is_fastcore_available, # noqa: F401 # for backward compatibility
|
|
52
|
-
is_graphviz_available, # noqa: F401 # for backward compatibility
|
|
53
|
-
is_jinja_available, # noqa: F401 # for backward compatibility
|
|
54
|
-
is_pydot_available, # noqa: F401 # for backward compatibility
|
|
55
|
-
is_tf_available, # noqa: F401 # for backward compatibility
|
|
56
|
-
is_torch_available, # noqa: F401 # for backward compatibility
|
|
57
36
|
logging,
|
|
58
37
|
parse_xet_file_data_from_response,
|
|
59
38
|
refresh_xet_connection_info,
|
|
60
|
-
reset_sessions,
|
|
61
39
|
tqdm,
|
|
62
40
|
validate_hf_hub_args,
|
|
63
41
|
)
|
|
64
|
-
from .utils._http import _adjust_range_header, http_backoff
|
|
65
|
-
from .utils._runtime import
|
|
42
|
+
from .utils._http import _adjust_range_header, http_backoff, http_stream_backoff
|
|
43
|
+
from .utils._runtime import is_xet_available
|
|
66
44
|
from .utils._typing import HTTP_METHOD_T
|
|
67
45
|
from .utils.sha import sha_fileobj
|
|
68
46
|
from .utils.tqdm import _get_progress_bar_context
|
|
@@ -83,7 +61,7 @@ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
|
|
|
83
61
|
# Regex to check if the file etag IS a valid sha256
|
|
84
62
|
REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
|
|
85
63
|
|
|
86
|
-
_are_symlinks_supported_in_dir:
|
|
64
|
+
_are_symlinks_supported_in_dir: dict[str, bool] = {}
|
|
87
65
|
|
|
88
66
|
|
|
89
67
|
def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
|
|
@@ -172,6 +150,34 @@ class HfFileMetadata:
|
|
|
172
150
|
xet_file_data: Optional[XetFileData]
|
|
173
151
|
|
|
174
152
|
|
|
153
|
+
@dataclass
|
|
154
|
+
class DryRunFileInfo:
|
|
155
|
+
"""Information returned when performing a dry run of a file download.
|
|
156
|
+
|
|
157
|
+
Returned by [`hf_hub_download`] when `dry_run=True`.
|
|
158
|
+
|
|
159
|
+
Args:
|
|
160
|
+
commit_hash (`str`):
|
|
161
|
+
The commit_hash related to the file.
|
|
162
|
+
file_size (`int`):
|
|
163
|
+
Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
|
|
164
|
+
filename (`str`):
|
|
165
|
+
Name of the file in the repo.
|
|
166
|
+
is_cached (`bool`):
|
|
167
|
+
Whether the file is already cached locally.
|
|
168
|
+
will_download (`bool`):
|
|
169
|
+
Whether the file will be downloaded if `hf_hub_download` is called with `dry_run=False`.
|
|
170
|
+
In practice, will_download is `True` if the file is not cached or if `force_download=True`.
|
|
171
|
+
"""
|
|
172
|
+
|
|
173
|
+
commit_hash: str
|
|
174
|
+
file_size: int
|
|
175
|
+
filename: str
|
|
176
|
+
local_path: str
|
|
177
|
+
is_cached: bool
|
|
178
|
+
will_download: bool
|
|
179
|
+
|
|
180
|
+
|
|
175
181
|
@validate_hf_hub_args
|
|
176
182
|
def hf_hub_url(
|
|
177
183
|
repo_id: str,
|
|
@@ -214,26 +220,23 @@ def hf_hub_url(
|
|
|
214
220
|
'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
|
|
215
221
|
```
|
|
216
222
|
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
if stored in git, or its sha256 if stored in git-lfs.
|
|
235
|
-
|
|
236
|
-
</Tip>
|
|
223
|
+
> [!TIP]
|
|
224
|
+
> Notes:
|
|
225
|
+
>
|
|
226
|
+
> Cloudfront is replicated over the globe so downloads are way faster for
|
|
227
|
+
> the end user (and it also lowers our bandwidth costs).
|
|
228
|
+
>
|
|
229
|
+
> Cloudfront aggressively caches files by default (default TTL is 24
|
|
230
|
+
> hours), however this is not an issue here because we implement a
|
|
231
|
+
> git-based versioning system on huggingface.co, which means that we store
|
|
232
|
+
> the files on S3/Cloudfront in a content-addressable way (i.e., the file
|
|
233
|
+
> name is its hash). Using content-addressable filenames means cache can't
|
|
234
|
+
> ever be stale.
|
|
235
|
+
>
|
|
236
|
+
> In terms of client-side caching from this library, we base our caching
|
|
237
|
+
> on the objects' entity tag (`ETag`), which is an identifier of a
|
|
238
|
+
> specific version of a resource [1]_. An object's ETag is: its git-sha1
|
|
239
|
+
> if stored in git, or its sha256 if stored in git-lfs.
|
|
237
240
|
|
|
238
241
|
References:
|
|
239
242
|
|
|
@@ -252,7 +255,7 @@ def hf_hub_url(
|
|
|
252
255
|
|
|
253
256
|
if revision is None:
|
|
254
257
|
revision = constants.DEFAULT_REVISION
|
|
255
|
-
url = HUGGINGFACE_CO_URL_TEMPLATE.format(
|
|
258
|
+
url = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
|
|
256
259
|
repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
|
|
257
260
|
)
|
|
258
261
|
# Update endpoint if provided
|
|
@@ -261,74 +264,71 @@ def hf_hub_url(
|
|
|
261
264
|
return url
|
|
262
265
|
|
|
263
266
|
|
|
264
|
-
def
|
|
265
|
-
|
|
266
|
-
) -> requests.Response:
|
|
267
|
-
"""Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
|
|
268
|
-
`allow_redirection=False`.
|
|
267
|
+
def _httpx_follow_relative_redirects(method: HTTP_METHOD_T, url: str, **httpx_kwargs) -> httpx.Response:
|
|
268
|
+
"""Perform an HTTP request with backoff and follow relative redirects only.
|
|
269
269
|
|
|
270
|
-
|
|
270
|
+
This is useful to follow a redirection to a renamed repository without following redirection to a CDN.
|
|
271
|
+
|
|
272
|
+
A backoff mechanism retries the HTTP call on 5xx errors and network errors.
|
|
271
273
|
|
|
272
274
|
Args:
|
|
273
275
|
method (`str`):
|
|
274
276
|
HTTP method, such as 'GET' or 'HEAD'.
|
|
275
277
|
url (`str`):
|
|
276
278
|
The URL of the resource to fetch.
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
|
|
280
|
-
following redirection to a CDN.
|
|
281
|
-
**params (`dict`, *optional*):
|
|
282
|
-
Params to pass to `requests.request`.
|
|
279
|
+
**httpx_kwargs (`dict`, *optional*):
|
|
280
|
+
Params to pass to `httpx.request`.
|
|
283
281
|
"""
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
response =
|
|
282
|
+
while True:
|
|
283
|
+
# Make the request
|
|
284
|
+
response = http_backoff(
|
|
287
285
|
method=method,
|
|
288
286
|
url=url,
|
|
289
|
-
|
|
290
|
-
|
|
287
|
+
**httpx_kwargs,
|
|
288
|
+
follow_redirects=False,
|
|
289
|
+
retry_on_exceptions=(),
|
|
290
|
+
retry_on_status_codes=(429,),
|
|
291
291
|
)
|
|
292
|
+
hf_raise_for_status(response)
|
|
292
293
|
|
|
293
|
-
#
|
|
294
|
-
# This is useful in case of a renamed repository.
|
|
294
|
+
# Check if response is a relative redirect
|
|
295
295
|
if 300 <= response.status_code <= 399:
|
|
296
296
|
parsed_target = urlparse(response.headers["Location"])
|
|
297
297
|
if parsed_target.netloc == "":
|
|
298
|
-
#
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
|
|
306
|
-
return response
|
|
307
|
-
|
|
308
|
-
# Perform request and return if status_code is not in the retry list.
|
|
309
|
-
response = http_backoff(method=method, url=url, **params, retry_on_exceptions=(), retry_on_status_codes=(429,))
|
|
310
|
-
hf_raise_for_status(response)
|
|
298
|
+
# Relative redirect -> update URL and retry
|
|
299
|
+
url = urlparse(url)._replace(path=parsed_target.path).geturl()
|
|
300
|
+
continue
|
|
301
|
+
|
|
302
|
+
# Break if no relative redirect
|
|
303
|
+
break
|
|
304
|
+
|
|
311
305
|
return response
|
|
312
306
|
|
|
313
307
|
|
|
314
|
-
def _get_file_length_from_http_response(response:
|
|
308
|
+
def _get_file_length_from_http_response(response: httpx.Response) -> Optional[int]:
|
|
315
309
|
"""
|
|
316
310
|
Get the length of the file from the HTTP response headers.
|
|
317
311
|
|
|
318
312
|
This function extracts the file size from the HTTP response headers, either from the
|
|
319
313
|
`Content-Range` or `Content-Length` header, if available (in that order).
|
|
320
|
-
The HTTP response object containing the headers.
|
|
321
|
-
`int` or `None`: The length of the file in bytes if the information is available,
|
|
322
|
-
otherwise `None`.
|
|
323
314
|
|
|
324
315
|
Args:
|
|
325
|
-
response (`
|
|
316
|
+
response (`httpx.Response`):
|
|
326
317
|
The HTTP response object.
|
|
327
318
|
|
|
328
319
|
Returns:
|
|
329
320
|
`int` or `None`: The length of the file in bytes, or None if not available.
|
|
330
321
|
"""
|
|
331
322
|
|
|
323
|
+
# If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
|
|
324
|
+
# contain the length of the compressed body, not the uncompressed file size.
|
|
325
|
+
# And at the start of transmission there's no way to know the uncompressed file size for gzip,
|
|
326
|
+
# thus we return None in that case.
|
|
327
|
+
content_encoding = response.headers.get("Content-Encoding", "identity").lower()
|
|
328
|
+
if content_encoding != "identity":
|
|
329
|
+
# gzip/br/deflate/zstd etc
|
|
330
|
+
return None
|
|
331
|
+
|
|
332
332
|
content_range = response.headers.get("Content-Range")
|
|
333
333
|
if content_range is not None:
|
|
334
334
|
return int(content_range.rsplit("/")[-1])
|
|
@@ -340,15 +340,16 @@ def _get_file_length_from_http_response(response: requests.Response) -> Optional
|
|
|
340
340
|
return None
|
|
341
341
|
|
|
342
342
|
|
|
343
|
+
@validate_hf_hub_args
|
|
343
344
|
def http_get(
|
|
344
345
|
url: str,
|
|
345
346
|
temp_file: BinaryIO,
|
|
346
347
|
*,
|
|
347
|
-
proxies: Optional[Dict] = None,
|
|
348
348
|
resume_size: int = 0,
|
|
349
|
-
headers: Optional[
|
|
349
|
+
headers: Optional[dict[str, Any]] = None,
|
|
350
350
|
expected_size: Optional[int] = None,
|
|
351
351
|
displayed_filename: Optional[str] = None,
|
|
352
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
352
353
|
_nb_retries: int = 5,
|
|
353
354
|
_tqdm_bar: Optional[tqdm] = None,
|
|
354
355
|
) -> None:
|
|
@@ -364,8 +365,6 @@ def http_get(
|
|
|
364
365
|
The URL of the file to download.
|
|
365
366
|
temp_file (`BinaryIO`):
|
|
366
367
|
The file-like object where to save the file.
|
|
367
|
-
proxies (`dict`, *optional*):
|
|
368
|
-
Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
|
|
369
368
|
resume_size (`int`, *optional*):
|
|
370
369
|
The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
|
|
371
370
|
positive number, the download will resume at the given position.
|
|
@@ -382,142 +381,85 @@ def http_get(
|
|
|
382
381
|
# If the file is already fully downloaded, we don't need to download it again.
|
|
383
382
|
return
|
|
384
383
|
|
|
385
|
-
has_custom_range_header = headers is not None and any(h.lower() == "range" for h in headers)
|
|
386
|
-
hf_transfer = None
|
|
387
|
-
if constants.HF_HUB_ENABLE_HF_TRANSFER:
|
|
388
|
-
if resume_size != 0:
|
|
389
|
-
warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
|
|
390
|
-
elif proxies is not None:
|
|
391
|
-
warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
|
|
392
|
-
elif has_custom_range_header:
|
|
393
|
-
warnings.warn("'hf_transfer' ignores custom 'Range' headers; falling back to regular download method")
|
|
394
|
-
else:
|
|
395
|
-
try:
|
|
396
|
-
import hf_transfer # type: ignore[no-redef]
|
|
397
|
-
except ImportError:
|
|
398
|
-
raise ValueError(
|
|
399
|
-
"Fast download using 'hf_transfer' is enabled"
|
|
400
|
-
" (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
|
|
401
|
-
" available in your environment. Try `pip install hf_transfer`."
|
|
402
|
-
)
|
|
403
|
-
|
|
404
384
|
initial_headers = headers
|
|
405
385
|
headers = copy.deepcopy(headers) or {}
|
|
406
386
|
if resume_size > 0:
|
|
407
387
|
headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
|
|
408
388
|
elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
|
|
409
|
-
# Any files over 50GB will not be available through basic http
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
else:
|
|
415
|
-
raise ValueError(
|
|
416
|
-
"The file is too large to be downloaded using the regular download method. Use `hf_transfer` or `hf_xet` instead."
|
|
417
|
-
" Try `pip install hf_transfer` or `pip install hf_xet`."
|
|
418
|
-
)
|
|
419
|
-
|
|
420
|
-
r = _request_wrapper(
|
|
421
|
-
method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
|
|
422
|
-
)
|
|
423
|
-
|
|
424
|
-
hf_raise_for_status(r)
|
|
425
|
-
content_length = _get_file_length_from_http_response(r)
|
|
426
|
-
|
|
427
|
-
# NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
|
|
428
|
-
# If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
|
|
429
|
-
total = resume_size + int(content_length) if content_length is not None else None
|
|
430
|
-
|
|
431
|
-
if displayed_filename is None:
|
|
432
|
-
displayed_filename = url
|
|
433
|
-
content_disposition = r.headers.get("Content-Disposition")
|
|
434
|
-
if content_disposition is not None:
|
|
435
|
-
match = HEADER_FILENAME_PATTERN.search(content_disposition)
|
|
436
|
-
if match is not None:
|
|
437
|
-
# Means file is on CDN
|
|
438
|
-
displayed_filename = match.groupdict()["filename"]
|
|
439
|
-
|
|
440
|
-
# Truncate filename if too long to display
|
|
441
|
-
if len(displayed_filename) > 40:
|
|
442
|
-
displayed_filename = f"(…){displayed_filename[-40:]}"
|
|
389
|
+
# Any files over 50GB will not be available through basic http requests.
|
|
390
|
+
raise ValueError(
|
|
391
|
+
"The file is too large to be downloaded using the regular download method. "
|
|
392
|
+
" Install `hf_xet` with `pip install hf_xet` for xet-powered downloads."
|
|
393
|
+
)
|
|
443
394
|
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
395
|
+
with http_stream_backoff(
|
|
396
|
+
method="GET",
|
|
397
|
+
url=url,
|
|
398
|
+
headers=headers,
|
|
399
|
+
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
|
|
400
|
+
retry_on_exceptions=(),
|
|
401
|
+
retry_on_status_codes=(429,),
|
|
402
|
+
) as response:
|
|
403
|
+
hf_raise_for_status(response)
|
|
404
|
+
total: Optional[int] = _get_file_length_from_http_response(response)
|
|
405
|
+
|
|
406
|
+
if displayed_filename is None:
|
|
407
|
+
displayed_filename = url
|
|
408
|
+
content_disposition = response.headers.get("Content-Disposition")
|
|
409
|
+
if content_disposition is not None:
|
|
410
|
+
match = HEADER_FILENAME_PATTERN.search(content_disposition)
|
|
411
|
+
if match is not None:
|
|
412
|
+
# Means file is on CDN
|
|
413
|
+
displayed_filename = match.groupdict()["filename"]
|
|
414
|
+
|
|
415
|
+
# Truncate filename if too long to display
|
|
416
|
+
if len(displayed_filename) > 40:
|
|
417
|
+
displayed_filename = f"(…){displayed_filename[-40:]}"
|
|
418
|
+
|
|
419
|
+
consistency_error_message = (
|
|
420
|
+
f"Consistency check failed: file should be of size {expected_size} but has size"
|
|
421
|
+
f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
|
|
422
|
+
" Please retry with `force_download=True`."
|
|
423
|
+
)
|
|
424
|
+
progress_cm = _get_progress_bar_context(
|
|
425
|
+
desc=displayed_filename,
|
|
426
|
+
log_level=logger.getEffectiveLevel(),
|
|
427
|
+
total=total,
|
|
428
|
+
initial=resume_size,
|
|
429
|
+
name="huggingface_hub.http_get",
|
|
430
|
+
tqdm_class=tqdm_class,
|
|
431
|
+
_tqdm_bar=_tqdm_bar,
|
|
432
|
+
)
|
|
457
433
|
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
|
|
461
|
-
if not supports_callback:
|
|
462
|
-
warnings.warn(
|
|
463
|
-
"You are using an outdated version of `hf_transfer`. "
|
|
464
|
-
"Consider upgrading to latest version to enable progress bars "
|
|
465
|
-
"using `pip install -U hf_transfer`."
|
|
466
|
-
)
|
|
434
|
+
with progress_cm as progress:
|
|
435
|
+
new_resume_size = resume_size
|
|
467
436
|
try:
|
|
468
|
-
|
|
437
|
+
for chunk in response.iter_bytes(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
|
|
438
|
+
if chunk: # filter out keep-alive new chunks
|
|
439
|
+
progress.update(len(chunk))
|
|
440
|
+
temp_file.write(chunk)
|
|
441
|
+
new_resume_size += len(chunk)
|
|
442
|
+
# Some data has been downloaded from the server so we reset the number of retries.
|
|
443
|
+
_nb_retries = 5
|
|
444
|
+
except (httpx.ConnectError, httpx.TimeoutException) as e:
|
|
445
|
+
# If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
|
|
446
|
+
# a transient error (network outage?). We log a warning message and try to resume the download a few times
|
|
447
|
+
# before giving up. Tre retry mechanism is basic but should be enough in most cases.
|
|
448
|
+
if _nb_retries <= 0:
|
|
449
|
+
logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
|
|
450
|
+
raise
|
|
451
|
+
logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
|
|
452
|
+
time.sleep(1)
|
|
453
|
+
return http_get(
|
|
469
454
|
url=url,
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
|
|
455
|
+
temp_file=temp_file,
|
|
456
|
+
resume_size=new_resume_size,
|
|
473
457
|
headers=initial_headers,
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
except Exception as e:
|
|
479
|
-
raise RuntimeError(
|
|
480
|
-
"An error occurred while downloading using `hf_transfer`. Consider"
|
|
481
|
-
" disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
|
|
482
|
-
) from e
|
|
483
|
-
if not supports_callback:
|
|
484
|
-
progress.update(total)
|
|
485
|
-
if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
|
|
486
|
-
raise EnvironmentError(
|
|
487
|
-
consistency_error_message.format(
|
|
488
|
-
actual_size=os.path.getsize(temp_file.name),
|
|
489
|
-
)
|
|
458
|
+
expected_size=expected_size,
|
|
459
|
+
tqdm_class=tqdm_class,
|
|
460
|
+
_nb_retries=_nb_retries - 1,
|
|
461
|
+
_tqdm_bar=_tqdm_bar,
|
|
490
462
|
)
|
|
491
|
-
return
|
|
492
|
-
new_resume_size = resume_size
|
|
493
|
-
try:
|
|
494
|
-
for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
|
|
495
|
-
if chunk: # filter out keep-alive new chunks
|
|
496
|
-
progress.update(len(chunk))
|
|
497
|
-
temp_file.write(chunk)
|
|
498
|
-
new_resume_size += len(chunk)
|
|
499
|
-
# Some data has been downloaded from the server so we reset the number of retries.
|
|
500
|
-
_nb_retries = 5
|
|
501
|
-
except (requests.ConnectionError, requests.ReadTimeout) as e:
|
|
502
|
-
# If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
|
|
503
|
-
# a transient error (network outage?). We log a warning message and try to resume the download a few times
|
|
504
|
-
# before giving up. Tre retry mechanism is basic but should be enough in most cases.
|
|
505
|
-
if _nb_retries <= 0:
|
|
506
|
-
logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
|
|
507
|
-
raise
|
|
508
|
-
logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
|
|
509
|
-
time.sleep(1)
|
|
510
|
-
reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects
|
|
511
|
-
return http_get(
|
|
512
|
-
url=url,
|
|
513
|
-
temp_file=temp_file,
|
|
514
|
-
proxies=proxies,
|
|
515
|
-
resume_size=new_resume_size,
|
|
516
|
-
headers=initial_headers,
|
|
517
|
-
expected_size=expected_size,
|
|
518
|
-
_nb_retries=_nb_retries - 1,
|
|
519
|
-
_tqdm_bar=_tqdm_bar,
|
|
520
|
-
)
|
|
521
463
|
|
|
522
464
|
if expected_size is not None and expected_size != temp_file.tell():
|
|
523
465
|
raise EnvironmentError(
|
|
@@ -531,9 +473,10 @@ def xet_get(
|
|
|
531
473
|
*,
|
|
532
474
|
incomplete_path: Path,
|
|
533
475
|
xet_file_data: XetFileData,
|
|
534
|
-
headers:
|
|
476
|
+
headers: dict[str, str],
|
|
535
477
|
expected_size: Optional[int] = None,
|
|
536
478
|
displayed_filename: Optional[str] = None,
|
|
479
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
537
480
|
_tqdm_bar: Optional[tqdm] = None,
|
|
538
481
|
) -> None:
|
|
539
482
|
"""
|
|
@@ -544,7 +487,7 @@ def xet_get(
|
|
|
544
487
|
The path to the file to download.
|
|
545
488
|
xet_file_data (`XetFileData`):
|
|
546
489
|
The file metadata needed to make the request to the xet storage service.
|
|
547
|
-
headers (`
|
|
490
|
+
headers (`dict[str, str]`):
|
|
548
491
|
The headers to send to the xet storage service.
|
|
549
492
|
expected_size (`int`, *optional*):
|
|
550
493
|
The expected size of the file to download. If set, the download will raise an error if the size of the
|
|
@@ -591,7 +534,7 @@ def xet_get(
|
|
|
591
534
|
|
|
592
535
|
connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
|
|
593
536
|
|
|
594
|
-
def token_refresher() ->
|
|
537
|
+
def token_refresher() -> tuple[str, int]:
|
|
595
538
|
connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
|
|
596
539
|
if connection_info is None:
|
|
597
540
|
raise ValueError("Failed to refresh token using xet metadata.")
|
|
@@ -616,6 +559,7 @@ def xet_get(
|
|
|
616
559
|
total=expected_size,
|
|
617
560
|
initial=0,
|
|
618
561
|
name="huggingface_hub.xet_get",
|
|
562
|
+
tqdm_class=tqdm_class,
|
|
619
563
|
_tqdm_bar=_tqdm_bar,
|
|
620
564
|
)
|
|
621
565
|
|
|
@@ -747,10 +691,10 @@ def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:
|
|
|
747
691
|
|
|
748
692
|
# Symlinks are not supported => let's move or copy the file.
|
|
749
693
|
if new_blob:
|
|
750
|
-
logger.
|
|
694
|
+
logger.debug(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
|
|
751
695
|
shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
|
|
752
696
|
else:
|
|
753
|
-
logger.
|
|
697
|
+
logger.debug(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
|
|
754
698
|
shutil.copyfile(abs_src, abs_dst)
|
|
755
699
|
|
|
756
700
|
|
|
@@ -806,6 +750,78 @@ def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
|
|
|
806
750
|
pass
|
|
807
751
|
|
|
808
752
|
|
|
753
|
+
@overload
|
|
754
|
+
def hf_hub_download(
|
|
755
|
+
repo_id: str,
|
|
756
|
+
filename: str,
|
|
757
|
+
*,
|
|
758
|
+
subfolder: Optional[str] = None,
|
|
759
|
+
repo_type: Optional[str] = None,
|
|
760
|
+
revision: Optional[str] = None,
|
|
761
|
+
library_name: Optional[str] = None,
|
|
762
|
+
library_version: Optional[str] = None,
|
|
763
|
+
cache_dir: Union[str, Path, None] = None,
|
|
764
|
+
local_dir: Union[str, Path, None] = None,
|
|
765
|
+
user_agent: Union[dict, str, None] = None,
|
|
766
|
+
force_download: bool = False,
|
|
767
|
+
etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
|
|
768
|
+
token: Union[bool, str, None] = None,
|
|
769
|
+
local_files_only: bool = False,
|
|
770
|
+
headers: Optional[dict[str, str]] = None,
|
|
771
|
+
endpoint: Optional[str] = None,
|
|
772
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
773
|
+
dry_run: Literal[False] = False,
|
|
774
|
+
) -> str: ...
|
|
775
|
+
|
|
776
|
+
|
|
777
|
+
@overload
|
|
778
|
+
def hf_hub_download(
|
|
779
|
+
repo_id: str,
|
|
780
|
+
filename: str,
|
|
781
|
+
*,
|
|
782
|
+
subfolder: Optional[str] = None,
|
|
783
|
+
repo_type: Optional[str] = None,
|
|
784
|
+
revision: Optional[str] = None,
|
|
785
|
+
library_name: Optional[str] = None,
|
|
786
|
+
library_version: Optional[str] = None,
|
|
787
|
+
cache_dir: Union[str, Path, None] = None,
|
|
788
|
+
local_dir: Union[str, Path, None] = None,
|
|
789
|
+
user_agent: Union[dict, str, None] = None,
|
|
790
|
+
force_download: bool = False,
|
|
791
|
+
etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
|
|
792
|
+
token: Union[bool, str, None] = None,
|
|
793
|
+
local_files_only: bool = False,
|
|
794
|
+
headers: Optional[dict[str, str]] = None,
|
|
795
|
+
endpoint: Optional[str] = None,
|
|
796
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
797
|
+
dry_run: Literal[True] = True,
|
|
798
|
+
) -> DryRunFileInfo: ...
|
|
799
|
+
|
|
800
|
+
|
|
801
|
+
@overload
|
|
802
|
+
def hf_hub_download(
|
|
803
|
+
repo_id: str,
|
|
804
|
+
filename: str,
|
|
805
|
+
*,
|
|
806
|
+
subfolder: Optional[str] = None,
|
|
807
|
+
repo_type: Optional[str] = None,
|
|
808
|
+
revision: Optional[str] = None,
|
|
809
|
+
library_name: Optional[str] = None,
|
|
810
|
+
library_version: Optional[str] = None,
|
|
811
|
+
cache_dir: Union[str, Path, None] = None,
|
|
812
|
+
local_dir: Union[str, Path, None] = None,
|
|
813
|
+
user_agent: Union[dict, str, None] = None,
|
|
814
|
+
force_download: bool = False,
|
|
815
|
+
etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
|
|
816
|
+
token: Union[bool, str, None] = None,
|
|
817
|
+
local_files_only: bool = False,
|
|
818
|
+
headers: Optional[dict[str, str]] = None,
|
|
819
|
+
endpoint: Optional[str] = None,
|
|
820
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
821
|
+
dry_run: bool = False,
|
|
822
|
+
) -> Union[str, DryRunFileInfo]: ...
|
|
823
|
+
|
|
824
|
+
|
|
809
825
|
@validate_hf_hub_args
|
|
810
826
|
def hf_hub_download(
|
|
811
827
|
repo_id: str,
|
|
@@ -818,18 +834,16 @@ def hf_hub_download(
|
|
|
818
834
|
library_version: Optional[str] = None,
|
|
819
835
|
cache_dir: Union[str, Path, None] = None,
|
|
820
836
|
local_dir: Union[str, Path, None] = None,
|
|
821
|
-
user_agent: Union[
|
|
837
|
+
user_agent: Union[dict, str, None] = None,
|
|
822
838
|
force_download: bool = False,
|
|
823
|
-
proxies: Optional[Dict] = None,
|
|
824
839
|
etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
|
|
825
840
|
token: Union[bool, str, None] = None,
|
|
826
841
|
local_files_only: bool = False,
|
|
827
|
-
headers: Optional[
|
|
842
|
+
headers: Optional[dict[str, str]] = None,
|
|
828
843
|
endpoint: Optional[str] = None,
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
|
|
832
|
-
) -> str:
|
|
844
|
+
tqdm_class: Optional[type[base_tqdm]] = None,
|
|
845
|
+
dry_run: bool = False,
|
|
846
|
+
) -> Union[str, DryRunFileInfo]:
|
|
833
847
|
"""Download a given file if it's not already present in the local cache.
|
|
834
848
|
|
|
835
849
|
The new cache file layout looks like this:
|
|
@@ -891,9 +905,6 @@ def hf_hub_download(
|
|
|
891
905
|
force_download (`bool`, *optional*, defaults to `False`):
|
|
892
906
|
Whether the file should be downloaded even if it already exists in
|
|
893
907
|
the local cache.
|
|
894
|
-
proxies (`dict`, *optional*):
|
|
895
|
-
Dictionary mapping protocol to the URL of the proxy passed to
|
|
896
|
-
`requests.request`.
|
|
897
908
|
etag_timeout (`float`, *optional*, defaults to `10`):
|
|
898
909
|
When fetching ETag, how many seconds to wait for the server to send
|
|
899
910
|
data before giving up which is passed to `requests.request`.
|
|
@@ -907,9 +918,19 @@ def hf_hub_download(
|
|
|
907
918
|
local cached file if it exists.
|
|
908
919
|
headers (`dict`, *optional*):
|
|
909
920
|
Additional headers to be sent with the request.
|
|
921
|
+
tqdm_class (`tqdm`, *optional*):
|
|
922
|
+
If provided, overwrites the default behavior for the progress bar. Passed
|
|
923
|
+
argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
|
|
924
|
+
Defaults to the custom HF progress bar that can be disabled by setting
|
|
925
|
+
`HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
|
|
926
|
+
dry_run (`bool`, *optional*, defaults to `False`):
|
|
927
|
+
If `True`, perform a dry run without actually downloading the file. Returns a
|
|
928
|
+
[`DryRunFileInfo`] object containing information about what would be downloaded.
|
|
910
929
|
|
|
911
930
|
Returns:
|
|
912
|
-
`str
|
|
931
|
+
`str` or [`DryRunFileInfo`]:
|
|
932
|
+
- If `dry_run=False`: Local path of file or if networking is off, last version of file cached on disk.
|
|
933
|
+
- If `dry_run=True`: A [`DryRunFileInfo`] object containing download information.
|
|
913
934
|
|
|
914
935
|
Raises:
|
|
915
936
|
[`~utils.RepositoryNotFoundError`]
|
|
@@ -917,7 +938,7 @@ def hf_hub_download(
|
|
|
917
938
|
or because it is set to `private` and you do not have access.
|
|
918
939
|
[`~utils.RevisionNotFoundError`]
|
|
919
940
|
If the revision to download from cannot be found.
|
|
920
|
-
[`~utils.
|
|
941
|
+
[`~utils.RemoteEntryNotFoundError`]
|
|
921
942
|
If the file to download cannot be found.
|
|
922
943
|
[`~utils.LocalEntryNotFoundError`]
|
|
923
944
|
If network is disabled or unavailable and file is not found in cache.
|
|
@@ -933,20 +954,6 @@ def hf_hub_download(
|
|
|
933
954
|
# Respect environment variable above user value
|
|
934
955
|
etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
|
|
935
956
|
|
|
936
|
-
if force_filename is not None:
|
|
937
|
-
warnings.warn(
|
|
938
|
-
"The `force_filename` parameter is deprecated as a new caching system, "
|
|
939
|
-
"which keeps the filenames as they are on the Hub, is now in place.",
|
|
940
|
-
FutureWarning,
|
|
941
|
-
)
|
|
942
|
-
if resume_download is not None:
|
|
943
|
-
warnings.warn(
|
|
944
|
-
"`resume_download` is deprecated and will be removed in version 1.0.0. "
|
|
945
|
-
"Downloads always resume when possible. "
|
|
946
|
-
"If you want to force a new download, use `force_download=True`.",
|
|
947
|
-
FutureWarning,
|
|
948
|
-
)
|
|
949
|
-
|
|
950
957
|
if cache_dir is None:
|
|
951
958
|
cache_dir = constants.HF_HUB_CACHE
|
|
952
959
|
if revision is None:
|
|
@@ -976,15 +983,6 @@ def hf_hub_download(
|
|
|
976
983
|
)
|
|
977
984
|
|
|
978
985
|
if local_dir is not None:
|
|
979
|
-
if local_dir_use_symlinks != "auto":
|
|
980
|
-
warnings.warn(
|
|
981
|
-
"`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
|
|
982
|
-
"The process to download files to a local folder has been updated and do "
|
|
983
|
-
"not rely on symlinks anymore. You only need to pass a destination folder "
|
|
984
|
-
"as`local_dir`.\n"
|
|
985
|
-
"For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
|
|
986
|
-
)
|
|
987
|
-
|
|
988
986
|
return _hf_hub_download_to_local_dir(
|
|
989
987
|
# Destination
|
|
990
988
|
local_dir=local_dir,
|
|
@@ -997,12 +995,13 @@ def hf_hub_download(
|
|
|
997
995
|
endpoint=endpoint,
|
|
998
996
|
etag_timeout=etag_timeout,
|
|
999
997
|
headers=hf_headers,
|
|
1000
|
-
proxies=proxies,
|
|
1001
998
|
token=token,
|
|
1002
999
|
# Additional options
|
|
1003
1000
|
cache_dir=cache_dir,
|
|
1004
1001
|
force_download=force_download,
|
|
1005
1002
|
local_files_only=local_files_only,
|
|
1003
|
+
tqdm_class=tqdm_class,
|
|
1004
|
+
dry_run=dry_run,
|
|
1006
1005
|
)
|
|
1007
1006
|
else:
|
|
1008
1007
|
return _hf_hub_download_to_cache_dir(
|
|
@@ -1017,11 +1016,12 @@ def hf_hub_download(
|
|
|
1017
1016
|
endpoint=endpoint,
|
|
1018
1017
|
etag_timeout=etag_timeout,
|
|
1019
1018
|
headers=hf_headers,
|
|
1020
|
-
proxies=proxies,
|
|
1021
1019
|
token=token,
|
|
1022
1020
|
# Additional options
|
|
1023
1021
|
local_files_only=local_files_only,
|
|
1024
1022
|
force_download=force_download,
|
|
1023
|
+
tqdm_class=tqdm_class,
|
|
1024
|
+
dry_run=dry_run,
|
|
1025
1025
|
)
|
|
1026
1026
|
|
|
1027
1027
|
|
|
@@ -1037,13 +1037,14 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1037
1037
|
# HTTP info
|
|
1038
1038
|
endpoint: Optional[str],
|
|
1039
1039
|
etag_timeout: float,
|
|
1040
|
-
headers:
|
|
1041
|
-
proxies: Optional[Dict],
|
|
1040
|
+
headers: dict[str, str],
|
|
1042
1041
|
token: Optional[Union[bool, str]],
|
|
1043
1042
|
# Additional options
|
|
1044
1043
|
local_files_only: bool,
|
|
1045
1044
|
force_download: bool,
|
|
1046
|
-
|
|
1045
|
+
tqdm_class: Optional[type[base_tqdm]],
|
|
1046
|
+
dry_run: bool,
|
|
1047
|
+
) -> Union[str, DryRunFileInfo]:
|
|
1047
1048
|
"""Download a given file to a cache folder, if not already present.
|
|
1048
1049
|
|
|
1049
1050
|
Method should not be called directly. Please use `hf_hub_download` instead.
|
|
@@ -1051,7 +1052,7 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1051
1052
|
locks_dir = os.path.join(cache_dir, ".locks")
|
|
1052
1053
|
storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
|
|
1053
1054
|
|
|
1054
|
-
# cross
|
|
1055
|
+
# cross-platform transcription of filename, to be used as a local file path.
|
|
1055
1056
|
relative_filename = os.path.join(*filename.split("/"))
|
|
1056
1057
|
if os.name == "nt":
|
|
1057
1058
|
if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
|
|
@@ -1063,8 +1064,18 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1063
1064
|
# if user provides a commit_hash and they already have the file on disk, shortcut everything.
|
|
1064
1065
|
if REGEX_COMMIT_HASH.match(revision):
|
|
1065
1066
|
pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
|
|
1066
|
-
if os.path.exists(pointer_path)
|
|
1067
|
-
|
|
1067
|
+
if os.path.exists(pointer_path):
|
|
1068
|
+
if dry_run:
|
|
1069
|
+
return DryRunFileInfo(
|
|
1070
|
+
commit_hash=revision,
|
|
1071
|
+
file_size=os.path.getsize(pointer_path),
|
|
1072
|
+
filename=filename,
|
|
1073
|
+
is_cached=True,
|
|
1074
|
+
local_path=pointer_path,
|
|
1075
|
+
will_download=force_download,
|
|
1076
|
+
)
|
|
1077
|
+
if not force_download:
|
|
1078
|
+
return pointer_path
|
|
1068
1079
|
|
|
1069
1080
|
# Try to get metadata (etag, commit_hash, url, size) from the server.
|
|
1070
1081
|
# If we can't, a HEAD request error is returned.
|
|
@@ -1074,7 +1085,6 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1074
1085
|
repo_type=repo_type,
|
|
1075
1086
|
revision=revision,
|
|
1076
1087
|
endpoint=endpoint,
|
|
1077
|
-
proxies=proxies,
|
|
1078
1088
|
etag_timeout=etag_timeout,
|
|
1079
1089
|
headers=headers,
|
|
1080
1090
|
token=token,
|
|
@@ -1108,8 +1118,18 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1108
1118
|
# Return pointer file if exists
|
|
1109
1119
|
if commit_hash is not None:
|
|
1110
1120
|
pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
|
|
1111
|
-
if os.path.exists(pointer_path)
|
|
1112
|
-
|
|
1121
|
+
if os.path.exists(pointer_path):
|
|
1122
|
+
if dry_run:
|
|
1123
|
+
return DryRunFileInfo(
|
|
1124
|
+
commit_hash=commit_hash,
|
|
1125
|
+
file_size=os.path.getsize(pointer_path),
|
|
1126
|
+
filename=filename,
|
|
1127
|
+
is_cached=True,
|
|
1128
|
+
local_path=pointer_path,
|
|
1129
|
+
will_download=force_download,
|
|
1130
|
+
)
|
|
1131
|
+
if not force_download:
|
|
1132
|
+
return pointer_path
|
|
1113
1133
|
|
|
1114
1134
|
# Otherwise, raise appropriate error
|
|
1115
1135
|
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
|
|
@@ -1122,6 +1142,17 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1122
1142
|
blob_path = os.path.join(storage_folder, "blobs", etag)
|
|
1123
1143
|
pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
|
|
1124
1144
|
|
|
1145
|
+
if dry_run:
|
|
1146
|
+
is_cached = os.path.exists(pointer_path) or os.path.exists(blob_path)
|
|
1147
|
+
return DryRunFileInfo(
|
|
1148
|
+
commit_hash=commit_hash,
|
|
1149
|
+
file_size=expected_size,
|
|
1150
|
+
filename=filename,
|
|
1151
|
+
is_cached=is_cached,
|
|
1152
|
+
local_path=pointer_path,
|
|
1153
|
+
will_download=force_download or not is_cached,
|
|
1154
|
+
)
|
|
1155
|
+
|
|
1125
1156
|
os.makedirs(os.path.dirname(blob_path), exist_ok=True)
|
|
1126
1157
|
os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
|
|
1127
1158
|
|
|
@@ -1130,43 +1161,53 @@ def _hf_hub_download_to_cache_dir(
|
|
|
1130
1161
|
# In that case store a ref.
|
|
1131
1162
|
_cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
|
|
1132
1163
|
|
|
1133
|
-
# If file already exists, return it (except if force_download=True)
|
|
1134
|
-
if not force_download:
|
|
1135
|
-
if os.path.exists(pointer_path):
|
|
1136
|
-
return pointer_path
|
|
1137
|
-
|
|
1138
|
-
if os.path.exists(blob_path):
|
|
1139
|
-
# we have the blob already, but not the pointer
|
|
1140
|
-
_create_symlink(blob_path, pointer_path, new_blob=False)
|
|
1141
|
-
return pointer_path
|
|
1142
|
-
|
|
1143
1164
|
# Prevent parallel downloads of the same file with a lock.
|
|
1144
1165
|
# etag could be duplicated across repos,
|
|
1145
1166
|
lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
|
|
1146
1167
|
|
|
1147
1168
|
# Some Windows versions do not allow for paths longer than 255 characters.
|
|
1148
1169
|
# In this case, we must specify it as an extended path by using the "\\?\" prefix.
|
|
1149
|
-
if
|
|
1170
|
+
if (
|
|
1171
|
+
os.name == "nt"
|
|
1172
|
+
and len(os.path.abspath(lock_path)) > 255
|
|
1173
|
+
and not os.path.abspath(lock_path).startswith("\\\\?\\")
|
|
1174
|
+
):
|
|
1150
1175
|
lock_path = "\\\\?\\" + os.path.abspath(lock_path)
|
|
1151
1176
|
|
|
1152
|
-
if
|
|
1177
|
+
if (
|
|
1178
|
+
os.name == "nt"
|
|
1179
|
+
and len(os.path.abspath(blob_path)) > 255
|
|
1180
|
+
and not os.path.abspath(blob_path).startswith("\\\\?\\")
|
|
1181
|
+
):
|
|
1153
1182
|
blob_path = "\\\\?\\" + os.path.abspath(blob_path)
|
|
1154
1183
|
|
|
1184
|
+
Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
|
|
1185
|
+
|
|
1186
|
+
# pointer already exists -> immediate return
|
|
1187
|
+
if not force_download and os.path.exists(pointer_path):
|
|
1188
|
+
return pointer_path
|
|
1189
|
+
|
|
1190
|
+
# Blob exists but pointer must be (safely) created -> take the lock
|
|
1191
|
+
if not force_download and os.path.exists(blob_path):
|
|
1192
|
+
with WeakFileLock(lock_path):
|
|
1193
|
+
if not os.path.exists(pointer_path):
|
|
1194
|
+
_create_symlink(blob_path, pointer_path, new_blob=False)
|
|
1195
|
+
return pointer_path
|
|
1196
|
+
|
|
1155
1197
|
# Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
|
|
1156
1198
|
|
|
1157
|
-
Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
|
|
1158
1199
|
with WeakFileLock(lock_path):
|
|
1159
1200
|
_download_to_tmp_and_move(
|
|
1160
1201
|
incomplete_path=Path(blob_path + ".incomplete"),
|
|
1161
1202
|
destination_path=Path(blob_path),
|
|
1162
1203
|
url_to_download=url_to_download,
|
|
1163
|
-
proxies=proxies,
|
|
1164
1204
|
headers=headers,
|
|
1165
1205
|
expected_size=expected_size,
|
|
1166
1206
|
filename=filename,
|
|
1167
1207
|
force_download=force_download,
|
|
1168
1208
|
etag=etag,
|
|
1169
1209
|
xet_file_data=xet_file_data,
|
|
1210
|
+
tqdm_class=tqdm_class,
|
|
1170
1211
|
)
|
|
1171
1212
|
if not os.path.exists(pointer_path):
|
|
1172
1213
|
_create_symlink(blob_path, pointer_path, new_blob=True)
|
|
@@ -1186,14 +1227,15 @@ def _hf_hub_download_to_local_dir(
|
|
|
1186
1227
|
# HTTP info
|
|
1187
1228
|
endpoint: Optional[str],
|
|
1188
1229
|
etag_timeout: float,
|
|
1189
|
-
headers:
|
|
1190
|
-
proxies: Optional[Dict],
|
|
1230
|
+
headers: dict[str, str],
|
|
1191
1231
|
token: Union[bool, str, None],
|
|
1192
1232
|
# Additional options
|
|
1193
1233
|
cache_dir: str,
|
|
1194
1234
|
force_download: bool,
|
|
1195
1235
|
local_files_only: bool,
|
|
1196
|
-
|
|
1236
|
+
tqdm_class: Optional[type[base_tqdm]],
|
|
1237
|
+
dry_run: bool,
|
|
1238
|
+
) -> Union[str, DryRunFileInfo]:
|
|
1197
1239
|
"""Download a given file to a local folder, if not already present.
|
|
1198
1240
|
|
|
1199
1241
|
Method should not be called directly. Please use `hf_hub_download` instead.
|
|
@@ -1208,13 +1250,23 @@ def _hf_hub_download_to_local_dir(
|
|
|
1208
1250
|
|
|
1209
1251
|
# Local file exists + metadata exists + commit_hash matches => return file
|
|
1210
1252
|
if (
|
|
1211
|
-
|
|
1212
|
-
and REGEX_COMMIT_HASH.match(revision)
|
|
1253
|
+
REGEX_COMMIT_HASH.match(revision)
|
|
1213
1254
|
and paths.file_path.is_file()
|
|
1214
1255
|
and local_metadata is not None
|
|
1215
1256
|
and local_metadata.commit_hash == revision
|
|
1216
1257
|
):
|
|
1217
|
-
|
|
1258
|
+
local_file = str(paths.file_path)
|
|
1259
|
+
if dry_run:
|
|
1260
|
+
return DryRunFileInfo(
|
|
1261
|
+
commit_hash=revision,
|
|
1262
|
+
file_size=os.path.getsize(local_file),
|
|
1263
|
+
filename=filename,
|
|
1264
|
+
is_cached=True,
|
|
1265
|
+
local_path=local_file,
|
|
1266
|
+
will_download=force_download,
|
|
1267
|
+
)
|
|
1268
|
+
if not force_download:
|
|
1269
|
+
return local_file
|
|
1218
1270
|
|
|
1219
1271
|
# Local file doesn't exist or commit_hash doesn't match => we need the etag
|
|
1220
1272
|
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
|
|
@@ -1223,7 +1275,6 @@ def _hf_hub_download_to_local_dir(
|
|
|
1223
1275
|
repo_type=repo_type,
|
|
1224
1276
|
revision=revision,
|
|
1225
1277
|
endpoint=endpoint,
|
|
1226
|
-
proxies=proxies,
|
|
1227
1278
|
etag_timeout=etag_timeout,
|
|
1228
1279
|
headers=headers,
|
|
1229
1280
|
token=token,
|
|
@@ -1232,11 +1283,24 @@ def _hf_hub_download_to_local_dir(
|
|
|
1232
1283
|
|
|
1233
1284
|
if head_call_error is not None:
|
|
1234
1285
|
# No HEAD call but local file exists => default to local file
|
|
1235
|
-
if
|
|
1236
|
-
|
|
1237
|
-
|
|
1238
|
-
|
|
1239
|
-
|
|
1286
|
+
if paths.file_path.is_file():
|
|
1287
|
+
if dry_run or not force_download:
|
|
1288
|
+
logger.warning(
|
|
1289
|
+
f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
|
|
1290
|
+
)
|
|
1291
|
+
local_path = str(paths.file_path)
|
|
1292
|
+
if dry_run and local_metadata is not None:
|
|
1293
|
+
return DryRunFileInfo(
|
|
1294
|
+
commit_hash=local_metadata.commit_hash,
|
|
1295
|
+
file_size=os.path.getsize(local_path),
|
|
1296
|
+
filename=filename,
|
|
1297
|
+
is_cached=True,
|
|
1298
|
+
local_path=local_path,
|
|
1299
|
+
will_download=force_download,
|
|
1300
|
+
)
|
|
1301
|
+
if not force_download:
|
|
1302
|
+
return local_path
|
|
1303
|
+
|
|
1240
1304
|
# Otherwise => raise
|
|
1241
1305
|
_raise_on_head_call_error(head_call_error, force_download, local_files_only)
|
|
1242
1306
|
|
|
@@ -1251,6 +1315,15 @@ def _hf_hub_download_to_local_dir(
|
|
|
1251
1315
|
# etag matches => update metadata and return file
|
|
1252
1316
|
if local_metadata is not None and local_metadata.etag == etag:
|
|
1253
1317
|
write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
|
|
1318
|
+
if dry_run:
|
|
1319
|
+
return DryRunFileInfo(
|
|
1320
|
+
commit_hash=commit_hash,
|
|
1321
|
+
file_size=expected_size,
|
|
1322
|
+
filename=filename,
|
|
1323
|
+
is_cached=True,
|
|
1324
|
+
local_path=str(paths.file_path),
|
|
1325
|
+
will_download=False,
|
|
1326
|
+
)
|
|
1254
1327
|
return str(paths.file_path)
|
|
1255
1328
|
|
|
1256
1329
|
# metadata is outdated + etag is a sha256
|
|
@@ -1262,6 +1335,15 @@ def _hf_hub_download_to_local_dir(
|
|
|
1262
1335
|
file_hash = sha_fileobj(f).hex()
|
|
1263
1336
|
if file_hash == etag:
|
|
1264
1337
|
write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
|
|
1338
|
+
if dry_run:
|
|
1339
|
+
return DryRunFileInfo(
|
|
1340
|
+
commit_hash=commit_hash,
|
|
1341
|
+
file_size=expected_size,
|
|
1342
|
+
filename=filename,
|
|
1343
|
+
is_cached=True,
|
|
1344
|
+
local_path=str(paths.file_path),
|
|
1345
|
+
will_download=False,
|
|
1346
|
+
)
|
|
1265
1347
|
return str(paths.file_path)
|
|
1266
1348
|
|
|
1267
1349
|
# Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
|
|
@@ -1280,8 +1362,28 @@ def _hf_hub_download_to_local_dir(
|
|
|
1280
1362
|
paths.file_path.parent.mkdir(parents=True, exist_ok=True)
|
|
1281
1363
|
shutil.copyfile(cached_path, paths.file_path)
|
|
1282
1364
|
write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
|
|
1365
|
+
if dry_run:
|
|
1366
|
+
return DryRunFileInfo(
|
|
1367
|
+
commit_hash=commit_hash,
|
|
1368
|
+
file_size=expected_size,
|
|
1369
|
+
filename=filename,
|
|
1370
|
+
is_cached=True,
|
|
1371
|
+
local_path=str(paths.file_path),
|
|
1372
|
+
will_download=False,
|
|
1373
|
+
)
|
|
1283
1374
|
return str(paths.file_path)
|
|
1284
1375
|
|
|
1376
|
+
if dry_run:
|
|
1377
|
+
is_cached = paths.file_path.is_file()
|
|
1378
|
+
return DryRunFileInfo(
|
|
1379
|
+
commit_hash=commit_hash,
|
|
1380
|
+
file_size=expected_size,
|
|
1381
|
+
filename=filename,
|
|
1382
|
+
is_cached=is_cached,
|
|
1383
|
+
local_path=str(paths.file_path),
|
|
1384
|
+
will_download=force_download or not is_cached,
|
|
1385
|
+
)
|
|
1386
|
+
|
|
1285
1387
|
# Otherwise, let's download the file!
|
|
1286
1388
|
with WeakFileLock(paths.lock_path):
|
|
1287
1389
|
paths.file_path.unlink(missing_ok=True) # delete outdated file first
|
|
@@ -1289,13 +1391,13 @@ def _hf_hub_download_to_local_dir(
|
|
|
1289
1391
|
incomplete_path=paths.incomplete_path(etag),
|
|
1290
1392
|
destination_path=paths.file_path,
|
|
1291
1393
|
url_to_download=url_to_download,
|
|
1292
|
-
proxies=proxies,
|
|
1293
1394
|
headers=headers,
|
|
1294
1395
|
expected_size=expected_size,
|
|
1295
1396
|
filename=filename,
|
|
1296
1397
|
force_download=force_download,
|
|
1297
1398
|
etag=etag,
|
|
1298
1399
|
xet_file_data=xet_file_data,
|
|
1400
|
+
tqdm_class=tqdm_class,
|
|
1299
1401
|
)
|
|
1300
1402
|
|
|
1301
1403
|
write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
@@ -1399,12 +1501,12 @@ def try_to_load_from_cache(
 def get_hf_file_metadata(
     url: str,
     token: Union[bool, str, None] = None,
-    proxies: Optional[Dict] = None,
     timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
-    user_agent: Union[Dict, str, None] = None,
-    headers: Optional[Dict[str, str]] = None,
+    user_agent: Union[dict, str, None] = None,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
 ) -> HfFileMetadata:
     """Fetch metadata of a file versioned on the Hub for a given url.

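`get_hf_file_metadata` loses its `proxies` argument, switches its annotations to built-in generics, and gains an `endpoint` parameter. A short usage sketch against the new signature; `hf_hub_url` and `get_hf_file_metadata` are existing public helpers, and the explicit `endpoint` value simply repeats the documented default:

```python
from huggingface_hub import get_hf_file_metadata, hf_hub_url

url = hf_hub_url(repo_id="gpt2", filename="config.json")
meta = get_hf_file_metadata(url, endpoint="https://huggingface.co")
# Fields of HfFileMetadata as constructed in this file.
print(meta.commit_hash, meta.etag, meta.size, meta.location)
```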
@@ -1417,9 +1519,6 @@ def get_hf_file_metadata(
             folder.
             - If `False` or `None`, no token is provided.
             - If a string, it's used as the authentication token.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         timeout (`float`, *optional*, defaults to 10):
             How many seconds to wait for the server to send metadata before giving up.
         library_name (`str`, *optional*):
@@ -1430,6 +1529,8 @@ def get_hf_file_metadata(
             The user-agent info in the form of a dictionary or a string.
         headers (`dict`, *optional*):
             Additional headers to be sent with the request.
+        endpoint (`str`, *optional*):
+            Endpoint of the Hub. Defaults to <https://huggingface.co>.

     Returns:
         A [`HfFileMetadata`] object containing metadata such as location, etag, size and
@@ -1445,31 +1546,23 @@ def get_hf_file_metadata(
     hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file

     # Retrieve metadata
-    r = _request_wrapper(
-        method="HEAD",
-        url=url,
-        headers=hf_headers,
-        allow_redirects=False,
-        follow_relative_redirects=True,
-        proxies=proxies,
-        timeout=timeout,
-    )
-    hf_raise_for_status(r)
+    response = _httpx_follow_relative_redirects(method="HEAD", url=url, headers=hf_headers, timeout=timeout)
+    hf_raise_for_status(response)

     # Return
     return HfFileMetadata(
-        commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
-        # We favor a custom header indicating the etag of the linked resource, and
-        # we fallback to the regular etag header.
-        etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
+        commit_hash=response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+        # We favor a custom header indicating the etag of the linked resource, and we fall back to the regular etag header.
+        etag=_normalize_etag(
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or response.headers.get("ETag")
+        ),
         # Either from response headers (if redirected) or defaults to request url
-        # Do not use directly `url`, as `_request_wrapper` might have followed relative
-        # redirects.
-        location=r.headers.get("Location") or r.request.url,  # type: ignore
+        # Do not use directly `url` as we might have followed relative redirects.
+        location=response.headers.get("Location") or str(response.request.url),  # type: ignore
         size=_int_or_none(
-            r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or response.headers.get("Content-Length")
         ),
-        xet_file_data=parse_xet_file_data_from_response(r),  # type: ignore
+        xet_file_data=parse_xet_file_data_from_response(response, endpoint=endpoint),  # type: ignore
     )
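The metadata HEAD request now goes through an httpx-based helper instead of `requests`, while the metadata is still read from the same response headers. A rough, self-contained sketch of that header-to-field mapping using a plain `httpx.head` call (the literal header names are what the `constants.*` values above refer to; the private `_httpx_follow_relative_redirects` helper is approximated with `follow_redirects=True`):

```python
import httpx

def head_metadata(url: str) -> dict:
    # "Accept-Encoding: identity" mirrors the hunk above: no compression, real file size.
    response = httpx.head(url, headers={"Accept-Encoding": "identity"}, follow_redirects=True)
    response.raise_for_status()
    return {
        "commit_hash": response.headers.get("X-Repo-Commit"),
        "etag": response.headers.get("X-Linked-Etag") or response.headers.get("ETag"),
        "size": response.headers.get("X-Linked-Size") or response.headers.get("Content-Length"),
        "location": str(response.request.url),  # final URL after redirects
    }
```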
@@ -1480,19 +1573,18 @@ def _get_metadata_or_catch_error(
     repo_type: str,
     revision: str,
     endpoint: Optional[str],
-    proxies: Optional[Dict],
     etag_timeout: Optional[float],
-    headers: Dict[str, str],  # mutated inplace!
+    headers: dict[str, str],  # mutated inplace!
     token: Union[bool, str, None],
     local_files_only: bool,
     relative_filename: Optional[str] = None,  # only used to store `.no_exists` in cache
     storage_folder: Optional[str] = None,  # only used to store `.no_exists` in cache
 ) -> Union[
     # Either an exception is caught and returned
-    Tuple[None, None, None, None, None, Exception],
+    tuple[None, None, None, None, None, Exception],
     # Or the metadata is returned as
     # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
-    Tuple[str, str, str, int, Optional[XetFileData], None],
+    tuple[str, str, str, int, Optional[XetFileData], None],
 ]:
     """Get metadata for a file on the Hub, safely handling network issues.

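`_get_metadata_or_catch_error` keeps its "metadata or exception" contract but drops `proxies` and moves to built-in generic return types. A small sketch of consuming the documented 6-tuple (the helper is private, so this is illustrative only, not a supported API):

```python
def describe_metadata_result(result: tuple) -> str:
    # Tuple shape taken from the comment in the hunk above:
    # (url_to_download, etag, commit_hash, expected_size, xet_file_data, error)
    url_to_download, etag, commit_hash, expected_size, xet_file_data, error = result
    if error is not None:
        return f"metadata fetch failed: {error!r}"
    return f"{url_to_download} @ {commit_hash} ({expected_size} bytes, etag={etag})"
```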
@@ -1529,9 +1621,9 @@ def _get_metadata_or_catch_error(
     try:
         try:
             metadata = get_hf_file_metadata(
-                url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token
+                url=url, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
             )
-        except EntryNotFoundError as http_error:
+        except RemoteEntryNotFoundError as http_error:
             if storage_folder is not None and relative_filename is not None:
                 # Cache the non-existence of the file
                 commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
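The metadata call now forwards `endpoint`, and the "file not found on the Hub" case is caught as `RemoteEntryNotFoundError` so the non-existence can still be cached locally. A sketch of catching the renamed error from user code, assuming it is importable from `huggingface_hub.errors` and is what the public download functions raise for missing files in this release (both assumptions inferred from this diff):

```python
from huggingface_hub import hf_hub_download
from huggingface_hub.errors import RemoteEntryNotFoundError  # assumed import path

try:
    hf_hub_download(repo_id="gpt2", filename="this-file-does-not-exist.bin")
except RemoteEntryNotFoundError as err:
    print("file is missing on the Hub:", err)
```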
@@ -1582,21 +1674,17 @@ def _get_metadata_or_catch_error(
             if urlparse(url).netloc != urlparse(metadata.location).netloc:
                 # Remove authorization header when downloading a LFS blob
                 headers.pop("authorization", None)
-    except requests.exceptions.ProxyError:
-        # Actually raise for those subclasses of ConnectionError
+    except httpx.ProxyError:
+        # Actually raise on proxy error
         raise
-    except (
-        requests.exceptions.ConnectionError,
-        requests.exceptions.Timeout,
-        OfflineModeIsEnabled,
-    ) as error:
+    except (httpx.ConnectError, httpx.TimeoutException, OfflineModeIsEnabled) as error:
         # Otherwise, our Internet connection is down.
         # etag is None
         head_error_call = error
-    except (RevisionNotFoundError, EntryNotFoundError):
+    except (RevisionNotFoundError, RemoteEntryNotFoundError):
         # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
         raise
-    except requests.HTTPError as error:
+    except HfHubHTTPError as error:
         # Multiple reasons for an http error:
         # - Repository is private and invalid/missing token sent
         # - Repository is gated and invalid/missing token sent
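Network-level failures are now recognized via httpx exception classes instead of their `requests` counterparts, while Hub-side HTTP errors surface as `HfHubHTTPError`. A minimal sketch of the rough correspondence implied by this hunk (not an exhaustive mapping):

```python
import httpx

def is_connectivity_error(exc: Exception) -> bool:
    """True for errors that mean "the network is down", mirroring the hunk above."""
    # httpx.ConnectError     ~ requests.exceptions.ConnectionError
    # httpx.TimeoutException ~ requests.exceptions.Timeout
    return isinstance(exc, (httpx.ConnectError, httpx.TimeoutException))
```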
@@ -1654,20 +1742,20 @@ def _download_to_tmp_and_move(
     incomplete_path: Path,
     destination_path: Path,
     url_to_download: str,
-    proxies: Optional[Dict],
-    headers: Dict[str, str],
+    headers: dict[str, str],
     expected_size: Optional[int],
     filename: str,
     force_download: bool,
     etag: Optional[str],
     xet_file_data: Optional[XetFileData],
+    tqdm_class: Optional[type[base_tqdm]] = None,
 ) -> None:
     """Download content from a URL to a destination path.

     Internal logic:
     - return early if file is already downloaded
     - resume download if possible (from incomplete file)
-    - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
+    - do not resume download if `force_download=True`
     - check disk space before downloading
     - download content to a temporary file
     - set correct permissions on temporary file
@@ -1679,16 +1767,11 @@ def _download_to_tmp_and_move(
         # Do nothing if already exists (except if force_download=True)
         return

-    if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+    if incomplete_path.exists() and force_download:
         # By default, we will try to resume the download if possible.
-        # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+        # However, if the user has set `force_download=True`, then we should
         # not resume the download => delete the incomplete file.
-        message = f"Removing incomplete file '{incomplete_path}'"
-        if force_download:
-            message += " (force_download=True)"
-        elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
-            message += " (hf_transfer=True)"
-        logger.info(message)
+        logger.debug(f"Removing incomplete file '{incomplete_path}' (force_download=True)")
         incomplete_path.unlink(missing_ok=True)

     with incomplete_path.open("ab") as f:
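The download itself still appends to a `*.incomplete` file so that an interrupted transfer can be resumed. A generic sketch of that resume pattern (append to the partial file, request the remaining bytes with a `Range` header); it mirrors the surrounding context but is not the library's `http_get` implementation:

```python
from pathlib import Path

import httpx

def resume_download(url: str, incomplete_path: Path) -> None:
    with incomplete_path.open("ab") as f:
        resume_size = f.tell()  # bytes already on disk
        headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {}
        with httpx.stream("GET", url, headers=headers, follow_redirects=True) as response:
            response.raise_for_status()
            for chunk in response.iter_bytes():
                f.write(chunk)
```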
@@ -1696,7 +1779,7 @@ def _download_to_tmp_and_move(
         message = f"Downloading '{filename}' to '{incomplete_path}'"
         if resume_size > 0 and expected_size is not None:
             message += f" (resume from {resume_size}/{expected_size})"
-        logger.info(message)
+        logger.debug(message)

         if expected_size is not None:  # might be None if HTTP header not set correctly
             # Check disk space in both tmp and destination path
@@ -1704,16 +1787,17 @@ def _download_to_tmp_and_move(
             _check_disk_space(expected_size, destination_path.parent)

         if xet_file_data is not None and is_xet_available():
-            logger.info("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
+            logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
             xet_get(
                 incomplete_path=incomplete_path,
                 xet_file_data=xet_file_data,
                 headers=headers,
                 expected_size=expected_size,
                 displayed_filename=filename,
+                tqdm_class=tqdm_class,
             )
         else:
-            if xet_file_data is not None:
+            if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
                 logger.warning(
                     "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
                     "Falling back to regular HTTP download. "
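The "hf_xet is not installed" warning is now skipped when Xet has been explicitly disabled via `constants.HF_HUB_DISABLE_XET`. A sketch of opting out of Xet entirely and forcing the plain-HTTP path, assuming the flag is driven by the `HF_HUB_DISABLE_XET` environment variable:

```python
import os

# Assumed environment variable behind constants.HF_HUB_DISABLE_XET;
# it must be set before huggingface_hub is imported.
os.environ["HF_HUB_DISABLE_XET"] = "1"

from huggingface_hub import hf_hub_download

path = hf_hub_download(repo_id="gpt2", filename="config.json")  # regular HTTP download
print(path)
```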
@@ -1723,13 +1807,13 @@ def _download_to_tmp_and_move(
             http_get(
                 url_to_download,
                 f,
-                proxies=proxies,
                 resume_size=resume_size,
                 headers=headers,
                 expected_size=expected_size,
+                tqdm_class=tqdm_class,
             )

-    logger.info(f"Download complete. Moving file to {destination_path}")
+    logger.debug(f"Download complete. Moving file to {destination_path}")
     _chmod_and_move(incomplete_path, destination_path)
