huggingface-hub 0.29.0rc2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. huggingface_hub/__init__.py +160 -46
  2. huggingface_hub/_commit_api.py +277 -71
  3. huggingface_hub/_commit_scheduler.py +15 -15
  4. huggingface_hub/_inference_endpoints.py +33 -22
  5. huggingface_hub/_jobs_api.py +301 -0
  6. huggingface_hub/_local_folder.py +18 -3
  7. huggingface_hub/_login.py +31 -63
  8. huggingface_hub/_oauth.py +460 -0
  9. huggingface_hub/_snapshot_download.py +241 -81
  10. huggingface_hub/_space_api.py +18 -10
  11. huggingface_hub/_tensorboard_logger.py +15 -19
  12. huggingface_hub/_upload_large_folder.py +196 -76
  13. huggingface_hub/_webhooks_payload.py +3 -3
  14. huggingface_hub/_webhooks_server.py +15 -25
  15. huggingface_hub/{commands → cli}/__init__.py +1 -15
  16. huggingface_hub/cli/_cli_utils.py +173 -0
  17. huggingface_hub/cli/auth.py +147 -0
  18. huggingface_hub/cli/cache.py +841 -0
  19. huggingface_hub/cli/download.py +189 -0
  20. huggingface_hub/cli/hf.py +60 -0
  21. huggingface_hub/cli/inference_endpoints.py +377 -0
  22. huggingface_hub/cli/jobs.py +772 -0
  23. huggingface_hub/cli/lfs.py +175 -0
  24. huggingface_hub/cli/repo.py +315 -0
  25. huggingface_hub/cli/repo_files.py +94 -0
  26. huggingface_hub/{commands/env.py → cli/system.py} +10 -13
  27. huggingface_hub/cli/upload.py +294 -0
  28. huggingface_hub/cli/upload_large_folder.py +117 -0
  29. huggingface_hub/community.py +20 -12
  30. huggingface_hub/constants.py +83 -59
  31. huggingface_hub/dataclasses.py +609 -0
  32. huggingface_hub/errors.py +99 -30
  33. huggingface_hub/fastai_utils.py +30 -41
  34. huggingface_hub/file_download.py +606 -346
  35. huggingface_hub/hf_api.py +2445 -1132
  36. huggingface_hub/hf_file_system.py +269 -152
  37. huggingface_hub/hub_mixin.py +61 -66
  38. huggingface_hub/inference/_client.py +501 -630
  39. huggingface_hub/inference/_common.py +133 -121
  40. huggingface_hub/inference/_generated/_async_client.py +536 -722
  41. huggingface_hub/inference/_generated/types/__init__.py +6 -1
  42. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +5 -6
  43. huggingface_hub/inference/_generated/types/base.py +10 -7
  44. huggingface_hub/inference/_generated/types/chat_completion.py +77 -31
  45. huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
  46. huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
  47. huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
  48. huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
  49. huggingface_hub/inference/_generated/types/image_to_image.py +8 -2
  50. huggingface_hub/inference/_generated/types/image_to_text.py +2 -3
  51. huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
  52. huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
  53. huggingface_hub/inference/_generated/types/summarization.py +2 -2
  54. huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
  55. huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
  56. huggingface_hub/inference/_generated/types/text_generation.py +11 -11
  57. huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
  58. huggingface_hub/inference/_generated/types/text_to_speech.py +1 -2
  59. huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
  60. huggingface_hub/inference/_generated/types/token_classification.py +2 -2
  61. huggingface_hub/inference/_generated/types/translation.py +2 -2
  62. huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
  63. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
  64. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
  65. huggingface_hub/inference/_mcp/__init__.py +0 -0
  66. huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
  67. huggingface_hub/inference/_mcp/agent.py +100 -0
  68. huggingface_hub/inference/_mcp/cli.py +247 -0
  69. huggingface_hub/inference/_mcp/constants.py +81 -0
  70. huggingface_hub/inference/_mcp/mcp_client.py +395 -0
  71. huggingface_hub/inference/_mcp/types.py +45 -0
  72. huggingface_hub/inference/_mcp/utils.py +128 -0
  73. huggingface_hub/inference/_providers/__init__.py +149 -20
  74. huggingface_hub/inference/_providers/_common.py +160 -37
  75. huggingface_hub/inference/_providers/black_forest_labs.py +12 -9
  76. huggingface_hub/inference/_providers/cerebras.py +6 -0
  77. huggingface_hub/inference/_providers/clarifai.py +13 -0
  78. huggingface_hub/inference/_providers/cohere.py +32 -0
  79. huggingface_hub/inference/_providers/fal_ai.py +231 -22
  80. huggingface_hub/inference/_providers/featherless_ai.py +38 -0
  81. huggingface_hub/inference/_providers/fireworks_ai.py +22 -1
  82. huggingface_hub/inference/_providers/groq.py +9 -0
  83. huggingface_hub/inference/_providers/hf_inference.py +143 -33
  84. huggingface_hub/inference/_providers/hyperbolic.py +9 -5
  85. huggingface_hub/inference/_providers/nebius.py +47 -5
  86. huggingface_hub/inference/_providers/novita.py +48 -5
  87. huggingface_hub/inference/_providers/nscale.py +44 -0
  88. huggingface_hub/inference/_providers/openai.py +25 -0
  89. huggingface_hub/inference/_providers/publicai.py +6 -0
  90. huggingface_hub/inference/_providers/replicate.py +46 -9
  91. huggingface_hub/inference/_providers/sambanova.py +37 -1
  92. huggingface_hub/inference/_providers/scaleway.py +28 -0
  93. huggingface_hub/inference/_providers/together.py +34 -5
  94. huggingface_hub/inference/_providers/wavespeed.py +138 -0
  95. huggingface_hub/inference/_providers/zai_org.py +17 -0
  96. huggingface_hub/lfs.py +33 -100
  97. huggingface_hub/repocard.py +34 -38
  98. huggingface_hub/repocard_data.py +79 -59
  99. huggingface_hub/serialization/__init__.py +0 -1
  100. huggingface_hub/serialization/_base.py +12 -15
  101. huggingface_hub/serialization/_dduf.py +8 -8
  102. huggingface_hub/serialization/_torch.py +69 -69
  103. huggingface_hub/utils/__init__.py +27 -8
  104. huggingface_hub/utils/_auth.py +7 -7
  105. huggingface_hub/utils/_cache_manager.py +92 -147
  106. huggingface_hub/utils/_chunk_utils.py +2 -3
  107. huggingface_hub/utils/_deprecation.py +1 -1
  108. huggingface_hub/utils/_dotenv.py +55 -0
  109. huggingface_hub/utils/_experimental.py +7 -5
  110. huggingface_hub/utils/_fixes.py +0 -10
  111. huggingface_hub/utils/_git_credential.py +5 -5
  112. huggingface_hub/utils/_headers.py +8 -30
  113. huggingface_hub/utils/_http.py +399 -237
  114. huggingface_hub/utils/_pagination.py +6 -6
  115. huggingface_hub/utils/_parsing.py +98 -0
  116. huggingface_hub/utils/_paths.py +5 -5
  117. huggingface_hub/utils/_runtime.py +74 -22
  118. huggingface_hub/utils/_safetensors.py +21 -21
  119. huggingface_hub/utils/_subprocess.py +13 -11
  120. huggingface_hub/utils/_telemetry.py +4 -4
  121. huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
  122. huggingface_hub/utils/_typing.py +25 -5
  123. huggingface_hub/utils/_validators.py +55 -74
  124. huggingface_hub/utils/_verification.py +167 -0
  125. huggingface_hub/utils/_xet.py +235 -0
  126. huggingface_hub/utils/_xet_progress_reporting.py +162 -0
  127. huggingface_hub/utils/insecure_hashlib.py +3 -5
  128. huggingface_hub/utils/logging.py +8 -11
  129. huggingface_hub/utils/tqdm.py +33 -4
  130. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -82
  131. huggingface_hub-1.1.3.dist-info/RECORD +155 -0
  132. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
  133. huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
  134. huggingface_hub/commands/delete_cache.py +0 -428
  135. huggingface_hub/commands/download.py +0 -200
  136. huggingface_hub/commands/huggingface_cli.py +0 -61
  137. huggingface_hub/commands/lfs.py +0 -200
  138. huggingface_hub/commands/repo_files.py +0 -128
  139. huggingface_hub/commands/scan_cache.py +0 -181
  140. huggingface_hub/commands/tag.py +0 -159
  141. huggingface_hub/commands/upload.py +0 -299
  142. huggingface_hub/commands/upload_large_folder.py +0 -129
  143. huggingface_hub/commands/user.py +0 -304
  144. huggingface_hub/commands/version.py +0 -37
  145. huggingface_hub/inference_api.py +0 -217
  146. huggingface_hub/keras_mixin.py +0 -500
  147. huggingface_hub/repository.py +0 -1477
  148. huggingface_hub/serialization/_tensorflow.py +0 -95
  149. huggingface_hub/utils/_hf_folder.py +0 -68
  150. huggingface_hub-0.29.0rc2.dist-info/RECORD +0 -131
  151. huggingface_hub-0.29.0rc2.dist-info/entry_points.txt +0 -6
  152. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
  153. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
@@ -1,7 +1,5 @@
- import contextlib
  import copy
  import errno
- import inspect
  import os
  import re
  import shutil
@@ -11,26 +9,20 @@ import uuid
  import warnings
  from dataclasses import dataclass
  from pathlib import Path
- from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
+ from typing import Any, BinaryIO, Literal, NoReturn, Optional, Union, overload
  from urllib.parse import quote, urlparse

- import requests
+ import httpx
+ from tqdm.auto import tqdm as base_tqdm

- from . import (
- __version__, # noqa: F401 # for backward compatibility
- constants,
- )
+ from . import constants
  from ._local_folder import get_local_download_paths, read_download_metadata, write_download_metadata
- from .constants import (
- HUGGINGFACE_CO_URL_TEMPLATE, # noqa: F401 # for backward compatibility
- HUGGINGFACE_HUB_CACHE, # noqa: F401 # for backward compatibility
- )
  from .errors import (
- EntryNotFoundError,
  FileMetadataError,
  GatedRepoError,
  HfHubHTTPError,
  LocalEntryNotFoundError,
+ RemoteEntryNotFoundError,
  RepositoryNotFoundError,
  RevisionNotFoundError,
  )
@@ -38,33 +30,20 @@ from .utils (
  OfflineModeIsEnabled,
  SoftTemporaryDirectory,
  WeakFileLock,
+ XetFileData,
  build_hf_headers,
- get_fastai_version, # noqa: F401 # for backward compatibility
- get_fastcore_version, # noqa: F401 # for backward compatibility
- get_graphviz_version, # noqa: F401 # for backward compatibility
- get_jinja_version, # noqa: F401 # for backward compatibility
- get_pydot_version, # noqa: F401 # for backward compatibility
- get_session,
- get_tf_version, # noqa: F401 # for backward compatibility
- get_torch_version, # noqa: F401 # for backward compatibility
  hf_raise_for_status,
- is_fastai_available, # noqa: F401 # for backward compatibility
- is_fastcore_available, # noqa: F401 # for backward compatibility
- is_graphviz_available, # noqa: F401 # for backward compatibility
- is_jinja_available, # noqa: F401 # for backward compatibility
- is_pydot_available, # noqa: F401 # for backward compatibility
- is_tf_available, # noqa: F401 # for backward compatibility
- is_torch_available, # noqa: F401 # for backward compatibility
  logging,
- reset_sessions,
+ parse_xet_file_data_from_response,
+ refresh_xet_connection_info,
  tqdm,
  validate_hf_hub_args,
  )
- from .utils._http import _adjust_range_header
- from .utils._runtime import _PY_VERSION # noqa: F401 # for backward compatibility
+ from .utils._http import _adjust_range_header, http_backoff, http_stream_backoff
+ from .utils._runtime import is_xet_available
  from .utils._typing import HTTP_METHOD_T
  from .utils.sha import sha_fileobj
- from .utils.tqdm import is_tqdm_disabled
+ from .utils.tqdm import _get_progress_bar_context


  logger = logging.get_logger(__name__)
@@ -82,7 +61,7 @@ REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
  # Regex to check if the file etag IS a valid sha256
  REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")

- _are_symlinks_supported_in_dir: Dict[str, bool] = {}
+ _are_symlinks_supported_in_dir: dict[str, bool] = {}


  def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
@@ -160,12 +139,43 @@ class HfFileMetadata:
  size (`size`):
  Size of the file. In case of an LFS file, contains the size of the actual
  LFS file, not the pointer.
+ xet_file_data (`XetFileData`, *optional*):
+ Xet information for the file. This is only set if the file is stored using Xet storage.
  """

  commit_hash: Optional[str]
  etag: Optional[str]
  location: str
  size: Optional[int]
+ xet_file_data: Optional[XetFileData]
+
+
+ @dataclass
+ class DryRunFileInfo:
+ """Information returned when performing a dry run of a file download.
+
+ Returned by [`hf_hub_download`] when `dry_run=True`.
+
+ Args:
+ commit_hash (`str`):
+ The commit_hash related to the file.
+ file_size (`int`):
+ Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
+ filename (`str`):
+ Name of the file in the repo.
+ is_cached (`bool`):
+ Whether the file is already cached locally.
+ will_download (`bool`):
+ Whether the file will be downloaded if `hf_hub_download` is called with `dry_run=False`.
+ In practice, will_download is `True` if the file is not cached or if `force_download=True`.
+ """
+
+ commit_hash: str
+ file_size: int
+ filename: str
+ local_path: str
+ is_cached: bool
+ will_download: bool


  @validate_hf_hub_args
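
Note: the hunk above introduces `DryRunFileInfo` and the new `dry_run` mode of `hf_hub_download`. A minimal usage sketch, assuming a public repo and filename chosen purely for illustration (the same example repo used in the `hf_hub_url` docstring):

```python
from huggingface_hub import hf_hub_download

# Illustrative repo/filename only; any public file works the same way.
info = hf_hub_download("julien-c/EsperBERTo-small", "pytorch_model.bin", dry_run=True)
# With dry_run=True a DryRunFileInfo is returned instead of a local path.
print(info.filename, info.file_size, info.is_cached, info.will_download)

if info.will_download:
    # A second call without dry_run performs the actual download and returns the path.
    local_path = hf_hub_download("julien-c/EsperBERTo-small", "pytorch_model.bin")
```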
@@ -210,26 +220,23 @@ def hf_hub_url(
  'https://huggingface.co/julien-c/EsperBERTo-small/resolve/main/pytorch_model.bin'
  ```

- <Tip>
-
- Notes:
-
- Cloudfront is replicated over the globe so downloads are way faster for
- the end user (and it also lowers our bandwidth costs).
-
- Cloudfront aggressively caches files by default (default TTL is 24
- hours), however this is not an issue here because we implement a
- git-based versioning system on huggingface.co, which means that we store
- the files on S3/Cloudfront in a content-addressable way (i.e., the file
- name is its hash). Using content-addressable filenames means cache can't
- ever be stale.
-
- In terms of client-side caching from this library, we base our caching
- on the objects' entity tag (`ETag`), which is an identifier of a
- specific version of a resource [1]_. An object's ETag is: its git-sha1
- if stored in git, or its sha256 if stored in git-lfs.
-
- </Tip>
+ > [!TIP]
+ > Notes:
+ >
+ > Cloudfront is replicated over the globe so downloads are way faster for
+ > the end user (and it also lowers our bandwidth costs).
+ >
+ > Cloudfront aggressively caches files by default (default TTL is 24
+ > hours), however this is not an issue here because we implement a
+ > git-based versioning system on huggingface.co, which means that we store
+ > the files on S3/Cloudfront in a content-addressable way (i.e., the file
+ > name is its hash). Using content-addressable filenames means cache can't
+ > ever be stale.
+ >
+ > In terms of client-side caching from this library, we base our caching
+ > on the objects' entity tag (`ETag`), which is an identifier of a
+ > specific version of a resource [1]_. An object's ETag is: its git-sha1
+ > if stored in git, or its sha256 if stored in git-lfs.

  References:

@@ -248,7 +255,7 @@ def hf_hub_url(

  if revision is None:
  revision = constants.DEFAULT_REVISION
- url = HUGGINGFACE_CO_URL_TEMPLATE.format(
+ url = constants.HUGGINGFACE_CO_URL_TEMPLATE.format(
  repo_id=repo_id, revision=quote(revision, safe=""), filename=quote(filename)
  )
  # Update endpoint if provided
@@ -257,63 +264,92 @@
  return url


- def _request_wrapper(
- method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
- ) -> requests.Response:
- """Wrapper around requests methods to follow relative redirects if `follow_relative_redirects=True` even when
- `allow_redirection=False`.
+ def _httpx_follow_relative_redirects(method: HTTP_METHOD_T, url: str, **httpx_kwargs) -> httpx.Response:
+ """Perform an HTTP request with backoff and follow relative redirects only.
+
+ This is useful to follow a redirection to a renamed repository without following redirection to a CDN.
+
+ A backoff mechanism retries the HTTP call on 5xx errors and network errors.

  Args:
  method (`str`):
  HTTP method, such as 'GET' or 'HEAD'.
  url (`str`):
  The URL of the resource to fetch.
- follow_relative_redirects (`bool`, *optional*, defaults to `False`)
- If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection`
- kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
- following redirection to a CDN.
- **params (`dict`, *optional*):
- Params to pass to `requests.request`.
+ **httpx_kwargs (`dict`, *optional*):
+ Params to pass to `httpx.request`.
  """
- # Recursively follow relative redirects
- if follow_relative_redirects:
- response = _request_wrapper(
+ while True:
+ # Make the request
+ response = http_backoff(
  method=method,
  url=url,
- follow_relative_redirects=False,
- **params,
+ **httpx_kwargs,
+ follow_redirects=False,
+ retry_on_exceptions=(),
+ retry_on_status_codes=(429,),
  )
+ hf_raise_for_status(response)

- # If redirection, we redirect only relative paths.
- # This is useful in case of a renamed repository.
+ # Check if response is a relative redirect
  if 300 <= response.status_code <= 399:
  parsed_target = urlparse(response.headers["Location"])
  if parsed_target.netloc == "":
- # This means it is a relative 'location' headers, as allowed by RFC 7231.
- # (e.g. '/path/to/resource' instead of 'http://domain.tld/path/to/resource')
- # We want to follow this relative redirect !
- #
- # Highly inspired by `resolve_redirects` from requests library.
- # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
- next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
- return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
- return response
-
- # Perform request and return if status_code is not in the retry list.
- response = get_session().request(method=method, url=url, **params)
- hf_raise_for_status(response)
+ # Relative redirect -> update URL and retry
+ url = urlparse(url)._replace(path=parsed_target.path).geturl()
+ continue
+
+ # Break if no relative redirect
+ break
+
  return response


+ def _get_file_length_from_http_response(response: httpx.Response) -> Optional[int]:
+ """
+ Get the length of the file from the HTTP response headers.
+
+ This function extracts the file size from the HTTP response headers, either from the
+ `Content-Range` or `Content-Length` header, if available (in that order).
+
+ Args:
+ response (`httpx.Response`):
+ The HTTP response object.
+
+ Returns:
+ `int` or `None`: The length of the file in bytes, or None if not available.
+ """
+
+ # If HTTP response contains compressed body (e.g. gzip), the `Content-Length` header will
+ # contain the length of the compressed body, not the uncompressed file size.
+ # And at the start of transmission there's no way to know the uncompressed file size for gzip,
+ # thus we return None in that case.
+ content_encoding = response.headers.get("Content-Encoding", "identity").lower()
+ if content_encoding != "identity":
+ # gzip/br/deflate/zstd etc
+ return None
+
+ content_range = response.headers.get("Content-Range")
+ if content_range is not None:
+ return int(content_range.rsplit("/")[-1])
+
+ content_length = response.headers.get("Content-Length")
+ if content_length is not None:
+ return int(content_length)
+
+ return None
+
+
+ @validate_hf_hub_args
  def http_get(
  url: str,
  temp_file: BinaryIO,
  *,
- proxies: Optional[Dict] = None,
  resume_size: int = 0,
- headers: Optional[Dict[str, Any]] = None,
+ headers: Optional[dict[str, Any]] = None,
  expected_size: Optional[int] = None,
  displayed_filename: Optional[str] = None,
+ tqdm_class: Optional[type[base_tqdm]] = None,
  _nb_retries: int = 5,
  _tqdm_bar: Optional[tqdm] = None,
  ) -> None:
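
Note: `_get_file_length_from_http_response` above prefers `Content-Range` over `Content-Length` and returns `None` for compressed bodies. A quick illustration of that behavior using in-memory `httpx` responses; the helper is private and the header values below are made up:

```python
import httpx

from huggingface_hub.file_download import _get_file_length_from_http_response

full = httpx.Response(200, headers={"Content-Length": "1024"})
ranged = httpx.Response(206, headers={"Content-Range": "bytes 100-1023/1024"})
gzipped = httpx.Response(200, headers={"Content-Encoding": "gzip", "Content-Length": "512"})

assert _get_file_length_from_http_response(full) == 1024
assert _get_file_length_from_http_response(ranged) == 1024   # size after the "/" wins
assert _get_file_length_from_http_response(gzipped) is None  # compressed body -> unknown size
```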
@@ -329,8 +365,6 @@ def http_get(
  The URL of the file to download.
  temp_file (`BinaryIO`):
  The file-like object where to save the file.
- proxies (`dict`, *optional*):
- Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
  resume_size (`int`, *optional*):
  The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
  positive number, the download will resume at the given position.
@@ -347,137 +381,85 @@
  # If the file is already fully downloaded, we don't need to download it again.
  return

- hf_transfer = None
- if constants.HF_HUB_ENABLE_HF_TRANSFER:
- if resume_size != 0:
- warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
- elif proxies is not None:
- warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
- else:
- try:
- import hf_transfer # type: ignore[no-redef]
- except ImportError:
- raise ValueError(
- "Fast download using 'hf_transfer' is enabled"
- " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
- " available in your environment. Try `pip install hf_transfer`."
- )
-
  initial_headers = headers
  headers = copy.deepcopy(headers) or {}
  if resume_size > 0:
  headers["Range"] = _adjust_range_header(headers.get("Range"), resume_size)
+ elif expected_size and expected_size > constants.MAX_HTTP_DOWNLOAD_SIZE:
+ # Any files over 50GB will not be available through basic http requests.
+ raise ValueError(
+ "The file is too large to be downloaded using the regular download method. "
+ " Install `hf_xet` with `pip install hf_xet` for xet-powered downloads."
+ )

- r = _request_wrapper(
- method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT
- )
- hf_raise_for_status(r)
- content_length = r.headers.get("Content-Length")
-
- # NOTE: 'total' is the total number of bytes to download, not the number of bytes in the file.
- # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
- total = resume_size + int(content_length) if content_length is not None else None
-
- if displayed_filename is None:
- displayed_filename = url
- content_disposition = r.headers.get("Content-Disposition")
- if content_disposition is not None:
- match = HEADER_FILENAME_PATTERN.search(content_disposition)
- if match is not None:
- # Means file is on CDN
- displayed_filename = match.groupdict()["filename"]
-
- # Truncate filename if too long to display
- if len(displayed_filename) > 40:
- displayed_filename = f"(…){displayed_filename[-40:]}"
-
- consistency_error_message = (
- f"Consistency check failed: file should be of size {expected_size} but has size"
- f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
- " Please retry with `force_download=True`."
- )
-
- # Stream file to buffer
- progress_cm: tqdm = (
- tqdm( # type: ignore[assignment]
- unit="B",
- unit_scale=True,
+ with http_stream_backoff(
+ method="GET",
+ url=url,
+ headers=headers,
+ timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
+ retry_on_exceptions=(),
+ retry_on_status_codes=(429,),
+ ) as response:
+ hf_raise_for_status(response)
+ total: Optional[int] = _get_file_length_from_http_response(response)
+
+ if displayed_filename is None:
+ displayed_filename = url
+ content_disposition = response.headers.get("Content-Disposition")
+ if content_disposition is not None:
+ match = HEADER_FILENAME_PATTERN.search(content_disposition)
+ if match is not None:
+ # Means file is on CDN
+ displayed_filename = match.groupdict()["filename"]
+
+ # Truncate filename if too long to display
+ if len(displayed_filename) > 40:
+ displayed_filename = f"(…){displayed_filename[-40:]}"
+
+ consistency_error_message = (
+ f"Consistency check failed: file should be of size {expected_size} but has size"
+ f" {{actual_size}} ({displayed_filename}).\nThis is usually due to network issues while downloading the file."
+ " Please retry with `force_download=True`."
+ )
+ progress_cm = _get_progress_bar_context(
+ desc=displayed_filename,
+ log_level=logger.getEffectiveLevel(),
  total=total,
  initial=resume_size,
- desc=displayed_filename,
- disable=is_tqdm_disabled(logger.getEffectiveLevel()),
  name="huggingface_hub.http_get",
+ tqdm_class=tqdm_class,
+ _tqdm_bar=_tqdm_bar,
  )
- if _tqdm_bar is None
- else contextlib.nullcontext(_tqdm_bar)
- # ^ `contextlib.nullcontext` mimics a context manager that does nothing
- # Makes it easier to use the same code path for both cases but in the later
- # case, the progress bar is not closed when exiting the context manager.
- )

- with progress_cm as progress:
- if hf_transfer and total is not None and total > 5 * constants.DOWNLOAD_CHUNK_SIZE:
- supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
- if not supports_callback:
- warnings.warn(
- "You are using an outdated version of `hf_transfer`. "
- "Consider upgrading to latest version to enable progress bars "
- "using `pip install -U hf_transfer`."
- )
+ with progress_cm as progress:
+ new_resume_size = resume_size
  try:
- hf_transfer.download(
+ for chunk in response.iter_bytes(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
+ if chunk: # filter out keep-alive new chunks
+ progress.update(len(chunk))
+ temp_file.write(chunk)
+ new_resume_size += len(chunk)
+ # Some data has been downloaded from the server so we reset the number of retries.
+ _nb_retries = 5
+ except (httpx.ConnectError, httpx.TimeoutException) as e:
+ # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
+ # a transient error (network outage?). We log a warning message and try to resume the download a few times
+ # before giving up. Tre retry mechanism is basic but should be enough in most cases.
+ if _nb_retries <= 0:
+ logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+ raise
+ logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+ time.sleep(1)
+ return http_get(
  url=url,
- filename=temp_file.name,
- max_files=constants.HF_TRANSFER_CONCURRENCY,
- chunk_size=constants.DOWNLOAD_CHUNK_SIZE,
- headers=headers,
- parallel_failures=3,
- max_retries=5,
- **({"callback": progress.update} if supports_callback else {}),
+ temp_file=temp_file,
+ resume_size=new_resume_size,
+ headers=initial_headers,
+ expected_size=expected_size,
+ tqdm_class=tqdm_class,
+ _nb_retries=_nb_retries - 1,
+ _tqdm_bar=_tqdm_bar,
  )
- except Exception as e:
- raise RuntimeError(
- "An error occurred while downloading using `hf_transfer`. Consider"
- " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
- ) from e
- if not supports_callback:
- progress.update(total)
- if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
- raise EnvironmentError(
- consistency_error_message.format(
- actual_size=os.path.getsize(temp_file.name),
- )
- )
- return
- new_resume_size = resume_size
- try:
- for chunk in r.iter_content(chunk_size=constants.DOWNLOAD_CHUNK_SIZE):
- if chunk: # filter out keep-alive new chunks
- progress.update(len(chunk))
- temp_file.write(chunk)
- new_resume_size += len(chunk)
- # Some data has been downloaded from the server so we reset the number of retries.
- _nb_retries = 5
- except (requests.ConnectionError, requests.ReadTimeout) as e:
- # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely
- # a transient error (network outage?). We log a warning message and try to resume the download a few times
- # before giving up. Tre retry mechanism is basic but should be enough in most cases.
- if _nb_retries <= 0:
- logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
- raise
- logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
- time.sleep(1)
- reset_sessions() # In case of SSLError it's best to reset the shared requests.Session objects
- return http_get(
- url=url,
- temp_file=temp_file,
- proxies=proxies,
- resume_size=new_resume_size,
- headers=initial_headers,
- expected_size=expected_size,
- _nb_retries=_nb_retries - 1,
- _tqdm_bar=_tqdm_bar,
- )

  if expected_size is not None and expected_size != temp_file.tell():
  raise EnvironmentError(
@@ -487,6 +469,114 @@ def http_get(
  )


+ def xet_get(
+ *,
+ incomplete_path: Path,
+ xet_file_data: XetFileData,
+ headers: dict[str, str],
+ expected_size: Optional[int] = None,
+ displayed_filename: Optional[str] = None,
+ tqdm_class: Optional[type[base_tqdm]] = None,
+ _tqdm_bar: Optional[tqdm] = None,
+ ) -> None:
+ """
+ Download a file using Xet storage service.
+
+ Args:
+ incomplete_path (`Path`):
+ The path to the file to download.
+ xet_file_data (`XetFileData`):
+ The file metadata needed to make the request to the xet storage service.
+ headers (`dict[str, str]`):
+ The headers to send to the xet storage service.
+ expected_size (`int`, *optional*):
+ The expected size of the file to download. If set, the download will raise an error if the size of the
+ received content is different from the expected one.
+ displayed_filename (`str`, *optional*):
+ The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
+ not set, the filename is guessed from the URL or the `Content-Disposition` header.
+
+ **How it works:**
+ The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
+ for efficient storage and transfer.
+
+ `hf_xet.download_files` manages downloading files by:
+ - Taking a list of files to download (each with its unique content hash)
+ - Connecting to a storage server (CAS server) that knows how files are chunked
+ - Using authentication to ensure secure access
+ - Providing progress updates during download
+
+ Authentication works by regularly refreshing access tokens through `refresh_xet_connection_info` to maintain a valid
+ connection to the storage server.
+
+ The download process works like this:
+ 1. Create a local cache folder at `~/.cache/huggingface/xet/chunk-cache` to store reusable file chunks
+ 2. Download files in parallel:
+ 2.1. Prepare to write the file to disk
+ 2.2. Ask the server "how is this file split into chunks?" using the file's unique hash
+ The server responds with:
+ - Which chunks make up the complete file
+ - Where each chunk can be downloaded from
+ 2.3. For each needed chunk:
+ - Checks if we already have it in our local cache
+ - If not, download it from cloud storage (S3)
+ - Save it to cache for future use
+ - Assemble the chunks in order to recreate the original file
+
+ """
+ try:
+ from hf_xet import PyXetDownloadInfo, download_files # type: ignore[no-redef]
+ except ImportError:
+ raise ValueError(
+ "To use optimized download using Xet storage, you need to install the hf_xet package. "
+ 'Try `pip install "huggingface_hub[hf_xet]"` or `pip install hf_xet`.'
+ )
+
+ connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
+
+ def token_refresher() -> tuple[str, int]:
+ connection_info = refresh_xet_connection_info(file_data=xet_file_data, headers=headers)
+ if connection_info is None:
+ raise ValueError("Failed to refresh token using xet metadata.")
+ return connection_info.access_token, connection_info.expiration_unix_epoch
+
+ xet_download_info = [
+ PyXetDownloadInfo(
+ destination_path=str(incomplete_path.absolute()), hash=xet_file_data.file_hash, file_size=expected_size
+ )
+ ]
+
+ if not displayed_filename:
+ displayed_filename = incomplete_path.name
+
+ # Truncate filename if too long to display
+ if len(displayed_filename) > 40:
+ displayed_filename = f"{displayed_filename[:40]}(…)"
+
+ progress_cm = _get_progress_bar_context(
+ desc=displayed_filename,
+ log_level=logger.getEffectiveLevel(),
+ total=expected_size,
+ initial=0,
+ name="huggingface_hub.xet_get",
+ tqdm_class=tqdm_class,
+ _tqdm_bar=_tqdm_bar,
+ )
+
+ with progress_cm as progress:
+
+ def progress_updater(progress_bytes: float):
+ progress.update(progress_bytes)
+
+ download_files(
+ xet_download_info,
+ endpoint=connection_info.endpoint,
+ token_info=(connection_info.access_token, connection_info.expiration_unix_epoch),
+ token_refresher=token_refresher,
+ progress_updater=[progress_updater],
+ )
+
+
  def _normalize_etag(etag: Optional[str]) -> Optional[str]:
  """Normalize ETag HTTP header, so it can be used to create nice filepaths.

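Note: a rough sketch of how the new xet path relates to the plain HTTP path, based on the helpers added or referenced above (`get_hf_file_metadata`, `xet_get`, `http_get`, `is_xet_available`). `hf_hub_download` wires this up internally; calling these low-level (partly private) functions directly, and the repo/filename used, are for illustration only.

```python
from pathlib import Path

from huggingface_hub import get_hf_file_metadata, hf_hub_url
from huggingface_hub.file_download import http_get, xet_get
from huggingface_hub.utils import build_hf_headers
from huggingface_hub.utils._runtime import is_xet_available

# Hypothetical repo/file, for illustration only.
url = hf_hub_url("julien-c/EsperBERTo-small", "pytorch_model.bin")
headers = build_hf_headers()
metadata = get_hf_file_metadata(url, headers=headers)

incomplete = Path("pytorch_model.bin.incomplete")
if metadata.xet_file_data is not None and is_xet_available():
    # Chunk-based download through hf_xet, as described in the xet_get docstring.
    xet_get(
        incomplete_path=incomplete,
        xet_file_data=metadata.xet_file_data,
        headers=headers,
        expected_size=metadata.size,
    )
else:
    # Plain HTTP streaming fallback.
    with incomplete.open("wb") as f:
        http_get(metadata.location, f, headers=headers, expected_size=metadata.size)
```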
@@ -601,10 +691,10 @@ def _create_symlink(src: str, dst: str, new_blob: bool = False) -> None:

  # Symlinks are not supported => let's move or copy the file.
  if new_blob:
- logger.info(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
+ logger.debug(f"Symlink not supported. Moving file from {abs_src} to {abs_dst}")
  shutil.move(abs_src, abs_dst, copy_function=_copy_no_matter_what)
  else:
- logger.info(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
+ logger.debug(f"Symlink not supported. Copying file from {abs_src} to {abs_dst}")
  shutil.copyfile(abs_src, abs_dst)


@@ -660,6 +750,78 @@ def _check_disk_space(expected_size: int, target_dir: Union[str, Path]) -> None:
  pass


+ @overload
+ def hf_hub_download(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ user_agent: Union[dict, str, None] = None,
+ force_download: bool = False,
+ etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+ token: Union[bool, str, None] = None,
+ local_files_only: bool = False,
+ headers: Optional[dict[str, str]] = None,
+ endpoint: Optional[str] = None,
+ tqdm_class: Optional[type[base_tqdm]] = None,
+ dry_run: Literal[False] = False,
+ ) -> str: ...
+
+
+ @overload
+ def hf_hub_download(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ user_agent: Union[dict, str, None] = None,
+ force_download: bool = False,
+ etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+ token: Union[bool, str, None] = None,
+ local_files_only: bool = False,
+ headers: Optional[dict[str, str]] = None,
+ endpoint: Optional[str] = None,
+ tqdm_class: Optional[type[base_tqdm]] = None,
+ dry_run: Literal[True] = True,
+ ) -> DryRunFileInfo: ...
+
+
+ @overload
+ def hf_hub_download(
+ repo_id: str,
+ filename: str,
+ *,
+ subfolder: Optional[str] = None,
+ repo_type: Optional[str] = None,
+ revision: Optional[str] = None,
+ library_name: Optional[str] = None,
+ library_version: Optional[str] = None,
+ cache_dir: Union[str, Path, None] = None,
+ local_dir: Union[str, Path, None] = None,
+ user_agent: Union[dict, str, None] = None,
+ force_download: bool = False,
+ etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
+ token: Union[bool, str, None] = None,
+ local_files_only: bool = False,
+ headers: Optional[dict[str, str]] = None,
+ endpoint: Optional[str] = None,
+ tqdm_class: Optional[type[base_tqdm]] = None,
+ dry_run: bool = False,
+ ) -> Union[str, DryRunFileInfo]: ...
+
+
  @validate_hf_hub_args
  def hf_hub_download(
  repo_id: str,
@@ -672,18 +834,16 @@ def hf_hub_download(
  library_version: Optional[str] = None,
  cache_dir: Union[str, Path, None] = None,
  local_dir: Union[str, Path, None] = None,
- user_agent: Union[Dict, str, None] = None,
+ user_agent: Union[dict, str, None] = None,
  force_download: bool = False,
- proxies: Optional[Dict] = None,
  etag_timeout: float = constants.DEFAULT_ETAG_TIMEOUT,
  token: Union[bool, str, None] = None,
  local_files_only: bool = False,
- headers: Optional[Dict[str, str]] = None,
+ headers: Optional[dict[str, str]] = None,
  endpoint: Optional[str] = None,
- resume_download: Optional[bool] = None,
- force_filename: Optional[str] = None,
- local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
- ) -> str:
+ tqdm_class: Optional[type[base_tqdm]] = None,
+ dry_run: bool = False,
+ ) -> Union[str, DryRunFileInfo]:
  """Download a given file if it's not already present in the local cache.

  The new cache file layout looks like this:
@@ -745,9 +905,6 @@
  force_download (`bool`, *optional*, defaults to `False`):
  Whether the file should be downloaded even if it already exists in
  the local cache.
- proxies (`dict`, *optional*):
- Dictionary mapping protocol to the URL of the proxy passed to
- `requests.request`.
  etag_timeout (`float`, *optional*, defaults to `10`):
  When fetching ETag, how many seconds to wait for the server to send
  data before giving up which is passed to `requests.request`.
@@ -761,9 +918,19 @@
  local cached file if it exists.
  headers (`dict`, *optional*):
  Additional headers to be sent with the request.
+ tqdm_class (`tqdm`, *optional*):
+ If provided, overwrites the default behavior for the progress bar. Passed
+ argument must inherit from `tqdm.auto.tqdm` or at least mimic its behavior.
+ Defaults to the custom HF progress bar that can be disabled by setting
+ `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
+ dry_run (`bool`, *optional*, defaults to `False`):
+ If `True`, perform a dry run without actually downloading the file. Returns a
+ [`DryRunFileInfo`] object containing information about what would be downloaded.

  Returns:
- `str`: Local path of file or if networking is off, last version of file cached on disk.
+ `str` or [`DryRunFileInfo`]:
+ - If `dry_run=False`: Local path of file or if networking is off, last version of file cached on disk.
+ - If `dry_run=True`: A [`DryRunFileInfo`] object containing download information.

  Raises:
  [`~utils.RepositoryNotFoundError`]
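
Note: the `tqdm_class` parameter documented above lets callers swap in their own progress bar. A small sketch, assuming a subclass that simply silences output; the repo and filename are placeholders:

```python
from tqdm.auto import tqdm as base_tqdm

from huggingface_hub import hf_hub_download


class SilentTqdm(base_tqdm):
    """tqdm subclass that disables rendering; progress is still tracked internally."""

    def __init__(self, *args, **kwargs):
        kwargs["disable"] = True
        super().__init__(*args, **kwargs)


# Placeholder repo/filename, for illustration only.
path = hf_hub_download(
    "julien-c/EsperBERTo-small",
    "pytorch_model.bin",
    tqdm_class=SilentTqdm,
)
```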
@@ -771,7 +938,7 @@ def hf_hub_download(
771
938
  or because it is set to `private` and you do not have access.
772
939
  [`~utils.RevisionNotFoundError`]
773
940
  If the revision to download from cannot be found.
774
- [`~utils.EntryNotFoundError`]
941
+ [`~utils.RemoteEntryNotFoundError`]
775
942
  If the file to download cannot be found.
776
943
  [`~utils.LocalEntryNotFoundError`]
777
944
  If network is disabled or unavailable and file is not found in cache.
@@ -787,20 +954,6 @@ def hf_hub_download(
787
954
  # Respect environment variable above user value
788
955
  etag_timeout = constants.HF_HUB_ETAG_TIMEOUT
789
956
 
790
- if force_filename is not None:
791
- warnings.warn(
792
- "The `force_filename` parameter is deprecated as a new caching system, "
793
- "which keeps the filenames as they are on the Hub, is now in place.",
794
- FutureWarning,
795
- )
796
- if resume_download is not None:
797
- warnings.warn(
798
- "`resume_download` is deprecated and will be removed in version 1.0.0. "
799
- "Downloads always resume when possible. "
800
- "If you want to force a new download, use `force_download=True`.",
801
- FutureWarning,
802
- )
803
-
804
957
  if cache_dir is None:
805
958
  cache_dir = constants.HF_HUB_CACHE
806
959
  if revision is None:
@@ -830,15 +983,6 @@ def hf_hub_download(
830
983
  )
831
984
 
832
985
  if local_dir is not None:
833
- if local_dir_use_symlinks != "auto":
834
- warnings.warn(
835
- "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
836
- "The process to download files to a local folder has been updated and do "
837
- "not rely on symlinks anymore. You only need to pass a destination folder "
838
- "as`local_dir`.\n"
839
- "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
840
- )
841
-
842
986
  return _hf_hub_download_to_local_dir(
843
987
  # Destination
844
988
  local_dir=local_dir,
@@ -851,12 +995,13 @@ def hf_hub_download(
851
995
  endpoint=endpoint,
852
996
  etag_timeout=etag_timeout,
853
997
  headers=hf_headers,
854
- proxies=proxies,
855
998
  token=token,
856
999
  # Additional options
857
1000
  cache_dir=cache_dir,
858
1001
  force_download=force_download,
859
1002
  local_files_only=local_files_only,
1003
+ tqdm_class=tqdm_class,
1004
+ dry_run=dry_run,
860
1005
  )
861
1006
  else:
862
1007
  return _hf_hub_download_to_cache_dir(
@@ -871,11 +1016,12 @@ def hf_hub_download(
871
1016
  endpoint=endpoint,
872
1017
  etag_timeout=etag_timeout,
873
1018
  headers=hf_headers,
874
- proxies=proxies,
875
1019
  token=token,
876
1020
  # Additional options
877
1021
  local_files_only=local_files_only,
878
1022
  force_download=force_download,
1023
+ tqdm_class=tqdm_class,
1024
+ dry_run=dry_run,
879
1025
  )
880
1026
 
881
1027
 
@@ -891,13 +1037,14 @@ def _hf_hub_download_to_cache_dir(
891
1037
  # HTTP info
892
1038
  endpoint: Optional[str],
893
1039
  etag_timeout: float,
894
- headers: Dict[str, str],
895
- proxies: Optional[Dict],
1040
+ headers: dict[str, str],
896
1041
  token: Optional[Union[bool, str]],
897
1042
  # Additional options
898
1043
  local_files_only: bool,
899
1044
  force_download: bool,
900
- ) -> str:
1045
+ tqdm_class: Optional[type[base_tqdm]],
1046
+ dry_run: bool,
1047
+ ) -> Union[str, DryRunFileInfo]:
901
1048
  """Download a given file to a cache folder, if not already present.
902
1049
 
903
1050
  Method should not be called directly. Please use `hf_hub_download` instead.
@@ -905,7 +1052,7 @@ def _hf_hub_download_to_cache_dir(
905
1052
  locks_dir = os.path.join(cache_dir, ".locks")
906
1053
  storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
907
1054
 
908
- # cross platform transcription of filename, to be used as a local file path.
1055
+ # cross-platform transcription of filename, to be used as a local file path.
909
1056
  relative_filename = os.path.join(*filename.split("/"))
910
1057
  if os.name == "nt":
911
1058
  if relative_filename.startswith("..\\") or "\\..\\" in relative_filename:
@@ -917,18 +1064,27 @@ def _hf_hub_download_to_cache_dir(
917
1064
  # if user provides a commit_hash and they already have the file on disk, shortcut everything.
918
1065
  if REGEX_COMMIT_HASH.match(revision):
919
1066
  pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
920
- if os.path.exists(pointer_path) and not force_download:
921
- return pointer_path
1067
+ if os.path.exists(pointer_path):
1068
+ if dry_run:
1069
+ return DryRunFileInfo(
1070
+ commit_hash=revision,
1071
+ file_size=os.path.getsize(pointer_path),
1072
+ filename=filename,
1073
+ is_cached=True,
1074
+ local_path=pointer_path,
1075
+ will_download=force_download,
1076
+ )
1077
+ if not force_download:
1078
+ return pointer_path
922
1079
 
923
1080
  # Try to get metadata (etag, commit_hash, url, size) from the server.
924
1081
  # If we can't, a HEAD request error is returned.
925
- (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1082
+ (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
926
1083
  repo_id=repo_id,
927
1084
  filename=filename,
928
1085
  repo_type=repo_type,
929
1086
  revision=revision,
930
1087
  endpoint=endpoint,
931
- proxies=proxies,
932
1088
  etag_timeout=etag_timeout,
933
1089
  headers=headers,
934
1090
  token=token,
@@ -962,8 +1118,18 @@ def _hf_hub_download_to_cache_dir(
962
1118
  # Return pointer file if exists
963
1119
  if commit_hash is not None:
964
1120
  pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
965
- if os.path.exists(pointer_path) and not force_download:
966
- return pointer_path
1121
+ if os.path.exists(pointer_path):
1122
+ if dry_run:
1123
+ return DryRunFileInfo(
1124
+ commit_hash=commit_hash,
1125
+ file_size=os.path.getsize(pointer_path),
1126
+ filename=filename,
1127
+ is_cached=True,
1128
+ local_path=pointer_path,
1129
+ will_download=force_download,
1130
+ )
1131
+ if not force_download:
1132
+ return pointer_path
967
1133
 
968
1134
  # Otherwise, raise appropriate error
969
1135
  _raise_on_head_call_error(head_call_error, force_download, local_files_only)
@@ -976,6 +1142,17 @@ def _hf_hub_download_to_cache_dir(
976
1142
  blob_path = os.path.join(storage_folder, "blobs", etag)
977
1143
  pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
978
1144
 
1145
+ if dry_run:
1146
+ is_cached = os.path.exists(pointer_path) or os.path.exists(blob_path)
1147
+ return DryRunFileInfo(
1148
+ commit_hash=commit_hash,
1149
+ file_size=expected_size,
1150
+ filename=filename,
1151
+ is_cached=is_cached,
1152
+ local_path=pointer_path,
1153
+ will_download=force_download or not is_cached,
1154
+ )
1155
+
979
1156
  os.makedirs(os.path.dirname(blob_path), exist_ok=True)
980
1157
  os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
981
1158
 
@@ -984,39 +1161,53 @@ def _hf_hub_download_to_cache_dir(
984
1161
  # In that case store a ref.
985
1162
  _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
986
1163
 
987
- # If file already exists, return it (except if force_download=True)
988
- if not force_download:
989
- if os.path.exists(pointer_path):
990
- return pointer_path
991
-
992
- if os.path.exists(blob_path):
993
- # we have the blob already, but not the pointer
994
- _create_symlink(blob_path, pointer_path, new_blob=False)
995
- return pointer_path
996
-
997
1164
  # Prevent parallel downloads of the same file with a lock.
998
1165
  # etag could be duplicated across repos,
999
1166
  lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
1000
1167
 
1001
1168
  # Some Windows versions do not allow for paths longer than 255 characters.
1002
1169
  # In this case, we must specify it as an extended path by using the "\\?\" prefix.
1003
- if os.name == "nt" and len(os.path.abspath(lock_path)) > 255:
1170
+ if (
1171
+ os.name == "nt"
1172
+ and len(os.path.abspath(lock_path)) > 255
1173
+ and not os.path.abspath(lock_path).startswith("\\\\?\\")
1174
+ ):
1004
1175
  lock_path = "\\\\?\\" + os.path.abspath(lock_path)
1005
1176
 
1006
- if os.name == "nt" and len(os.path.abspath(blob_path)) > 255:
1177
+ if (
1178
+ os.name == "nt"
1179
+ and len(os.path.abspath(blob_path)) > 255
1180
+ and not os.path.abspath(blob_path).startswith("\\\\?\\")
1181
+ ):
1007
1182
  blob_path = "\\\\?\\" + os.path.abspath(blob_path)
1008
1183
 
1009
1184
  Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
1185
+
1186
+ # pointer already exists -> immediate return
1187
+ if not force_download and os.path.exists(pointer_path):
1188
+ return pointer_path
1189
+
1190
+ # Blob exists but pointer must be (safely) created -> take the lock
1191
+ if not force_download and os.path.exists(blob_path):
1192
+ with WeakFileLock(lock_path):
1193
+ if not os.path.exists(pointer_path):
1194
+ _create_symlink(blob_path, pointer_path, new_blob=False)
1195
+ return pointer_path
1196
+
1197
+ # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
1198
+
1010
1199
  with WeakFileLock(lock_path):
1011
1200
  _download_to_tmp_and_move(
1012
1201
  incomplete_path=Path(blob_path + ".incomplete"),
1013
1202
  destination_path=Path(blob_path),
1014
1203
  url_to_download=url_to_download,
1015
- proxies=proxies,
1016
1204
  headers=headers,
1017
1205
  expected_size=expected_size,
1018
1206
  filename=filename,
1019
1207
  force_download=force_download,
1208
+ etag=etag,
1209
+ xet_file_data=xet_file_data,
1210
+ tqdm_class=tqdm_class,
1020
1211
  )
1021
1212
  if not os.path.exists(pointer_path):
1022
1213
  _create_symlink(blob_path, pointer_path, new_blob=True)
@@ -1036,14 +1227,15 @@ def _hf_hub_download_to_local_dir(
1036
1227
  # HTTP info
1037
1228
  endpoint: Optional[str],
1038
1229
  etag_timeout: float,
1039
- headers: Dict[str, str],
1040
- proxies: Optional[Dict],
1230
+ headers: dict[str, str],
1041
1231
  token: Union[bool, str, None],
1042
1232
  # Additional options
1043
1233
  cache_dir: str,
1044
1234
  force_download: bool,
1045
1235
  local_files_only: bool,
1046
- ) -> str:
1236
+ tqdm_class: Optional[type[base_tqdm]],
1237
+ dry_run: bool,
1238
+ ) -> Union[str, DryRunFileInfo]:
1047
1239
  """Download a given file to a local folder, if not already present.
1048
1240
 
1049
1241
  Method should not be called directly. Please use `hf_hub_download` instead.
@@ -1058,22 +1250,31 @@ def _hf_hub_download_to_local_dir(
1058
1250
 
1059
1251
  # Local file exists + metadata exists + commit_hash matches => return file
1060
1252
  if (
1061
- not force_download
1062
- and REGEX_COMMIT_HASH.match(revision)
1253
+ REGEX_COMMIT_HASH.match(revision)
1063
1254
  and paths.file_path.is_file()
1064
1255
  and local_metadata is not None
1065
1256
  and local_metadata.commit_hash == revision
1066
1257
  ):
1067
- return str(paths.file_path)
1258
+ local_file = str(paths.file_path)
1259
+ if dry_run:
1260
+ return DryRunFileInfo(
1261
+ commit_hash=revision,
1262
+ file_size=os.path.getsize(local_file),
1263
+ filename=filename,
1264
+ is_cached=True,
1265
+ local_path=local_file,
1266
+ will_download=force_download,
1267
+ )
1268
+ if not force_download:
1269
+ return local_file
1068
1270
 
1069
1271
  # Local file doesn't exist or commit_hash doesn't match => we need the etag
1070
- (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
1272
+ (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = _get_metadata_or_catch_error(
1071
1273
  repo_id=repo_id,
1072
1274
  filename=filename,
1073
1275
  repo_type=repo_type,
1074
1276
  revision=revision,
1075
1277
  endpoint=endpoint,
1076
- proxies=proxies,
1077
1278
  etag_timeout=etag_timeout,
1078
1279
  headers=headers,
1079
1280
  token=token,
@@ -1082,11 +1283,24 @@ def _hf_hub_download_to_local_dir(
1082
1283
 
1083
1284
  if head_call_error is not None:
1084
1285
  # No HEAD call but local file exists => default to local file
1085
- if not force_download and paths.file_path.is_file():
1086
- logger.warning(
1087
- f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
1088
- )
1089
- return str(paths.file_path)
1286
+ if paths.file_path.is_file():
1287
+ if dry_run or not force_download:
1288
+ logger.warning(
1289
+ f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
1290
+ )
1291
+ local_path = str(paths.file_path)
1292
+ if dry_run and local_metadata is not None:
1293
+ return DryRunFileInfo(
1294
+ commit_hash=local_metadata.commit_hash,
1295
+ file_size=os.path.getsize(local_path),
1296
+ filename=filename,
1297
+ is_cached=True,
1298
+ local_path=local_path,
1299
+ will_download=force_download,
1300
+ )
1301
+ if not force_download:
1302
+ return local_path
1303
+
1090
1304
  # Otherwise => raise
1091
1305
  _raise_on_head_call_error(head_call_error, force_download, local_files_only)
1092
1306
 
@@ -1101,6 +1315,15 @@ def _hf_hub_download_to_local_dir(
1101
1315
  # etag matches => update metadata and return file
1102
1316
  if local_metadata is not None and local_metadata.etag == etag:
1103
1317
  write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1318
+ if dry_run:
1319
+ return DryRunFileInfo(
1320
+ commit_hash=commit_hash,
1321
+ file_size=expected_size,
1322
+ filename=filename,
1323
+ is_cached=True,
1324
+ local_path=str(paths.file_path),
1325
+ will_download=False,
1326
+ )
1104
1327
  return str(paths.file_path)
1105
1328
 
1106
1329
  # metadata is outdated + etag is a sha256
@@ -1112,6 +1335,15 @@ def _hf_hub_download_to_local_dir(
1112
1335
  file_hash = sha_fileobj(f).hex()
1113
1336
  if file_hash == etag:
1114
1337
  write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
1338
+ if dry_run:
1339
+ return DryRunFileInfo(
1340
+ commit_hash=commit_hash,
1341
+ file_size=expected_size,
1342
+ filename=filename,
1343
+ is_cached=True,
1344
+ local_path=str(paths.file_path),
1345
+ will_download=False,
1346
+ )
1115
1347
  return str(paths.file_path)
1116
1348
 
1117
1349
  # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
@@ -1130,8 +1362,28 @@ def _hf_hub_download_to_local_dir(
                 paths.file_path.parent.mkdir(parents=True, exist_ok=True)
                 shutil.copyfile(cached_path, paths.file_path)
             write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            if dry_run:
+                return DryRunFileInfo(
+                    commit_hash=commit_hash,
+                    file_size=expected_size,
+                    filename=filename,
+                    is_cached=True,
+                    local_path=str(paths.file_path),
+                    will_download=False,
+                )
             return str(paths.file_path)
 
+    if dry_run:
+        is_cached = paths.file_path.is_file()
+        return DryRunFileInfo(
+            commit_hash=commit_hash,
+            file_size=expected_size,
+            filename=filename,
+            is_cached=is_cached,
+            local_path=str(paths.file_path),
+            will_download=force_download or not is_cached,
+        )
+
     # Otherwise, let's download the file!
     with WeakFileLock(paths.lock_path):
         paths.file_path.unlink(missing_ok=True)  # delete outdated file first
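
The `dry_run` branches added above return a `DryRunFileInfo` instead of a local path. Below is a minimal usage sketch, assuming `hf_hub_download` in this release exposes a `dry_run` argument and that the returned fields match the ones constructed in these hunks (`commit_hash`, `file_size`, `filename`, `is_cached`, `local_path`, `will_download`); the repo id and filename are placeholders.

```python
from huggingface_hub import hf_hub_download

# Sketch only: ask what *would* happen without fetching any bytes.
# `dry_run=True` and the DryRunFileInfo fields are taken from the hunks above;
# the repo/file names below are placeholders.
info = hf_hub_download(repo_id="gpt2", filename="config.json", dry_run=True)
print(info.filename, info.file_size, info.commit_hash)
if info.will_download:
    print(f"Would download to {info.local_path}")
else:
    print(f"Already available at {info.local_path} (cached={info.is_cached})")
```
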
@@ -1139,11 +1391,13 @@ def _hf_hub_download_to_local_dir(
             incomplete_path=paths.incomplete_path(etag),
             destination_path=paths.file_path,
             url_to_download=url_to_download,
-            proxies=proxies,
             headers=headers,
             expected_size=expected_size,
             filename=filename,
             force_download=force_download,
+            etag=etag,
+            xet_file_data=xet_file_data,
+            tqdm_class=tqdm_class,
         )
 
         write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
@@ -1247,12 +1501,12 @@ def try_to_load_from_cache(
 def get_hf_file_metadata(
     url: str,
     token: Union[bool, str, None] = None,
-    proxies: Optional[Dict] = None,
     timeout: Optional[float] = constants.DEFAULT_REQUEST_TIMEOUT,
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
-    user_agent: Union[Dict, str, None] = None,
-    headers: Optional[Dict[str, str]] = None,
+    user_agent: Union[dict, str, None] = None,
+    headers: Optional[dict[str, str]] = None,
+    endpoint: Optional[str] = None,
 ) -> HfFileMetadata:
     """Fetch metadata of a file versioned on the Hub for a given url.
 
@@ -1265,9 +1519,6 @@ def get_hf_file_metadata(
                   folder.
                 - If `False` or `None`, no token is provided.
                 - If a string, it's used as the authentication token.
-        proxies (`dict`, *optional*):
-            Dictionary mapping protocol to the URL of the proxy passed to
-            `requests.request`.
         timeout (`float`, *optional*, defaults to 10):
             How many seconds to wait for the server to send metadata before giving up.
         library_name (`str`, *optional*):
@@ -1278,6 +1529,8 @@ def get_hf_file_metadata(
             The user-agent info in the form of a dictionary or a string.
         headers (`dict`, *optional*):
             Additional headers to be sent with the request.
+        endpoint (`str`, *optional*):
+            Endpoint of the Hub. Defaults to <https://huggingface.co>.
 
     Returns:
         A [`HfFileMetadata`] object containing metadata such as location, etag, size and
@@ -1293,30 +1546,23 @@ def get_hf_file_metadata(
     hf_headers["Accept-Encoding"] = "identity"  # prevent any compression => we want to know the real size of the file
 
     # Retrieve metadata
-    r = _request_wrapper(
-        method="HEAD",
-        url=url,
-        headers=hf_headers,
-        allow_redirects=False,
-        follow_relative_redirects=True,
-        proxies=proxies,
-        timeout=timeout,
-    )
-    hf_raise_for_status(r)
+    response = _httpx_follow_relative_redirects(method="HEAD", url=url, headers=hf_headers, timeout=timeout)
+    hf_raise_for_status(response)
 
     # Return
     return HfFileMetadata(
-        commit_hash=r.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
-        # We favor a custom header indicating the etag of the linked resource, and
-        # we fallback to the regular etag header.
-        etag=_normalize_etag(r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
+        commit_hash=response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT),
+        # We favor a custom header indicating the etag of the linked resource, and we fall back to the regular etag header.
+        etag=_normalize_etag(
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_ETAG) or response.headers.get("ETag")
+        ),
         # Either from response headers (if redirected) or defaults to request url
-        # Do not use directly `url`, as `_request_wrapper` might have followed relative
-        # redirects.
-        location=r.headers.get("Location") or r.request.url,  # type: ignore
+        # Do not use directly `url` as we might have followed relative redirects.
+        location=response.headers.get("Location") or str(response.request.url),  # type: ignore
         size=_int_or_none(
-            r.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or r.headers.get("Content-Length")
+            response.headers.get(constants.HUGGINGFACE_HEADER_X_LINKED_SIZE) or response.headers.get("Content-Length")
         ),
+        xet_file_data=parse_xet_file_data_from_response(response, endpoint=endpoint),  # type: ignore
     )
 
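
With the hunks above, `get_hf_file_metadata` drops `proxies`, gains an optional `endpoint`, and now fills `xet_file_data` on the returned `HfFileMetadata`. A short sketch of how a caller might use the updated signature; the repo and filename are placeholders, and the field set is taken from the constructor shown above.

```python
from huggingface_hub import get_hf_file_metadata, hf_hub_url

# Sketch: resolve a file URL, then fetch its metadata with the new signature
# (no `proxies` argument; `endpoint` is optional and defaults to the public Hub).
url = hf_hub_url(repo_id="gpt2", filename="config.json")  # placeholder repo/file
meta = get_hf_file_metadata(url)
print(meta.commit_hash, meta.etag, meta.size, meta.location)
print(meta.xet_file_data)  # None unless the file is backed by Xet Storage
```
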
 
@@ -1327,19 +1573,18 @@ def _get_metadata_or_catch_error(
     repo_type: str,
     revision: str,
     endpoint: Optional[str],
-    proxies: Optional[Dict],
     etag_timeout: Optional[float],
-    headers: Dict[str, str],  # mutated inplace!
+    headers: dict[str, str],  # mutated inplace!
     token: Union[bool, str, None],
     local_files_only: bool,
     relative_filename: Optional[str] = None,  # only used to store `.no_exists` in cache
     storage_folder: Optional[str] = None,  # only used to store `.no_exists` in cache
 ) -> Union[
     # Either an exception is caught and returned
-    Tuple[None, None, None, None, Exception],
+    tuple[None, None, None, None, None, Exception],
     # Or the metadata is returned as
-    # `(url_to_download, etag, commit_hash, expected_size, None)`
-    Tuple[str, str, str, int, None],
+    # `(url_to_download, etag, commit_hash, expected_size, xet_file_data, None)`
+    tuple[str, str, str, int, Optional[XetFileData], None],
 ]:
     """Get metadata for a file on the Hub, safely handling network issues.
 
@@ -1356,6 +1601,7 @@ def _get_metadata_or_catch_error(
             None,
             None,
             None,
+            None,
             OfflineModeIsEnabled(
                 f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
             ),
@@ -1367,6 +1613,7 @@ def _get_metadata_or_catch_error(
     commit_hash: Optional[str] = None
     expected_size: Optional[int] = None
     head_error_call: Optional[Exception] = None
+    xet_file_data: Optional[XetFileData] = None
 
     # Try to get metadata from the server.
     # Do not raise yet if the file is not found or not accessible.
@@ -1374,9 +1621,9 @@ def _get_metadata_or_catch_error(
     try:
         try:
             metadata = get_hf_file_metadata(
-                url=url, proxies=proxies, timeout=etag_timeout, headers=headers, token=token
+                url=url, timeout=etag_timeout, headers=headers, token=token, endpoint=endpoint
             )
-        except EntryNotFoundError as http_error:
+        except RemoteEntryNotFoundError as http_error:
             if storage_folder is not None and relative_filename is not None:
                 # Cache the non-existence of the file
                 commit_hash = http_error.response.headers.get(constants.HUGGINGFACE_HEADER_X_REPO_COMMIT)
@@ -1414,32 +1661,30 @@ def _get_metadata_or_catch_error(
         if expected_size is None:
             raise FileMetadataError("Distant resource does not have a Content-Length.")
 
+        xet_file_data = metadata.xet_file_data
+
         # In case of a redirect, save an extra redirect on the request.get call,
         # and ensure we download the exact atomic version even if it changed
         # between the HEAD and the GET (unlikely, but hey).
         #
         # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
         # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
-        if url != metadata.location:
+        if xet_file_data is None and url != metadata.location:
             url_to_download = metadata.location
             if urlparse(url).netloc != urlparse(metadata.location).netloc:
                 # Remove authorization header when downloading a LFS blob
                 headers.pop("authorization", None)
-    except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
-        # Actually raise for those subclasses of ConnectionError
+    except httpx.ProxyError:
+        # Actually raise on proxy error
         raise
-    except (
-        requests.exceptions.ConnectionError,
-        requests.exceptions.Timeout,
-        OfflineModeIsEnabled,
-    ) as error:
+    except (httpx.ConnectError, httpx.TimeoutException, OfflineModeIsEnabled) as error:
         # Otherwise, our Internet connection is down.
         # etag is None
         head_error_call = error
-    except (RevisionNotFoundError, EntryNotFoundError):
+    except (RevisionNotFoundError, RemoteEntryNotFoundError):
         # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
         raise
-    except requests.HTTPError as error:
+    except HfHubHTTPError as error:
         # Multiple reasons for an http error:
         # - Repository is private and invalid/missing token sent
         # - Repository is gated and invalid/missing token sent
@@ -1458,7 +1703,7 @@ def _get_metadata_or_catch_error(
     if not (local_files_only or etag is not None or head_error_call is not None):
         raise RuntimeError("etag is empty due to uncovered problems")
 
-    return (url_to_download, etag, commit_hash, expected_size, head_error_call)  # type: ignore [return-value]
+    return (url_to_download, etag, commit_hash, expected_size, xet_file_data, head_error_call)  # type: ignore [return-value]
 
 
 def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
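
`_get_metadata_or_catch_error` now yields a 6-tuple that includes `xet_file_data` (or six slots of `None` plus the caught exception). A hedged illustration of how the call site shown earlier in this diff unpacks it; every value below is a placeholder, not real output.

```python
# All values below are placeholders; in the library this tuple comes from
# _get_metadata_or_catch_error(...) as shown at the call site earlier in this diff.
(url_to_download, etag, commit_hash, expected_size, xet_file_data, head_call_error) = (
    "https://example.invalid/gpt2/resolve/main/config.json",  # placeholder URL
    "dummy-etag",                                             # placeholder etag
    "0123456789abcdef0123456789abcdef01234567",               # placeholder commit hash
    665,                                                      # placeholder size in bytes
    None,  # XetFileData instance when the repo is Xet-backed
    None,  # the caught Exception when the HEAD call failed
)
if head_call_error is not None:
    raise head_call_error
print(f"etag={etag}, size={expected_size}, xet-backed={xet_file_data is not None}")
```
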
@@ -1497,18 +1742,20 @@ def _download_to_tmp_and_move(
     incomplete_path: Path,
     destination_path: Path,
     url_to_download: str,
-    proxies: Optional[Dict],
-    headers: Dict[str, str],
+    headers: dict[str, str],
     expected_size: Optional[int],
     filename: str,
     force_download: bool,
+    etag: Optional[str],
+    xet_file_data: Optional[XetFileData],
+    tqdm_class: Optional[type[base_tqdm]] = None,
 ) -> None:
     """Download content from a URL to a destination path.
 
     Internal logic:
     - return early if file is already downloaded
     - resume download if possible (from incomplete file)
-    - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
+    - do not resume download if `force_download=True`
     - check disk space before downloading
     - download content to a temporary file
     - set correct permissions on temporary file
@@ -1520,16 +1767,11 @@ def _download_to_tmp_and_move(
         # Do nothing if already exists (except if force_download=True)
         return
 
-    if incomplete_path.exists() and (force_download or (constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+    if incomplete_path.exists() and force_download:
         # By default, we will try to resume the download if possible.
-        # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+        # However, if the user has set `force_download=True`, then we should
         # not resume the download => delete the incomplete file.
-        message = f"Removing incomplete file '{incomplete_path}'"
-        if force_download:
-            message += " (force_download=True)"
-        elif constants.HF_HUB_ENABLE_HF_TRANSFER and not proxies:
-            message += " (hf_transfer=True)"
-        logger.info(message)
+        logger.debug(f"Removing incomplete file '{incomplete_path}' (force_download=True)")
         incomplete_path.unlink(missing_ok=True)
 
     with incomplete_path.open("ab") as f:
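
The resume logic above keeps an existing `*.incomplete` file unless `force_download=True`, then reuses its current size as the HTTP resume offset (obtained from the file opened in append mode). A stdlib-only sketch of that decision; the helper name and path are illustrative, not part of the library.

```python
from pathlib import Path

def plan_resume(incomplete_path: Path, force_download: bool) -> int:
    """Return the byte offset to resume from (0 means start over).

    Mirrors the behaviour in the hunk above: an existing incomplete file is
    deleted when force_download=True, otherwise its current size is reused
    as the resume offset. Illustrative helper only.
    """
    if incomplete_path.exists() and force_download:
        incomplete_path.unlink()
    return incomplete_path.stat().st_size if incomplete_path.exists() else 0

# Usage (placeholder path):
offset = plan_resume(Path("model.safetensors.incomplete"), force_download=False)
print(f"resume from byte {offset}")
```
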
@@ -1537,23 +1779,41 @@ def _download_to_tmp_and_move(
         message = f"Downloading '{filename}' to '{incomplete_path}'"
         if resume_size > 0 and expected_size is not None:
             message += f" (resume from {resume_size}/{expected_size})"
-        logger.info(message)
+        logger.debug(message)
 
         if expected_size is not None:  # might be None if HTTP header not set correctly
             # Check disk space in both tmp and destination path
             _check_disk_space(expected_size, incomplete_path.parent)
             _check_disk_space(expected_size, destination_path.parent)
 
-        http_get(
-            url_to_download,
-            f,
-            proxies=proxies,
-            resume_size=resume_size,
-            headers=headers,
-            expected_size=expected_size,
-        )
+        if xet_file_data is not None and is_xet_available():
+            logger.debug("Xet Storage is enabled for this repo. Downloading file from Xet Storage..")
+            xet_get(
+                incomplete_path=incomplete_path,
+                xet_file_data=xet_file_data,
+                headers=headers,
+                expected_size=expected_size,
+                displayed_filename=filename,
+                tqdm_class=tqdm_class,
+            )
+        else:
+            if xet_file_data is not None and not constants.HF_HUB_DISABLE_XET:
+                logger.warning(
+                    "Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. "
+                    "Falling back to regular HTTP download. "
+                    "For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`"
+                )
+
+            http_get(
+                url_to_download,
+                f,
+                resume_size=resume_size,
+                headers=headers,
+                expected_size=expected_size,
+                tqdm_class=tqdm_class,
+            )
 
-    logger.info(f"Download complete. Moving file to {destination_path}")
+    logger.debug(f"Download complete. Moving file to {destination_path}")
     _chmod_and_move(incomplete_path, destination_path)
 
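
The final hunk prefers Xet Storage when the server advertised it and `hf_xet` is usable, and otherwise warns once and falls back to plain `http_get`. A simplified sketch of that backend choice, assuming `is_xet_available()` roughly amounts to "`hf_xet` is importable and `HF_HUB_DISABLE_XET` is not set"; the helper below is illustrative only, not the library's implementation.

```python
import importlib.util
import os

def choose_download_backend(xet_file_data) -> str:
    """Simplified decision mirroring the hunk above (illustrative only)."""
    xet_disabled = os.environ.get("HF_HUB_DISABLE_XET", "") not in ("", "0", "false", "False")
    hf_xet_installed = importlib.util.find_spec("hf_xet") is not None
    if xet_file_data is not None and hf_xet_installed and not xet_disabled:
        return "xet"   # -> xet_get(...)
    if xet_file_data is not None and not xet_disabled:
        print("Xet-backed file but 'hf_xet' is not installed; falling back to HTTP download.")
    return "http"      # -> http_get(...)

print(choose_download_backend(xet_file_data=None))  # "http"
```
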