huggingface-hub 0.18.0__py3-none-any.whl → 0.19.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of huggingface-hub might be problematic.

Files changed (43)
  1. huggingface_hub/__init__.py +31 -5
  2. huggingface_hub/_inference_endpoints.py +348 -0
  3. huggingface_hub/_login.py +9 -7
  4. huggingface_hub/_multi_commits.py +1 -1
  5. huggingface_hub/_snapshot_download.py +6 -7
  6. huggingface_hub/_space_api.py +7 -4
  7. huggingface_hub/_tensorboard_logger.py +1 -0
  8. huggingface_hub/_webhooks_payload.py +7 -7
  9. huggingface_hub/commands/lfs.py +3 -6
  10. huggingface_hub/commands/user.py +1 -4
  11. huggingface_hub/constants.py +27 -0
  12. huggingface_hub/file_download.py +142 -134
  13. huggingface_hub/hf_api.py +1036 -501
  14. huggingface_hub/hf_file_system.py +57 -12
  15. huggingface_hub/hub_mixin.py +3 -5
  16. huggingface_hub/inference/_client.py +43 -8
  17. huggingface_hub/inference/_common.py +8 -16
  18. huggingface_hub/inference/_generated/_async_client.py +41 -8
  19. huggingface_hub/inference/_text_generation.py +43 -0
  20. huggingface_hub/inference_api.py +1 -1
  21. huggingface_hub/lfs.py +32 -14
  22. huggingface_hub/repocard_data.py +7 -0
  23. huggingface_hub/repository.py +19 -3
  24. huggingface_hub/templates/modelcard_template.md +1 -1
  25. huggingface_hub/utils/__init__.py +1 -1
  26. huggingface_hub/utils/_cache_assets.py +3 -3
  27. huggingface_hub/utils/_cache_manager.py +6 -7
  28. huggingface_hub/utils/_datetime.py +3 -1
  29. huggingface_hub/utils/_errors.py +10 -0
  30. huggingface_hub/utils/_hf_folder.py +4 -2
  31. huggingface_hub/utils/_http.py +10 -1
  32. huggingface_hub/utils/_runtime.py +4 -2
  33. huggingface_hub/utils/endpoint_helpers.py +27 -175
  34. huggingface_hub/utils/insecure_hashlib.py +34 -0
  35. huggingface_hub/utils/logging.py +4 -6
  36. huggingface_hub/utils/sha.py +2 -1
  37. {huggingface_hub-0.18.0.dist-info → huggingface_hub-0.19.0.dist-info}/METADATA +16 -15
  38. huggingface_hub-0.19.0.dist-info/RECORD +74 -0
  39. {huggingface_hub-0.18.0.dist-info → huggingface_hub-0.19.0.dist-info}/WHEEL +1 -1
  40. huggingface_hub-0.18.0.dist-info/RECORD +0 -72
  41. {huggingface_hub-0.18.0.dist-info → huggingface_hub-0.19.0.dist-info}/LICENSE +0 -0
  42. {huggingface_hub-0.18.0.dist-info → huggingface_hub-0.19.0.dist-info}/entry_points.txt +0 -0
  43. {huggingface_hub-0.18.0.dist-info → huggingface_hub-0.19.0.dist-info}/top_level.txt +0 -0
--- a/huggingface_hub/_webhooks_payload.py
+++ b/huggingface_hub/_webhooks_payload.py
@@ -55,7 +55,7 @@ class ObjectId(BaseModel):
 
 class WebhookPayloadUrl(BaseModel):
     web: str
-    api: Optional[str]
+    api: Optional[str] = None
 
 
 class WebhookPayloadMovedTo(BaseModel):
@@ -74,7 +74,7 @@ class WebhookPayloadEvent(BaseModel):
 
 class WebhookPayloadDiscussionChanges(BaseModel):
     base: str
-    mergeCommitId: Optional[str]
+    mergeCommitId: Optional[str] = None
 
 
 class WebhookPayloadComment(ObjectId):
@@ -92,16 +92,16 @@ class WebhookPayloadDiscussion(ObjectId):
     isPullRequest: bool
     status: DiscussionStatus_T
     changes: Optional[WebhookPayloadDiscussionChanges]
-    pinned: Optional[bool]
+    pinned: Optional[bool] = None
 
 
 class WebhookPayloadRepo(ObjectId):
     owner: ObjectId
-    head_sha: Optional[str]
+    head_sha: Optional[str] = None
     name: str
     private: bool
-    subdomain: Optional[str]
-    tags: Optional[List[str]]
+    subdomain: Optional[str] = None
+    tags: Optional[List[str]] = None
     type: Literal["dataset", "model", "space"]
     url: WebhookPayloadUrl
 
@@ -112,4 +112,4 @@ class WebhookPayload(BaseModel):
     discussion: Optional[WebhookPayloadDiscussion]
     comment: Optional[WebhookPayloadComment]
     webhook: WebhookPayloadWebhook
-    movedTo: Optional[WebhookPayloadMovedTo]
+    movedTo: Optional[WebhookPayloadMovedTo] = None
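In Pydantic v2, annotating a field as `Optional[X]` no longer makes it optional: it only widens the accepted type to `X | None`, and the field stays required unless a default is given. The `= None` defaults above restore the v1 behavior for payload fields the Hub may omit. A minimal illustration (the `Payload` model is made up for the example):

```python
from typing import Optional

from pydantic import BaseModel


class Payload(BaseModel):
    required: Optional[str]         # v2: value may be None, but must be provided
    optional: Optional[str] = None  # v2: may be omitted, defaults to None


Payload(required=None)  # ok
# Payload() would raise a ValidationError: field `required` is missing
```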
--- a/huggingface_hub/commands/lfs.py
+++ b/huggingface_hub/commands/lfs.py
@@ -56,16 +56,13 @@ class LfsCommands(BaseHuggingfaceCLICommand):
     @staticmethod
     def register_subcommand(parser: _SubParsersAction):
         enable_parser = parser.add_parser(
-            "lfs-enable-largefiles",
-            help="Configure your repository to enable upload of files > 5GB.",
+            "lfs-enable-largefiles", help="Configure your repository to enable upload of files > 5GB."
         )
         enable_parser.add_argument("path", type=str, help="Local path to repository you want to configure.")
         enable_parser.set_defaults(func=lambda args: LfsEnableCommand(args))
 
-        upload_parser = parser.add_parser(
-            LFS_MULTIPART_UPLOAD_COMMAND,
-            help="Command will get called by git-lfs, do not call it directly.",
-        )
+        # Command will get called by git-lfs, do not call it directly.
+        upload_parser = parser.add_parser(LFS_MULTIPART_UPLOAD_COMMAND, add_help=False)
         upload_parser.set_defaults(func=lambda args: LfsUploadCommand(args))
 
 
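Dropping the `help=` string and passing `add_help=False` keeps the git-lfs callback out of the user-facing CLI. A standalone sketch of that argparse pattern (command names are made up):

```python
import argparse

parser = argparse.ArgumentParser(prog="huggingface-cli")
subparsers = parser.add_subparsers()

# Documented command: the help string shows up in `huggingface-cli --help`.
subparsers.add_parser("visible-cmd", help="A user-facing command.")

# Internal command: without `help=` it gets no description in the listing,
# and add_help=False disables its automatic -h/--help flag as well.
subparsers.add_parser("internal-cmd", add_help=False)

parser.print_help()
```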
--- a/huggingface_hub/commands/user.py
+++ b/huggingface_hub/commands/user.py
@@ -58,10 +58,7 @@ class UserCommands(BaseHuggingfaceCLICommand):
         logout_parser.set_defaults(func=lambda args: LogoutCommand(args))
 
         # new system: git-based repo system
-        repo_parser = parser.add_parser(
-            "repo",
-            help="{create, ls-files} Commands to interact with your huggingface.co repos.",
-        )
+        repo_parser = parser.add_parser("repo", help="{create} Commands to interact with your huggingface.co repos.")
         repo_subparsers = repo_parser.add_subparsers(help="huggingface.co repos related commands")
         repo_create_parser = repo_subparsers.add_parser("create", help="Create a new repo on huggingface.co")
         repo_create_parser.add_argument(
--- a/huggingface_hub/constants.py
+++ b/huggingface_hub/constants.py
@@ -5,6 +5,7 @@ from typing import Optional
 
 # Possible values for env variables
 
+
 ENV_VARS_TRUE_VALUES = {"1", "ON", "YES", "TRUE"}
 ENV_VARS_TRUE_AND_AUTO_VALUES = ENV_VARS_TRUE_VALUES.union({"AUTO"})
 
@@ -29,6 +30,11 @@ TF_WEIGHTS_NAME = "model.ckpt"
 FLAX_WEIGHTS_NAME = "flax_model.msgpack"
 CONFIG_NAME = "config.json"
 REPOCARD_NAME = "README.md"
+DEFAULT_ETAG_TIMEOUT = 10
+DEFAULT_DOWNLOAD_TIMEOUT = 10
+DEFAULT_REQUEST_TIMEOUT = 10
+DOWNLOAD_CHUNK_SIZE = 10 * 1024 * 1024
+HF_TRANSFER_CONCURRENCY = 100
 
 # Git-related constants
 
@@ -48,6 +54,10 @@ HUGGINGFACE_HEADER_X_LINKED_SIZE = "X-Linked-Size"
 
 INFERENCE_ENDPOINT = os.environ.get("HF_INFERENCE_ENDPOINT", "https://api-inference.huggingface.co")
 
+# See https://huggingface.co/docs/inference-endpoints/index
+INFERENCE_ENDPOINTS_ENDPOINT = "https://api.endpoints.huggingface.cloud/v2"
+
+
 REPO_ID_SEPARATOR = "--"
 # ^ this substring is not allowed in repo_ids on hf.co
 # and is the canonical one we use for serialization of repo ids elsewhere.
@@ -82,9 +92,14 @@ hf_cache_home = os.path.expanduser(
 default_cache_path = os.path.join(hf_cache_home, "hub")
 default_assets_cache_path = os.path.join(hf_cache_home, "assets")
 
+# Legacy env variables
 HUGGINGFACE_HUB_CACHE = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
 HUGGINGFACE_ASSETS_CACHE = os.getenv("HUGGINGFACE_ASSETS_CACHE", default_assets_cache_path)
 
+# New env variables
+HF_HUB_CACHE = os.getenv("HF_HUB_CACHE", HUGGINGFACE_HUB_CACHE)
+HF_ASSETS_CACHE = os.getenv("HF_ASSETS_CACHE", HUGGINGFACE_ASSETS_CACHE)
+
 HF_HUB_OFFLINE = _is_true(os.environ.get("HF_HUB_OFFLINE") or os.environ.get("TRANSFORMERS_OFFLINE"))
 
 # Opt-out from telemetry requests
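`HF_HUB_CACHE` is resolved after `HUGGINGFACE_HUB_CACHE`, so the precedence is: new variable if set, else legacy variable if set, else the default path. A quick way to see the chain (paths illustrative):

```python
import os

default_cache_path = os.path.join(os.path.expanduser("~"), ".cache", "huggingface", "hub")

os.environ["HUGGINGFACE_HUB_CACHE"] = "/tmp/legacy-cache"  # legacy variable set
# HF_HUB_CACHE is unset, so it falls back to the legacy value:
legacy = os.getenv("HUGGINGFACE_HUB_CACHE", default_cache_path)
print(os.getenv("HF_HUB_CACHE", legacy))  # /tmp/legacy-cache

os.environ["HF_HUB_CACHE"] = "/tmp/new-cache"  # new variable wins when set
print(os.getenv("HF_HUB_CACHE", legacy))  # /tmp/new-cache
```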
@@ -97,6 +112,12 @@ _OLD_HF_TOKEN_PATH = os.path.expanduser("~/.huggingface/token")
 HF_TOKEN_PATH = os.path.join(hf_cache_home, "token")
 
 
+if _staging_mode:
+    # In staging mode, we use a different cache to ensure we don't mix up production and staging data or tokens
+    _staging_home = os.path.join(os.path.expanduser("~"), ".cache", "huggingface_staging")
+    HUGGINGFACE_HUB_CACHE = os.path.join(_staging_home, "hub")
+    HF_TOKEN_PATH = os.path.join(_staging_home, "token")
+
 # Here, `True` will disable progress bars globally without possibility of enabling it
 # programmatically. `False` will enable them without possibility of disabling them.
 # If environment variable is not set (None), then the user is free to enable/disable
@@ -130,6 +151,12 @@ HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD: int = (
     _as_int(os.environ.get("HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD")) or 5 * 1024 * 1024
 )
 
+# Used to override the etag timeout on a system level
+HF_HUB_ETAG_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT
+
+# Used to override the get request timeout on a system level
+HF_HUB_DOWNLOAD_TIMEOUT: int = _as_int(os.environ.get("HF_HUB_DOWNLOAD_TIMEOUT")) or DEFAULT_DOWNLOAD_TIMEOUT
+
 # List frameworks that are handled by the InferenceAPI service. Useful to scan endpoints and check which models are
 # deployed and running. Since 95% of the models are using the top 4 frameworks listed below, we scan only those by
 # default. We still keep the full list of supported frameworks in case we want to scan all of them.
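Both overrides follow the same `_as_int(os.environ.get(...)) or DEFAULT` pattern already used for `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD` above. A sketch of how it behaves, with a simplified stand-in for the library's `_as_int` helper:

```python
import os
from typing import Optional


def _as_int(value: Optional[str]) -> Optional[int]:
    # Simplified stand-in: None stays None, anything else is parsed as int.
    return int(value) if value is not None else None


DEFAULT_ETAG_TIMEOUT = 10

# Unset variable -> _as_int returns None -> `or` picks the default.
# Note the edge case: an explicit "0" is falsy too and also yields 10.
HF_HUB_ETAG_TIMEOUT = _as_int(os.environ.get("HF_HUB_ETAG_TIMEOUT")) or DEFAULT_ETAG_TIMEOUT
print(HF_HUB_ETAG_TIMEOUT)  # 10
```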
--- a/huggingface_hub/file_download.py
+++ b/huggingface_hub/file_download.py
@@ -1,5 +1,6 @@
 import copy
 import fnmatch
+import inspect
 import io
 import json
 import os
@@ -7,33 +8,39 @@ import re
 import shutil
 import stat
 import tempfile
+import time
 import uuid
 import warnings
 from contextlib import contextmanager
 from dataclasses import dataclass
 from functools import partial
-from hashlib import sha256
 from pathlib import Path
 from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Tuple, Union
 from urllib.parse import quote, urlparse
 
 import requests
 from filelock import FileLock
-from requests.exceptions import ProxyError, Timeout
 
 from huggingface_hub import constants
 
 from . import __version__  # noqa: F401 # for backward compatibility
 from .constants import (
+    DEFAULT_ETAG_TIMEOUT,
+    DEFAULT_REQUEST_TIMEOUT,
     DEFAULT_REVISION,
+    DOWNLOAD_CHUNK_SIZE,
     ENDPOINT,
+    HF_HUB_CACHE,
     HF_HUB_DISABLE_SYMLINKS_WARNING,
+    HF_HUB_DOWNLOAD_TIMEOUT,
     HF_HUB_ENABLE_HF_TRANSFER,
+    HF_HUB_ETAG_TIMEOUT,
+    HF_TRANSFER_CONCURRENCY,
     HUGGINGFACE_CO_URL_TEMPLATE,
    HUGGINGFACE_HEADER_X_LINKED_ETAG,
     HUGGINGFACE_HEADER_X_LINKED_SIZE,
     HUGGINGFACE_HEADER_X_REPO_COMMIT,
-    HUGGINGFACE_HUB_CACHE,
+    HUGGINGFACE_HUB_CACHE,  # noqa: F401 # for backward compatibility
     REPO_ID_SEPARATOR,
     REPO_TYPES,
     REPO_TYPES_URL_PREFIXES,
@@ -52,10 +59,10 @@ from .utils import (
     get_graphviz_version,  # noqa: F401 # for backward compatibility
     get_jinja_version,  # noqa: F401 # for backward compatibility
     get_pydot_version,  # noqa: F401 # for backward compatibility
+    get_session,
     get_tf_version,  # noqa: F401 # for backward compatibility
     get_torch_version,  # noqa: F401 # for backward compatibility
     hf_raise_for_status,
-    http_backoff,
     is_fastai_available,  # noqa: F401 # for backward compatibility
     is_fastcore_available,  # noqa: F401 # for backward compatibility
     is_graphviz_available,  # noqa: F401 # for backward compatibility
@@ -64,12 +71,14 @@ from .utils import (
     is_tf_available,  # noqa: F401 # for backward compatibility
     is_torch_available,  # noqa: F401 # for backward compatibility
     logging,
+    reset_sessions,
     tqdm,
     validate_hf_hub_args,
 )
 from .utils._headers import _http_user_agent
 from .utils._runtime import _PY_VERSION  # noqa: F401 # for backward compatibility
 from .utils._typing import HTTP_METHOD_T
+from .utils.insecure_hashlib import sha256
 
 
 logger = logging.get_logger(__name__)
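`sha256` now comes from the new `utils/insecure_hashlib.py` module (item 34 in the file list). Its body is not shown in this diff; the usual shape of such a shim is to construct hashes with `usedforsecurity=False` so that non-cryptographic uses (etags, cache keys) keep working on FIPS-restricted builds. A hedged sketch of that pattern:

```python
# Sketch only: the actual contents of huggingface_hub/utils/insecure_hashlib.py
# are not part of this diff.
import functools
import hashlib
import sys

if sys.version_info >= (3, 9):
    # Python 3.9+ hash constructors accept usedforsecurity=False,
    # which keeps them usable on FIPS-enabled interpreters.
    sha256 = functools.partial(hashlib.new, "sha256", usedforsecurity=False)
else:
    sha256 = functools.partial(hashlib.new, "sha256")
```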
@@ -95,7 +104,7 @@ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
     """
     # Defaults to HF cache
     if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
+        cache_dir = HF_HUB_CACHE
     cache_dir = str(Path(cache_dir).expanduser().resolve())  # make it unique
 
     # Check symlink compatibility only once (per cache directory) at first time use
@@ -200,9 +209,6 @@ def hf_hub_url(
         revision (`str`, *optional*):
             An optional Git revision id which can be a branch name, a tag, or a
             commit hash.
-        endpoint (`str`, *optional*):
-            Hugging Face Hub base url. Will default to https://huggingface.co/. Otherwise, one can set the `HF_ENDPOINT`
-            environment variable.
 
     Example:
 
@@ -319,7 +325,7 @@ def filename_to_url(
     )
 
     if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
+        cache_dir = HF_HUB_CACHE
     if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
@@ -367,46 +373,24 @@ def _raise_if_offline_mode_is_enabled(msg: Optional[str] = None):
 
 
 def _request_wrapper(
-    method: HTTP_METHOD_T,
-    url: str,
-    *,
-    max_retries: int = 0,
-    base_wait_time: float = 0.5,
-    max_wait_time: float = 2,
-    timeout: Optional[float] = 10.0,
-    follow_relative_redirects: bool = False,
-    **params,
+    method: HTTP_METHOD_T, url: str, *, follow_relative_redirects: bool = False, **params
 ) -> requests.Response:
     """Wrapper around requests methods to add several features.
 
     What it does:
-    1. Ensure offline mode is disabled (env variable `HF_HUB_OFFLINE` not set to 1).
-       If enabled, a `OfflineModeIsEnabled` exception is raised.
-    2. Follow relative redirections if `follow_relative_redirects=True` even when
-       `allow_redirection` kwarg is set to False.
-    3. Retry in case request fails with a `Timeout` or `ProxyError`, with exponential backoff.
+    1. Ensure offline mode is disabled (env variable `HF_HUB_OFFLINE` not set to 1). If enabled, a
+       `OfflineModeIsEnabled` exception is raised.
+    2. Follow relative redirects if `follow_relative_redirects=True` even when `allow_redirection=False`.
 
     Args:
         method (`str`):
             HTTP method, such as 'GET' or 'HEAD'.
         url (`str`):
             The URL of the resource to fetch.
-        max_retries (`int`, *optional*, defaults to `0`):
-            Maximum number of retries, defaults to 0 (no retries).
-        base_wait_time (`float`, *optional*, defaults to `0.5`):
-            Duration (in seconds) to wait before retrying the first time.
-            Wait time between retries then grows exponentially, capped by
-            `max_wait_time`.
-        max_wait_time (`float`, *optional*, defaults to `2`):
-            Maximum amount of time between two retries, in seconds.
-        timeout (`float`, *optional*, defaults to `10`):
-            How many seconds to wait for the server to send data before
-            giving up which is passed to `requests.request`.
         follow_relative_redirects (`bool`, *optional*, defaults to `False`)
-            If True, relative redirection (redirection to the same site) will be
-            resolved even when `allow_redirection` kwarg is set to False. Useful when we
-            want to follow a redirection to a renamed repository without following
-            redirection to a CDN.
+            If True, relative redirection (redirection to the same site) will be resolved even when `allow_redirection`
+            kwarg is set to False. Useful when we want to follow a redirection to a renamed repository without
+            following redirection to a CDN.
         **params (`dict`, *optional*):
             Params to pass to `requests.request`.
     """
@@ -418,10 +402,6 @@ def _request_wrapper(
         response = _request_wrapper(
             method=method,
             url=url,
-            max_retries=max_retries,
-            base_wait_time=base_wait_time,
-            max_wait_time=max_wait_time,
-            timeout=timeout,
             follow_relative_redirects=False,
             **params,
         )
@@ -437,38 +417,14 @@ def _request_wrapper(
             #
             # Highly inspired by `resolve_redirects` from requests library.
            # See https://github.com/psf/requests/blob/main/requests/sessions.py#L159
-            return _request_wrapper(
-                method=method,
-                url=urlparse(url)._replace(path=parsed_target.path).geturl(),
-                max_retries=max_retries,
-                base_wait_time=base_wait_time,
-                max_wait_time=max_wait_time,
-                timeout=timeout,
-                follow_relative_redirects=True,  # resolve recursively
-                **params,
-            )
+            next_url = urlparse(url)._replace(path=parsed_target.path).geturl()
+            return _request_wrapper(method=method, url=next_url, follow_relative_redirects=True, **params)
        return response
 
-    # 3. Exponential backoff
-    return http_backoff(
-        method=method,
-        url=url,
-        max_retries=max_retries,
-        base_wait_time=base_wait_time,
-        max_wait_time=max_wait_time,
-        retry_on_exceptions=(Timeout, ProxyError),
-        retry_on_status_codes=(),
-        timeout=timeout,
-        **params,
-    )
-
-
-def _request_with_retry(*args, **kwargs) -> requests.Response:
-    """Deprecated method. Please use `_request_wrapper` instead.
-
-    Alias to keep backward compatibility (used in Transformers).
-    """
-    return _request_wrapper(*args, **kwargs)
+    # Perform request and return if status_code is not in the retry list.
+    response = get_session().request(method=method, url=url, **params)
+    hf_raise_for_status(response)
+    return response
 
 
 def http_get(
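With retries gone, `_request_wrapper` reduces to the offline check, one recursion level for relative redirects, and a plain `get_session().request(...)` call; transport-level retrying now lives elsewhere. A self-contained sketch of the relative-redirect recursion, using a hypothetical `fetch` helper rather than the library's function:

```python
from urllib.parse import urlparse

import requests


def fetch(method: str, url: str, follow_relative_redirects: bool = False) -> requests.Response:
    # Hypothetical stand-in for _request_wrapper.
    if follow_relative_redirects:
        response = fetch(method, url, follow_relative_redirects=False)
        location = response.headers.get("Location", "")
        if 300 <= response.status_code < 400 and location and not urlparse(location).netloc:
            # Relative redirect (no host): same site, e.g. a renamed repo.
            # Resolve it ourselves instead of following an absolute CDN redirect.
            next_url = urlparse(url)._replace(path=location).geturl()
            return fetch(method, next_url, follow_relative_redirects=True)
        return response
    # Base case: a single request; redirects are left to the caller.
    return requests.request(method, url, allow_redirects=False)
```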
@@ -478,49 +434,39 @@ def http_get(
     proxies=None,
     resume_size: float = 0,
     headers: Optional[Dict[str, str]] = None,
-    timeout: Optional[float] = 10.0,
-    max_retries: int = 0,
     expected_size: Optional[int] = None,
+    _nb_retries: int = 5,
 ):
     """
     Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.
+
+    If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
+    transient error (network outage?). We log a warning message and try to resume the download a few times before
+    giving up. The method gives up after 5 attempts if no new data has been received from the server.
     """
-    if not resume_size:
-        if HF_HUB_ENABLE_HF_TRANSFER:
+    hf_transfer = None
+    if HF_HUB_ENABLE_HF_TRANSFER:
+        if resume_size != 0:
+            warnings.warn("'hf_transfer' does not support `resume_size`: falling back to regular download method")
+        elif proxies is not None:
+            warnings.warn("'hf_transfer' does not support `proxies`: falling back to regular download method")
+        else:
             try:
-                # Download file using an external Rust-based package. Download is faster
-                # (~2x speed-up) but support less features (no progress bars).
-                from hf_transfer import download
-
-                logger.debug(f"Download {url} using HF_TRANSFER.")
-                max_files = 100
-                chunk_size = 10 * 1024 * 1024  # 10 MB
-                download(url, temp_file.name, max_files, chunk_size, headers=headers)
-                return
+                import hf_transfer  # type: ignore[no-redef]
             except ImportError:
                 raise ValueError(
                     "Fast download using 'hf_transfer' is enabled"
                     " (HF_HUB_ENABLE_HF_TRANSFER=1) but 'hf_transfer' package is not"
                     " available in your environment. Try `pip install hf_transfer`."
                 )
-            except Exception as e:
-                raise RuntimeError(
-                    "An error occurred while downloading using `hf_transfer`. Consider"
-                    " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
-                ) from e
 
+    initial_headers = headers
     headers = copy.deepcopy(headers) or {}
     if resume_size > 0:
         headers["Range"] = "bytes=%d-" % (resume_size,)
 
     r = _request_wrapper(
-        method="GET",
-        url=url,
-        stream=True,
-        proxies=proxies,
-        headers=headers,
-        timeout=timeout,
-        max_retries=max_retries,
+        method="GET", url=url, stream=True, proxies=proxies, headers=headers, timeout=HF_HUB_DOWNLOAD_TIMEOUT
     )
     hf_raise_for_status(r)
     content_length = r.headers.get("Content-Length")
@@ -541,28 +487,90 @@ def http_get(
     if len(displayed_name) > 40:
         displayed_name = f"(…){displayed_name[-40:]}"
 
-    progress = tqdm(
+    consistency_error_message = (
+        f"Consistency check failed: file should be of size {expected_size} but has size"
+        f" {{actual_size}} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
+        " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
+        " know by opening an issue on https://github.com/huggingface/huggingface_hub."
+    )
+
+    # Stream file to buffer
+    with tqdm(
         unit="B",
         unit_scale=True,
         total=total,
         initial=resume_size,
         desc=displayed_name,
         disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
-    )
-    for chunk in r.iter_content(chunk_size=10 * 1024 * 1024):
-        if chunk:  # filter out keep-alive new chunks
-            progress.update(len(chunk))
-            temp_file.write(chunk)
-
-    if expected_size is not None and expected_size != temp_file.tell():
-        raise EnvironmentError(
-            f"Consistency check failed: file should be of size {expected_size} but has size"
-            f" {temp_file.tell()} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
-            " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
-            " know by opening an issue on https://github.com/huggingface/huggingface_hub."
-        )
+    ) as progress:
+        if hf_transfer and total is not None and total > 5 * DOWNLOAD_CHUNK_SIZE:
+            supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
+            if not supports_callback:
+                warnings.warn(
+                    "You are using an outdated version of `hf_transfer`. "
+                    "Consider upgrading to latest version to enable progress bars "
+                    "using `pip install -U hf_transfer`."
+                )
+            try:
+                hf_transfer.download(
+                    url=url,
+                    filename=temp_file.name,
+                    max_files=HF_TRANSFER_CONCURRENCY,
+                    chunk_size=DOWNLOAD_CHUNK_SIZE,
+                    headers=headers,
+                    parallel_failures=3,
+                    max_retries=5,
+                    **({"callback": progress.update} if supports_callback else {}),
+                )
+            except Exception as e:
+                raise RuntimeError(
+                    "An error occurred while downloading using `hf_transfer`. Consider"
+                    " disabling HF_HUB_ENABLE_HF_TRANSFER for better error handling."
+                ) from e
+            if not supports_callback:
+                progress.update(total)
+            if expected_size is not None and expected_size != os.path.getsize(temp_file.name):
+                raise EnvironmentError(
+                    consistency_error_message.format(
+                        actual_size=os.path.getsize(temp_file.name),
+                    )
+                )
+            return
+        new_resume_size = resume_size
+        try:
+            for chunk in r.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
+                if chunk:  # filter out keep-alive new chunks
+                    progress.update(len(chunk))
+                    temp_file.write(chunk)
+                    new_resume_size += len(chunk)
+                    # Some data has been downloaded from the server so we reset the number of retries.
+                    _nb_retries = 5
+        except (requests.ConnectionError, requests.ReadTimeout) as e:
+            # If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most
+            # likely a transient error (network outage?). We log a warning message and try to resume the download a
+            # few times before giving up. The retry mechanism is basic but should be enough in most cases.
+            if _nb_retries <= 0:
+                logger.warning("Error while downloading from %s: %s\nMax retries exceeded.", url, str(e))
+                raise
+            logger.warning("Error while downloading from %s: %s\nTrying to resume download...", url, str(e))
+            time.sleep(1)
+            reset_sessions()  # In case of SSLError it's best to reset the shared requests.Session objects
+            return http_get(
+                url=url,
+                temp_file=temp_file,
+                proxies=proxies,
+                resume_size=new_resume_size,
+                headers=initial_headers,
+                expected_size=expected_size,
+                _nb_retries=_nb_retries - 1,
+            )
 
-    progress.close()
+    if expected_size is not None and expected_size != temp_file.tell():
+        raise EnvironmentError(
+            consistency_error_message.format(
+                actual_size=temp_file.tell(),
            )
+        )
 
 
 @validate_hf_hub_args
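The rewritten streaming loop resumes instead of restarting: every received chunk advances `new_resume_size` and refills the retry budget, and on a transient `ConnectionError`/`ReadTimeout` the function calls itself with the advanced offset so the `Range` header skips bytes already on disk. A self-contained sketch of the same pattern (the `download` helper is illustrative, not the library's `http_get`):

```python
import time
from typing import BinaryIO

import requests


def download(url: str, out: BinaryIO, resume_size: int = 0, retries_left: int = 5) -> None:
    headers = {"Range": f"bytes={resume_size}-"} if resume_size > 0 else {}
    r = requests.get(url, headers=headers, stream=True, timeout=10)
    r.raise_for_status()
    try:
        for chunk in r.iter_content(chunk_size=10 * 1024 * 1024):
            if chunk:
                out.write(chunk)
                resume_size += len(chunk)
                retries_left = 5  # progress was made: reset the budget
    except (requests.ConnectionError, requests.ReadTimeout):
        if retries_left <= 0:
            raise
        time.sleep(1)
        # Recurse with the new offset: the Range header resumes the stream.
        download(url, out, resume_size=resume_size, retries_left=retries_left - 1)
```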
@@ -576,7 +584,7 @@ def cached_download(
     force_download: bool = False,
     force_filename: Optional[str] = None,
     proxies: Optional[Dict] = None,
-    etag_timeout: float = 10,
+    etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
     resume_download: bool = False,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
@@ -656,6 +664,10 @@ def cached_download(
 
     </Tip>
     """
+    if HF_HUB_ETAG_TIMEOUT != DEFAULT_ETAG_TIMEOUT:
+        # Respect environment variable above user value
+        etag_timeout = HF_HUB_ETAG_TIMEOUT
+
     if not legacy_cache_layout:
         warnings.warn(
             "'cached_download' is the legacy way to download files from the HF hub, please consider upgrading to"
@@ -664,7 +676,7 @@ def cached_download(
     )
 
     if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
+        cache_dir = HF_HUB_CACHE
     if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
 
@@ -995,7 +1007,6 @@ def hf_hub_download(
     subfolder: Optional[str] = None,
     repo_type: Optional[str] = None,
     revision: Optional[str] = None,
-    endpoint: Optional[str] = None,
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
@@ -1005,11 +1016,12 @@ def hf_hub_download(
     force_download: bool = False,
     force_filename: Optional[str] = None,
     proxies: Optional[Dict] = None,
-    etag_timeout: float = 10,
+    etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
     resume_download: bool = False,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
     legacy_cache_layout: bool = False,
+    endpoint: Optional[str] = None,
 ) -> str:
     """Download a given file if it's not already present in the local cache.
 
@@ -1069,9 +1081,6 @@ def hf_hub_download(
         revision (`str`, *optional*):
             An optional Git revision id which can be a branch name, a tag, or a
             commit hash.
-        endpoint (`str`, *optional*):
-            Hugging Face Hub base url. Will default to https://huggingface.co/. Otherwise, one can set the `HF_ENDPOINT`
-            environment variable.
         library_name (`str`, *optional*):
             The name of the library to which the object corresponds.
         library_version (`str`, *optional*):
@@ -1138,6 +1147,10 @@ def hf_hub_download(
 
     </Tip>
     """
+    if HF_HUB_ETAG_TIMEOUT != DEFAULT_ETAG_TIMEOUT:
+        # Respect environment variable above user value
+        etag_timeout = HF_HUB_ETAG_TIMEOUT
+
     if force_filename is not None:
         warnings.warn(
             "The `force_filename` parameter is deprecated as a new caching system, "
@@ -1173,13 +1186,14 @@ def hf_hub_download(
     )
 
     if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
+        cache_dir = HF_HUB_CACHE
     if revision is None:
         revision = DEFAULT_REVISION
     if isinstance(cache_dir, Path):
         cache_dir = str(cache_dir)
     if isinstance(local_dir, Path):
         local_dir = str(local_dir)
+    locks_dir = os.path.join(cache_dir, ".locks")
 
     if subfolder == "":
         subfolder = None
@@ -1393,7 +1407,8 @@ def hf_hub_download(
         return pointer_path
 
     # Prevent parallel downloads of the same file with a lock.
-    lock_path = blob_path + ".lock"
+    # etag could be duplicated across repos,
+    lock_path = os.path.join(locks_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type), f"{etag}.lock")
 
     # Some Windows versions do not allow for paths longer than 255 characters.
     # In this case, we must specify it is an extended path by using the "\\?\" prefix.
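Lock files move out of the blob directory into a per-repo `.locks/` tree keyed by etag; the per-repo scoping matters because the same etag can occur in several repos. A sketch of the resulting layout (repo name and etag are made up):

```python
import os

cache_dir = os.path.expanduser("~/.cache/huggingface/hub")
locks_dir = os.path.join(cache_dir, ".locks")

# e.g. ~/.cache/huggingface/hub/.locks/models--gpt2/<etag>.lock
lock_path = os.path.join(locks_dir, "models--gpt2", "0123abcd.lock")

# The parent directory must exist before FileLock can create the file,
# hence the mkdir(parents=True, exist_ok=True) added in the next hunk.
print(lock_path)
```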
@@ -1403,6 +1418,7 @@ def hf_hub_download(
     if os.name == "nt" and len(os.path.abspath(blob_path)) > 255:
         blob_path = "\\\\?\\" + os.path.abspath(blob_path)
 
+    Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
     with FileLock(lock_path):
         # If the download just completed while the lock was activated.
         if os.path.exists(pointer_path) and not force_download:
@@ -1477,11 +1493,6 @@ def hf_hub_download(
         _chmod_and_replace(temp_file.name, local_dir_filepath)
         pointer_path = local_dir_filepath  # for return value
 
-    try:
-        os.remove(lock_path)
-    except OSError:
-        pass
-
     return pointer_path
 
 
@@ -1542,7 +1553,7 @@ def try_to_load_from_cache(
     if repo_type not in REPO_TYPES:
         raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
     if cache_dir is None:
-        cache_dir = HUGGINGFACE_HUB_CACHE
+        cache_dir = HF_HUB_CACHE
 
     object_id = repo_id.replace("/", "--")
     repo_cache = os.path.join(cache_dir, f"{repo_type}s--{object_id}")
@@ -1583,7 +1594,7 @@ def get_hf_file_metadata(
     url: str,
     token: Union[bool, str, None] = None,
     proxies: Optional[Dict] = None,
-    timeout: Optional[float] = 10.0,
+    timeout: Optional[float] = DEFAULT_REQUEST_TIMEOUT,
 ) -> HfFileMetadata:
     """Fetch metadata of a file versioned on the Hub for a given url.
 
@@ -1624,12 +1635,9 @@ def get_hf_file_metadata(
     # Return
     return HfFileMetadata(
         commit_hash=r.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT),
-        etag=_normalize_etag(
-            # We favor a custom header indicating the etag of the linked resource, and
-            # we fallback to the regular etag header.
-            r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG)
-            or r.headers.get("ETag")
-        ),
+        # We favor a custom header indicating the etag of the linked resource, and
+        # we fallback to the regular etag header.
+        etag=_normalize_etag(r.headers.get(HUGGINGFACE_HEADER_X_LINKED_ETAG) or r.headers.get("ETag")),
         # Either from response headers (if redirected) or defaults to request url
         # Do not use directly `url`, as `_request_wrapper` might have followed relative
         # redirects.
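Behavior is unchanged here, only reflowed: the Hub's custom `X-Linked-Etag` header (for LFS-backed files) still wins over the standard `ETag`. A minimal sketch of the fallback, with an illustrative normalizer standing in for the library's `_normalize_etag`:

```python
from typing import Dict, Optional


def normalize_etag(etag: Optional[str]) -> Optional[str]:
    # Illustrative: drop the weak-validator prefix and surrounding quotes.
    if etag is None:
        return None
    return etag.removeprefix("W/").strip('"')


def resolve_etag(headers: Dict[str, str]) -> Optional[str]:
    # Prefer the custom header for the linked resource, fall back to ETag.
    return normalize_etag(headers.get("X-Linked-Etag") or headers.get("ETag"))


print(resolve_etag({"X-Linked-Etag": 'W/"abc123"', "ETag": '"def456"'}))  # abc123
```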