huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150) hide show
  1. huggingface_hub/__init__.py +145 -46
  2. huggingface_hub/_commit_api.py +168 -119
  3. huggingface_hub/_commit_scheduler.py +15 -15
  4. huggingface_hub/_inference_endpoints.py +15 -12
  5. huggingface_hub/_jobs_api.py +301 -0
  6. huggingface_hub/_local_folder.py +18 -3
  7. huggingface_hub/_login.py +31 -63
  8. huggingface_hub/_oauth.py +460 -0
  9. huggingface_hub/_snapshot_download.py +239 -80
  10. huggingface_hub/_space_api.py +5 -5
  11. huggingface_hub/_tensorboard_logger.py +15 -19
  12. huggingface_hub/_upload_large_folder.py +172 -76
  13. huggingface_hub/_webhooks_payload.py +3 -3
  14. huggingface_hub/_webhooks_server.py +13 -25
  15. huggingface_hub/{commands → cli}/__init__.py +1 -15
  16. huggingface_hub/cli/_cli_utils.py +173 -0
  17. huggingface_hub/cli/auth.py +147 -0
  18. huggingface_hub/cli/cache.py +841 -0
  19. huggingface_hub/cli/download.py +189 -0
  20. huggingface_hub/cli/hf.py +60 -0
  21. huggingface_hub/cli/inference_endpoints.py +377 -0
  22. huggingface_hub/cli/jobs.py +772 -0
  23. huggingface_hub/cli/lfs.py +175 -0
  24. huggingface_hub/cli/repo.py +315 -0
  25. huggingface_hub/cli/repo_files.py +94 -0
  26. huggingface_hub/{commands/env.py → cli/system.py} +10 -13
  27. huggingface_hub/cli/upload.py +294 -0
  28. huggingface_hub/cli/upload_large_folder.py +117 -0
  29. huggingface_hub/community.py +20 -12
  30. huggingface_hub/constants.py +38 -53
  31. huggingface_hub/dataclasses.py +609 -0
  32. huggingface_hub/errors.py +80 -30
  33. huggingface_hub/fastai_utils.py +30 -41
  34. huggingface_hub/file_download.py +435 -351
  35. huggingface_hub/hf_api.py +2050 -1124
  36. huggingface_hub/hf_file_system.py +269 -152
  37. huggingface_hub/hub_mixin.py +43 -63
  38. huggingface_hub/inference/_client.py +347 -434
  39. huggingface_hub/inference/_common.py +133 -121
  40. huggingface_hub/inference/_generated/_async_client.py +397 -541
  41. huggingface_hub/inference/_generated/types/__init__.py +5 -1
  42. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
  43. huggingface_hub/inference/_generated/types/base.py +10 -7
  44. huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
  45. huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
  46. huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
  47. huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
  48. huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
  49. huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
  50. huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
  51. huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
  52. huggingface_hub/inference/_generated/types/summarization.py +2 -2
  53. huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
  54. huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
  55. huggingface_hub/inference/_generated/types/text_generation.py +10 -10
  56. huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
  57. huggingface_hub/inference/_generated/types/token_classification.py +2 -2
  58. huggingface_hub/inference/_generated/types/translation.py +2 -2
  59. huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
  60. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
  61. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
  62. huggingface_hub/inference/_mcp/__init__.py +0 -0
  63. huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
  64. huggingface_hub/inference/_mcp/agent.py +100 -0
  65. huggingface_hub/inference/_mcp/cli.py +247 -0
  66. huggingface_hub/inference/_mcp/constants.py +81 -0
  67. huggingface_hub/inference/_mcp/mcp_client.py +395 -0
  68. huggingface_hub/inference/_mcp/types.py +45 -0
  69. huggingface_hub/inference/_mcp/utils.py +128 -0
  70. huggingface_hub/inference/_providers/__init__.py +82 -7
  71. huggingface_hub/inference/_providers/_common.py +129 -27
  72. huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
  73. huggingface_hub/inference/_providers/cerebras.py +1 -1
  74. huggingface_hub/inference/_providers/clarifai.py +13 -0
  75. huggingface_hub/inference/_providers/cohere.py +20 -3
  76. huggingface_hub/inference/_providers/fal_ai.py +183 -56
  77. huggingface_hub/inference/_providers/featherless_ai.py +38 -0
  78. huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
  79. huggingface_hub/inference/_providers/groq.py +9 -0
  80. huggingface_hub/inference/_providers/hf_inference.py +69 -30
  81. huggingface_hub/inference/_providers/hyperbolic.py +4 -4
  82. huggingface_hub/inference/_providers/nebius.py +33 -5
  83. huggingface_hub/inference/_providers/novita.py +5 -5
  84. huggingface_hub/inference/_providers/nscale.py +44 -0
  85. huggingface_hub/inference/_providers/openai.py +3 -1
  86. huggingface_hub/inference/_providers/publicai.py +6 -0
  87. huggingface_hub/inference/_providers/replicate.py +31 -13
  88. huggingface_hub/inference/_providers/sambanova.py +18 -4
  89. huggingface_hub/inference/_providers/scaleway.py +28 -0
  90. huggingface_hub/inference/_providers/together.py +20 -5
  91. huggingface_hub/inference/_providers/wavespeed.py +138 -0
  92. huggingface_hub/inference/_providers/zai_org.py +17 -0
  93. huggingface_hub/lfs.py +33 -100
  94. huggingface_hub/repocard.py +34 -38
  95. huggingface_hub/repocard_data.py +57 -57
  96. huggingface_hub/serialization/__init__.py +0 -1
  97. huggingface_hub/serialization/_base.py +12 -15
  98. huggingface_hub/serialization/_dduf.py +8 -8
  99. huggingface_hub/serialization/_torch.py +69 -69
  100. huggingface_hub/utils/__init__.py +19 -8
  101. huggingface_hub/utils/_auth.py +7 -7
  102. huggingface_hub/utils/_cache_manager.py +92 -147
  103. huggingface_hub/utils/_chunk_utils.py +2 -3
  104. huggingface_hub/utils/_deprecation.py +1 -1
  105. huggingface_hub/utils/_dotenv.py +55 -0
  106. huggingface_hub/utils/_experimental.py +7 -5
  107. huggingface_hub/utils/_fixes.py +0 -10
  108. huggingface_hub/utils/_git_credential.py +5 -5
  109. huggingface_hub/utils/_headers.py +8 -30
  110. huggingface_hub/utils/_http.py +398 -239
  111. huggingface_hub/utils/_pagination.py +4 -4
  112. huggingface_hub/utils/_parsing.py +98 -0
  113. huggingface_hub/utils/_paths.py +5 -5
  114. huggingface_hub/utils/_runtime.py +61 -24
  115. huggingface_hub/utils/_safetensors.py +21 -21
  116. huggingface_hub/utils/_subprocess.py +9 -9
  117. huggingface_hub/utils/_telemetry.py +4 -4
  118. huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
  119. huggingface_hub/utils/_typing.py +25 -5
  120. huggingface_hub/utils/_validators.py +55 -74
  121. huggingface_hub/utils/_verification.py +167 -0
  122. huggingface_hub/utils/_xet.py +64 -17
  123. huggingface_hub/utils/_xet_progress_reporting.py +162 -0
  124. huggingface_hub/utils/insecure_hashlib.py +3 -5
  125. huggingface_hub/utils/logging.py +8 -11
  126. huggingface_hub/utils/tqdm.py +5 -4
  127. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
  128. huggingface_hub-1.1.3.dist-info/RECORD +155 -0
  129. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
  130. huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
  131. huggingface_hub/commands/delete_cache.py +0 -474
  132. huggingface_hub/commands/download.py +0 -200
  133. huggingface_hub/commands/huggingface_cli.py +0 -61
  134. huggingface_hub/commands/lfs.py +0 -200
  135. huggingface_hub/commands/repo_files.py +0 -128
  136. huggingface_hub/commands/scan_cache.py +0 -181
  137. huggingface_hub/commands/tag.py +0 -159
  138. huggingface_hub/commands/upload.py +0 -314
  139. huggingface_hub/commands/upload_large_folder.py +0 -129
  140. huggingface_hub/commands/user.py +0 -304
  141. huggingface_hub/commands/version.py +0 -37
  142. huggingface_hub/inference_api.py +0 -217
  143. huggingface_hub/keras_mixin.py +0 -500
  144. huggingface_hub/repository.py +0 -1477
  145. huggingface_hub/serialization/_tensorflow.py +0 -95
  146. huggingface_hub/utils/_hf_folder.py +0 -68
  147. huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
  148. huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
  149. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
  150. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
@@ -19,7 +19,7 @@ import re
19
19
  import warnings
20
20
  from functools import wraps
21
21
  from itertools import chain
22
- from typing import Any, Dict
22
+ from typing import Any
23
23
 
24
24
  from huggingface_hub.errors import HFValidationError
25
25
 
@@ -48,9 +48,7 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
48
48
  Validators:
49
49
  - [`~utils.validate_repo_id`]: `repo_id` must be `"repo_name"`
50
50
  or `"namespace/repo_name"`. Namespace is a username or an organization.
51
- - [`~utils.smoothly_deprecate_use_auth_token`]: Use `token` instead of
52
- `use_auth_token` (only if `use_auth_token` is not expected by the decorated
53
- function - in practice, always the case in `huggingface_hub`).
51
+ - [`~utils.smoothly_deprecate_legacy_arguments`]: Ignore `proxies` when downloading files (should be set globally).
54
52
 
55
53
  Example:
56
54
  ```py
@@ -68,20 +66,6 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
68
66
 
69
67
  >>> my_cool_method(repo_id="other..repo..id")
70
68
  huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
71
-
72
- >>> @validate_hf_hub_args
73
- ... def my_cool_auth_method(token: str):
74
- ... print(token)
75
-
76
- >>> my_cool_auth_method(token="a token")
77
- "a token"
78
-
79
- >>> my_cool_auth_method(use_auth_token="a use_auth_token")
80
- "a use_auth_token"
81
-
82
- >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token")
83
- UserWarning: Both `token` and `use_auth_token` are passed (...)
84
- "a token"
85
69
  ```
86
70
 
87
71
  Raises:
@@ -91,13 +75,8 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
91
75
  # TODO: add an argument to opt-out validation for specific argument?
92
76
  signature = inspect.signature(fn)
93
77
 
94
- # Should the validator switch `use_auth_token` values to `token`? In practice, always
95
- # True in `huggingface_hub`. Might not be the case in a downstream library.
96
- check_use_auth_token = "use_auth_token" not in signature.parameters and "token" in signature.parameters
97
-
98
78
  @wraps(fn)
99
79
  def _inner_fn(*args, **kwargs):
100
- has_token = False
101
80
  for arg_name, arg_value in chain(
102
81
  zip(signature.parameters, args), # Args values
103
82
  kwargs.items(), # Kwargs values
@@ -105,11 +84,7 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
105
84
  if arg_name in ["repo_id", "from_id", "to_id"]:
106
85
  validate_repo_id(arg_value)
107
86
 
108
- elif arg_name == "token" and arg_value is not None:
109
- has_token = True
110
-
111
- if check_use_auth_token:
112
- kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
87
+ kwargs = smoothly_deprecate_legacy_arguments(fn_name=fn.__name__, kwargs=kwargs)
113
88
 
114
89
  return fn(*args, **kwargs)
115
90
 
@@ -158,8 +133,8 @@ def validate_repo_id(repo_id: str) -> None:
158
133
 
159
134
  if not REPO_ID_REGEX.match(repo_id):
160
135
  raise HFValidationError(
161
- "Repo id must use alphanumeric chars or '-', '_', '.', '--' and '..' are"
162
- " forbidden, '-' and '.' cannot start or end the name, max length is 96:"
136
+ "Repo id must use alphanumeric chars, '-', '_' or '.'."
137
+ " The name cannot start or end with '-' or '.' and the maximum length is 96:"
163
138
  f" '{repo_id}'."
164
139
  )
165
140
 
@@ -170,57 +145,63 @@ def validate_repo_id(repo_id: str) -> None:
170
145
  raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.")
171
146
 
172
147
 
173
- def smoothly_deprecate_use_auth_token(fn_name: str, has_token: bool, kwargs: Dict[str, Any]) -> Dict[str, Any]:
174
- """Smoothly deprecate `use_auth_token` in the `huggingface_hub` codebase.
175
-
176
- The long-term goal is to remove any mention of `use_auth_token` in the codebase in
177
- favor of a unique and less verbose `token` argument. This will be done a few steps:
148
+ def smoothly_deprecate_legacy_arguments(fn_name: str, kwargs: dict[str, Any]) -> dict[str, Any]:
149
+ """Smoothly deprecate legacy arguments in the `huggingface_hub` codebase.
178
150
 
179
- 0. Step 0: methods that require a read-access to the Hub use the `use_auth_token`
180
- argument (`str`, `bool` or `None`). Methods requiring write-access have a `token`
181
- argument (`str`, `None`). This implicit rule exists to be able to not send the
182
- token when not necessary (`use_auth_token=False`) even if logged in.
151
+ This function ignores some deprecated arguments from the kwargs and warns the user they are ignored.
152
+ The goal is to avoid breaking existing code while guiding the user to the new way of doing things.
183
153
 
184
- 1. Step 1: we want to harmonize everything and use `token` everywhere (supporting
185
- `token=False` for read-only methods). In order not to break existing code, if
186
- `use_auth_token` is passed to a function, the `use_auth_token` value is passed
187
- as `token` instead, without any warning.
188
- a. Corner case: if both `use_auth_token` and `token` values are passed, a warning
189
- is thrown and the `use_auth_token` value is ignored.
154
+ List of deprecated arguments:
155
+ - `proxies`:
156
+ To set up proxies, user must either use the HTTP_PROXY environment variable or configure the `httpx.Client`
157
+ manually using the [`set_client_factory`] function.
190
158
 
191
- 2. Step 2: Once it is release, we should push downstream libraries to switch from
192
- `use_auth_token` to `token` as much as possible, but without throwing a warning
193
- (e.g. manually create issues on the corresponding repos).
159
+ In huggingface_hub 0.x, `proxies` was a dictionary directly passed to `requests.request`.
160
+ In huggingface_hub 1.x, we migrated to `httpx` which does not support `proxies` the same way.
161
+ In particular, it is not possible to configure proxies on a per-request basis. The solution is to configure
162
+ it globally using the [`set_client_factory`] function or using the HTTP_PROXY environment variable.
194
163
 
195
- 3. Step 3: After a transitional period (6 months e.g. until April 2023?), we update
196
- `huggingface_hub` to throw a warning on `use_auth_token`. Hopefully, very few
197
- users will be impacted as it would have already been fixed.
198
- In addition, unit tests in `huggingface_hub` must be adapted to expect warnings
199
- to be thrown (but still use `use_auth_token` as before).
164
+ For more details, see:
165
+ - https://www.python-httpx.org/advanced/proxies/
166
+ - https://www.python-httpx.org/compatibility/#proxy-keys.
200
167
 
201
- 4. Step 4: After a normal deprecation cycle (3 releases ?), remove this validator.
202
- `use_auth_token` will definitely not be supported.
203
- In addition, we update unit tests in `huggingface_hub` to use `token` everywhere.
204
-
205
- This has been discussed in:
206
- - https://github.com/huggingface/huggingface_hub/issues/1094.
207
- - https://github.com/huggingface/huggingface_hub/pull/928
208
- - (related) https://github.com/huggingface/huggingface_hub/pull/1064
168
+ - `resume_download`: deprecated without replacement. `huggingface_hub` always resumes downloads whenever possible.
169
+ - `force_filename`: deprecated without replacement. Filename is always the same as on the Hub.
170
+ - `local_dir_use_symlinks`: deprecated without replacement. Downloading to a local directory does not use symlinks anymore.
209
171
  """
210
172
  new_kwargs = kwargs.copy() # do not mutate input !
211
173
 
212
- use_auth_token = new_kwargs.pop("use_auth_token", None) # remove from kwargs
213
- if use_auth_token is not None:
214
- if has_token:
215
- warnings.warn(
216
- "Both `token` and `use_auth_token` are passed to"
217
- f" `{fn_name}` with non-None values. `token` is now the"
218
- " preferred argument to pass a User Access Token."
219
- " `use_auth_token` value will be ignored."
220
- )
221
- else:
222
- # `token` argument is not passed and a non-None value is passed in
223
- # `use_auth_token` => use `use_auth_token` value as `token` kwarg.
224
- new_kwargs["token"] = use_auth_token
174
+ # proxies
175
+ proxies = new_kwargs.pop("proxies", None) # remove from kwargs
176
+ if proxies is not None:
177
+ warnings.warn(
178
+ f"The `proxies` argument is ignored in `{fn_name}`. To set up proxies, use the HTTP_PROXY / HTTPS_PROXY"
179
+ " environment variables or configure the `httpx.Client` manually using `huggingface_hub.set_client_factory`."
180
+ " See https://www.python-httpx.org/advanced/proxies/ for more details."
181
+ )
182
+
183
+ # resume_download
184
+ resume_download = new_kwargs.pop("resume_download", None) # remove from kwargs
185
+ if resume_download is not None:
186
+ warnings.warn(
187
+ f"The `resume_download` argument is deprecated and ignored in `{fn_name}`. Downloads always resume"
188
+ " whenever possible."
189
+ )
190
+
191
+ # force_filename
192
+ force_filename = new_kwargs.pop("force_filename", None) # remove from kwargs
193
+ if force_filename is not None:
194
+ warnings.warn(
195
+ f"The `force_filename` argument is deprecated and ignored in `{fn_name}`. Filename is always the same "
196
+ "as on the Hub."
197
+ )
198
+
199
+ # local_dir_use_symlinks
200
+ local_dir_use_symlinks = new_kwargs.pop("local_dir_use_symlinks", None) # remove from kwargs
201
+ if local_dir_use_symlinks is not None:
202
+ warnings.warn(
203
+ f"The `local_dir_use_symlinks` argument is deprecated and ignored in `{fn_name}`. Downloading to a local"
204
+ " directory does not use symlinks anymore."
205
+ )
225
206
 
226
207
  return new_kwargs
@@ -0,0 +1,167 @@
1
+ import re
2
+ from dataclasses import dataclass
3
+ from pathlib import Path
4
+ from typing import TYPE_CHECKING, Literal, Optional, TypedDict, Union
5
+
6
+ from .. import constants
7
+ from ..file_download import repo_folder_name
8
+ from .sha import git_hash, sha_fileobj
9
+
10
+
11
+ if TYPE_CHECKING:
12
+ from ..hf_api import RepoFile, RepoFolder
13
+
14
+ # using fullmatch for clarity and strictness
15
+ _REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
16
+
17
+
18
+ # Typed structure describing a checksum mismatch
19
+ class Mismatch(TypedDict):
20
+ path: str
21
+ expected: str
22
+ actual: str
23
+ algorithm: str
24
+
25
+
26
+ HashAlgo = Literal["sha256", "git-sha1"]
27
+
28
+
29
+ @dataclass(frozen=True)
30
+ class FolderVerification:
31
+ revision: str
32
+ checked_count: int
33
+ mismatches: list[Mismatch]
34
+ missing_paths: list[str]
35
+ extra_paths: list[str]
36
+ verified_path: Path
37
+
38
+
39
+ def collect_local_files(root: Path) -> dict[str, Path]:
40
+ """
41
+ Return a mapping of repo-relative path -> absolute path for all files under `root`.
42
+ """
43
+ return {p.relative_to(root).as_posix(): p for p in root.rglob("*") if p.is_file()}
44
+
45
+
46
+ def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str]) -> str:
47
+ """
48
+ Resolve a commit hash from a cache repo folder and an optional revision.
49
+ """
50
+ if revision and _REGEX_COMMIT_HASH.fullmatch(revision):
51
+ return revision
52
+
53
+ refs_dir = storage_folder / "refs"
54
+ snapshots_dir = storage_folder / "snapshots"
55
+
56
+ if revision:
57
+ ref_path = refs_dir / revision
58
+ if ref_path.is_file():
59
+ return ref_path.read_text(encoding="utf-8").strip()
60
+ raise ValueError(f"Revision '{revision}' could not be resolved in cache (expected file '{ref_path}').")
61
+
62
+ # No revision provided: try common defaults
63
+ main_ref = refs_dir / "main"
64
+ if main_ref.is_file():
65
+ return main_ref.read_text(encoding="utf-8").strip()
66
+
67
+ if not snapshots_dir.is_dir():
68
+ raise ValueError(f"Cache repo is missing snapshots directory: {snapshots_dir}. Provide --revision explicitly.")
69
+
70
+ candidates = [p.name for p in snapshots_dir.iterdir() if p.is_dir() and _REGEX_COMMIT_HASH.fullmatch(p.name)]
71
+ if len(candidates) == 1:
72
+ return candidates[0]
73
+
74
+ raise ValueError(
75
+ "Ambiguous cached revision: multiple snapshots found and no refs to disambiguate. Please pass --revision."
76
+ )
77
+
78
+
79
+ def compute_file_hash(path: Path, algorithm: HashAlgo) -> str:
80
+ """
81
+ Compute the checksum of a local file using the requested algorithm.
82
+ """
83
+
84
+ with path.open("rb") as stream:
85
+ if algorithm == "sha256":
86
+ return sha_fileobj(stream).hex()
87
+ if algorithm == "git-sha1":
88
+ return git_hash(stream.read())
89
+ raise ValueError(f"Unsupported hash algorithm: {algorithm}")
90
+
91
+
92
+ def verify_maps(
93
+ *,
94
+ remote_by_path: dict[str, Union["RepoFile", "RepoFolder"]],
95
+ local_by_path: dict[str, Path],
96
+ revision: str,
97
+ verified_path: Path,
98
+ ) -> FolderVerification:
99
+ """Compare remote entries and local files and return a verification result."""
100
+ remote_paths = set(remote_by_path)
101
+ local_paths = set(local_by_path)
102
+
103
+ missing = sorted(remote_paths - local_paths)
104
+ extra = sorted(local_paths - remote_paths)
105
+ both = sorted(remote_paths & local_paths)
106
+
107
+ mismatches: list[Mismatch] = []
108
+
109
+ for rel_path in both:
110
+ remote_entry = remote_by_path[rel_path]
111
+ local_path = local_by_path[rel_path]
112
+
113
+ lfs = getattr(remote_entry, "lfs", None)
114
+ lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None
115
+ if lfs_sha is None and isinstance(lfs, dict):
116
+ lfs_sha = lfs.get("sha256")
117
+ if lfs_sha:
118
+ algorithm: HashAlgo = "sha256"
119
+ expected = str(lfs_sha).lower()
120
+ else:
121
+ blob_id = remote_entry.blob_id # type: ignore
122
+ algorithm = "git-sha1"
123
+ expected = str(blob_id).lower()
124
+
125
+ actual = compute_file_hash(local_path, algorithm)
126
+
127
+ if actual != expected:
128
+ mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm))
129
+
130
+ return FolderVerification(
131
+ revision=revision,
132
+ checked_count=len(both),
133
+ mismatches=mismatches,
134
+ missing_paths=missing,
135
+ extra_paths=extra,
136
+ verified_path=verified_path,
137
+ )
138
+
139
+
140
+ def resolve_local_root(
141
+ *,
142
+ repo_id: str,
143
+ repo_type: str,
144
+ revision: Optional[str],
145
+ cache_dir: Optional[Path],
146
+ local_dir: Optional[Path],
147
+ ) -> tuple[Path, str]:
148
+ """
149
+ Resolve the root directory to scan locally and the remote revision to verify.
150
+ """
151
+ if local_dir is not None:
152
+ root = Path(local_dir).expanduser().resolve()
153
+ if not root.is_dir():
154
+ raise ValueError(f"Local directory does not exist or is not a directory: {root}")
155
+ return root, (revision or constants.DEFAULT_REVISION)
156
+
157
+ cache_root = Path(cache_dir or constants.HF_HUB_CACHE).expanduser().resolve()
158
+ storage_folder = cache_root / repo_folder_name(repo_id=repo_id, repo_type=repo_type)
159
+ if not storage_folder.exists():
160
+ raise ValueError(
161
+ f"Repo is not present in cache: {storage_folder}. Use 'hf download' first or pass --local-dir."
162
+ )
163
+ commit = _resolve_commit_hash_from_cache(storage_folder, revision)
164
+ snapshot_dir = storage_folder / "snapshots" / commit
165
+ if not snapshot_dir.is_dir():
166
+ raise ValueError(f"Snapshot directory does not exist for revision '{commit}': {snapshot_dir}.")
167
+ return snapshot_dir, commit
@@ -1,13 +1,19 @@
1
+ import time
1
2
  from dataclasses import dataclass
2
3
  from enum import Enum
3
- from typing import Dict, Optional
4
+ from typing import Optional
4
5
 
5
- import requests
6
+ import httpx
6
7
 
7
8
  from .. import constants
8
9
  from . import get_session, hf_raise_for_status, validate_hf_hub_args
9
10
 
10
11
 
12
+ XET_CONNECTION_INFO_SAFETY_PERIOD = 60 # seconds
13
+ XET_CONNECTION_INFO_CACHE_SIZE = 1_000
14
+ XET_CONNECTION_INFO_CACHE: dict[str, "XetConnectionInfo"] = {}
15
+
16
+
11
17
  class XetTokenType(str, Enum):
12
18
  READ = "read"
13
19
  WRITE = "write"
@@ -26,7 +32,9 @@ class XetConnectionInfo:
26
32
  endpoint: str
27
33
 
28
34
 
29
- def parse_xet_file_data_from_response(response: requests.Response) -> Optional[XetFileData]:
35
+ def parse_xet_file_data_from_response(
36
+ response: httpx.Response, endpoint: Optional[str] = None
37
+ ) -> Optional[XetFileData]:
30
38
  """
31
39
  Parse XET file metadata from an HTTP response.
32
40
 
@@ -34,7 +42,7 @@ def parse_xet_file_data_from_response(response: requests.Response) -> Optional[X
34
42
  of a given response object. If the required metadata is not found, it returns `None`.
35
43
 
36
44
  Args:
37
- response (`requests.Response`):
45
+ response (`httpx.Response`):
38
46
  The HTTP response object containing headers dict and links dict to extract the XET metadata from.
39
47
  Returns:
40
48
  `Optional[XetFileData]`:
@@ -52,18 +60,20 @@ def parse_xet_file_data_from_response(response: requests.Response) -> Optional[X
52
60
  refresh_route = response.headers[constants.HUGGINGFACE_HEADER_X_XET_REFRESH_ROUTE]
53
61
  except KeyError:
54
62
  return None
55
-
63
+ endpoint = endpoint if endpoint is not None else constants.ENDPOINT
64
+ if refresh_route.startswith(constants.HUGGINGFACE_CO_URL_HOME):
65
+ refresh_route = refresh_route.replace(constants.HUGGINGFACE_CO_URL_HOME.rstrip("/"), endpoint.rstrip("/"))
56
66
  return XetFileData(
57
67
  file_hash=file_hash,
58
68
  refresh_route=refresh_route,
59
69
  )
60
70
 
61
71
 
62
- def parse_xet_connection_info_from_headers(headers: Dict[str, str]) -> Optional[XetConnectionInfo]:
72
+ def parse_xet_connection_info_from_headers(headers: dict[str, str]) -> Optional[XetConnectionInfo]:
63
73
  """
64
74
  Parse XET connection info from the HTTP headers or return None if not found.
65
75
  Args:
66
- headers (`Dict`):
76
+ headers (`dict`):
67
77
  HTTP headers to extract the XET metadata from.
68
78
  Returns:
69
79
  `XetConnectionInfo` or `None`:
@@ -88,7 +98,7 @@ def parse_xet_connection_info_from_headers(headers: Dict[str, str]) -> Optional[
88
98
  def refresh_xet_connection_info(
89
99
  *,
90
100
  file_data: XetFileData,
91
- headers: Dict[str, str],
101
+ headers: dict[str, str],
92
102
  ) -> XetConnectionInfo:
93
103
  """
94
104
  Utilizes the information in the parsed metadata to request the Hub xet connection information.
@@ -96,7 +106,7 @@ def refresh_xet_connection_info(
96
106
  Args:
97
107
  file_data: (`XetFileData`):
98
108
  The file data needed to refresh the xet connection information.
99
- headers (`Dict[str, str]`):
109
+ headers (`dict[str, str]`):
100
110
  Headers to use for the request, including authorization headers and user agent.
101
111
  Returns:
102
112
  `XetConnectionInfo`:
@@ -119,9 +129,9 @@ def fetch_xet_connection_info_from_repo_info(
119
129
  repo_id: str,
120
130
  repo_type: str,
121
131
  revision: Optional[str] = None,
122
- headers: Dict[str, str],
132
+ headers: dict[str, str],
123
133
  endpoint: Optional[str] = None,
124
- params: Optional[Dict[str, str]] = None,
134
+ params: Optional[dict[str, str]] = None,
125
135
  ) -> XetConnectionInfo:
126
136
  """
127
137
  Uses the repo info to request a xet access token from Hub.
@@ -134,11 +144,11 @@ def fetch_xet_connection_info_from_repo_info(
134
144
  Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
135
145
  revision (`str`, `optional`):
136
146
  The revision of the repo to get the token for.
137
- headers (`Dict[str, str]`):
147
+ headers (`dict[str, str]`):
138
148
  Headers to use for the request, including authorization headers and user agent.
139
149
  endpoint (`str`, `optional`):
140
150
  The endpoint to use for the request. Defaults to the Hub endpoint.
141
- params (`Dict[str, str]`, `optional`):
151
+ params (`dict[str, str]`, `optional`):
142
152
  Additional parameters to pass with the request.
143
153
  Returns:
144
154
  `XetConnectionInfo`:
@@ -157,18 +167,21 @@ def fetch_xet_connection_info_from_repo_info(
157
167
  @validate_hf_hub_args
158
168
  def _fetch_xet_connection_info_with_url(
159
169
  url: str,
160
- headers: Dict[str, str],
161
- params: Optional[Dict[str, str]] = None,
170
+ headers: dict[str, str],
171
+ params: Optional[dict[str, str]] = None,
162
172
  ) -> XetConnectionInfo:
163
173
  """
164
174
  Requests the xet connection info from the supplied URL. This includes the
165
175
  access token, expiration time, and endpoint to use for the xet storage service.
176
+
177
+ Result is cached to avoid redundant requests.
178
+
166
179
  Args:
167
180
  url: (`str`):
168
181
  The access token endpoint URL.
169
- headers (`Dict[str, str]`):
182
+ headers (`dict[str, str]`):
170
183
  Headers to use for the request, including authorization headers and user agent.
171
- params (`Dict[str, str]`, `optional`):
184
+ params (`dict[str, str]`, `optional`):
172
185
  Additional parameters to pass with the request.
173
186
  Returns:
174
187
  `XetConnectionInfo`:
@@ -179,10 +192,44 @@ def _fetch_xet_connection_info_with_url(
179
192
  [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
180
193
  If the Hub API response is improperly formatted.
181
194
  """
195
+ # Check cache first
196
+ cache_key = _cache_key(url, headers, params)
197
+ cached_info = XET_CONNECTION_INFO_CACHE.get(cache_key)
198
+ if cached_info is not None:
199
+ if not _is_expired(cached_info):
200
+ return cached_info
201
+
202
+ # Fetch from server
182
203
  resp = get_session().get(headers=headers, url=url, params=params)
183
204
  hf_raise_for_status(resp)
184
205
 
185
206
  metadata = parse_xet_connection_info_from_headers(resp.headers) # type: ignore
186
207
  if metadata is None:
187
208
  raise ValueError("Xet headers have not been correctly set by the server.")
209
+
210
+ # Delete expired cache entries
211
+ for k, v in list(XET_CONNECTION_INFO_CACHE.items()):
212
+ if _is_expired(v):
213
+ XET_CONNECTION_INFO_CACHE.pop(k, None)
214
+
215
+ # Enforce cache size limit
216
+ if len(XET_CONNECTION_INFO_CACHE) >= XET_CONNECTION_INFO_CACHE_SIZE:
217
+ XET_CONNECTION_INFO_CACHE.pop(next(iter(XET_CONNECTION_INFO_CACHE)))
218
+
219
+ # Update cache
220
+ XET_CONNECTION_INFO_CACHE[cache_key] = metadata
221
+
188
222
  return metadata
223
+
224
+
225
+ def _cache_key(url: str, headers: dict[str, str], params: Optional[dict[str, str]]) -> str:
226
+ """Return a unique cache key for the given request parameters."""
227
+ lower_headers = {k.lower(): v for k, v in headers.items()} # casing is not guaranteed here
228
+ auth_header = lower_headers.get("authorization", "")
229
+ params_str = "&".join(f"{k}={v}" for k, v in sorted((params or {}).items(), key=lambda x: x[0]))
230
+ return f"{url}|{auth_header}|{params_str}"
231
+
232
+
233
+ def _is_expired(connection_info: XetConnectionInfo) -> bool:
234
+ """Check if the given XET connection info is expired."""
235
+ return connection_info.expiration_unix_epoch <= int(time.time()) + XET_CONNECTION_INFO_SAFETY_PERIOD