huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +145 -46
- huggingface_hub/_commit_api.py +168 -119
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +15 -12
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +239 -80
- huggingface_hub/_space_api.py +5 -5
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +172 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +13 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +38 -53
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +80 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +435 -351
- huggingface_hub/hf_api.py +2050 -1124
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +43 -63
- huggingface_hub/inference/_client.py +347 -434
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +397 -541
- huggingface_hub/inference/_generated/types/__init__.py +5 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +10 -10
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +82 -7
- huggingface_hub/inference/_providers/_common.py +129 -27
- huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
- huggingface_hub/inference/_providers/cerebras.py +1 -1
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +20 -3
- huggingface_hub/inference/_providers/fal_ai.py +183 -56
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +69 -30
- huggingface_hub/inference/_providers/hyperbolic.py +4 -4
- huggingface_hub/inference/_providers/nebius.py +33 -5
- huggingface_hub/inference/_providers/novita.py +5 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +3 -1
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +31 -13
- huggingface_hub/inference/_providers/sambanova.py +18 -4
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +20 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +57 -57
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +19 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +398 -239
- huggingface_hub/utils/_pagination.py +4 -4
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +61 -24
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +9 -9
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +64 -17
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +5 -4
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -474
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -314
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
- huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
huggingface_hub/utils/_validators.py
CHANGED

@@ -19,7 +19,7 @@ import re
 import warnings
 from functools import wraps
 from itertools import chain
-from typing import Any
+from typing import Any

 from huggingface_hub.errors import HFValidationError

@@ -48,9 +48,7 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
     Validators:
         - [`~utils.validate_repo_id`]: `repo_id` must be `"repo_name"`
          or `"namespace/repo_name"`. Namespace is a username or an organization.
-        - [`~utils.
-          `use_auth_token` (only if `use_auth_token` is not expected by the decorated
-          function - in practice, always the case in `huggingface_hub`).
+        - [`~utils.smoothly_deprecate_legacy_arguments`]: Ignore `proxies` when downloading files (should be set globally).

     Example:
     ```py
@@ -68,20 +66,6 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:

     >>> my_cool_method(repo_id="other..repo..id")
     huggingface_hub.utils._validators.HFValidationError: Cannot have -- or .. in repo_id: 'other..repo..id'.
-
-    >>> @validate_hf_hub_args
-    ... def my_cool_auth_method(token: str):
-    ...     print(token)
-
-    >>> my_cool_auth_method(token="a token")
-    "a token"
-
-    >>> my_cool_auth_method(use_auth_token="a use_auth_token")
-    "a use_auth_token"
-
-    >>> my_cool_auth_method(token="a token", use_auth_token="a use_auth_token")
-    UserWarning: Both `token` and `use_auth_token` are passed (...)
-    "a token"
     ```

     Raises:
@@ -91,13 +75,8 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
     # TODO: add an argument to opt-out validation for specific argument?
     signature = inspect.signature(fn)

-    # Should the validator switch `use_auth_token` values to `token`? In practice, always
-    # True in `huggingface_hub`. Might not be the case in a downstream library.
-    check_use_auth_token = "use_auth_token" not in signature.parameters and "token" in signature.parameters
-
     @wraps(fn)
     def _inner_fn(*args, **kwargs):
-        has_token = False
         for arg_name, arg_value in chain(
             zip(signature.parameters, args),  # Args values
             kwargs.items(),  # Kwargs values
@@ -105,11 +84,7 @@ def validate_hf_hub_args(fn: CallableT) -> CallableT:
             if arg_name in ["repo_id", "from_id", "to_id"]:
                 validate_repo_id(arg_value)
-
-                has_token = True
-
-        if check_use_auth_token:
-            kwargs = smoothly_deprecate_use_auth_token(fn_name=fn.__name__, has_token=has_token, kwargs=kwargs)
+        kwargs = smoothly_deprecate_legacy_arguments(fn_name=fn.__name__, kwargs=kwargs)

         return fn(*args, **kwargs)

@@ -158,8 +133,8 @@ def validate_repo_id(repo_id: str) -> None:

     if not REPO_ID_REGEX.match(repo_id):
         raise HFValidationError(
-            "Repo id must use alphanumeric chars
-            "
+            "Repo id must use alphanumeric chars, '-', '_' or '.'."
+            " The name cannot start or end with '-' or '.' and the maximum length is 96:"
             f" '{repo_id}'."
         )

@@ -170,57 +145,63 @@ def validate_repo_id(repo_id: str) -> None:
         raise HFValidationError(f"Repo_id cannot end by '.git': '{repo_id}'.")


-def
-    """Smoothly deprecate
-
-    The long-term goal is to remove any mention of `use_auth_token` in the codebase in
-    favor of a unique and less verbose `token` argument. This will be done a few steps:
+def smoothly_deprecate_legacy_arguments(fn_name: str, kwargs: dict[str, Any]) -> dict[str, Any]:
+    """Smoothly deprecate legacy arguments in the `huggingface_hub` codebase.

-    argument (`str`, `None`). This implicit rule exists to be able to not send the
-    token when not necessary (`use_auth_token=False`) even if logged in.
+    This function ignores some deprecated arguments from the kwargs and warns the user they are ignored.
+    The goal is to avoid breaking existing code while guiding the user to the new way of doing things.

-    a. Corner case: if both `use_auth_token` and `token` values are passed, a warning
-       is thrown and the `use_auth_token` value is ignored.
+    List of deprecated arguments:
+    - `proxies`:
+        To set up proxies, user must either use the HTTP_PROXY environment variable or configure the `httpx.Client`
+        manually using the [`set_client_factory`] function.

+        In huggingface_hub 0.x, `proxies` was a dictionary directly passed to `requests.request`.
+        In huggingface_hub 1.x, we migrated to `httpx` which does not support `proxies` the same way.
+        In particular, it is not possible to configure proxies on a per-request basis. The solution is to configure
+        it globally using the [`set_client_factory`] function or using the HTTP_PROXY environment variable.

-    In addition, unit tests in `huggingface_hub` must be adapted to expect warnings
-    to be thrown (but still use `use_auth_token` as before).
+        For more details, see:
+        - https://www.python-httpx.org/advanced/proxies/
+        - https://www.python-httpx.org/compatibility/#proxy-keys.

-    This has been discussed in:
-    - https://github.com/huggingface/huggingface_hub/issues/1094.
-    - https://github.com/huggingface/huggingface_hub/pull/928
-    - (related) https://github.com/huggingface/huggingface_hub/pull/1064
+    - `resume_download`: deprecated without replacement. `huggingface_hub` always resumes downloads whenever possible.
+    - `force_filename`: deprecated without replacement. Filename is always the same as on the Hub.
+    - `local_dir_use_symlinks`: deprecated without replacement. Downloading to a local directory does not use symlinks anymore.
     """
     new_kwargs = kwargs.copy()  # do not mutate input !

+    # proxies
+    proxies = new_kwargs.pop("proxies", None)  # remove from kwargs
+    if proxies is not None:
+        warnings.warn(
+            f"The `proxies` argument is ignored in `{fn_name}`. To set up proxies, use the HTTP_PROXY / HTTPS_PROXY"
+            " environment variables or configure the `httpx.Client` manually using `huggingface_hub.set_client_factory`."
+            " See https://www.python-httpx.org/advanced/proxies/ for more details."
+        )
+
+    # resume_download
+    resume_download = new_kwargs.pop("resume_download", None)  # remove from kwargs
+    if resume_download is not None:
+        warnings.warn(
+            f"The `resume_download` argument is deprecated and ignored in `{fn_name}`. Downloads always resume"
+            " whenever possible."
+        )
+
+    # force_filename
+    force_filename = new_kwargs.pop("force_filename", None)  # remove from kwargs
+    if force_filename is not None:
+        warnings.warn(
+            f"The `force_filename` argument is deprecated and ignored in `{fn_name}`. Filename is always the same "
+            "as on the Hub."
+        )
+
+    # local_dir_use_symlinks
+    local_dir_use_symlinks = new_kwargs.pop("local_dir_use_symlinks", None)  # remove from kwargs
+    if local_dir_use_symlinks is not None:
+        warnings.warn(
+            f"The `local_dir_use_symlinks` argument is deprecated and ignored in `{fn_name}`. Downloading to a local"
+            " directory does not use symlinks anymore."
+        )

     return new_kwargs
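Taken together, the 1.x validator shim now warns about legacy kwargs instead of rewriting them. The sketch below is only an illustration of the behavior added in this diff: `download_file` is a hypothetical function, not a library API, and the warning text is the one added by `smoothly_deprecate_legacy_arguments`.

```python
# Minimal sketch (not part of the diff): a hypothetical `download_file` decorated
# with `validate_hf_hub_args`. In 1.x the decorator drops legacy kwargs such as
# `resume_download` and emits a warning instead of forwarding them.
import warnings

from huggingface_hub.utils import validate_hf_hub_args


@validate_hf_hub_args
def download_file(repo_id: str, filename: str, **kwargs):
    # By the time the wrapped function runs, deprecated kwargs have been popped.
    return repo_id, filename, kwargs


with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    _, _, remaining = download_file("user/repo", "model.safetensors", resume_download=True)

print(remaining)          # {} -> `resume_download` was removed before the call
print(caught[0].message)  # "The `resume_download` argument is deprecated and ignored in `download_file`. ..."
```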
huggingface_hub/utils/_verification.py
ADDED

@@ -0,0 +1,167 @@
+import re
+from dataclasses import dataclass
+from pathlib import Path
+from typing import TYPE_CHECKING, Literal, Optional, TypedDict, Union
+
+from .. import constants
+from ..file_download import repo_folder_name
+from .sha import git_hash, sha_fileobj
+
+
+if TYPE_CHECKING:
+    from ..hf_api import RepoFile, RepoFolder
+
+# using fullmatch for clarity and strictness
+_REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
+
+
+# Typed structure describing a checksum mismatch
+class Mismatch(TypedDict):
+    path: str
+    expected: str
+    actual: str
+    algorithm: str
+
+
+HashAlgo = Literal["sha256", "git-sha1"]
+
+
+@dataclass(frozen=True)
+class FolderVerification:
+    revision: str
+    checked_count: int
+    mismatches: list[Mismatch]
+    missing_paths: list[str]
+    extra_paths: list[str]
+    verified_path: Path
+
+
+def collect_local_files(root: Path) -> dict[str, Path]:
+    """
+    Return a mapping of repo-relative path -> absolute path for all files under `root`.
+    """
+    return {p.relative_to(root).as_posix(): p for p in root.rglob("*") if p.is_file()}
+
+
+def _resolve_commit_hash_from_cache(storage_folder: Path, revision: Optional[str]) -> str:
+    """
+    Resolve a commit hash from a cache repo folder and an optional revision.
+    """
+    if revision and _REGEX_COMMIT_HASH.fullmatch(revision):
+        return revision
+
+    refs_dir = storage_folder / "refs"
+    snapshots_dir = storage_folder / "snapshots"
+
+    if revision:
+        ref_path = refs_dir / revision
+        if ref_path.is_file():
+            return ref_path.read_text(encoding="utf-8").strip()
+        raise ValueError(f"Revision '{revision}' could not be resolved in cache (expected file '{ref_path}').")
+
+    # No revision provided: try common defaults
+    main_ref = refs_dir / "main"
+    if main_ref.is_file():
+        return main_ref.read_text(encoding="utf-8").strip()
+
+    if not snapshots_dir.is_dir():
+        raise ValueError(f"Cache repo is missing snapshots directory: {snapshots_dir}. Provide --revision explicitly.")
+
+    candidates = [p.name for p in snapshots_dir.iterdir() if p.is_dir() and _REGEX_COMMIT_HASH.fullmatch(p.name)]
+    if len(candidates) == 1:
+        return candidates[0]
+
+    raise ValueError(
+        "Ambiguous cached revision: multiple snapshots found and no refs to disambiguate. Please pass --revision."
+    )
+
+
+def compute_file_hash(path: Path, algorithm: HashAlgo) -> str:
+    """
+    Compute the checksum of a local file using the requested algorithm.
+    """
+
+    with path.open("rb") as stream:
+        if algorithm == "sha256":
+            return sha_fileobj(stream).hex()
+        if algorithm == "git-sha1":
+            return git_hash(stream.read())
+    raise ValueError(f"Unsupported hash algorithm: {algorithm}")
+
+
+def verify_maps(
+    *,
+    remote_by_path: dict[str, Union["RepoFile", "RepoFolder"]],
+    local_by_path: dict[str, Path],
+    revision: str,
+    verified_path: Path,
+) -> FolderVerification:
+    """Compare remote entries and local files and return a verification result."""
+    remote_paths = set(remote_by_path)
+    local_paths = set(local_by_path)
+
+    missing = sorted(remote_paths - local_paths)
+    extra = sorted(local_paths - remote_paths)
+    both = sorted(remote_paths & local_paths)
+
+    mismatches: list[Mismatch] = []
+
+    for rel_path in both:
+        remote_entry = remote_by_path[rel_path]
+        local_path = local_by_path[rel_path]
+
+        lfs = getattr(remote_entry, "lfs", None)
+        lfs_sha = getattr(lfs, "sha256", None) if lfs is not None else None
+        if lfs_sha is None and isinstance(lfs, dict):
+            lfs_sha = lfs.get("sha256")
+        if lfs_sha:
+            algorithm: HashAlgo = "sha256"
+            expected = str(lfs_sha).lower()
+        else:
+            blob_id = remote_entry.blob_id  # type: ignore
+            algorithm = "git-sha1"
+            expected = str(blob_id).lower()
+
+        actual = compute_file_hash(local_path, algorithm)
+
+        if actual != expected:
+            mismatches.append(Mismatch(path=rel_path, expected=expected, actual=actual, algorithm=algorithm))
+
+    return FolderVerification(
+        revision=revision,
+        checked_count=len(both),
+        mismatches=mismatches,
+        missing_paths=missing,
+        extra_paths=extra,
+        verified_path=verified_path,
+    )
+
+
+def resolve_local_root(
+    *,
+    repo_id: str,
+    repo_type: str,
+    revision: Optional[str],
+    cache_dir: Optional[Path],
+    local_dir: Optional[Path],
+) -> tuple[Path, str]:
+    """
+    Resolve the root directory to scan locally and the remote revision to verify.
+    """
+    if local_dir is not None:
+        root = Path(local_dir).expanduser().resolve()
+        if not root.is_dir():
+            raise ValueError(f"Local directory does not exist or is not a directory: {root}")
+        return root, (revision or constants.DEFAULT_REVISION)
+
+    cache_root = Path(cache_dir or constants.HF_HUB_CACHE).expanduser().resolve()
+    storage_folder = cache_root / repo_folder_name(repo_id=repo_id, repo_type=repo_type)
+    if not storage_folder.exists():
+        raise ValueError(
+            f"Repo is not present in cache: {storage_folder}. Use 'hf download' first or pass --local-dir."
+        )
+    commit = _resolve_commit_hash_from_cache(storage_folder, revision)
+    snapshot_dir = storage_folder / "snapshots" / commit
+    if not snapshot_dir.is_dir():
+        raise ValueError(f"Snapshot directory does not exist for revision '{commit}': {snapshot_dir}.")
+    return snapshot_dir, commit
huggingface_hub/utils/_xet.py
CHANGED

@@ -1,13 +1,19 @@
+import time
 from dataclasses import dataclass
 from enum import Enum
-from typing import
+from typing import Optional

-import
+import httpx

 from .. import constants
 from . import get_session, hf_raise_for_status, validate_hf_hub_args


+XET_CONNECTION_INFO_SAFETY_PERIOD = 60  # seconds
+XET_CONNECTION_INFO_CACHE_SIZE = 1_000
+XET_CONNECTION_INFO_CACHE: dict[str, "XetConnectionInfo"] = {}
+
+
 class XetTokenType(str, Enum):
     READ = "read"
     WRITE = "write"
@@ -26,7 +32,9 @@ class XetConnectionInfo:
     endpoint: str


-def parse_xet_file_data_from_response(
+def parse_xet_file_data_from_response(
+    response: httpx.Response, endpoint: Optional[str] = None
+) -> Optional[XetFileData]:
     """
     Parse XET file metadata from an HTTP response.

@@ -34,7 +42,7 @@ def parse_xet_file_data_from_response(response: requests.Response) -> Optional[X
     of a given response object. If the required metadata is not found, it returns `None`.

     Args:
-        response (`
+        response (`httpx.Response`):
             The HTTP response object containing headers dict and links dict to extract the XET metadata from.
     Returns:
         `Optional[XetFileData]`:
@@ -52,18 +60,20 @@ def parse_xet_file_data_from_response(response: requests.Response) -> Optional[X
         refresh_route = response.headers[constants.HUGGINGFACE_HEADER_X_XET_REFRESH_ROUTE]
     except KeyError:
         return None
-
+    endpoint = endpoint if endpoint is not None else constants.ENDPOINT
+    if refresh_route.startswith(constants.HUGGINGFACE_CO_URL_HOME):
+        refresh_route = refresh_route.replace(constants.HUGGINGFACE_CO_URL_HOME.rstrip("/"), endpoint.rstrip("/"))
     return XetFileData(
         file_hash=file_hash,
         refresh_route=refresh_route,
     )


-def parse_xet_connection_info_from_headers(headers:
+def parse_xet_connection_info_from_headers(headers: dict[str, str]) -> Optional[XetConnectionInfo]:
     """
     Parse XET connection info from the HTTP headers or return None if not found.
     Args:
-        headers (`
+        headers (`dict`):
             HTTP headers to extract the XET metadata from.
     Returns:
         `XetConnectionInfo` or `None`:
@@ -88,7 +98,7 @@ def parse_xet_connection_info_from_headers(headers: Dict[str, str]) -> Optional[
 def refresh_xet_connection_info(
     *,
     file_data: XetFileData,
-    headers:
+    headers: dict[str, str],
 ) -> XetConnectionInfo:
     """
     Utilizes the information in the parsed metadata to request the Hub xet connection information.
@@ -96,7 +106,7 @@ def refresh_xet_connection_info(
     Args:
         file_data: (`XetFileData`):
             The file data needed to refresh the xet connection information.
-        headers (`
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
     Returns:
         `XetConnectionInfo`:
@@ -119,9 +129,9 @@ def fetch_xet_connection_info_from_repo_info(
     repo_id: str,
     repo_type: str,
     revision: Optional[str] = None,
-    headers:
+    headers: dict[str, str],
     endpoint: Optional[str] = None,
-    params: Optional[
+    params: Optional[dict[str, str]] = None,
 ) -> XetConnectionInfo:
     """
     Uses the repo info to request a xet access token from Hub.
@@ -134,11 +144,11 @@ def fetch_xet_connection_info_from_repo_info(
             Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
         revision (`str`, `optional`):
             The revision of the repo to get the token for.
-        headers (`
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         endpoint (`str`, `optional`):
             The endpoint to use for the request. Defaults to the Hub endpoint.
-        params (`
+        params (`dict[str, str]`, `optional`):
             Additional parameters to pass with the request.
     Returns:
         `XetConnectionInfo`:
@@ -157,18 +167,21 @@ def fetch_xet_connection_info_from_repo_info(
 @validate_hf_hub_args
 def _fetch_xet_connection_info_with_url(
     url: str,
-    headers:
-    params: Optional[
+    headers: dict[str, str],
+    params: Optional[dict[str, str]] = None,
 ) -> XetConnectionInfo:
     """
     Requests the xet connection info from the supplied URL. This includes the
     access token, expiration time, and endpoint to use for the xet storage service.
+
+    Result is cached to avoid redundant requests.
+
     Args:
         url: (`str`):
             The access token endpoint URL.
-        headers (`
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
-        params (`
+        params (`dict[str, str]`, `optional`):
             Additional parameters to pass with the request.
     Returns:
         `XetConnectionInfo`:
@@ -179,10 +192,44 @@ def _fetch_xet_connection_info_with_url(
     [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
         If the Hub API response is improperly formatted.
     """
+    # Check cache first
+    cache_key = _cache_key(url, headers, params)
+    cached_info = XET_CONNECTION_INFO_CACHE.get(cache_key)
+    if cached_info is not None:
+        if not _is_expired(cached_info):
+            return cached_info
+
+    # Fetch from server
     resp = get_session().get(headers=headers, url=url, params=params)
     hf_raise_for_status(resp)

     metadata = parse_xet_connection_info_from_headers(resp.headers)  # type: ignore
     if metadata is None:
         raise ValueError("Xet headers have not been correctly set by the server.")
+
+    # Delete expired cache entries
+    for k, v in list(XET_CONNECTION_INFO_CACHE.items()):
+        if _is_expired(v):
+            XET_CONNECTION_INFO_CACHE.pop(k, None)
+
+    # Enforce cache size limit
+    if len(XET_CONNECTION_INFO_CACHE) >= XET_CONNECTION_INFO_CACHE_SIZE:
+        XET_CONNECTION_INFO_CACHE.pop(next(iter(XET_CONNECTION_INFO_CACHE)))
+
+    # Update cache
+    XET_CONNECTION_INFO_CACHE[cache_key] = metadata
+
     return metadata
+
+
+def _cache_key(url: str, headers: dict[str, str], params: Optional[dict[str, str]]) -> str:
+    """Return a unique cache key for the given request parameters."""
+    lower_headers = {k.lower(): v for k, v in headers.items()}  # casing is not guaranteed here
+    auth_header = lower_headers.get("authorization", "")
+    params_str = "&".join(f"{k}={v}" for k, v in sorted((params or {}).items(), key=lambda x: x[0]))
+    return f"{url}|{auth_header}|{params_str}"
+
+
+def _is_expired(connection_info: XetConnectionInfo) -> bool:
+    """Check if the given XET connection info is expired."""
+    return connection_info.expiration_unix_epoch <= int(time.time()) + XET_CONNECTION_INFO_SAFETY_PERIOD