huggingface-hub 0.29.0rc2__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +160 -46
- huggingface_hub/_commit_api.py +277 -71
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +33 -22
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +241 -81
- huggingface_hub/_space_api.py +18 -10
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +196 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +15 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +83 -59
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +99 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +606 -346
- huggingface_hub/hf_api.py +2445 -1132
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +61 -66
- huggingface_hub/inference/_client.py +501 -630
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +536 -722
- huggingface_hub/inference/_generated/types/__init__.py +6 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +5 -6
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +77 -31
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +8 -2
- huggingface_hub/inference/_generated/types/image_to_text.py +2 -3
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +11 -11
- huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
- huggingface_hub/inference/_generated/types/text_to_speech.py +1 -2
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +149 -20
- huggingface_hub/inference/_providers/_common.py +160 -37
- huggingface_hub/inference/_providers/black_forest_labs.py +12 -9
- huggingface_hub/inference/_providers/cerebras.py +6 -0
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +32 -0
- huggingface_hub/inference/_providers/fal_ai.py +231 -22
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +22 -1
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +143 -33
- huggingface_hub/inference/_providers/hyperbolic.py +9 -5
- huggingface_hub/inference/_providers/nebius.py +47 -5
- huggingface_hub/inference/_providers/novita.py +48 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +25 -0
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +46 -9
- huggingface_hub/inference/_providers/sambanova.py +37 -1
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +34 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +79 -59
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +27 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +399 -237
- huggingface_hub/utils/_pagination.py +6 -6
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +74 -22
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +13 -11
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +235 -0
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +33 -4
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -82
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -428
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -299
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.29.0rc2.dist-info/RECORD +0 -131
- huggingface_hub-0.29.0rc2.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
huggingface_hub/_commit_api.py CHANGED

````diff
@@ -11,17 +11,20 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from itertools import groupby
 from pathlib import Path, PurePosixPath
-from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, Literal, Optional, Union
 
 from tqdm.contrib.concurrent import thread_map
 
 from . import constants
-from .errors import EntryNotFoundError
+from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
 from .file_download import hf_hub_url
 from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
 from .utils import (
     FORBIDDEN_FOLDERS,
+    XetTokenType,
+    are_progress_bars_disabled,
     chunk_iterable,
+    fetch_xet_connection_info_from_repo_info,
     get_session,
     hf_raise_for_status,
     logging,
@@ -30,6 +33,7 @@ from .utils import (
     validate_hf_hub_args,
 )
 from .utils import tqdm as hf_tqdm
+from .utils._runtime import is_xet_available
 
 
 if TYPE_CHECKING:
@@ -47,6 +51,8 @@ UploadMode = Literal["lfs", "regular"]
 # See https://github.com/huggingface/huggingface_hub/issues/1503
 FETCH_LFS_BATCH_SIZE = 500
 
+UPLOAD_BATCH_MAX_NUM_FILES = 256
+
 
 @dataclass
 class CommitOperationDelete:
````
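The new module-level constant formalizes a batch size that was previously hard-coded inline: upload negotiation requests are sent in groups of at most 256 files so that a single huge commit cannot exceed the server's payload limit. A minimal sketch of the pattern, using the `chunk_iterable` helper the diff imports from `.utils` (the `files` list is made up):

```python
from huggingface_hub.utils import chunk_iterable

UPLOAD_BATCH_MAX_NUM_FILES = 256  # same cap as in _commit_api.py

# Hypothetical list of 600 files to commit.
files = [f"file_{i}.bin" for i in range(600)]

# chunk_iterable yields lazy chunks, so each one is materialized into a
# list first -- the same `[op for op in chunk]` pattern the diff uses.
for chunk in chunk_iterable(files, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
    batch = list(chunk)
    print(f"negotiating upload for {len(batch)} files")  # 256, 256, 88
```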
````diff
@@ -230,7 +236,7 @@ class CommitOperationAdd:
     config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
 
     >>> with operation.as_file(with_tqdm=True) as file:
-    ...     requests.put(..., data=file)
+    ...     httpx.put(..., data=file)
     config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
     ```
     """
@@ -301,7 +307,7 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
 CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]
 
 
-def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
+def _warn_on_overwriting_operations(operations: list[CommitOperation]) -> None:
     """
     Warn user when a list of operations is expected to overwrite itself in a single
     commit.
@@ -316,7 +322,7 @@ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
     delete before upload) but can happen if a user deletes an entire folder and then
     add new files to it.
     """
-    nb_additions_per_path: Dict[str, int] = defaultdict(int)
+    nb_additions_per_path: dict[str, int] = defaultdict(int)
     for operation in operations:
         path_in_repo = operation.path_in_repo
         if isinstance(operation, CommitOperationAdd):
@@ -348,15 +354,95 @@ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
 
 
 @validate_hf_hub_args
-def _upload_lfs_files(
+def _upload_files(
     *,
-    additions: List[CommitOperationAdd],
+    additions: list[CommitOperationAdd],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     endpoint: Optional[str] = None,
     num_threads: int = 5,
     revision: Optional[str] = None,
+    create_pr: Optional[bool] = None,
+):
+    """
+    Negotiates per-file transfer (LFS vs Xet) and uploads in batches.
+    """
+    xet_additions: list[CommitOperationAdd] = []
+    lfs_actions: list[dict[str, Any]] = []
+    lfs_oid2addop: dict[str, CommitOperationAdd] = {}
+
+    for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
+        chunk_list = [op for op in chunk]
+
+        transfers: list[str] = ["basic", "multipart"]
+        has_buffered_io_data = any(isinstance(op.path_or_fileobj, io.BufferedIOBase) for op in chunk_list)
+        if is_xet_available():
+            if not has_buffered_io_data:
+                transfers.append("xet")
+            else:
+                logger.warning(
+                    "Uploading files as a binary IO buffer is not supported by Xet Storage. "
+                    "Falling back to HTTP upload."
+                )
+
+        actions_chunk, errors_chunk, chosen_transfer = post_lfs_batch_info(
+            upload_infos=[op.upload_info for op in chunk_list],
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            endpoint=endpoint,
+            headers=headers,
+            token=None,  # already passed in 'headers'
+            transfers=transfers,
+        )
+        if errors_chunk:
+            message = "\n".join(
+                [
+                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
+                    for err in errors_chunk
+                ]
+            )
+            raise ValueError(f"LFS batch API returned errors:\n{message}")
+
+        # If server returns a transfer we didn't offer (e.g "xet" while uploading from BytesIO),
+        # fall back to LFS for this chunk.
+        if chosen_transfer == "xet" and ("xet" in transfers):
+            xet_additions.extend(chunk_list)
+        else:
+            lfs_actions.extend(actions_chunk)
+            for op in chunk_list:
+                lfs_oid2addop[op.upload_info.sha256.hex()] = op
+
+    if len(lfs_actions) > 0:
+        _upload_lfs_files(
+            actions=lfs_actions,
+            oid2addop=lfs_oid2addop,
+            headers=headers,
+            endpoint=endpoint,
+            num_threads=num_threads,
+        )
+
+    if len(xet_additions) > 0:
+        _upload_xet_files(
+            additions=xet_additions,
+            repo_type=repo_type,
+            repo_id=repo_id,
+            headers=headers,
+            endpoint=endpoint,
+            revision=revision,
+            create_pr=create_pr,
+        )
+
+
+@validate_hf_hub_args
+def _upload_lfs_files(
+    *,
+    actions: list[dict[str, Any]],
+    oid2addop: dict[str, CommitOperationAdd],
+    headers: dict[str, str],
+    endpoint: Optional[str] = None,
+    num_threads: int = 5,
 ):
     """
     Uploads the content of `additions` to the Hub using the large file storage protocol.
````
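The heart of the new `_upload_files` helper is a capability negotiation: the client advertises the transfer protocols it can perform for each batch, the LFS batch endpoint answers with the one to use, and the client defensively ignores a `"xet"` answer it never offered. A condensed, self-contained sketch of that decision logic, with batching and HTTP stripped away (`offer_transfers` and `route` are illustrative names, not library API):

```python
import io
from typing import Union

def offer_transfers(payloads: list[Union[bytes, str, io.BufferedIOBase]], xet_installed: bool) -> list[str]:
    """Transfers the client advertises to the LFS batch endpoint for one chunk. (Illustrative sketch.)"""
    transfers = ["basic", "multipart"]
    has_buffer = any(isinstance(p, io.BufferedIOBase) for p in payloads)
    if xet_installed and not has_buffer:
        transfers.append("xet")  # only offered when every payload is raw bytes or a path
    return transfers

def route(chosen_transfer: str, offered: list[str]) -> str:
    """Map the server's answer to an upload backend."""
    # Honor "xet" only if we actually offered it; otherwise stay on the LFS path.
    return "xet" if chosen_transfer == "xet" and "xet" in offered else "lfs"

assert offer_transfers([b"raw", "weights.safetensors"], xet_installed=True) == ["basic", "multipart", "xet"]
assert offer_transfers([io.BytesIO(b"x")], xet_installed=True) == ["basic", "multipart"]  # buffers disable xet
assert route("xet", ["basic", "multipart", "xet"]) == "xet"
assert route("xet", ["basic", "multipart"]) == "lfs"  # defensive fallback
assert route("basic", ["basic", "multipart", "xet"]) == "lfs"
```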
````diff
@@ -365,14 +451,26 @@ def _upload_lfs_files(
         - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md
 
     Args:
-        additions (`List` of `CommitOperationAdd`):
-            The files to be uploaded
-        repo_type (`str`):
+        actions (`list[dict[str, Any]]`):
+            LFS batch actions returned by the server.
+        oid2addop (`dict[str, CommitOperationAdd]`):
+            A dictionary mapping the OID of the file to the corresponding `CommitOperationAdd` object.
+        headers (`dict[str, str]`):
+            Headers to use for the request, including authorization headers and user agent.
+        endpoint (`str`, *optional*):
+            The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
+        num_threads (`int`, *optional*):
+            The number of concurrent threads to use when uploading. Defaults to 5.
+
+    Raises:
+        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+            If an upload failed for any reason
+        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
             Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         num_threads (`int`, *optional*):
             The number of concurrent threads to use when uploading. Defaults to 5.
@@ -384,53 +482,20 @@ def _upload_lfs_files(
             If an upload failed for any reason
         [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
             If the server returns malformed responses
-        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
+        [`HfHubHTTPError`]
             If the LFS batch endpoint returned an HTTP error.
     """
-    # Step 1: retrieve upload instructions from the LFS batch endpoint.
-    #         Upload instructions are retrieved by chunk of 256 files to avoid reaching
-    #         the payload limit.
-    batch_actions: List[Dict] = []
-    for chunk in chunk_iterable(additions, chunk_size=256):
-        batch_actions_chunk, batch_errors_chunk = post_lfs_batch_info(
-            upload_infos=[op.upload_info for op in chunk],
-            repo_id=repo_id,
-            repo_type=repo_type,
-            revision=revision,
-            endpoint=endpoint,
-            headers=headers,
-            token=None,  # already passed in 'headers'
-        )
-
-        # If at least 1 error, we do not retrieve information for other chunks
-        if batch_errors_chunk:
-            message = "\n".join(
-                [
-                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
-                    for err in batch_errors_chunk
-                ]
-            )
-            raise ValueError(f"LFS batch endpoint returned errors:\n{message}")
-
-        batch_actions += batch_actions_chunk
-    oid2addop = {add_op.upload_info.sha256.hex(): add_op for add_op in additions}
-
-    # Step 2: ignore files that have already been uploaded
+    # Filter out files already present upstream
     filtered_actions = []
-    for action in batch_actions:
+    for action in actions:
         if action.get("actions") is None:
             logger.debug(
-                f"Content of file {oid2addop[action['oid']].path_in_repo} is already"
-                " present upstream - skipping upload."
+                f"Content of file {oid2addop[action['oid']].path_in_repo} is already present upstream - skipping upload."
             )
         else:
             filtered_actions.append(action)
 
-    if len(filtered_actions) == 0:
-        logger.debug("No LFS files to upload.")
-        return
-
-    # Step 3: upload files concurrently according to these instructions
+    # Upload according to server-provided actions
    def _wrapped_lfs_upload(batch_action) -> None:
        try:
            operation = oid2addop[batch_action["oid"]]
@@ -438,11 +503,7 @@ def _upload_lfs_files(
         except Exception as exc:
             raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc
 
-    if constants.HF_HUB_ENABLE_HF_TRANSFER:
-        logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
-        for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
-            _wrapped_lfs_upload(action)
-    elif len(filtered_actions) == 1:
+    if len(filtered_actions) == 1:
         logger.debug("Uploading 1 LFS file to the Hub")
         _wrapped_lfs_upload(filtered_actions[0])
     else:
````
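The refactored `_upload_lfs_files` no longer talks to the batch endpoint itself; it receives the server's `actions` and the `oid2addop` map from `_upload_files` and only filters and executes. The filter relies on an LFS protocol convention: a response object without an `actions` field means the blob with that SHA-256 `oid` already exists upstream. A toy illustration of that convention (the `response_objects` payload is made up):

```python
import hashlib

# Hypothetical local files, keyed by the sha256 of their content ("oid" in LFS terms).
contents = {"a.bin": b"hello", "b.bin": b"world"}
oid2path = {hashlib.sha256(data).hexdigest(): path for path, data in contents.items()}

# Made-up LFS batch response: the first object has no "actions" field,
# which the protocol uses to say "blob already present, nothing to do".
response_objects = [
    {"oid": hashlib.sha256(b"hello").hexdigest()},
    {"oid": hashlib.sha256(b"world").hexdigest(), "actions": {"upload": {"href": "https://example.com/put"}}},
]

to_upload = [obj for obj in response_objects if obj.get("actions") is not None]
for obj in to_upload:
    print(f"uploading {oid2path[obj['oid']]}")  # -> uploading b.bin
```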
````diff
@@ -458,6 +519,151 @@ def _upload_lfs_files(
     )
 
 
+@validate_hf_hub_args
+def _upload_xet_files(
+    *,
+    additions: list[CommitOperationAdd],
+    repo_type: str,
+    repo_id: str,
+    headers: dict[str, str],
+    endpoint: Optional[str] = None,
+    revision: Optional[str] = None,
+    create_pr: Optional[bool] = None,
+):
+    """
+    Uploads the content of `additions` to the Hub using the xet storage protocol.
+    This chunks the files and deduplicates the chunks before uploading them to xetcas storage.
+
+    Args:
+        additions (`list` of `CommitOperationAdd`):
+            The files to be uploaded.
+        repo_type (`str`):
+            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
+        repo_id (`str`):
+            A namespace (user or an organization) and a repo name separated
+            by a `/`.
+        headers (`dict[str, str]`):
+            Headers to use for the request, including authorization headers and user agent.
+        endpoint: (`str`, *optional*):
+            The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
+        revision (`str`, *optional*):
+            The git revision to upload to.
+        create_pr (`bool`, *optional*):
+            Whether or not to create a Pull Request with that commit.
+
+    Raises:
+        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+            If an upload failed for any reason.
+        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+            If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
+        [`HfHubHTTPError`]
+            If the LFS batch endpoint returned an HTTP error.
+
+    **How it works:**
+        The file download system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
+        for efficient storage and transfer.
+
+        `hf_xet.upload_files` manages uploading files by:
+            - Taking a list of file paths to upload
+            - Breaking files into smaller chunks for efficient storage
+            - Avoiding duplicate storage by recognizing identical chunks across files
+            - Connecting to a storage server (CAS server) that manages these chunks
+
+        The upload process works like this:
+        1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
+        2. Process files in parallel (up to 8 files at once):
+            2.1. Read the file content.
+            2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
+            2.3. For each chunk:
+                - Check if it already exists in storage.
+                - Skip uploading chunks that already exist.
+            2.4. Group chunks into larger blocks for efficient transfer.
+            2.5. Upload these blocks to the storage server.
+            2.6. Create and upload information about how the file is structured.
+        3. Return reference files that contain information about the uploaded files, which can be used later to download them.
+    """
+    if len(additions) == 0:
+        return
+
+    # at this point, we know that hf_xet is installed
+    from hf_xet import upload_bytes, upload_files
+
+    from .utils._xet_progress_reporting import XetProgressReporter
+
+    try:
+        xet_connection_info = fetch_xet_connection_info_from_repo_info(
+            token_type=XetTokenType.WRITE,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            headers=headers,
+            endpoint=endpoint,
+            params={"create_pr": "1"} if create_pr else None,
+        )
+    except HfHubHTTPError as e:
+        if e.response.status_code == 401:
+            raise XetAuthorizationError(
+                f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
+                f"Please check that you have configured your access token with write access to the repo."
+            ) from e
+        raise
+
+    xet_endpoint = xet_connection_info.endpoint
+    access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)
+
+    def token_refresher() -> tuple[str, int]:
+        new_xet_connection = fetch_xet_connection_info_from_repo_info(
+            token_type=XetTokenType.WRITE,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            headers=headers,
+            endpoint=endpoint,
+            params={"create_pr": "1"} if create_pr else None,
+        )
+        if new_xet_connection is None:
+            raise XetRefreshTokenError("Failed to refresh xet token")
+        return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch
+
+    if not are_progress_bars_disabled():
+        progress = XetProgressReporter()
+        progress_callback = progress.update_progress
+    else:
+        progress, progress_callback = None, None
+
+    try:
+        all_bytes_ops = [op for op in additions if isinstance(op.path_or_fileobj, bytes)]
+        all_paths_ops = [op for op in additions if isinstance(op.path_or_fileobj, (str, Path))]
+
+        if len(all_paths_ops) > 0:
+            all_paths = [str(op.path_or_fileobj) for op in all_paths_ops]
+            upload_files(
+                all_paths,
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                progress_callback,
+                repo_type,
+            )
+
+        if len(all_bytes_ops) > 0:
+            all_bytes = [op.path_or_fileobj for op in all_bytes_ops]
+            upload_bytes(
+                all_bytes,
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                progress_callback,
+                repo_type,
+            )
+
+    finally:
+        if progress is not None:
+            progress.close(False)
+
+    return
+
+
 def _validate_preupload_info(preupload_info: dict):
     files = preupload_info.get("files")
     if not isinstance(files, list):
````
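The `token_refresher` closure above is a callback contract: `hf_xet` calls it whenever the short-lived CAS token expires mid-upload and expects a fresh `(access_token, expiration_unix_epoch)` pair back. A minimal sketch of the same pattern, independent of `hf_xet` (all names here are illustrative):

```python
import time
from typing import Callable

# A refresher returns (access_token, expiration_unix_epoch).
TokenRefresher = Callable[[], tuple[str, int]]

def make_refresher(fetch_token: Callable[[], tuple[str, int]]) -> TokenRefresher:
    def refresher() -> tuple[str, int]:
        token, expires_at = fetch_token()
        if not token:
            raise RuntimeError("Failed to refresh token")
        return token, expires_at
    return refresher

class Client:
    """Toy long-running client that re-auths through the callback."""
    def __init__(self, token: str, expires_at: int, refresher: TokenRefresher):
        self.token, self.expires_at, self.refresher = token, expires_at, refresher

    def ensure_token(self) -> str:
        if time.time() >= self.expires_at:                   # token expired mid-run:
            self.token, self.expires_at = self.refresher()   # ask the callback for a new one
        return self.token

client = Client("tok-1", expires_at=0, refresher=make_refresher(lambda: ("tok-2", int(time.time()) + 3600)))
print(client.ensure_token())  # -> tok-2
```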
````diff
@@ -478,15 +684,15 @@ def _fetch_upload_modes(
     additions: Iterable[CommitOperationAdd],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     revision: str,
     endpoint: Optional[str] = None,
     create_pr: bool = False,
     gitignore_content: Optional[str] = None,
 ) -> None:
     """
-    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob
-    or as a git LFS blob. Input `additions` are mutated in-place with the upload mode.
+    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
+    as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.
 
     Args:
         additions (`Iterable` of :class:`CommitOperationAdd`):
@@ -497,7 +703,7 @@ def _fetch_upload_modes(
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         revision (`str`):
             The git revision to upload the files to. Can be any valid git revision.
@@ -515,12 +721,12 @@ def _fetch_upload_modes(
     endpoint = endpoint if endpoint is not None else constants.ENDPOINT
 
     # Fetch upload mode (LFS or regular) chunk by chunk.
-    upload_modes: Dict[str, UploadMode] = {}
-    should_ignore_info: Dict[str, bool] = {}
-    oid_info: Dict[str, Optional[str]] = {}
+    upload_modes: dict[str, UploadMode] = {}
+    should_ignore_info: dict[str, bool] = {}
+    oid_info: dict[str, Optional[str]] = {}
 
     for chunk in chunk_iterable(additions, 256):
-        payload: Dict = {
+        payload: dict = {
             "files": [
                 {
                     "path": op.path_in_repo,
@@ -563,10 +769,10 @@ def _fetch_files_to_copy(
     copies: Iterable[CommitOperationCopy],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     revision: str,
     endpoint: Optional[str] = None,
-) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
+) -> dict[tuple[str, Optional[str]], Union["RepoFile", bytes]]:
     """
     Fetch information about the files to copy.
 
@@ -582,12 +788,12 @@ def _fetch_files_to_copy(
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         revision (`str`):
             The git revision to upload the files to. Can be any valid git revision.
 
-    Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]`
+    Returns: `dict[tuple[str, Optional[str]], Union[RepoFile, bytes]]]`
         Key is the file path and revision of the file to copy.
         Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).
 
@@ -600,9 +806,9 @@ def _fetch_files_to_copy(
     from .hf_api import HfApi, RepoFolder
 
     hf_api = HfApi(endpoint=endpoint, headers=headers)
-    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
+    files_to_copy: dict[tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
     # Store (path, revision) -> oid mapping
-    oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
+    oid_info: dict[tuple[str, Optional[str]], Optional[str]] = {}
     # 1. Fetch OIDs for destination paths in batches.
     dest_paths = [op.path_in_repo for op in copies]
     for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
@@ -662,11 +868,11 @@ def _fetch_files_to_copy(
 
 def _prepare_commit_payload(
     operations: Iterable[CommitOperation],
-    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
+    files_to_copy: dict[tuple[str, Optional[str]], Union["RepoFile", bytes]],
     commit_message: str,
     commit_description: Optional[str] = None,
     parent_commit: Optional[str] = None,
-) -> Iterable[Dict[str, Any]]:
+) -> Iterable[dict[str, Any]]:
     """
     Builds the payload to POST to the `/commit` API of the Hub.
 
````
huggingface_hub/_commit_scheduler.py CHANGED

````diff
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from io import SEEK_END, SEEK_SET, BytesIO
 from pathlib import Path
 from threading import Lock, Thread
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
 from .utils import filter_repo_objects
@@ -53,9 +53,9 @@ class CommitScheduler:
             Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
         token (`str`, *optional*):
             The token to use to commit to the repo. Defaults to the token saved on the machine.
-        allow_patterns (`List[str]` or `str`, *optional*):
+        allow_patterns (`list[str]` or `str`, *optional*):
             If provided, only files matching at least one pattern are uploaded.
-        ignore_patterns (`List[str]` or `str`, *optional*):
+        ignore_patterns (`list[str]` or `str`, *optional*):
             If provided, files matching any of the patterns are not uploaded.
         squash_history (`bool`, *optional*):
             Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
@@ -108,8 +108,8 @@ class CommitScheduler:
         revision: Optional[str] = None,
         private: Optional[bool] = None,
         token: Optional[str] = None,
-        allow_patterns: Optional[Union[List[str], str]] = None,
-        ignore_patterns: Optional[Union[List[str], str]] = None,
+        allow_patterns: Optional[Union[list[str], str]] = None,
+        ignore_patterns: Optional[Union[list[str], str]] = None,
         squash_history: bool = False,
         hf_api: Optional["HfApi"] = None,
     ) -> None:
@@ -138,7 +138,7 @@ class CommitScheduler:
         self.token = token
 
         # Keep track of already uploaded files
-        self.last_uploaded: Dict[Path, float] = {}  # key is local path, value is timestamp
+        self.last_uploaded: dict[Path, float] = {}  # key is local path, value is timestamp
 
         # Scheduler
         if not every > 0:
@@ -205,13 +205,10 @@ class CommitScheduler:
         """
         Push folder to the Hub and return the commit info.
 
-        <Tip warning={true}>
-
-        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
-        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
-        issues.
-
-        </Tip>
+        > [!WARNING]
+        > This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
+        > queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
+        > issues.
 
         The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
         uploads only changed files. If no changes are found, the method returns without committing anything. If you want
@@ -232,7 +229,7 @@ class CommitScheduler:
         prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""
 
         # Filter with pattern + filter out unchanged files + retrieve current file size
-        files_to_upload: List[_FileToUpload] = []
+        files_to_upload: list[_FileToUpload] = []
         for relpath in filter_repo_objects(
             relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
         ):
````
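The `last_uploaded` mapping shown earlier (`dict[Path, float]`, path to mtime) is what keeps this filtering step cheap to run on a timer: a file is queued only if it is new or its modification time moved past the recorded one. A toy version of the idea, not the actual `_FileToUpload` logic:

```python
from pathlib import Path

# Toy change-detection in the spirit of CommitScheduler.last_uploaded;
# not the class's exact logic.
last_uploaded: dict[Path, float] = {}  # local path -> mtime at last commit

def changed_files(folder: Path) -> list[Path]:
    """Return files created or modified since the previous scheduled run."""
    to_upload: list[Path] = []
    for path in sorted(folder.rglob("*")):
        if not path.is_file():
            continue
        mtime = path.stat().st_mtime
        last = last_uploaded.get(path)
        if last is None or last < mtime:  # new file, or touched since last run
            to_upload.append(path)
            last_uploaded[path] = mtime   # record so the next run skips it
    return to_upload
```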
````diff
@@ -315,10 +312,13 @@ class PartialFileIO(BytesIO):
         return self._size_limit
 
     def __getattribute__(self, name: str):
-        if name.startswith("_") or name in ("read", "tell", "seek"):  # only 3 public methods supported
+        if name.startswith("_") or name in ("read", "tell", "seek", "fileno"):  # only 4 public methods supported
             return super().__getattribute__(name)
         raise NotImplementedError(f"PartialFileIO does not support '{name}'.")
 
+    def fileno(self):
+        raise AttributeError("PartialFileIO does not have a fileno.")
+
     def tell(self) -> int:
         """Return the current file position."""
         return self._file.tell()
````