huggingface-hub 0.33.5__py3-none-any.whl → 0.35.0rc0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +487 -525
- huggingface_hub/_commit_api.py +21 -28
- huggingface_hub/_jobs_api.py +145 -0
- huggingface_hub/_local_folder.py +7 -1
- huggingface_hub/_login.py +5 -5
- huggingface_hub/_oauth.py +1 -1
- huggingface_hub/_snapshot_download.py +11 -6
- huggingface_hub/_upload_large_folder.py +46 -23
- huggingface_hub/cli/__init__.py +27 -0
- huggingface_hub/cli/_cli_utils.py +69 -0
- huggingface_hub/cli/auth.py +210 -0
- huggingface_hub/cli/cache.py +405 -0
- huggingface_hub/cli/download.py +181 -0
- huggingface_hub/cli/hf.py +66 -0
- huggingface_hub/cli/jobs.py +522 -0
- huggingface_hub/cli/lfs.py +198 -0
- huggingface_hub/cli/repo.py +243 -0
- huggingface_hub/cli/repo_files.py +128 -0
- huggingface_hub/cli/system.py +52 -0
- huggingface_hub/cli/upload.py +316 -0
- huggingface_hub/cli/upload_large_folder.py +132 -0
- huggingface_hub/commands/_cli_utils.py +5 -0
- huggingface_hub/commands/delete_cache.py +3 -1
- huggingface_hub/commands/download.py +4 -0
- huggingface_hub/commands/env.py +3 -0
- huggingface_hub/commands/huggingface_cli.py +2 -0
- huggingface_hub/commands/repo.py +4 -0
- huggingface_hub/commands/repo_files.py +4 -0
- huggingface_hub/commands/scan_cache.py +3 -1
- huggingface_hub/commands/tag.py +3 -1
- huggingface_hub/commands/upload.py +4 -0
- huggingface_hub/commands/upload_large_folder.py +3 -1
- huggingface_hub/commands/user.py +11 -1
- huggingface_hub/commands/version.py +3 -0
- huggingface_hub/constants.py +1 -0
- huggingface_hub/file_download.py +16 -5
- huggingface_hub/hf_api.py +519 -7
- huggingface_hub/hf_file_system.py +8 -16
- huggingface_hub/hub_mixin.py +3 -3
- huggingface_hub/inference/_client.py +38 -39
- huggingface_hub/inference/_common.py +38 -11
- huggingface_hub/inference/_generated/_async_client.py +50 -51
- huggingface_hub/inference/_generated/types/__init__.py +1 -0
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_mcp/cli.py +36 -18
- huggingface_hub/inference/_mcp/constants.py +8 -0
- huggingface_hub/inference/_mcp/types.py +3 -0
- huggingface_hub/inference/_providers/__init__.py +4 -1
- huggingface_hub/inference/_providers/_common.py +3 -6
- huggingface_hub/inference/_providers/fal_ai.py +85 -42
- huggingface_hub/inference/_providers/hf_inference.py +17 -9
- huggingface_hub/inference/_providers/replicate.py +19 -1
- huggingface_hub/keras_mixin.py +2 -2
- huggingface_hub/repocard.py +1 -1
- huggingface_hub/repository.py +2 -2
- huggingface_hub/utils/_auth.py +1 -1
- huggingface_hub/utils/_cache_manager.py +2 -2
- huggingface_hub/utils/_dotenv.py +51 -0
- huggingface_hub/utils/_headers.py +1 -1
- huggingface_hub/utils/_runtime.py +1 -1
- huggingface_hub/utils/_xet.py +6 -2
- huggingface_hub/utils/_xet_progress_reporting.py +141 -0
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/METADATA +7 -8
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/RECORD +68 -51
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/entry_points.txt +1 -0
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.33.5.dist-info → huggingface_hub-0.35.0rc0.dist-info}/top_level.txt +0 -0
huggingface_hub/_commit_api.py
CHANGED

@@ -4,7 +4,6 @@ Type definitions and utilities for the `create_commit` API
 
 import base64
 import io
-import math
 import os
 import warnings
 from collections import defaultdict
@@ -23,6 +22,7 @@ from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
 from .utils import (
     FORBIDDEN_FOLDERS,
     XetTokenType,
+    are_progress_bars_disabled,
     chunk_iterable,
     fetch_xet_connection_info_from_repo_info,
     get_session,
@@ -33,7 +33,6 @@ from .utils import (
     validate_hf_hub_args,
 )
 from .utils import tqdm as hf_tqdm
-from .utils.tqdm import _get_progress_bar_context
 
 
 if TYPE_CHECKING:
@@ -529,9 +528,12 @@ def _upload_xet_files(
     """
     if len(additions) == 0:
         return
+
     # at this point, we know that hf_xet is installed
     from hf_xet import upload_bytes, upload_files
 
+    from .utils._xet_progress_reporting import XetProgressReporter
+
     try:
         xet_connection_info = fetch_xet_connection_info_from_repo_info(
             token_type=XetTokenType.WRITE,
@@ -567,32 +569,18 @@ def _upload_xet_files(
             raise XetRefreshTokenError("Failed to refresh xet token")
         return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch
 
-    num_chunks = math.ceil(len(additions) / UPLOAD_BATCH_MAX_NUM_FILES)
-    num_chunks_num_digits = int(math.log10(num_chunks)) + 1
-    for i, chunk in enumerate(chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES)):
-        _chunk = [op for op in chunk]
-
-        bytes_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, bytes)]
-        paths_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, (str, Path))]
-        expected_size = sum(op.upload_info.size for op in bytes_ops + paths_ops)
+    if not are_progress_bars_disabled():
+        progress = XetProgressReporter()
+        progress_callback = progress.update_progress
+    else:
+        progress, progress_callback = None, None
 
-        if num_chunks > 1:
-            description = f"Uploading Batch [{str(i + 1).zfill(num_chunks_num_digits)}/{num_chunks}]..."
-        else:
-            description = "Uploading..."
-        progress_cm = _get_progress_bar_context(
-            desc=description,
-            total=expected_size,
-            initial=0,
-            unit="B",
-            unit_scale=True,
-            name="huggingface_hub.xet_put",
-            log_level=logger.getEffectiveLevel(),
-        )
-        with progress_cm as progress:
+    try:
+        for i, chunk in enumerate(chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES)):
+            _chunk = [op for op in chunk]
 
-            def update_progress(progress_bytes: float):
-                progress.update(progress_bytes)
+            bytes_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, bytes)]
+            paths_ops = [op for op in _chunk if isinstance(op.path_or_fileobj, (str, Path))]
 
             if len(paths_ops) > 0:
                 upload_files(
@@ -600,7 +588,7 @@ def _upload_xet_files(
                     xet_endpoint,
                     access_token_info,
                     token_refresher,
-                    update_progress,
+                    progress_callback,
                     repo_type,
                 )
             if len(bytes_ops) > 0:
@@ -609,9 +597,14 @@ def _upload_xet_files(
                     xet_endpoint,
                     access_token_info,
                     token_refresher,
-                    update_progress,
+                    progress_callback,
                     repo_type,
                 )
+
+    finally:
+        if progress is not None:
+            progress.close(False)
+
     return
 
 
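The new upload path hands a single `progress_callback` to `hf_xet`'s `upload_files`/`upload_bytes` and closes the reporter in a `finally` block. Below is a minimal sketch of that callback pattern; `SimpleReporter` is a hypothetical stand-in for `XetProgressReporter` (whose implementation lives in the new `utils/_xet_progress_reporting.py`, not shown in this hunk), and `upload_all` stands in for the actual upload calls.

from typing import Callable, Optional


class SimpleReporter:
    # Hypothetical stand-in: anything exposing `update_progress` and `close`.
    def __init__(self) -> None:
        self.total_bytes = 0

    def update_progress(self, n_bytes: int) -> None:
        # Called by the uploader as bytes go over the wire.
        self.total_bytes += n_bytes
        print(f"\ruploaded {self.total_bytes} bytes", end="")

    def close(self, success: bool) -> None:
        print("\ndone" if success else "\naborted")


def upload_all(chunks, progress_callback: Optional[Callable[[int], None]]) -> None:
    # Stand-in for the upload_files/upload_bytes calls: report per-chunk progress,
    # or skip reporting entirely when progress bars are disabled (callback is None).
    for chunk in chunks:
        if progress_callback is not None:
            progress_callback(len(chunk))


progress = SimpleReporter()
try:
    upload_all([b"abc", b"defgh"], progress.update_progress)
finally:
    # Mirrors the diff: the reporter is always closed, even when an upload raises.
    progress.close(False)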
huggingface_hub/_jobs_api.py
ADDED

@@ -0,0 +1,145 @@
+# coding=utf-8
+# Copyright 2025-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from dataclasses import dataclass
+from datetime import datetime
+from enum import Enum
+from typing import Any, Dict, List, Optional
+
+from huggingface_hub import constants
+from huggingface_hub._space_api import SpaceHardware
+from huggingface_hub.utils._datetime import parse_datetime
+
+
+class JobStage(str, Enum):
+    """
+    Enumeration of possible stages of a Job on the Hub.
+
+    Value can be compared to a string:
+    ```py
+    assert JobStage.COMPLETED == "COMPLETED"
+    ```
+
+    Taken from https://github.com/huggingface/moon-landing/blob/main/server/job_types/JobInfo.ts#L61 (private url).
+    """
+
+    # Copied from moon-landing > server > lib > Job.ts
+    COMPLETED = "COMPLETED"
+    CANCELED = "CANCELED"
+    ERROR = "ERROR"
+    DELETED = "DELETED"
+    RUNNING = "RUNNING"
+
+
+@dataclass
+class JobStatus:
+    stage: JobStage
+    message: Optional[str]
+
+    def __init__(self, **kwargs) -> None:
+        self.stage = kwargs["stage"]
+        self.message = kwargs.get("message")
+
+
+@dataclass
+class JobOwner:
+    id: str
+    name: str
+
+
+@dataclass
+class JobInfo:
+    """
+    Contains information about a Job.
+
+    Args:
+        id (`str`):
+            Job ID.
+        created_at (`datetime` or `None`):
+            When the Job was created.
+        docker_image (`str` or `None`):
+            The Docker image from Docker Hub used for the Job.
+            Can be None if space_id is present instead.
+        space_id (`str` or `None`):
+            The Docker image from Hugging Face Spaces used for the Job.
+            Can be None if docker_image is present instead.
+        command (`List[str]` or `None`):
+            Command of the Job, e.g. `["python", "-c", "print('hello world')"]`
+        arguments (`List[str]` or `None`):
+            Arguments passed to the command
+        environment (`Dict[str, Any]` or `None`):
+            Environment variables of the Job as a dictionary.
+        secrets (`Dict[str, Any]` or `None`):
+            Secret environment variables of the Job (encrypted).
+        flavor (`str` or `None`):
+            Flavor for the hardware, as in Hugging Face Spaces. See [`SpaceHardware`] for possible values.
+            E.g. `"cpu-basic"`.
+        status (`JobStatus` or `None`):
+            Status of the Job, e.g. `JobStatus(stage="RUNNING", message=None)`
+            See [`JobStage`] for possible stage values.
+        owner (`JobOwner` or `None`):
+            Owner of the Job, e.g. `JobOwner(id="5e9ecfc04957053f60648a3e", name="lhoestq")`
+
+    Example:
+
+    ```python
+    >>> from huggingface_hub import run_job
+    >>> job = run_job(
+    ...     image="python:3.12",
+    ...     command=["python", "-c", "print('Hello from the cloud!')"]
+    ... )
+    >>> job
+    JobInfo(id='687fb701029421ae5549d998', created_at=datetime.datetime(2025, 7, 22, 16, 6, 25, 79000, tzinfo=datetime.timezone.utc), docker_image='python:3.12', space_id=None, command=['python', '-c', "print('Hello from the cloud!')"], arguments=[], environment={}, secrets={}, flavor='cpu-basic', status=JobStatus(stage='RUNNING', message=None), owner=JobOwner(id='5e9ecfc04957053f60648a3e', name='lhoestq'), endpoint='https://huggingface.co', url='https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998')
+    >>> job.id
+    '687fb701029421ae5549d998'
+    >>> job.url
+    'https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998'
+    >>> job.status.stage
+    'RUNNING'
+    ```
+    """
+
+    id: str
+    created_at: Optional[datetime]
+    docker_image: Optional[str]
+    space_id: Optional[str]
+    command: Optional[List[str]]
+    arguments: Optional[List[str]]
+    environment: Optional[Dict[str, Any]]
+    secrets: Optional[Dict[str, Any]]
+    flavor: Optional[SpaceHardware]
+    status: Optional[JobStatus]
+    owner: Optional[JobOwner]
+
+    # Inferred fields
+    endpoint: str
+    url: str
+
+    def __init__(self, **kwargs) -> None:
+        self.id = kwargs["id"]
+        created_at = kwargs.get("createdAt") or kwargs.get("created_at")
+        self.created_at = parse_datetime(created_at) if created_at else None
+        self.docker_image = kwargs.get("dockerImage") or kwargs.get("docker_image")
+        self.space_id = kwargs.get("spaceId") or kwargs.get("space_id")
+        self.owner = JobOwner(**(kwargs["owner"] if isinstance(kwargs.get("owner"), dict) else {}))
+        self.command = kwargs.get("command")
+        self.arguments = kwargs.get("arguments")
+        self.environment = kwargs.get("environment")
+        self.secrets = kwargs.get("secrets")
+        self.flavor = kwargs.get("flavor")
+        self.status = JobStatus(**(kwargs["status"] if isinstance(kwargs.get("status"), dict) else {}))
+
+        # Inferred fields
+        self.endpoint = kwargs.get("endpoint", constants.ENDPOINT)
+        self.url = f"{self.endpoint}/jobs/{self.owner.name}/{self.id}"
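`JobInfo.__init__` accepts both the camelCase keys returned by the Hub API and snake_case keys. A short sketch, assuming the module is importable as `huggingface_hub._jobs_api` once 0.35.0rc0 is installed; the payload values are illustrative, copied from the docstring example above.

from huggingface_hub._jobs_api import JobInfo, JobStage

# Raw payload shaped like a Hub API response (camelCase keys).
payload = {
    "id": "687fb701029421ae5549d998",
    "createdAt": "2025-07-22T16:06:25.079Z",
    "dockerImage": "python:3.12",
    "owner": {"id": "5e9ecfc04957053f60648a3e", "name": "lhoestq"},
    "command": ["python", "-c", "print('Hello from the cloud!')"],
    "status": {"stage": "RUNNING", "message": None},
}

job = JobInfo(**payload)
assert job.docker_image == "python:3.12"  # camelCase key was normalized
assert job.status.stage == JobStage.RUNNING == "RUNNING"  # str-enum comparison
print(job.url)  # https://huggingface.co/jobs/lhoestq/687fb701029421ae5549d998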
huggingface_hub/_local_folder.py
CHANGED

@@ -86,7 +86,13 @@ class LocalDownloadFilePaths:
 
     def incomplete_path(self, etag: str) -> Path:
         """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
-        return self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
+        path = self.metadata_path.parent / f"{_short_hash(self.metadata_path.name)}.{etag}.incomplete"
+        resolved_path = str(path.resolve())
+        # Some Windows versions do not allow for paths longer than 255 characters.
+        # In this case, we must specify it as an extended path by using the "\\?\" prefix.
+        if len(resolved_path) > 255 and not resolved_path.startswith("\\\\?\\"):
+            path = Path("\\\\?\\" + resolved_path)
+        return path
 
 
 @dataclass(frozen=True)
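The extended-path rule added here can be exercised in isolation. A self-contained sketch of the same logic; `extend_windows_path` is a hypothetical helper name, not part of the package.

from pathlib import Path


def extend_windows_path(path: Path) -> Path:
    # Hypothetical standalone version of the rule above: opt into Windows
    # extended-length paths ("\\?\" prefix) when the resolved path exceeds
    # 255 characters, without double-prefixing.
    resolved = str(path.resolve())
    if len(resolved) > 255 and not resolved.startswith("\\\\?\\"):
        return Path("\\\\?\\" + resolved)
    return path


p = extend_windows_path(Path("C:/cache") / ("x" * 300))
print(str(p).startswith("\\\\?\\"))  # True: the resolved path exceeded 255 chars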
huggingface_hub/_login.py
CHANGED

@@ -75,7 +75,7 @@ def login(
     components. If `token` is not provided, it will be prompted to the user either with
     a widget (in a notebook) or via the terminal.
 
-    To log in from outside of a script, one can also use `huggingface-cli login` which is
+    To log in from outside of a script, one can also use `hf auth login` which is
     a cli command that wraps [`login`].
 
     <Tip>
@@ -120,7 +120,7 @@ def login(
         logger.info(
             "The token has not been saved to the git credentials helper. Pass "
             "`add_to_git_credential=True` in this function directly or "
-            "`--add-to-git-credential` if using via `huggingface-cli` if "
+            "`--add-to-git-credential` if using via `hf` CLI if "
             "you want to set the git credential as well."
         )
     _login(token, add_to_git_credential=add_to_git_credential)
@@ -233,7 +233,7 @@ def auth_list() -> None:
         )
     elif current_token_name is None:
         logger.warning(
-            "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `huggingface-cli login` to log in."
+            "\nNote: No active token is set and no environment variable `HF_TOKEN` is found. Use `hf auth login` to log in."
         )
 
 
@@ -273,8 +273,8 @@ def interpreter_login(*, new_session: bool = True, write_permission: bool = False
     print(_HF_LOGO_ASCII)
     if get_token() is not None:
         logger.info(
-            " A token is already saved on your machine. Run `huggingface-cli"
-            " whoami` to get more information or `huggingface-cli logout` if you want"
+            " A token is already saved on your machine. Run `hf auth whoami`"
+            " to get more information or `hf auth logout` if you want"
             " to log out."
        )
        logger.info(" Setting a new token will erase the existing one.")
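The renamed CLI commands (`hf auth login`, `hf auth whoami`, `hf auth logout`) map onto the same programmatic entry points the package has long exported. A small sketch using the public API; reading the token from `HF_TOKEN` is a placeholder assumption for illustration.

import os

from huggingface_hub import login, logout, whoami

# Programmatic equivalent of `hf auth login`; the token source is a placeholder.
login(token=os.environ["HF_TOKEN"], add_to_git_credential=False)

# Equivalents of `hf auth whoami` and `hf auth logout`.
print(whoami()["name"])
logout()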
huggingface_hub/_oauth.py
CHANGED

@@ -415,7 +415,7 @@ def _get_mocked_oauth_info() -> Dict:
     if token is None:
         raise ValueError(
             "Your machine must be logged in to HF to debug an OAuth app locally. Please"
-            " run `huggingface-cli login` or set `HF_TOKEN` as environment variable "
+            " run `hf auth login` or set `HF_TOKEN` as environment variable "
             "with one of your access token. You can generate a new token in your "
             "settings page (https://huggingface.co/settings/tokens)."
         )
huggingface_hub/_snapshot_download.py
CHANGED

@@ -254,14 +254,19 @@ def snapshot_download(
     # At this stage, internet connection is up and running
     # => let's download the files!
     assert repo_info.sha is not None, "Repo info returned from server must have a revision sha."
-    assert repo_info.siblings is not None, "Repo info returned from server must have a siblings list."
 
     # Corner case: on very large repos, the siblings list in `repo_info` might not contain all files.
     # In that case, we need to use the `list_repo_tree` method to prevent caching issues.
-    repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings]
-    has_many_files = len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD
-    if has_many_files:
-        logger.info("The repo has more than 50,000 files. Using `list_repo_tree` to ensure all files are listed.")
+    repo_files: Iterable[str] = [f.rfilename for f in repo_info.siblings] if repo_info.siblings is not None else []
+    unreliable_nb_files = (
+        repo_info.siblings is None
+        or len(repo_info.siblings) == 0
+        or len(repo_info.siblings) > VERY_LARGE_REPO_THRESHOLD
+    )
+    if unreliable_nb_files:
+        logger.info(
+            "Number of files in the repo is unreliable. Using `list_repo_tree` to ensure all files are listed."
+        )
         repo_files = (
             f.rfilename
             for f in api.list_repo_tree(repo_id=repo_id, recursive=True, revision=revision, repo_type=repo_type)
@@ -274,7 +279,7 @@ def snapshot_download(
         ignore_patterns=ignore_patterns,
     )
 
-    if not has_many_files:
+    if not unreliable_nb_files:
         filtered_repo_files = list(filtered_repo_files)
         tqdm_desc = f"Fetching {len(filtered_repo_files)} files"
     else:
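The new guard treats three cases as "unreliable": a missing siblings list, an empty one, or one larger than `VERY_LARGE_REPO_THRESHOLD`. A standalone sketch of that predicate; the threshold value below is an assumption for illustration, since the constant's definition is not part of this diff.

from typing import Optional, Sequence

# Assumed for illustration; the actual value is defined elsewhere in
# huggingface_hub and is not shown in this diff.
VERY_LARGE_REPO_THRESHOLD = 50_000


def nb_files_is_unreliable(siblings: Optional[Sequence[str]]) -> bool:
    # Mirrors the new snapshot_download guard: fall back to list_repo_tree
    # when the siblings list is missing, empty, or suspiciously large.
    return siblings is None or len(siblings) == 0 or len(siblings) > VERY_LARGE_REPO_THRESHOLD


assert nb_files_is_unreliable(None)
assert nb_files_is_unreliable([])
assert not nb_files_is_unreliable(["config.json", "model.safetensors"])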
huggingface_hub/_upload_large_folder.py
CHANGED

@@ -33,6 +33,7 @@ from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_lo
 from .constants import DEFAULT_REVISION, REPO_TYPES
 from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
 from .utils._cache_manager import _format_size
+from .utils._runtime import is_xet_available
 from .utils.sha import sha_fileobj
 
 
@@ -45,6 +46,9 @@ WAITING_TIME_IF_NO_TASKS = 10  # seconds
 MAX_NB_FILES_FETCH_UPLOAD_MODE = 100
 COMMIT_SIZE_SCALE: List[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
 
+UPLOAD_BATCH_SIZE_XET = 256  # Max 256 files per upload batch for XET-enabled repos
+UPLOAD_BATCH_SIZE_LFS = 1  # Otherwise, batches of 1 for regular LFS upload
+
 
 def upload_large_folder_internal(
     api: "HfApi",
@@ -93,6 +97,17 @@ def upload_large_folder_internal(
     repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
     logger.info(f"Repo created: {repo_url}")
     repo_id = repo_url.repo_id
+    # 2.1 Check if xet is enabled to set batch file upload size
+    is_xet_enabled = (
+        is_xet_available()
+        and api.repo_info(
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            expand="xetEnabled",
+        ).xet_enabled
+    )
+    upload_batch_size = UPLOAD_BATCH_SIZE_XET if is_xet_enabled else UPLOAD_BATCH_SIZE_LFS
 
     # 3. List files to upload
     filtered_paths_list = filter_repo_objects(
@@ -110,7 +125,7 @@ def upload_large_folder_internal(
     ]
 
     # 4. Start workers
-    status = LargeUploadStatus(items)
+    status = LargeUploadStatus(items, upload_batch_size)
     threads = [
         threading.Thread(
             target=_worker_job,
@@ -168,7 +183,7 @@ JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
 class LargeUploadStatus:
     """Contains information, queues and tasks for a large upload process."""
 
-    def __init__(self, items: List[JOB_ITEM_T]):
+    def __init__(self, items: List[JOB_ITEM_T], upload_batch_size: int = 1):
         self.items = items
         self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
         self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
@@ -179,6 +194,7 @@ class LargeUploadStatus:
         self.nb_workers_sha256: int = 0
         self.nb_workers_get_upload_mode: int = 0
         self.nb_workers_preupload_lfs: int = 0
+        self.upload_batch_size: int = upload_batch_size
         self.nb_workers_commit: int = 0
         self.nb_workers_waiting: int = 0
         self.last_commit_attempt: Optional[float] = None
@@ -353,16 +369,17 @@ def _worker_job(
                 status.nb_workers_get_upload_mode -= 1
 
         elif job == WorkerJob.PREUPLOAD_LFS:
-            item = items[0]  # single item
            try:
-                _preupload_lfs(item, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
-                status.queue_commit.put(item)
+                _preupload_lfs(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+                for item in items:
+                    status.queue_commit.put(item)
             except KeyboardInterrupt:
                 raise
             except Exception as e:
                 logger.error(f"Failed to preupload LFS: {e}")
                 traceback.format_exc()
-                status.queue_preupload_lfs.put(item)
+                for item in items:
+                    status.queue_preupload_lfs.put(item)
 
             with status.lock:
                 status.nb_workers_preupload_lfs -= 1
@@ -417,11 +434,11 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         logger.debug(f"Job: get upload mode (>{MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)")
         return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-    # 4. Preupload LFS file if at least 1 file and no worker is preuploading LFS
-    elif status.queue_preupload_lfs.qsize() > 0 and status.nb_workers_preupload_lfs == 0:
+    # 4. Preupload LFS file if at least `status.upload_batch_size` files and no worker is preuploading LFS
+    elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and status.nb_workers_preupload_lfs == 0:
         status.nb_workers_preupload_lfs += 1
         logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
-        return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, 1))
+        return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
 
     # 5. Compute sha256 if at least 1 file and no worker is computing sha256
     elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
@@ -435,14 +452,14 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         logger.debug("Job: get upload mode (no other worker getting upload mode)")
         return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-    # 7. Preupload LFS file if at least 1 file
+    # 7. Preupload LFS file if at least `status.upload_batch_size` files
     # Skip if hf_transfer is enabled and there is already a worker preuploading LFS
-    elif status.queue_preupload_lfs.qsize() > 0 and (
+    elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and (
         status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
     ):
         status.nb_workers_preupload_lfs += 1
         logger.debug("Job: preupload LFS")
-        return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, 1))
+        return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
 
     # 8. Compute sha256 if at least 1 file
     elif status.queue_sha256.qsize() > 0:
@@ -456,7 +473,13 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         logger.debug("Job: get upload mode")
         return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-    # 10. Commit if at least 1 file and 1 min since last commit attempt
+    # 10. Preupload LFS file if at least 1 file
+    elif status.queue_preupload_lfs.qsize() > 0:
+        status.nb_workers_preupload_lfs += 1
+        logger.debug("Job: preupload LFS")
+        return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
+
+    # 11. Commit if at least 1 file and 1 min since last commit attempt
     elif (
         status.nb_workers_commit == 0
         and status.queue_commit.qsize() > 0
@@ -467,7 +490,7 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         logger.debug("Job: commit (1 min since last commit attempt)")
         return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
-    # 11. Commit if at least 1 file all other queues are empty and all workers are waiting
+    # 12. Commit if at least 1 file all other queues are empty and all workers are waiting
     # e.g. when it's the last commit
     elif (
         status.nb_workers_commit == 0
@@ -483,12 +506,12 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         logger.debug("Job: commit")
         return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
-    # 12. If all queues are empty, exit
+    # 13. If all queues are empty, exit
     elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
         logger.info("All files have been processed! Exiting worker.")
         return None
 
-    # 13. If no task is available, wait
+    # 14. If no task is available, wait
     else:
         status.nb_workers_waiting += 1
         logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
@@ -531,19 +554,19 @@ def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_t
         metadata.save(paths)
 
 
-def _preupload_lfs(item: JOB_ITEM_T, api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
-    """Preupload LFS file and update metadata."""
-    paths, metadata = item
-    addition = _build_hacky_operation(item)
+def _preupload_lfs(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Preupload LFS files and update metadata."""
+    additions = [_build_hacky_operation(item) for item in items]
     api.preupload_lfs_files(
         repo_id=repo_id,
         repo_type=repo_type,
         revision=revision,
-        additions=[addition],
+        additions=additions,
    )
 
-    metadata.is_uploaded = True
-    metadata.save(paths)
+    for paths, metadata in items:
+        metadata.is_uploaded = True
+        metadata.save(paths)
 
 
 def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
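The scheduler above repeatedly calls `_get_n(queue, n)` to pull a batch off a queue, with `n` now being `status.upload_batch_size` (256 for XET, 1 for plain LFS). `_get_n`'s body is not part of this diff; the following is a plausible sketch of such a non-blocking "take up to n items" helper, consistent with how it is called.

import queue
from typing import List, TypeVar

T = TypeVar("T")


def get_n(q: "queue.Queue[T]", n: int) -> List[T]:
    # Plausible reconstruction of the `_get_n` helper used above (its body is
    # not shown in this diff): drain up to `n` items without blocking.
    items: List[T] = []
    for _ in range(n):
        try:
            items.append(q.get_nowait())
        except queue.Empty:
            break
    return items


q: "queue.Queue[int]" = queue.Queue()
for i in range(3):
    q.put(i)
assert get_n(q, 256) == [0, 1, 2]  # a XET-sized batch takes whatever is ready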
huggingface_hub/cli/__init__.py
ADDED

@@ -0,0 +1,27 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from abc import ABC, abstractmethod
+from argparse import _SubParsersAction
+
+
+class BaseHuggingfaceCLICommand(ABC):
+    @staticmethod
+    @abstractmethod
+    def register_subcommand(parser: _SubParsersAction):
+        raise NotImplementedError()
+
+    @abstractmethod
+    def run(self):
+        raise NotImplementedError()
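A minimal sketch of how a concrete command could plug into this ABC; the `hello` command and its argparse wiring are hypothetical, for illustration only, assuming the new `huggingface_hub.cli` package is importable.

from argparse import ArgumentParser, Namespace, _SubParsersAction

from huggingface_hub.cli import BaseHuggingfaceCLICommand


class HelloCommand(BaseHuggingfaceCLICommand):
    # Hypothetical command, not part of the package.
    @staticmethod
    def register_subcommand(parser: _SubParsersAction):
        hello = parser.add_parser("hello", help="Print a greeting")
        hello.add_argument("--name", default="world")
        # Defer construction until the subcommand is actually selected.
        hello.set_defaults(func=lambda args: HelloCommand(args))

    def __init__(self, args: Namespace) -> None:
        self._name = args.name

    def run(self):
        print(f"Hello, {self._name}!")


parser = ArgumentParser("hf")
HelloCommand.register_subcommand(parser.add_subparsers())
args = parser.parse_args(["hello", "--name", "hub"])
args.func(args).run()  # Hello, hub!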
huggingface_hub/cli/_cli_utils.py
ADDED

@@ -0,0 +1,69 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains a utility for good-looking prints."""
+
+import os
+from typing import List, Union
+
+
+class ANSI:
+    """
+    Helper for en.wikipedia.org/wiki/ANSI_escape_code
+    """
+
+    _bold = "\u001b[1m"
+    _gray = "\u001b[90m"
+    _red = "\u001b[31m"
+    _reset = "\u001b[0m"
+    _yellow = "\u001b[33m"
+
+    @classmethod
+    def bold(cls, s: str) -> str:
+        return cls._format(s, cls._bold)
+
+    @classmethod
+    def gray(cls, s: str) -> str:
+        return cls._format(s, cls._gray)
+
+    @classmethod
+    def red(cls, s: str) -> str:
+        return cls._format(s, cls._bold + cls._red)
+
+    @classmethod
+    def yellow(cls, s: str) -> str:
+        return cls._format(s, cls._yellow)
+
+    @classmethod
+    def _format(cls, s: str, code: str) -> str:
+        if os.environ.get("NO_COLOR"):
+            # See https://no-color.org/
+            return s
+        return f"{code}{s}{cls._reset}"
+
+
+def tabulate(rows: List[List[Union[str, int]]], headers: List[str]) -> str:
+    """
+    Inspired by:
+
+    - stackoverflow.com/a/8356620/593036
+    - stackoverflow.com/questions/9535954/printing-lists-as-tabular-data
+    """
+    col_widths = [max(len(str(x)) for x in col) for col in zip(*rows, headers)]
+    row_format = ("{{:{}}} " * len(headers)).format(*col_widths)
+    lines = []
+    lines.append(row_format.format(*headers))
+    lines.append(row_format.format(*["-" * w for w in col_widths]))
+    for row in rows:
+        lines.append(row_format.format(*row))
+    return "\n".join(lines)