huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- huggingface_hub/__init__.py +145 -46
- huggingface_hub/_commit_api.py +168 -119
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +15 -12
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +239 -80
- huggingface_hub/_space_api.py +5 -5
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +172 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +13 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +38 -53
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +80 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +435 -351
- huggingface_hub/hf_api.py +2050 -1124
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +43 -63
- huggingface_hub/inference/_client.py +347 -434
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +397 -541
- huggingface_hub/inference/_generated/types/__init__.py +5 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +10 -10
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +82 -7
- huggingface_hub/inference/_providers/_common.py +129 -27
- huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
- huggingface_hub/inference/_providers/cerebras.py +1 -1
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +20 -3
- huggingface_hub/inference/_providers/fal_ai.py +183 -56
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +69 -30
- huggingface_hub/inference/_providers/hyperbolic.py +4 -4
- huggingface_hub/inference/_providers/nebius.py +33 -5
- huggingface_hub/inference/_providers/novita.py +5 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +3 -1
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +31 -13
- huggingface_hub/inference/_providers/sambanova.py +18 -4
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +20 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +57 -57
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +19 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +398 -239
- huggingface_hub/utils/_pagination.py +4 -4
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +61 -24
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +9 -9
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +64 -17
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +5 -4
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -474
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -314
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
- huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
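
The hunks reproduced below cover `huggingface_hub/_upload_large_folder.py`, the webhooks modules, and the `commands` → `cli` package rename. For orientation, here is a minimal sketch of the public entry point whose internals the first set of hunks reworks; the repo id and folder path are illustrative placeholders:

```python
from huggingface_hub import HfApi

api = HfApi()
# upload_large_folder() drives the worker pipeline patched below
# (sha256 -> get upload mode -> preupload LFS -> commit).
api.upload_large_folder(
    repo_id="username/my-dataset",  # placeholder
    repo_type="dataset",            # repo type is required for this API
    folder_path="./data",           # placeholder
)
```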
--- a/huggingface_hub/_upload_large_folder.py
+++ b/huggingface_hub/_upload_large_folder.py
@@ -24,15 +24,14 @@ import traceback
 from datetime import datetime
 from pathlib import Path
 from threading import Lock
-from typing import TYPE_CHECKING,
+from typing import TYPE_CHECKING, Any, Optional, Union
 from urllib.parse import quote
 
-from . import constants
 from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
 from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
 from .constants import DEFAULT_REVISION, REPO_TYPES
-from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
-from .utils.
+from .utils import DEFAULT_IGNORE_PATTERNS, _format_size, filter_repo_objects, tqdm
+from .utils._runtime import is_xet_available
 from .utils.sha import sha_fileobj
 
 
@@ -42,9 +41,113 @@ if TYPE_CHECKING:
 logger = logging.getLogger(__name__)
 
 WAITING_TIME_IF_NO_TASKS = 10  # seconds
-
-
-
+MAX_NB_FILES_FETCH_UPLOAD_MODE = 100
+COMMIT_SIZE_SCALE: list[int] = [20, 50, 75, 100, 125, 200, 250, 400, 600, 1000]
+
+UPLOAD_BATCH_SIZE_XET = 256  # Max 256 files per upload batch for XET-enabled repos
+UPLOAD_BATCH_SIZE_LFS = 1  # Otherwise, batches of 1 for regular LFS upload
+
+# Repository limits (from https://huggingface.co/docs/hub/repositories-recommendations)
+MAX_FILES_PER_REPO = 100_000  # Recommended maximum number of files per repository
+MAX_FILES_PER_FOLDER = 10_000  # Recommended maximum number of files per folder
+MAX_FILE_SIZE_GB = 50  # Hard limit for individual file size
+RECOMMENDED_FILE_SIZE_GB = 20  # Recommended maximum for individual file size
+
+
+def _validate_upload_limits(paths_list: list[LocalUploadFilePaths]) -> None:
+    """
+    Validate upload against repository limits and warn about potential issues.
+
+    Args:
+        paths_list: List of file paths to be uploaded
+
+    Warns about:
+    - Too many files in the repository (>100k)
+    - Too many entries (files or subdirectories) in a single folder (>10k)
+    - Files exceeding size limits (>20GB recommended, >50GB hard limit)
+    """
+    logger.info("Running validation checks on files to upload...")
+
+    # Check 1: Total file count
+    if len(paths_list) > MAX_FILES_PER_REPO:
+        logger.warning(
+            f"You are about to upload {len(paths_list):,} files. "
+            f"This exceeds the recommended limit of {MAX_FILES_PER_REPO:,} files per repository.\n"
+            f"Consider:\n"
+            f" - Splitting your data into multiple repositories\n"
+            f" - Using fewer, larger files (e.g., parquet files)\n"
+            f" - See: https://huggingface.co/docs/hub/repositories-recommendations"
+        )
+
+    # Check 2: Files and subdirectories per folder
+    # Track immediate children (files and subdirs) for each folder
+    from collections import defaultdict
+
+    entries_per_folder: dict[str, Any] = defaultdict(lambda: {"files": 0, "subdirs": set()})
+
+    for paths in paths_list:
+        path = Path(paths.path_in_repo)
+        parts = path.parts
+
+        # Count this file in its immediate parent directory
+        parent = str(path.parent) if str(path.parent) != "." else "."
+        entries_per_folder[parent]["files"] += 1
+
+        # Track immediate subdirectories for each parent folder
+        # Walk through the path components to track parent-child relationships
+        for i, child in enumerate(parts[:-1]):
+            parent = "." if i == 0 else "/".join(parts[:i])
+            entries_per_folder[parent]["subdirs"].add(child)
+
+    # Check limits for each folder
+    for folder, data in entries_per_folder.items():
+        file_count = data["files"]
+        subdir_count = len(data["subdirs"])
+        total_entries = file_count + subdir_count
+
+        if total_entries > MAX_FILES_PER_FOLDER:
+            folder_display = "root" if folder == "." else folder
+            logger.warning(
+                f"Folder '{folder_display}' contains {total_entries:,} entries "
+                f"({file_count:,} files and {subdir_count:,} subdirectories). "
+                f"This exceeds the recommended {MAX_FILES_PER_FOLDER:,} entries per folder.\n"
+                "Consider reorganising into sub-folders."
+            )
+
+    # Check 3: File sizes
+    large_files = []
+    very_large_files = []
+
+    for paths in paths_list:
+        size = paths.file_path.stat().st_size
+        size_gb = size / 1_000_000_000  # Use decimal GB as per Hub limits
+
+        if size_gb > MAX_FILE_SIZE_GB:
+            very_large_files.append((paths.path_in_repo, size_gb))
+        elif size_gb > RECOMMENDED_FILE_SIZE_GB:
+            large_files.append((paths.path_in_repo, size_gb))
+
+    # Warn about very large files (>50GB)
+    if very_large_files:
+        files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in very_large_files[:5])
+        more_str = f"\n ... and {len(very_large_files) - 5} more files" if len(very_large_files) > 5 else ""
+        logger.warning(
+            f"Found {len(very_large_files)} files exceeding the {MAX_FILE_SIZE_GB}GB hard limit:\n"
+            f" - {files_str}{more_str}\n"
+            f"These files may fail to upload. Consider splitting them into smaller chunks."
+        )
+
+    # Warn about large files (>20GB)
+    if large_files:
+        files_str = "\n - ".join(f"{path}: {size:.1f}GB" for path, size in large_files[:5])
+        more_str = f"\n ... and {len(large_files) - 5} more files" if len(large_files) > 5 else ""
+        logger.warning(
+            f"Found {len(large_files)} files larger than {RECOMMENDED_FILE_SIZE_GB}GB (recommended limit):\n"
+            f" - {files_str}{more_str}\n"
+            f"Large files may slow down loading and processing."
+        )
+
+    logger.info("Validation checks complete.")
 
 
 def upload_large_folder_internal(
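
To make "Check 2" above concrete, here is a small standalone sketch of the same per-folder bookkeeping on a made-up path list; in the real function the paths come from `LocalUploadFilePaths.path_in_repo` and the totals are compared against `MAX_FILES_PER_FOLDER`:

```python
from collections import defaultdict
from pathlib import PurePosixPath

# Made-up repo paths, standing in for paths.path_in_repo.
paths_in_repo = ["README.md", "data/train/a.parquet", "data/train/b.parquet", "data/val/a.parquet"]

entries_per_folder = defaultdict(lambda: {"files": 0, "subdirs": set()})
for p in paths_in_repo:
    path = PurePosixPath(p)
    entries_per_folder[str(path.parent)]["files"] += 1  # "." means repo root
    # Register each path component as a subdirectory of its parent folder.
    for i, child in enumerate(path.parts[:-1]):
        parent = "." if i == 0 else "/".join(path.parts[:i])
        entries_per_folder[parent]["subdirs"].add(child)

for folder, data in sorted(entries_per_folder.items()):
    print(f"{folder}: {data['files']} files, {len(data['subdirs'])} subdirs")
# ".": 1 files, 1 subdirs / "data": 0 files, 2 subdirs / "data/train" and "data/val": 2 files each
```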
@@ -55,8 +158,8 @@ def upload_large_folder_internal(
     repo_type: str,  # Repo type is required!
     revision: Optional[str] = None,
     private: Optional[bool] = None,
-    allow_patterns: Optional[Union[
-    ignore_patterns: Optional[Union[
+    allow_patterns: Optional[Union[list[str], str]] = None,
+    ignore_patterns: Optional[Union[list[str], str]] = None,
     num_workers: Optional[int] = None,
     print_report: bool = True,
     print_report_every: int = 60,
@@ -94,6 +197,8 @@ def upload_large_folder_internal(
     repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
     logger.info(f"Repo created: {repo_url}")
     repo_id = repo_url.repo_id
+    # 2.1 Check if xet is enabled to set batch file upload size
+    upload_batch_size = UPLOAD_BATCH_SIZE_XET if is_xet_available() else UPLOAD_BATCH_SIZE_LFS
 
     # 3. List files to upload
     filtered_paths_list = filter_repo_objects(
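
As a rough illustration of what that batch-size switch changes downstream (the file count below is arbitrary): with Xet available, LFS files are pre-uploaded in batches of up to 256, otherwise one call per file.

```python
import math

UPLOAD_BATCH_SIZE_XET = 256  # values from the constants above
UPLOAD_BATCH_SIZE_LFS = 1

n_lfs_files = 1_000  # arbitrary example
for label, batch_size in [("xet", UPLOAD_BATCH_SIZE_XET), ("plain LFS", UPLOAD_BATCH_SIZE_LFS)]:
    print(f"{label}: {math.ceil(n_lfs_files / batch_size)} preupload batches")
# xet: 4 preupload batches, plain LFS: 1000 preupload batches
```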
@@ -104,6 +209,11 @@
     paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
     logger.info(f"Found {len(paths_list)} candidate files to upload")
 
+    # Validate upload against repository limits
+    _validate_upload_limits(paths_list)
+
+    logger.info("Starting upload...")
+
     # Read metadata for each file
     items = [
         (paths, read_upload_metadata(folder_path, paths.path_in_repo))
@@ -111,7 +221,7 @@
     ]
 
     # 4. Start workers
-    status = LargeUploadStatus(items)
+    status = LargeUploadStatus(items, upload_batch_size)
     threads = [
         threading.Thread(
             target=_worker_job,
@@ -163,13 +273,13 @@ class WorkerJob(enum.Enum):
     WAIT = enum.auto()  # if no tasks are available but we don't want to exit
 
 
-JOB_ITEM_T =
+JOB_ITEM_T = tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
 
 
 class LargeUploadStatus:
     """Contains information, queues and tasks for a large upload process."""
 
-    def __init__(self, items:
+    def __init__(self, items: list[JOB_ITEM_T], upload_batch_size: int = 1):
         self.items = items
         self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
         self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
@@ -180,6 +290,7 @@ class LargeUploadStatus:
         self.nb_workers_sha256: int = 0
         self.nb_workers_get_upload_mode: int = 0
         self.nb_workers_preupload_lfs: int = 0
+        self.upload_batch_size: int = upload_batch_size
         self.nb_workers_commit: int = 0
         self.nb_workers_waiting: int = 0
         self.last_commit_attempt: Optional[float] = None
@@ -301,7 +412,7 @@ def _worker_job(
     Read `upload_large_folder` docstring for more information on how tasks are prioritized.
     """
     while True:
-        next_job: Optional[
+        next_job: Optional[tuple[WorkerJob, list[JOB_ITEM_T]]] = None
 
         # Determine next task
         next_job = _determine_next_job(status)
@@ -354,16 +465,17 @@
                 status.nb_workers_get_upload_mode -= 1
 
         elif job == WorkerJob.PREUPLOAD_LFS:
-            item = items[0]  # single item
             try:
-                _preupload_lfs(
-
+                _preupload_lfs(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+                for item in items:
+                    status.queue_commit.put(item)
             except KeyboardInterrupt:
                 raise
             except Exception as e:
                 logger.error(f"Failed to preupload LFS: {e}")
                 traceback.format_exc()
-
+                for item in items:
+                    status.queue_preupload_lfs.put(item)
 
         with status.lock:
             status.nb_workers_preupload_lfs -= 1
@@ -393,7 +505,7 @@
                 status.nb_workers_waiting -= 1
 
 
-def _determine_next_job(status: LargeUploadStatus) -> Optional[
+def _determine_next_job(status: LargeUploadStatus) -> Optional[tuple[WorkerJob, list[JOB_ITEM_T]]]:
     with status.lock:
         # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
         if (
@@ -404,25 +516,25 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         ):
             status.nb_workers_commit += 1
             logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
-            return (WorkerJob.COMMIT,
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
         # 2. Commit if at least 100 files are ready to commit
         elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 150:
             status.nb_workers_commit += 1
             logger.debug("Job: commit (>100 files ready)")
-            return (WorkerJob.COMMIT,
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
-        # 3. Get upload mode if at least
-        elif status.queue_get_upload_mode.qsize() >=
+        # 3. Get upload mode if at least 100 files
+        elif status.queue_get_upload_mode.qsize() >= MAX_NB_FILES_FETCH_UPLOAD_MODE:
             status.nb_workers_get_upload_mode += 1
-            logger.debug("Job: get upload mode (>
-            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode,
+            logger.debug(f"Job: get upload mode (>{MAX_NB_FILES_FETCH_UPLOAD_MODE} files ready)")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-        # 4. Preupload LFS file if at least
-        elif status.queue_preupload_lfs.qsize()
+        # 4. Preupload LFS file if at least `status.upload_batch_size` files and no worker is preuploading LFS
+        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size and status.nb_workers_preupload_lfs == 0:
             status.nb_workers_preupload_lfs += 1
             logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
-            return (WorkerJob.PREUPLOAD_LFS,
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
 
         # 5. Compute sha256 if at least 1 file and no worker is computing sha256
         elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
@@ -434,16 +546,13 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
             status.nb_workers_get_upload_mode += 1
             logger.debug("Job: get upload mode (no other worker getting upload mode)")
-            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode,
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-        # 7. Preupload LFS file if at least
-
-        elif status.queue_preupload_lfs.qsize() > 0 and (
-            status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
-        ):
+        # 7. Preupload LFS file if at least `status.upload_batch_size` files
+        elif status.queue_preupload_lfs.qsize() >= status.upload_batch_size:
             status.nb_workers_preupload_lfs += 1
             logger.debug("Job: preupload LFS")
-            return (WorkerJob.PREUPLOAD_LFS,
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
 
         # 8. Compute sha256 if at least 1 file
         elif status.queue_sha256.qsize() > 0:
@@ -455,9 +564,15 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         elif status.queue_get_upload_mode.qsize() > 0:
            status.nb_workers_get_upload_mode += 1
            logger.debug("Job: get upload mode")
-            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode,
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, MAX_NB_FILES_FETCH_UPLOAD_MODE))
 
-        # 10.
+        # 10. Preupload LFS file if at least 1 file
+        elif status.queue_preupload_lfs.qsize() > 0:
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS")
+            return (WorkerJob.PREUPLOAD_LFS, _get_n(status.queue_preupload_lfs, status.upload_batch_size))
+
+        # 11. Commit if at least 1 file and 1 min since last commit attempt
         elif (
             status.nb_workers_commit == 0
             and status.queue_commit.qsize() > 0
@@ -466,9 +581,9 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         ):
             status.nb_workers_commit += 1
             logger.debug("Job: commit (1 min since last commit attempt)")
-            return (WorkerJob.COMMIT,
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
-        #
+        # 12. Commit if at least 1 file all other queues are empty and all workers are waiting
         #     e.g. when it's the last commit
         elif (
             status.nb_workers_commit == 0
@@ -482,14 +597,14 @@ def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob,
         ):
             status.nb_workers_commit += 1
             logger.debug("Job: commit")
-            return (WorkerJob.COMMIT,
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, status.target_chunk()))
 
-        #
+        # 13. If all queues are empty, exit
         elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
             logger.info("All files have been processed! Exiting worker.")
             return None
 
-        #
+        # 14. If no task is available, wait
         else:
             status.nb_workers_waiting += 1
             logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
@@ -510,7 +625,7 @@ def _compute_sha256(item: JOB_ITEM_T) -> None:
     metadata.save(paths)
 
 
-def _get_upload_mode(items:
+def _get_upload_mode(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
     """Get upload mode for each file and update metadata.
 
     Also receive info if the file should be ignored.
@@ -522,30 +637,32 @@ def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_t
         repo_id=repo_id,
         headers=api._build_hf_headers(),
         revision=quote(revision, safe=""),
+        endpoint=api.endpoint,
     )
     for item, addition in zip(items, additions):
         paths, metadata = item
         metadata.upload_mode = addition._upload_mode
         metadata.should_ignore = addition._should_ignore
+        metadata.remote_oid = addition._remote_oid
         metadata.save(paths)
 
 
-def _preupload_lfs(
-    """Preupload LFS
-
-    addition = _build_hacky_operation(item)
+def _preupload_lfs(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Preupload LFS files and update metadata."""
+    additions = [_build_hacky_operation(item) for item in items]
     api.preupload_lfs_files(
         repo_id=repo_id,
         repo_type=repo_type,
         revision=revision,
-        additions=
+        additions=additions,
     )
 
-    metadata
-
+    for paths, metadata in items:
+        metadata.is_uploaded = True
+        metadata.save(paths)
 
 
-def _commit(items:
+def _commit(items: list[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
     """Commit files to the repo."""
     additions = [_build_hacky_operation(item) for item in items]
     api.create_commit(
@@ -579,6 +696,9 @@ def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
     if metadata.sha256 is None:
         raise ValueError("sha256 must have been computed by now!")
     operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
+    operation._upload_mode = metadata.upload_mode  # type: ignore[assignment]
+    operation._should_ignore = metadata.should_ignore
+    operation._remote_oid = metadata.remote_oid
     return operation
 
 
@@ -587,38 +707,14 @@ def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
 ####################
 
 
-def _get_one(queue: "queue.Queue[JOB_ITEM_T]") ->
+def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> list[JOB_ITEM_T]:
     return [queue.get()]
 
 
-def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) ->
+def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> list[JOB_ITEM_T]:
     return [queue.get() for _ in range(min(queue.qsize(), n))]
 
 
-def _get_items_to_commit(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
-    """Special case for commit job: the number of items to commit depends on the type of files."""
-    # Can take at most 50 regular files and/or 100 LFS files in a single commit
-    items: List[JOB_ITEM_T] = []
-    nb_lfs, nb_regular = 0, 0
-    while True:
-        # If empty queue => commit everything
-        if queue.qsize() == 0:
-            return items
-
-        # If we have enough items => commit them
-        if nb_lfs >= MAX_NB_LFS_FILES_PER_COMMIT or nb_regular >= MAX_NB_REGULAR_FILES_PER_COMMIT:
-            return items
-
-        # Else, get a new item and increase counter
-        item = queue.get()
-        items.append(item)
-        _, metadata = item
-        if metadata.upload_mode == "lfs":
-            nb_lfs += 1
-        else:
-            nb_regular += 1
-
-
 def _print_overwrite(report: str) -> None:
     """Print a report, overwriting the previous lines.
 
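
With the old `_get_items_to_commit` heuristic gone, batch sizes now come from `_get_n` (fed by `status.target_chunk()` or `status.upload_batch_size`), which simply drains up to `n` items without blocking. A minimal standalone illustration of that behaviour (queue contents are arbitrary):

```python
import queue

def get_n(q: "queue.Queue[int]", n: int) -> list[int]:
    # Same idea as _get_n above: take at most n items, never block on an empty queue.
    return [q.get() for _ in range(min(q.qsize(), n))]

q: "queue.Queue[int]" = queue.Queue()
for i in range(5):
    q.put(i)

print(get_n(q, 3))  # [0, 1, 2]
print(get_n(q, 3))  # [3, 4] -- only two items were left
```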
--- a/huggingface_hub/_webhooks_payload.py
+++ b/huggingface_hub/_webhooks_payload.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Contains data structures to parse the webhooks payload."""
 
-from typing import
+from typing import Literal, Optional
 
 from .utils import is_pydantic_available
 
@@ -116,7 +116,7 @@ class WebhookPayloadRepo(ObjectId):
     name: str
     private: bool
     subdomain: Optional[str] = None
-    tags: Optional[
+    tags: Optional[list[str]] = None
     type: Literal["dataset", "model", "space"]
     url: WebhookPayloadUrl
 
@@ -134,4 +134,4 @@ class WebhookPayload(BaseModel):
     comment: Optional[WebhookPayloadComment] = None
     webhook: WebhookPayloadWebhook
     movedTo: Optional[WebhookPayloadMovedTo] = None
-    updatedRefs: Optional[
+    updatedRefs: Optional[list[WebhookPayloadUpdatedRef]] = None
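
The `Optional[list[str]]` annotation above reflects a change visible throughout this diff: `typing.List`/`Dict`/`Tuple` are replaced by the built-in generics available since Python 3.9. A two-line illustration of the old and new spellings:

```python
from typing import List, Optional

tags_old: Optional[List[str]] = None  # 0.x spelling via typing.List
tags_new: Optional[list[str]] = None  # 1.x spelling via the built-in generic (Python 3.9+)
```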
--- a/huggingface_hub/_webhooks_server.py
+++ b/huggingface_hub/_webhooks_server.py
@@ -18,7 +18,7 @@ import atexit
 import inspect
 import os
 from functools import wraps
-from typing import TYPE_CHECKING, Any, Callable,
+from typing import TYPE_CHECKING, Any, Callable, Optional
 
 from .utils import experimental, is_fastapi_available, is_gradio_available
 
@@ -32,7 +32,7 @@ if is_fastapi_available():
     from fastapi.responses import JSONResponse
 else:
     # Will fail at runtime if FastAPI is not available
-    FastAPI = Request = JSONResponse = None  # type: ignore
+    FastAPI = Request = JSONResponse = None  # type: ignore
 
 
 _global_app: Optional["WebhooksServer"] = None
@@ -50,20 +50,14 @@ class WebhooksServer:
     It is recommended to accept [`WebhookPayload`] as the first argument of the webhook function. It is a Pydantic
     model that contains all the information about the webhook event. The data will be parsed automatically for you.
 
-    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to
+    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
     WebhooksServer and deploy it on a Space.
 
-
+    > [!WARNING]
+    > `WebhooksServer` is experimental. Its API is subject to change in the future.
 
-
-
-    </Tip>
-
-    <Tip warning={true}>
-
-    You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`).
-
-    </Tip>
+    > [!WARNING]
+    > You must have `gradio` installed to use `WebhooksServer` (`pip install --upgrade gradio`).
 
     Args:
         ui (`gradio.Blocks`, optional):
@@ -115,7 +109,7 @@ class WebhooksServer:
         self._ui = ui
 
         self.webhook_secret = webhook_secret or os.getenv("WEBHOOK_SECRET")
-        self.registered_webhooks:
+        self.registered_webhooks: dict[str, Callable] = {}
         _warn_on_empty_secret(self.webhook_secret)
 
     def add_webhook(self, path: Optional[str] = None) -> Callable:
@@ -237,20 +231,14 @@ def webhook_endpoint(path: Optional[str] = None) -> Callable:
     you can use [`WebhooksServer`] directly. You can register multiple webhook endpoints (to the same server) by using
     this decorator multiple times.
 
-    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to
+    Check out the [webhooks guide](../guides/webhooks_server) for a step-by-step tutorial on how to set up your
     server and deploy it on a Space.
 
-
-
-    `webhook_endpoint` is experimental. Its API is subject to change in the future.
-
-    </Tip>
-
-    <Tip warning={true}>
-
-    You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`).
+    > [!WARNING]
+    > `webhook_endpoint` is experimental. Its API is subject to change in the future.
 
-
+    > [!WARNING]
+    > You must have `gradio` installed to use `webhook_endpoint` (`pip install --upgrade gradio`).
 
     Args:
         path (`str`, optional):
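
For reference, a minimal sketch of the decorator documented in that docstring, adapted from the library's webhook examples; the filter condition and function body are illustrative, and `gradio` must be installed for the server to start:

```python
from huggingface_hub import WebhookPayload, webhook_endpoint

@webhook_endpoint  # registers a POST endpoint on a local WebhooksServer
async def trigger_training(payload: WebhookPayload) -> None:
    # Illustrative condition: react to updates on a dataset repo.
    if payload.repo.type == "dataset" and payload.event.action == "update":
        print(f"Dataset {payload.repo.name} was updated")
```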
--- a/huggingface_hub/commands/__init__.py
+++ b/huggingface_hub/cli/__init__.py
@@ -1,4 +1,4 @@
-# Copyright
+# Copyright 2025 The HuggingFace Team. All rights reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -11,17 +11,3 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-
-from abc import ABC, abstractmethod
-from argparse import _SubParsersAction
-
-
-class BaseHuggingfaceCLICommand(ABC):
-    @staticmethod
-    @abstractmethod
-    def register_subcommand(parser: _SubParsersAction):
-        raise NotImplementedError()
-
-    @abstractmethod
-    def run(self):
-        raise NotImplementedError()