huggingface-hub 0.24.7__py3-none-any.whl → 0.25.0rc0__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only.
Potentially problematic release.
- huggingface_hub/__init__.py +21 -1
- huggingface_hub/_commit_api.py +4 -4
- huggingface_hub/_inference_endpoints.py +13 -1
- huggingface_hub/_local_folder.py +191 -4
- huggingface_hub/_login.py +6 -6
- huggingface_hub/_snapshot_download.py +8 -17
- huggingface_hub/_space_api.py +5 -0
- huggingface_hub/_tensorboard_logger.py +29 -13
- huggingface_hub/_upload_large_folder.py +573 -0
- huggingface_hub/_webhooks_server.py +1 -1
- huggingface_hub/commands/_cli_utils.py +5 -0
- huggingface_hub/commands/download.py +8 -0
- huggingface_hub/commands/huggingface_cli.py +6 -1
- huggingface_hub/commands/lfs.py +2 -1
- huggingface_hub/commands/repo_files.py +2 -2
- huggingface_hub/commands/scan_cache.py +99 -57
- huggingface_hub/commands/tag.py +1 -1
- huggingface_hub/commands/upload.py +2 -1
- huggingface_hub/commands/upload_large_folder.py +129 -0
- huggingface_hub/commands/version.py +37 -0
- huggingface_hub/community.py +2 -2
- huggingface_hub/errors.py +218 -1
- huggingface_hub/fastai_utils.py +2 -3
- huggingface_hub/file_download.py +61 -62
- huggingface_hub/hf_api.py +758 -314
- huggingface_hub/hf_file_system.py +15 -23
- huggingface_hub/hub_mixin.py +27 -25
- huggingface_hub/inference/_client.py +78 -127
- huggingface_hub/inference/_generated/_async_client.py +169 -144
- huggingface_hub/inference/_generated/types/base.py +0 -9
- huggingface_hub/inference/_templating.py +2 -3
- huggingface_hub/inference_api.py +2 -2
- huggingface_hub/keras_mixin.py +2 -2
- huggingface_hub/lfs.py +7 -98
- huggingface_hub/repocard.py +6 -5
- huggingface_hub/repository.py +5 -5
- huggingface_hub/serialization/_torch.py +64 -11
- huggingface_hub/utils/__init__.py +13 -14
- huggingface_hub/utils/_cache_manager.py +97 -14
- huggingface_hub/utils/_fixes.py +18 -2
- huggingface_hub/utils/_http.py +228 -2
- huggingface_hub/utils/_lfs.py +110 -0
- huggingface_hub/utils/_runtime.py +7 -1
- huggingface_hub/utils/_token.py +3 -2
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/METADATA +2 -2
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/RECORD +50 -48
- huggingface_hub/inference/_types.py +0 -52
- huggingface_hub/utils/_errors.py +0 -397
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.24.7.dist-info → huggingface_hub-0.25.0rc0.dist-info}/top_level.txt +0 -0
huggingface_hub/_upload_large_folder.py ADDED
@@ -0,0 +1,573 @@
+# coding=utf-8
+# Copyright 2024-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import enum
+import logging
+import os
+import queue
+import shutil
+import sys
+import threading
+import time
+import traceback
+from datetime import datetime
+from pathlib import Path
+from threading import Lock
+from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+
+from . import constants
+from ._commit_api import CommitOperationAdd, UploadInfo, _fetch_upload_modes
+from ._local_folder import LocalUploadFileMetadata, LocalUploadFilePaths, get_local_upload_paths, read_upload_metadata
+from .constants import DEFAULT_REVISION, REPO_TYPES
+from .utils import DEFAULT_IGNORE_PATTERNS, filter_repo_objects, tqdm
+from .utils._cache_manager import _format_size
+from .utils.sha import sha_fileobj
+
+
+if TYPE_CHECKING:
+    from .hf_api import HfApi
+
+logger = logging.getLogger(__name__)
+
+WAITING_TIME_IF_NO_TASKS = 10  # seconds
+
+
+def upload_large_folder_internal(
+    api: "HfApi",
+    repo_id: str,
+    folder_path: Union[str, Path],
+    *,
+    repo_type: str,  # Repo type is required!
+    revision: Optional[str] = None,
+    private: bool = False,
+    allow_patterns: Optional[Union[List[str], str]] = None,
+    ignore_patterns: Optional[Union[List[str], str]] = None,
+    num_workers: Optional[int] = None,
+    print_report: bool = True,
+    print_report_every: int = 60,
+):
+    """Upload a large folder to the Hub in the most resilient way possible.
+
+    See [`HfApi.upload_large_folder`] for the full documentation.
+    """
+    # 1. Check args and setup
+    if repo_type is None:
+        raise ValueError(
+            "For large uploads, `repo_type` is explicitly required. Please set it to `model`, `dataset` or `space`."
+            " If you are using the CLI, pass it as `--repo-type=model`."
+        )
+    if repo_type not in REPO_TYPES:
+        raise ValueError(f"Invalid repo type, must be one of {REPO_TYPES}")
+    if revision is None:
+        revision = DEFAULT_REVISION
+
+    folder_path = Path(folder_path).expanduser().resolve()
+    if not folder_path.is_dir():
+        raise ValueError(f"Provided path: '{folder_path}' is not a directory")
+
+    if ignore_patterns is None:
+        ignore_patterns = []
+    elif isinstance(ignore_patterns, str):
+        ignore_patterns = [ignore_patterns]
+    ignore_patterns += DEFAULT_IGNORE_PATTERNS
+
+    if num_workers is None:
+        nb_cores = os.cpu_count() or 1
+        num_workers = max(nb_cores - 2, 2)  # Use all but 2 cores, or at least 2 cores
+
+    # 2. Create repo if missing
+    repo_url = api.create_repo(repo_id=repo_id, repo_type=repo_type, private=private, exist_ok=True)
+    logger.info(f"Repo created: {repo_url}")
+    repo_id = repo_url.repo_id
+
+    # 3. List files to upload
+    filtered_paths_list = filter_repo_objects(
+        (path.relative_to(folder_path).as_posix() for path in folder_path.glob("**/*") if path.is_file()),
+        allow_patterns=allow_patterns,
+        ignore_patterns=ignore_patterns,
+    )
+    paths_list = [get_local_upload_paths(folder_path, relpath) for relpath in filtered_paths_list]
+    logger.info(f"Found {len(paths_list)} candidate files to upload")
+
+    # Read metadata for each file
+    items = [
+        (paths, read_upload_metadata(folder_path, paths.path_in_repo))
+        for paths in tqdm(paths_list, desc="Recovering from metadata files")
+    ]
+
+    # 4. Start workers
+    status = LargeUploadStatus(items)
+    threads = [
+        threading.Thread(
+            target=_worker_job,
+            kwargs={
+                "status": status,
+                "api": api,
+                "repo_id": repo_id,
+                "repo_type": repo_type,
+                "revision": revision,
+            },
+        )
+        for _ in range(num_workers)
+    ]
+
+    for thread in threads:
+        thread.start()
+
+    # 5. Print regular reports
+    if print_report:
+        print("\n\n" + status.current_report())
+    last_report_ts = time.time()
+    while True:
+        time.sleep(1)
+        if time.time() - last_report_ts >= print_report_every:
+            if print_report:
+                _print_overwrite(status.current_report())
+            last_report_ts = time.time()
+        if status.is_done():
+            logging.info("Is done: exiting main loop")
+            break
+
+    for thread in threads:
+        thread.join()
+
+    logger.info(status.current_report())
+    logging.info("Upload is complete!")
+
+
+####################
+# Logic to manage workers and synchronize tasks
+####################
+
+
+class WorkerJob(enum.Enum):
+    SHA256 = enum.auto()
+    GET_UPLOAD_MODE = enum.auto()
+    PREUPLOAD_LFS = enum.auto()
+    COMMIT = enum.auto()
+    WAIT = enum.auto()  # if no tasks are available but we don't want to exit
+
+
+JOB_ITEM_T = Tuple[LocalUploadFilePaths, LocalUploadFileMetadata]
+
+
+class LargeUploadStatus:
+    """Contains information, queues and tasks for a large upload process."""
+
+    def __init__(self, items: List[JOB_ITEM_T]):
+        self.items = items
+        self.queue_sha256: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_get_upload_mode: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_preupload_lfs: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.queue_commit: "queue.Queue[JOB_ITEM_T]" = queue.Queue()
+        self.lock = Lock()
+
+        self.nb_workers_sha256: int = 0
+        self.nb_workers_get_upload_mode: int = 0
+        self.nb_workers_preupload_lfs: int = 0
+        self.nb_workers_commit: int = 0
+        self.nb_workers_waiting: int = 0
+        self.last_commit_attempt: Optional[float] = None
+
+        self._started_at = datetime.now()
+
+        # Setup queues
+        for item in self.items:
+            paths, metadata = item
+            if metadata.sha256 is None:
+                self.queue_sha256.put(item)
+            elif metadata.upload_mode is None:
+                self.queue_get_upload_mode.put(item)
+            elif metadata.upload_mode == "lfs" and not metadata.is_uploaded:
+                self.queue_preupload_lfs.put(item)
+            elif not metadata.is_committed:
+                self.queue_commit.put(item)
+            else:
+                logger.debug(f"Skipping file {paths.path_in_repo} (already uploaded and committed)")
+
+    def current_report(self) -> str:
+        """Generate a report of the current status of the large upload."""
+        nb_hashed = 0
+        size_hashed = 0
+        nb_preuploaded = 0
+        nb_lfs = 0
+        nb_lfs_unsure = 0
+        size_preuploaded = 0
+        nb_committed = 0
+        size_committed = 0
+        total_size = 0
+        ignored_files = 0
+        total_files = 0
+
+        with self.lock:
+            for _, metadata in self.items:
+                if metadata.should_ignore:
+                    ignored_files += 1
+                    continue
+                total_size += metadata.size
+                total_files += 1
+                if metadata.sha256 is not None:
+                    nb_hashed += 1
+                    size_hashed += metadata.size
+                if metadata.upload_mode == "lfs":
+                    nb_lfs += 1
+                if metadata.upload_mode is None:
+                    nb_lfs_unsure += 1
+                if metadata.is_uploaded:
+                    nb_preuploaded += 1
+                    size_preuploaded += metadata.size
+                if metadata.is_committed:
+                    nb_committed += 1
+                    size_committed += metadata.size
+        total_size_str = _format_size(total_size)
+
+        now = datetime.now()
+        now_str = now.strftime("%Y-%m-%d %H:%M:%S")
+        elapsed = now - self._started_at
+        elapsed_str = str(elapsed).split(".")[0]  # remove milliseconds
+
+        message = "\n" + "-" * 10
+        message += f" {now_str} ({elapsed_str}) "
+        message += "-" * 10 + "\n"
+
+        message += "Files: "
+        message += f"hashed {nb_hashed}/{total_files} ({_format_size(size_hashed)}/{total_size_str}) | "
+        message += f"pre-uploaded: {nb_preuploaded}/{nb_lfs} ({_format_size(size_preuploaded)}/{total_size_str})"
+        if nb_lfs_unsure > 0:
+            message += f" (+{nb_lfs_unsure} unsure)"
+        message += f" | committed: {nb_committed}/{total_files} ({_format_size(size_committed)}/{total_size_str})"
+        message += f" | ignored: {ignored_files}\n"
+
+        message += "Workers: "
+        message += f"hashing: {self.nb_workers_sha256} | "
+        message += f"get upload mode: {self.nb_workers_get_upload_mode} | "
+        message += f"pre-uploading: {self.nb_workers_preupload_lfs} | "
+        message += f"committing: {self.nb_workers_commit} | "
+        message += f"waiting: {self.nb_workers_waiting}\n"
+        message += "-" * 51
+
+        return message
+
+    def is_done(self) -> bool:
+        with self.lock:
+            return all(metadata.is_committed or metadata.should_ignore for _, metadata in self.items)
+
+
+def _worker_job(
+    status: LargeUploadStatus,
+    api: "HfApi",
+    repo_id: str,
+    repo_type: str,
+    revision: str,
+):
+    """
+    Main process for a worker. The worker will perform tasks based on the priority list until all files are uploaded
+    and committed. If no tasks are available, the worker will wait for 10 seconds before checking again.
+
+    If a task fails for any reason, the item(s) are put back in the queue for another worker to pick up.
+
+    Read `upload_large_folder` docstring for more information on how tasks are prioritized.
+    """
+    while True:
+        next_job: Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]] = None
+
+        # Determine next task
+        next_job = _determine_next_job(status)
+        if next_job is None:
+            return
+        job, items = next_job
+
+        # Perform task
+        if job == WorkerJob.SHA256:
+            item = items[0]  # single item
+            try:
+                _compute_sha256(item)
+                status.queue_get_upload_mode.put(item)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to compute sha256: {e}")
+                traceback.format_exc()
+                status.queue_sha256.put(item)
+
+            with status.lock:
+                status.nb_workers_sha256 -= 1
+
+        elif job == WorkerJob.GET_UPLOAD_MODE:
+            try:
+                _get_upload_mode(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to get upload mode: {e}")
+                traceback.format_exc()
+
+            # Items are either:
+            # - dropped (if should_ignore)
+            # - put in LFS queue (if LFS)
+            # - put in commit queue (if regular)
+            # - or put back (if error occurred).
+            for item in items:
+                _, metadata = item
+                if metadata.should_ignore:
+                    continue
+                if metadata.upload_mode == "lfs":
+                    status.queue_preupload_lfs.put(item)
+                elif metadata.upload_mode == "regular":
+                    status.queue_commit.put(item)
+                else:
+                    status.queue_get_upload_mode.put(item)
+
+            with status.lock:
+                status.nb_workers_get_upload_mode -= 1
+
+        elif job == WorkerJob.PREUPLOAD_LFS:
+            item = items[0]  # single item
+            try:
+                _preupload_lfs(item, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+                status.queue_commit.put(item)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to preupload LFS: {e}")
+                traceback.format_exc()
+                status.queue_preupload_lfs.put(item)
+
+            with status.lock:
+                status.nb_workers_preupload_lfs -= 1
+
+        elif job == WorkerJob.COMMIT:
+            try:
+                _commit(items, api=api, repo_id=repo_id, repo_type=repo_type, revision=revision)
+            except KeyboardInterrupt:
+                raise
+            except Exception as e:
+                logger.error(f"Failed to commit: {e}")
+                traceback.format_exc()
+                for item in items:
+                    status.queue_commit.put(item)
+            with status.lock:
+                status.last_commit_attempt = time.time()
+                status.nb_workers_commit -= 1
+
+        elif job == WorkerJob.WAIT:
+            time.sleep(WAITING_TIME_IF_NO_TASKS)
+            with status.lock:
+                status.nb_workers_waiting -= 1
+
+
+def _determine_next_job(status: LargeUploadStatus) -> Optional[Tuple[WorkerJob, List[JOB_ITEM_T]]]:
+    with status.lock:
+        # 1. Commit if more than 5 minutes since last commit attempt (and at least 1 file)
+        if (
+            status.nb_workers_commit == 0
+            and status.queue_commit.qsize() > 0
+            and (status.last_commit_attempt is None or time.time() - status.last_commit_attempt > 5 * 60)
+        ):
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit (more than 5 minutes since last commit attempt)")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
+
+        # 2. Commit if at least 25 files are ready to commit
+        elif status.nb_workers_commit == 0 and status.queue_commit.qsize() >= 25:
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit (>25 files ready)")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
+
+        # 3. Get upload mode if at least 10 files
+        elif status.queue_get_upload_mode.qsize() >= 10:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug("Job: get upload mode (>10 files ready)")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
+
+        # 4. Preupload LFS file if at least 1 file and no worker is preuploading LFS
+        elif status.queue_preupload_lfs.qsize() > 0 and status.nb_workers_preupload_lfs == 0:
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS (no other worker preuploading LFS)")
+            return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))
+
+        # 5. Compute sha256 if at least 1 file and no worker is computing sha256
+        elif status.queue_sha256.qsize() > 0 and status.nb_workers_sha256 == 0:
+            status.nb_workers_sha256 += 1
+            logger.debug("Job: sha256 (no other worker computing sha256)")
+            return (WorkerJob.SHA256, _get_one(status.queue_sha256))
+
+        # 6. Get upload mode if at least 1 file and no worker is getting upload mode
+        elif status.queue_get_upload_mode.qsize() > 0 and status.nb_workers_get_upload_mode == 0:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug("Job: get upload mode (no other worker getting upload mode)")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
+
+        # 7. Preupload LFS file if at least 1 file
+        # Skip if hf_transfer is enabled and there is already a worker preuploading LFS
+        elif status.queue_preupload_lfs.qsize() > 0 and (
+            status.nb_workers_preupload_lfs == 0 or not constants.HF_HUB_ENABLE_HF_TRANSFER
+        ):
+            status.nb_workers_preupload_lfs += 1
+            logger.debug("Job: preupload LFS")
+            return (WorkerJob.PREUPLOAD_LFS, _get_one(status.queue_preupload_lfs))
+
+        # 8. Compute sha256 if at least 1 file
+        elif status.queue_sha256.qsize() > 0:
+            status.nb_workers_sha256 += 1
+            logger.debug("Job: sha256")
+            return (WorkerJob.SHA256, _get_one(status.queue_sha256))
+
+        # 9. Get upload mode if at least 1 file
+        elif status.queue_get_upload_mode.qsize() > 0:
+            status.nb_workers_get_upload_mode += 1
+            logger.debug("Job: get upload mode")
+            return (WorkerJob.GET_UPLOAD_MODE, _get_n(status.queue_get_upload_mode, 50))
+
+        # 10. Commit if at least 1 file
+        elif status.nb_workers_commit == 0 and status.queue_commit.qsize() > 0:
+            status.nb_workers_commit += 1
+            logger.debug("Job: commit")
+            return (WorkerJob.COMMIT, _get_n(status.queue_commit, 25))
+
+        # 11. If all queues are empty, exit
+        elif all(metadata.is_committed or metadata.should_ignore for _, metadata in status.items):
+            logger.info("All files have been processed! Exiting worker.")
+            return None
+
+        # 12. If no task is available, wait
+        else:
+            status.nb_workers_waiting += 1
+            logger.debug(f"No task available, waiting... ({WAITING_TIME_IF_NO_TASKS}s)")
+            return (WorkerJob.WAIT, [])
+
+
+####################
+# Atomic jobs (sha256, get_upload_mode, preupload_lfs, commit)
+####################
+
+
+def _compute_sha256(item: JOB_ITEM_T) -> None:
+    """Compute sha256 of a file and save it in metadata."""
+    paths, metadata = item
+    if metadata.sha256 is None:
+        with paths.file_path.open("rb") as f:
+            metadata.sha256 = sha_fileobj(f).hex()
+    metadata.save(paths)
+
+
+def _get_upload_mode(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Get upload mode for each file and update metadata.
+
+    Also receive info if the file should be ignored.
+    """
+    additions = [_build_hacky_operation(item) for item in items]
+    _fetch_upload_modes(
+        additions=additions,
+        repo_type=repo_type,
+        repo_id=repo_id,
+        headers=api._build_hf_headers(),
+        revision=revision,
+    )
+    for item, addition in zip(items, additions):
+        paths, metadata = item
+        metadata.upload_mode = addition._upload_mode
+        metadata.should_ignore = addition._should_ignore
+        metadata.save(paths)
+
+
+def _preupload_lfs(item: JOB_ITEM_T, api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Preupload LFS file and update metadata."""
+    paths, metadata = item
+    addition = _build_hacky_operation(item)
+    api.preupload_lfs_files(
+        repo_id=repo_id,
+        repo_type=repo_type,
+        revision=revision,
+        additions=[addition],
+    )
+
+    metadata.is_uploaded = True
+    metadata.save(paths)
+
+
+def _commit(items: List[JOB_ITEM_T], api: "HfApi", repo_id: str, repo_type: str, revision: str) -> None:
+    """Commit files to the repo."""
+    additions = [_build_hacky_operation(item) for item in items]
+    api.create_commit(
+        repo_id=repo_id,
+        repo_type=repo_type,
+        revision=revision,
+        operations=additions,
+        commit_message="Add files using upload-large-folder tool",
+    )
+    for paths, metadata in items:
+        metadata.is_committed = True
+        metadata.save(paths)
+
+
+####################
+# Hacks with CommitOperationAdd to bypass checks/sha256 calculation
+####################
+
+
+class HackyCommitOperationAdd(CommitOperationAdd):
+    def __post_init__(self) -> None:
+        if isinstance(self.path_or_fileobj, Path):
+            self.path_or_fileobj = str(self.path_or_fileobj)
+
+
+def _build_hacky_operation(item: JOB_ITEM_T) -> HackyCommitOperationAdd:
+    paths, metadata = item
+    operation = HackyCommitOperationAdd(path_in_repo=paths.path_in_repo, path_or_fileobj=paths.file_path)
+    with paths.file_path.open("rb") as file:
+        sample = file.peek(512)[:512]
+    if metadata.sha256 is None:
+        raise ValueError("sha256 must have been computed by now!")
+    operation.upload_info = UploadInfo(sha256=bytes.fromhex(metadata.sha256), size=metadata.size, sample=sample)
+    return operation
+
+
+####################
+# Misc helpers
+####################
+
+
+def _get_one(queue: "queue.Queue[JOB_ITEM_T]") -> List[JOB_ITEM_T]:
+    return [queue.get()]
+
+
+def _get_n(queue: "queue.Queue[JOB_ITEM_T]", n: int) -> List[JOB_ITEM_T]:
+    return [queue.get() for _ in range(min(queue.qsize(), n))]
+
+
+def _print_overwrite(report: str) -> None:
+    """Print a report, overwriting the previous lines.
+
+    Since tqdm in using `sys.stderr` to (re-)write progress bars, we need to use `sys.stdout`
+    to print the report.
+
+    Note: works well only if no other process is writing to `sys.stdout`!
+    """
+    report += "\n"
+    # Get terminal width
+    terminal_width = shutil.get_terminal_size().columns
+
+    # Count number of lines that should be cleared
+    nb_lines = sum(len(line) // terminal_width + 1 for line in report.splitlines())
+
+    # Clear previous lines based on the number of lines in the report
+    for _ in range(nb_lines):
+        sys.stdout.write("\r\033[K")  # Clear line
+        sys.stdout.write("\033[F")  # Move cursor up one line
+
+    # Print the new report, filling remaining space with whitespace
+    sys.stdout.write(report)
+    sys.stdout.write(" " * (terminal_width - len(report.splitlines()[-1])))
+    sys.stdout.flush()
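For readers who just want to use the new feature, here is a minimal usage sketch of the public entry point. It assumes that `HfApi.upload_large_folder` (referenced in the docstring above) exposes the same parameters as `upload_large_folder_internal`; the repo id and folder path below are placeholders.

    from huggingface_hub import HfApi

    api = HfApi()
    # Resumable, multi-threaded upload: progress is persisted in per-file metadata,
    # so re-running the same call resumes where the previous run stopped.
    api.upload_large_folder(
        repo_id="<username>/<repo_name>",     # placeholder
        folder_path="path/to/local/folder",   # placeholder
        repo_type="model",                    # required for large uploads
        num_workers=8,                        # defaults to max(cpu_count - 2, 2)
    )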
huggingface_hub/commands/_cli_utils.py CHANGED
@@ -26,6 +26,7 @@ class ANSI:
     _gray = "\u001b[90m"
     _red = "\u001b[31m"
     _reset = "\u001b[0m"
+    _yellow = "\u001b[33m"
 
     @classmethod
     def bold(cls, s: str) -> str:
@@ -39,6 +40,10 @@ class ANSI:
     def red(cls, s: str) -> str:
         return cls._format(s, cls._bold + cls._red)
 
+    @classmethod
+    def yellow(cls, s: str) -> str:
+        return cls._format(s, cls._yellow)
+
     @classmethod
     def _format(cls, s: str, code: str) -> str:
         if os.environ.get("NO_COLOR"):
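The new `ANSI.yellow` helper mirrors the existing `bold` and `red` formatters and, like them, is disabled when the `NO_COLOR` environment variable is set. A tiny usage sketch (the message text is illustrative):

    from huggingface_hub.commands._cli_utils import ANSI

    print(ANSI.yellow("Warning: this command is experimental."))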
huggingface_hub/commands/download.py CHANGED
@@ -112,6 +112,12 @@ class DownloadCommand(BaseHuggingfaceCLICommand):
             action="store_true",
             help="If True, progress bars are disabled and only the path to the download files is printed.",
         )
+        download_parser.add_argument(
+            "--max-workers",
+            type=int,
+            default=8,
+            help="Maximum number of workers to use for downloading files. Default is 8.",
+        )
         download_parser.set_defaults(func=DownloadCommand)
 
     def __init__(self, args: Namespace) -> None:
@@ -127,6 +133,7 @@ class DownloadCommand(BaseHuggingfaceCLICommand):
         self.force_download: bool = args.force_download
         self.resume_download: Optional[bool] = args.resume_download or None
         self.quiet: bool = args.quiet
+        self.max_workers: int = args.max_workers
 
         if args.local_dir_use_symlinks is not None:
             warnings.warn(
@@ -189,4 +196,5 @@ class DownloadCommand(BaseHuggingfaceCLICommand):
             token=self.token,
             local_dir=self.local_dir,
             library_name="huggingface-cli",
+            max_workers=self.max_workers,
         )
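The new `--max-workers` option is stored on the command and forwarded as `max_workers=...` to the underlying download call (last hunk above). For illustration, with a placeholder repo id:

    huggingface-cli download <repo_id> --max-workers 4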
huggingface_hub/commands/huggingface_cli.py CHANGED
@@ -22,7 +22,9 @@ from huggingface_hub.commands.repo_files import RepoFilesCommand
 from huggingface_hub.commands.scan_cache import ScanCacheCommand
 from huggingface_hub.commands.tag import TagCommands
 from huggingface_hub.commands.upload import UploadCommand
+from huggingface_hub.commands.upload_large_folder import UploadLargeFolderCommand
 from huggingface_hub.commands.user import UserCommands
+from huggingface_hub.commands.version import VersionCommand
 
 
 def main():
@@ -39,10 +41,13 @@ def main():
     ScanCacheCommand.register_subcommand(commands_parser)
     DeleteCacheCommand.register_subcommand(commands_parser)
     TagCommands.register_subcommand(commands_parser)
+    VersionCommand.register_subcommand(commands_parser)
+
+    # Experimental
+    UploadLargeFolderCommand.register_subcommand(commands_parser)
 
     # Let's go
     args = parser.parse_args()
-
     if not hasattr(args, "func"):
         parser.print_help()
         exit(1)
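For reference, the two subcommands wired up above could be invoked roughly as follows. The `version` command presumably just prints the installed huggingface_hub version, and only the `--repo-type` flag of the experimental `upload-large-folder` command is confirmed by the module shown earlier, so the positional arguments here are a sketch, not the definitive syntax:

    huggingface-cli version
    huggingface-cli upload-large-folder <repo_id> <local_path> --repo-type=model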
huggingface_hub/commands/lfs.py CHANGED
@@ -24,9 +24,10 @@ from argparse import _SubParsersAction
 from typing import Dict, List, Optional
 
 from huggingface_hub.commands import BaseHuggingfaceCLICommand
-from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND, SliceFileObj
+from huggingface_hub.lfs import LFS_MULTIPART_UPLOAD_COMMAND
 
 from ..utils import get_session, hf_raise_for_status, logging
+from ..utils._lfs import SliceFileObj
 
 
 logger = logging.get_logger(__name__)
huggingface_hub/commands/repo_files.py CHANGED
@@ -16,7 +16,7 @@
 
 Usage:
     # delete all
-    huggingface-cli repo-files <repo_id> delete *
+    huggingface-cli repo-files <repo_id> delete "*"
 
     # delete single file
     huggingface-cli repo-files <repo_id> delete file.txt
@@ -28,7 +28,7 @@ Usage:
     huggingface-cli repo-files <repo_id> delete file.txt folder/ file2.txt
 
     # delete multiple patterns
-    huggingface-cli repo-files <repo_id> delete file.txt *.json folder/*.parquet
+    huggingface-cli repo-files <repo_id> delete file.txt "*.json" "folder/*.parquet"
 
     # delete from different revision / repo-type
     huggingface-cli repo-files <repo_id> delete file.txt --revision=refs/pr/1 --repo-type=dataset