huggingface-hub 0.22.1__py3-none-any.whl → 0.23.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of huggingface-hub might be problematic.
- huggingface_hub/__init__.py +51 -19
- huggingface_hub/_commit_api.py +10 -9
- huggingface_hub/_commit_scheduler.py +2 -2
- huggingface_hub/_inference_endpoints.py +10 -17
- huggingface_hub/_local_folder.py +229 -0
- huggingface_hub/_login.py +4 -3
- huggingface_hub/_multi_commits.py +1 -1
- huggingface_hub/_snapshot_download.py +16 -38
- huggingface_hub/_tensorboard_logger.py +16 -6
- huggingface_hub/_webhooks_payload.py +22 -1
- huggingface_hub/_webhooks_server.py +24 -20
- huggingface_hub/commands/download.py +11 -34
- huggingface_hub/commands/huggingface_cli.py +2 -0
- huggingface_hub/commands/tag.py +159 -0
- huggingface_hub/constants.py +3 -5
- huggingface_hub/errors.py +58 -0
- huggingface_hub/file_download.py +545 -376
- huggingface_hub/hf_api.py +758 -629
- huggingface_hub/hf_file_system.py +14 -5
- huggingface_hub/hub_mixin.py +127 -43
- huggingface_hub/inference/_client.py +402 -183
- huggingface_hub/inference/_common.py +19 -29
- huggingface_hub/inference/_generated/_async_client.py +402 -184
- huggingface_hub/inference/_generated/types/__init__.py +23 -6
- huggingface_hub/inference/_generated/types/chat_completion.py +197 -43
- huggingface_hub/inference/_generated/types/text_generation.py +57 -79
- huggingface_hub/inference/_templating.py +2 -4
- huggingface_hub/keras_mixin.py +0 -3
- huggingface_hub/lfs.py +16 -4
- huggingface_hub/repository.py +1 -0
- huggingface_hub/utils/__init__.py +19 -6
- huggingface_hub/utils/_fixes.py +1 -0
- huggingface_hub/utils/_headers.py +2 -4
- huggingface_hub/utils/_http.py +16 -5
- huggingface_hub/utils/_paths.py +13 -1
- huggingface_hub/utils/_runtime.py +10 -0
- huggingface_hub/utils/_safetensors.py +0 -13
- huggingface_hub/utils/_validators.py +2 -7
- huggingface_hub/utils/tqdm.py +124 -46
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/METADATA +5 -1
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/RECORD +45 -43
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.22.1.dist-info → huggingface_hub-0.23.0.dist-info}/top_level.txt +0 -0
huggingface_hub/__init__.py
CHANGED
```diff
@@ -46,7 +46,7 @@ import sys
 from typing import TYPE_CHECKING
 
 
-__version__ = "0.22.1"
+__version__ = "0.23.0"
 
 # Alphabetical order of definitions is ensured in tests
 # WARNING: any comment added in this dictionary definition will be lost when
@@ -198,7 +198,6 @@ _SUBMOD_ATTRS = {
         "list_accepted_access_requests",
         "list_collections",
         "list_datasets",
-        "list_files_info",
         "list_inference_endpoints",
         "list_liked_repos",
         "list_metrics",
@@ -271,13 +270,28 @@ _SUBMOD_ATTRS = {
         "AutomaticSpeechRecognitionOutputChunk",
         "AutomaticSpeechRecognitionParameters",
         "ChatCompletionInput",
+        "ChatCompletionInputFunctionDefinition",
         "ChatCompletionInputMessage",
+        "ChatCompletionInputTool",
+        "ChatCompletionInputToolCall",
+        "ChatCompletionInputToolTypeClass",
         "ChatCompletionOutput",
-        "ChatCompletionOutputChoice",
-        "ChatCompletionOutputChoiceMessage",
+        "ChatCompletionOutputComplete",
+        "ChatCompletionOutputFunctionDefinition",
+        "ChatCompletionOutputLogprob",
+        "ChatCompletionOutputLogprobs",
+        "ChatCompletionOutputMessage",
+        "ChatCompletionOutputToolCall",
+        "ChatCompletionOutputTopLogprob",
+        "ChatCompletionOutputUsage",
         "ChatCompletionStreamOutput",
         "ChatCompletionStreamOutputChoice",
         "ChatCompletionStreamOutputDelta",
+        "ChatCompletionStreamOutputDeltaToolCall",
+        "ChatCompletionStreamOutputFunction",
+        "ChatCompletionStreamOutputLogprob",
+        "ChatCompletionStreamOutputLogprobs",
+        "ChatCompletionStreamOutputTopLogprob",
         "DepthEstimationInput",
         "DepthEstimationOutput",
         "DocumentQuestionAnsweringInput",
@@ -325,14 +339,16 @@ _SUBMOD_ATTRS = {
         "TextClassificationOutputElement",
         "TextClassificationParameters",
         "TextGenerationInput",
+        "TextGenerationInputGenerateParameters",
+        "TextGenerationInputGrammarType",
         "TextGenerationOutput",
+        "TextGenerationOutputBestOfSequence",
         "TextGenerationOutputDetails",
-        "TextGenerationOutputSequenceDetails",
+        "TextGenerationOutputPrefillToken",
         "TextGenerationOutputToken",
-        "TextGenerationParameters",
-        "TextGenerationPrefillToken",
-        "TextGenerationStreamDetails",
         "TextGenerationStreamOutput",
+        "TextGenerationStreamOutputStreamDetails",
+        "TextGenerationStreamOutputToken",
         "TextToAudioGenerationParameters",
         "TextToAudioInput",
         "TextToAudioOutput",
@@ -501,15 +517,15 @@ def _attach(package_name, submodules=None, submod_attrs=None):
     def __dir__():
         return __all__
 
-    if os.environ.get("EAGER_IMPORT", ""):
-        for attr in set(attr_to_modules.keys()) | submodules:
-            __getattr__(attr)
-
     return __getattr__, __dir__, list(__all__)
 
 
 __getattr__, __dir__, __all__ = _attach(__name__, submodules=[], submod_attrs=_SUBMOD_ATTRS)
 
+if os.environ.get("EAGER_IMPORT", ""):
+    for attr in __all__:
+        __getattr__(attr)
+
 # WARNING: any content below this statement is generated automatically. Any manual edit
 # will be lost when re-generating this file !
 #
@@ -662,7 +678,6 @@ if TYPE_CHECKING:  # pragma: no cover
         list_accepted_access_requests,  # noqa: F401
         list_collections,  # noqa: F401
         list_datasets,  # noqa: F401
-        list_files_info,  # noqa: F401
         list_inference_endpoints,  # noqa: F401
         list_liked_repos,  # noqa: F401
         list_metrics,  # noqa: F401
@@ -733,13 +748,28 @@ if TYPE_CHECKING:  # pragma: no cover
         AutomaticSpeechRecognitionOutputChunk,  # noqa: F401
         AutomaticSpeechRecognitionParameters,  # noqa: F401
         ChatCompletionInput,  # noqa: F401
+        ChatCompletionInputFunctionDefinition,  # noqa: F401
         ChatCompletionInputMessage,  # noqa: F401
+        ChatCompletionInputTool,  # noqa: F401
+        ChatCompletionInputToolCall,  # noqa: F401
+        ChatCompletionInputToolTypeClass,  # noqa: F401
         ChatCompletionOutput,  # noqa: F401
-        ChatCompletionOutputChoice,  # noqa: F401
-        ChatCompletionOutputChoiceMessage,  # noqa: F401
+        ChatCompletionOutputComplete,  # noqa: F401
+        ChatCompletionOutputFunctionDefinition,  # noqa: F401
+        ChatCompletionOutputLogprob,  # noqa: F401
+        ChatCompletionOutputLogprobs,  # noqa: F401
+        ChatCompletionOutputMessage,  # noqa: F401
+        ChatCompletionOutputToolCall,  # noqa: F401
+        ChatCompletionOutputTopLogprob,  # noqa: F401
+        ChatCompletionOutputUsage,  # noqa: F401
         ChatCompletionStreamOutput,  # noqa: F401
         ChatCompletionStreamOutputChoice,  # noqa: F401
         ChatCompletionStreamOutputDelta,  # noqa: F401
+        ChatCompletionStreamOutputDeltaToolCall,  # noqa: F401
+        ChatCompletionStreamOutputFunction,  # noqa: F401
+        ChatCompletionStreamOutputLogprob,  # noqa: F401
+        ChatCompletionStreamOutputLogprobs,  # noqa: F401
+        ChatCompletionStreamOutputTopLogprob,  # noqa: F401
         DepthEstimationInput,  # noqa: F401
         DepthEstimationOutput,  # noqa: F401
         DocumentQuestionAnsweringInput,  # noqa: F401
@@ -787,14 +817,16 @@ if TYPE_CHECKING:  # pragma: no cover
         TextClassificationOutputElement,  # noqa: F401
         TextClassificationParameters,  # noqa: F401
         TextGenerationInput,  # noqa: F401
+        TextGenerationInputGenerateParameters,  # noqa: F401
+        TextGenerationInputGrammarType,  # noqa: F401
         TextGenerationOutput,  # noqa: F401
+        TextGenerationOutputBestOfSequence,  # noqa: F401
         TextGenerationOutputDetails,  # noqa: F401
-        TextGenerationOutputSequenceDetails,  # noqa: F401
+        TextGenerationOutputPrefillToken,  # noqa: F401
         TextGenerationOutputToken,  # noqa: F401
-        TextGenerationParameters,  # noqa: F401
-        TextGenerationPrefillToken,  # noqa: F401
-        TextGenerationStreamDetails,  # noqa: F401
         TextGenerationStreamOutput,  # noqa: F401
+        TextGenerationStreamOutputStreamDetails,  # noqa: F401
+        TextGenerationStreamOutputToken,  # noqa: F401
         TextToAudioGenerationParameters,  # noqa: F401
         TextToAudioInput,  # noqa: F401
         TextToAudioOutput,  # noqa: F401
```
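A note on the `_attach` change above: `huggingface_hub` resolves its public names lazily through a module-level `__getattr__` (PEP 562), and 0.23.0 moves the `EAGER_IMPORT` escape hatch out of `_attach` to module level, iterating over the public `__all__` instead of the internal attribute map. A minimal sketch of the pattern, with a toy submodule mapping standing in for the real `_SUBMOD_ATTRS`:

```python
# Minimal sketch of the lazy-import pattern (PEP 562), meant to live in a
# package's __init__.py. The mapping below is illustrative, not the actual
# huggingface_hub internals.
import importlib
import os

_SUBMOD_ATTRS = {"hf_api": ["list_datasets"]}  # attribute -> owning submodule
_attr_to_module = {attr: mod for mod, attrs in _SUBMOD_ATTRS.items() for attr in attrs}
__all__ = list(_attr_to_module)


def __getattr__(name):
    # Import the owning submodule only on first attribute access.
    if name in _attr_to_module:
        submod = importlib.import_module(f"{__name__}.{_attr_to_module[name]}")
        return getattr(submod, name)
    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")


# The 0.23.0 twist: when EAGER_IMPORT is set, resolve everything up front by
# iterating over __all__ at module level, after __getattr__ is defined.
if os.environ.get("EAGER_IMPORT", ""):
    for attr in __all__:
        __getattr__(attr)
```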
huggingface_hub/_commit_api.py
CHANGED
```diff
@@ -15,14 +15,14 @@ from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List,
 
 from tqdm.contrib.concurrent import thread_map
 
-from huggingface_hub import get_session
-
 from .constants import ENDPOINT, HF_HUB_ENABLE_HF_TRANSFER
 from .file_download import hf_hub_url
 from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
 from .utils import (
+    FORBIDDEN_FOLDERS,
     EntryNotFoundError,
     chunk_iterable,
+    get_session,
     hf_raise_for_status,
     logging,
     tqdm_stream_file,
@@ -255,11 +255,12 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
         raise ValueError(f"Invalid `path_in_repo` in CommitOperation: '{path_in_repo}'")
     if path_in_repo.startswith("./"):
         path_in_repo = path_in_repo[2:]
-    if any(part == ".git" for part in path_in_repo.split("/")):
-        raise ValueError(
-            f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '.git/' folder (path:"
-            f" '{path_in_repo}')."
-        )
+    for forbidden in FORBIDDEN_FOLDERS:
+        if any(part == forbidden for part in path_in_repo.split("/")):
+            raise ValueError(
+                f"Invalid `path_in_repo` in CommitOperation: cannot update files under a '{forbidden}/' folder (path:"
+                f" '{path_in_repo}')."
+            )
     return path_in_repo
 
 
@@ -399,13 +400,13 @@ def _upload_lfs_files(
     def _wrapped_lfs_upload(batch_action) -> None:
         try:
             operation = oid2addop[batch_action["oid"]]
-            lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers)
+            lfs_upload(operation=operation, lfs_batch_action=batch_action, headers=headers, endpoint=endpoint)
         except Exception as exc:
             raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc
 
     if HF_HUB_ENABLE_HF_TRANSFER:
         logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
-        for action in hf_tqdm(filtered_actions):
+        for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
             _wrapped_lfs_upload(action)
     elif len(filtered_actions) == 1:
         logger.debug("Uploading 1 LFS file to the Hub")
```
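The `_validate_path_in_repo` change generalizes the previous hard-coded check: any `/`-separated segment equal to a forbidden folder name is now rejected. A self-contained sketch of the check; the `FORBIDDEN_FOLDERS` values below are assumed for illustration (the real list lives in `huggingface_hub.utils`):

```python
FORBIDDEN_FOLDERS = [".git", ".cache"]  # assumed values for illustration


def validate_path_in_repo(path_in_repo: str) -> str:
    # Reject any path whose '/'-separated segments contain a forbidden folder name.
    for forbidden in FORBIDDEN_FOLDERS:
        if any(part == forbidden for part in path_in_repo.split("/")):
            raise ValueError(f"cannot update files under a '{forbidden}/' folder (path: '{path_in_repo}')")
    return path_in_repo


print(validate_path_in_repo("models/.gitattributes"))  # ok: ".gitattributes" != ".git"
try:
    validate_path_in_repo("sub/.git/config")
except ValueError as err:
    print(err)  # rejected: one segment equals ".git"
```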
huggingface_hub/_commit_scheduler.py
CHANGED
```diff
@@ -9,7 +9,7 @@ from pathlib import Path
 from threading import Lock, Thread
 from typing import Dict, List, Optional, Union
 
-from .hf_api import
+from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
 from .utils import filter_repo_objects
 
 
@@ -107,7 +107,7 @@ class CommitScheduler:
             ignore_patterns = []
         elif isinstance(ignore_patterns, str):
             ignore_patterns = [ignore_patterns]
-        self.ignore_patterns = ignore_patterns +
+        self.ignore_patterns = ignore_patterns + DEFAULT_IGNORE_PATTERNS
 
         if self.folder_path.is_file():
             raise ValueError(f"'folder_path' must be a directory, not a file: '{self.folder_path}'.")
```
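For `CommitScheduler`, user-supplied `ignore_patterns` are now merged with `DEFAULT_IGNORE_PATTERNS` imported from `hf_api`. A hedged usage sketch; the repo name and timing below are placeholders:

```python
from huggingface_hub import CommitScheduler

# Background committer for a local folder; user patterns are merged with
# DEFAULT_IGNORE_PATTERNS internally in 0.23.0.
scheduler = CommitScheduler(
    repo_id="my-user/my-dataset",   # placeholder repo
    repo_type="dataset",
    folder_path="./data",
    ignore_patterns=["*.tmp"],      # merged with the library defaults
    every=10,                       # minutes between background commits
)
```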
huggingface_hub/_inference_endpoints.py
CHANGED
```diff
@@ -4,9 +4,11 @@ from datetime import datetime
 from enum import Enum
 from typing import TYPE_CHECKING, Dict, Optional, Union
 
+from huggingface_hub.errors import InferenceEndpointError, InferenceEndpointTimeoutError
+
 from .inference._client import InferenceClient
 from .inference._generated._async_client import AsyncInferenceClient
-from .utils import logging, parse_datetime
+from .utils import get_session, logging, parse_datetime
 
 
 if TYPE_CHECKING:
@@ -16,14 +18,6 @@ if TYPE_CHECKING:
 logger = logging.get_logger(__name__)
 
 
-class InferenceEndpointError(Exception):
-    """Generic exception when dealing with Inference Endpoints."""
-
-
-class InferenceEndpointTimeoutError(InferenceEndpointError, TimeoutError):
-    """Exception for timeouts while waiting for Inference Endpoint."""
-
-
 class InferenceEndpointStatus(str, Enum):
     PENDING = "pending"
     INITIALIZING = "initializing"
@@ -200,10 +194,6 @@ class InferenceEndpoint:
             [`InferenceEndpointTimeoutError`]
                 If the Inference Endpoint is not deployed after `timeout` seconds.
         """
-        if self.url is not None:  # Means the endpoint is deployed
-            logger.info("Inference Endpoint is ready to be used.")
-            return self
-
         if timeout is not None and timeout < 0:
             raise ValueError("`timeout` cannot be negative.")
         if refresh_every <= 0:
@@ -211,10 +201,12 @@ class InferenceEndpoint:
 
         start = time.time()
         while True:
-            self.fetch()
-            if self.url is not None:
-                logger.info("Inference Endpoint is ready to be used.")
-                return self
+            if self.url is not None:
+                # Means the URL is provisioned => check if the endpoint is reachable
+                response = get_session().get(self.url, headers=self._api._build_hf_headers(token=self._token))
+                if response.status_code == 200:
+                    logger.info("Inference Endpoint is ready to be used.")
+                    return self
             if self.status == InferenceEndpointStatus.FAILED:
                 raise InferenceEndpointError(
                     f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
@@ -224,6 +216,7 @@ class InferenceEndpoint:
                 raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
             logger.info(f"Inference Endpoint is not deployed yet ({self.status}). Waiting {refresh_every}s...")
             time.sleep(refresh_every)
+            self.fetch()
 
     def fetch(self) -> "InferenceEndpoint":
         """Fetch latest information about the Inference Endpoint.
```
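The substantive change in this file is `wait()`: instead of returning as soon as a URL is assigned, the loop now probes the URL with an HTTP GET and only returns on a 200, refreshing endpoint state at the end of each iteration. A generic sketch of that poll-probe-timeout control flow; the `get_url`/`get_status` helpers are stand-ins, not the library's API:

```python
import time

import requests


def wait_until_ready(get_url, get_status, headers=None, timeout=None, refresh_every=5):
    """Poll until the endpoint answers with HTTP 200, mirroring the 0.23.0 wait() flow."""
    start = time.time()
    while True:
        url = get_url()
        if url is not None:
            # URL provisioned => only return once the endpoint actually responds.
            if requests.get(url, headers=headers).status_code == 200:
                return url
        if get_status() == "failed":
            raise RuntimeError("Endpoint failed to deploy.")
        if timeout is not None and time.time() - start > timeout:
            raise TimeoutError("Timeout while waiting for endpoint.")
        time.sleep(refresh_every)  # the real loop then calls self.fetch() to refresh state
```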
huggingface_hub/_local_folder.py
ADDED
````diff
@@ -0,0 +1,229 @@
+# coding=utf-8
+# Copyright 2024-present, the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains utilities to handle the `../.huggingface` folder in local directories.
+
+First discussed in https://github.com/huggingface/huggingface_hub/issues/1738 to store
+download metadata when downloading files from the hub to a local directory (without
+using the cache).
+
+./.huggingface folder structure:
+[4.0K]  data
+├── [4.0K]  .huggingface
+│   └── [4.0K]  download
+│       ├── [  16]  file.parquet.metadata
+│       ├── [  16]  file.txt.metadata
+│       └── [4.0K]  folder
+│           └── [  16]  file.parquet.metadata
+│
+├── [6.5G]  file.parquet
+├── [1.5K]  file.txt
+└── [4.0K]  folder
+    └── [  16]  file.parquet
+
+
+Metadata file structure:
+```
+# file.txt.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+a16a55fda99d2f2e7b69cce5cf93ff4ad3049930
+1712656091.123
+
+# file.parquet.metadata
+11c5a3d5811f50298f278a704980280950aedb10
+7c5d3f4b8b76583b422fcb9189ad6c89d5d97a094541ce8932dce3ecabde1421
+1712656091.123
+}
+```
+"""
+
+import logging
+import os
+import time
+from dataclasses import dataclass
+from functools import lru_cache
+from pathlib import Path
+from typing import Optional
+
+from .utils import WeakFileLock
+
+
+logger = logging.getLogger(__name__)
+
+
+@dataclass
+class LocalDownloadFilePaths:
+    """
+    Paths to the files related to a download process in a local dir.
+
+    Returned by `get_local_download_paths`.
+
+    Attributes:
+        file_path (`Path`):
+            Path where the file will be saved.
+        lock_path (`Path`):
+            Path to the lock file used to ensure atomicity when reading/writing metadata.
+        metadata_path (`Path`):
+            Path to the metadata file.
+    """
+
+    file_path: Path
+    lock_path: Path
+    metadata_path: Path
+
+    def incomplete_path(self, etag: str) -> Path:
+        """Return the path where a file will be temporarily downloaded before being moved to `file_path`."""
+        return self.metadata_path.with_suffix(f".{etag}.incomplete")
+
+
+@dataclass
+class LocalDownloadFileMetadata:
+    """
+    Metadata about a file in the local directory related to a download process.
+
+    Attributes:
+        filename (`str`):
+            Path of the file in the repo.
+        commit_hash (`str`):
+            Commit hash of the file in the repo.
+        etag (`str`):
+            ETag of the file in the repo. Used to check if the file has changed.
+            For LFS files, this is the sha256 of the file. For regular files, it corresponds to the git hash.
+        timestamp (`int`):
+            Unix timestamp of when the metadata was saved i.e. when the metadata was accurate.
+    """
+
+    filename: str
+    commit_hash: str
+    etag: str
+    timestamp: float
+
+
+@lru_cache(maxsize=128)  # ensure singleton
+def get_local_download_paths(local_dir: Path, filename: str) -> LocalDownloadFilePaths:
+    """Compute paths to the files related to a download process.
+
+    Folders containing the paths are all guaranteed to exist.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        [`LocalDownloadFilePaths`]: the paths to the files (file_path, lock_path, metadata_path, incomplete_path).
+    """
+    # filename is the path in the Hub repository (separated by '/')
+    # make sure to have a cross platform transcription
+    sanitized_filename = os.path.join(*filename.split("/"))
+    if os.name == "nt":
+        if sanitized_filename.startswith("..\\") or "\\..\\" in sanitized_filename:
+            raise ValueError(
+                f"Invalid filename: cannot handle filename '{sanitized_filename}' on Windows. Please ask the repository"
+                " owner to rename this file."
+            )
+    file_path = local_dir / sanitized_filename
+    metadata_path = _huggingface_dir(local_dir) / "download" / f"{sanitized_filename}.metadata"
+    lock_path = metadata_path.with_suffix(".lock")
+
+    file_path.parent.mkdir(parents=True, exist_ok=True)
+    metadata_path.parent.mkdir(parents=True, exist_ok=True)
+    return LocalDownloadFilePaths(file_path=file_path, lock_path=lock_path, metadata_path=metadata_path)
+
+
+def read_download_metadata(local_dir: Path, filename: str) -> Optional[LocalDownloadFileMetadata]:
+    """Read metadata about a file in the local directory related to a download process.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+        filename (`str`):
+            Path of the file in the repo.
+
+    Return:
+        `[LocalDownloadFileMetadata]` or `None`: the metadata if it exists, `None` otherwise.
+    """
+    paths = get_local_download_paths(local_dir, filename)
+    # file_path = local_file_path(local_dir, filename)
+    # lock_path, metadata_path = _download_metadata_file_path(local_dir, filename)
+    with WeakFileLock(paths.lock_path):
+        if paths.metadata_path.exists():
+            try:
+                with paths.metadata_path.open() as f:
+                    commit_hash = f.readline().strip()
+                    etag = f.readline().strip()
+                    timestamp = float(f.readline().strip())
+                metadata = LocalDownloadFileMetadata(
+                    filename=filename,
+                    commit_hash=commit_hash,
+                    etag=etag,
+                    timestamp=timestamp,
+                )
+            except Exception as e:
+                # remove the metadata file if it is corrupted / not the right format
+                logger.warning(
+                    f"Invalid metadata file {paths.metadata_path}: {e}. Removing it from disk and continue."
+                )
+                try:
+                    paths.metadata_path.unlink()
+                except Exception as e:
+                    logger.warning(f"Could not remove corrupted metadata file {paths.metadata_path}: {e}")
+
+            try:
+                # check if the file exists and hasn't been modified since the metadata was saved
+                stat = paths.file_path.stat()
+                if (
+                    stat.st_mtime - 1 <= metadata.timestamp
+                ):  # allow 1s difference as stat.st_mtime might not be precise
+                    return metadata
+                logger.info(f"Ignored metadata for '{filename}' (outdated). Will re-compute hash.")
+            except FileNotFoundError:
+                # file does not exist => metadata is outdated
+                return None
+    return None
+
+
+def write_download_metadata(local_dir: Path, filename: str, commit_hash: str, etag: str) -> None:
+    """Write metadata about a file in the local directory related to a download process.
+
+    Args:
+        local_dir (`Path`):
+            Path to the local directory in which files are downloaded.
+    """
+    paths = get_local_download_paths(local_dir, filename)
+    with WeakFileLock(paths.lock_path):
+        with paths.metadata_path.open("w") as f:
+            f.write(f"{commit_hash}\n{etag}\n{time.time()}\n")
+
+
+@lru_cache()
+def _huggingface_dir(local_dir: Path) -> Path:
+    """Return the path to the `.huggingface` directory in a local directory."""
+    # Wrap in lru_cache to avoid overwriting the .gitignore file if called multiple times
+    path = local_dir / ".huggingface"
+    path.mkdir(exist_ok=True, parents=True)
+
+    # Create a .gitignore file in the .huggingface directory if it doesn't exist
+    # Should be thread-safe enough like this.
+    gitignore = path / ".gitignore"
+    gitignore_lock = path / ".gitignore.lock"
+    if not gitignore.exists():
+        with WeakFileLock(gitignore_lock):
+            gitignore.write_text("*")
+        try:
+            gitignore_lock.unlink()
+        except OSError:  # FileNotFoundError, PermissionError, etc.
+            pass
+    return path
````
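The sidecar format documented in the module docstring is deliberately minimal: three lines holding the commit hash, the etag, and a float timestamp. A sketch of parsing it the way `read_download_metadata` does, minus the file locking and the mtime freshness check:

```python
import tempfile
from pathlib import Path


def parse_metadata(metadata_path: Path):
    # Three lines: commit hash, etag, float timestamp - the same order
    # write_download_metadata() writes them in above.
    with metadata_path.open() as f:
        commit_hash = f.readline().strip()
        etag = f.readline().strip()
        timestamp = float(f.readline().strip())
    return commit_hash, etag, timestamp


with tempfile.TemporaryDirectory() as tmp:
    p = Path(tmp) / "file.txt.metadata"
    p.write_text(
        "11c5a3d5811f50298f278a704980280950aedb10\n"
        "a16a55fda99d2f2e7b69cce5cf93ff4ad3049930\n"
        "1712656091.123\n"
    )
    print(parse_metadata(p))
```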
huggingface_hub/_login.py
CHANGED
```diff
@@ -103,9 +103,10 @@ def login(
     if token is not None:
         if not add_to_git_credential:
             print(
-                "Token will not been saved to git credential helper. Pass"
-                " `add_to_git_credential=True` if you want to set the git"
-                " credential as well."
+                "The token has not been saved to the git credentials helper. Pass "
+                "`add_to_git_credential=True` in this function directly or "
+                "`--add-to-git-credential` if using via `huggingface-cli` if "
+                "you want to set the git credential as well."
             )
         _login(token, add_to_git_credential=add_to_git_credential, write_permission=write_permission)
     elif is_notebook():
```
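The reworded message names the two opt-ins for persisting the token in the git credential store. Usage sketch; `hf_xxx` is a placeholder token:

```python
from huggingface_hub import login

# Programmatic opt-in mentioned in the new message:
login(token="hf_xxx", add_to_git_credential=True)

# CLI equivalent mentioned in the same message:
#   huggingface-cli login --token hf_xxx --add-to-git-credential
```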
huggingface_hub/_multi_commits.py
CHANGED
```diff
@@ -273,8 +273,8 @@ def multi_commit_create_pull_request(
     commit_message: str,
     commit_description: Optional[str],
     strategy: MultiCommitStrategy,
-    token: Optional[str],
     repo_type: Optional[str],
+    token: Union[str, bool, None] = None,
 ) -> DiscussionWithDetails:
     return api.create_pull_request(
         repo_id=repo_id,
```
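The `token` parameter moves after `repo_type` and widens to `Union[str, bool, None]` with a default. A sketch of what the three forms conventionally mean in huggingface_hub's token handling; this is illustrative, not the library's actual resolver:

```python
from typing import Union


def describe_token(token: Union[str, bool, None]) -> str:
    # Conventional semantics of the widened annotation (illustrative).
    if token is None:
        return "default: fall back to the locally saved token, if any"
    if token is True:
        return "force use of the locally saved token"
    if token is False:
        return "send no token (unauthenticated request)"
    return "use this explicit token value"


for value in (None, True, False, "hf_xxx"):  # "hf_xxx" is a placeholder
    print(repr(value), "->", describe_token(value))
```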
huggingface_hub/_snapshot_download.py
CHANGED
```diff
@@ -39,13 +39,11 @@ def snapshot_download(
     revision: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
     local_dir: Union[str, Path, None] = None,
-    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
     library_name: Optional[str] = None,
     library_version: Optional[str] = None,
     user_agent: Optional[Union[Dict, str]] = None,
     proxies: Optional[Dict] = None,
     etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
-    resume_download: bool = False,
     force_download: bool = False,
     token: Optional[Union[bool, str]] = None,
     local_files_only: bool = False,
@@ -55,6 +53,9 @@ def snapshot_download(
     tqdm_class: Optional[base_tqdm] = None,
     headers: Optional[Dict[str, str]] = None,
     endpoint: Optional[str] = None,
+    # Deprecated args
+    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
+    resume_download: Optional[bool] = None,
 ) -> str:
     """Download repo files.
 
@@ -63,20 +64,10 @@ def snapshot_download(
     to keep their actual filename relative to that folder. You can also filter which files to download using
     `allow_patterns` and `ignore_patterns`.
 
-    If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure
-    how you want to move those files:
-      - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob
-        files. They are symlinked in `local_dir` if their size is above 5MB. Otherwise, they are duplicated. The goal
-        is to be able to manually edit and save small files without corrupting the cache while saving disk space for
-        binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD`
-        environment variable.
-      - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`.
-        This is optimal in term of disk usage but files must not be manually edited.
-      - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the
-        local dir. This means disk usage is not optimized.
-      - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the
-        files are downloaded and directly placed under `local_dir`. This means if you need to download them again later,
-        they will be re-downloaded entirely.
+    If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
+    option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir`
+    to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
+    cache-system, it's optimized for regularly pulling the latest version of a repository.
 
     An alternative would be to clone the repo but this requires git and git-lfs to be installed and properly
     configured. It is also not possible to filter which files to download when cloning a repository using git.
@@ -93,13 +84,7 @@ def snapshot_download(
         cache_dir (`str`, `Path`, *optional*):
             Path to the folder where cached files are stored.
         local_dir (`str` or `Path`, *optional*):
-            If provided, the downloaded files will be placed under this directory, either as symlinks (default) or
-            regular files (see description for more details).
-        local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`):
-            To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either
-            duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be
-            created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if
-            already exists) or downloaded from the Hub and not cached. See description for more details.
+            If provided, the downloaded files will be placed under this directory.
         library_name (`str`, *optional*):
             The name of the library to which the object corresponds.
         library_version (`str`, *optional*):
@@ -112,8 +97,6 @@ def snapshot_download(
         etag_timeout (`float`, *optional*, defaults to `10`):
             When fetching ETag, how many seconds to wait for the server to send
             data before giving up which is passed to `requests.request`.
-        resume_download (`bool`, *optional*, defaults to `False):
-            If `True`, resume a previously interrupted download.
         force_download (`bool`, *optional*, defaults to `False`):
             Whether the file should be downloaded even if it already exists in the local cache.
         token (`str`, `bool`, *optional*):
@@ -141,20 +124,15 @@ def snapshot_download(
         `HF_HUB_DISABLE_PROGRESS_BARS` environment variable.
 
     Returns:
-        Local folder path (string) of repo snapshot
-
-    <Tip>
-
-    Raises the following errors:
-
-    - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
-      if `token=True` and the token cannot be found.
-    - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
-      ETag cannot be determined.
-    - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
-      if some parameter value is invalid
+        `str`: folder path of the repo snapshot.
 
-    </Tip>
+    Raises:
+        - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+          if `token=True` and the token cannot be found.
+        - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError) if
+          ETag cannot be determined.
+        - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+          if some parameter value is invalid
     """
     if cache_dir is None:
         cache_dir = HF_HUB_CACHE
```