huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +145 -46
- huggingface_hub/_commit_api.py +168 -119
- huggingface_hub/_commit_scheduler.py +15 -15
- huggingface_hub/_inference_endpoints.py +15 -12
- huggingface_hub/_jobs_api.py +301 -0
- huggingface_hub/_local_folder.py +18 -3
- huggingface_hub/_login.py +31 -63
- huggingface_hub/_oauth.py +460 -0
- huggingface_hub/_snapshot_download.py +239 -80
- huggingface_hub/_space_api.py +5 -5
- huggingface_hub/_tensorboard_logger.py +15 -19
- huggingface_hub/_upload_large_folder.py +172 -76
- huggingface_hub/_webhooks_payload.py +3 -3
- huggingface_hub/_webhooks_server.py +13 -25
- huggingface_hub/{commands → cli}/__init__.py +1 -15
- huggingface_hub/cli/_cli_utils.py +173 -0
- huggingface_hub/cli/auth.py +147 -0
- huggingface_hub/cli/cache.py +841 -0
- huggingface_hub/cli/download.py +189 -0
- huggingface_hub/cli/hf.py +60 -0
- huggingface_hub/cli/inference_endpoints.py +377 -0
- huggingface_hub/cli/jobs.py +772 -0
- huggingface_hub/cli/lfs.py +175 -0
- huggingface_hub/cli/repo.py +315 -0
- huggingface_hub/cli/repo_files.py +94 -0
- huggingface_hub/{commands/env.py → cli/system.py} +10 -13
- huggingface_hub/cli/upload.py +294 -0
- huggingface_hub/cli/upload_large_folder.py +117 -0
- huggingface_hub/community.py +20 -12
- huggingface_hub/constants.py +38 -53
- huggingface_hub/dataclasses.py +609 -0
- huggingface_hub/errors.py +80 -30
- huggingface_hub/fastai_utils.py +30 -41
- huggingface_hub/file_download.py +435 -351
- huggingface_hub/hf_api.py +2050 -1124
- huggingface_hub/hf_file_system.py +269 -152
- huggingface_hub/hub_mixin.py +43 -63
- huggingface_hub/inference/_client.py +347 -434
- huggingface_hub/inference/_common.py +133 -121
- huggingface_hub/inference/_generated/_async_client.py +397 -541
- huggingface_hub/inference/_generated/types/__init__.py +5 -1
- huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
- huggingface_hub/inference/_generated/types/base.py +10 -7
- huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
- huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
- huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
- huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
- huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
- huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
- huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
- huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
- huggingface_hub/inference/_generated/types/summarization.py +2 -2
- huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
- huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
- huggingface_hub/inference/_generated/types/text_generation.py +10 -10
- huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
- huggingface_hub/inference/_generated/types/token_classification.py +2 -2
- huggingface_hub/inference/_generated/types/translation.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
- huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
- huggingface_hub/inference/_mcp/__init__.py +0 -0
- huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
- huggingface_hub/inference/_mcp/agent.py +100 -0
- huggingface_hub/inference/_mcp/cli.py +247 -0
- huggingface_hub/inference/_mcp/constants.py +81 -0
- huggingface_hub/inference/_mcp/mcp_client.py +395 -0
- huggingface_hub/inference/_mcp/types.py +45 -0
- huggingface_hub/inference/_mcp/utils.py +128 -0
- huggingface_hub/inference/_providers/__init__.py +82 -7
- huggingface_hub/inference/_providers/_common.py +129 -27
- huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
- huggingface_hub/inference/_providers/cerebras.py +1 -1
- huggingface_hub/inference/_providers/clarifai.py +13 -0
- huggingface_hub/inference/_providers/cohere.py +20 -3
- huggingface_hub/inference/_providers/fal_ai.py +183 -56
- huggingface_hub/inference/_providers/featherless_ai.py +38 -0
- huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
- huggingface_hub/inference/_providers/groq.py +9 -0
- huggingface_hub/inference/_providers/hf_inference.py +69 -30
- huggingface_hub/inference/_providers/hyperbolic.py +4 -4
- huggingface_hub/inference/_providers/nebius.py +33 -5
- huggingface_hub/inference/_providers/novita.py +5 -5
- huggingface_hub/inference/_providers/nscale.py +44 -0
- huggingface_hub/inference/_providers/openai.py +3 -1
- huggingface_hub/inference/_providers/publicai.py +6 -0
- huggingface_hub/inference/_providers/replicate.py +31 -13
- huggingface_hub/inference/_providers/sambanova.py +18 -4
- huggingface_hub/inference/_providers/scaleway.py +28 -0
- huggingface_hub/inference/_providers/together.py +20 -5
- huggingface_hub/inference/_providers/wavespeed.py +138 -0
- huggingface_hub/inference/_providers/zai_org.py +17 -0
- huggingface_hub/lfs.py +33 -100
- huggingface_hub/repocard.py +34 -38
- huggingface_hub/repocard_data.py +57 -57
- huggingface_hub/serialization/__init__.py +0 -1
- huggingface_hub/serialization/_base.py +12 -15
- huggingface_hub/serialization/_dduf.py +8 -8
- huggingface_hub/serialization/_torch.py +69 -69
- huggingface_hub/utils/__init__.py +19 -8
- huggingface_hub/utils/_auth.py +7 -7
- huggingface_hub/utils/_cache_manager.py +92 -147
- huggingface_hub/utils/_chunk_utils.py +2 -3
- huggingface_hub/utils/_deprecation.py +1 -1
- huggingface_hub/utils/_dotenv.py +55 -0
- huggingface_hub/utils/_experimental.py +7 -5
- huggingface_hub/utils/_fixes.py +0 -10
- huggingface_hub/utils/_git_credential.py +5 -5
- huggingface_hub/utils/_headers.py +8 -30
- huggingface_hub/utils/_http.py +398 -239
- huggingface_hub/utils/_pagination.py +4 -4
- huggingface_hub/utils/_parsing.py +98 -0
- huggingface_hub/utils/_paths.py +5 -5
- huggingface_hub/utils/_runtime.py +61 -24
- huggingface_hub/utils/_safetensors.py +21 -21
- huggingface_hub/utils/_subprocess.py +9 -9
- huggingface_hub/utils/_telemetry.py +4 -4
- huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
- huggingface_hub/utils/_typing.py +25 -5
- huggingface_hub/utils/_validators.py +55 -74
- huggingface_hub/utils/_verification.py +167 -0
- huggingface_hub/utils/_xet.py +64 -17
- huggingface_hub/utils/_xet_progress_reporting.py +162 -0
- huggingface_hub/utils/insecure_hashlib.py +3 -5
- huggingface_hub/utils/logging.py +8 -11
- huggingface_hub/utils/tqdm.py +5 -4
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
- huggingface_hub-1.1.3.dist-info/RECORD +155 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
- huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
- huggingface_hub/commands/delete_cache.py +0 -474
- huggingface_hub/commands/download.py +0 -200
- huggingface_hub/commands/huggingface_cli.py +0 -61
- huggingface_hub/commands/lfs.py +0 -200
- huggingface_hub/commands/repo_files.py +0 -128
- huggingface_hub/commands/scan_cache.py +0 -181
- huggingface_hub/commands/tag.py +0 -159
- huggingface_hub/commands/upload.py +0 -314
- huggingface_hub/commands/upload_large_folder.py +0 -129
- huggingface_hub/commands/user.py +0 -304
- huggingface_hub/commands/version.py +0 -37
- huggingface_hub/inference_api.py +0 -217
- huggingface_hub/keras_mixin.py +0 -500
- huggingface_hub/repository.py +0 -1477
- huggingface_hub/serialization/_tensorflow.py +0 -95
- huggingface_hub/utils/_hf_folder.py +0 -68
- huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
- huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
- {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
|
@@ -1,25 +1,29 @@
|
|
|
1
1
|
import os
|
|
2
2
|
import re
|
|
3
3
|
import tempfile
|
|
4
|
+
import threading
|
|
4
5
|
from collections import deque
|
|
6
|
+
from contextlib import ExitStack
|
|
7
|
+
from copy import deepcopy
|
|
5
8
|
from dataclasses import dataclass, field
|
|
6
9
|
from datetime import datetime
|
|
7
10
|
from itertools import chain
|
|
8
11
|
from pathlib import Path
|
|
9
|
-
from typing import Any,
|
|
12
|
+
from typing import Any, Iterator, NoReturn, Optional, Union
|
|
10
13
|
from urllib.parse import quote, unquote
|
|
11
14
|
|
|
12
15
|
import fsspec
|
|
16
|
+
import httpx
|
|
13
17
|
from fsspec.callbacks import _DEFAULT_CALLBACK, NoOpCallback, TqdmCallback
|
|
14
18
|
from fsspec.utils import isfilelike
|
|
15
|
-
from requests import Response
|
|
16
19
|
|
|
17
20
|
from . import constants
|
|
18
21
|
from ._commit_api import CommitOperationCopy, CommitOperationDelete
|
|
19
|
-
from .errors import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
|
|
22
|
+
from .errors import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError
|
|
20
23
|
from .file_download import hf_hub_url, http_get
|
|
21
24
|
from .hf_api import HfApi, LastCommitInfo, RepoFile
|
|
22
|
-
from .utils import HFValidationError, hf_raise_for_status, http_backoff
|
|
25
|
+
from .utils import HFValidationError, hf_raise_for_status, http_backoff, http_stream_backoff
|
|
26
|
+
from .utils.insecure_hashlib import md5
|
|
23
27
|
|
|
24
28
|
|
|
25
29
|
# Regex used to match special revisions with "/" in them (see #1710)
|
|
@@ -55,17 +59,68 @@ class HfFileSystemResolvedPath:
|
|
|
55
59
|
return f"{repo_path}/{self.path_in_repo}".rstrip("/")
|
|
56
60
|
|
|
57
61
|
|
|
58
|
-
|
|
59
|
-
|
|
60
|
-
Access a remote Hugging Face Hub repository as if were a local file system.
|
|
62
|
+
# We need to improve fsspec.spec._Cached which is AbstractFileSystem's metaclass
|
|
63
|
+
_cached_base: Any = type(fsspec.AbstractFileSystem)
|
|
61
64
|
|
|
62
|
-
<Tip warning={true}>
|
|
63
65
|
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
66
|
+
class _Cached(_cached_base):
|
|
67
|
+
"""
|
|
68
|
+
Metaclass for caching HfFileSystem instances according to the args.
|
|
69
|
+
|
|
70
|
+
This creates an additional reference to the filesystem, which prevents the
|
|
71
|
+
filesystem from being garbage collected when all *user* references go away.
|
|
72
|
+
A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
|
|
73
|
+
be made for a filesystem instance to be garbage collected.
|
|
74
|
+
|
|
75
|
+
This is a slightly modified version of `fsspec.spec._Cached` to improve it.
|
|
76
|
+
In particular in `_tokenize` the pid isn't taken into account for the
|
|
77
|
+
`fs_token` used to identify cached instances. The `fs_token` logic is also
|
|
78
|
+
robust to defaults values and the order of the args. Finally new instances
|
|
79
|
+
reuse the states from sister instances in the main thread.
|
|
80
|
+
"""
|
|
67
81
|
|
|
68
|
-
|
|
82
|
+
def __init__(cls, *args, **kwargs):
|
|
83
|
+
# Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L53
|
|
84
|
+
super().__init__(*args, **kwargs)
|
|
85
|
+
# Note: we intentionally create a reference here, to avoid garbage
|
|
86
|
+
# collecting instances when all other references are gone. To really
|
|
87
|
+
# delete a FileSystem, the cache must be cleared.
|
|
88
|
+
cls._cache = {}
|
|
89
|
+
|
|
90
|
+
def __call__(cls, *args, **kwargs):
|
|
91
|
+
# Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L65
|
|
92
|
+
skip = kwargs.pop("skip_instance_cache", False)
|
|
93
|
+
fs_token = cls._tokenize(cls, threading.get_ident(), *args, **kwargs)
|
|
94
|
+
fs_token_main_thread = cls._tokenize(cls, threading.main_thread().ident, *args, **kwargs)
|
|
95
|
+
if not skip and cls.cachable and fs_token in cls._cache:
|
|
96
|
+
# reuse cached instance
|
|
97
|
+
cls._latest = fs_token
|
|
98
|
+
return cls._cache[fs_token]
|
|
99
|
+
else:
|
|
100
|
+
# create new instance
|
|
101
|
+
obj = type.__call__(cls, *args, **kwargs)
|
|
102
|
+
if not skip and cls.cachable and fs_token_main_thread in cls._cache:
|
|
103
|
+
# reuse the cache from the main thread instance in the new instance
|
|
104
|
+
instance_state = cls._cache[fs_token_main_thread]._get_instance_state()
|
|
105
|
+
for attr, state_value in instance_state.items():
|
|
106
|
+
setattr(obj, attr, state_value)
|
|
107
|
+
obj._fs_token_ = fs_token
|
|
108
|
+
obj.storage_args = args
|
|
109
|
+
obj.storage_options = kwargs
|
|
110
|
+
if cls.cachable and not skip:
|
|
111
|
+
cls._latest = fs_token
|
|
112
|
+
cls._cache[fs_token] = obj
|
|
113
|
+
return obj
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class HfFileSystem(fsspec.AbstractFileSystem, metaclass=_Cached):
|
|
117
|
+
"""
|
|
118
|
+
Access a remote Hugging Face Hub repository as if were a local file system.
|
|
119
|
+
|
|
120
|
+
> [!WARNING]
|
|
121
|
+
> [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading
|
|
122
|
+
> Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility
|
|
123
|
+
> layer. For better performance and reliability, it's recommended to use `HfApi` methods when possible.
|
|
69
124
|
|
|
70
125
|
Args:
|
|
71
126
|
token (`str` or `bool`, *optional*):
|
|
@@ -104,22 +159,38 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
104
159
|
*args,
|
|
105
160
|
endpoint: Optional[str] = None,
|
|
106
161
|
token: Union[bool, str, None] = None,
|
|
162
|
+
block_size: Optional[int] = None,
|
|
107
163
|
**storage_options,
|
|
108
164
|
):
|
|
109
165
|
super().__init__(*args, **storage_options)
|
|
110
166
|
self.endpoint = endpoint or constants.ENDPOINT
|
|
111
167
|
self.token = token
|
|
112
168
|
self._api = HfApi(endpoint=endpoint, token=token)
|
|
169
|
+
self.block_size = block_size
|
|
113
170
|
# Maps (repo_type, repo_id, revision) to a 2-tuple with:
|
|
114
171
|
# * the 1st element indicating whether the repositoy and the revision exist
|
|
115
172
|
# * the 2nd element being the exception raised if the repository or revision doesn't exist
|
|
116
|
-
self._repo_and_revision_exists_cache:
|
|
117
|
-
|
|
173
|
+
self._repo_and_revision_exists_cache: dict[
|
|
174
|
+
tuple[str, str, Optional[str]], tuple[bool, Optional[Exception]]
|
|
118
175
|
] = {}
|
|
176
|
+
# Maps parent directory path to path infos
|
|
177
|
+
self.dircache: dict[str, list[dict[str, Any]]] = {}
|
|
178
|
+
|
|
179
|
+
@classmethod
|
|
180
|
+
def _tokenize(cls, threading_ident: int, *args, **kwargs) -> str:
|
|
181
|
+
"""Deterministic token for caching"""
|
|
182
|
+
# make fs_token robust to default values and to kwargs order
|
|
183
|
+
kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT
|
|
184
|
+
kwargs["token"] = kwargs.get("token")
|
|
185
|
+
kwargs = {key: kwargs[key] for key in sorted(kwargs)}
|
|
186
|
+
# contrary to fsspec, we don't include pid here
|
|
187
|
+
tokenize_args = (cls, threading_ident, args, kwargs)
|
|
188
|
+
h = md5(str(tokenize_args).encode())
|
|
189
|
+
return h.hexdigest()
|
|
119
190
|
|
|
120
191
|
def _repo_and_revision_exist(
|
|
121
192
|
self, repo_type: str, repo_id: str, revision: Optional[str]
|
|
122
|
-
) ->
|
|
193
|
+
) -> tuple[bool, Optional[Exception]]:
|
|
123
194
|
if (repo_type, repo_id, revision) not in self._repo_and_revision_exists_cache:
|
|
124
195
|
try:
|
|
125
196
|
self._api.repo_info(
|
|
@@ -267,12 +338,15 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
267
338
|
block_size: Optional[int] = None,
|
|
268
339
|
**kwargs,
|
|
269
340
|
) -> "HfFileSystemFile":
|
|
341
|
+
block_size = block_size if block_size is not None else self.block_size
|
|
342
|
+
if block_size is not None:
|
|
343
|
+
kwargs["block_size"] = block_size
|
|
270
344
|
if "a" in mode:
|
|
271
345
|
raise NotImplementedError("Appending to remote files is not yet supported.")
|
|
272
346
|
if block_size == 0:
|
|
273
|
-
return HfFileSystemStreamFile(self, path, mode=mode, revision=revision,
|
|
347
|
+
return HfFileSystemStreamFile(self, path, mode=mode, revision=revision, **kwargs)
|
|
274
348
|
else:
|
|
275
|
-
return HfFileSystemFile(self, path, mode=mode, revision=revision,
|
|
349
|
+
return HfFileSystemFile(self, path, mode=mode, revision=revision, **kwargs)
|
|
276
350
|
|
|
277
351
|
def _rm(self, path: str, revision: Optional[str] = None, **kwargs) -> None:
|
|
278
352
|
resolved_path = self.resolve_path(path, revision=revision)
|
|
@@ -300,11 +374,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
300
374
|
|
|
301
375
|
For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.rm).
|
|
302
376
|
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
Note: When possible, use `HfApi.delete_file()` for better performance.
|
|
306
|
-
|
|
307
|
-
</Tip>
|
|
377
|
+
> [!WARNING]
|
|
378
|
+
> Note: When possible, use `HfApi.delete_file()` for better performance.
|
|
308
379
|
|
|
309
380
|
Args:
|
|
310
381
|
path (`str`):
|
|
@@ -338,17 +409,14 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
338
409
|
|
|
339
410
|
def ls(
|
|
340
411
|
self, path: str, detail: bool = True, refresh: bool = False, revision: Optional[str] = None, **kwargs
|
|
341
|
-
) ->
|
|
412
|
+
) -> list[Union[str, dict[str, Any]]]:
|
|
342
413
|
"""
|
|
343
414
|
List the contents of a directory.
|
|
344
415
|
|
|
345
416
|
For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.ls).
|
|
346
417
|
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
Note: When possible, use `HfApi.list_repo_tree()` for better performance.
|
|
350
|
-
|
|
351
|
-
</Tip>
|
|
418
|
+
> [!WARNING]
|
|
419
|
+
> Note: When possible, use `HfApi.list_repo_tree()` for better performance.
|
|
352
420
|
|
|
353
421
|
Args:
|
|
354
422
|
path (`str`):
|
|
@@ -362,12 +430,11 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
362
430
|
The git revision to list from.
|
|
363
431
|
|
|
364
432
|
Returns:
|
|
365
|
-
`
|
|
433
|
+
`list[Union[str, dict[str, Any]]]`: List of file paths (if detail=False) or list of file information
|
|
366
434
|
dictionaries (if detail=True).
|
|
367
435
|
"""
|
|
368
436
|
resolved_path = self.resolve_path(path, revision=revision)
|
|
369
437
|
path = resolved_path.unresolve()
|
|
370
|
-
kwargs = {"expand_info": detail, **kwargs}
|
|
371
438
|
try:
|
|
372
439
|
out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
|
|
373
440
|
except EntryNotFoundError:
|
|
@@ -386,7 +453,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
386
453
|
recursive: bool = False,
|
|
387
454
|
refresh: bool = False,
|
|
388
455
|
revision: Optional[str] = None,
|
|
389
|
-
expand_info: bool =
|
|
456
|
+
expand_info: bool = False,
|
|
457
|
+
maxdepth: Optional[int] = None,
|
|
390
458
|
):
|
|
391
459
|
resolved_path = self.resolve_path(path, revision=revision)
|
|
392
460
|
path = resolved_path.unresolve()
|
|
@@ -406,19 +474,25 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
406
474
|
if recursive:
|
|
407
475
|
# Use BFS to traverse the cache and build the "recursive "output
|
|
408
476
|
# (The Hub uses a so-called "tree first" strategy for the tree endpoint but we sort the output to follow the spec so the result is (eventually) the same)
|
|
477
|
+
depth = 2
|
|
409
478
|
dirs_to_visit = deque(
|
|
410
|
-
[path_info for path_info in cached_path_infos if path_info["type"] == "directory"]
|
|
479
|
+
[(depth, path_info) for path_info in cached_path_infos if path_info["type"] == "directory"]
|
|
411
480
|
)
|
|
412
481
|
while dirs_to_visit:
|
|
413
|
-
dir_info = dirs_to_visit.popleft()
|
|
414
|
-
if
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
482
|
+
depth, dir_info = dirs_to_visit.popleft()
|
|
483
|
+
if maxdepth is None or depth <= maxdepth:
|
|
484
|
+
if dir_info["name"] not in self.dircache:
|
|
485
|
+
dirs_not_in_dircache.append(dir_info["name"])
|
|
486
|
+
else:
|
|
487
|
+
cached_path_infos = self.dircache[dir_info["name"]]
|
|
488
|
+
out.extend(cached_path_infos)
|
|
489
|
+
dirs_to_visit.extend(
|
|
490
|
+
[
|
|
491
|
+
(depth + 1, path_info)
|
|
492
|
+
for path_info in cached_path_infos
|
|
493
|
+
if path_info["type"] == "directory"
|
|
494
|
+
]
|
|
495
|
+
)
|
|
422
496
|
|
|
423
497
|
dirs_not_expanded = []
|
|
424
498
|
if expand_info:
|
|
@@ -437,8 +511,11 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
437
511
|
or common_prefix in chain(dirs_not_in_dircache, dirs_not_expanded)
|
|
438
512
|
else self._parent(common_prefix)
|
|
439
513
|
)
|
|
514
|
+
if maxdepth is not None:
|
|
515
|
+
common_path_depth = common_path[len(path) :].count("/")
|
|
516
|
+
maxdepth -= common_path_depth
|
|
440
517
|
out = [o for o in out if not o["name"].startswith(common_path + "/")]
|
|
441
|
-
for cached_path in self.dircache:
|
|
518
|
+
for cached_path in list(self.dircache):
|
|
442
519
|
if cached_path.startswith(common_path + "/"):
|
|
443
520
|
self.dircache.pop(cached_path, None)
|
|
444
521
|
self.dircache.pop(common_path, None)
|
|
@@ -449,6 +526,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
449
526
|
refresh=True,
|
|
450
527
|
revision=revision,
|
|
451
528
|
expand_info=expand_info,
|
|
529
|
+
maxdepth=maxdepth,
|
|
452
530
|
)
|
|
453
531
|
)
|
|
454
532
|
else:
|
|
@@ -461,9 +539,10 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
461
539
|
repo_type=resolved_path.repo_type,
|
|
462
540
|
)
|
|
463
541
|
for path_info in tree:
|
|
542
|
+
cache_path = root_path + "/" + path_info.path
|
|
464
543
|
if isinstance(path_info, RepoFile):
|
|
465
544
|
cache_path_info = {
|
|
466
|
-
"name":
|
|
545
|
+
"name": cache_path,
|
|
467
546
|
"size": path_info.size,
|
|
468
547
|
"type": "file",
|
|
469
548
|
"blob_id": path_info.blob_id,
|
|
@@ -473,7 +552,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
473
552
|
}
|
|
474
553
|
else:
|
|
475
554
|
cache_path_info = {
|
|
476
|
-
"name":
|
|
555
|
+
"name": cache_path,
|
|
477
556
|
"size": 0,
|
|
478
557
|
"type": "directory",
|
|
479
558
|
"tree_id": path_info.tree_id,
|
|
@@ -481,10 +560,12 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
481
560
|
}
|
|
482
561
|
parent_path = self._parent(cache_path_info["name"])
|
|
483
562
|
self.dircache.setdefault(parent_path, []).append(cache_path_info)
|
|
484
|
-
|
|
563
|
+
depth = cache_path[len(path) :].count("/")
|
|
564
|
+
if maxdepth is None or depth <= maxdepth:
|
|
565
|
+
out.append(cache_path_info)
|
|
485
566
|
return out
|
|
486
567
|
|
|
487
|
-
def walk(self, path: str, *args, **kwargs) -> Iterator[
|
|
568
|
+
def walk(self, path: str, *args, **kwargs) -> Iterator[tuple[str, list[str], list[str]]]:
|
|
488
569
|
"""
|
|
489
570
|
Return all files below the given path.
|
|
490
571
|
|
|
@@ -495,14 +576,12 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
495
576
|
Root path to list files from.
|
|
496
577
|
|
|
497
578
|
Returns:
|
|
498
|
-
`Iterator[
|
|
579
|
+
`Iterator[tuple[str, list[str], list[str]]]`: An iterator of (path, list of directory names, list of file names) tuples.
|
|
499
580
|
"""
|
|
500
|
-
# Set expand_info=False by default to get a x10 speed boost
|
|
501
|
-
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
|
|
502
581
|
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
|
|
503
582
|
yield from super().walk(path, *args, **kwargs)
|
|
504
583
|
|
|
505
|
-
def glob(self, path: str, **kwargs) ->
|
|
584
|
+
def glob(self, path: str, **kwargs) -> list[str]:
|
|
506
585
|
"""
|
|
507
586
|
Find files by glob-matching.
|
|
508
587
|
|
|
@@ -513,10 +592,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
513
592
|
Path pattern to match.
|
|
514
593
|
|
|
515
594
|
Returns:
|
|
516
|
-
`
|
|
595
|
+
`list[str]`: List of paths matching the pattern.
|
|
517
596
|
"""
|
|
518
|
-
# Set expand_info=False by default to get a x10 speed boost
|
|
519
|
-
kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
|
|
520
597
|
path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
|
|
521
598
|
return super().glob(path, **kwargs)
|
|
522
599
|
|
|
@@ -529,7 +606,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
529
606
|
refresh: bool = False,
|
|
530
607
|
revision: Optional[str] = None,
|
|
531
608
|
**kwargs,
|
|
532
|
-
) -> Union[
|
|
609
|
+
) -> Union[list[str], dict[str, dict[str, Any]]]:
|
|
533
610
|
"""
|
|
534
611
|
List all files below path.
|
|
535
612
|
|
|
@@ -550,22 +627,24 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
550
627
|
The git revision to list from.
|
|
551
628
|
|
|
552
629
|
Returns:
|
|
553
|
-
`Union[
|
|
630
|
+
`Union[list[str], dict[str, dict[str, Any]]]`: List of paths or dict of file information.
|
|
554
631
|
"""
|
|
555
|
-
if maxdepth:
|
|
556
|
-
|
|
557
|
-
path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, refresh=refresh, revision=revision, **kwargs
|
|
558
|
-
)
|
|
632
|
+
if maxdepth is not None and maxdepth < 1:
|
|
633
|
+
raise ValueError("maxdepth must be at least 1")
|
|
559
634
|
resolved_path = self.resolve_path(path, revision=revision)
|
|
560
635
|
path = resolved_path.unresolve()
|
|
561
|
-
kwargs = {"expand_info": detail, **kwargs}
|
|
562
636
|
try:
|
|
563
|
-
out = self._ls_tree(
|
|
637
|
+
out = self._ls_tree(
|
|
638
|
+
path, recursive=True, refresh=refresh, revision=resolved_path.revision, maxdepth=maxdepth, **kwargs
|
|
639
|
+
)
|
|
564
640
|
except EntryNotFoundError:
|
|
565
641
|
# Path could be a file
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
642
|
+
try:
|
|
643
|
+
if self.info(path, revision=revision, **kwargs)["type"] == "file":
|
|
644
|
+
out = {path: {}}
|
|
645
|
+
else:
|
|
646
|
+
out = {}
|
|
647
|
+
except FileNotFoundError:
|
|
569
648
|
out = {}
|
|
570
649
|
else:
|
|
571
650
|
if not withdirs:
|
|
@@ -585,11 +664,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
585
664
|
"""
|
|
586
665
|
Copy a file within or between repositories.
|
|
587
666
|
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
Note: When possible, use `HfApi.upload_file()` for better performance.
|
|
591
|
-
|
|
592
|
-
</Tip>
|
|
667
|
+
> [!WARNING]
|
|
668
|
+
> Note: When possible, use `HfApi.upload_file()` for better performance.
|
|
593
669
|
|
|
594
670
|
Args:
|
|
595
671
|
path1 (`str`):
|
|
@@ -653,20 +729,17 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
653
729
|
Returns:
|
|
654
730
|
`datetime`: Last commit date of the file.
|
|
655
731
|
"""
|
|
656
|
-
info = self.info(path, **kwargs)
|
|
732
|
+
info = self.info(path, **{**kwargs, "expand_info": True}) # type: ignore
|
|
657
733
|
return info["last_commit"]["date"]
|
|
658
734
|
|
|
659
|
-
def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) ->
|
|
735
|
+
def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> dict[str, Any]:
|
|
660
736
|
"""
|
|
661
737
|
Get information about a file or directory.
|
|
662
738
|
|
|
663
739
|
For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.info).
|
|
664
740
|
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance.
|
|
668
|
-
|
|
669
|
-
</Tip>
|
|
741
|
+
> [!WARNING]
|
|
742
|
+
> Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance.
|
|
670
743
|
|
|
671
744
|
Args:
|
|
672
745
|
path (`str`):
|
|
@@ -677,13 +750,13 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
677
750
|
The git revision to get info from.
|
|
678
751
|
|
|
679
752
|
Returns:
|
|
680
|
-
`
|
|
753
|
+
`dict[str, Any]`: Dictionary containing file information (type, size, commit info, etc.).
|
|
681
754
|
|
|
682
755
|
"""
|
|
683
756
|
resolved_path = self.resolve_path(path, revision=revision)
|
|
684
757
|
path = resolved_path.unresolve()
|
|
685
758
|
expand_info = kwargs.get(
|
|
686
|
-
"expand_info",
|
|
759
|
+
"expand_info", False
|
|
687
760
|
) # don't expose it as a parameter in the public API to follow the spec
|
|
688
761
|
if not resolved_path.path_in_repo:
|
|
689
762
|
# Path is the root directory
|
|
@@ -691,6 +764,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
691
764
|
"name": path,
|
|
692
765
|
"size": 0,
|
|
693
766
|
"type": "directory",
|
|
767
|
+
"last_commit": None,
|
|
694
768
|
}
|
|
695
769
|
if expand_info:
|
|
696
770
|
last_commit = self._api.list_repo_commits(
|
|
@@ -708,7 +782,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
708
782
|
parent_path = self._parent(path)
|
|
709
783
|
if not expand_info and parent_path not in self.dircache:
|
|
710
784
|
# Fill the cache with cheap call
|
|
711
|
-
self.ls(parent_path
|
|
785
|
+
self.ls(parent_path)
|
|
712
786
|
if parent_path in self.dircache:
|
|
713
787
|
# Check if the path is in the cache
|
|
714
788
|
out1 = [o for o in self.dircache[parent_path] if o["name"] == path]
|
|
@@ -762,11 +836,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
762
836
|
|
|
763
837
|
For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists).
|
|
764
838
|
|
|
765
|
-
|
|
766
|
-
|
|
767
|
-
Note: When possible, use `HfApi.file_exists()` for better performance.
|
|
768
|
-
|
|
769
|
-
</Tip>
|
|
839
|
+
> [!WARNING]
|
|
840
|
+
> Note: When possible, use `HfApi.file_exists()` for better performance.
|
|
770
841
|
|
|
771
842
|
Args:
|
|
772
843
|
path (`str`):
|
|
@@ -779,7 +850,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
779
850
|
if kwargs.get("refresh", False):
|
|
780
851
|
self.invalidate_cache(path)
|
|
781
852
|
|
|
782
|
-
self.info(path, **
|
|
853
|
+
self.info(path, **kwargs)
|
|
783
854
|
return True
|
|
784
855
|
except: # noqa: E722
|
|
785
856
|
return False
|
|
@@ -798,7 +869,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
798
869
|
`bool`: True if path is a directory, False otherwise.
|
|
799
870
|
"""
|
|
800
871
|
try:
|
|
801
|
-
return self.info(path
|
|
872
|
+
return self.info(path)["type"] == "directory"
|
|
802
873
|
except OSError:
|
|
803
874
|
return False
|
|
804
875
|
|
|
@@ -816,7 +887,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
816
887
|
`bool`: True if path is a file, False otherwise.
|
|
817
888
|
"""
|
|
818
889
|
try:
|
|
819
|
-
return self.info(path
|
|
890
|
+
return self.info(path)["type"] == "file"
|
|
820
891
|
except: # noqa: E722
|
|
821
892
|
return False
|
|
822
893
|
|
|
@@ -847,11 +918,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
847
918
|
"""
|
|
848
919
|
Copy single remote file to local.
|
|
849
920
|
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
Note: When possible, use `HfApi.hf_hub_download()` for better performance.
|
|
853
|
-
|
|
854
|
-
</Tip>
|
|
921
|
+
> [!WARNING]
|
|
922
|
+
> Note: When possible, use `HfApi.hf_hub_download()` for better performance.
|
|
855
923
|
|
|
856
924
|
Args:
|
|
857
925
|
rpath (`str`):
|
|
@@ -901,7 +969,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
901
969
|
repo_type=resolve_remote_path.repo_type,
|
|
902
970
|
endpoint=self.endpoint,
|
|
903
971
|
),
|
|
904
|
-
temp_file=outfile,
|
|
972
|
+
temp_file=outfile, # type: ignore[arg-type]
|
|
905
973
|
displayed_filename=rpath,
|
|
906
974
|
expected_size=expected_size,
|
|
907
975
|
resume_size=0,
|
|
@@ -931,6 +999,21 @@ class HfFileSystem(fsspec.AbstractFileSystem):
|
|
|
931
999
|
# See https://github.com/huggingface/huggingface_hub/issues/1733
|
|
932
1000
|
raise NotImplementedError("Transactional commits are not supported.")
|
|
933
1001
|
|
|
1002
|
+
def __reduce__(self):
|
|
1003
|
+
# re-populate the instance cache at HfFileSystem._cache and re-populate the state of every instance
|
|
1004
|
+
return make_instance, (
|
|
1005
|
+
type(self),
|
|
1006
|
+
self.storage_args,
|
|
1007
|
+
self.storage_options,
|
|
1008
|
+
self._get_instance_state(),
|
|
1009
|
+
)
|
|
1010
|
+
|
|
1011
|
+
def _get_instance_state(self):
|
|
1012
|
+
return {
|
|
1013
|
+
"dircache": deepcopy(self.dircache),
|
|
1014
|
+
"_repo_and_revision_exists_cache": deepcopy(self._repo_and_revision_exists_cache),
|
|
1015
|
+
}
|
|
1016
|
+
|
|
934
1017
|
|
|
935
1018
|
class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
|
|
936
1019
|
def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs):
|
|
@@ -942,9 +1025,6 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
|
|
|
942
1025
|
f"{e}.\nMake sure the repository and revision exist before writing data."
|
|
943
1026
|
) from e
|
|
944
1027
|
raise
|
|
945
|
-
# avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details
|
|
946
|
-
if kwargs.get("mode", "rb") == "rb":
|
|
947
|
-
self.details = fs.info(self.resolved_path.unresolve(), expand_info=False)
|
|
948
1028
|
super().__init__(fs, self.resolved_path.unresolve(), **kwargs)
|
|
949
1029
|
self.fs: HfFileSystem
|
|
950
1030
|
|
|
@@ -966,13 +1046,7 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
|
|
|
966
1046
|
repo_type=self.resolved_path.repo_type,
|
|
967
1047
|
endpoint=self.fs.endpoint,
|
|
968
1048
|
)
|
|
969
|
-
r = http_backoff(
|
|
970
|
-
"GET",
|
|
971
|
-
url,
|
|
972
|
-
headers=headers,
|
|
973
|
-
retry_on_status_codes=(500, 502, 503, 504),
|
|
974
|
-
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
|
|
975
|
-
)
|
|
1049
|
+
r = http_backoff("GET", url, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT)
|
|
976
1050
|
hf_raise_for_status(r)
|
|
977
1051
|
return r.content
|
|
978
1052
|
|
|
@@ -1003,13 +1077,14 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
|
|
|
1003
1077
|
def read(self, length=-1):
|
|
1004
1078
|
"""Read remote file.
|
|
1005
1079
|
|
|
1006
|
-
If `length` is not provided or is -1, the entire file is downloaded and read. On POSIX systems
|
|
1007
|
-
|
|
1008
|
-
temporary file and read from there.
|
|
1080
|
+
If `length` is not provided or is -1, the entire file is downloaded and read. On POSIX systems the file is
|
|
1081
|
+
loaded in memory directly. Otherwise, the file is downloaded to a temporary file and read from there.
|
|
1009
1082
|
"""
|
|
1010
1083
|
if self.mode == "rb" and (length is None or length == -1) and self.loc == 0:
|
|
1011
1084
|
with self.fs.open(self.path, "rb", block_size=0) as f: # block_size=0 enables fast streaming
|
|
1012
|
-
|
|
1085
|
+
out = f.read()
|
|
1086
|
+
self.loc += len(out)
|
|
1087
|
+
return out
|
|
1013
1088
|
return super().read(length)
|
|
1014
1089
|
|
|
1015
1090
|
def url(self) -> str:
|
|
@@ -1045,8 +1120,9 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
|
|
|
1045
1120
|
super().__init__(
|
|
1046
1121
|
fs, self.resolved_path.unresolve(), mode=mode, block_size=block_size, cache_type=cache_type, **kwargs
|
|
1047
1122
|
)
|
|
1048
|
-
self.response: Optional[Response] = None
|
|
1123
|
+
self.response: Optional[httpx.Response] = None
|
|
1049
1124
|
self.fs: HfFileSystem
|
|
1125
|
+
self._exit_stack = ExitStack()
|
|
1050
1126
|
|
|
1051
1127
|
def seek(self, loc: int, whence: int = 0):
|
|
1052
1128
|
if loc == 0 and whence == 1:
|
|
@@ -1056,53 +1132,32 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
|
|
|
1056
1132
|
raise ValueError("Cannot seek streaming HF file")
|
|
1057
1133
|
|
|
1058
1134
|
def read(self, length: int = -1):
|
|
1059
|
-
|
|
1060
|
-
if self.response is None or self.response.raw.isclosed():
|
|
1061
|
-
url = hf_hub_url(
|
|
1062
|
-
repo_id=self.resolved_path.repo_id,
|
|
1063
|
-
revision=self.resolved_path.revision,
|
|
1064
|
-
filename=self.resolved_path.path_in_repo,
|
|
1065
|
-
repo_type=self.resolved_path.repo_type,
|
|
1066
|
-
endpoint=self.fs.endpoint,
|
|
1067
|
-
)
|
|
1068
|
-
self.response = http_backoff(
|
|
1069
|
-
"GET",
|
|
1070
|
-
url,
|
|
1071
|
-
headers=self.fs._api._build_hf_headers(),
|
|
1072
|
-
retry_on_status_codes=(500, 502, 503, 504),
|
|
1073
|
-
stream=True,
|
|
1074
|
-
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
|
|
1075
|
-
)
|
|
1076
|
-
hf_raise_for_status(self.response)
|
|
1077
|
-
try:
|
|
1078
|
-
out = self.response.raw.read(*read_args)
|
|
1079
|
-
except Exception:
|
|
1080
|
-
self.response.close()
|
|
1135
|
+
"""Read the remote file.
|
|
1081
1136
|
|
|
1082
|
-
|
|
1083
|
-
|
|
1084
|
-
|
|
1085
|
-
|
|
1086
|
-
|
|
1087
|
-
|
|
1088
|
-
|
|
1089
|
-
|
|
1090
|
-
|
|
1091
|
-
|
|
1092
|
-
url,
|
|
1093
|
-
headers={"Range": "bytes=%d-" % self.loc, **self.fs._api._build_hf_headers()},
|
|
1094
|
-
retry_on_status_codes=(500, 502, 503, 504),
|
|
1095
|
-
stream=True,
|
|
1096
|
-
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
|
|
1097
|
-
)
|
|
1098
|
-
hf_raise_for_status(self.response)
|
|
1137
|
+
If the file is already open, we reuse the connection.
|
|
1138
|
+
Otherwise, open a new connection and read from it.
|
|
1139
|
+
|
|
1140
|
+
If reading the stream fails, we retry with a new connection.
|
|
1141
|
+
"""
|
|
1142
|
+
if self.response is None:
|
|
1143
|
+
self._open_connection()
|
|
1144
|
+
|
|
1145
|
+
retried_once = False
|
|
1146
|
+
while True:
|
|
1099
1147
|
try:
|
|
1100
|
-
|
|
1148
|
+
if self.response is None:
|
|
1149
|
+
return b"" # Already read the entire file
|
|
1150
|
+
out = _partial_read(self.response, length)
|
|
1151
|
+
self.loc += len(out)
|
|
1152
|
+
return out
|
|
1101
1153
|
except Exception:
|
|
1102
|
-
self.response
|
|
1103
|
-
|
|
1104
|
-
|
|
1105
|
-
|
|
1154
|
+
if self.response is not None:
|
|
1155
|
+
self.response.close()
|
|
1156
|
+
if retried_once: # Already retried once, give up
|
|
1157
|
+
raise
|
|
1158
|
+
# First failure, retry with range header
|
|
1159
|
+
self._open_connection()
|
|
1160
|
+
retried_once = True
|
|
1106
1161
|
|
|
1107
1162
|
def url(self) -> str:
|
|
1108
1163
|
return self.fs.url(self.path)
|
|
@@ -1111,11 +1166,43 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
|
|
|
1111
1166
|
if not hasattr(self, "resolved_path"):
|
|
1112
1167
|
# Means that the constructor failed. Nothing to do.
|
|
1113
1168
|
return
|
|
1169
|
+
self._exit_stack.close()
|
|
1114
1170
|
return super().__del__()
|
|
1115
1171
|
|
|
1116
1172
|
def __reduce__(self):
|
|
1117
1173
|
return reopen, (self.fs, self.path, self.mode, self.blocksize, self.cache.name)
|
|
1118
1174
|
|
|
1175
|
+
def _open_connection(self):
|
|
1176
|
+
"""Open a connection to the remote file."""
|
|
1177
|
+
url = hf_hub_url(
|
|
1178
|
+
repo_id=self.resolved_path.repo_id,
|
|
1179
|
+
revision=self.resolved_path.revision,
|
|
1180
|
+
filename=self.resolved_path.path_in_repo,
|
|
1181
|
+
repo_type=self.resolved_path.repo_type,
|
|
1182
|
+
endpoint=self.fs.endpoint,
|
|
1183
|
+
)
|
|
1184
|
+
headers = self.fs._api._build_hf_headers()
|
|
1185
|
+
if self.loc > 0:
|
|
1186
|
+
headers["Range"] = f"bytes={self.loc}-"
|
|
1187
|
+
self.response = self._exit_stack.enter_context(
|
|
1188
|
+
http_stream_backoff(
|
|
1189
|
+
"GET",
|
|
1190
|
+
url,
|
|
1191
|
+
headers=headers,
|
|
1192
|
+
retry_on_status_codes=(500, 502, 503, 504),
|
|
1193
|
+
timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
|
|
1194
|
+
)
|
|
1195
|
+
)
|
|
1196
|
+
|
|
1197
|
+
try:
|
|
1198
|
+
hf_raise_for_status(self.response)
|
|
1199
|
+
except HfHubHTTPError as e:
|
|
1200
|
+
if e.response.status_code == 416:
|
|
1201
|
+
# Range not satisfiable => means that we have already read the entire file
|
|
1202
|
+
self.response = None
|
|
1203
|
+
return
|
|
1204
|
+
raise
|
|
1205
|
+
|
|
1119
1206
|
|
|
1120
1207
|
def safe_revision(revision: str) -> str:
|
|
1121
1208
|
return revision if SPECIAL_REFS_REVISION_REGEX.match(revision) else safe_quote(revision)
|
|
@@ -1138,3 +1225,33 @@ def _raise_file_not_found(path: str, err: Optional[Exception]) -> NoReturn:
|
|
|
1138
1225
|
|
|
1139
1226
|
def reopen(fs: HfFileSystem, path: str, mode: str, block_size: int, cache_type: str):
|
|
1140
1227
|
return fs.open(path, mode=mode, block_size=block_size, cache_type=cache_type)
|
|
1228
|
+
|
|
1229
|
+
|
|
1230
|
+
def _partial_read(response: httpx.Response, length: int = -1) -> bytes:
|
|
1231
|
+
"""
|
|
1232
|
+
Read up to `length` bytes from a streamed response.
|
|
1233
|
+
If length == -1, read until EOF.
|
|
1234
|
+
"""
|
|
1235
|
+
buf = bytearray()
|
|
1236
|
+
if length < -1:
|
|
1237
|
+
raise ValueError("length must be -1 or >= 0")
|
|
1238
|
+
if length == 0:
|
|
1239
|
+
return b""
|
|
1240
|
+
if length == -1:
|
|
1241
|
+
for chunk in response.iter_bytes():
|
|
1242
|
+
buf.extend(chunk)
|
|
1243
|
+
return bytes(buf)
|
|
1244
|
+
|
|
1245
|
+
for chunk in response.iter_bytes(chunk_size=length):
|
|
1246
|
+
buf.extend(chunk)
|
|
1247
|
+
if len(buf) >= length:
|
|
1248
|
+
return bytes(buf[:length])
|
|
1249
|
+
|
|
1250
|
+
return bytes(buf) # may be < length if response ended
|
|
1251
|
+
|
|
1252
|
+
|
|
1253
|
+
def make_instance(cls, args, kwargs, instance_state):
|
|
1254
|
+
fs = cls(*args, **kwargs)
|
|
1255
|
+
for attr, state_value in instance_state.items():
|
|
1256
|
+
setattr(fs, attr, state_value)
|
|
1257
|
+
return fs
|