huggingface-hub 0.31.0rc0__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (150)
  1. huggingface_hub/__init__.py +145 -46
  2. huggingface_hub/_commit_api.py +168 -119
  3. huggingface_hub/_commit_scheduler.py +15 -15
  4. huggingface_hub/_inference_endpoints.py +15 -12
  5. huggingface_hub/_jobs_api.py +301 -0
  6. huggingface_hub/_local_folder.py +18 -3
  7. huggingface_hub/_login.py +31 -63
  8. huggingface_hub/_oauth.py +460 -0
  9. huggingface_hub/_snapshot_download.py +239 -80
  10. huggingface_hub/_space_api.py +5 -5
  11. huggingface_hub/_tensorboard_logger.py +15 -19
  12. huggingface_hub/_upload_large_folder.py +172 -76
  13. huggingface_hub/_webhooks_payload.py +3 -3
  14. huggingface_hub/_webhooks_server.py +13 -25
  15. huggingface_hub/{commands → cli}/__init__.py +1 -15
  16. huggingface_hub/cli/_cli_utils.py +173 -0
  17. huggingface_hub/cli/auth.py +147 -0
  18. huggingface_hub/cli/cache.py +841 -0
  19. huggingface_hub/cli/download.py +189 -0
  20. huggingface_hub/cli/hf.py +60 -0
  21. huggingface_hub/cli/inference_endpoints.py +377 -0
  22. huggingface_hub/cli/jobs.py +772 -0
  23. huggingface_hub/cli/lfs.py +175 -0
  24. huggingface_hub/cli/repo.py +315 -0
  25. huggingface_hub/cli/repo_files.py +94 -0
  26. huggingface_hub/{commands/env.py → cli/system.py} +10 -13
  27. huggingface_hub/cli/upload.py +294 -0
  28. huggingface_hub/cli/upload_large_folder.py +117 -0
  29. huggingface_hub/community.py +20 -12
  30. huggingface_hub/constants.py +38 -53
  31. huggingface_hub/dataclasses.py +609 -0
  32. huggingface_hub/errors.py +80 -30
  33. huggingface_hub/fastai_utils.py +30 -41
  34. huggingface_hub/file_download.py +435 -351
  35. huggingface_hub/hf_api.py +2050 -1124
  36. huggingface_hub/hf_file_system.py +269 -152
  37. huggingface_hub/hub_mixin.py +43 -63
  38. huggingface_hub/inference/_client.py +347 -434
  39. huggingface_hub/inference/_common.py +133 -121
  40. huggingface_hub/inference/_generated/_async_client.py +397 -541
  41. huggingface_hub/inference/_generated/types/__init__.py +5 -1
  42. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +3 -3
  43. huggingface_hub/inference/_generated/types/base.py +10 -7
  44. huggingface_hub/inference/_generated/types/chat_completion.py +59 -23
  45. huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
  46. huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
  47. huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
  48. huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
  49. huggingface_hub/inference/_generated/types/image_to_image.py +6 -2
  50. huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
  51. huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
  52. huggingface_hub/inference/_generated/types/summarization.py +2 -2
  53. huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
  54. huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
  55. huggingface_hub/inference/_generated/types/text_generation.py +10 -10
  56. huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
  57. huggingface_hub/inference/_generated/types/token_classification.py +2 -2
  58. huggingface_hub/inference/_generated/types/translation.py +2 -2
  59. huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
  60. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
  61. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
  62. huggingface_hub/inference/_mcp/__init__.py +0 -0
  63. huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
  64. huggingface_hub/inference/_mcp/agent.py +100 -0
  65. huggingface_hub/inference/_mcp/cli.py +247 -0
  66. huggingface_hub/inference/_mcp/constants.py +81 -0
  67. huggingface_hub/inference/_mcp/mcp_client.py +395 -0
  68. huggingface_hub/inference/_mcp/types.py +45 -0
  69. huggingface_hub/inference/_mcp/utils.py +128 -0
  70. huggingface_hub/inference/_providers/__init__.py +82 -7
  71. huggingface_hub/inference/_providers/_common.py +129 -27
  72. huggingface_hub/inference/_providers/black_forest_labs.py +6 -6
  73. huggingface_hub/inference/_providers/cerebras.py +1 -1
  74. huggingface_hub/inference/_providers/clarifai.py +13 -0
  75. huggingface_hub/inference/_providers/cohere.py +20 -3
  76. huggingface_hub/inference/_providers/fal_ai.py +183 -56
  77. huggingface_hub/inference/_providers/featherless_ai.py +38 -0
  78. huggingface_hub/inference/_providers/fireworks_ai.py +18 -0
  79. huggingface_hub/inference/_providers/groq.py +9 -0
  80. huggingface_hub/inference/_providers/hf_inference.py +69 -30
  81. huggingface_hub/inference/_providers/hyperbolic.py +4 -4
  82. huggingface_hub/inference/_providers/nebius.py +33 -5
  83. huggingface_hub/inference/_providers/novita.py +5 -5
  84. huggingface_hub/inference/_providers/nscale.py +44 -0
  85. huggingface_hub/inference/_providers/openai.py +3 -1
  86. huggingface_hub/inference/_providers/publicai.py +6 -0
  87. huggingface_hub/inference/_providers/replicate.py +31 -13
  88. huggingface_hub/inference/_providers/sambanova.py +18 -4
  89. huggingface_hub/inference/_providers/scaleway.py +28 -0
  90. huggingface_hub/inference/_providers/together.py +20 -5
  91. huggingface_hub/inference/_providers/wavespeed.py +138 -0
  92. huggingface_hub/inference/_providers/zai_org.py +17 -0
  93. huggingface_hub/lfs.py +33 -100
  94. huggingface_hub/repocard.py +34 -38
  95. huggingface_hub/repocard_data.py +57 -57
  96. huggingface_hub/serialization/__init__.py +0 -1
  97. huggingface_hub/serialization/_base.py +12 -15
  98. huggingface_hub/serialization/_dduf.py +8 -8
  99. huggingface_hub/serialization/_torch.py +69 -69
  100. huggingface_hub/utils/__init__.py +19 -8
  101. huggingface_hub/utils/_auth.py +7 -7
  102. huggingface_hub/utils/_cache_manager.py +92 -147
  103. huggingface_hub/utils/_chunk_utils.py +2 -3
  104. huggingface_hub/utils/_deprecation.py +1 -1
  105. huggingface_hub/utils/_dotenv.py +55 -0
  106. huggingface_hub/utils/_experimental.py +7 -5
  107. huggingface_hub/utils/_fixes.py +0 -10
  108. huggingface_hub/utils/_git_credential.py +5 -5
  109. huggingface_hub/utils/_headers.py +8 -30
  110. huggingface_hub/utils/_http.py +398 -239
  111. huggingface_hub/utils/_pagination.py +4 -4
  112. huggingface_hub/utils/_parsing.py +98 -0
  113. huggingface_hub/utils/_paths.py +5 -5
  114. huggingface_hub/utils/_runtime.py +61 -24
  115. huggingface_hub/utils/_safetensors.py +21 -21
  116. huggingface_hub/utils/_subprocess.py +9 -9
  117. huggingface_hub/utils/_telemetry.py +4 -4
  118. huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
  119. huggingface_hub/utils/_typing.py +25 -5
  120. huggingface_hub/utils/_validators.py +55 -74
  121. huggingface_hub/utils/_verification.py +167 -0
  122. huggingface_hub/utils/_xet.py +64 -17
  123. huggingface_hub/utils/_xet_progress_reporting.py +162 -0
  124. huggingface_hub/utils/insecure_hashlib.py +3 -5
  125. huggingface_hub/utils/logging.py +8 -11
  126. huggingface_hub/utils/tqdm.py +5 -4
  127. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -85
  128. huggingface_hub-1.1.3.dist-info/RECORD +155 -0
  129. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
  130. huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
  131. huggingface_hub/commands/delete_cache.py +0 -474
  132. huggingface_hub/commands/download.py +0 -200
  133. huggingface_hub/commands/huggingface_cli.py +0 -61
  134. huggingface_hub/commands/lfs.py +0 -200
  135. huggingface_hub/commands/repo_files.py +0 -128
  136. huggingface_hub/commands/scan_cache.py +0 -181
  137. huggingface_hub/commands/tag.py +0 -159
  138. huggingface_hub/commands/upload.py +0 -314
  139. huggingface_hub/commands/upload_large_folder.py +0 -129
  140. huggingface_hub/commands/user.py +0 -304
  141. huggingface_hub/commands/version.py +0 -37
  142. huggingface_hub/inference_api.py +0 -217
  143. huggingface_hub/keras_mixin.py +0 -500
  144. huggingface_hub/repository.py +0 -1477
  145. huggingface_hub/serialization/_tensorflow.py +0 -95
  146. huggingface_hub/utils/_hf_folder.py +0 -68
  147. huggingface_hub-0.31.0rc0.dist-info/RECORD +0 -135
  148. huggingface_hub-0.31.0rc0.dist-info/entry_points.txt +0 -6
  149. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
  150. {huggingface_hub-0.31.0rc0.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
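The rest of this diff is the reworked `huggingface_hub/hf_file_system.py`, which drops `requests` in favor of `httpx`, caches `HfFileSystem` instances per constructor arguments, and rewrites streaming reads. For orientation only (this snippet is not part of the diff), a minimal `HfFileSystem` usage sketch follows; the repository id is a placeholder.

    # Minimal sketch of the public HfFileSystem surface touched by the diff below.
    # "username/my-dataset" is a placeholder repo id, not taken from the diff.
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()  # uses the locally configured token, if any

    # List files in a dataset repository (detail=False returns plain paths)
    paths = fs.ls("datasets/username/my-dataset", detail=False)

    # block_size=0 selects the streaming file implementation (HfFileSystemStreamFile)
    with fs.open("datasets/username/my-dataset/data.csv", "rb", block_size=0) as f:
        header = f.read(1024)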
@@ -1,25 +1,29 @@
  import os
  import re
  import tempfile
+ import threading
  from collections import deque
+ from contextlib import ExitStack
+ from copy import deepcopy
  from dataclasses import dataclass, field
  from datetime import datetime
  from itertools import chain
  from pathlib import Path
- from typing import Any, Dict, Iterator, List, NoReturn, Optional, Tuple, Union
+ from typing import Any, Iterator, NoReturn, Optional, Union
  from urllib.parse import quote, unquote

  import fsspec
+ import httpx
  from fsspec.callbacks import _DEFAULT_CALLBACK, NoOpCallback, TqdmCallback
  from fsspec.utils import isfilelike
- from requests import Response

  from . import constants
  from ._commit_api import CommitOperationCopy, CommitOperationDelete
- from .errors import EntryNotFoundError, RepositoryNotFoundError, RevisionNotFoundError
+ from .errors import EntryNotFoundError, HfHubHTTPError, RepositoryNotFoundError, RevisionNotFoundError
  from .file_download import hf_hub_url, http_get
  from .hf_api import HfApi, LastCommitInfo, RepoFile
- from .utils import HFValidationError, hf_raise_for_status, http_backoff
+ from .utils import HFValidationError, hf_raise_for_status, http_backoff, http_stream_backoff
+ from .utils.insecure_hashlib import md5


  # Regex used to match special revisions with "/" in them (see #1710)
@@ -55,17 +59,68 @@ class HfFileSystemResolvedPath:
  return f"{repo_path}/{self.path_in_repo}".rstrip("/")


- class HfFileSystem(fsspec.AbstractFileSystem):
- """
- Access a remote Hugging Face Hub repository as if were a local file system.
+ # We need to improve fsspec.spec._Cached which is AbstractFileSystem's metaclass
+ _cached_base: Any = type(fsspec.AbstractFileSystem)

- <Tip warning={true}>

- [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading
- Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility
- layer. For better performance and reliability, it's recommended to use `HfApi` methods when possible.
+ class _Cached(_cached_base):
+ """
+ Metaclass for caching HfFileSystem instances according to the args.
+
+ This creates an additional reference to the filesystem, which prevents the
+ filesystem from being garbage collected when all *user* references go away.
+ A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also*
+ be made for a filesystem instance to be garbage collected.
+
+ This is a slightly modified version of `fsspec.spec._Cached` to improve it.
+ In particular in `_tokenize` the pid isn't taken into account for the
+ `fs_token` used to identify cached instances. The `fs_token` logic is also
+ robust to defaults values and the order of the args. Finally new instances
+ reuse the states from sister instances in the main thread.
+ """

- </Tip>
+ def __init__(cls, *args, **kwargs):
+ # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L53
+ super().__init__(*args, **kwargs)
+ # Note: we intentionally create a reference here, to avoid garbage
+ # collecting instances when all other references are gone. To really
+ # delete a FileSystem, the cache must be cleared.
+ cls._cache = {}
+
+ def __call__(cls, *args, **kwargs):
+ # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L65
+ skip = kwargs.pop("skip_instance_cache", False)
+ fs_token = cls._tokenize(cls, threading.get_ident(), *args, **kwargs)
+ fs_token_main_thread = cls._tokenize(cls, threading.main_thread().ident, *args, **kwargs)
+ if not skip and cls.cachable and fs_token in cls._cache:
+ # reuse cached instance
+ cls._latest = fs_token
+ return cls._cache[fs_token]
+ else:
+ # create new instance
+ obj = type.__call__(cls, *args, **kwargs)
+ if not skip and cls.cachable and fs_token_main_thread in cls._cache:
+ # reuse the cache from the main thread instance in the new instance
+ instance_state = cls._cache[fs_token_main_thread]._get_instance_state()
+ for attr, state_value in instance_state.items():
+ setattr(obj, attr, state_value)
+ obj._fs_token_ = fs_token
+ obj.storage_args = args
+ obj.storage_options = kwargs
+ if cls.cachable and not skip:
+ cls._latest = fs_token
+ cls._cache[fs_token] = obj
+ return obj
+
+
+ class HfFileSystem(fsspec.AbstractFileSystem, metaclass=_Cached):
+ """
+ Access a remote Hugging Face Hub repository as if were a local file system.
+
+ > [!WARNING]
+ > [`HfFileSystem`] provides fsspec compatibility, which is useful for libraries that require it (e.g., reading
+ > Hugging Face datasets directly with `pandas`). However, it introduces additional overhead due to this compatibility
+ > layer. For better performance and reliability, it's recommended to use `HfApi` methods when possible.

  Args:
  token (`str` or `bool`, *optional*):
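The `_Cached` metaclass above caches `HfFileSystem` instances per constructor arguments, ignoring the process id and the order of keyword arguments. A hedged illustration of that behavior, inferred from this hunk only and assuming the default endpoint is `https://huggingface.co` (i.e. `HF_ENDPOINT` is not overridden):

    # Equivalent constructor args resolve to the same cached instance
    # (_tokenize normalizes the endpoint/token defaults before hashing).
    from huggingface_hub import HfFileSystem

    fs_a = HfFileSystem()
    fs_b = HfFileSystem(endpoint="https://huggingface.co")  # explicit default endpoint
    assert fs_a is fs_b

    # skip_instance_cache is popped in _Cached.__call__ and bypasses the cache
    fs_c = HfFileSystem(skip_instance_cache=True)
    assert fs_c is not fs_a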
@@ -104,22 +159,38 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  *args,
  endpoint: Optional[str] = None,
  token: Union[bool, str, None] = None,
+ block_size: Optional[int] = None,
  **storage_options,
  ):
  super().__init__(*args, **storage_options)
  self.endpoint = endpoint or constants.ENDPOINT
  self.token = token
  self._api = HfApi(endpoint=endpoint, token=token)
+ self.block_size = block_size
  # Maps (repo_type, repo_id, revision) to a 2-tuple with:
  # * the 1st element indicating whether the repositoy and the revision exist
  # * the 2nd element being the exception raised if the repository or revision doesn't exist
- self._repo_and_revision_exists_cache: Dict[
- Tuple[str, str, Optional[str]], Tuple[bool, Optional[Exception]]
+ self._repo_and_revision_exists_cache: dict[
+ tuple[str, str, Optional[str]], tuple[bool, Optional[Exception]]
  ] = {}
+ # Maps parent directory path to path infos
+ self.dircache: dict[str, list[dict[str, Any]]] = {}
+
+ @classmethod
+ def _tokenize(cls, threading_ident: int, *args, **kwargs) -> str:
+ """Deterministic token for caching"""
+ # make fs_token robust to default values and to kwargs order
+ kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT
+ kwargs["token"] = kwargs.get("token")
+ kwargs = {key: kwargs[key] for key in sorted(kwargs)}
+ # contrary to fsspec, we don't include pid here
+ tokenize_args = (cls, threading_ident, args, kwargs)
+ h = md5(str(tokenize_args).encode())
+ return h.hexdigest()

  def _repo_and_revision_exist(
  self, repo_type: str, repo_id: str, revision: Optional[str]
- ) -> Tuple[bool, Optional[Exception]]:
+ ) -> tuple[bool, Optional[Exception]]:
  if (repo_type, repo_id, revision) not in self._repo_and_revision_exists_cache:
  try:
  self._api.repo_info(
@@ -267,12 +338,15 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  block_size: Optional[int] = None,
  **kwargs,
  ) -> "HfFileSystemFile":
+ block_size = block_size if block_size is not None else self.block_size
+ if block_size is not None:
+ kwargs["block_size"] = block_size
  if "a" in mode:
  raise NotImplementedError("Appending to remote files is not yet supported.")
  if block_size == 0:
- return HfFileSystemStreamFile(self, path, mode=mode, revision=revision, block_size=block_size, **kwargs)
+ return HfFileSystemStreamFile(self, path, mode=mode, revision=revision, **kwargs)
  else:
- return HfFileSystemFile(self, path, mode=mode, revision=revision, block_size=block_size, **kwargs)
+ return HfFileSystemFile(self, path, mode=mode, revision=revision, **kwargs)

  def _rm(self, path: str, revision: Optional[str] = None, **kwargs) -> None:
  resolved_path = self.resolve_path(path, revision=revision)
@@ -300,11 +374,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):

  For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.rm).

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.delete_file()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.delete_file()` for better performance.

  Args:
  path (`str`):
@@ -338,17 +409,14 @@ class HfFileSystem(fsspec.AbstractFileSystem):

  def ls(
  self, path: str, detail: bool = True, refresh: bool = False, revision: Optional[str] = None, **kwargs
- ) -> List[Union[str, Dict[str, Any]]]:
+ ) -> list[Union[str, dict[str, Any]]]:
  """
  List the contents of a directory.

  For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.ls).

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.list_repo_tree()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.list_repo_tree()` for better performance.

  Args:
  path (`str`):
@@ -362,12 +430,11 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  The git revision to list from.

  Returns:
- `List[Union[str, Dict[str, Any]]]`: List of file paths (if detail=False) or list of file information
+ `list[Union[str, dict[str, Any]]]`: List of file paths (if detail=False) or list of file information
  dictionaries (if detail=True).
  """
  resolved_path = self.resolve_path(path, revision=revision)
  path = resolved_path.unresolve()
- kwargs = {"expand_info": detail, **kwargs}
  try:
  out = self._ls_tree(path, refresh=refresh, revision=revision, **kwargs)
  except EntryNotFoundError:
@@ -386,7 +453,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  recursive: bool = False,
  refresh: bool = False,
  revision: Optional[str] = None,
- expand_info: bool = True,
+ expand_info: bool = False,
+ maxdepth: Optional[int] = None,
  ):
  resolved_path = self.resolve_path(path, revision=revision)
  path = resolved_path.unresolve()
@@ -406,19 +474,25 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  if recursive:
  # Use BFS to traverse the cache and build the "recursive "output
  # (The Hub uses a so-called "tree first" strategy for the tree endpoint but we sort the output to follow the spec so the result is (eventually) the same)
+ depth = 2
  dirs_to_visit = deque(
- [path_info for path_info in cached_path_infos if path_info["type"] == "directory"]
+ [(depth, path_info) for path_info in cached_path_infos if path_info["type"] == "directory"]
  )
  while dirs_to_visit:
- dir_info = dirs_to_visit.popleft()
- if dir_info["name"] not in self.dircache:
- dirs_not_in_dircache.append(dir_info["name"])
- else:
- cached_path_infos = self.dircache[dir_info["name"]]
- out.extend(cached_path_infos)
- dirs_to_visit.extend(
- [path_info for path_info in cached_path_infos if path_info["type"] == "directory"]
- )
+ depth, dir_info = dirs_to_visit.popleft()
+ if maxdepth is None or depth <= maxdepth:
+ if dir_info["name"] not in self.dircache:
+ dirs_not_in_dircache.append(dir_info["name"])
+ else:
+ cached_path_infos = self.dircache[dir_info["name"]]
+ out.extend(cached_path_infos)
+ dirs_to_visit.extend(
+ [
+ (depth + 1, path_info)
+ for path_info in cached_path_infos
+ if path_info["type"] == "directory"
+ ]
+ )

  dirs_not_expanded = []
  if expand_info:
@@ -437,8 +511,11 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  or common_prefix in chain(dirs_not_in_dircache, dirs_not_expanded)
  else self._parent(common_prefix)
  )
+ if maxdepth is not None:
+ common_path_depth = common_path[len(path) :].count("/")
+ maxdepth -= common_path_depth
  out = [o for o in out if not o["name"].startswith(common_path + "/")]
- for cached_path in self.dircache:
+ for cached_path in list(self.dircache):
  if cached_path.startswith(common_path + "/"):
  self.dircache.pop(cached_path, None)
  self.dircache.pop(common_path, None)
@@ -449,6 +526,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  refresh=True,
  revision=revision,
  expand_info=expand_info,
+ maxdepth=maxdepth,
  )
  )
  else:
@@ -461,9 +539,10 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  repo_type=resolved_path.repo_type,
  )
  for path_info in tree:
+ cache_path = root_path + "/" + path_info.path
  if isinstance(path_info, RepoFile):
  cache_path_info = {
- "name": root_path + "/" + path_info.path,
+ "name": cache_path,
  "size": path_info.size,
  "type": "file",
  "blob_id": path_info.blob_id,
@@ -473,7 +552,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  }
  else:
  cache_path_info = {
- "name": root_path + "/" + path_info.path,
+ "name": cache_path,
  "size": 0,
  "type": "directory",
  "tree_id": path_info.tree_id,
@@ -481,10 +560,12 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  }
  parent_path = self._parent(cache_path_info["name"])
  self.dircache.setdefault(parent_path, []).append(cache_path_info)
- out.append(cache_path_info)
+ depth = cache_path[len(path) :].count("/")
+ if maxdepth is None or depth <= maxdepth:
+ out.append(cache_path_info)
  return out

- def walk(self, path: str, *args, **kwargs) -> Iterator[Tuple[str, List[str], List[str]]]:
+ def walk(self, path: str, *args, **kwargs) -> Iterator[tuple[str, list[str], list[str]]]:
  """
  Return all files below the given path.

@@ -495,14 +576,12 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  Root path to list files from.

  Returns:
- `Iterator[Tuple[str, List[str], List[str]]]`: An iterator of (path, list of directory names, list of file names) tuples.
+ `Iterator[tuple[str, list[str], list[str]]]`: An iterator of (path, list of directory names, list of file names) tuples.
  """
- # Set expand_info=False by default to get a x10 speed boost
- kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
  path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
  yield from super().walk(path, *args, **kwargs)

- def glob(self, path: str, **kwargs) -> List[str]:
+ def glob(self, path: str, **kwargs) -> list[str]:
  """
  Find files by glob-matching.

@@ -513,10 +592,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  Path pattern to match.

  Returns:
- `List[str]`: List of paths matching the pattern.
+ `list[str]`: List of paths matching the pattern.
  """
- # Set expand_info=False by default to get a x10 speed boost
- kwargs = {"expand_info": kwargs.get("detail", False), **kwargs}
  path = self.resolve_path(path, revision=kwargs.get("revision")).unresolve()
  return super().glob(path, **kwargs)

@@ -529,7 +606,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  refresh: bool = False,
  revision: Optional[str] = None,
  **kwargs,
- ) -> Union[List[str], Dict[str, Dict[str, Any]]]:
+ ) -> Union[list[str], dict[str, dict[str, Any]]]:
  """
  List all files below path.

@@ -550,22 +627,24 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  The git revision to list from.

  Returns:
- `Union[List[str], Dict[str, Dict[str, Any]]]`: List of paths or dict of file information.
+ `Union[list[str], dict[str, dict[str, Any]]]`: List of paths or dict of file information.
  """
- if maxdepth:
- return super().find(
- path, maxdepth=maxdepth, withdirs=withdirs, detail=detail, refresh=refresh, revision=revision, **kwargs
- )
+ if maxdepth is not None and maxdepth < 1:
+ raise ValueError("maxdepth must be at least 1")
  resolved_path = self.resolve_path(path, revision=revision)
  path = resolved_path.unresolve()
- kwargs = {"expand_info": detail, **kwargs}
  try:
- out = self._ls_tree(path, recursive=True, refresh=refresh, revision=resolved_path.revision, **kwargs)
+ out = self._ls_tree(
+ path, recursive=True, refresh=refresh, revision=resolved_path.revision, maxdepth=maxdepth, **kwargs
+ )
  except EntryNotFoundError:
  # Path could be a file
- if self.info(path, revision=revision, **kwargs)["type"] == "file":
- out = {path: {}}
- else:
+ try:
+ if self.info(path, revision=revision, **kwargs)["type"] == "file":
+ out = {path: {}}
+ else:
+ out = {}
+ except FileNotFoundError:
  out = {}
  else:
  if not withdirs:
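The hunk above teaches `find()` to handle `maxdepth` natively instead of falling back to the generic fsspec implementation, and to reject values below 1. A brief, hedged usage sketch with a placeholder repo id, following only the behavior shown in this hunk:

    # maxdepth is now validated and forwarded to _ls_tree.
    # "username/my-model" is a placeholder repo id.
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()
    top_level = fs.find("username/my-model", maxdepth=1, withdirs=True)

    try:
        fs.find("username/my-model", maxdepth=0)
    except ValueError:
        pass  # "maxdepth must be at least 1"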
@@ -585,11 +664,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  """
  Copy a file within or between repositories.

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.upload_file()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.upload_file()` for better performance.

  Args:
  path1 (`str`):
@@ -653,20 +729,17 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  Returns:
  `datetime`: Last commit date of the file.
  """
- info = self.info(path, **kwargs)
+ info = self.info(path, **{**kwargs, "expand_info": True}) # type: ignore
  return info["last_commit"]["date"]

- def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> Dict[str, Any]:
+ def info(self, path: str, refresh: bool = False, revision: Optional[str] = None, **kwargs) -> dict[str, Any]:
  """
  Get information about a file or directory.

  For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.info).

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.get_paths_info()` or `HfApi.repo_info()` for better performance.

  Args:
  path (`str`):
@@ -677,13 +750,13 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  The git revision to get info from.

  Returns:
- `Dict[str, Any]`: Dictionary containing file information (type, size, commit info, etc.).
+ `dict[str, Any]`: Dictionary containing file information (type, size, commit info, etc.).

  """
  resolved_path = self.resolve_path(path, revision=revision)
  path = resolved_path.unresolve()
  expand_info = kwargs.get(
- "expand_info", True
+ "expand_info", False
  ) # don't expose it as a parameter in the public API to follow the spec
  if not resolved_path.path_in_repo:
  # Path is the root directory
@@ -691,6 +764,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  "name": path,
  "size": 0,
  "type": "directory",
+ "last_commit": None,
  }
  if expand_info:
  last_commit = self._api.list_repo_commits(
@@ -708,7 +782,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  parent_path = self._parent(path)
  if not expand_info and parent_path not in self.dircache:
  # Fill the cache with cheap call
- self.ls(parent_path, expand_info=False)
+ self.ls(parent_path)
  if parent_path in self.dircache:
  # Check if the path is in the cache
  out1 = [o for o in self.dircache[parent_path] if o["name"] == path]
@@ -762,11 +836,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):

  For more details, refer to [fsspec documentation](https://filesystem-spec.readthedocs.io/en/latest/api.html#fsspec.spec.AbstractFileSystem.exists).

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.file_exists()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.file_exists()` for better performance.

  Args:
  path (`str`):
@@ -779,7 +850,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  if kwargs.get("refresh", False):
  self.invalidate_cache(path)

- self.info(path, **{**kwargs, "expand_info": False})
+ self.info(path, **kwargs)
  return True
  except: # noqa: E722
  return False
@@ -798,7 +869,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  `bool`: True if path is a directory, False otherwise.
  """
  try:
- return self.info(path, expand_info=False)["type"] == "directory"
+ return self.info(path)["type"] == "directory"
  except OSError:
  return False

@@ -816,7 +887,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  `bool`: True if path is a file, False otherwise.
  """
  try:
- return self.info(path, expand_info=False)["type"] == "file"
+ return self.info(path)["type"] == "file"
  except: # noqa: E722
  return False

@@ -847,11 +918,8 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  """
  Copy single remote file to local.

- <Tip warning={true}>
-
- Note: When possible, use `HfApi.hf_hub_download()` for better performance.
-
- </Tip>
+ > [!WARNING]
+ > Note: When possible, use `HfApi.hf_hub_download()` for better performance.

  Args:
  rpath (`str`):
@@ -901,7 +969,7 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  repo_type=resolve_remote_path.repo_type,
  endpoint=self.endpoint,
  ),
- temp_file=outfile,
+ temp_file=outfile, # type: ignore[arg-type]
  displayed_filename=rpath,
  expected_size=expected_size,
  resume_size=0,
@@ -931,6 +999,21 @@ class HfFileSystem(fsspec.AbstractFileSystem):
  # See https://github.com/huggingface/huggingface_hub/issues/1733
  raise NotImplementedError("Transactional commits are not supported.")

+ def __reduce__(self):
+ # re-populate the instance cache at HfFileSystem._cache and re-populate the state of every instance
+ return make_instance, (
+ type(self),
+ self.storage_args,
+ self.storage_options,
+ self._get_instance_state(),
+ )
+
+ def _get_instance_state(self):
+ return {
+ "dircache": deepcopy(self.dircache),
+ "_repo_and_revision_exists_cache": deepcopy(self._repo_and_revision_exists_cache),
+ }
+

  class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
  def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs):
@@ -942,9 +1025,6 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
  f"{e}.\nMake sure the repository and revision exist before writing data."
  ) from e
  raise
- # avoid an unnecessary .info() call with expensive expand_info=True to instantiate .details
- if kwargs.get("mode", "rb") == "rb":
- self.details = fs.info(self.resolved_path.unresolve(), expand_info=False)
  super().__init__(fs, self.resolved_path.unresolve(), **kwargs)
  self.fs: HfFileSystem

@@ -966,13 +1046,7 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
  repo_type=self.resolved_path.repo_type,
  endpoint=self.fs.endpoint,
  )
- r = http_backoff(
- "GET",
- url,
- headers=headers,
- retry_on_status_codes=(500, 502, 503, 504),
- timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
- )
+ r = http_backoff("GET", url, headers=headers, timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT)
  hf_raise_for_status(r)
  return r.content

@@ -1003,13 +1077,14 @@ class HfFileSystemFile(fsspec.spec.AbstractBufferedFile):
  def read(self, length=-1):
  """Read remote file.

- If `length` is not provided or is -1, the entire file is downloaded and read. On POSIX systems and if
- `hf_transfer` is not enabled, the file is loaded in memory directly. Otherwise, the file is downloaded to a
- temporary file and read from there.
+ If `length` is not provided or is -1, the entire file is downloaded and read. On POSIX systems the file is
+ loaded in memory directly. Otherwise, the file is downloaded to a temporary file and read from there.
  """
  if self.mode == "rb" and (length is None or length == -1) and self.loc == 0:
  with self.fs.open(self.path, "rb", block_size=0) as f: # block_size=0 enables fast streaming
- return f.read()
+ out = f.read()
+ self.loc += len(out)
+ return out
  return super().read(length)

  def url(self) -> str:
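Per the updated docstring above, a full `read()` on `HfFileSystemFile` re-opens the path with `block_size=0`, which selects the streaming file class. Opening with `block_size=0` directly gives the same streaming behavior; a hedged sketch with a placeholder repo id:

    # Streaming reads via block_size=0 (routes to HfFileSystemStreamFile, see the _open hunk earlier).
    # "username/my-dataset" is a placeholder repo id.
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()
    with fs.open("datasets/username/my-dataset/data.bin", "rb", block_size=0) as f:
        first_chunk = f.read(1024 * 1024)  # read from the streamed response
        rest = f.read()                    # continues on the same connection when possible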
@@ -1045,8 +1120,9 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
  super().__init__(
  fs, self.resolved_path.unresolve(), mode=mode, block_size=block_size, cache_type=cache_type, **kwargs
  )
- self.response: Optional[Response] = None
+ self.response: Optional[httpx.Response] = None
  self.fs: HfFileSystem
+ self._exit_stack = ExitStack()

  def seek(self, loc: int, whence: int = 0):
  if loc == 0 and whence == 1:
@@ -1056,53 +1132,32 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
  raise ValueError("Cannot seek streaming HF file")

  def read(self, length: int = -1):
- read_args = (length,) if length >= 0 else ()
- if self.response is None or self.response.raw.isclosed():
- url = hf_hub_url(
- repo_id=self.resolved_path.repo_id,
- revision=self.resolved_path.revision,
- filename=self.resolved_path.path_in_repo,
- repo_type=self.resolved_path.repo_type,
- endpoint=self.fs.endpoint,
- )
- self.response = http_backoff(
- "GET",
- url,
- headers=self.fs._api._build_hf_headers(),
- retry_on_status_codes=(500, 502, 503, 504),
- stream=True,
- timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
- )
- hf_raise_for_status(self.response)
- try:
- out = self.response.raw.read(*read_args)
- except Exception:
- self.response.close()
+ """Read the remote file.

- # Retry by recreating the connection
- url = hf_hub_url(
- repo_id=self.resolved_path.repo_id,
- revision=self.resolved_path.revision,
- filename=self.resolved_path.path_in_repo,
- repo_type=self.resolved_path.repo_type,
- endpoint=self.fs.endpoint,
- )
- self.response = http_backoff(
- "GET",
- url,
- headers={"Range": "bytes=%d-" % self.loc, **self.fs._api._build_hf_headers()},
- retry_on_status_codes=(500, 502, 503, 504),
- stream=True,
- timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
- )
- hf_raise_for_status(self.response)
+ If the file is already open, we reuse the connection.
+ Otherwise, open a new connection and read from it.
+
+ If reading the stream fails, we retry with a new connection.
+ """
+ if self.response is None:
+ self._open_connection()
+
+ retried_once = False
+ while True:
  try:
- out = self.response.raw.read(*read_args)
+ if self.response is None:
+ return b"" # Already read the entire file
+ out = _partial_read(self.response, length)
+ self.loc += len(out)
+ return out
  except Exception:
- self.response.close()
- raise
- self.loc += len(out)
- return out
+ if self.response is not None:
+ self.response.close()
+ if retried_once: # Already retried once, give up
+ raise
+ # First failure, retry with range header
+ self._open_connection()
+ retried_once = True

  def url(self) -> str:
  return self.fs.url(self.path)
@@ -1111,11 +1166,43 @@ class HfFileSystemStreamFile(fsspec.spec.AbstractBufferedFile):
  if not hasattr(self, "resolved_path"):
  # Means that the constructor failed. Nothing to do.
  return
+ self._exit_stack.close()
  return super().__del__()

  def __reduce__(self):
  return reopen, (self.fs, self.path, self.mode, self.blocksize, self.cache.name)

+ def _open_connection(self):
+ """Open a connection to the remote file."""
+ url = hf_hub_url(
+ repo_id=self.resolved_path.repo_id,
+ revision=self.resolved_path.revision,
+ filename=self.resolved_path.path_in_repo,
+ repo_type=self.resolved_path.repo_type,
+ endpoint=self.fs.endpoint,
+ )
+ headers = self.fs._api._build_hf_headers()
+ if self.loc > 0:
+ headers["Range"] = f"bytes={self.loc}-"
+ self.response = self._exit_stack.enter_context(
+ http_stream_backoff(
+ "GET",
+ url,
+ headers=headers,
+ retry_on_status_codes=(500, 502, 503, 504),
+ timeout=constants.HF_HUB_DOWNLOAD_TIMEOUT,
+ )
+ )
+
+ try:
+ hf_raise_for_status(self.response)
+ except HfHubHTTPError as e:
+ if e.response.status_code == 416:
+ # Range not satisfiable => means that we have already read the entire file
+ self.response = None
+ return
+ raise
+

  def safe_revision(revision: str) -> str:
  return revision if SPECIAL_REFS_REVISION_REGEX.match(revision) else safe_quote(revision)
@@ -1138,3 +1225,33 @@ def _raise_file_not_found(path: str, err: Optional[Exception]) -> NoReturn:

  def reopen(fs: HfFileSystem, path: str, mode: str, block_size: int, cache_type: str):
  return fs.open(path, mode=mode, block_size=block_size, cache_type=cache_type)
+
+
+ def _partial_read(response: httpx.Response, length: int = -1) -> bytes:
+ """
+ Read up to `length` bytes from a streamed response.
+ If length == -1, read until EOF.
+ """
+ buf = bytearray()
+ if length < -1:
+ raise ValueError("length must be -1 or >= 0")
+ if length == 0:
+ return b""
+ if length == -1:
+ for chunk in response.iter_bytes():
+ buf.extend(chunk)
+ return bytes(buf)
+
+ for chunk in response.iter_bytes(chunk_size=length):
+ buf.extend(chunk)
+ if len(buf) >= length:
+ return bytes(buf[:length])
+
+ return bytes(buf) # may be < length if response ended
+
+
+ def make_instance(cls, args, kwargs, instance_state):
+ fs = cls(*args, **kwargs)
+ for attr, state_value in instance_state.items():
+ setattr(fs, attr, state_value)
+ return fs
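The new `__reduce__`/`make_instance` pair above makes `HfFileSystem` instances picklable together with their directory and revision caches, so a warmed filesystem can be shipped to worker processes without re-listing. A hedged sketch of what this enables, with a placeholder repo id:

    # Pickling round-trips the caches captured by _get_instance_state().
    # "username/my-dataset" is a placeholder repo id.
    import pickle
    from huggingface_hub import HfFileSystem

    fs = HfFileSystem()
    fs.ls("datasets/username/my-dataset", detail=False)  # warms fs.dircache

    restored = pickle.loads(pickle.dumps(fs))
    assert restored.dircache == fs.dircache  # state re-applied by make_instance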