huggingface-hub 0.20.3__py3-none-any.whl → 0.21.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (35)
  1. huggingface_hub/__init__.py +19 -1
  2. huggingface_hub/_commit_api.py +49 -20
  3. huggingface_hub/_inference_endpoints.py +10 -0
  4. huggingface_hub/_login.py +2 -2
  5. huggingface_hub/commands/download.py +1 -1
  6. huggingface_hub/file_download.py +57 -21
  7. huggingface_hub/hf_api.py +269 -54
  8. huggingface_hub/hf_file_system.py +131 -8
  9. huggingface_hub/hub_mixin.py +204 -42
  10. huggingface_hub/inference/_client.py +56 -9
  11. huggingface_hub/inference/_common.py +4 -3
  12. huggingface_hub/inference/_generated/_async_client.py +57 -9
  13. huggingface_hub/inference/_text_generation.py +5 -0
  14. huggingface_hub/inference/_types.py +17 -0
  15. huggingface_hub/lfs.py +6 -3
  16. huggingface_hub/repocard.py +5 -3
  17. huggingface_hub/repocard_data.py +11 -3
  18. huggingface_hub/serialization/__init__.py +19 -0
  19. huggingface_hub/serialization/_base.py +168 -0
  20. huggingface_hub/serialization/_numpy.py +67 -0
  21. huggingface_hub/serialization/_tensorflow.py +93 -0
  22. huggingface_hub/serialization/_torch.py +199 -0
  23. huggingface_hub/templates/datasetcard_template.md +1 -1
  24. huggingface_hub/templates/modelcard_template.md +1 -4
  25. huggingface_hub/utils/__init__.py +14 -10
  26. huggingface_hub/utils/_datetime.py +4 -11
  27. huggingface_hub/utils/_errors.py +29 -0
  28. huggingface_hub/utils/_runtime.py +21 -15
  29. huggingface_hub/utils/endpoint_helpers.py +27 -1
  30. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/METADATA +7 -3
  31. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/RECORD +35 -30
  32. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/LICENSE +0 -0
  33. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/WHEEL +0 -0
  34. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/entry_points.txt +0 -0
  35. {huggingface_hub-0.20.3.dist-info → huggingface_hub-0.21.0.dist-info}/top_level.txt +0 -0
huggingface_hub/__init__.py CHANGED
@@ -46,7 +46,7 @@ import sys
 from typing import TYPE_CHECKING
 
 
-__version__ = "0.20.3"
+__version__ = "0.21.0"
 
 # Alphabetical order of definitions is ensured in tests
 # WARNING: any comment added in this dictionary definition will be lost when
@@ -227,6 +227,7 @@ _SUBMOD_ATTRS = {
         "request_space_storage",
         "restart_space",
         "resume_inference_endpoint",
+        "revision_exists",
         "run_as_future",
         "scale_to_zero_inference_endpoint",
         "set_space_sleep_time",
@@ -245,6 +246,7 @@ _SUBMOD_ATTRS = {
         "HfFileSystem",
         "HfFileSystemFile",
         "HfFileSystemResolvedPath",
+        "HfFileSystemStreamFile",
     ],
     "hub_mixin": [
         "ModelHubMixin",
@@ -286,6 +288,13 @@ _SUBMOD_ATTRS = {
     "repository": [
         "Repository",
     ],
+    "serialization": [
+        "StateDictSplit",
+        "split_numpy_state_dict_into_shards",
+        "split_state_dict_into_shards_factory",
+        "split_tf_state_dict_into_shards",
+        "split_torch_state_dict_into_shards",
+    ],
     "utils": [
         "CacheNotFound",
         "CachedFileInfo",
@@ -574,6 +583,7 @@ if TYPE_CHECKING:  # pragma: no cover
        request_space_storage,  # noqa: F401
        restart_space,  # noqa: F401
        resume_inference_endpoint,  # noqa: F401
+       revision_exists,  # noqa: F401
        run_as_future,  # noqa: F401
        scale_to_zero_inference_endpoint,  # noqa: F401
        set_space_sleep_time,  # noqa: F401
@@ -592,6 +602,7 @@ if TYPE_CHECKING:  # pragma: no cover
        HfFileSystem,  # noqa: F401
        HfFileSystemFile,  # noqa: F401
        HfFileSystemResolvedPath,  # noqa: F401
+       HfFileSystemStreamFile,  # noqa: F401
    )
    from .hub_mixin import (
        ModelHubMixin,  # noqa: F401
@@ -627,6 +638,13 @@ if TYPE_CHECKING:  # pragma: no cover
        SpaceCardData,  # noqa: F401
    )
    from .repository import Repository  # noqa: F401
+   from .serialization import (
+       StateDictSplit,  # noqa: F401
+       split_numpy_state_dict_into_shards,  # noqa: F401
+       split_state_dict_into_shards_factory,  # noqa: F401
+       split_tf_state_dict_into_shards,  # noqa: F401
+       split_torch_state_dict_into_shards,  # noqa: F401
+   )
    from .utils import (
        CachedFileInfo,  # noqa: F401
        CachedRepoInfo,  # noqa: F401
huggingface_hub/_commit_api.py CHANGED
@@ -17,6 +17,7 @@ from tqdm.contrib.concurrent import thread_map
 from huggingface_hub import get_session
 
 from .constants import ENDPOINT, HF_HUB_ENABLE_HF_TRANSFER
+from .file_download import hf_hub_url
 from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
 from .utils import (
     EntryNotFoundError,
@@ -521,16 +522,19 @@ def _fetch_upload_modes(
 
 
 @validate_hf_hub_args
-def _fetch_lfs_files_to_copy(
+def _fetch_files_to_copy(
     copies: Iterable[CommitOperationCopy],
     repo_type: str,
     repo_id: str,
     token: Optional[str],
     revision: str,
     endpoint: Optional[str] = None,
-) -> Dict[Tuple[str, Optional[str]], "RepoFile"]:
+) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
     """
-    Requests the Hub files information of the LFS files to be copied, including their sha256.
+    Fetch information about the files to copy.
+
+    For LFS files, we only need their metadata (file size and sha256) while for regular files
+    we need to download the raw content from the Hub.
 
     Args:
         copies (`Iterable` of :class:`CommitOperationCopy`):
@@ -546,8 +550,9 @@ def _fetch_lfs_files_to_copy(
         revision (`str`):
             The git revision to upload the files to. Can be any valid git revision.
 
-    Returns: `Dict[Tuple[str, Optional[str]], RepoFile]]`
-        Key is the file path and revision of the file to copy, value is the repo file.
+    Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]]`
+        Key is the file path and revision of the file to copy.
+        Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).
 
     Raises:
         [`~utils.HfHubHTTPError`]
@@ -558,7 +563,7 @@ def _fetch_lfs_files_to_copy(
     from .hf_api import HfApi, RepoFolder
 
     hf_api = HfApi(endpoint=endpoint, token=token)
-    files_to_copy = {}
+    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
     for src_revision, operations in groupby(copies, key=lambda op: op.src_revision):
         operations = list(operations)  # type: ignore
         paths = [op.src_path_in_repo for op in operations]
@@ -572,9 +577,21 @@ def _fetch_lfs_files_to_copy(
         for src_repo_file in src_repo_files:
             if isinstance(src_repo_file, RepoFolder):
                 raise NotImplementedError("Copying a folder is not implemented.")
-            if not src_repo_file.lfs:
-                raise NotImplementedError("Copying a non-LFS file is not implemented")
-            files_to_copy[(src_repo_file.rfilename, src_revision)] = src_repo_file
+            if src_repo_file.lfs:
+                files_to_copy[(src_repo_file.path, src_revision)] = src_repo_file
+            else:
+                # TODO: (optimization) download regular files to copy concurrently
+                headers = build_hf_headers(token=token)
+                url = hf_hub_url(
+                    endpoint=endpoint,
+                    repo_type=repo_type,
+                    repo_id=repo_id,
+                    revision=src_revision or revision,
+                    filename=src_repo_file.path,
+                )
+                response = get_session().get(url, headers=headers)
+                hf_raise_for_status(response)
+                files_to_copy[(src_repo_file.path, src_revision)] = response.content
         for operation in operations:
             if (operation.src_path_in_repo, src_revision) not in files_to_copy:
                 raise EntryNotFoundError(
@@ -586,7 +603,7 @@ def _fetch_lfs_files_to_copy(
 
 def _prepare_commit_payload(
     operations: Iterable[CommitOperation],
-    files_to_copy: Dict[Tuple[str, Optional[str]], "RepoFile"],
+    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
     commit_message: str,
     commit_description: Optional[str] = None,
     parent_commit: Optional[str] = None,
@@ -649,16 +666,28 @@ def _prepare_commit_payload(
         # 2.d. Case copying a file or folder
         elif isinstance(operation, CommitOperationCopy):
             file_to_copy = files_to_copy[(operation.src_path_in_repo, operation.src_revision)]
-            if not file_to_copy.lfs:
-                raise NotImplementedError("Copying a non-LFS file is not implemented")
-            yield {
-                "key": "lfsFile",
-                "value": {
-                    "path": operation.path_in_repo,
-                    "algo": "sha256",
-                    "oid": file_to_copy.lfs["sha256"],
-                },
-            }
+            if isinstance(file_to_copy, bytes):
+                yield {
+                    "key": "file",
+                    "value": {
+                        "content": base64.b64encode(file_to_copy).decode(),
+                        "path": operation.path_in_repo,
+                        "encoding": "base64",
+                    },
+                }
+            elif file_to_copy.lfs:
+                yield {
+                    "key": "lfsFile",
+                    "value": {
+                        "path": operation.path_in_repo,
+                        "algo": "sha256",
+                        "oid": file_to_copy.lfs.sha256,
+                    },
+                }
+            else:
+                raise ValueError(
+                    "Malformed files_to_copy (should be raw file content as bytes or RepoFile objects with LFS info."
+                )
         # 2.e. Never expected to happen
         else:
             raise ValueError(
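Taken together, these two hunks let `CommitOperationCopy` handle regular files: small files are re-uploaded inline as base64, while LFS files are copied by sha256 reference. A sketch of the user-facing call that now works for non-LFS files (repo id and paths are illustrative):

```python
from huggingface_hub import CommitOperationCopy, HfApi

api = HfApi()
# Copying a regular (non-LFS) file previously raised NotImplementedError.
api.create_commit(
    repo_id="my-user/my-repo",  # illustrative
    operations=[CommitOperationCopy(src_path_in_repo="README.md", path_in_repo="docs/README.md")],
    commit_message="Duplicate README into docs/",
)
```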
huggingface_hub/_inference_endpoints.py CHANGED
@@ -192,6 +192,12 @@ class InferenceEndpoint:
 
         Returns:
             [`InferenceEndpoint`]: the same Inference Endpoint, mutated in place with the latest data.
+
+        Raises:
+            [`InferenceEndpointError`]
+                If the Inference Endpoint ended up in a failed state.
+            [`InferenceEndpointTimeoutError`]
+                If the Inference Endpoint is not deployed after `timeout` seconds.
         """
         if self.url is not None:  # Means the endpoint is deployed
             logger.info("Inference Endpoint is ready to be used.")
@@ -208,6 +214,10 @@ class InferenceEndpoint:
            if self.url is not None:  # Means the endpoint is deployed
                logger.info("Inference Endpoint is ready to be used.")
                return self
+           if self.status == InferenceEndpointStatus.FAILED:
+               raise InferenceEndpointError(
+                   f"Inference Endpoint {self.name} failed to deploy. Please check the logs for more information."
+               )
            if timeout is not None:
                if time.time() - start > timeout:
                    raise InferenceEndpointTimeoutError("Timeout while waiting for Inference Endpoint to be deployed.")
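`wait()` now fails fast when the endpoint reaches the FAILED state instead of spinning until the timeout. A hedged sketch of handling both outcomes, assuming the error classes are importable from the package root like the other `_inference_endpoints` names (the endpoint name is illustrative):

```python
from huggingface_hub import (
    InferenceEndpointError,
    InferenceEndpointTimeoutError,
    get_inference_endpoint,
)

endpoint = get_inference_endpoint("my-endpoint")  # illustrative name
try:
    endpoint.wait(timeout=300)
except InferenceEndpointError:
    print("Deployment failed; check the endpoint logs.")
except InferenceEndpointTimeoutError:
    print("Still not deployed after 5 minutes.")
```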
huggingface_hub/_login.py CHANGED
@@ -102,7 +102,7 @@ def login(
     if token is not None:
         if not add_to_git_credential:
             print(
-                "Token will not been saved to git credential helper. Pass"
+                "Token has not been saved to git credential helper. Pass"
                 " `add_to_git_credential=True` if you want to set the git"
                 " credential as well."
             )
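The notice above is only printed when a token is passed without `add_to_git_credential`. A sketch of the call that also stores the token for git and therefore skips the notice (the token value is a placeholder):

```python
from huggingface_hub import login

# "hf_xxx" is a placeholder, not a real token.
login(token="hf_xxx", add_to_git_credential=True)
```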
@@ -186,7 +186,7 @@ def interpreter_login(new_session: bool = True, write_permission: bool = False)
     print(" To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .")
     if os.name == "nt":
         print("Token can be pasted using 'Right-Click'.")
-    token = getpass("Token: ")
+    token = getpass("Enter your token (input will not be visible): ")
     add_to_git_credential = _ask_for_confirmation_no_tui("Add token as git credential?")
 
     _login(token=token, add_to_git_credential=add_to_git_credential, write_permission=write_permission)
huggingface_hub/commands/download.py CHANGED
@@ -64,7 +64,7 @@ class DownloadCommand(BaseHuggingfaceCLICommand):
             "--repo-type",
             choices=["model", "dataset", "space"],
             default="model",
-            help="Type of repo to download from (e.g. `dataset`).",
+            help="Type of repo to download from (defaults to 'model').",
         )
         download_parser.add_argument(
             "--revision",
huggingface_hub/file_download.py CHANGED
@@ -414,18 +414,38 @@ def http_get(
    url: str,
    temp_file: BinaryIO,
    *,
-   proxies=None,
+   proxies: Optional[Dict] = None,
    resume_size: float = 0,
    headers: Optional[Dict[str, str]] = None,
    expected_size: Optional[int] = None,
+   displayed_filename: Optional[str] = None,
    _nb_retries: int = 5,
-):
+) -> None:
    """
    Download a remote file. Do not gobble up errors, and will return errors tailored to the Hugging Face Hub.

    If ConnectionError (SSLError) or ReadTimeout happen while streaming data from the server, it is most likely a
    transient error (network outage?). We log a warning message and try to resume the download a few times before
    giving up. The method gives up after 5 attempts if no new data has being received from the server.
+
+   Args:
+       url (`str`):
+           The URL of the file to download.
+       temp_file (`BinaryIO`):
+           The file-like object where to save the file.
+       proxies (`dict`, *optional*):
+           Dictionary mapping protocol to the URL of the proxy passed to `requests.request`.
+       resume_size (`float`, *optional*):
+           The number of bytes already downloaded. If set to 0 (default), the whole file is download. If set to a
+           positive number, the download will resume at the given position.
+       headers (`dict`, *optional*):
+           Dictionary of HTTP Headers to send with the request.
+       expected_size (`int`, *optional*):
+           The expected size of the file to download. If set, the download will raise an error if the size of the
+           received content is different from the expected one.
+       displayed_filename (`str`, *optional*):
+           The filename of the file that is being downloaded. Value is used only to display a nice progress bar. If
+           not set, the filename is guessed from the URL or the `Content-Disposition` header.
    """
    hf_transfer = None
    if HF_HUB_ENABLE_HF_TRANSFER:
@@ -458,21 +478,22 @@ def http_get(
     # If the file is compressed, the number of bytes in the saved file will be higher than 'total'.
     total = resume_size + int(content_length) if content_length is not None else None
 
-    displayed_name = url
-    content_disposition = r.headers.get("Content-Disposition")
-    if content_disposition is not None:
-        match = HEADER_FILENAME_PATTERN.search(content_disposition)
-        if match is not None:
-            # Means file is on CDN
-            displayed_name = match.groupdict()["filename"]
+    if displayed_filename is None:
+        displayed_filename = url
+        content_disposition = r.headers.get("Content-Disposition")
+        if content_disposition is not None:
+            match = HEADER_FILENAME_PATTERN.search(content_disposition)
+            if match is not None:
+                # Means file is on CDN
+                displayed_filename = match.groupdict()["filename"]
 
     # Truncate filename if too long to display
-    if len(displayed_name) > 40:
-        displayed_name = f"(…){displayed_name[-40:]}"
+    if len(displayed_filename) > 40:
+        displayed_filename = f"(…){displayed_filename[-40:]}"
 
     consistency_error_message = (
         f"Consistency check failed: file should be of size {expected_size} but has size"
-        f" {{actual_size}} ({displayed_name}).\nWe are sorry for the inconvenience. Please retry download and"
+        f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry download and"
         " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
         " know by opening an issue on https://github.com/huggingface/huggingface_hub."
     )
@@ -483,8 +504,10 @@ def http_get(
         unit_scale=True,
         total=total,
         initial=resume_size,
-        desc=displayed_name,
-        disable=bool(logger.getEffectiveLevel() == logging.NOTSET),
+        desc=displayed_filename,
+        disable=True if (logger.getEffectiveLevel() == logging.NOTSET) else None,
+        # ^ set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
+        # see https://github.com/huggingface/huggingface_hub/pull/2000
     ) as progress:
         if hf_transfer and total is not None and total > 5 * DOWNLOAD_CHUNK_SIZE:
             supports_callback = "callback" in inspect.signature(hf_transfer.download).parameters
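The `disable=None` default leans on tqdm's documented behavior: `None` means "disable unless the stream is a TTY", whereas `False` always shows the bar. A standalone sketch:

```python
import sys

from tqdm import tqdm

# disable=None: the bar is shown on an interactive terminal but suppressed when
# output is redirected to a file or pipe, matching the new http_get default.
for _ in tqdm(range(3), disable=None, file=sys.stderr):
    pass
```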
@@ -1279,11 +1302,14 @@ def hf_hub_download(
             # In case of a redirect, save an extra redirect on the request.get call,
             # and ensure we download the exact atomic version even if it changed
             # between the HEAD and the GET (unlikely, but hey).
-            # Useful for lfs blobs that are stored on a CDN.
+            #
+            # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
+            # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
             if metadata.location != url:
                 url_to_download = metadata.location
-                # Remove authorization header when downloading a LFS blob
-                headers.pop("authorization", None)
+                if urlparse(url).netloc != urlparse(url_to_download).netloc:
+                    # Remove authorization header when downloading a LFS blob
+                    headers.pop("authorization", None)
     except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
         # Actually raise for those subclasses of ConnectionError
         raise
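The new rule compares hosts with `urlparse` so that credentials are dropped only when the redirect leaves the original host (a signed CDN URL needs no auth), while same-host redirects such as repo renames keep the header. A standalone sketch of the check (URLs are illustrative):

```python
from urllib.parse import urlparse

url = "https://huggingface.co/my-user/my-repo/resolve/main/config.json"  # illustrative
url_to_download = "https://cdn-lfs.huggingface.co/signed/abc123"  # illustrative redirect target
if urlparse(url).netloc != urlparse(url_to_download).netloc:
    print("Cross-host redirect: strip the authorization header.")
else:
    print("Same-host redirect: keep the authorization header.")
```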
@@ -1317,6 +1343,10 @@ def hf_hub_download(
             head_call_error = error
             pass
 
+    assert (
+        local_files_only or etag is not None or head_call_error is not None
+    ), "etag is empty due to uncovered problems"
+
     # etag can be None for several reasons:
     # 1. we passed local_files_only.
     # 2. we don't have a connection
@@ -1330,9 +1360,14 @@ def hf_hub_download(
     if etag is None:
         # In those cases, we cannot force download.
         if force_download:
-            raise ValueError(
-                "We have no connection or you passed local_files_only, so force_download is not an accepted option."
-            )
+            if local_files_only:
+                raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
+            elif isinstance(head_call_error, OfflineModeIsEnabled):
+                raise ValueError(
+                    "Cannot pass 'force_download=True' when offline mode is enabled."
+                ) from head_call_error
+            else:
+                raise ValueError("Force download failed due to the above error.") from head_call_error
 
     # Try to get "commit_hash" from "revision"
     commit_hash = None
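The previously single generic message is now split by cause. A sketch of the first case, which fails before any network call (repo and file are illustrative):

```python
from huggingface_hub import hf_hub_download

try:
    # Incompatible flags: cannot force a re-download while forbidding network access.
    hf_hub_download("gpt2", "config.json", force_download=True, local_files_only=True)
except ValueError as err:
    print(err)  # Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.
```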
@@ -1364,7 +1399,7 @@ def hf_hub_download(
                 " hf.co look-ups and downloads online, set 'local_files_only' to False."
             )
         elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
-            # Repo not found => let's raise the actual error
+            # Repo not found or gated => let's raise the actual error
             raise head_call_error
         else:
             # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
@@ -1461,6 +1496,7 @@ def hf_hub_download(
             resume_size=resume_size,
             headers=headers,
             expected_size=expected_size,
+            displayed_filename=filename,
         )
 
     if local_dir is None: