huggingface-hub 0.29.0rc2__py3-none-any.whl → 1.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (153)
  1. huggingface_hub/__init__.py +160 -46
  2. huggingface_hub/_commit_api.py +277 -71
  3. huggingface_hub/_commit_scheduler.py +15 -15
  4. huggingface_hub/_inference_endpoints.py +33 -22
  5. huggingface_hub/_jobs_api.py +301 -0
  6. huggingface_hub/_local_folder.py +18 -3
  7. huggingface_hub/_login.py +31 -63
  8. huggingface_hub/_oauth.py +460 -0
  9. huggingface_hub/_snapshot_download.py +241 -81
  10. huggingface_hub/_space_api.py +18 -10
  11. huggingface_hub/_tensorboard_logger.py +15 -19
  12. huggingface_hub/_upload_large_folder.py +196 -76
  13. huggingface_hub/_webhooks_payload.py +3 -3
  14. huggingface_hub/_webhooks_server.py +15 -25
  15. huggingface_hub/{commands → cli}/__init__.py +1 -15
  16. huggingface_hub/cli/_cli_utils.py +173 -0
  17. huggingface_hub/cli/auth.py +147 -0
  18. huggingface_hub/cli/cache.py +841 -0
  19. huggingface_hub/cli/download.py +189 -0
  20. huggingface_hub/cli/hf.py +60 -0
  21. huggingface_hub/cli/inference_endpoints.py +377 -0
  22. huggingface_hub/cli/jobs.py +772 -0
  23. huggingface_hub/cli/lfs.py +175 -0
  24. huggingface_hub/cli/repo.py +315 -0
  25. huggingface_hub/cli/repo_files.py +94 -0
  26. huggingface_hub/{commands/env.py → cli/system.py} +10 -13
  27. huggingface_hub/cli/upload.py +294 -0
  28. huggingface_hub/cli/upload_large_folder.py +117 -0
  29. huggingface_hub/community.py +20 -12
  30. huggingface_hub/constants.py +83 -59
  31. huggingface_hub/dataclasses.py +609 -0
  32. huggingface_hub/errors.py +99 -30
  33. huggingface_hub/fastai_utils.py +30 -41
  34. huggingface_hub/file_download.py +606 -346
  35. huggingface_hub/hf_api.py +2445 -1132
  36. huggingface_hub/hf_file_system.py +269 -152
  37. huggingface_hub/hub_mixin.py +61 -66
  38. huggingface_hub/inference/_client.py +501 -630
  39. huggingface_hub/inference/_common.py +133 -121
  40. huggingface_hub/inference/_generated/_async_client.py +536 -722
  41. huggingface_hub/inference/_generated/types/__init__.py +6 -1
  42. huggingface_hub/inference/_generated/types/automatic_speech_recognition.py +5 -6
  43. huggingface_hub/inference/_generated/types/base.py +10 -7
  44. huggingface_hub/inference/_generated/types/chat_completion.py +77 -31
  45. huggingface_hub/inference/_generated/types/depth_estimation.py +2 -2
  46. huggingface_hub/inference/_generated/types/document_question_answering.py +2 -2
  47. huggingface_hub/inference/_generated/types/feature_extraction.py +2 -2
  48. huggingface_hub/inference/_generated/types/fill_mask.py +2 -2
  49. huggingface_hub/inference/_generated/types/image_to_image.py +8 -2
  50. huggingface_hub/inference/_generated/types/image_to_text.py +2 -3
  51. huggingface_hub/inference/_generated/types/image_to_video.py +60 -0
  52. huggingface_hub/inference/_generated/types/sentence_similarity.py +3 -3
  53. huggingface_hub/inference/_generated/types/summarization.py +2 -2
  54. huggingface_hub/inference/_generated/types/table_question_answering.py +5 -5
  55. huggingface_hub/inference/_generated/types/text2text_generation.py +2 -2
  56. huggingface_hub/inference/_generated/types/text_generation.py +11 -11
  57. huggingface_hub/inference/_generated/types/text_to_audio.py +1 -2
  58. huggingface_hub/inference/_generated/types/text_to_speech.py +1 -2
  59. huggingface_hub/inference/_generated/types/text_to_video.py +2 -2
  60. huggingface_hub/inference/_generated/types/token_classification.py +2 -2
  61. huggingface_hub/inference/_generated/types/translation.py +2 -2
  62. huggingface_hub/inference/_generated/types/zero_shot_classification.py +2 -2
  63. huggingface_hub/inference/_generated/types/zero_shot_image_classification.py +2 -2
  64. huggingface_hub/inference/_generated/types/zero_shot_object_detection.py +1 -3
  65. huggingface_hub/inference/_mcp/__init__.py +0 -0
  66. huggingface_hub/inference/_mcp/_cli_hacks.py +88 -0
  67. huggingface_hub/inference/_mcp/agent.py +100 -0
  68. huggingface_hub/inference/_mcp/cli.py +247 -0
  69. huggingface_hub/inference/_mcp/constants.py +81 -0
  70. huggingface_hub/inference/_mcp/mcp_client.py +395 -0
  71. huggingface_hub/inference/_mcp/types.py +45 -0
  72. huggingface_hub/inference/_mcp/utils.py +128 -0
  73. huggingface_hub/inference/_providers/__init__.py +149 -20
  74. huggingface_hub/inference/_providers/_common.py +160 -37
  75. huggingface_hub/inference/_providers/black_forest_labs.py +12 -9
  76. huggingface_hub/inference/_providers/cerebras.py +6 -0
  77. huggingface_hub/inference/_providers/clarifai.py +13 -0
  78. huggingface_hub/inference/_providers/cohere.py +32 -0
  79. huggingface_hub/inference/_providers/fal_ai.py +231 -22
  80. huggingface_hub/inference/_providers/featherless_ai.py +38 -0
  81. huggingface_hub/inference/_providers/fireworks_ai.py +22 -1
  82. huggingface_hub/inference/_providers/groq.py +9 -0
  83. huggingface_hub/inference/_providers/hf_inference.py +143 -33
  84. huggingface_hub/inference/_providers/hyperbolic.py +9 -5
  85. huggingface_hub/inference/_providers/nebius.py +47 -5
  86. huggingface_hub/inference/_providers/novita.py +48 -5
  87. huggingface_hub/inference/_providers/nscale.py +44 -0
  88. huggingface_hub/inference/_providers/openai.py +25 -0
  89. huggingface_hub/inference/_providers/publicai.py +6 -0
  90. huggingface_hub/inference/_providers/replicate.py +46 -9
  91. huggingface_hub/inference/_providers/sambanova.py +37 -1
  92. huggingface_hub/inference/_providers/scaleway.py +28 -0
  93. huggingface_hub/inference/_providers/together.py +34 -5
  94. huggingface_hub/inference/_providers/wavespeed.py +138 -0
  95. huggingface_hub/inference/_providers/zai_org.py +17 -0
  96. huggingface_hub/lfs.py +33 -100
  97. huggingface_hub/repocard.py +34 -38
  98. huggingface_hub/repocard_data.py +79 -59
  99. huggingface_hub/serialization/__init__.py +0 -1
  100. huggingface_hub/serialization/_base.py +12 -15
  101. huggingface_hub/serialization/_dduf.py +8 -8
  102. huggingface_hub/serialization/_torch.py +69 -69
  103. huggingface_hub/utils/__init__.py +27 -8
  104. huggingface_hub/utils/_auth.py +7 -7
  105. huggingface_hub/utils/_cache_manager.py +92 -147
  106. huggingface_hub/utils/_chunk_utils.py +2 -3
  107. huggingface_hub/utils/_deprecation.py +1 -1
  108. huggingface_hub/utils/_dotenv.py +55 -0
  109. huggingface_hub/utils/_experimental.py +7 -5
  110. huggingface_hub/utils/_fixes.py +0 -10
  111. huggingface_hub/utils/_git_credential.py +5 -5
  112. huggingface_hub/utils/_headers.py +8 -30
  113. huggingface_hub/utils/_http.py +399 -237
  114. huggingface_hub/utils/_pagination.py +6 -6
  115. huggingface_hub/utils/_parsing.py +98 -0
  116. huggingface_hub/utils/_paths.py +5 -5
  117. huggingface_hub/utils/_runtime.py +74 -22
  118. huggingface_hub/utils/_safetensors.py +21 -21
  119. huggingface_hub/utils/_subprocess.py +13 -11
  120. huggingface_hub/utils/_telemetry.py +4 -4
  121. huggingface_hub/{commands/_cli_utils.py → utils/_terminal.py} +4 -4
  122. huggingface_hub/utils/_typing.py +25 -5
  123. huggingface_hub/utils/_validators.py +55 -74
  124. huggingface_hub/utils/_verification.py +167 -0
  125. huggingface_hub/utils/_xet.py +235 -0
  126. huggingface_hub/utils/_xet_progress_reporting.py +162 -0
  127. huggingface_hub/utils/insecure_hashlib.py +3 -5
  128. huggingface_hub/utils/logging.py +8 -11
  129. huggingface_hub/utils/tqdm.py +33 -4
  130. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/METADATA +94 -82
  131. huggingface_hub-1.1.3.dist-info/RECORD +155 -0
  132. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/WHEEL +1 -1
  133. huggingface_hub-1.1.3.dist-info/entry_points.txt +6 -0
  134. huggingface_hub/commands/delete_cache.py +0 -428
  135. huggingface_hub/commands/download.py +0 -200
  136. huggingface_hub/commands/huggingface_cli.py +0 -61
  137. huggingface_hub/commands/lfs.py +0 -200
  138. huggingface_hub/commands/repo_files.py +0 -128
  139. huggingface_hub/commands/scan_cache.py +0 -181
  140. huggingface_hub/commands/tag.py +0 -159
  141. huggingface_hub/commands/upload.py +0 -299
  142. huggingface_hub/commands/upload_large_folder.py +0 -129
  143. huggingface_hub/commands/user.py +0 -304
  144. huggingface_hub/commands/version.py +0 -37
  145. huggingface_hub/inference_api.py +0 -217
  146. huggingface_hub/keras_mixin.py +0 -500
  147. huggingface_hub/repository.py +0 -1477
  148. huggingface_hub/serialization/_tensorflow.py +0 -95
  149. huggingface_hub/utils/_hf_folder.py +0 -68
  150. huggingface_hub-0.29.0rc2.dist-info/RECORD +0 -131
  151. huggingface_hub-0.29.0rc2.dist-info/entry_points.txt +0 -6
  152. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info/licenses}/LICENSE +0 -0
  153. {huggingface_hub-0.29.0rc2.dist-info → huggingface_hub-1.1.3.dist-info}/top_level.txt +0 -0
```diff
--- a/huggingface_hub/_commit_api.py
+++ b/huggingface_hub/_commit_api.py
@@ -11,17 +11,20 @@ from contextlib import contextmanager
 from dataclasses import dataclass, field
 from itertools import groupby
 from pathlib import Path, PurePosixPath
-from typing import TYPE_CHECKING, Any, BinaryIO, Dict, Iterable, Iterator, List, Literal, Optional, Tuple, Union
+from typing import TYPE_CHECKING, Any, BinaryIO, Iterable, Iterator, Literal, Optional, Union
 
 from tqdm.contrib.concurrent import thread_map
 
 from . import constants
-from .errors import EntryNotFoundError
+from .errors import EntryNotFoundError, HfHubHTTPError, XetAuthorizationError, XetRefreshTokenError
 from .file_download import hf_hub_url
 from .lfs import UploadInfo, lfs_upload, post_lfs_batch_info
 from .utils import (
     FORBIDDEN_FOLDERS,
+    XetTokenType,
+    are_progress_bars_disabled,
     chunk_iterable,
+    fetch_xet_connection_info_from_repo_info,
     get_session,
     hf_raise_for_status,
     logging,
```
```diff
@@ -30,6 +33,7 @@ from .utils import (
     validate_hf_hub_args,
 )
 from .utils import tqdm as hf_tqdm
+from .utils._runtime import is_xet_available
 
 
 if TYPE_CHECKING:
```
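
The new `from .utils._runtime import is_xet_available` import is what gates the Xet code path on the optional `hf_xet` dependency. A minimal sketch of how such a check can be implemented (the real helper lives in `huggingface_hub.utils._runtime` and may consult more state, e.g. environment overrides):

```python
from importlib.util import find_spec


def is_xet_available() -> bool:
    # True when the optional `hf_xet` package is importable in this environment.
    return find_spec("hf_xet") is not None


if is_xet_available():
    print("Xet transfer can be offered to the server.")
else:
    print("Falling back to classic LFS transfers.")
```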
```diff
@@ -47,6 +51,8 @@ UploadMode = Literal["lfs", "regular"]
 # See https://github.com/huggingface/huggingface_hub/issues/1503
 FETCH_LFS_BATCH_SIZE = 500
 
+UPLOAD_BATCH_MAX_NUM_FILES = 256
+
 
 @dataclass
 class CommitOperationDelete:
```
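
The new `UPLOAD_BATCH_MAX_NUM_FILES = 256` constant caps how many files are sent to the batch endpoint per request. A self-contained sketch of the `chunk_iterable` pattern it is used with (this generator is a simplified stand-in for the library's helper):

```python
from itertools import islice
from typing import Iterable, Iterator, TypeVar

T = TypeVar("T")

UPLOAD_BATCH_MAX_NUM_FILES = 256


def chunk_iterable(iterable: Iterable[T], chunk_size: int) -> Iterator[list[T]]:
    # Yield successive lists of at most `chunk_size` items.
    it = iter(iterable)
    while chunk := list(islice(it, chunk_size)):
        yield chunk


files = [f"file_{i}.bin" for i in range(600)]
print([len(batch) for batch in chunk_iterable(files, UPLOAD_BATCH_MAX_NUM_FILES)])
# [256, 256, 88]
```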
````diff
@@ -230,7 +236,7 @@ class CommitOperationAdd:
     config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
 
     >>> with operation.as_file(with_tqdm=True) as file:
-    ...     requests.put(..., data=file)
+    ...     httpx.put(..., data=file)
     config.json: 100%|█████████████████████████| 8.19k/8.19k [00:02<00:00, 3.72kB/s]
     ```
     """
````
```diff
@@ -301,7 +307,7 @@ def _validate_path_in_repo(path_in_repo: str) -> str:
 CommitOperation = Union[CommitOperationAdd, CommitOperationCopy, CommitOperationDelete]
 
 
-def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
+def _warn_on_overwriting_operations(operations: list[CommitOperation]) -> None:
     """
     Warn user when a list of operations is expected to overwrite itself in a single
     commit.
```
```diff
@@ -316,7 +322,7 @@ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
     delete before upload) but can happen if a user deletes an entire folder and then
     add new files to it.
     """
-    nb_additions_per_path: Dict[str, int] = defaultdict(int)
+    nb_additions_per_path: dict[str, int] = defaultdict(int)
     for operation in operations:
         path_in_repo = operation.path_in_repo
         if isinstance(operation, CommitOperationAdd):
```
```diff
@@ -348,15 +354,95 @@ def _warn_on_overwriting_operations(operations: List[CommitOperation]) -> None:
 
 
 @validate_hf_hub_args
-def _upload_lfs_files(
+def _upload_files(
     *,
-    additions: List[CommitOperationAdd],
+    additions: list[CommitOperationAdd],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     endpoint: Optional[str] = None,
     num_threads: int = 5,
     revision: Optional[str] = None,
+    create_pr: Optional[bool] = None,
+):
+    """
+    Negotiates per-file transfer (LFS vs Xet) and uploads in batches.
+    """
+    xet_additions: list[CommitOperationAdd] = []
+    lfs_actions: list[dict[str, Any]] = []
+    lfs_oid2addop: dict[str, CommitOperationAdd] = {}
+
+    for chunk in chunk_iterable(additions, chunk_size=UPLOAD_BATCH_MAX_NUM_FILES):
+        chunk_list = [op for op in chunk]
+
+        transfers: list[str] = ["basic", "multipart"]
+        has_buffered_io_data = any(isinstance(op.path_or_fileobj, io.BufferedIOBase) for op in chunk_list)
+        if is_xet_available():
+            if not has_buffered_io_data:
+                transfers.append("xet")
+            else:
+                logger.warning(
+                    "Uploading files as a binary IO buffer is not supported by Xet Storage. "
+                    "Falling back to HTTP upload."
+                )
+
+        actions_chunk, errors_chunk, chosen_transfer = post_lfs_batch_info(
+            upload_infos=[op.upload_info for op in chunk_list],
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            endpoint=endpoint,
+            headers=headers,
+            token=None,  # already passed in 'headers'
+            transfers=transfers,
+        )
+        if errors_chunk:
+            message = "\n".join(
+                [
+                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
+                    for err in errors_chunk
+                ]
+            )
+            raise ValueError(f"LFS batch API returned errors:\n{message}")
+
+        # If the server returns a transfer we didn't offer (e.g. "xet" while uploading
+        # from BytesIO), fall back to LFS for this chunk.
+        if chosen_transfer == "xet" and ("xet" in transfers):
+            xet_additions.extend(chunk_list)
+        else:
+            lfs_actions.extend(actions_chunk)
+            for op in chunk_list:
+                lfs_oid2addop[op.upload_info.sha256.hex()] = op
+
+    if len(lfs_actions) > 0:
+        _upload_lfs_files(
+            actions=lfs_actions,
+            oid2addop=lfs_oid2addop,
+            headers=headers,
+            endpoint=endpoint,
+            num_threads=num_threads,
+        )
+
+    if len(xet_additions) > 0:
+        _upload_xet_files(
+            additions=xet_additions,
+            repo_type=repo_type,
+            repo_id=repo_id,
+            headers=headers,
+            endpoint=endpoint,
+            revision=revision,
+            create_pr=create_pr,
+        )
+
+
+@validate_hf_hub_args
+def _upload_lfs_files(
+    *,
+    actions: list[dict[str, Any]],
+    oid2addop: dict[str, CommitOperationAdd],
+    headers: dict[str, str],
+    endpoint: Optional[str] = None,
+    num_threads: int = 5,
 ):
     """
     Uploads the content of `additions` to the Hub using the large file storage protocol.
```
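
The core of the new `_upload_files` dispatcher is a per-chunk negotiation: the client advertises the transfers it can actually serve, and routes each chunk by the server's choice. A standalone sketch of just that decision logic, with the batch-API call left out (function names below are illustrative, not the library's):

```python
import io
from typing import BinaryIO, Union

PathOrBytes = Union[str, bytes, BinaryIO]


def negotiate_transfers(path_or_fileobj: PathOrBytes, xet_installed: bool) -> list[str]:
    # Always offer the classic LFS transfers.
    transfers = ["basic", "multipart"]
    # Xet cannot consume an opaque buffered reader: it needs a path or raw bytes.
    buffered = isinstance(path_or_fileobj, io.BufferedIOBase)
    if xet_installed and not buffered:
        transfers.append("xet")
    return transfers


def route(chosen_transfer: str, offered: list[str]) -> str:
    # Trust the server's choice only if we actually offered it;
    # otherwise fall back to plain LFS (mirrors the guard in `_upload_files`).
    return "xet" if chosen_transfer == "xet" and "xet" in offered else "lfs"


offered = negotiate_transfers(b"raw bytes", xet_installed=True)
print(offered)                                # ['basic', 'multipart', 'xet']
print(route("xet", offered))                  # 'xet'
print(route("xet", ["basic", "multipart"]))   # 'lfs'
```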
```diff
@@ -365,14 +451,26 @@ def _upload_lfs_files(
     - LFS Batch API: https://github.com/git-lfs/git-lfs/blob/main/docs/api/batch.md
 
     Args:
-        additions (`List` of `CommitOperationAdd`):
-            The files to be uploaded
-        repo_type (`str`):
+        actions (`list[dict[str, Any]]`):
+            LFS batch actions returned by the server.
+        oid2addop (`dict[str, CommitOperationAdd]`):
+            A dictionary mapping the OID of the file to the corresponding `CommitOperationAdd` object.
+        headers (`dict[str, str]`):
+            Headers to use for the request, including authorization headers and user agent.
+        endpoint (`str`, *optional*):
+            The endpoint to use for the request. Defaults to `constants.ENDPOINT`.
+        num_threads (`int`, *optional*):
+            The number of concurrent threads to use when uploading. Defaults to 5.
+
+    Raises:
+        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+            If an upload failed for any reason
+        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
             Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         num_threads (`int`, *optional*):
             The number of concurrent threads to use when uploading. Defaults to 5.
```
```diff
@@ -384,53 +482,20 @@ def _upload_lfs_files(
         If an upload failed for any reason
         [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
             If the server returns malformed responses
-        [`HTTPError`](https://requests.readthedocs.io/en/latest/api/#requests.HTTPError)
+        [`HfHubHTTPError`]
             If the LFS batch endpoint returned an HTTP error.
     """
-    # Step 1: retrieve upload instructions from the LFS batch endpoint.
-    # Upload instructions are retrieved by chunk of 256 files to avoid reaching
-    # the payload limit.
-    batch_actions: List[Dict] = []
-    for chunk in chunk_iterable(additions, chunk_size=256):
-        batch_actions_chunk, batch_errors_chunk = post_lfs_batch_info(
-            upload_infos=[op.upload_info for op in chunk],
-            repo_id=repo_id,
-            repo_type=repo_type,
-            revision=revision,
-            endpoint=endpoint,
-            headers=headers,
-            token=None,  # already passed in 'headers'
-        )
-
-        # If at least 1 error, we do not retrieve information for other chunks
-        if batch_errors_chunk:
-            message = "\n".join(
-                [
-                    f"Encountered error for file with OID {err.get('oid')}: `{err.get('error', {}).get('message')}"
-                    for err in batch_errors_chunk
-                ]
-            )
-            raise ValueError(f"LFS batch endpoint returned errors:\n{message}")
-
-        batch_actions += batch_actions_chunk
-    oid2addop = {add_op.upload_info.sha256.hex(): add_op for add_op in additions}
-
-    # Step 2: ignore files that have already been uploaded
+    # Filter out files already present upstream
     filtered_actions = []
-    for action in batch_actions:
+    for action in actions:
         if action.get("actions") is None:
             logger.debug(
-                f"Content of file {oid2addop[action['oid']].path_in_repo} is already"
-                " present upstream - skipping upload."
+                f"Content of file {oid2addop[action['oid']].path_in_repo} is already present upstream - skipping upload."
             )
         else:
             filtered_actions.append(action)
 
-    if len(filtered_actions) == 0:
-        logger.debug("No LFS files to upload.")
-        return
-
-    # Step 3: upload files concurrently according to these instructions
+    # Upload according to server-provided actions
     def _wrapped_lfs_upload(batch_action) -> None:
         try:
             operation = oid2addop[batch_action["oid"]]
```
```diff
@@ -438,11 +503,7 @@ def _upload_lfs_files(
         except Exception as exc:
             raise RuntimeError(f"Error while uploading '{operation.path_in_repo}' to the Hub.") from exc
 
-    if constants.HF_HUB_ENABLE_HF_TRANSFER:
-        logger.debug(f"Uploading {len(filtered_actions)} LFS files to the Hub using `hf_transfer`.")
-        for action in hf_tqdm(filtered_actions, name="huggingface_hub.lfs_upload"):
-            _wrapped_lfs_upload(action)
-    elif len(filtered_actions) == 1:
+    if len(filtered_actions) == 1:
         logger.debug("Uploading 1 LFS file to the Hub")
         _wrapped_lfs_upload(filtered_actions[0])
     else:
```
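
With the `hf_transfer` branch gone, multi-file uploads all go through the `else:` path, which hands the filtered actions to `tqdm.contrib.concurrent.thread_map` (imported at the top of the module). A minimal sketch of that fan-out with a dummy upload function (the tqdm kwargs here are illustrative):

```python
import time

from tqdm.contrib.concurrent import thread_map


def upload_one(action: dict) -> str:
    # Stand-in for lfs_upload(): pretend to push one file and return its OID.
    time.sleep(0.1)
    return action["oid"]


actions = [{"oid": f"{i:02x}"} for i in range(10)]

# thread_map runs `upload_one` across a thread pool and renders a tqdm bar.
results = thread_map(upload_one, actions, desc="Upload LFS files", max_workers=5)
print(results)
```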
```diff
@@ -458,6 +519,151 @@
     )
 
 
+@validate_hf_hub_args
+def _upload_xet_files(
+    *,
+    additions: list[CommitOperationAdd],
+    repo_type: str,
+    repo_id: str,
+    headers: dict[str, str],
+    endpoint: Optional[str] = None,
+    revision: Optional[str] = None,
+    create_pr: Optional[bool] = None,
+):
+    """
+    Uploads the content of `additions` to the Hub using the xet storage protocol.
+    This chunks the files and deduplicates the chunks before uploading them to xetcas storage.
+
+    Args:
+        additions (`list` of `CommitOperationAdd`):
+            The files to be uploaded.
+        repo_type (`str`):
+            Type of the repo to upload to: `"model"`, `"dataset"` or `"space"`.
+        repo_id (`str`):
+            A namespace (user or an organization) and a repo name separated
+            by a `/`.
+        headers (`dict[str, str]`):
+            Headers to use for the request, including authorization headers and user agent.
+        endpoint (`str`, *optional*):
+            The endpoint to use for the xetcas service. Defaults to `constants.ENDPOINT`.
+        revision (`str`, *optional*):
+            The git revision to upload to.
+        create_pr (`bool`, *optional*):
+            Whether or not to create a Pull Request with that commit.
+
+    Raises:
+        [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
+            If an upload failed for any reason.
+        [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
+            If the server returns malformed responses or if the user is unauthorized to upload to xet storage.
+        [`HfHubHTTPError`]
+            If the LFS batch endpoint returned an HTTP error.
+
+    **How it works:**
+        The file upload system uses Xet storage, which is a content-addressable storage system that breaks files into chunks
+        for efficient storage and transfer.
+
+        `hf_xet.upload_files` manages uploading files by:
+        - Taking a list of file paths to upload
+        - Breaking files into smaller chunks for efficient storage
+        - Avoiding duplicate storage by recognizing identical chunks across files
+        - Connecting to a storage server (CAS server) that manages these chunks
+
+        The upload process works like this:
+        1. Create a local folder at ~/.cache/huggingface/xet/chunk-cache to store file chunks for reuse.
+        2. Process files in parallel (up to 8 files at once):
+            2.1. Read the file content.
+            2.2. Split the file content into smaller chunks based on content patterns: each chunk gets a unique ID based on what's in it.
+            2.3. For each chunk:
+                - Check if it already exists in storage.
+                - Skip uploading chunks that already exist.
+            2.4. Group chunks into larger blocks for efficient transfer.
+            2.5. Upload these blocks to the storage server.
+            2.6. Create and upload information about how the file is structured.
+        3. Return reference files that contain information about the uploaded files, which can be used later to download them.
+    """
+    if len(additions) == 0:
+        return
+
+    # at this point, we know that hf_xet is installed
+    from hf_xet import upload_bytes, upload_files
+
+    from .utils._xet_progress_reporting import XetProgressReporter
+
+    try:
+        xet_connection_info = fetch_xet_connection_info_from_repo_info(
+            token_type=XetTokenType.WRITE,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            headers=headers,
+            endpoint=endpoint,
+            params={"create_pr": "1"} if create_pr else None,
+        )
+    except HfHubHTTPError as e:
+        if e.response.status_code == 401:
+            raise XetAuthorizationError(
+                f"You are unauthorized to upload to xet storage for {repo_type}/{repo_id}. "
+                f"Please check that you have configured your access token with write access to the repo."
+            ) from e
+        raise
+
+    xet_endpoint = xet_connection_info.endpoint
+    access_token_info = (xet_connection_info.access_token, xet_connection_info.expiration_unix_epoch)
+
+    def token_refresher() -> tuple[str, int]:
+        new_xet_connection = fetch_xet_connection_info_from_repo_info(
+            token_type=XetTokenType.WRITE,
+            repo_id=repo_id,
+            repo_type=repo_type,
+            revision=revision,
+            headers=headers,
+            endpoint=endpoint,
+            params={"create_pr": "1"} if create_pr else None,
+        )
+        if new_xet_connection is None:
+            raise XetRefreshTokenError("Failed to refresh xet token")
+        return new_xet_connection.access_token, new_xet_connection.expiration_unix_epoch
+
+    if not are_progress_bars_disabled():
+        progress = XetProgressReporter()
+        progress_callback = progress.update_progress
+    else:
+        progress, progress_callback = None, None
+
+    try:
+        all_bytes_ops = [op for op in additions if isinstance(op.path_or_fileobj, bytes)]
+        all_paths_ops = [op for op in additions if isinstance(op.path_or_fileobj, (str, Path))]
+
+        if len(all_paths_ops) > 0:
+            all_paths = [str(op.path_or_fileobj) for op in all_paths_ops]
+            upload_files(
+                all_paths,
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                progress_callback,
+                repo_type,
+            )
+
+        if len(all_bytes_ops) > 0:
+            all_bytes = [op.path_or_fileobj for op in all_bytes_ops]
+            upload_bytes(
+                all_bytes,
+                xet_endpoint,
+                access_token_info,
+                token_refresher,
+                progress_callback,
+                repo_type,
+            )
+
+    finally:
+        if progress is not None:
+            progress.close(False)
+
+    return
+
+
 def _validate_preupload_info(preupload_info: dict):
     files = preupload_info.get("files")
     if not isinstance(files, list):
```
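
The "How it works" walkthrough above centers on content-addressed chunk deduplication. A toy illustration of that idea, using fixed-size chunks and a dict standing in for the remote CAS (real Xet derives chunk boundaries from content and groups chunks into blocks before transfer):

```python
import hashlib

CHUNK_SIZE = 64 * 1024  # toy fixed-size chunks; Xet uses content-defined boundaries

cas: dict[str, bytes] = {}  # stands in for the remote chunk store


def upload_dedup(data: bytes) -> list[str]:
    """Split `data` into chunks, upload only chunks the store has not seen."""
    manifest = []
    for start in range(0, len(data), CHUNK_SIZE):
        chunk = data[start : start + CHUNK_SIZE]
        digest = hashlib.sha256(chunk).hexdigest()  # chunk ID = hash of its content
        if digest not in cas:  # skip chunks that already exist upstream
            cas[digest] = chunk
        manifest.append(digest)
    return manifest  # enough information to reconstruct the file later


a = upload_dedup(b"A" * 200_000)
b = upload_dedup(b"A" * 200_000 + b"tail")
print(len(cas), len(a), len(b))  # shared chunks are stored only once
```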
```diff
@@ -478,15 +684,15 @@ def _fetch_upload_modes(
     additions: Iterable[CommitOperationAdd],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     revision: str,
     endpoint: Optional[str] = None,
     create_pr: bool = False,
     gitignore_content: Optional[str] = None,
 ) -> None:
     """
-    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob
-    or as git LFS blob. Input `additions` are mutated in-place with the upload mode.
+    Requests the Hub "preupload" endpoint to determine whether each input file should be uploaded as a regular git blob,
+    as a git LFS blob, or as a XET file. Input `additions` are mutated in-place with the upload mode.
 
     Args:
         additions (`Iterable` of :class:`CommitOperationAdd`):
```
```diff
@@ -497,7 +703,7 @@ def _fetch_upload_modes(
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
             Headers to use for the request, including authorization headers and user agent.
         revision (`str`):
             The git revision to upload the files to. Can be any valid git revision.
```
```diff
@@ -515,12 +721,12 @@ def _fetch_upload_modes(
     endpoint = endpoint if endpoint is not None else constants.ENDPOINT
 
     # Fetch upload mode (LFS or regular) chunk by chunk.
-    upload_modes: Dict[str, UploadMode] = {}
-    should_ignore_info: Dict[str, bool] = {}
-    oid_info: Dict[str, Optional[str]] = {}
+    upload_modes: dict[str, UploadMode] = {}
+    should_ignore_info: dict[str, bool] = {}
+    oid_info: dict[str, Optional[str]] = {}
 
     for chunk in chunk_iterable(additions, 256):
-        payload: Dict = {
+        payload: dict = {
             "files": [
                 {
                     "path": op.path_in_repo,
```
```diff
@@ -563,10 +769,10 @@ def _fetch_files_to_copy(
     copies: Iterable[CommitOperationCopy],
     repo_type: str,
     repo_id: str,
-    headers: Dict[str, str],
+    headers: dict[str, str],
     revision: str,
     endpoint: Optional[str] = None,
-) -> Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]]:
+) -> dict[tuple[str, Optional[str]], Union["RepoFile", bytes]]:
     """
     Fetch information about the files to copy.
 
```
```diff
@@ -582,12 +788,12 @@ def _fetch_files_to_copy(
         repo_id (`str`):
             A namespace (user or an organization) and a repo name separated
             by a `/`.
-        headers (`Dict[str, str]`):
+        headers (`dict[str, str]`):
            Headers to use for the request, including authorization headers and user agent.
         revision (`str`):
             The git revision to upload the files to. Can be any valid git revision.
 
-    Returns: `Dict[Tuple[str, Optional[str]], Union[RepoFile, bytes]]]`
+    Returns: `dict[tuple[str, Optional[str]], Union[RepoFile, bytes]]]`
         Key is the file path and revision of the file to copy.
         Value is the raw content as bytes (for regular files) or the file information as a RepoFile (for LFS files).
 
```
```diff
@@ -600,9 +806,9 @@ def _fetch_files_to_copy(
     from .hf_api import HfApi, RepoFolder
 
     hf_api = HfApi(endpoint=endpoint, headers=headers)
-    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
+    files_to_copy: dict[tuple[str, Optional[str]], Union["RepoFile", bytes]] = {}
     # Store (path, revision) -> oid mapping
-    oid_info: Dict[Tuple[str, Optional[str]], Optional[str]] = {}
+    oid_info: dict[tuple[str, Optional[str]], Optional[str]] = {}
     # 1. Fetch OIDs for destination paths in batches.
     dest_paths = [op.path_in_repo for op in copies]
     for offset in range(0, len(dest_paths), FETCH_LFS_BATCH_SIZE):
```
```diff
@@ -662,11 +868,11 @@
 
 def _prepare_commit_payload(
     operations: Iterable[CommitOperation],
-    files_to_copy: Dict[Tuple[str, Optional[str]], Union["RepoFile", bytes]],
+    files_to_copy: dict[tuple[str, Optional[str]], Union["RepoFile", bytes]],
     commit_message: str,
     commit_description: Optional[str] = None,
     parent_commit: Optional[str] = None,
-) -> Iterable[Dict[str, Any]]:
+) -> Iterable[dict[str, Any]]:
     """
     Builds the payload to POST to the `/commit` API of the Hub.
 
```
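`_prepare_commit_payload` (touched above) yields the dicts that become the NDJSON body of the `/commit` route: a header line followed by one line per operation. A hedged sketch of that assembly, with field names based on the function's known output shape and values purely illustrative:

```python
import base64
import json

header = {"key": "header", "value": {"summary": "Upload config", "parentCommit": None}}
ops = [
    # Small regular files travel inline, base64-encoded.
    {
        "key": "file",
        "value": {
            "path": "config.json",
            "encoding": "base64",
            "content": base64.b64encode(b"{}").decode(),
        },
    },
    # Deletions only need a path.
    {"key": "deletedFile", "value": {"path": "old.bin"}},
]

# NDJSON: one JSON document per line.
body = "\n".join(json.dumps(line) for line in [header] + ops)
print(body)
```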
```diff
--- a/huggingface_hub/_commit_scheduler.py
+++ b/huggingface_hub/_commit_scheduler.py
@@ -7,7 +7,7 @@ from dataclasses import dataclass
 from io import SEEK_END, SEEK_SET, BytesIO
 from pathlib import Path
 from threading import Lock, Thread
-from typing import Dict, List, Optional, Union
+from typing import Optional, Union
 
 from .hf_api import DEFAULT_IGNORE_PATTERNS, CommitInfo, CommitOperationAdd, HfApi
 from .utils import filter_repo_objects
```
```diff
@@ -53,9 +53,9 @@ class CommitScheduler:
             Whether to make the repo private. If `None` (default), the repo will be public unless the organization's default is private. This value is ignored if the repo already exists.
         token (`str`, *optional*):
             The token to use to commit to the repo. Defaults to the token saved on the machine.
-        allow_patterns (`List[str]` or `str`, *optional*):
+        allow_patterns (`list[str]` or `str`, *optional*):
             If provided, only files matching at least one pattern are uploaded.
-        ignore_patterns (`List[str]` or `str`, *optional*):
+        ignore_patterns (`list[str]` or `str`, *optional*):
             If provided, files matching any of the patterns are not uploaded.
         squash_history (`bool`, *optional*):
             Whether to squash the history of the repo after each commit. Defaults to `False`. Squashing commits is
```
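
For context, here is typical `CommitScheduler` usage with these pattern arguments, sketched under the assumption of a valid token and a writable repo (`my-username/my-dataset` is a placeholder):

```python
from huggingface_hub import CommitScheduler

# Every 10 minutes, upload new/changed *.json files from ./data,
# skipping anything under a tmp/ subfolder.
scheduler = CommitScheduler(
    repo_id="my-username/my-dataset",  # placeholder repo
    repo_type="dataset",
    folder_path="data",
    every=10,  # minutes between background commits
    allow_patterns="*.json",
    ignore_patterns=["tmp/*"],
)

# ... your app appends to files under ./data ...

scheduler.stop()  # stop the background thread; trigger() can force an immediate push
```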
```diff
@@ -108,8 +108,8 @@ class CommitScheduler:
         revision: Optional[str] = None,
         private: Optional[bool] = None,
         token: Optional[str] = None,
-        allow_patterns: Optional[Union[List[str], str]] = None,
-        ignore_patterns: Optional[Union[List[str], str]] = None,
+        allow_patterns: Optional[Union[list[str], str]] = None,
+        ignore_patterns: Optional[Union[list[str], str]] = None,
         squash_history: bool = False,
         hf_api: Optional["HfApi"] = None,
     ) -> None:
```
```diff
@@ -138,7 +138,7 @@ class CommitScheduler:
         self.token = token
 
         # Keep track of already uploaded files
-        self.last_uploaded: Dict[Path, float] = {}  # key is local path, value is timestamp
+        self.last_uploaded: dict[Path, float] = {}  # key is local path, value is timestamp
 
         # Scheduler
         if not every > 0:
```
```diff
@@ -205,13 +205,10 @@ class CommitScheduler:
         """
         Push folder to the Hub and return the commit info.
 
-        <Tip warning={true}>
-
-        This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
-        queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
-        issues.
-
-        </Tip>
+        > [!WARNING]
+        > This method is not meant to be called directly. It is run in the background by the scheduler, respecting a
+        > queue mechanism to avoid concurrent commits. Making a direct call to the method might lead to concurrency
+        > issues.
 
         The default behavior of `push_to_hub` is to assume an append-only folder. It lists all files in the folder and
         uploads only changed files. If no changes are found, the method returns without committing anything. If you want
```
```diff
@@ -232,7 +229,7 @@ class CommitScheduler:
         prefix = f"{self.path_in_repo.strip('/')}/" if self.path_in_repo else ""
 
         # Filter with pattern + filter out unchanged files + retrieve current file size
-        files_to_upload: List[_FileToUpload] = []
+        files_to_upload: list[_FileToUpload] = []
         for relpath in filter_repo_objects(
             relpath_to_abspath.keys(), allow_patterns=self.allow_patterns, ignore_patterns=self.ignore_patterns
         ):
```
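
These hunks show the two ingredients of the append-only sync: the `last_uploaded` path-to-mtime map and the pattern filter. A condensed sketch of change detection along those lines (simplified; the real method also considers file size):

```python
from pathlib import Path

last_uploaded: dict[Path, float] = {}  # path -> mtime at last successful push


def changed_files(folder: Path) -> list[Path]:
    """Return files whose mtime moved since the previous push."""
    changed = []
    for path in folder.glob("**/*"):
        if not path.is_file():
            continue
        if last_uploaded.get(path) != path.stat().st_mtime:
            changed.append(path)
    return changed


def mark_pushed(paths: list[Path]) -> None:
    # Record the state we just committed so the next scan skips unchanged files.
    for path in paths:
        last_uploaded[path] = path.stat().st_mtime


to_push = changed_files(Path("data"))
mark_pushed(to_push)
```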
```diff
@@ -315,10 +312,13 @@ class PartialFileIO(BytesIO):
         return self._size_limit
 
     def __getattribute__(self, name: str):
-        if name.startswith("_") or name in ("read", "tell", "seek"):  # only 3 public methods supported
+        if name.startswith("_") or name in ("read", "tell", "seek", "fileno"):  # only 4 public methods supported
             return super().__getattribute__(name)
         raise NotImplementedError(f"PartialFileIO does not support '{name}'.")
 
+    def fileno(self):
+        raise AttributeError("PartialFileIO does not have a fileno.")
+
     def tell(self) -> int:
         """Return the current file position."""
         return self._file.tell()
```
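
Why add a `fileno` that raises `AttributeError`? HTTP clients probe `fileno()` to treat a body as a real OS file (e.g. sizing it via `os.fstat`), which would bypass `PartialFileIO`'s size cap. Raising `AttributeError` pushes such probes onto their in-memory fallback. A simplified sketch of the probing pattern being defended against (the `body_length` helper is illustrative, and the class below omits the real `__getattribute__` guard):

```python
import io
import os


class PartialFileIO(io.BytesIO):
    """Truncated view over a file; must not expose a real descriptor."""

    def fileno(self) -> int:
        # Signal "no underlying descriptor" the way pure in-memory objects do.
        raise AttributeError("PartialFileIO does not have a fileno.")


def body_length(body) -> int:
    # Typical client-side probe: prefer fstat on a real descriptor,
    # fall back to seek-based measurement otherwise.
    try:
        return os.fstat(body.fileno()).st_size
    except (AttributeError, OSError, io.UnsupportedOperation):
        pos = body.tell()
        body.seek(0, io.SEEK_END)
        size = body.tell()
        body.seek(pos)
        return size


print(body_length(PartialFileIO(b"0123456789")))  # 10, via the fallback path
```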