huggingface-hub 0.22.2-py3-none-any.whl → 0.23.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- huggingface_hub/__init__.py +51 -19
- huggingface_hub/_commit_api.py +9 -8
- huggingface_hub/_commit_scheduler.py +2 -2
- huggingface_hub/_inference_endpoints.py +10 -17
- huggingface_hub/_local_folder.py +229 -0
- huggingface_hub/_login.py +4 -3
- huggingface_hub/_multi_commits.py +1 -1
- huggingface_hub/_snapshot_download.py +16 -38
- huggingface_hub/_tensorboard_logger.py +16 -6
- huggingface_hub/_webhooks_payload.py +22 -1
- huggingface_hub/_webhooks_server.py +24 -20
- huggingface_hub/commands/download.py +11 -34
- huggingface_hub/commands/huggingface_cli.py +2 -0
- huggingface_hub/commands/tag.py +159 -0
- huggingface_hub/constants.py +3 -5
- huggingface_hub/errors.py +58 -0
- huggingface_hub/file_download.py +545 -376
- huggingface_hub/hf_api.py +756 -622
- huggingface_hub/hf_file_system.py +14 -5
- huggingface_hub/hub_mixin.py +127 -43
- huggingface_hub/inference/_client.py +402 -183
- huggingface_hub/inference/_common.py +19 -29
- huggingface_hub/inference/_generated/_async_client.py +402 -184
- huggingface_hub/inference/_generated/types/__init__.py +23 -6
- huggingface_hub/inference/_generated/types/chat_completion.py +197 -43
- huggingface_hub/inference/_generated/types/text_generation.py +57 -79
- huggingface_hub/inference/_templating.py +2 -4
- huggingface_hub/keras_mixin.py +0 -3
- huggingface_hub/lfs.py +9 -1
- huggingface_hub/repository.py +1 -0
- huggingface_hub/utils/__init__.py +12 -6
- huggingface_hub/utils/_fixes.py +1 -0
- huggingface_hub/utils/_headers.py +2 -4
- huggingface_hub/utils/_http.py +2 -4
- huggingface_hub/utils/_paths.py +13 -1
- huggingface_hub/utils/_runtime.py +10 -0
- huggingface_hub/utils/_safetensors.py +0 -13
- huggingface_hub/utils/_validators.py +2 -7
- huggingface_hub/utils/tqdm.py +124 -46
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/METADATA +5 -1
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/RECORD +45 -43
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/LICENSE +0 -0
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/WHEEL +0 -0
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/entry_points.txt +0 -0
- {huggingface_hub-0.22.2.dist-info → huggingface_hub-0.23.0.dist-info}/top_level.txt +0 -0
huggingface_hub/file_download.py
CHANGED
@@ -2,28 +2,27 @@ import copy
 import errno
 import fnmatch
 import inspect
-import io
 import json
 import os
 import re
 import shutil
 import stat
-import tempfile
 import time
 import uuid
 import warnings
-from contextlib import contextmanager
 from dataclasses import dataclass
-from functools import partial
 from pathlib import Path
-from typing import Any, BinaryIO, Dict, Generator, Literal, Optional, Union
+from typing import Any, BinaryIO, Dict, Literal, NoReturn, Optional, Tuple, Union
 from urllib.parse import quote, urlparse
 
 import requests
 
-from huggingface_hub import constants
-
 from . import __version__  # noqa: F401 # for backward compatibility
+from ._local_folder import (
+    get_local_download_paths,
+    read_download_metadata,
+    write_download_metadata,
+)
 from .constants import (
     DEFAULT_ETAG_TIMEOUT,
     DEFAULT_REQUEST_TIMEOUT,
@@ -80,13 +79,23 @@ from .utils import (
 from .utils._runtime import _PY_VERSION  # noqa: F401 # for backward compatibility
 from .utils._typing import HTTP_METHOD_T
 from .utils.insecure_hashlib import sha256
+from .utils.sha import sha_fileobj
 
 
 logger = logging.get_logger(__name__)
 
+# Return value when trying to load a file from cache but the file does not exist in the distant repo.
+_CACHED_NO_EXIST = object()
+_CACHED_NO_EXIST_T = Any
+
 # Regex to get filename from a "Content-Disposition" header for CDN-served files
 HEADER_FILENAME_PATTERN = re.compile(r'filename="(?P<filename>.*?)";')
 
+# Regex to check if the revision IS directly a commit_hash
+REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
+
+# Regex to check if the file etag IS a valid sha256
+REGEX_SHA256 = re.compile(r"^[0-9a-f]{64}$")
 
 _are_symlinks_supported_in_dir: Dict[str, bool] = {}
 
@@ -150,12 +159,6 @@ def are_symlinks_supported(cache_dir: Union[str, Path, None] = None) -> bool:
     return _are_symlinks_supported_in_dir[cache_dir]
 
 
-# Return value when trying to load a file from cache but the file does not exist in the distant repo.
-_CACHED_NO_EXIST = object()
-_CACHED_NO_EXIST_T = Any
-REGEX_COMMIT_HASH = re.compile(r"^[0-9a-f]{40}$")
-
-
 @dataclass(frozen=True)
 class HfFileMetadata:
     """Data structure containing information about a file versioned on the Hub.
@@ -478,9 +481,9 @@ def http_get(
 
     consistency_error_message = (
         f"Consistency check failed: file should be of size {expected_size} but has size"
-        f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry download and"
-        " pass `force_download=True, resume_download=False` as argument.\nIf the issue persists, please let us"
-        " know by opening an issue on https://github.com/huggingface/huggingface_hub."
+        f" {{actual_size}} ({displayed_filename}).\nWe are sorry for the inconvenience. Please retry"
+        " with `force_download=True`.\nIf the issue persists, please let us know by opening an issue "
+        "on https://github.com/huggingface/huggingface_hub."
     )
 
     # Stream file to buffer
@@ -495,6 +498,7 @@ def http_get(
         disable=True if (logger.getEffectiveLevel() == logging.NOTSET) else None,
         # ^ set `disable=None` rather than `disable=False` by default to disable progress bar when no TTY attached
         # see https://github.com/huggingface/huggingface_hub/pull/2000
+        name="huggingface_hub.http_get",
     )
 
     if hf_transfer and total is not None and total > 5 * DOWNLOAD_CHUNK_SIZE:
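
The `name` passed to the progress bar above ties into the grouped progress-bar support reworked in `huggingface_hub/utils/tqdm.py` in this release (+124 -46 in the file list). Assuming that grouped API, callers should be able to silence only the raw file-transfer bars while leaving other bars visible; a sketch:

    from huggingface_hub.utils import disable_progress_bars, enable_progress_bars

    # Hide only huggingface_hub's file-transfer bars (assumed group name from the diff above).
    disable_progress_bars("huggingface_hub.http_get")
    ...
    enable_progress_bars("huggingface_hub.http_get")
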
@@ -582,7 +586,7 @@ def cached_download(
     force_filename: Optional[str] = None,
     proxies: Optional[Dict] = None,
     etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
-    resume_download: bool = False,
+    resume_download: Optional[bool] = None,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
     legacy_cache_layout: bool = False,
@@ -619,8 +623,6 @@
         etag_timeout (`float`, *optional* defaults to `10`):
             When fetching ETag, how many seconds to wait for the server to send
             data before giving up which is passed to `requests.request`.
-        resume_download (`bool`, *optional*, defaults to `False`):
-            If `True`, resume a previously interrupted download.
         token (`bool`, `str`, *optional*):
             A token to be used for the download.
             - If `True`, the token is read from the HuggingFace config
@@ -671,6 +673,13 @@
             " 'hf_hub_download'",
             FutureWarning,
         )
+    if resume_download is not None:
+        warnings.warn(
+            "`resume_download` is deprecated and will be removed in version 1.0.0. "
+            "Downloads always resume when possible. "
+            "If you want to force a new download, use `force_download=True`.",
+            FutureWarning,
+        )
 
     if cache_dir is None:
         cache_dir = HF_HUB_CACHE
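
For callers, the practical effect of this deprecation is that `resume_download` becomes a no-op apart from the warning: interrupted downloads resume automatically, and a fresh download is forced with `force_download=True` instead. A minimal caller-side sketch (repo and filename are placeholders):

    import warnings
    from huggingface_hub import hf_hub_download

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        hf_hub_download("gpt2", "config.json", resume_download=True)  # value is ignored
    assert any(issubclass(w.category, FutureWarning) for w in caught)

    hf_hub_download("gpt2", "config.json", force_download=True)  # restart from scratch
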
@@ -786,46 +795,16 @@
         cache_path = "\\\\?\\" + os.path.abspath(cache_path)
 
     with WeakFileLock(lock_path):
-        # If the download just completed while the lock was activated.
-        if os.path.exists(cache_path) and not force_download:
-            # Even if returning early like here, the lock will be released.
-            return cache_path
-
-        if resume_download:
-            incomplete_path = cache_path + ".incomplete"
-
-            @contextmanager
-            def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
-                with open(incomplete_path, "ab") as f:
-                    yield f
-
-            temp_file_manager = _resumable_file_manager
-            if os.path.exists(incomplete_path):
-                resume_size = os.stat(incomplete_path).st_size
-            else:
-                resume_size = 0
-        else:
-            temp_file_manager = partial(  # type: ignore
-                tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
-            )
-            resume_size = 0
-
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with temp_file_manager() as temp_file:
-            logger.info("downloading %s to %s", url, temp_file.name)
-
-            http_get(
-                url_to_download,
-                temp_file,
-                proxies=proxies,
-                resume_size=resume_size,
-                headers=headers,
-                expected_size=expected_size,
-            )
-
-            logger.info("storing %s in cache at %s", url, cache_path)
-            _chmod_and_replace(temp_file.name, cache_path)
+        _download_to_tmp_and_move(
+            incomplete_path=Path(cache_path + ".incomplete"),
+            destination_path=Path(cache_path),
+            url_to_download=url_to_download,
+            proxies=proxies,
+            headers=headers,
+            expected_size=expected_size,
+            filename=filename,
+            force_download=force_download,
+        )
 
     if force_filename is None:
         logger.info("creating metadata file for %s", cache_path)
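
`cached_download` now delegates the whole temp-file dance to `_download_to_tmp_and_move` (defined later in this diff): bytes are appended to a sibling `<destination>.incomplete` file, then moved into place once complete. A stripped-down sketch of the pattern, assuming a caller-supplied `fetch(fileobj, resume_size)` callable:

    import shutil
    from pathlib import Path

    def download_atomically(fetch, destination: Path) -> None:
        incomplete = destination.with_name(destination.name + ".incomplete")
        with incomplete.open("ab") as f:        # append mode => f.tell() is the resume offset
            fetch(f, resume_size=f.tell())
        shutil.move(str(incomplete), str(destination))  # same volume => cheap rename, no partial file
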
@@ -1022,18 +1001,19 @@ def hf_hub_download(
     library_version: Optional[str] = None,
     cache_dir: Union[str, Path, None] = None,
     local_dir: Union[str, Path, None] = None,
-    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
     user_agent: Union[Dict, str, None] = None,
     force_download: bool = False,
-    force_filename: Optional[str] = None,
     proxies: Optional[Dict] = None,
     etag_timeout: float = DEFAULT_ETAG_TIMEOUT,
-    resume_download: bool = False,
     token: Union[bool, str, None] = None,
     local_files_only: bool = False,
     headers: Optional[Dict[str, str]] = None,
-    legacy_cache_layout: bool = False,
     endpoint: Optional[str] = None,
+    # Deprecated args
+    legacy_cache_layout: bool = False,
+    resume_download: Optional[bool] = None,
+    force_filename: Optional[str] = None,
+    local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto",
 ) -> str:
     """Download a given file if it's not already present in the local cache.
 
@@ -1047,21 +1027,6 @@
     that have been resolved at that particular commit. Each filename is a symlink to the blob
     at that particular commit.
 
-    If `local_dir` is provided, the file structure from the repo will be replicated in this location. You can configure
-    how you want to move those files:
-      - If `local_dir_use_symlinks="auto"` (default), files are downloaded and stored in the cache directory as blob
-        files. Small files (<5MB) are duplicated in `local_dir` while a symlink is created for bigger files. The goal
-        is to be able to manually edit and save small files without corrupting the cache while saving disk space for
-        binary files. The 5MB threshold can be configured with the `HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD`
-        environment variable.
-      - If `local_dir_use_symlinks=True`, files are downloaded, stored in the cache directory and symlinked in `local_dir`.
-        This is optimal in term of disk usage but files must not be manually edited.
-      - If `local_dir_use_symlinks=False` and the blob files exist in the cache directory, they are duplicated in the
-        local dir. This means disk usage is not optimized.
-      - Finally, if `local_dir_use_symlinks=False` and the blob files do not exist in the cache directory, then the
-        files are downloaded and directly placed under `local_dir`. This means if you need to download them again later,
-        they will be re-downloaded entirely.
-
     ```
     [  96]  .
     └── [ 160]  models--julien-c--EsperBERTo-small
@@ -1080,6 +1045,11 @@
                 └── [  76]  pytorch_model.bin -> ../../blobs/403450e234d65943a7dcf7e05a771ce3c92faa84dd07db4ac20f592037a1e4bd
     ```
 
+    If `local_dir` is provided, the file structure from the repo will be replicated in this location. When using this
+    option, the `cache_dir` will not be used and a `.huggingface/` folder will be created at the root of `local_dir`
+    to store some metadata related to the downloaded files. While this mechanism is not as robust as the main
+    cache-system, it's optimized for regularly pulling the latest version of a repository.
+
     Args:
         repo_id (`str`):
             A user or an organization name and a repo name separated by a `/`.
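
Per the new docstring, pointing `local_dir` at a folder now bypasses the cache layout entirely; only a `.huggingface/` metadata folder is added next to the files. A typical call (repo, file and folder names are illustrative):

    from huggingface_hub import hf_hub_download

    # Downloads to ./my-model/config.json; metadata goes to ./my-model/.huggingface/
    path = hf_hub_download(repo_id="gpt2", filename="config.json", local_dir="./my-model")
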
@@ -1100,13 +1070,7 @@
         cache_dir (`str`, `Path`, *optional*):
             Path to the folder where cached files are stored.
         local_dir (`str` or `Path`, *optional*):
-            If provided, the downloaded file will be placed under this directory, either as a symlink (default) or
-            a regular file (see description for more details).
-        local_dir_use_symlinks (`"auto"` or `bool`, defaults to `"auto"`):
-            To be used with `local_dir`. If set to "auto", the cache directory will be used and the file will be either
-            duplicated or symlinked to the local directory depending on its size. It set to `True`, a symlink will be
-            created, no matter the file size. If set to `False`, the file will either be duplicated from cache (if
-            already exists) or downloaded from the Hub and not cached. See description for more details.
+            If provided, the downloaded file will be placed under this directory.
         user_agent (`dict`, `str`, *optional*):
             The user-agent info in the form of a dictionary or a string.
         force_download (`bool`, *optional*, defaults to `False`):
@@ -1118,8 +1082,6 @@
         etag_timeout (`float`, *optional*, defaults to `10`):
             When fetching ETag, how many seconds to wait for the server to send
             data before giving up which is passed to `requests.request`.
-        resume_download (`bool`, *optional*, defaults to `False`):
-            If `True`, resume a previously interrupted download.
         token (`str`, `bool`, *optional*):
             A token to be used for the download.
             - If `True`, the token is read from the HuggingFace config
@@ -1136,30 +1098,24 @@
             more powerful.
 
     Returns:
-        Local path (string) of file or if networking is off, last version of
-        file cached on disk.
-
-    <Tip>
-
-    Raises the following errors:
+        `str`: Local path of file or if networking is off, last version of file cached on disk.
 
+    Raises:
         - [`EnvironmentError`](https://docs.python.org/3/library/exceptions.html#EnvironmentError)
-          if `token=True` and the token cannot be found.
+            if `token=True` and the token cannot be found.
         - [`OSError`](https://docs.python.org/3/library/exceptions.html#OSError)
-          if ETag cannot be determined.
+            if ETag cannot be determined.
         - [`ValueError`](https://docs.python.org/3/library/exceptions.html#ValueError)
-          if some parameter value is invalid
+            if some parameter value is invalid
         - [`~utils.RepositoryNotFoundError`]
-          If the repository to download from cannot be found. This may be because it doesn't exist,
-          or because it is set to `private` and you do not have access.
+            If the repository to download from cannot be found. This may be because it doesn't exist,
+            or because it is set to `private` and you do not have access.
         - [`~utils.RevisionNotFoundError`]
-          If the revision to download from cannot be found.
+            If the revision to download from cannot be found.
         - [`~utils.EntryNotFoundError`]
-          If the file to download cannot be found.
+            If the file to download cannot be found.
         - [`~utils.LocalEntryNotFoundError`]
-          If network is disabled or unavailable and file is not found in cache.
-
-    </Tip>
+            If network is disabled or unavailable and file is not found in cache.
     """
     if HF_HUB_ETAG_TIMEOUT != DEFAULT_ETAG_TIMEOUT:
         # Respect environment variable above user value
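
The reworked `Raises:` section maps onto exception classes exposed in `huggingface_hub.utils`, so callers can branch on the failure mode; a hedged sketch (repo name is a placeholder):

    from huggingface_hub import hf_hub_download
    from huggingface_hub.utils import (
        EntryNotFoundError,
        LocalEntryNotFoundError,
        RepositoryNotFoundError,
    )

    try:
        path = hf_hub_download("some-user/some-repo", "config.json")
    except RepositoryNotFoundError:
        ...  # repo doesn't exist, or is private/gated without a valid token
    except EntryNotFoundError:
        ...  # repo exists but the file doesn't
    except LocalEntryNotFoundError:
        ...  # offline/unreachable and no cached copy available
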
@@ -1172,6 +1128,13 @@ def hf_hub_download(
             FutureWarning,
         )
         legacy_cache_layout = True
+    if resume_download is not None:
+        warnings.warn(
+            "`resume_download` is deprecated and will be removed in version 1.0.0. "
+            "Downloads always resume when possible. "
+            "If you want to force a new download, use `force_download=True`.",
+            FutureWarning,
+        )
 
     if legacy_cache_layout:
         url = hf_hub_url(
@@ -1193,7 +1156,6 @@
             force_filename=force_filename,
             proxies=proxies,
             etag_timeout=etag_timeout,
-            resume_download=resume_download,
             token=token,
             local_files_only=local_files_only,
             legacy_cache_layout=legacy_cache_layout,
@@ -1207,7 +1169,6 @@
         cache_dir = str(cache_dir)
     if isinstance(local_dir, Path):
         local_dir = str(local_dir)
-    locks_dir = os.path.join(cache_dir, ".locks")
 
     if subfolder == "":
         subfolder = None
@@ -1220,6 +1181,85 @@
     if repo_type not in REPO_TYPES:
         raise ValueError(f"Invalid repo type: {repo_type}. Accepted repo types are: {str(REPO_TYPES)}")
 
+    headers = build_hf_headers(
+        token=token,
+        library_name=library_name,
+        library_version=library_version,
+        user_agent=user_agent,
+        headers=headers,
+    )
+
+    if local_dir is not None:
+        if local_dir_use_symlinks != "auto":
+            warnings.warn(
+                "`local_dir_use_symlinks` parameter is deprecated and will be ignored. "
+                "The process to download files to a local folder has been updated and do "
+                "not rely on symlinks anymore. You only need to pass a destination folder "
+                "as`local_dir`.\n"
+                "For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder."
+            )
+
+        return _hf_hub_download_to_local_dir(
+            # Destination
+            local_dir=local_dir,
+            # File info
+            repo_id=repo_id,
+            repo_type=repo_type,
+            filename=filename,
+            revision=revision,
+            # HTTP info
+            proxies=proxies,
+            etag_timeout=etag_timeout,
+            headers=headers,
+            endpoint=endpoint,
+            # Additional options
+            cache_dir=cache_dir,
+            force_download=force_download,
+            local_files_only=local_files_only,
+        )
+    else:
+        return _hf_hub_download_to_cache_dir(
+            # Destination
+            cache_dir=cache_dir,
+            # File info
+            repo_id=repo_id,
+            filename=filename,
+            repo_type=repo_type,
+            revision=revision,
+            # HTTP info
+            headers=headers,
+            proxies=proxies,
+            etag_timeout=etag_timeout,
+            endpoint=endpoint,
+            # Additional options
+            local_files_only=local_files_only,
+            force_download=force_download,
+        )
+
+
+def _hf_hub_download_to_cache_dir(
+    *,
+    # Destination
+    cache_dir: str,
+    # File info
+    repo_id: str,
+    filename: str,
+    repo_type: str,
+    revision: str,
+    # HTTP info
+    headers: Dict[str, str],
+    proxies: Optional[Dict],
+    etag_timeout: float,
+    endpoint: Optional[str],
+    # Additional options
+    local_files_only: bool,
+    force_download: bool,
+) -> str:
+    """Download a given file to a cache folder, if not already present.
+
+    Method should not be called directly. Please use `hf_hub_download` instead.
+    """
+    locks_dir = os.path.join(cache_dir, ".locks")
     storage_folder = os.path.join(cache_dir, repo_folder_name(repo_id=repo_id, repo_type=repo_type))
 
     # cross platform transcription of filename, to be used as a local file path.
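
`hf_hub_download` is now a thin dispatcher: the presence of `local_dir` selects one of the two private implementations, so each code path stays linear. From the caller's side the split is invisible (repo and paths are illustrative):

    from huggingface_hub import hf_hub_download

    p1 = hf_hub_download("gpt2", "config.json")                      # -> _hf_hub_download_to_cache_dir
    p2 = hf_hub_download("gpt2", "config.json", local_dir="./gpt2")  # -> _hf_hub_download_to_local_dir
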
@@ -1231,207 +1271,82 @@
         " owner to rename this file."
     )
 
-    # if user provides a commit_hash and they already have the file on disk,
-    # shortcut everything.
+    # if user provides a commit_hash and they already have the file on disk, shortcut everything.
     if REGEX_COMMIT_HASH.match(revision):
         pointer_path = _get_pointer_path(storage_folder, revision, relative_filename)
-        if os.path.exists(pointer_path):
-            if local_dir is not None:
-                return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
+        if os.path.exists(pointer_path) and not force_download:
             return pointer_path
 
-    url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
-
-    headers = build_hf_headers(
-        token=token,
-        library_name=library_name,
-        library_version=library_version,
-        user_agent=user_agent,
+    # Try to get metadata (etag, commit_hash, url, size) from the server.
+    # If we can't, a HEAD request error is returned.
+    (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
+        repo_id=repo_id,
+        filename=filename,
+        repo_type=repo_type,
+        revision=revision,
+        endpoint=endpoint,
+        proxies=proxies,
+        etag_timeout=etag_timeout,
         headers=headers,
+        local_files_only=local_files_only,
+        storage_folder=storage_folder,
+        relative_filename=relative_filename,
     )
 
-    url_to_download = url
-    etag = None
-    commit_hash = None
-    expected_size = None
-    head_call_error: Optional[Exception] = None
-    if not local_files_only:
-        try:
-            try:
-                metadata = get_hf_file_metadata(
-                    url=url,
-                    token=token,
-                    proxies=proxies,
-                    timeout=etag_timeout,
-                    library_name=library_name,
-                    library_version=library_version,
-                    user_agent=user_agent,
-                )
-            except EntryNotFoundError as http_error:
-                # Cache the non-existence of the file and raise
-                commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
-                if commit_hash is not None and not legacy_cache_layout:
-                    no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
-                    no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
-                    no_exist_file_path.touch()
-                    _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
-                raise
-
-            # Commit hash must exist
-            commit_hash = metadata.commit_hash
-            if commit_hash is None:
-                raise FileMetadataError(
-                    "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
-                    " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
-                    " and proxy settings and make sure your SSL certificates are updated."
-                )
-
-            # Etag must exist
-            etag = metadata.etag
-            # We favor a custom header indicating the etag of the linked resource, and
-            # we fallback to the regular etag header.
-            # If we don't have any of those, raise an error.
-            if etag is None:
-                raise FileMetadataError(
-                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
-                )
-
-            # Expected (uncompressed) size
-            expected_size = metadata.size
-
-            # In case of a redirect, save an extra redirect on the request.get call,
-            # and ensure we download the exact atomic version even if it changed
-            # between the HEAD and the GET (unlikely, but hey).
-            #
-            # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
-            # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
-            if metadata.location != url:
-                url_to_download = metadata.location
-                if urlparse(url).netloc != urlparse(url_to_download).netloc:
-                    # Remove authorization header when downloading a LFS blob
-                    headers.pop("authorization", None)
-        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
-            # Actually raise for those subclasses of ConnectionError
-            raise
-        except (
-            requests.exceptions.ConnectionError,
-            requests.exceptions.Timeout,
-            OfflineModeIsEnabled,
-        ) as error:
-            # Otherwise, our Internet connection is down.
-            # etag is None
-            head_call_error = error
-            pass
-        except (RevisionNotFoundError, EntryNotFoundError):
-            # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
-            raise
-        except requests.HTTPError as error:
-            # Multiple reasons for an http error:
-            # - Repository is private and invalid/missing token sent
-            # - Repository is gated and invalid/missing token sent
-            # - Hub is down (error 500 or 504)
-            # => let's switch to 'local_files_only=True' to check if the files are already cached.
-            #    (if it's not the case, the error will be re-raised)
-            head_call_error = error
-            pass
-        except FileMetadataError as error:
-            # Multiple reasons for a FileMetadataError:
-            # - Wrong network configuration (proxy, firewall, SSL certificates)
-            # - Inconsistency on the Hub
-            # => let's switch to 'local_files_only=True' to check if the files are already cached.
-            #    (if it's not the case, the error will be re-raised)
-            head_call_error = error
-            pass
-
-    assert (
-        local_files_only or etag is not None or head_call_error is not None
-    ), "etag is empty due to uncovered problems"
-
     # etag can be None for several reasons:
     # 1. we passed local_files_only.
     # 2. we don't have a connection
-    # 3. Hub is down (HTTP 500 or 504)
+    # 3. Hub is down (HTTP 500, 503, 504)
     # 4. repo is not found -for example private or gated- and invalid/missing token sent
     # 5. Hub is blocked by a firewall or proxy is not set correctly.
     # => Try to get the last downloaded one from the specified revision.
     #
     # If the specified revision is a commit hash, look inside "snapshots".
     # If the specified revision is a branch or tag, look inside "refs".
-    if etag is None:
-        # In those cases, we cannot force download.
-        if force_download:
-            if local_files_only:
-                raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
-            elif isinstance(head_call_error, OfflineModeIsEnabled):
-                raise ValueError(
-                    "Cannot pass 'force_download=True' when offline mode is enabled."
-                ) from head_call_error
+    if head_call_error is not None:
+        # Couldn't make a HEAD call => let's try to find a local file
+        if not force_download:
+            commit_hash = None
+            if REGEX_COMMIT_HASH.match(revision):
+                commit_hash = revision
             else:
-                raise ValueError("Force download failed due to the above error.") from head_call_error
+                ref_path = os.path.join(storage_folder, "refs", revision)
+                if os.path.isfile(ref_path):
+                    with open(ref_path) as f:
+                        commit_hash = f.read()
 
-        # Try to get "commit_hash" from "revision"
-        commit_hash = None
-        if REGEX_COMMIT_HASH.match(revision):
-            commit_hash = revision
-        else:
-            ref_path = os.path.join(storage_folder, "refs", revision)
-            if os.path.isfile(ref_path):
-                with open(ref_path) as f:
-                    commit_hash = f.read()
-
-        # Return pointer file if exists
-        if commit_hash is not None:
-            pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
-            if os.path.exists(pointer_path):
-                if local_dir is not None:
-                    return _to_local_dir(
-                        pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks
-                    )
-                return pointer_path
+            # Return pointer file if exists
+            if commit_hash is not None:
+                pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
+                if os.path.exists(pointer_path) and not force_download:
+                    return pointer_path
 
-        # If we couldn't find an appropriate file on disk, raise an error.
-        # If files cannot be found and local_files_only=True,
-        # the models might've been found if local_files_only=False
-        # Notify the user about that
-        if local_files_only:
-            raise LocalEntryNotFoundError(
-                "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
-                " hf.co look-ups and downloads online, set 'local_files_only' to False."
-            )
-        elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
-            # Repo not found or gated => let's raise the actual error
-            raise head_call_error
-        else:
-            # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
-            raise LocalEntryNotFoundError(
-                "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
-                " in the local cache. Please check your connection and try again or make sure your Internet connection"
-                " is on."
-            ) from head_call_error
-
-    # From now on, etag and commit_hash are not None.
+        # Otherwise, raise appropriate error
+        _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+
+    # From now on, etag, commit_hash, url and size are not None.
     assert etag is not None, "etag must have been retrieved from server"
     assert commit_hash is not None, "commit_hash must have been retrieved from server"
+    assert url_to_download is not None, "file location must have been retrieved from server"
+    assert expected_size is not None, "expected_size must have been retrieved from server"
     blob_path = os.path.join(storage_folder, "blobs", etag)
     pointer_path = _get_pointer_path(storage_folder, commit_hash, relative_filename)
 
     os.makedirs(os.path.dirname(blob_path), exist_ok=True)
     os.makedirs(os.path.dirname(pointer_path), exist_ok=True)
+
     # if passed revision is not identical to commit_hash
     # then revision has to be a branch name or tag name.
     # In that case store a ref.
     _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
 
-    if os.path.exists(pointer_path) and not force_download:
-        if local_dir is not None:
-            return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
-        return pointer_path
+    # If file already exists, return it (except if force_download=True)
+    if not force_download:
+        if os.path.exists(pointer_path):
+            return pointer_path
 
-    if os.path.exists(blob_path) and not force_download:
-        # we have the blob already, but not the pointer
-        if local_dir is not None:  # to local dir
-            return _to_local_dir(blob_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
-        else:  # or in snapshot cache
+        if os.path.exists(blob_path):
+            # we have the blob already, but not the pointer
             _create_symlink(blob_path, pointer_path, new_blob=False)
             return pointer_path
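
The pointer paths returned throughout the cache-dir branch follow the `snapshots/<commit_hash>/<file>` convention of the cache layout shown in the docstring earlier. A rough equivalent of `_get_pointer_path` (the real helper additionally rejects paths that would escape the snapshot folder):

    import os

    def pointer_path(storage_folder: str, commit_hash: str, relative_filename: str) -> str:
        # e.g. <cache>/models--gpt2/snapshots/<40-char sha>/config.json
        return os.path.join(storage_folder, "snapshots", commit_hash, relative_filename)
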
@@ -1449,83 +1364,139 @@ def hf_hub_download(
 
     Path(lock_path).parent.mkdir(parents=True, exist_ok=True)
     with WeakFileLock(lock_path):
-        # If the download just completed while the lock was activated.
-        if os.path.exists(pointer_path) and not force_download:
-            if local_dir is not None:
-                return _to_local_dir(pointer_path, local_dir, relative_filename, use_symlinks=local_dir_use_symlinks)
-            return pointer_path
-
-        if resume_download:
-            incomplete_path = blob_path + ".incomplete"
-
-            @contextmanager
-            def _resumable_file_manager() -> Generator[io.BufferedWriter, None, None]:
-                with open(incomplete_path, "ab") as f:
-                    yield f
+        _download_to_tmp_and_move(
+            incomplete_path=Path(blob_path + ".incomplete"),
+            destination_path=Path(blob_path),
+            url_to_download=url_to_download,
+            proxies=proxies,
+            headers=headers,
+            expected_size=expected_size,
+            filename=filename,
+            force_download=force_download,
+        )
+        _create_symlink(blob_path, pointer_path, new_blob=True)
 
-            temp_file_manager = _resumable_file_manager
-            if os.path.exists(incomplete_path):
-                resume_size = os.stat(incomplete_path).st_size
-            else:
-                resume_size = 0
-        else:
-            temp_file_manager = partial(  # type: ignore
-                tempfile.NamedTemporaryFile, mode="wb", dir=cache_dir, delete=False
-            )
-            resume_size = 0
+    return pointer_path
 
-        # Download to temporary file, then copy to cache dir once finished.
-        # Otherwise you get corrupt cache entries if the download gets interrupted.
-        with temp_file_manager() as temp_file:
-            logger.info("downloading %s to %s", url, temp_file.name)
 
-            if expected_size is not None:  # might be None if HTTP header not set correctly
-                # Check tmp path
-                _check_disk_space(expected_size, os.path.dirname(temp_file.name))
-
-                # Check destination
-                _check_disk_space(expected_size, os.path.dirname(blob_path))
-                if local_dir is not None:
-                    _check_disk_space(expected_size, local_dir)
-
-            http_get(
-                url_to_download,
-                temp_file,
-                proxies=proxies,
-                resume_size=resume_size,
-                headers=headers,
-                expected_size=expected_size,
-                displayed_filename=filename,
-            )
-
-            if local_dir is None:
-                logger.debug(f"Storing {url} in cache at {blob_path}")
-                _chmod_and_replace(temp_file.name, blob_path)
-                _create_symlink(blob_path, pointer_path, new_blob=True)
-            else:
-                local_dir_filepath = os.path.join(local_dir, relative_filename)
-                os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
-
-                # If "auto" (default) copy-paste to ease manual editing but symlink big files to save disk
-                # In both cases, blob file is cached.
-                is_big_file = os.stat(temp_file.name).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
-                if local_dir_use_symlinks is True or (local_dir_use_symlinks == "auto" and is_big_file):
-                    logger.debug(f"Storing {url} in cache at {blob_path}")
-                    _chmod_and_replace(temp_file.name, blob_path)
-                    logger.debug("Create symlink to local dir")
-                    _create_symlink(blob_path, local_dir_filepath, new_blob=False)
-                elif local_dir_use_symlinks == "auto" and not is_big_file:
-                    logger.debug(f"Storing {url} in cache at {blob_path}")
-                    _chmod_and_replace(temp_file.name, blob_path)
-                    logger.debug("Duplicate in local dir (small file and use_symlink set to 'auto')")
-                    shutil.copyfile(blob_path, local_dir_filepath)
-                else:
-                    logger.debug(f"Storing {url} in local_dir at {local_dir_filepath} (not cached).")
-                    _chmod_and_replace(temp_file.name, local_dir_filepath)
-                pointer_path = local_dir_filepath  # for return value
-
-    return pointer_path
+def _hf_hub_download_to_local_dir(
+    *,
+    # Destination
+    local_dir: Union[str, Path],
+    # File info
+    repo_id: str,
+    repo_type: str,
+    filename: str,
+    revision: str,
+    # HTTP info
+    proxies: Optional[Dict],
+    etag_timeout: float,
+    headers: Dict[str, str],
+    endpoint: Optional[str],
+    # Additional options
+    cache_dir: str,
+    force_download: bool,
+    local_files_only: bool,
+) -> str:
+    """Download a given file to a local folder, if not already present.
+
+    Method should not be called directly. Please use `hf_hub_download` instead.
+    """
+    local_dir = Path(local_dir)
+    paths = get_local_download_paths(local_dir=local_dir, filename=filename)
+    local_metadata = read_download_metadata(local_dir=local_dir, filename=filename)
+
+    # Local file exists + metadata exists + commit_hash matches => return file
+    if (
+        not force_download
+        and REGEX_COMMIT_HASH.match(revision)
+        and paths.file_path.is_file()
+        and local_metadata is not None
+        and local_metadata.commit_hash == revision
+    ):
+        return str(paths.file_path)
+
+    # Local file doesn't exist or commit_hash doesn't match => we need the etag
+    (url_to_download, etag, commit_hash, expected_size, head_call_error) = _get_metadata_or_catch_error(
+        repo_id=repo_id,
+        filename=filename,
+        repo_type=repo_type,
+        revision=revision,
+        endpoint=endpoint,
+        proxies=proxies,
+        etag_timeout=etag_timeout,
+        headers=headers,
+        local_files_only=local_files_only,
+    )
+
+    if head_call_error is not None:
+        # No HEAD call but local file exists => default to local file
+        if not force_download and paths.file_path.is_file():
+            logger.warning(
+                f"Couldn't access the Hub to check for update but local file already exists. Defaulting to existing file. (error: {head_call_error})"
+            )
+            return str(paths.file_path)
+        # Otherwise => raise
+        _raise_on_head_call_error(head_call_error, force_download, local_files_only)
+
+    # From now on, etag, commit_hash, url and size are not None.
+    assert etag is not None, "etag must have been retrieved from server"
+    assert commit_hash is not None, "commit_hash must have been retrieved from server"
+    assert url_to_download is not None, "file location must have been retrieved from server"
+    assert expected_size is not None, "expected_size must have been retrieved from server"
+
+    # Local file exists => check if it's up-to-date
+    if not force_download and paths.file_path.is_file():
+        # etag matches => update metadata and return file
+        if local_metadata is not None and local_metadata.etag == etag:
+            write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            return str(paths.file_path)
+
+        # metadata is outdated + etag is a sha256
+        # => means it's an LFS file (large)
+        # => let's compute local hash and compare
+        # => if match, update metadata and return file
+        if local_metadata is None and REGEX_SHA256.match(etag) is not None:
+            with open(paths.file_path, "rb") as f:
+                file_hash = sha_fileobj(f).hex()
+            if file_hash == etag:
+                write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+                return str(paths.file_path)
+
+    # Local file doesn't exist or etag isn't a match => retrieve file from remote (or cache)
+
+    # If we are lucky enough, the file is already in the cache => copy it
+    if not force_download:
+        cached_path = try_to_load_from_cache(
+            repo_id=repo_id,
+            filename=filename,
+            cache_dir=cache_dir,
+            revision=commit_hash,
+            repo_type=repo_type,
+        )
+        if isinstance(cached_path, str):
+            with WeakFileLock(paths.lock_path):
+                paths.file_path.parent.mkdir(parents=True, exist_ok=True)
+                shutil.copyfile(cached_path, paths.file_path)
+            write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+            return str(paths.file_path)
+
+    # Otherwise, let's download the file!
+    with WeakFileLock(paths.lock_path):
+        paths.file_path.unlink(missing_ok=True)  # delete outdated file first
+        _download_to_tmp_and_move(
+            incomplete_path=paths.incomplete_path(etag),
+            destination_path=paths.file_path,
+            url_to_download=url_to_download,
+            proxies=proxies,
+            headers=headers,
+            expected_size=expected_size,
+            filename=filename,
+            force_download=force_download,
+        )
 
+    write_download_metadata(local_dir=local_dir, filename=filename, commit_hash=commit_hash, etag=etag)
+    return str(paths.file_path)
 
 
 @validate_hf_hub_args
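
The sha256 shortcut above relies on an LFS invariant: for LFS-stored files the etag is the sha256 of the content, so an existing local file can be revalidated without re-downloading. A standalone sketch of that check (using hashlib in place of the library's `sha_fileobj`):

    import hashlib
    from pathlib import Path

    def matches_lfs_etag(path: Path, etag: str) -> bool:
        sha = hashlib.sha256()
        with path.open("rb") as f:
            for chunk in iter(lambda: f.read(1024 * 1024), b""):
                sha.update(chunk)
        return sha.hexdigest() == etag
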
@@ -1696,6 +1667,233 @@ def get_hf_file_metadata(
     )
 
 
+def _get_metadata_or_catch_error(
+    *,
+    repo_id: str,
+    filename: str,
+    repo_type: str,
+    revision: str,
+    endpoint: Optional[str],
+    proxies: Optional[Dict],
+    etag_timeout: Optional[float],
+    headers: Dict[str, str],  # mutated inplace!
+    local_files_only: bool,
+    relative_filename: Optional[str] = None,  # only used to store `.no_exists` in cache
+    storage_folder: Optional[str] = None,  # only used to store `.no_exists` in cache
+) -> Union[
+    # Either an exception is caught and returned
+    Tuple[None, None, None, None, Exception],
+    # Or the metadata is returned as
+    # `(url_to_download, etag, commit_hash, expected_size, None)`
+    Tuple[str, str, str, int, None],
+]:
+    """Get metadata for a file on the Hub, safely handling network issues.
+
+    Returns either the etag, commit_hash and expected size of the file, or the error
+    raised while fetching the metadata.
+
+    NOTE: This function mutates `headers` inplace! It removes the `authorization` header
+          if the file is a LFS blob and the domain of the url is different from the
+          domain of the location (typically an S3 bucket).
+    """
+    if local_files_only:
+        return (
+            None,
+            None,
+            None,
+            None,
+            OfflineModeIsEnabled(
+                f"Cannot access file since 'local_files_only=True' as been set. (repo_id: {repo_id}, repo_type: {repo_type}, revision: {revision}, filename: {filename})"
+            ),
+        )
+
+    url = url = hf_hub_url(repo_id, filename, repo_type=repo_type, revision=revision, endpoint=endpoint)
+    url_to_download: str = url
+    etag: Optional[str] = None
+    commit_hash: Optional[str] = None
+    expected_size: Optional[int] = None
+    head_error_call: Optional[Exception] = None
+
+    # Try to get metadata from the server.
+    # Do not raise yet if the file is not found or not accessible.
+    if not local_files_only:
+        try:
+            try:
+                metadata = get_hf_file_metadata(url=url, proxies=proxies, timeout=etag_timeout, headers=headers)
+            except EntryNotFoundError as http_error:
+                if storage_folder is not None and relative_filename is not None:
+                    # Cache the non-existence of the file
+                    commit_hash = http_error.response.headers.get(HUGGINGFACE_HEADER_X_REPO_COMMIT)
+                    if commit_hash is not None:
+                        no_exist_file_path = Path(storage_folder) / ".no_exist" / commit_hash / relative_filename
+                        no_exist_file_path.parent.mkdir(parents=True, exist_ok=True)
+                        no_exist_file_path.touch()
+                        _cache_commit_hash_for_specific_revision(storage_folder, revision, commit_hash)
+                raise
+
+            # Commit hash must exist
+            commit_hash = metadata.commit_hash
+            if commit_hash is None:
+                raise FileMetadataError(
+                    "Distant resource does not seem to be on huggingface.co. It is possible that a configuration issue"
+                    " prevents you from downloading resources from https://huggingface.co. Please check your firewall"
+                    " and proxy settings and make sure your SSL certificates are updated."
+                )
+
+            # Etag must exist
+            # If we don't have any of those, raise an error.
+            etag = metadata.etag
+            if etag is None:
+                raise FileMetadataError(
+                    "Distant resource does not have an ETag, we won't be able to reliably ensure reproducibility."
+                )
+
+            # Size must exist
+            expected_size = metadata.size
+            if expected_size is None:
+                raise FileMetadataError("Distant resource does not have a Content-Length.")
+
+            # In case of a redirect, save an extra redirect on the request.get call,
+            # and ensure we download the exact atomic version even if it changed
+            # between the HEAD and the GET (unlikely, but hey).
+            #
+            # If url domain is different => we are downloading from a CDN => url is signed => don't send auth
+            # If url domain is the same => redirect due to repo rename AND downloading a regular file => keep auth
+            if url != metadata.location:
+                url_to_download = metadata.location
+                if urlparse(url).netloc != urlparse(metadata.location).netloc:
+                    # Remove authorization header when downloading a LFS blob
+                    headers.pop("authorization", None)
+        except (requests.exceptions.SSLError, requests.exceptions.ProxyError):
+            # Actually raise for those subclasses of ConnectionError
+            raise
+        except (
+            requests.exceptions.ConnectionError,
+            requests.exceptions.Timeout,
+            OfflineModeIsEnabled,
+        ) as error:
+            # Otherwise, our Internet connection is down.
+            # etag is None
+            head_error_call = error
+        except (RevisionNotFoundError, EntryNotFoundError):
+            # The repo was found but the revision or entry doesn't exist on the Hub (never existed or got deleted)
+            raise
+        except requests.HTTPError as error:
+            # Multiple reasons for an http error:
+            # - Repository is private and invalid/missing token sent
+            # - Repository is gated and invalid/missing token sent
+            # - Hub is down (error 500 or 504)
+            # => let's switch to 'local_files_only=True' to check if the files are already cached.
+            #    (if it's not the case, the error will be re-raised)
+            head_error_call = error
+        except FileMetadataError as error:
+            # Multiple reasons for a FileMetadataError:
+            # - Wrong network configuration (proxy, firewall, SSL certificates)
+            # - Inconsistency on the Hub
+            # => let's switch to 'local_files_only=True' to check if the files are already cached.
+            #    (if it's not the case, the error will be re-raised)
+            head_error_call = error
+
+    if not (local_files_only or etag is not None or head_error_call is not None):
+        raise RuntimeError("etag is empty due to uncovered problems")
+
+    return (url_to_download, etag, commit_hash, expected_size, head_error_call)  # type: ignore [return-value]
+
+
+def _raise_on_head_call_error(head_call_error: Exception, force_download: bool, local_files_only: bool) -> NoReturn:
+    """Raise an appropriate error when the HEAD call failed and we cannot locate a local file."""
+
+    # No head call => we cannot force download.
+    if force_download:
+        if local_files_only:
+            raise ValueError("Cannot pass 'force_download=True' and 'local_files_only=True' at the same time.")
+        elif isinstance(head_call_error, OfflineModeIsEnabled):
+            raise ValueError("Cannot pass 'force_download=True' when offline mode is enabled.") from head_call_error
+        else:
+            raise ValueError("Force download failed due to the above error.") from head_call_error
+
+    # No head call + couldn't find an appropriate file on disk => raise an error.
+    if local_files_only:
+        raise LocalEntryNotFoundError(
+            "Cannot find the requested files in the disk cache and outgoing traffic has been disabled. To enable"
+            " hf.co look-ups and downloads online, set 'local_files_only' to False."
+        )
+    elif isinstance(head_call_error, RepositoryNotFoundError) or isinstance(head_call_error, GatedRepoError):
+        # Repo not found or gated => let's raise the actual error
+        raise head_call_error
+    else:
+        # Otherwise: most likely a connection issue or Hub downtime => let's warn the user
+        raise LocalEntryNotFoundError(
+            "An error happened while trying to locate the file on the Hub and we cannot find the requested files"
+            " in the local cache. Please check your connection and try again or make sure your Internet connection"
+            " is on."
+        ) from head_call_error
+
+
+def _download_to_tmp_and_move(
+    incomplete_path: Path,
+    destination_path: Path,
+    url_to_download: str,
+    proxies: Optional[Dict],
+    headers: Dict[str, str],
+    expected_size: Optional[int],
+    filename: str,
+    force_download: bool,
+) -> None:
+    """Download content from a URL to a destination path.
+
+    Internal logic:
+    - return early if file is already downloaded
+    - resume download if possible (from incomplete file)
+    - do not resume download if `force_download=True` or `HF_HUB_ENABLE_HF_TRANSFER=True`
+    - check disk space before downloading
+    - download content to a temporary file
+    - set correct permissions on temporary file
+    - move the temporary file to the destination path
+
+    Both `incomplete_path` and `destination_path` must be on the same volume to avoid a local copy.
+    """
+    if destination_path.exists() and not force_download:
+        # Do nothing if already exists (except if force_download=True)
+        return
+
+    if incomplete_path.exists() and (force_download or (HF_HUB_ENABLE_HF_TRANSFER and not proxies)):
+        # By default, we will try to resume the download if possible.
+        # However, if the user has set `force_download=True` or if `hf_transfer` is enabled, then we should
+        # not resume the download => delete the incomplete file.
+        message = f"Removing incomplete file '{incomplete_path}'"
+        if force_download:
+            message += " (force_download=True)"
+        elif HF_HUB_ENABLE_HF_TRANSFER and not proxies:
+            message += " (hf_transfer=True)"
+        logger.info(message)
+        incomplete_path.unlink(missing_ok=True)
+
+    with incomplete_path.open("ab") as f:
+        resume_size = f.tell()
+        message = f"Downloading '{filename}' to '{incomplete_path}'"
+        if resume_size > 0 and expected_size is not None:
+            message += f" (resume from {resume_size}/{expected_size})"
+        logger.info(message)
+
+        if expected_size is not None:  # might be None if HTTP header not set correctly
+            # Check disk space in both tmp and destination path
+            _check_disk_space(expected_size, incomplete_path.parent)
+            _check_disk_space(expected_size, destination_path.parent)
+
+        http_get(
+            url_to_download,
+            f,
+            proxies=proxies,
+            resume_size=resume_size,
+            headers=headers,
+            expected_size=expected_size,
+        )
+
+    logger.info(f"Download complete. Moving file to {destination_path}")
+    _chmod_and_move(incomplete_path, destination_path)
+
+
 def _int_or_none(value: Optional[str]) -> Optional[int]:
     try:
         return int(value)  # type: ignore
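
The helper's five-element return (metadata, or a caught exception in the last slot) lets both download paths decide locally whether a failed HEAD call is fatal. A caller-side sketch mirroring how the two private functions above consume it (argument values are illustrative):

    url, etag, commit_hash, size, err = _get_metadata_or_catch_error(
        repo_id="gpt2", filename="config.json", repo_type="model", revision="main",
        endpoint=None, proxies=None, etag_timeout=10, headers={}, local_files_only=False,
    )
    if err is not None:
        # e.g. fall back to a cached/local copy first, as both callers do
        _raise_on_head_call_error(err, force_download=False, local_files_only=False)
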
@@ -1703,7 +1901,7 @@ def _int_or_none(value: Optional[str]) -> Optional[int]:
         return None
 
 
-def _chmod_and_replace(src: str, dst: str) -> None:
+def _chmod_and_move(src: Path, dst: Path) -> None:
     """Set correct permission before moving a blob from tmp directory to cache dir.
 
     Do not take into account the `umask` from the process as there is no convenient way
@@ -1717,15 +1915,15 @@ def _chmod_and_replace(src: str, dst: str) -> None:
     - Fix issue: https://github.com/huggingface/huggingface_hub/issues/1215
     """
     # Get umask by creating a temporary file in the cached repo folder.
-    tmp_file = Path(dst).parent.parent / f"tmp_{uuid.uuid4()}"
+    tmp_file = dst.parent.parent / f"tmp_{uuid.uuid4()}"
     try:
         tmp_file.touch()
         cache_dir_mode = Path(tmp_file).stat().st_mode
-        os.chmod(src, stat.S_IMODE(cache_dir_mode))
+        os.chmod(str(src), stat.S_IMODE(cache_dir_mode))
     finally:
         tmp_file.unlink()
 
-    shutil.move(src, dst)
+    shutil.move(str(src), str(dst))
 
 
 def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
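
The umask probe in `_chmod_and_move` sidesteps reading the process umask directly: it creates a throwaway file next to the destination and copies that file's effective mode onto the blob. The same trick in isolation:

    import stat
    import uuid
    from pathlib import Path

    def effective_file_mode(folder: Path) -> int:
        probe = folder / f"tmp_{uuid.uuid4()}"
        try:
            probe.touch()  # created with the process's real umask applied
            return stat.S_IMODE(probe.stat().st_mode)
        finally:
            probe.unlink()
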
@@ -1739,32 +1937,3 @@ def _get_pointer_path(storage_folder: str, revision: str, relative_filename: str) -> str:
         f" `relative_filename='{relative_filename}'`."
     )
     return pointer_path
-
-
-def _to_local_dir(
-    path: str, local_dir: str, relative_filename: str, use_symlinks: Union[bool, Literal["auto"]]
-) -> str:
-    """Place a file in a local dir (different than cache_dir).
-
-    Either symlink to blob file in cache or duplicate file depending on `use_symlinks` and file size.
-    """
-    # Using `os.path.abspath` instead of `Path.resolve()` to avoid resolving symlinks
-    local_dir_filepath = os.path.join(local_dir, relative_filename)
-    if Path(os.path.abspath(local_dir)) not in Path(os.path.abspath(local_dir_filepath)).parents:
-        raise ValueError(
-            f"Cannot copy file '{relative_filename}' to local dir '{local_dir}': file would not be in the local"
-            " directory."
-        )
-
-    os.makedirs(os.path.dirname(local_dir_filepath), exist_ok=True)
-    real_blob_path = os.path.realpath(path)
-
-    # If "auto" (default) copy-paste small files to ease manual editing but symlink big files to save disk
-    if use_symlinks == "auto":
-        use_symlinks = os.stat(real_blob_path).st_size > constants.HF_HUB_LOCAL_DIR_AUTO_SYMLINK_THRESHOLD
-
-    if use_symlinks:
-        _create_symlink(real_blob_path, local_dir_filepath, new_blob=False)
-    else:
-        shutil.copyfile(real_blob_path, local_dir_filepath)
-    return local_dir_filepath