modelscope-hub 0.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- modelscope_hub-0.0.0/PKG-INFO +31 -0
- modelscope_hub-0.0.0/README.md +2 -0
- modelscope_hub-0.0.0/pyproject.toml +68 -0
- modelscope_hub-0.0.0/setup.cfg +4 -0
- modelscope_hub-0.0.0/src/modelscope_hub/__init__.py +71 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_cache_manager.py +190 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_download.py +293 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_git.py +225 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_legacy_api.py +414 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_openapi.py +552 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_repository.py +193 -0
- modelscope_hub-0.0.0/src/modelscope_hub/_upload.py +445 -0
- modelscope_hub-0.0.0/src/modelscope_hub/api.py +1533 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/__init__.py +1 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/base.py +158 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/cache.py +112 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/deploy.py +143 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/download.py +86 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/login.py +70 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/main.py +124 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/mcp.py +102 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/repo.py +194 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/secret.py +120 -0
- modelscope_hub-0.0.0/src/modelscope_hub/cli/upload.py +103 -0
- modelscope_hub-0.0.0/src/modelscope_hub/config.py +149 -0
- modelscope_hub-0.0.0/src/modelscope_hub/constants.py +150 -0
- modelscope_hub-0.0.0/src/modelscope_hub/errors.py +163 -0
- modelscope_hub-0.0.0/src/modelscope_hub/types.py +278 -0
- modelscope_hub-0.0.0/src/modelscope_hub/utils/__init__.py +14 -0
- modelscope_hub-0.0.0/src/modelscope_hub/utils/file_utils.py +99 -0
- modelscope_hub-0.0.0/src/modelscope_hub/utils/logger.py +58 -0
- modelscope_hub-0.0.0/src/modelscope_hub/version.py +3 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/PKG-INFO +31 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/SOURCES.txt +36 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/dependency_links.txt +1 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/entry_points.txt +3 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/requires.txt +11 -0
- modelscope_hub-0.0.0/src/modelscope_hub.egg-info/top_level.txt +1 -0
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: modelscope-hub
|
|
3
|
+
Version: 0.0.0
|
|
4
|
+
Summary: An OpenAPI-first Python SDK for interacting with the ModelScope Hub platform.
|
|
5
|
+
License: Apache-2.0
|
|
6
|
+
Keywords: modelscope,hub,sdk,openapi,machine-learning
|
|
7
|
+
Classifier: Development Status :: 3 - Alpha
|
|
8
|
+
Classifier: Intended Audience :: Developers
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Programming Language :: Python :: 3
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
13
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
15
|
+
Classifier: Programming Language :: Python :: 3.14
|
|
16
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
17
|
+
Requires-Python: >=3.10
|
|
18
|
+
Description-Content-Type: text/markdown
|
|
19
|
+
Requires-Dist: requests>=2.28
|
|
20
|
+
Requires-Dist: tqdm>=4.64.0
|
|
21
|
+
Requires-Dist: filelock>=3.9
|
|
22
|
+
Requires-Dist: urllib3>=1.26
|
|
23
|
+
Provides-Extra: dev
|
|
24
|
+
Requires-Dist: pytest>=7.0; extra == "dev"
|
|
25
|
+
Requires-Dist: pytest-mock; extra == "dev"
|
|
26
|
+
Requires-Dist: responses>=0.20; extra == "dev"
|
|
27
|
+
Requires-Dist: ruff; extra == "dev"
|
|
28
|
+
Requires-Dist: mypy; extra == "dev"
|
|
29
|
+
|
|
30
|
+
ModelScope hub
|
|
31
|
+
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
[project]
|
|
2
|
+
name = "modelscope-hub"
|
|
3
|
+
dynamic = ["version"]
|
|
4
|
+
description = "An OpenAPI-first Python SDK for interacting with the ModelScope Hub platform."
|
|
5
|
+
requires-python = ">=3.10"
|
|
6
|
+
license = {text = "Apache-2.0"}
|
|
7
|
+
readme = "README.md"
|
|
8
|
+
keywords = ["modelscope", "hub", "sdk", "openapi", "machine-learning"]
|
|
9
|
+
classifiers = [
|
|
10
|
+
"Development Status :: 3 - Alpha",
|
|
11
|
+
"Intended Audience :: Developers",
|
|
12
|
+
"License :: OSI Approved :: Apache Software License",
|
|
13
|
+
"Programming Language :: Python :: 3",
|
|
14
|
+
"Programming Language :: Python :: 3.10",
|
|
15
|
+
"Programming Language :: Python :: 3.11",
|
|
16
|
+
"Programming Language :: Python :: 3.12",
|
|
17
|
+
"Programming Language :: Python :: 3.13",
|
|
18
|
+
"Programming Language :: Python :: 3.14",
|
|
19
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
20
|
+
]
|
|
21
|
+
dependencies = [
|
|
22
|
+
"requests>=2.28",
|
|
23
|
+
"tqdm>=4.64.0",
|
|
24
|
+
"filelock>=3.9",
|
|
25
|
+
"urllib3>=1.26",
|
|
26
|
+
]
|
|
27
|
+
|
|
28
|
+
[project.optional-dependencies]
|
|
29
|
+
dev = [
|
|
30
|
+
"pytest>=7.0",
|
|
31
|
+
"pytest-mock",
|
|
32
|
+
"responses>=0.20",
|
|
33
|
+
"ruff",
|
|
34
|
+
"mypy",
|
|
35
|
+
]
|
|
36
|
+
|
|
37
|
+
[project.scripts]
|
|
38
|
+
modelscope = "modelscope_hub.cli.main:run_cmd"
|
|
39
|
+
ms = "modelscope_hub.cli.main:run_cmd"
|
|
40
|
+
|
|
41
|
+
[build-system]
|
|
42
|
+
requires = ["setuptools>=68.0"]
|
|
43
|
+
build-backend = "setuptools.build_meta"
|
|
44
|
+
|
|
45
|
+
[tool.setuptools.packages.find]
|
|
46
|
+
where = ["src"]
|
|
47
|
+
|
|
48
|
+
[tool.setuptools.dynamic]
|
|
49
|
+
version = {attr = "modelscope_hub.version.__version__"}
|
|
50
|
+
|
|
51
|
+
[tool.ruff]
# Target the oldest interpreter the package supports (requires-python is
# ">=3.10") so the pyupgrade ("UP") rules never propose syntax that is
# unavailable on 3.10.
target-version = "py310"
line-length = 120
|
|
54
|
+
|
|
55
|
+
[tool.ruff.lint]
|
|
56
|
+
select = ["E", "F", "I", "N", "W", "UP"]
|
|
57
|
+
|
|
58
|
+
[tool.pytest.ini_options]
|
|
59
|
+
testpaths = ["tests"]
|
|
60
|
+
markers = [
|
|
61
|
+
"remote: tests requiring remote API access (need .env credentials)",
|
|
62
|
+
]
|
|
63
|
+
addopts = "-v --tb=short"
|
|
64
|
+
|
|
65
|
+
[tool.mypy]
|
|
66
|
+
python_version = "3.13"
|
|
67
|
+
strict = false
|
|
68
|
+
warn_return_any = true
|
|
@@ -0,0 +1,71 @@
|
|
|
1
|
+
"""ModelScope Hub SDK.
|
|
2
|
+
|
|
3
|
+
An OpenAPI-first Python SDK for interacting with the ModelScope Hub platform.
|
|
4
|
+
|
|
5
|
+
The public surface is intentionally small: most callers should construct a
|
|
6
|
+
single :class:`HubApi` instance and call its methods. The data classes
|
|
7
|
+
exported alongside it provide structured return types for type-checked code.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from .api import HubApi
|
|
13
|
+
from .config import HubConfig, get_default_config, set_default_config
|
|
14
|
+
from .constants import License, RepoType, Visibility
|
|
15
|
+
from .errors import (
|
|
16
|
+
APIError,
|
|
17
|
+
AuthenticationError,
|
|
18
|
+
CacheError,
|
|
19
|
+
FileIntegrityError,
|
|
20
|
+
HubError,
|
|
21
|
+
NetworkError,
|
|
22
|
+
NotFoundError,
|
|
23
|
+
PermissionError,
|
|
24
|
+
RateLimitError,
|
|
25
|
+
ServerError,
|
|
26
|
+
ValidationError,
|
|
27
|
+
)
|
|
28
|
+
from .types import (
|
|
29
|
+
CacheInfo,
|
|
30
|
+
CachedRepoInfo,
|
|
31
|
+
CommitInfo,
|
|
32
|
+
FileInfo,
|
|
33
|
+
PagedResult,
|
|
34
|
+
RepoInfo,
|
|
35
|
+
UserInfo,
|
|
36
|
+
)
|
|
37
|
+
from .version import __version__
|
|
38
|
+
|
|
39
|
+
__all__ = [
|
|
40
|
+
"__version__",
|
|
41
|
+
# Facade
|
|
42
|
+
"HubApi",
|
|
43
|
+
# Configuration
|
|
44
|
+
"HubConfig",
|
|
45
|
+
"get_default_config",
|
|
46
|
+
"set_default_config",
|
|
47
|
+
# Enums
|
|
48
|
+
"License",
|
|
49
|
+
"RepoType",
|
|
50
|
+
"Visibility",
|
|
51
|
+
# Data classes
|
|
52
|
+
"CacheInfo",
|
|
53
|
+
"CachedRepoInfo",
|
|
54
|
+
"CommitInfo",
|
|
55
|
+
"FileInfo",
|
|
56
|
+
"PagedResult",
|
|
57
|
+
"RepoInfo",
|
|
58
|
+
"UserInfo",
|
|
59
|
+
# Errors
|
|
60
|
+
"APIError",
|
|
61
|
+
"AuthenticationError",
|
|
62
|
+
"CacheError",
|
|
63
|
+
"FileIntegrityError",
|
|
64
|
+
"HubError",
|
|
65
|
+
"NetworkError",
|
|
66
|
+
"NotFoundError",
|
|
67
|
+
"PermissionError",
|
|
68
|
+
"RateLimitError",
|
|
69
|
+
"ServerError",
|
|
70
|
+
"ValidationError",
|
|
71
|
+
]
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
"""Cache management utilities.
|
|
2
|
+
|
|
3
|
+
Provides scanning and cleanup of the local blob/snapshot cache produced
|
|
4
|
+
by :class:`~._download.DownloadManager`.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
import shutil
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from .config import get_default_config
|
|
13
|
+
from .constants import RepoType
|
|
14
|
+
from .errors import CacheError
|
|
15
|
+
from .types import CacheInfo, CachedRepoInfo
|
|
16
|
+
from .utils.logger import get_logger
|
|
17
|
+
|
|
18
|
+
# Logger used by the cache scanning / clearing helpers in this module.
logger = get_logger("cache")

# Repository types whose cache directories are visited when the caller does
# not restrict the operation to a single type.
_DEFAULT_SCAN_TYPES = [
    RepoType.MODEL,
    RepoType.DATASET,
    RepoType.STUDIO,
    RepoType.MCP,
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def scan_cache(cache_dir: Path | None = None) -> CacheInfo:
    """Scan the local cache and return metadata about cached repositories.

    The cache is expected to be laid out as
    ``{cache_dir}/{type}s/{owner}--{name}/snapshots/{revision}/...``.

    Parameters
    ----------
    cache_dir:
        Override for the cache directory. Defaults to the SDK config default.

    Returns
    -------
    CacheInfo
        Summary of all cached repositories and their total size. When the
        cache directory does not exist, an empty summary is returned.
    """
    config = get_default_config()
    root = Path(cache_dir) if cache_dir else config.cache_dir

    if not root.is_dir():
        return CacheInfo(repos=[], total_size=0, cache_dir=str(root))

    repos: list[CachedRepoInfo] = []
    total_size = 0

    for repo_type in _DEFAULT_SCAN_TYPES:
        # Build the directory name from the member's ``value`` when it has
        # one: formatting an Enum member directly may render its qualified
        # name (e.g. ``RepoType.MODEL``) instead of the value, depending on
        # how the enum is declared -- TODO confirm against constants.RepoType.
        type_name = str(getattr(repo_type, "value", repo_type))
        type_dir = root / f"{type_name}s"
        if not type_dir.is_dir():
            continue

        for repo_dir in type_dir.iterdir():
            if not repo_dir.is_dir():
                continue

            size = _dir_size(repo_dir)
            total_size += size

            try:
                last_accessed_ts = repo_dir.stat().st_atime
            except OSError:
                last_accessed_ts = 0

            repos.append(
                CachedRepoInfo(
                    # ``owner/name`` is encoded as ``owner--name``; undo only
                    # the first separator so a repo name that itself contains
                    # ``--`` is not turned into extra path segments.
                    repo_id=repo_dir.name.replace("--", "/", 1),
                    repo_type=repo_type,
                    revision=_cached_revisions(repo_dir / "snapshots"),
                    size_on_disk=size,
                    nb_files=_count_files(repo_dir),
                    last_accessed=last_accessed_ts if last_accessed_ts > 0 else None,
                    local_path=str(repo_dir),
                )
            )

    return CacheInfo(
        repos=repos,
        total_size=total_size,
        cache_dir=str(root),
    )


def _count_files(path: Path) -> int:
    """Count regular files under ``path``; best-effort on filesystem errors."""
    count = 0
    try:
        for entry in path.rglob("*"):
            if entry.is_file():
                count += 1
    except OSError:
        # Same policy as ``_dir_size``: an unreadable subtree must not abort
        # the whole scan, so report what could be counted.
        pass
    return count


def _cached_revisions(snapshots_dir: Path) -> str | None:
    """Return up to five snapshot revision names, comma-joined, or ``None``."""
    if not snapshots_dir.is_dir():
        return None
    try:
        # Sorted so the result does not depend on directory iteration order.
        names = sorted(d.name for d in snapshots_dir.iterdir() if d.is_dir())
    except OSError:
        return None
    if not names:
        return None
    return ",".join(names[:5])
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def clear_cache(
    cache_dir: Path | None = None,
    repo_type: str | None = None,
    repo_id: str | None = None,
) -> int:
    """Remove cached data from disk.

    Parameters
    ----------
    cache_dir:
        Override for the cache directory. Defaults to the SDK config default.
    repo_type:
        If given, only clear caches of this repo type.
    repo_id:
        If given, only clear the cache for this specific repository.
        Must be used with ``repo_type``.

    Returns
    -------
    int
        Number of bytes freed (``0`` when nothing matched).

    Raises
    ------
    CacheError
        If ``repo_id`` is given without ``repo_type``, if ``repo_type`` or
        ``repo_id`` does not map to a single directory name inside the cache,
        or on filesystem errors while deleting.
    """
    config = get_default_config()
    root = Path(cache_dir) if cache_dir else config.cache_dir

    # Guard against accidental nuke: passing only ``repo_id`` would otherwise
    # silently fall through to the "clear everything" branch below.
    if repo_id and not repo_type:
        raise CacheError("repo_type is required when repo_id is specified")

    if not root.is_dir():
        logger.info("Cache directory does not exist: %s", root)
        return 0

    freed = 0

    if repo_type:
        type_dir = root / _cache_path_component(_type_segment(repo_type), "repo_type")
        if repo_id:
            # Clear one specific repository.
            safe_id = _cache_path_component(repo_id.replace("/", "--"), "repo_id")
            target = type_dir / safe_id
            if target.is_dir():
                freed = _dir_size(target)
                _safe_rmtree(target)
                logger.info("Cleared cache for %s/%s (%d bytes)", repo_type, repo_id, freed)
        elif type_dir.is_dir():
            # Clear every repository of this type.
            freed = _dir_size(type_dir)
            _safe_rmtree(type_dir)
            logger.info("Cleared all %s caches (%d bytes)", repo_type, freed)
    else:
        # Clear everything the SDK knows about; unrelated entries that may
        # live under ``root`` are left alone.
        for default_type in _DEFAULT_SCAN_TYPES:
            type_dir = root / _type_segment(default_type)
            if type_dir.is_dir():
                freed += _dir_size(type_dir)
                _safe_rmtree(type_dir)
        logger.info("Cleared all caches (%d bytes)", freed)

    return freed


def _type_segment(repo_type: str) -> str:
    """Return the cache directory name for a repo type (``model`` -> ``models``)."""
    # Prefer ``.value`` when present: formatting an Enum member directly may
    # render its qualified name rather than the value, depending on how the
    # enum is declared -- TODO confirm against constants.RepoType.
    name = str(getattr(repo_type, "value", repo_type))
    return name if name.endswith("s") else f"{name}s"


def _cache_path_component(value: str, label: str) -> str:
    """Return ``value`` if it names exactly one directory level, else raise."""
    # The value is joined onto the cache root and the result is deleted
    # recursively, so anything that could climb out of the cache (``..``),
    # replace the root (absolute paths / drives) or add levels is refused.
    if not value or value == ".." or Path(value).name != value:
        raise CacheError(f"Invalid {label} for cache path: {value!r}")
    return value
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
# ---------------------------------------------------------------------------
|
|
168
|
+
# Helpers
|
|
169
|
+
# ---------------------------------------------------------------------------
|
|
170
|
+
def _dir_size(path: Path) -> int:
|
|
171
|
+
"""Compute total size of all files under ``path`` recursively."""
|
|
172
|
+
total = 0
|
|
173
|
+
try:
|
|
174
|
+
for f in path.rglob("*"):
|
|
175
|
+
if f.is_file():
|
|
176
|
+
try:
|
|
177
|
+
total += f.stat().st_size
|
|
178
|
+
except OSError:
|
|
179
|
+
pass
|
|
180
|
+
except OSError:
|
|
181
|
+
pass
|
|
182
|
+
return total
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
def _safe_rmtree(path: Path) -> None:
|
|
186
|
+
"""Remove a directory tree, raising CacheError on failure."""
|
|
187
|
+
try:
|
|
188
|
+
shutil.rmtree(path)
|
|
189
|
+
except OSError as exc:
|
|
190
|
+
raise CacheError(f"Failed to remove {path}: {exc}") from exc
|
|
@@ -0,0 +1,293 @@
|
|
|
1
|
+
"""Internal file download implementation.
|
|
2
|
+
|
|
3
|
+
Supports single-file and whole-repo (snapshot) downloads with:
|
|
4
|
+
- HTTP Range-based resume
|
|
5
|
+
- SHA256 integrity verification
|
|
6
|
+
- tqdm progress display
|
|
7
|
+
- Parallel downloads via ThreadPoolExecutor
|
|
8
|
+
- Local snapshot cache directory management
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import fnmatch
|
|
14
|
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
from typing import TYPE_CHECKING
|
|
17
|
+
|
|
18
|
+
from tqdm.auto import tqdm
|
|
19
|
+
|
|
20
|
+
from .constants import DOWNLOAD_CHUNK_SIZE
|
|
21
|
+
from .errors import FileIntegrityError, NetworkError
|
|
22
|
+
from .utils.file_utils import compute_hash, ensure_dir
|
|
23
|
+
from .utils.logger import get_logger
|
|
24
|
+
|
|
25
|
+
if TYPE_CHECKING:
|
|
26
|
+
from .config import HubConfig
|
|
27
|
+
from ._legacy_api import LegacyClient
|
|
28
|
+
|
|
29
|
+
logger = get_logger("download")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _matches_patterns(path: str, patterns: list[str] | None) -> bool:
|
|
33
|
+
"""Check if path matches any of the glob patterns."""
|
|
34
|
+
if not patterns:
|
|
35
|
+
return False
|
|
36
|
+
return any(fnmatch.fnmatch(path, pat) for pat in patterns)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class DownloadManager:
    """Internal file download implementation.

    Dependencies are injected via constructor to keep this class testable.
    Files are stored under::

        {cache_dir}/{type}s/{owner}--{name}/snapshots/{revision}/{file_path}
    """

    def __init__(self, legacy_client: "LegacyClient", config: "HubConfig") -> None:
        self._client = legacy_client
        self._config = config

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def download_file(
        self,
        repo_id: str,
        repo_type: str,
        file_path: str,
        revision: str = "master",
        cache_dir: Path | None = None,
        force: bool = False,
    ) -> Path:
        """Download a single file from a repository.

        Uses the snapshot cache layout::

            {cache_dir}/{type}s/{owner}--{name}/snapshots/{revision}/{file_path}

        Parameters
        ----------
        repo_id:
            Repository identifier (``owner/name``).
        repo_type:
            One of the :class:`~.constants.RepoType` values.
        file_path:
            Path within the repository. Must be relative and must not
            contain ``..`` components.
        revision:
            Branch, tag, or commit hash.
        cache_dir:
            Override for the default cache directory.
        force:
            Re-download even if file exists in cache.

        Returns
        -------
        Path
            Path to the downloaded (or cached) file on disk.

        Raises
        ------
        ValueError
            If ``file_path`` is empty, absolute, or tries to climb out of
            the snapshot directory.
        NetworkError
            If the download request cannot be started.
        """
        # ``file_path`` may come straight from a server file listing (see
        # ``download_repo``), so refuse anything that would be written
        # outside of the snapshot directory.
        self._validate_file_path(file_path)

        root = self._repo_cache_dir(repo_id, repo_type, cache_dir)
        target = root / "snapshots" / revision / file_path

        # Return cached file if present and not forced
        if not force and target.exists():
            logger.debug("Cache hit: %s", target)
            return target

        # Ensure the snapshot directory exists before streaming into it.
        ensure_dir(target.parent)

        # Perform download with resume support
        return self._download_with_resume(repo_id, repo_type, file_path, revision, target)

    def download_repo(
        self,
        repo_id: str,
        repo_type: str,
        revision: str = "master",
        cache_dir: Path | None = None,
        allow_patterns: list[str] | None = None,
        ignore_patterns: list[str] | None = None,
        max_workers: int = 4,
    ) -> Path:
        """Download an entire repository (snapshot download).

        Individual file failures are logged and do not abort the remaining
        downloads; the snapshot directory is returned either way.

        Parameters
        ----------
        repo_id:
            Repository identifier (``owner/name``).
        repo_type:
            One of the :class:`~.constants.RepoType` values.
        revision:
            Branch, tag, or commit hash.
        cache_dir:
            Override for the default cache directory.
        allow_patterns:
            Only files matching these globs will be downloaded.
        ignore_patterns:
            Files matching these globs will be skipped.
        max_workers:
            Number of parallel download threads.

        Returns
        -------
        Path
            Path to the snapshot directory.
        """
        root = self._repo_cache_dir(repo_id, repo_type, cache_dir)
        snapshot_dir = ensure_dir(root / "snapshots" / revision)

        # Fetch file tree
        files = self._client.list_repo_files(
            repo_id=repo_id,
            repo_type=repo_type,
            revision=revision,
            recursive=True,
        )

        # Filter files
        file_paths: list[str] = []
        for f in files:
            # Entries are read with either capitalised or lower-case keys.
            path = f.get("Path") or f.get("path") or f.get("Name") or ""
            ftype = f.get("Type") or f.get("type") or "blob"
            if ftype == "tree" or not path:
                continue
            if allow_patterns and not _matches_patterns(path, allow_patterns):
                continue
            if ignore_patterns and _matches_patterns(path, ignore_patterns):
                continue
            file_paths.append(path)

        if not file_paths:
            logger.info("No files to download for %s@%s", repo_id, revision)
            return snapshot_dir

        logger.info("Downloading %d files from %s@%s", len(file_paths), repo_id, revision)

        # Parallel download
        errors: list[str] = []
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = {
                executor.submit(
                    self.download_file,
                    repo_id=repo_id,
                    repo_type=repo_type,
                    file_path=fp,
                    revision=revision,
                    cache_dir=cache_dir,
                ): fp
                for fp in file_paths
            }

            with tqdm(total=len(file_paths), desc="Downloading", unit="file") as pbar:
                for future in as_completed(futures):
                    fp = futures[future]
                    try:
                        future.result()
                    except Exception as exc:
                        # Best-effort: record the failure and keep going so
                        # one bad file does not cancel the whole snapshot.
                        errors.append(f"{fp}: {exc}")
                        logger.error("Failed to download %s: %s", fp, exc)
                    finally:
                        pbar.update(1)

        if errors:
            logger.warning("%d file(s) failed to download", len(errors))

        return snapshot_dir

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _validate_file_path(file_path: str) -> None:
        """Raise ``ValueError`` unless ``file_path`` stays inside a snapshot."""
        relative = Path(file_path)
        # ``anchor`` is non-empty for absolute paths and for drive-qualified
        # paths; joining either onto the snapshot directory would replace it.
        if not file_path or relative.anchor or ".." in relative.parts:
            raise ValueError(f"Unsafe repository file path: {file_path!r}")

    def _repo_cache_dir(
        self,
        repo_id: str,
        repo_type: str,
        cache_dir: Path | None = None,
    ) -> Path:
        """Compute (and create) the cache directory for a given repo."""
        # Accept ``str`` as well as ``Path`` overrides, as the cache manager
        # functions do.
        base = Path(cache_dir) if cache_dir else self._config.cache_dir
        # Prefer ``.value`` when present: formatting an Enum member directly
        # may render its qualified name rather than the value, depending on
        # how the enum is declared -- TODO confirm against constants.RepoType.
        type_name = str(getattr(repo_type, "value", repo_type))
        segment = type_name if type_name.endswith("s") else f"{type_name}s"
        # Encode repo_id: owner/name -> owner--name for filesystem safety
        safe_id = repo_id.replace("/", "--")
        return ensure_dir(base / segment / safe_id)

    def _download_with_resume(
        self,
        repo_id: str,
        repo_type: str,
        file_path: str,
        revision: str,
        target: Path,
    ) -> Path:
        """Download a file with HTTP Range resume support.

        Data is streamed into ``<target name>.incomplete`` next to ``target``
        and moved over ``target`` once the stream has been fully consumed.
        A leftover ``.incomplete`` file from an earlier attempt is continued
        when the server answers the Range request with status 206; otherwise
        the download restarts from scratch.

        Raises
        ------
        NetworkError
            If the download request cannot be started.
        """
        # Use a temp file for partial downloads
        tmp_path = target.with_name(target.name + ".incomplete")

        existing_size = tmp_path.stat().st_size if tmp_path.exists() else 0

        # Prepare headers for resume
        extra_headers: dict[str, str] = {}
        if existing_size > 0:
            extra_headers["Range"] = f"bytes={existing_size}-"
            logger.debug("Resuming download from byte %d", existing_size)

        try:
            resp = self._client.download_stream(
                repo_id=repo_id,
                repo_type=repo_type,
                file_path=file_path,
                revision=revision,
                headers=extra_headers if extra_headers else None,
            )
        except Exception as exc:
            raise NetworkError(f"Download failed for {file_path}: {exc}") from exc

        try:
            # Determine total size
            content_length = resp.headers.get("Content-Length")
            total_size = int(content_length) if content_length else None
            is_resumed = resp.status_code == 206

            if is_resumed:
                # A 206 body only carries the remaining bytes.
                if total_size:
                    total_size += existing_size
                mode = "ab"
            else:
                # The Range request was not honoured: start over.
                existing_size = 0
                mode = "wb"

            # Write to temp file
            with tqdm(
                total=total_size,
                initial=existing_size,
                unit="B",
                unit_scale=True,
                desc=Path(file_path).name,
                leave=False,
            ) as pbar:
                with open(tmp_path, mode) as fh:
                    for chunk in resp.iter_content(chunk_size=DOWNLOAD_CHUNK_SIZE):
                        if chunk:
                            fh.write(chunk)
                            pbar.update(len(chunk))
        finally:
            # Release the streamed response (presumably a requests-style
            # object holding a pooled connection -- verify against
            # LegacyClient.download_stream) on success and on failure.
            close = getattr(resp, "close", None)
            if callable(close):
                close()

        # Move temp -> final
        tmp_path.replace(target)
        logger.debug("Downloaded: %s", target)
        return target

    def verify_file(self, file_path: Path, expected_sha256: str) -> bool:
        """Verify a downloaded file's SHA256 hash.

        Returns ``True`` when the hash matches and raises
        :class:`~.errors.FileIntegrityError` on mismatch.
        """
        actual = compute_hash(file_path, "sha256")
        if actual != expected_sha256:
            raise FileIntegrityError(
                f"Hash mismatch for {file_path.name}: "
                f"expected {expected_sha256[:16]}..., got {actual[:16]}..."
            )
        return True
|