ossllms 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ossllms/__init__.py +33 -0
- ossllms/api.py +194 -0
- ossllms/cache.py +102 -0
- ossllms/catalog.py +517 -0
- ossllms/cli.py +1502 -0
- ossllms/compat.py +31 -0
- ossllms/config.py +75 -0
- ossllms/contrib_plan.py +489 -0
- ossllms/contrib_runtime.py +302 -0
- ossllms/contrib_scan.py +557 -0
- ossllms/contrib_worker.py +526 -0
- ossllms/data/default-catalog/catalog.json +11 -0
- ossllms/data/default-catalog/hash-index.json +117 -0
- ossllms/data/default-catalog/manifests/ossllms/smoke-tiny/0.0.1/files/LICENSE +1 -0
- ossllms/data/default-catalog/manifests/ossllms/smoke-tiny/0.0.1/files/README.md +3 -0
- ossllms/data/default-catalog/manifests/ossllms/smoke-tiny/0.0.1/files/config.json +1 -0
- ossllms/data/default-catalog/manifests/ossllms/smoke-tiny/0.0.1/files/model.safetensors +1 -0
- ossllms/data/default-catalog/manifests/ossllms/smoke-tiny/0.0.1/release.json +71 -0
- ossllms/data/default-catalog/manifests/sentence-transformers/all-MiniLM-L6-v2/0.0.1+1110a243fdf4/release.json +161 -0
- ossllms/data/default-catalog/manifests/sentence-transformers/all-MiniLM-L6-v2/0.0.1+1110a243fdf4/release.torrent +0 -0
- ossllms/data/default-catalog/search-index.json +25 -0
- ossllms/data/manifest.schema.json +122 -0
- ossllms/data/trusted-keys/ossllms.pub +2 -0
- ossllms/denylist.py +251 -0
- ossllms/engine.py +428 -0
- ossllms/hf.py +326 -0
- ossllms/keys.py +228 -0
- ossllms/manifest.py +109 -0
- ossllms/provenance.py +178 -0
- ossllms/publish.py +706 -0
- ossllms/reachability.py +78 -0
- ossllms/refs.py +172 -0
- ossllms/resolve.py +213 -0
- ossllms/search.py +367 -0
- ossllms/seed.py +859 -0
- ossllms/seed_engine.py +444 -0
- ossllms/signing.py +64 -0
- ossllms/torrent.py +407 -0
- ossllms/verify.py +269 -0
- ossllms-0.1.0.dist-info/METADATA +120 -0
- ossllms-0.1.0.dist-info/RECORD +43 -0
- ossllms-0.1.0.dist-info/WHEEL +4 -0
- ossllms-0.1.0.dist-info/entry_points.txt +2 -0
ossllms/__init__.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""ossllms — pull open-weight AI models from the ossllms preservation network.
|
|
2
|
+
|
|
3
|
+
A drop-in for huggingface_hub that resolves from the torrent layer, verifies
|
|
4
|
+
integrity + publisher signature, and lands files in the HF-compatible cache.
|
|
5
|
+
|
|
6
|
+
from ossllms import snapshot_download
|
|
7
|
+
path = snapshot_download("Qwen/Qwen2.5-7B")
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .api import (
|
|
12
|
+
IntegrityError,
|
|
13
|
+
PullResult,
|
|
14
|
+
UnverifiedOriginError,
|
|
15
|
+
hf_hub_download,
|
|
16
|
+
pull,
|
|
17
|
+
snapshot_download,
|
|
18
|
+
)
|
|
19
|
+
from .verify import TrustStore, Verdict
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"pull",
|
|
25
|
+
"snapshot_download",
|
|
26
|
+
"hf_hub_download",
|
|
27
|
+
"PullResult",
|
|
28
|
+
"Verdict",
|
|
29
|
+
"TrustStore",
|
|
30
|
+
"IntegrityError",
|
|
31
|
+
"UnverifiedOriginError",
|
|
32
|
+
"__version__",
|
|
33
|
+
]
|
ossllms/api.py
ADDED
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""High-level API. `pull()` is the core; `snapshot_download` / `hf_hub_download`
|
|
2
|
+
mirror the huggingface_hub names for drop-in use.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import shutil
|
|
7
|
+
import tempfile
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path, PurePosixPath
|
|
10
|
+
from typing import Callable, List, Optional
|
|
11
|
+
|
|
12
|
+
from . import config
|
|
13
|
+
from .cache import place_into_cache, reuse_from_blobs
|
|
14
|
+
from .denylist import load_denylist
|
|
15
|
+
from .engine import EngineError, get_engine
|
|
16
|
+
from .manifest import model_id, version
|
|
17
|
+
from .resolve import resolve_manifest_with_source
|
|
18
|
+
from .verify import TrustStore, Verdict, verify_manifest
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class IntegrityError(RuntimeError):
    """Raised when staged content fails its integrity check or an artifact path is unsafe."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class UnverifiedOriginError(RuntimeError):
    """Raised when a publisher signature is required but could not be verified."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class PullResult:
    """Outcome of a successful `pull()`."""

    # Directory returned by `place_into_cache` for this pull.
    path: Path
    # Model identifier derived from the manifest via `model_id(manifest)`.
    repo_id: str
    # Caller-supplied revision, or the manifest version when none was given.
    revision: str
    # The resolved manifest the pull was performed against.
    manifest: dict
    # Result of `verify_manifest` over the selected files.
    verdict: Verdict
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def pull(
    ref: str,
    *,
    catalog: Optional[str] = None,
    cache_dir=None,
    revision: Optional[str] = None,
    include: Optional[List[str]] = None,
    engine: str = "auto",
    engine_impl=None,
    trust: Optional[TrustStore] = None,
    denylist: Optional[str] = None,
    require_signature: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> PullResult:
    """Resolve, download, verify, and cache a model version.

    Integrity (per-file SHA-256) is ALWAYS enforced. Signature/origin failure
    raises only when require_signature=True; otherwise it returns an Unverified
    verdict (community upload) and proceeds, matching the trust UX.

    Args:
        ref: Model reference handed to `resolve_manifest_with_source`.
        catalog: Catalog to resolve against; falls back to
            `config.default_catalog()` when falsy.
        cache_dir: Cache root; falls back to `config.hf_hub_cache()` when falsy.
        revision: Name of the snapshot to write; defaults to the manifest version.
        include: fnmatch patterns selecting artifacts; falsy selects all of them.
        engine: Engine selector passed to `get_engine` when files must be fetched.
        engine_impl: Engine object used instead of `get_engine(...)` when given.
        trust: Trust store; defaults to `TrustStore.from_dir(config.trust_dir())`.
        denylist: Denylist source; falls back to `config.default_denylist()`.
            The check is skipped when that is also ``None``.
        require_signature: Raise if the verdict's origin check did not pass.
        progress: Optional callback receiving a status line; also forwarded to
            the engine's `fetch`.

    Returns:
        A `PullResult` carrying the cache path, ids, manifest and verdict.

    Raises:
        IntegrityError: A selected path is unsafe or a staged file failed verification.
        UnverifiedOriginError: `require_signature` is set and origin is unverified.
        EngineError: The engine failed and no staged file was found corrupt.
    """
    catalog = catalog or config.default_catalog()
    cache_dir = Path(cache_dir) if cache_dir else config.hf_hub_cache()
    trust = trust if trust is not None else TrustStore.from_dir(config.trust_dir())

    resolution = resolve_manifest_with_source(ref, catalog)
    manifest = resolution.manifest
    repo_id = model_id(manifest)
    ver = version(manifest)
    rev = revision or ver
    denylist = denylist or config.default_denylist()
    if denylist is not None:
        # Runs before anything is staged or fetched.
        load_denylist(denylist, trust).check_manifest(manifest)

    with tempfile.TemporaryDirectory(prefix="ossllms-") as staged:
        selected = include_to_paths(manifest, include)
        # Validate every selected path up front; the call raises on an unsafe one.
        for path in selected:
            _safe_artifact_relpath(path)
        sel_set = set(selected)
        selected_artifacts = [a for a in manifest["artifacts"] if a["path"] in sel_set]

        # Reuse byte-identical files already in the blob store; fetch only the rest.
        reused = set(reuse_from_blobs(staged, cache_dir, repo_id, selected_artifacts))
        missing = [p for p in selected if p not in reused]
        local = set()
        if resolution.local_source_dir is not None and missing:
            # Next cheapest source: files sitting beside the resolved manifest.
            local = set(
                _copy_from_local_source(
                    resolution.local_source_dir,
                    staged,
                    missing,
                )
            )
            missing = [p for p in missing if p not in local]

        # An engine is only obtained when something still has to be fetched.
        eng = None
        if missing:
            eng = engine_impl if engine_impl is not None else get_engine(engine, manifest)
        if progress:
            engine_name = eng.name if eng is not None else "none"
            progress(
                f"engine: {engine_name}; reuse {len(reused)} file(s), "
                f"local {len(local)} file(s), fetch {len(missing)}"
            )
        if missing:
            try:
                eng.fetch(manifest, staged, include=missing, progress=progress)
            except EngineError as exc:
                # Surface a corrupt staged file as IntegrityError; otherwise
                # re-raise the engine failure unchanged.
                _raise_integrity_error_if_staged_file_failed(manifest, staged, trust, sel_set, exc)
                raise

        # Reused and locally copied files are verified too, not just fetched ones.
        verdict = verify_manifest(manifest, staged, trust, only=sel_set)

        if not verdict.integrity_ok:
            bad = [f"{f.path} ({f.reason})" for f in verdict.files if not f.ok]
            raise IntegrityError("integrity check failed: " + "; ".join(bad))
        if require_signature and not verdict.origin_ok:
            raise UnverifiedOriginError(
                "publisher signature could not be verified: " + "; ".join(verdict.messages)
            )

        # Only verified content reaches the cache.
        target = place_into_cache(staged, cache_dir, repo_id, rev, manifest=manifest)

    return PullResult(target, repo_id, rev, manifest, verdict)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def include_to_paths(manifest: dict, include: Optional[List[str]]) -> List[str]:
    """Return the manifest artifact paths selected by the *include* patterns.

    A falsy *include* (``None`` or empty) selects every artifact. Otherwise an
    artifact is kept when its path matches at least one pattern under
    ``fnmatch.fnmatch``. Manifest order is preserved.
    """
    from fnmatch import fnmatch

    every_path = [artifact["path"] for artifact in manifest["artifacts"]]
    if not include:
        return every_path

    chosen: List[str] = []
    for candidate in every_path:
        for pattern in include:
            if fnmatch(candidate, pattern):
                chosen.append(candidate)
                break
    return chosen
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _copy_from_local_source(source_dir: Path, staged_dir, paths: List[str]) -> List[str]:
    """Copy the requested artifacts that exist under *source_dir* into staging.

    Each path is validated with `_safe_artifact_relpath`, then resolved; a
    resolved location outside the source root raises `IntegrityError`.
    Artifacts that are not regular files in the source are skipped silently.

    Returns:
        The subset of *paths* that was copied, in input order.
    """
    root = Path(source_dir).resolve()
    staging_root = Path(staged_dir)
    found: List[str] = []
    for requested in paths:
        relative = _safe_artifact_relpath(requested)
        origin = (root / relative).resolve()
        # After resolution the file must still sit beneath the source root.
        try:
            origin.relative_to(root)
        except ValueError as escape:
            raise IntegrityError(f"artifact path escapes local source: {requested!r}") from escape
        if origin.is_file():
            destination = staging_root / relative
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(origin, destination)
            found.append(requested)
    return found
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _safe_artifact_relpath(path: str) -> Path:
|
|
156
|
+
if not isinstance(path, str) or not path:
|
|
157
|
+
raise IntegrityError(f"unsafe artifact path: {path!r}")
|
|
158
|
+
rel = PurePosixPath(path)
|
|
159
|
+
if rel.is_absolute() or not rel.parts or any(part in ("", ".", "..") for part in rel.parts):
|
|
160
|
+
raise IntegrityError(f"unsafe artifact path: {path!r}")
|
|
161
|
+
return Path(*rel.parts)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _raise_integrity_error_if_staged_file_failed(
    manifest: dict,
    staged_dir,
    trust: TrustStore,
    only: set,
    cause: EngineError,
) -> None:
    """Turn an engine failure into `IntegrityError` when a staged file is corrupt.

    Re-verifies the staging directory. Files that failed for any reason other
    than "missing" are reported in an `IntegrityError` chained to *cause*.
    When every failure is merely a missing file, the function returns normally.
    """
    interim = verify_manifest(manifest, staged_dir, trust, only=only)
    corrupt = []
    for entry in interim.files:
        if entry.ok or entry.reason == "missing":
            continue
        corrupt.append(f"{entry.path} ({entry.reason})")
    if not corrupt:
        return
    raise IntegrityError("integrity check failed: " + "; ".join(corrupt)) from cause
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# --- huggingface_hub-compatible surface -------------------------------------
|
|
178
|
+
|
|
179
|
+
def snapshot_download(repo_id: str, *, revision: Optional[str] = None,
                      allow_patterns: Optional[List[str]] = None,
                      cache_dir=None, catalog: Optional[str] = None,
                      **_ignored) -> str:
    """Drop-in for huggingface_hub.snapshot_download. Returns the local path.

    A truthy *revision* is appended to the reference as ``repo_id@revision``.
    *allow_patterns* is forwarded to `pull` as its ``include`` filter. Any
    other keyword argument is accepted and ignored.
    """
    if revision:
        target_ref = f"{repo_id}@{revision}"
    else:
        target_ref = repo_id
    outcome = pull(target_ref, catalog=catalog, cache_dir=cache_dir, include=allow_patterns)
    return str(outcome.path)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def hf_hub_download(repo_id: str, filename: str, *, revision: Optional[str] = None,
                    cache_dir=None, catalog: Optional[str] = None, **_ignored) -> str:
    """Drop-in for huggingface_hub.hf_hub_download. Returns the local file path.

    *filename* is matched literally: glob metacharacters in it are escaped
    before it is handed to `pull` as the ``include`` filter, so a name such as
    ``weights[1].bin`` selects exactly that artifact. A truthy *revision* is
    appended to the reference as ``repo_id@revision``. Any other keyword
    argument is accepted and ignored.

    Raises:
        FileNotFoundError: The pull succeeded but *filename* is not present in
            the resulting snapshot, instead of returning a path to nothing.
    """
    import glob

    ref = repo_id if not revision else f"{repo_id}@{revision}"
    res = pull(ref, catalog=catalog, cache_dir=cache_dir, include=[glob.escape(filename)])
    local_file = res.path / filename
    if not local_file.is_file():
        raise FileNotFoundError(f"{filename!r} not found in {res.repo_id}@{res.revision}")
    return str(local_file)
|
ossllms/cache.py
ADDED
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""HF-compatible cache with a blob store + hardlink dedup.
|
|
2
|
+
|
|
3
|
+
Layout (identical to huggingface_hub, so transformers/vllm/llama.cpp load unchanged
|
|
4
|
+
AND versions/quants/repos share bytes):
|
|
5
|
+
|
|
6
|
+
<cache>/models--<org>--<name>/
|
|
7
|
+
blobs/<sha256> one physical copy per unique file
|
|
8
|
+
snapshots/<revision>/<path> hardlink into blobs/
|
|
9
|
+
refs/<channel> e.g. refs/main -> a revision
|
|
10
|
+
|
|
11
|
+
Identical files across versions point at the same blob inode, so disk cost is the
|
|
12
|
+
size of the UNIQUE bytes, not N x per version. `reuse_from_blobs` lets a pull skip
|
|
13
|
+
downloading any file already present as a blob (the "only changed bytes move"
|
|
14
|
+
update path in docs/VERSIONING.md).
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import List
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def repo_folder_name(repo_id: str) -> str:
    """Return the cache folder name for *repo_id*: ``models--`` plus the id with each ``/`` turned into ``--``."""
    flattened = "--".join(repo_id.split("/"))
    return f"models--{flattened}"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def repo_dir(cache_root, repo_id: str) -> Path:
    """Return the per-repository directory for *repo_id* beneath *cache_root*."""
    root = Path(cache_root)
    return root / repo_folder_name(repo_id)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def blobs_dir(cache_root, repo_id: str) -> Path:
    """Return the ``blobs`` directory inside the repository's cache folder."""
    return repo_dir(cache_root, repo_id).joinpath("blobs")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def blob_path(cache_root, repo_id: str, sha256: str) -> Path:
    """Return the blob-store location for the file whose digest is *sha256*."""
    store = blobs_dir(cache_root, repo_id)
    return store / sha256
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def snapshot_dir(cache_root, repo_id: str, revision: str) -> Path:
    """Return the ``snapshots/<revision>`` directory inside the repository's cache folder."""
    return repo_dir(cache_root, repo_id).joinpath("snapshots", revision)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def has_blob(cache_root, repo_id: str, sha256: str) -> bool:
    """Report whether a blob named *sha256* already exists for *repo_id*."""
    candidate = blob_path(cache_root, repo_id, sha256)
    return candidate.exists()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _link_or_copy(src: Path, dst: Path) -> None:
|
|
49
|
+
dst.parent.mkdir(parents=True, exist_ok=True)
|
|
50
|
+
if dst.exists():
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
os.link(src, dst) # hardlink (same filesystem)
|
|
54
|
+
except OSError:
|
|
55
|
+
shutil.copy2(src, dst) # cross-filesystem fallback
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def reuse_from_blobs(staged_dir, cache_root, repo_id: str, artifacts) -> List[str]:
    """Link blobs that are already cached into the staging dir; return their paths.

    For each artifact whose ``sha256`` names an existing blob, the blob is
    hardlinked (or copied) to the artifact's ``path`` under *staged_dir*.
    Call this before downloading so files already on disk (from a prior
    version, a quant, or another pull) are not fetched again.
    """
    staging_root = Path(staged_dir)
    hits: List[str] = []
    for artifact in artifacts:
        cached = blob_path(cache_root, repo_id, artifact["sha256"])
        if not cached.exists():
            continue
        rel = artifact["path"]
        _link_or_copy(cached, staging_root / rel)
        hits.append(rel)
    return hits
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def place_into_cache(staged_dir, cache_root, repo_id: str, revision: str, manifest=None) -> Path:
    """Store staged files as blobs (keyed by sha256) and hardlink them into the
    snapshot dir. Files in the manifest are deduped via the blob store; any extras
    are copied as-is.

    Args:
        staged_dir: Directory holding the files to publish into the cache.
        cache_root: Root of the cache.
        repo_id: Repository id used to derive the cache folder.
        revision: Snapshot name; also written to ``refs/main``.
        manifest: Optional manifest whose ``artifacts`` map ``path`` to ``sha256``.

    Returns:
        The snapshot directory for *revision*.
    """
    target = snapshot_dir(cache_root, repo_id, revision)
    target.mkdir(parents=True, exist_ok=True)
    staged = Path(staged_dir)
    sha_by_path = {a["path"]: a["sha256"] for a in (manifest or {}).get("artifacts", [])}

    for src in staged.rglob("*"):
        if not src.is_file():
            continue
        rel = src.relative_to(staged)
        # Manifest paths are "/"-separated, so look them up in POSIX form.
        # str(rel) uses "\\" on Windows, which made nested artifacts miss the
        # manifest and bypass the blob store.
        sha = sha_by_path.get(rel.as_posix())
        snap_dst = target / rel
        snap_dst.parent.mkdir(parents=True, exist_ok=True)
        if sha:
            bp = blob_path(cache_root, repo_id, sha)
            _link_or_copy(src, bp)  # store blob (idempotent)
            # Replace any previous snapshot entry so it points at this blob.
            if snap_dst.exists():
                snap_dst.unlink()
            _link_or_copy(bp, snap_dst)  # snapshot hardlinks to blob
        elif not snap_dst.exists():
            # Files outside the manifest are copied once and never overwritten.
            shutil.copy2(src, snap_dst)

    refs = repo_dir(cache_root, repo_id) / "refs"
    refs.mkdir(parents=True, exist_ok=True)
    (refs / "main").write_text(revision, encoding="utf-8")
    return target
|