ossllms 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62) hide show
  1. ossllms-0.1.0/.gitignore +49 -0
  2. ossllms-0.1.0/PKG-INFO +120 -0
  3. ossllms-0.1.0/README.md +98 -0
  4. ossllms-0.1.0/ossllms/__init__.py +33 -0
  5. ossllms-0.1.0/ossllms/api.py +194 -0
  6. ossllms-0.1.0/ossllms/cache.py +102 -0
  7. ossllms-0.1.0/ossllms/catalog.py +517 -0
  8. ossllms-0.1.0/ossllms/cli.py +1502 -0
  9. ossllms-0.1.0/ossllms/compat.py +31 -0
  10. ossllms-0.1.0/ossllms/config.py +75 -0
  11. ossllms-0.1.0/ossllms/contrib_plan.py +489 -0
  12. ossllms-0.1.0/ossllms/contrib_runtime.py +302 -0
  13. ossllms-0.1.0/ossllms/contrib_scan.py +557 -0
  14. ossllms-0.1.0/ossllms/contrib_worker.py +526 -0
  15. ossllms-0.1.0/ossllms/denylist.py +251 -0
  16. ossllms-0.1.0/ossllms/engine.py +428 -0
  17. ossllms-0.1.0/ossllms/hf.py +326 -0
  18. ossllms-0.1.0/ossllms/keys.py +228 -0
  19. ossllms-0.1.0/ossllms/manifest.py +109 -0
  20. ossllms-0.1.0/ossllms/provenance.py +178 -0
  21. ossllms-0.1.0/ossllms/publish.py +706 -0
  22. ossllms-0.1.0/ossllms/reachability.py +78 -0
  23. ossllms-0.1.0/ossllms/refs.py +172 -0
  24. ossllms-0.1.0/ossllms/resolve.py +213 -0
  25. ossllms-0.1.0/ossllms/search.py +367 -0
  26. ossllms-0.1.0/ossllms/seed.py +859 -0
  27. ossllms-0.1.0/ossllms/seed_engine.py +444 -0
  28. ossllms-0.1.0/ossllms/signing.py +64 -0
  29. ossllms-0.1.0/ossllms/torrent.py +407 -0
  30. ossllms-0.1.0/ossllms/verify.py +269 -0
  31. ossllms-0.1.0/pyproject.toml +36 -0
  32. ossllms-0.1.0/tests/conftest.py +6 -0
  33. ossllms-0.1.0/tests/fixtures/__init__.py +1 -0
  34. ossllms-0.1.0/tests/fixtures/v0_catalog.py +205 -0
  35. ossllms-0.1.0/tests/helpers.py +104 -0
  36. ossllms-0.1.0/tests/test_blackholed_operator_scaffold.py +28 -0
  37. ossllms-0.1.0/tests/test_catalog_refs.py +181 -0
  38. ossllms-0.1.0/tests/test_cli.py +1076 -0
  39. ossllms-0.1.0/tests/test_contrib_plan.py +274 -0
  40. ossllms-0.1.0/tests/test_contrib_runtime.py +172 -0
  41. ossllms-0.1.0/tests/test_contrib_scan.py +299 -0
  42. ossllms-0.1.0/tests/test_contrib_worker.py +162 -0
  43. ossllms-0.1.0/tests/test_dedup.py +71 -0
  44. ossllms-0.1.0/tests/test_denylist.py +156 -0
  45. ossllms-0.1.0/tests/test_e2e.py +130 -0
  46. ossllms-0.1.0/tests/test_engine.py +109 -0
  47. ossllms-0.1.0/tests/test_hf_metadata.py +128 -0
  48. ossllms-0.1.0/tests/test_http_resume.py +173 -0
  49. ossllms-0.1.0/tests/test_keys.py +104 -0
  50. ossllms-0.1.0/tests/test_provenance.py +119 -0
  51. ossllms-0.1.0/tests/test_publish.py +268 -0
  52. ossllms-0.1.0/tests/test_pull_bundle.py +90 -0
  53. ossllms-0.1.0/tests/test_reachability.py +53 -0
  54. ossllms-0.1.0/tests/test_refs.py +88 -0
  55. ossllms-0.1.0/tests/test_search.py +234 -0
  56. ossllms-0.1.0/tests/test_seed.py +295 -0
  57. ossllms-0.1.0/tests/test_seed_engine.py +274 -0
  58. ossllms-0.1.0/tests/test_signing_payload.py +133 -0
  59. ossllms-0.1.0/tests/test_torrent_builder.py +223 -0
  60. ossllms-0.1.0/tests/test_v0_fixture_catalog.py +89 -0
  61. ossllms-0.1.0/tests/test_v2_roots.py +114 -0
  62. ossllms-0.1.0/tests/test_verify.py +43 -0
@@ -0,0 +1,49 @@
1
+ # Secrets & keys — NEVER commit signing keys
2
+ *.key
3
+ *.pem
4
+ *.sec
5
+ *.minisign
6
+ minisign.key
7
+ *.private
8
+ secrets/
9
+ .env
10
+ .env.*
11
+
12
+ # Model data / large artifacts (these live in the swarm, not git)
13
+ data/
14
+ cache/
15
+ e2e/artifacts/
16
+ *.safetensors
17
+ *.gguf
18
+ *.bin
19
+ *.pt
20
+ *.ckpt
21
+ *.torrent
22
+ *.sqlite
23
+ *.sqlite-*
24
+
25
+ # Curated package data is intentionally tiny and must ship in the wheel.
26
+ !sdk/ossllms/data/
27
+ !sdk/ossllms/data/manifest.schema.json
28
+ !sdk/ossllms/data/default-catalog/
29
+ !sdk/ossllms/data/default-catalog/**
30
+ !sdk/ossllms/data/trusted-keys/
31
+ !sdk/ossllms/data/trusted-keys/**
32
+
33
+ # Build / deps
34
+ node_modules/
35
+ dist/
36
+ build/
37
+ target/
38
+ __pycache__/
39
+ *.pyc
40
+ .venv/
41
+ venv/
42
+ *.egg-info/
43
+
44
+ # OS / editor
45
+ .DS_Store
46
+ Thumbs.db
47
+ .idea/
48
+ .vscode/
49
+ *.swp
ossllms-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,120 @@
1
+ Metadata-Version: 2.4
2
+ Name: ossllms
3
+ Version: 0.1.0
4
+ Summary: Pull open-weight AI models from the ossllms torrent network. A drop-in for huggingface_hub.
5
+ Project-URL: Homepage, https://ossllms.com
6
+ Project-URL: Source, https://github.com/gittb/ossllms
7
+ Author: ossllms contributors
8
+ License-Expression: Apache-2.0
9
+ Keywords: ai,bittorrent,huggingface,llm,p2p,preservation
10
+ Requires-Python: >=3.10
11
+ Requires-Dist: cryptography>=42
12
+ Provides-Extra: dev
13
+ Requires-Dist: jsonschema>=4; extra == 'dev'
14
+ Requires-Dist: pytest>=8; extra == 'dev'
15
+ Provides-Extra: live-e2e
16
+ Requires-Dist: huggingface-hub>=0.23; extra == 'live-e2e'
17
+ Provides-Extra: schema
18
+ Requires-Dist: jsonschema>=4; extra == 'schema'
19
+ Provides-Extra: torrent
20
+ Requires-Dist: libtorrent>=2.0; extra == 'torrent'
21
+ Description-Content-Type: text/markdown
22
+
23
+ # ossllms (Python SDK + CLI)
24
+
25
+ Pull open-weight AI models from the [ossllms](https://ossllms.com) preservation
26
+ network. A drop-in for `huggingface_hub` that resolves from the torrent layer,
27
+ verifies integrity + publisher signature, and lands files in the HF-compatible
28
+ cache. Full design: [`../docs/SDK.md`](../docs/SDK.md) and
29
+ [`../docs/INTEROP-HF.md`](../docs/INTEROP-HF.md).
30
+
31
+ ## Install
32
+
33
+ ```bash
34
+ pip install 'ossllms[torrent]' # V0 path: HTTP web-seed + libtorrent swarm/seed
35
+ ```
36
+
37
+ ## CLI (simplest UI)
38
+
39
+ ```bash
40
+ ossllms search minilm # find a useful default-catalog model
41
+ ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
42
+ ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4 --seed
43
+ ossllms contribute # start/attach to the managed seed worker
44
+ ossllms pull sha256:<64-hex> # resolve by catalog hash index
45
+ ossllms publish ./model # TTY: prompt/infer, sign, optionally seed
46
+ ossllms publish ./model --seed --yes-public # automation: publish and seed local bytes
47
+ ossllms contribute --dry-run # preview seed/publish candidates and caps
48
+ ossllms contribute --publish # TTY: choose publish/seed rows, consent, then seed
49
+ ossllms contribute --publish --yes-public # automation: publish selected cache candidates, then seed
50
+ ossllms contribute # attach to/start managed seed handoff worker
51
+ ossllms contribute --status # show worker metrics
52
+ ossllms contribute --stop # stop worker
53
+ ossllms ls # what's cached
54
+ ossllms verify hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
55
+ ```
56
+
57
+ If no catalog is configured, the packaged signed default catalog is used. Set
58
+ `OSSLLMS_CATALOG=https://catalog.ossllms.com` or pass `--catalog ...` for a
59
+ custom directory or http(s) catalog.
60
+
61
+ ## Python
62
+
63
+ ```python
64
+ from ossllms import snapshot_download, hf_hub_download, pull
65
+
66
+ path = snapshot_download("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
67
+ cfg = hf_hub_download(
68
+ "hf/sentence-transformers/all-MiniLM-L6-v2",
69
+ "config.json",
70
+ revision="0.0.1+1110a243fdf4",
71
+ )
72
+
73
+ res = pull("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
74
+ print(res.verdict.label, res.verdict.signer) # "Verified" / signer
75
+
76
+ import ossllms.compat # opt-in: route huggingface_hub downloads through ossllms
77
+ ```
78
+
79
+ ## What's built (v0.1)
80
+
81
+ - `pull`: resolve signed manifest → download via **HTTP web seeds** (works today;
82
+ `file://` supported) → verify **every file's SHA-256**, **v2 root**, and
83
+ **minisign/Ed25519** signature → place in HF cache. Integrity always enforced;
84
+ origin shown as Verified/Unverified (`--require-signature` to enforce).
85
+ `pull --seed` starts the managed contribution worker for the pulled snapshot
86
+ when the catalog has matching torrent metadata.
87
+ - Store-qualified refs, `sha256:` refs, and catalog-paired `magnet:` refs.
88
+ - `publish`: build and sign a public redistribution bundle from a local model
89
+ directory; optionally update a static V0 catalog/hash index with
90
+ `--catalog-dir`, and start the managed seed worker directly from the local
91
+ directory with `--seed`.
92
+ - `contribute --dry-run`: scans local HF cache roots, matches catalog hashes,
93
+ shows seed/publish candidates, public-publish warnings, and upload caps.
94
+ - `contribute`: attaches to an active managed worker or starts one for complete
95
+ in-network seed matches, persisting upload caps, zero-download seed metrics, and
96
+ a worker plan. In a terminal it renders the scan plan, lets the user select
97
+ publish/seed rows, shows caps, and requires public redistribution consent
98
+ before publish writes. When `ossllms[torrent]` is installed and
99
+ `release.torrent` metadata is available beside `release.json`, the worker
100
+ starts libtorrent seed mode from a hardlink-only view of the HF cache.
101
+ - Worker-state, HF metadata, provenance, and seed-mode handoff primitives.
102
+ - `huggingface_hub`-compatible `snapshot_download` / `hf_hub_download` + `compat` shim.
103
+ - Manifest schema validation, trust store (pinned keys), selective `--include`.
104
+ - libtorrent swarm engine: download scaffold plus live contribute seed adapter
105
+ behind the optional torrent extra. Local two-peer E2E gates prove fixture,
106
+ direct-publish, and contribute-publish no-web-seed swarms over libtorrent.
107
+ - Bundled default MiniLM metadata includes `release.torrent`, so the post-pull
108
+ default contribution path can start live seed mode when `ossllms[torrent]` is
109
+ installed.
110
+
111
+ ## Develop / test
112
+
113
+ ```bash
114
+ python -m venv .venv
115
+ .venv/bin/pip install -e '.[dev]'
116
+ .venv/bin/pytest # or: PYTHONPATH=. .venv/bin/pytest tests
117
+ ```
118
+
119
+ `Verified` = verified **origin + integrity** (+ a `matches HF` badge). It does
120
+ **not** mean the weights are safe to run.
@@ -0,0 +1,98 @@
1
+ # ossllms (Python SDK + CLI)
2
+
3
+ Pull open-weight AI models from the [ossllms](https://ossllms.com) preservation
4
+ network. A drop-in for `huggingface_hub` that resolves from the torrent layer,
5
+ verifies integrity + publisher signature, and lands files in the HF-compatible
6
+ cache. Full design: [`../docs/SDK.md`](../docs/SDK.md) and
7
+ [`../docs/INTEROP-HF.md`](../docs/INTEROP-HF.md).
8
+
9
+ ## Install
10
+
11
+ ```bash
12
+ pip install 'ossllms[torrent]' # V0 path: HTTP web-seed + libtorrent swarm/seed
13
+ ```
14
+
15
+ ## CLI (simplest UI)
16
+
17
+ ```bash
18
+ ossllms search minilm # find a useful default-catalog model
19
+ ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
20
+ ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4 --seed
21
+ ossllms contribute # start/attach to the managed seed worker
22
+ ossllms pull sha256:<64-hex> # resolve by catalog hash index
23
+ ossllms publish ./model # TTY: prompt/infer, sign, optionally seed
24
+ ossllms publish ./model --seed --yes-public # automation: publish and seed local bytes
25
+ ossllms contribute --dry-run # preview seed/publish candidates and caps
26
+ ossllms contribute --publish # TTY: choose publish/seed rows, consent, then seed
27
+ ossllms contribute --publish --yes-public # automation: publish selected cache candidates, then seed
28
+ ossllms contribute # attach to/start managed seed handoff worker
29
+ ossllms contribute --status # show worker metrics
30
+ ossllms contribute --stop # stop worker
31
+ ossllms ls # what's cached
32
+ ossllms verify hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
33
+ ```
34
+
35
+ If no catalog is configured, the packaged signed default catalog is used. Set
36
+ `OSSLLMS_CATALOG=https://catalog.ossllms.com` or pass `--catalog ...` for a
37
+ custom directory or http(s) catalog.
38
+
39
+ ## Python
40
+
41
+ ```python
42
+ from ossllms import snapshot_download, hf_hub_download, pull
43
+
44
+ path = snapshot_download("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
45
+ cfg = hf_hub_download(
46
+ "hf/sentence-transformers/all-MiniLM-L6-v2",
47
+ "config.json",
48
+ revision="0.0.1+1110a243fdf4",
49
+ )
50
+
51
+ res = pull("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
52
+ print(res.verdict.label, res.verdict.signer) # "Verified" / signer
53
+
54
+ import ossllms.compat # opt-in: route huggingface_hub downloads through ossllms
55
+ ```
56
+
57
+ ## What's built (v0.1)
58
+
59
+ - `pull`: resolve signed manifest → download via **HTTP web seeds** (works today;
60
+ `file://` supported) → verify **every file's SHA-256**, **v2 root**, and
61
+ **minisign/Ed25519** signature → place in HF cache. Integrity always enforced;
62
+ origin shown as Verified/Unverified (`--require-signature` to enforce).
63
+ `pull --seed` starts the managed contribution worker for the pulled snapshot
64
+ when the catalog has matching torrent metadata.
65
+ - Store-qualified refs, `sha256:` refs, and catalog-paired `magnet:` refs.
66
+ - `publish`: build and sign a public redistribution bundle from a local model
67
+ directory; optionally update a static V0 catalog/hash index with
68
+ `--catalog-dir`, and start the managed seed worker directly from the local
69
+ directory with `--seed`.
70
+ - `contribute --dry-run`: scans local HF cache roots, matches catalog hashes,
71
+ shows seed/publish candidates, public-publish warnings, and upload caps.
72
+ - `contribute`: attaches to an active managed worker or starts one for complete
73
+ in-network seed matches, persisting upload caps, zero-download seed metrics, and
74
+ a worker plan. In a terminal it renders the scan plan, lets the user select
75
+ publish/seed rows, shows caps, and requires public redistribution consent
76
+ before publish writes. When `ossllms[torrent]` is installed and
77
+ `release.torrent` metadata is available beside `release.json`, the worker
78
+ starts libtorrent seed mode from a hardlink-only view of the HF cache.
79
+ - Worker-state, HF metadata, provenance, and seed-mode handoff primitives.
80
+ - `huggingface_hub`-compatible `snapshot_download` / `hf_hub_download` + `compat` shim.
81
+ - Manifest schema validation, trust store (pinned keys), selective `--include`.
82
+ - libtorrent swarm engine: download scaffold plus live contribute seed adapter
83
+ behind the optional torrent extra. Local two-peer E2E gates prove fixture,
84
+ direct-publish, and contribute-publish no-web-seed swarms over libtorrent.
85
+ - Bundled default MiniLM metadata includes `release.torrent`, so the post-pull
86
+ default contribution path can start live seed mode when `ossllms[torrent]` is
87
+ installed.
88
+
89
+ ## Develop / test
90
+
91
+ ```bash
92
+ python -m venv .venv
93
+ .venv/bin/pip install -e '.[dev]'
94
+ .venv/bin/pytest # or: PYTHONPATH=. .venv/bin/pytest tests
95
+ ```
96
+
97
+ `Verified` = verified **origin + integrity** (+ a `matches HF` badge). It does
98
+ **not** mean the weights are safe to run.
@@ -0,0 +1,33 @@
1
+ """ossllms — pull open-weight AI models from the ossllms preservation network.
2
+
3
+ A drop-in for huggingface_hub that resolves from the torrent layer, verifies
4
+ integrity + publisher signature, and lands files in the HF-compatible cache.
5
+
6
+ from ossllms import snapshot_download
7
+ path = snapshot_download("Qwen/Qwen2.5-7B")
8
+ """
9
+ from __future__ import annotations
10
+
11
+ from .api import (
12
+ IntegrityError,
13
+ PullResult,
14
+ UnverifiedOriginError,
15
+ hf_hub_download,
16
+ pull,
17
+ snapshot_download,
18
+ )
19
+ from .verify import TrustStore, Verdict
20
+
21
+ __version__ = "0.1.0"
22
+
23
+ __all__ = [
24
+ "pull",
25
+ "snapshot_download",
26
+ "hf_hub_download",
27
+ "PullResult",
28
+ "Verdict",
29
+ "TrustStore",
30
+ "IntegrityError",
31
+ "UnverifiedOriginError",
32
+ "__version__",
33
+ ]
@@ -0,0 +1,194 @@
1
+ """High-level API. `pull()` is the core; `snapshot_download` / `hf_hub_download`
2
+ mirror the huggingface_hub names for drop-in use.
3
+ """
4
+ from __future__ import annotations
5
+
6
+ import shutil
7
+ import tempfile
8
+ from dataclasses import dataclass
9
+ from pathlib import Path, PurePosixPath
10
+ from typing import Callable, List, Optional
11
+
12
+ from . import config
13
+ from .cache import place_into_cache, reuse_from_blobs
14
+ from .denylist import load_denylist
15
+ from .engine import EngineError, get_engine
16
+ from .manifest import model_id, version
17
+ from .resolve import resolve_manifest_with_source
18
+ from .verify import TrustStore, Verdict, verify_manifest
19
+
20
+
21
class IntegrityError(RuntimeError):
    """Raised when pulled content cannot be trusted byte-for-byte.

    Used in this module for failed per-file verification, for artifact paths
    that are unsafe, and for artifact paths that escape a local source
    directory.
    """

    pass
23
+
24
+
25
class UnverifiedOriginError(RuntimeError):
    """Raised by `pull()` when `require_signature=True` and the verdict's
    origin check did not pass."""

    pass
27
+
28
+
29
@dataclass
class PullResult:
    """Value returned by `pull()` once files are verified and cached."""

    # Snapshot directory returned by place_into_cache().
    path: Path
    # Model id extracted from the resolved manifest.
    repo_id: str
    # The explicit `revision` argument, or the manifest version when none was given.
    revision: str
    # The resolved manifest.
    manifest: dict
    # Result of verify_manifest() over the selected files.
    verdict: Verdict
36
+
37
+
38
def pull(
    ref: str,
    *,
    catalog: Optional[str] = None,
    cache_dir=None,
    revision: Optional[str] = None,
    include: Optional[List[str]] = None,
    engine: str = "auto",
    engine_impl=None,
    trust: Optional[TrustStore] = None,
    denylist: Optional[str] = None,
    require_signature: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> PullResult:
    """Resolve, download, verify, and cache a model version.

    Integrity (per-file SHA-256) is ALWAYS enforced. Signature/origin failure
    raises only when require_signature=True; otherwise it returns an Unverified
    verdict (community upload) and proceeds, matching the trust UX.

    Args:
        ref: Model reference handed to `resolve_manifest_with_source`.
        catalog: Catalog location; falls back to `config.default_catalog()`.
        cache_dir: Cache root; falls back to `config.hf_hub_cache()`.
        revision: Snapshot revision name; defaults to the manifest version.
        include: fnmatch patterns selecting artifact paths; falsy selects all.
        engine: Engine name passed to `get_engine` when `engine_impl` is None.
        engine_impl: Pre-built engine used instead of `get_engine`; it must
            expose `.name` and `.fetch(manifest, dir, include=, progress=)`.
        trust: Trust store; defaults to `TrustStore.from_dir(config.trust_dir())`.
        denylist: Denylist location; falls back to `config.default_denylist()`.
        require_signature: Raise when the origin cannot be verified.
        progress: Optional callback receiving a one-line plan summary; it is
            also forwarded to the engine's `fetch`.

    Returns:
        A `PullResult` pointing at the populated snapshot directory.

    Raises:
        IntegrityError: An artifact path is unsafe or a file fails verification.
        UnverifiedOriginError: `require_signature` is set and origin is not ok.
        EngineError: The engine failed and no staged file was found corrupt.
    """
    catalog = catalog or config.default_catalog()
    cache_dir = Path(cache_dir) if cache_dir else config.hf_hub_cache()
    trust = trust if trust is not None else TrustStore.from_dir(config.trust_dir())

    resolution = resolve_manifest_with_source(ref, catalog)
    manifest = resolution.manifest
    repo_id = model_id(manifest)
    ver = version(manifest)
    rev = revision or ver
    denylist = denylist or config.default_denylist()
    if denylist is not None:
        # NOTE(review): presumably check_manifest raises for a denied manifest — confirm in denylist.py.
        load_denylist(denylist, trust).check_manifest(manifest)

    # Everything is staged in a throwaway directory and only moved into the
    # cache after verification has passed.
    with tempfile.TemporaryDirectory(prefix="ossllms-") as staged:
        selected = include_to_paths(manifest, include)
        # Validate every selected path up front, before anything is written.
        for path in selected:
            _safe_artifact_relpath(path)
        sel_set = set(selected)
        selected_artifacts = [a for a in manifest["artifacts"] if a["path"] in sel_set]

        # Reuse byte-identical files already in the blob store; fetch only the rest.
        reused = set(reuse_from_blobs(staged, cache_dir, repo_id, selected_artifacts))
        missing = [p for p in selected if p not in reused]
        local = set()
        if resolution.local_source_dir is not None and missing:
            # Files present next to a locally resolved manifest are copied, not fetched.
            local = set(
                _copy_from_local_source(
                    resolution.local_source_dir,
                    staged,
                    missing,
                )
            )
            missing = [p for p in missing if p not in local]

        # An engine is only constructed when something still has to be fetched.
        eng = None
        if missing:
            eng = engine_impl if engine_impl is not None else get_engine(engine, manifest)
        if progress:
            engine_name = eng.name if eng is not None else "none"
            progress(
                f"engine: {engine_name}; reuse {len(reused)} file(s), "
                f"local {len(local)} file(s), fetch {len(missing)}"
            )
        if missing:
            try:
                eng.fetch(manifest, staged, include=missing, progress=progress)
            except EngineError as exc:
                # Prefer reporting a corrupt staged file over the generic engine failure.
                _raise_integrity_error_if_staged_file_failed(manifest, staged, trust, sel_set, exc)
                raise

        verdict = verify_manifest(manifest, staged, trust, only=sel_set)

        if not verdict.integrity_ok:
            bad = [f"{f.path} ({f.reason})" for f in verdict.files if not f.ok]
            raise IntegrityError("integrity check failed: " + "; ".join(bad))
        if require_signature and not verdict.origin_ok:
            raise UnverifiedOriginError(
                "publisher signature could not be verified: " + "; ".join(verdict.messages)
            )

        target = place_into_cache(staged, cache_dir, repo_id, rev, manifest=manifest)

    return PullResult(target, repo_id, rev, manifest, verdict)
121
+
122
+
123
def include_to_paths(manifest: dict, include: Optional[List[str]]) -> List[str]:
    """Return the manifest artifact paths selected by `include`.

    `include` is a list of fnmatch-style patterns. When it is None or empty,
    every artifact path is returned. Manifest order is preserved.
    """
    import fnmatch

    every_path = [entry["path"] for entry in manifest["artifacts"]]
    if not include:
        return every_path

    def _wanted(candidate: str) -> bool:
        # A path is kept as soon as one pattern matches it.
        for pattern in include:
            if fnmatch.fnmatch(candidate, pattern):
                return True
        return False

    return [candidate for candidate in every_path if _wanted(candidate)]
133
+
134
+
135
def _copy_from_local_source(source_dir: Path, staged_dir, paths: List[str]) -> List[str]:
    """Copy the requested artifacts from a local directory into staging.

    Each path is validated, resolved under `source_dir`, and rejected with
    IntegrityError if the resolved location falls outside that directory.
    Paths that are not regular files in the source are skipped. Returns the
    artifact paths that were actually copied, in input order.
    """
    root = Path(source_dir).resolve()
    staging_root = Path(staged_dir)
    done: List[str] = []
    for name in paths:
        relative = _safe_artifact_relpath(name)
        candidate = (root / relative).resolve()
        try:
            # After resolving symlinks the file must still live under the root.
            candidate.relative_to(root)
        except ValueError as exc:
            raise IntegrityError(f"artifact path escapes local source: {name!r}") from exc
        if candidate.is_file():
            destination = staging_root / relative
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(candidate, destination)
            done.append(name)
    return done
153
+
154
+
155
+ def _safe_artifact_relpath(path: str) -> Path:
156
+ if not isinstance(path, str) or not path:
157
+ raise IntegrityError(f"unsafe artifact path: {path!r}")
158
+ rel = PurePosixPath(path)
159
+ if rel.is_absolute() or not rel.parts or any(part in ("", ".", "..") for part in rel.parts):
160
+ raise IntegrityError(f"unsafe artifact path: {path!r}")
161
+ return Path(*rel.parts)
162
+
163
+
164
def _raise_integrity_error_if_staged_file_failed(
    manifest: dict,
    staged_dir,
    trust: TrustStore,
    only: set,
    cause: EngineError,
) -> None:
    """Turn an engine failure into IntegrityError when a staged file is bad.

    Re-verifies the staging directory and collects every file that failed for
    a reason other than "missing". If there is at least one, IntegrityError is
    raised with `cause` chained; otherwise the function returns normally.
    """
    verdict = verify_manifest(manifest, staged_dir, trust, only=only)
    failures = []
    for entry in verdict.files:
        # Files that simply were not fetched yet are not integrity failures.
        if entry.ok or entry.reason == "missing":
            continue
        failures.append(f"{entry.path} ({entry.reason})")
    if not failures:
        return
    raise IntegrityError("integrity check failed: " + "; ".join(failures)) from cause
175
+
176
+
177
+ # --- huggingface_hub-compatible surface -------------------------------------
178
+
179
def snapshot_download(repo_id: str, *, revision: Optional[str] = None,
                      allow_patterns: Optional[List[str]] = None,
                      cache_dir=None, catalog: Optional[str] = None,
                      **_ignored) -> str:
    """Drop-in for huggingface_hub.snapshot_download. Returns the local path.

    A given `revision` is appended to the ref as ``repo_id@revision``;
    `allow_patterns` is forwarded to `pull()` as its include filter. Any other
    keyword arguments are accepted and ignored.
    """
    if revision:
        target_ref = f"{repo_id}@{revision}"
    else:
        target_ref = repo_id
    result = pull(target_ref, catalog=catalog, cache_dir=cache_dir, include=allow_patterns)
    return str(result.path)
187
+
188
+
189
def hf_hub_download(repo_id: str, filename: str, *, revision: Optional[str] = None,
                    cache_dir=None, catalog: Optional[str] = None, **_ignored) -> str:
    """Drop-in for huggingface_hub.hf_hub_download. Returns the local file path.

    A given `revision` is appended to the ref as ``repo_id@revision``. Only
    `filename` is requested from `pull()`. Any other keyword arguments are
    accepted and ignored.
    """
    import glob

    ref = repo_id if not revision else f"{repo_id}@{revision}"
    # pull() treats `include` entries as fnmatch patterns. Escape the name so a
    # literal filename containing "*", "?" or "[" selects exactly that file
    # instead of being interpreted as a wildcard.
    res = pull(ref, catalog=catalog, cache_dir=cache_dir, include=[glob.escape(filename)])
    return str(res.path / filename)
@@ -0,0 +1,102 @@
1
+ """HF-compatible cache with a blob store + hardlink dedup.
2
+
3
+ Layout (identical to huggingface_hub, so transformers/vllm/llama.cpp load unchanged
4
+ AND versions/quants within the same repo share bytes):
5
+
6
+ <cache>/models--<org>--<name>/
7
+ blobs/<sha256> one physical copy per unique file
8
+ snapshots/<revision>/<path> hardlink into blobs/
9
+ refs/<channel> e.g. refs/main -> a revision
10
+
11
+ Identical files across versions point at the same blob inode, so disk cost is the
12
+ size of the UNIQUE bytes, not N x per version. `reuse_from_blobs` lets a pull skip
13
+ downloading any file already present as a blob (the "only changed bytes move"
14
+ update path in docs/VERSIONING.md).
15
+ """
16
+ from __future__ import annotations
17
+
18
+ import os
19
+ import shutil
20
+ from pathlib import Path
21
+ from typing import List
22
+
23
+
24
def repo_folder_name(repo_id: str) -> str:
    """Return the cache folder name for a repo: ``models--`` plus the id with
    every "/" replaced by "--"."""
    return "--".join(("models", *repo_id.split("/")))


def repo_dir(cache_root, repo_id: str) -> Path:
    """Return the directory of `repo_id` under `cache_root`."""
    return Path(cache_root).joinpath(repo_folder_name(repo_id))


def blobs_dir(cache_root, repo_id: str) -> Path:
    """Return the ``blobs`` directory of `repo_id`."""
    return repo_dir(cache_root, repo_id).joinpath("blobs")


def blob_path(cache_root, repo_id: str, sha256: str) -> Path:
    """Return the blob file path for content hash `sha256` in `repo_id`."""
    return blobs_dir(cache_root, repo_id).joinpath(sha256)


def snapshot_dir(cache_root, repo_id: str, revision: str) -> Path:
    """Return the ``snapshots/<revision>`` directory of `repo_id`."""
    return repo_dir(cache_root, repo_id).joinpath("snapshots", revision)


def has_blob(cache_root, repo_id: str, sha256: str) -> bool:
    """Return True when a blob for `sha256` already exists for `repo_id`."""
    location = blob_path(cache_root, repo_id, sha256)
    return location.exists()
46
+
47
+
48
+ def _link_or_copy(src: Path, dst: Path) -> None:
49
+ dst.parent.mkdir(parents=True, exist_ok=True)
50
+ if dst.exists():
51
+ return
52
+ try:
53
+ os.link(src, dst) # hardlink (same filesystem)
54
+ except OSError:
55
+ shutil.copy2(src, dst) # cross-filesystem fallback
56
+
57
+
58
def reuse_from_blobs(staged_dir, cache_root, repo_id: str, artifacts) -> List[str]:
    """Link blobs that already exist in the cache into the staging directory.

    For every artifact whose ``sha256`` blob is present for `repo_id`, the blob
    is hardlinked (or copied) to ``staged_dir/<path>``. Returns the artifact
    paths satisfied this way, so the caller can skip downloading them.
    """
    staging_root = Path(staged_dir)
    hits: List[str] = []
    for artifact in artifacts:
        blob = blob_path(cache_root, repo_id, artifact["sha256"])
        if not blob.exists():
            continue
        rel_path = artifact["path"]
        _link_or_copy(blob, staging_root / rel_path)
        hits.append(rel_path)
    return hits
72
+
73
+
74
def place_into_cache(staged_dir, cache_root, repo_id: str, revision: str, manifest=None) -> Path:
    """Store staged files as blobs (keyed by sha256) and hardlink them into the
    snapshot dir. Files in the manifest are deduped via the blob store; any extras
    are copied as-is.

    Args:
        staged_dir: Directory holding the files to be cached.
        cache_root: Root of the cache.
        repo_id: Model id used to derive the repo folder.
        revision: Snapshot name; also written to ``refs/main``.
        manifest: Optional manifest whose ``artifacts`` map paths to sha256.

    Returns:
        The snapshot directory for `revision`.
    """
    target = snapshot_dir(cache_root, repo_id, revision)
    target.mkdir(parents=True, exist_ok=True)
    staged = Path(staged_dir)
    sha_by_path = {a["path"]: a["sha256"] for a in (manifest or {}).get("artifacts", [])}

    for src in staged.rglob("*"):
        if not src.is_file():
            continue
        rel = src.relative_to(staged)
        # Look up with "/" separators: str(rel) uses backslashes on Windows and
        # would never match a nested manifest path, silently skipping dedup.
        sha = sha_by_path.get(rel.as_posix())
        snap_dst = target / rel
        snap_dst.parent.mkdir(parents=True, exist_ok=True)
        if sha:
            bp = blob_path(cache_root, repo_id, sha)
            _link_or_copy(src, bp)  # store blob (idempotent)
            # Replace any previous snapshot entry so it points at this blob.
            if snap_dst.exists():
                snap_dst.unlink()
            _link_or_copy(bp, snap_dst)  # snapshot hardlinks to blob
        elif not snap_dst.exists():
            shutil.copy2(src, snap_dst)

    refs = repo_dir(cache_root, repo_id) / "refs"
    refs.mkdir(parents=True, exist_ok=True)
    (refs / "main").write_text(revision, encoding="utf-8")
    return target