ossllms 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ossllms-0.1.0/.gitignore +49 -0
- ossllms-0.1.0/PKG-INFO +120 -0
- ossllms-0.1.0/README.md +98 -0
- ossllms-0.1.0/ossllms/__init__.py +33 -0
- ossllms-0.1.0/ossllms/api.py +194 -0
- ossllms-0.1.0/ossllms/cache.py +102 -0
- ossllms-0.1.0/ossllms/catalog.py +517 -0
- ossllms-0.1.0/ossllms/cli.py +1502 -0
- ossllms-0.1.0/ossllms/compat.py +31 -0
- ossllms-0.1.0/ossllms/config.py +75 -0
- ossllms-0.1.0/ossllms/contrib_plan.py +489 -0
- ossllms-0.1.0/ossllms/contrib_runtime.py +302 -0
- ossllms-0.1.0/ossllms/contrib_scan.py +557 -0
- ossllms-0.1.0/ossllms/contrib_worker.py +526 -0
- ossllms-0.1.0/ossllms/denylist.py +251 -0
- ossllms-0.1.0/ossllms/engine.py +428 -0
- ossllms-0.1.0/ossllms/hf.py +326 -0
- ossllms-0.1.0/ossllms/keys.py +228 -0
- ossllms-0.1.0/ossllms/manifest.py +109 -0
- ossllms-0.1.0/ossllms/provenance.py +178 -0
- ossllms-0.1.0/ossllms/publish.py +706 -0
- ossllms-0.1.0/ossllms/reachability.py +78 -0
- ossllms-0.1.0/ossllms/refs.py +172 -0
- ossllms-0.1.0/ossllms/resolve.py +213 -0
- ossllms-0.1.0/ossllms/search.py +367 -0
- ossllms-0.1.0/ossllms/seed.py +859 -0
- ossllms-0.1.0/ossllms/seed_engine.py +444 -0
- ossllms-0.1.0/ossllms/signing.py +64 -0
- ossllms-0.1.0/ossllms/torrent.py +407 -0
- ossllms-0.1.0/ossllms/verify.py +269 -0
- ossllms-0.1.0/pyproject.toml +36 -0
- ossllms-0.1.0/tests/conftest.py +6 -0
- ossllms-0.1.0/tests/fixtures/__init__.py +1 -0
- ossllms-0.1.0/tests/fixtures/v0_catalog.py +205 -0
- ossllms-0.1.0/tests/helpers.py +104 -0
- ossllms-0.1.0/tests/test_blackholed_operator_scaffold.py +28 -0
- ossllms-0.1.0/tests/test_catalog_refs.py +181 -0
- ossllms-0.1.0/tests/test_cli.py +1076 -0
- ossllms-0.1.0/tests/test_contrib_plan.py +274 -0
- ossllms-0.1.0/tests/test_contrib_runtime.py +172 -0
- ossllms-0.1.0/tests/test_contrib_scan.py +299 -0
- ossllms-0.1.0/tests/test_contrib_worker.py +162 -0
- ossllms-0.1.0/tests/test_dedup.py +71 -0
- ossllms-0.1.0/tests/test_denylist.py +156 -0
- ossllms-0.1.0/tests/test_e2e.py +130 -0
- ossllms-0.1.0/tests/test_engine.py +109 -0
- ossllms-0.1.0/tests/test_hf_metadata.py +128 -0
- ossllms-0.1.0/tests/test_http_resume.py +173 -0
- ossllms-0.1.0/tests/test_keys.py +104 -0
- ossllms-0.1.0/tests/test_provenance.py +119 -0
- ossllms-0.1.0/tests/test_publish.py +268 -0
- ossllms-0.1.0/tests/test_pull_bundle.py +90 -0
- ossllms-0.1.0/tests/test_reachability.py +53 -0
- ossllms-0.1.0/tests/test_refs.py +88 -0
- ossllms-0.1.0/tests/test_search.py +234 -0
- ossllms-0.1.0/tests/test_seed.py +295 -0
- ossllms-0.1.0/tests/test_seed_engine.py +274 -0
- ossllms-0.1.0/tests/test_signing_payload.py +133 -0
- ossllms-0.1.0/tests/test_torrent_builder.py +223 -0
- ossllms-0.1.0/tests/test_v0_fixture_catalog.py +89 -0
- ossllms-0.1.0/tests/test_v2_roots.py +114 -0
- ossllms-0.1.0/tests/test_verify.py +43 -0
ossllms-0.1.0/.gitignore
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
# Secrets & keys — NEVER commit signing keys
|
|
2
|
+
*.key
|
|
3
|
+
*.pem
|
|
4
|
+
*.sec
|
|
5
|
+
*.minisign
|
|
6
|
+
minisign.key
|
|
7
|
+
*.private
|
|
8
|
+
secrets/
|
|
9
|
+
.env
|
|
10
|
+
.env.*
|
|
11
|
+
|
|
12
|
+
# Model data / large artifacts (these live in the swarm, not git)
|
|
13
|
+
data/
|
|
14
|
+
cache/
|
|
15
|
+
e2e/artifacts/
|
|
16
|
+
*.safetensors
|
|
17
|
+
*.gguf
|
|
18
|
+
*.bin
|
|
19
|
+
*.pt
|
|
20
|
+
*.ckpt
|
|
21
|
+
*.torrent
|
|
22
|
+
*.sqlite
|
|
23
|
+
*.sqlite-*
|
|
24
|
+
|
|
25
|
+
# Curated package data is intentionally tiny and must ship in the wheel.
|
|
26
|
+
!sdk/ossllms/data/
|
|
27
|
+
!sdk/ossllms/data/manifest.schema.json
|
|
28
|
+
!sdk/ossllms/data/default-catalog/
|
|
29
|
+
!sdk/ossllms/data/default-catalog/**
|
|
30
|
+
!sdk/ossllms/data/trusted-keys/
|
|
31
|
+
!sdk/ossllms/data/trusted-keys/**
|
|
32
|
+
|
|
33
|
+
# Build / deps
|
|
34
|
+
node_modules/
|
|
35
|
+
dist/
|
|
36
|
+
build/
|
|
37
|
+
target/
|
|
38
|
+
__pycache__/
|
|
39
|
+
*.pyc
|
|
40
|
+
.venv/
|
|
41
|
+
venv/
|
|
42
|
+
*.egg-info/
|
|
43
|
+
|
|
44
|
+
# OS / editor
|
|
45
|
+
.DS_Store
|
|
46
|
+
Thumbs.db
|
|
47
|
+
.idea/
|
|
48
|
+
.vscode/
|
|
49
|
+
*.swp
|
ossllms-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ossllms
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Pull open-weight AI models from the ossllms torrent network. A drop-in for huggingface_hub.
|
|
5
|
+
Project-URL: Homepage, https://ossllms.com
|
|
6
|
+
Project-URL: Source, https://github.com/gittb/ossllms
|
|
7
|
+
Author: ossllms contributors
|
|
8
|
+
License-Expression: Apache-2.0
|
|
9
|
+
Keywords: ai,bittorrent,huggingface,llm,p2p,preservation
|
|
10
|
+
Requires-Python: >=3.10
|
|
11
|
+
Requires-Dist: cryptography>=42
|
|
12
|
+
Provides-Extra: dev
|
|
13
|
+
Requires-Dist: jsonschema>=4; extra == 'dev'
|
|
14
|
+
Requires-Dist: pytest>=8; extra == 'dev'
|
|
15
|
+
Provides-Extra: live-e2e
|
|
16
|
+
Requires-Dist: huggingface-hub>=0.23; extra == 'live-e2e'
|
|
17
|
+
Provides-Extra: schema
|
|
18
|
+
Requires-Dist: jsonschema>=4; extra == 'schema'
|
|
19
|
+
Provides-Extra: torrent
|
|
20
|
+
Requires-Dist: libtorrent>=2.0; extra == 'torrent'
|
|
21
|
+
Description-Content-Type: text/markdown
|
|
22
|
+
|
|
23
|
+
# ossllms (Python SDK + CLI)
|
|
24
|
+
|
|
25
|
+
Pull open-weight AI models from the [ossllms](https://ossllms.com) preservation
|
|
26
|
+
network. A drop-in for `huggingface_hub` that resolves from the torrent layer,
|
|
27
|
+
verifies integrity + publisher signature, and lands files in the HF-compatible
|
|
28
|
+
cache. Full design: [`../docs/SDK.md`](../docs/SDK.md) and
|
|
29
|
+
[`../docs/INTEROP-HF.md`](../docs/INTEROP-HF.md).
|
|
30
|
+
|
|
31
|
+
## Install
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
pip install 'ossllms[torrent]' # V0 path: HTTP web-seed + libtorrent swarm/seed
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
## CLI (simplest UI)
|
|
38
|
+
|
|
39
|
+
```bash
|
|
40
|
+
ossllms search minilm # find a useful default-catalog model
|
|
41
|
+
ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
|
|
42
|
+
ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4 --seed
|
|
43
|
+
ossllms contribute # start/attach to the managed seed worker
|
|
44
|
+
ossllms pull sha256:<64-hex> # resolve by catalog hash index
|
|
45
|
+
ossllms publish ./model # TTY: prompt/infer, sign, optionally seed
|
|
46
|
+
ossllms publish ./model --seed --yes-public # automation: publish and seed local bytes
|
|
47
|
+
ossllms contribute --dry-run # preview seed/publish candidates and caps
|
|
48
|
+
ossllms contribute --publish # TTY: choose publish/seed rows, consent, then seed
|
|
49
|
+
ossllms contribute --publish --yes-public # automation: publish selected cache candidates, then seed
|
|
50
|
+
ossllms contribute # attach to/start managed seed handoff worker
|
|
51
|
+
ossllms contribute --status # show worker metrics
|
|
52
|
+
ossllms contribute --stop # stop worker
|
|
53
|
+
ossllms ls # what's cached
|
|
54
|
+
ossllms verify hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
If no catalog is configured, the packaged signed default catalog is used. Set
|
|
58
|
+
`OSSLLMS_CATALOG=https://catalog.ossllms.com` or pass `--catalog ...` for a
|
|
59
|
+
custom directory or http(s) catalog.
|
|
60
|
+
|
|
61
|
+
## Python
|
|
62
|
+
|
|
63
|
+
```python
|
|
64
|
+
from ossllms import snapshot_download, hf_hub_download, pull
|
|
65
|
+
|
|
66
|
+
path = snapshot_download("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
|
|
67
|
+
cfg = hf_hub_download(
|
|
68
|
+
"hf/sentence-transformers/all-MiniLM-L6-v2",
|
|
69
|
+
"config.json",
|
|
70
|
+
revision="0.0.1+1110a243fdf4",
|
|
71
|
+
)
|
|
72
|
+
|
|
73
|
+
res = pull("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
|
|
74
|
+
print(res.verdict.label, res.verdict.signer) # "Verified" / signer
|
|
75
|
+
|
|
76
|
+
import ossllms.compat # opt-in: route huggingface_hub downloads through ossllms
|
|
77
|
+
```
|
|
78
|
+
|
|
79
|
+
## What's built (v0.1)
|
|
80
|
+
|
|
81
|
+
- `pull`: resolve signed manifest → download via **HTTP web seeds** (works today;
|
|
82
|
+
`file://` supported) → verify **every file's SHA-256**, **v2 root**, and
|
|
83
|
+
**minisign/Ed25519** signature → place in HF cache. Integrity always enforced;
|
|
84
|
+
origin shown as Verified/Unverified (`--require-signature` to enforce).
|
|
85
|
+
`pull --seed` starts the managed contribution worker for the pulled snapshot
|
|
86
|
+
when the catalog has matching torrent metadata.
|
|
87
|
+
- Store-qualified refs, `sha256:` refs, and catalog-paired `magnet:` refs.
|
|
88
|
+
- `publish`: build and sign a public redistribution bundle from a local model
|
|
89
|
+
directory; optionally update a static V0 catalog/hash index with
|
|
90
|
+
`--catalog-dir`, and start the managed seed worker directly from the local
|
|
91
|
+
directory with `--seed`.
|
|
92
|
+
- `contribute --dry-run`: scans local HF cache roots, matches catalog hashes,
|
|
93
|
+
shows seed/publish candidates, public-publish warnings, and upload caps.
|
|
94
|
+
- `contribute`: attaches to an active managed worker or starts one for complete
|
|
95
|
+
in-network seed matches, persisting upload caps, zero-download seed metrics, and
|
|
96
|
+
a worker plan. In a terminal it renders the scan plan, lets the user select
|
|
97
|
+
publish/seed rows, shows caps, and requires public redistribution consent
|
|
98
|
+
before publish writes. When `ossllms[torrent]` is installed and
|
|
99
|
+
`release.torrent` metadata is available beside `release.json`, the worker
|
|
100
|
+
starts libtorrent seed mode from a hardlink-only view of the HF cache.
|
|
101
|
+
- Worker-state, HF metadata, provenance, and seed-mode handoff primitives.
|
|
102
|
+
- `huggingface_hub`-compatible `snapshot_download` / `hf_hub_download` + `compat` shim.
|
|
103
|
+
- Manifest schema validation, trust store (pinned keys), selective `--include`.
|
|
104
|
+
- libtorrent swarm engine: download scaffold plus live contribute seed adapter
|
|
105
|
+
behind the optional torrent extra. Local two-peer E2E gates prove fixture,
|
|
106
|
+
direct-publish, and contribute-publish no-web-seed swarms over libtorrent.
|
|
107
|
+
- Bundled default MiniLM metadata includes `release.torrent`, so the post-pull
|
|
108
|
+
default contribution path can start live seed mode when `ossllms[torrent]` is
|
|
109
|
+
installed.
|
|
110
|
+
|
|
111
|
+
## Develop / test
|
|
112
|
+
|
|
113
|
+
```bash
|
|
114
|
+
python -m venv .venv
|
|
115
|
+
.venv/bin/pip install -e '.[dev]'
|
|
116
|
+
.venv/bin/pytest # or: PYTHONPATH=. .venv/bin/pytest tests
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
`Verified` = verified **origin + integrity** (+ a `matches HF` badge). It does
|
|
120
|
+
**not** mean the weights are safe to run.
|
ossllms-0.1.0/README.md
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# ossllms (Python SDK + CLI)
|
|
2
|
+
|
|
3
|
+
Pull open-weight AI models from the [ossllms](https://ossllms.com) preservation
|
|
4
|
+
network. A drop-in for `huggingface_hub` that resolves from the torrent layer,
|
|
5
|
+
verifies integrity + publisher signature, and lands files in the HF-compatible
|
|
6
|
+
cache. Full design: [`../docs/SDK.md`](../docs/SDK.md) and
|
|
7
|
+
[`../docs/INTEROP-HF.md`](../docs/INTEROP-HF.md).
|
|
8
|
+
|
|
9
|
+
## Install
|
|
10
|
+
|
|
11
|
+
```bash
|
|
12
|
+
pip install 'ossllms[torrent]' # V0 path: HTTP web-seed + libtorrent swarm/seed
|
|
13
|
+
```
|
|
14
|
+
|
|
15
|
+
## CLI (simplest UI)
|
|
16
|
+
|
|
17
|
+
```bash
|
|
18
|
+
ossllms search minilm # find a useful default-catalog model
|
|
19
|
+
ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
|
|
20
|
+
ossllms pull hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4 --seed
|
|
21
|
+
ossllms contribute # start/attach to the managed seed worker
|
|
22
|
+
ossllms pull sha256:<64-hex> # resolve by catalog hash index
|
|
23
|
+
ossllms publish ./model # TTY: prompt/infer, sign, optionally seed
|
|
24
|
+
ossllms publish ./model --seed --yes-public # automation: publish and seed local bytes
|
|
25
|
+
ossllms contribute --dry-run # preview seed/publish candidates and caps
|
|
26
|
+
ossllms contribute --publish # TTY: choose publish/seed rows, consent, then seed
|
|
27
|
+
ossllms contribute --publish --yes-public # automation: publish selected cache candidates, then seed
|
|
28
|
+
ossllms contribute # attach to/start managed seed handoff worker
|
|
29
|
+
ossllms contribute --status # show worker metrics
|
|
30
|
+
ossllms contribute --stop # stop worker
|
|
31
|
+
ossllms ls # what's cached
|
|
32
|
+
ossllms verify hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4
|
|
33
|
+
```
|
|
34
|
+
|
|
35
|
+
If no catalog is configured, the packaged signed default catalog is used. Set
|
|
36
|
+
`OSSLLMS_CATALOG=https://catalog.ossllms.com` or pass `--catalog ...` for a
|
|
37
|
+
custom directory or http(s) catalog.
|
|
38
|
+
|
|
39
|
+
## Python
|
|
40
|
+
|
|
41
|
+
```python
|
|
42
|
+
from ossllms import snapshot_download, hf_hub_download, pull
|
|
43
|
+
|
|
44
|
+
path = snapshot_download("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
|
|
45
|
+
cfg = hf_hub_download(
|
|
46
|
+
"hf/sentence-transformers/all-MiniLM-L6-v2",
|
|
47
|
+
"config.json",
|
|
48
|
+
revision="0.0.1+1110a243fdf4",
|
|
49
|
+
)
|
|
50
|
+
|
|
51
|
+
res = pull("hf/sentence-transformers/all-MiniLM-L6-v2@0.0.1+1110a243fdf4")
|
|
52
|
+
print(res.verdict.label, res.verdict.signer) # "Verified" / signer
|
|
53
|
+
|
|
54
|
+
import ossllms.compat # opt-in: route huggingface_hub downloads through ossllms
|
|
55
|
+
```
|
|
56
|
+
|
|
57
|
+
## What's built (v0.1)
|
|
58
|
+
|
|
59
|
+
- `pull`: resolve signed manifest → download via **HTTP web seeds** (works today;
|
|
60
|
+
`file://` supported) → verify **every file's SHA-256**, **v2 root**, and
|
|
61
|
+
**minisign/Ed25519** signature → place in HF cache. Integrity always enforced;
|
|
62
|
+
origin shown as Verified/Unverified (`--require-signature` to enforce).
|
|
63
|
+
`pull --seed` starts the managed contribution worker for the pulled snapshot
|
|
64
|
+
when the catalog has matching torrent metadata.
|
|
65
|
+
- Store-qualified refs, `sha256:` refs, and catalog-paired `magnet:` refs.
|
|
66
|
+
- `publish`: build and sign a public redistribution bundle from a local model
|
|
67
|
+
directory; optionally update a static V0 catalog/hash index with
|
|
68
|
+
`--catalog-dir`, and start the managed seed worker directly from the local
|
|
69
|
+
directory with `--seed`.
|
|
70
|
+
- `contribute --dry-run`: scans local HF cache roots, matches catalog hashes,
|
|
71
|
+
shows seed/publish candidates, public-publish warnings, and upload caps.
|
|
72
|
+
- `contribute`: attaches to an active managed worker or starts one for complete
|
|
73
|
+
in-network seed matches, persisting upload caps, zero-download seed metrics, and
|
|
74
|
+
a worker plan. In a terminal it renders the scan plan, lets the user select
|
|
75
|
+
publish/seed rows, shows caps, and requires public redistribution consent
|
|
76
|
+
before publish writes. When `ossllms[torrent]` is installed and
|
|
77
|
+
`release.torrent` metadata is available beside `release.json`, the worker
|
|
78
|
+
starts libtorrent seed mode from a hardlink-only view of the HF cache.
|
|
79
|
+
- Worker-state, HF metadata, provenance, and seed-mode handoff primitives.
|
|
80
|
+
- `huggingface_hub`-compatible `snapshot_download` / `hf_hub_download` + `compat` shim.
|
|
81
|
+
- Manifest schema validation, trust store (pinned keys), selective `--include`.
|
|
82
|
+
- libtorrent swarm engine: download scaffold plus live contribute seed adapter
|
|
83
|
+
behind the optional torrent extra. Local two-peer E2E gates prove fixture,
|
|
84
|
+
direct-publish, and contribute-publish no-web-seed swarms over libtorrent.
|
|
85
|
+
- Bundled default MiniLM metadata includes `release.torrent`, so the post-pull
|
|
86
|
+
default contribution path can start live seed mode when `ossllms[torrent]` is
|
|
87
|
+
installed.
|
|
88
|
+
|
|
89
|
+
## Develop / test
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
python -m venv .venv
|
|
93
|
+
.venv/bin/pip install -e '.[dev]'
|
|
94
|
+
.venv/bin/pytest # or: PYTHONPATH=. .venv/bin/pytest tests
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
`Verified` = verified **origin + integrity** (+ a `matches HF` badge). It does
|
|
98
|
+
**not** mean the weights are safe to run.
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""ossllms — pull open-weight AI models from the ossllms preservation network.
|
|
2
|
+
|
|
3
|
+
A drop-in for huggingface_hub that resolves from the torrent layer, verifies
|
|
4
|
+
integrity + publisher signature, and lands files in the HF-compatible cache.
|
|
5
|
+
|
|
6
|
+
from ossllms import snapshot_download
|
|
7
|
+
path = snapshot_download("Qwen/Qwen2.5-7B")
|
|
8
|
+
"""
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .api import (
|
|
12
|
+
IntegrityError,
|
|
13
|
+
PullResult,
|
|
14
|
+
UnverifiedOriginError,
|
|
15
|
+
hf_hub_download,
|
|
16
|
+
pull,
|
|
17
|
+
snapshot_download,
|
|
18
|
+
)
|
|
19
|
+
from .verify import TrustStore, Verdict
|
|
20
|
+
|
|
21
|
+
__version__ = "0.1.0"
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"pull",
|
|
25
|
+
"snapshot_download",
|
|
26
|
+
"hf_hub_download",
|
|
27
|
+
"PullResult",
|
|
28
|
+
"Verdict",
|
|
29
|
+
"TrustStore",
|
|
30
|
+
"IntegrityError",
|
|
31
|
+
"UnverifiedOriginError",
|
|
32
|
+
"__version__",
|
|
33
|
+
]
|
|
@@ -0,0 +1,194 @@
|
|
|
1
|
+
"""High-level API. `pull()` is the core; `snapshot_download` / `hf_hub_download`
|
|
2
|
+
mirror the huggingface_hub names for drop-in use.
|
|
3
|
+
"""
|
|
4
|
+
from __future__ import annotations
|
|
5
|
+
|
|
6
|
+
import shutil
|
|
7
|
+
import tempfile
|
|
8
|
+
from dataclasses import dataclass
|
|
9
|
+
from pathlib import Path, PurePosixPath
|
|
10
|
+
from typing import Callable, List, Optional
|
|
11
|
+
|
|
12
|
+
from . import config
|
|
13
|
+
from .cache import place_into_cache, reuse_from_blobs
|
|
14
|
+
from .denylist import load_denylist
|
|
15
|
+
from .engine import EngineError, get_engine
|
|
16
|
+
from .manifest import model_id, version
|
|
17
|
+
from .resolve import resolve_manifest_with_source
|
|
18
|
+
from .verify import TrustStore, Verdict, verify_manifest
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class IntegrityError(RuntimeError):
    """Raised when an artifact path is unsafe or a file fails its integrity check."""
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class UnverifiedOriginError(RuntimeError):
    """Raised when a signature is required but the publisher origin is unverified."""
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass
class PullResult:
    """Everything `pull()` hands back to the caller after a successful pull."""

    # Snapshot directory returned by place_into_cache().
    path: Path
    # Model identifier taken from the resolved manifest.
    repo_id: str
    # Revision used for the snapshot: the caller's value, else the manifest version.
    revision: str
    # The resolved manifest the files were checked against.
    manifest: dict
    # Outcome of verify_manifest() for the selected files.
    verdict: Verdict
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def pull(
    ref: str,
    *,
    catalog: Optional[str] = None,
    cache_dir=None,
    revision: Optional[str] = None,
    include: Optional[List[str]] = None,
    engine: str = "auto",
    engine_impl=None,
    trust: Optional[TrustStore] = None,
    denylist: Optional[str] = None,
    require_signature: bool = False,
    progress: Optional[Callable[[str], None]] = None,
) -> PullResult:
    """Resolve a model ref, stage its files, verify them, and place them in the cache.

    Files are staged in a temporary directory from three sources, in order:
    blobs already in the cache, the resolution's local source directory (if
    any), and finally the download engine for whatever is still missing.

    A failed integrity check always raises IntegrityError. An unverified
    publisher origin raises UnverifiedOriginError only when
    ``require_signature`` is true; otherwise the verdict is simply returned
    on the result.
    """
    if not catalog:
        catalog = config.default_catalog()
    cache_dir = config.hf_hub_cache() if not cache_dir else Path(cache_dir)
    if trust is None:
        trust = TrustStore.from_dir(config.trust_dir())

    resolution = resolve_manifest_with_source(ref, catalog)
    manifest = resolution.manifest
    repo_id = model_id(manifest)
    manifest_version = version(manifest)
    rev = revision if revision else manifest_version

    if not denylist:
        denylist = config.default_denylist()
    if denylist is not None:
        load_denylist(denylist, trust).check_manifest(manifest)

    with tempfile.TemporaryDirectory(prefix="ossllms-") as staged:
        wanted = include_to_paths(manifest, include)
        # Reject unsafe manifest paths before anything is written to disk.
        for artifact_path in wanted:
            _safe_artifact_relpath(artifact_path)
        wanted_set = set(wanted)
        wanted_artifacts = [
            artifact for artifact in manifest["artifacts"] if artifact["path"] in wanted_set
        ]

        # Files whose blobs are already cached are linked in, not fetched again.
        reused = set(reuse_from_blobs(staged, cache_dir, repo_id, wanted_artifacts))
        to_fetch = [p for p in wanted if p not in reused]

        local = set()
        if resolution.local_source_dir is not None and to_fetch:
            copied = _copy_from_local_source(resolution.local_source_dir, staged, to_fetch)
            local = set(copied)
            to_fetch = [p for p in to_fetch if p not in local]

        # An engine is only selected when something is left to download.
        eng = None
        if to_fetch:
            eng = get_engine(engine, manifest) if engine_impl is None else engine_impl

        if progress:
            engine_name = "none" if eng is None else eng.name
            progress(
                f"engine: {engine_name}; reuse {len(reused)} file(s), "
                f"local {len(local)} file(s), fetch {len(to_fetch)}"
            )

        if to_fetch:
            try:
                eng.fetch(manifest, staged, include=to_fetch, progress=progress)
            except EngineError as exc:
                # Surface a corrupt staged file as IntegrityError; otherwise
                # let the engine's own error propagate.
                _raise_integrity_error_if_staged_file_failed(
                    manifest, staged, trust, wanted_set, exc
                )
                raise

        verdict = verify_manifest(manifest, staged, trust, only=wanted_set)

        if not verdict.integrity_ok:
            bad = [f"{f.path} ({f.reason})" for f in verdict.files if not f.ok]
            raise IntegrityError("integrity check failed: " + "; ".join(bad))
        if require_signature and not verdict.origin_ok:
            raise UnverifiedOriginError(
                "publisher signature could not be verified: " + "; ".join(verdict.messages)
            )

        target = place_into_cache(staged, cache_dir, repo_id, rev, manifest=manifest)

    return PullResult(target, repo_id, rev, manifest, verdict)
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def include_to_paths(manifest: dict, include: Optional[List[str]]) -> List[str]:
    """Return the manifest artifact paths selected by ``include``.

    Args:
        manifest: Manifest dict with an ``"artifacts"`` list whose entries
            each carry a ``"path"``.
        include: fnmatch-style patterns. ``None`` or an empty value selects
            every artifact. A single string is treated as one pattern.

    Returns:
        Matching artifact paths, in manifest order.
    """
    import fnmatch

    if not include:
        return [a["path"] for a in manifest["artifacts"]]
    # A bare string would otherwise be iterated character by character, so
    # "*.json" would match everything through its "*" character.
    patterns = [include] if isinstance(include, str) else list(include)
    return [
        a["path"]
        for a in manifest["artifacts"]
        if any(fnmatch.fnmatch(a["path"], pat) for pat in patterns)
    ]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _copy_from_local_source(source_dir: Path, staged_dir, paths: List[str]) -> List[str]:
    """Copy the requested artifacts that exist under ``source_dir`` into staging.

    Each path is validated first, and its resolved location must still lie
    inside the resolved source directory. Paths that are not regular files
    in the source are skipped.

    Returns:
        The artifact paths that were actually copied, in input order.

    Raises:
        IntegrityError: if a path is unsafe or resolves outside the source.
    """
    root = Path(source_dir).resolve()
    staging_root = Path(staged_dir)
    done: List[str] = []
    for name in paths:
        relative = _safe_artifact_relpath(name)
        candidate = (root / relative).resolve()
        # resolve() follows symlinks, so this also catches links that point
        # outside the source directory.
        try:
            candidate.relative_to(root)
        except ValueError as exc:
            raise IntegrityError(f"artifact path escapes local source: {name!r}") from exc
        if candidate.is_file():
            destination = staging_root / relative
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(candidate, destination)
            done.append(name)
    return done
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _safe_artifact_relpath(path: str) -> Path:
|
|
156
|
+
if not isinstance(path, str) or not path:
|
|
157
|
+
raise IntegrityError(f"unsafe artifact path: {path!r}")
|
|
158
|
+
rel = PurePosixPath(path)
|
|
159
|
+
if rel.is_absolute() or not rel.parts or any(part in ("", ".", "..") for part in rel.parts):
|
|
160
|
+
raise IntegrityError(f"unsafe artifact path: {path!r}")
|
|
161
|
+
return Path(*rel.parts)
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def _raise_integrity_error_if_staged_file_failed(
    manifest: dict,
    staged_dir,
    trust: TrustStore,
    only: set,
    cause: EngineError,
) -> None:
    """Turn an engine failure into IntegrityError when a staged file is bad.

    Re-verifies the staged files. Any file that failed for a reason other
    than ``"missing"`` raises IntegrityError chained from ``cause``. If the
    only failures are missing files, the function returns normally.
    """
    verdict = verify_manifest(manifest, staged_dir, trust, only=only)
    failures = []
    for f in verdict.files:
        # Missing files are expected after an aborted fetch; they are not
        # evidence of corruption.
        if f.ok or f.reason == "missing":
            continue
        failures.append(f"{f.path} ({f.reason})")
    if not failures:
        return
    raise IntegrityError("integrity check failed: " + "; ".join(failures)) from cause
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
# --- huggingface_hub-compatible surface -------------------------------------
|
|
178
|
+
|
|
179
|
+
def snapshot_download(
    repo_id: str,
    *,
    revision: Optional[str] = None,
    allow_patterns: Optional[List[str]] = None,
    cache_dir=None,
    catalog: Optional[str] = None,
    **_ignored,
) -> str:
    """Stand-in for ``huggingface_hub.snapshot_download``.

    Builds ``repo_id@revision`` when a revision is given, pulls it with
    ``allow_patterns`` as the include filter, and returns the snapshot
    directory as a string. Other keyword arguments are accepted and ignored.
    """
    ref = f"{repo_id}@{revision}" if revision else repo_id
    result = pull(ref, catalog=catalog, cache_dir=cache_dir, include=allow_patterns)
    return str(result.path)
|
|
187
|
+
|
|
188
|
+
|
|
189
|
+
def hf_hub_download(
    repo_id: str,
    filename: str,
    *,
    revision: Optional[str] = None,
    cache_dir=None,
    catalog: Optional[str] = None,
    **_ignored,
) -> str:
    """Stand-in for ``huggingface_hub.hf_hub_download``.

    Pulls ``repo_id`` (or ``repo_id@revision``) with ``filename`` as the only
    include pattern and returns ``<snapshot dir>/<filename>`` as a string.
    Other keyword arguments are accepted and ignored.
    """
    ref = f"{repo_id}@{revision}" if revision else repo_id
    result = pull(ref, catalog=catalog, cache_dir=cache_dir, include=[filename])
    file_path = result.path / filename
    return str(file_path)
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""HF-compatible cache with a blob store + hardlink dedup.
|
|
2
|
+
|
|
3
|
+
Layout (identical to huggingface_hub, so transformers/vllm/llama.cpp load unchanged
|
|
4
|
+
AND versions/quants/repos share bytes):
|
|
5
|
+
|
|
6
|
+
<cache>/models--<org>--<name>/
|
|
7
|
+
blobs/<sha256> one physical copy per unique file
|
|
8
|
+
snapshots/<revision>/<path> hardlink into blobs/
|
|
9
|
+
refs/<channel> e.g. refs/main -> a revision
|
|
10
|
+
|
|
11
|
+
Identical files across versions point at the same blob inode, so disk cost is the
|
|
12
|
+
size of the UNIQUE bytes, not N x per version. `reuse_from_blobs` lets a pull skip
|
|
13
|
+
downloading any file already present as a blob (the "only changed bytes move"
|
|
14
|
+
update path in docs/VERSIONING.md).
|
|
15
|
+
"""
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import os
|
|
19
|
+
import shutil
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import List
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def repo_folder_name(repo_id: str) -> str:
    """Return the cache folder name for a repo: ``models--`` plus the id with ``/`` turned into ``--``."""
    segments = repo_id.split("/")
    return "--".join(["models", *segments])
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def repo_dir(cache_root, repo_id: str) -> Path:
    """Return the directory for ``repo_id`` directly under ``cache_root``."""
    folder = repo_folder_name(repo_id)
    return Path(cache_root).joinpath(folder)
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def blobs_dir(cache_root, repo_id: str) -> Path:
    """Return the ``blobs`` directory inside the repo's cache directory."""
    base = repo_dir(cache_root, repo_id)
    return base.joinpath("blobs")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def blob_path(cache_root, repo_id: str, sha256: str) -> Path:
    """Return the path of the blob named ``sha256`` in the repo's blob directory."""
    store = blobs_dir(cache_root, repo_id)
    return store.joinpath(sha256)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def snapshot_dir(cache_root, repo_id: str, revision: str) -> Path:
    """Return ``<repo dir>/snapshots/<revision>`` for the given repo."""
    base = repo_dir(cache_root, repo_id)
    return base.joinpath("snapshots", revision)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def has_blob(cache_root, repo_id: str, sha256: str) -> bool:
    """Report whether a blob named ``sha256`` exists in the repo's blob directory."""
    candidate = blob_path(cache_root, repo_id, sha256)
    return candidate.exists()
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _link_or_copy(src: Path, dst: Path) -> None:
|
|
49
|
+
dst.parent.mkdir(parents=True, exist_ok=True)
|
|
50
|
+
if dst.exists():
|
|
51
|
+
return
|
|
52
|
+
try:
|
|
53
|
+
os.link(src, dst) # hardlink (same filesystem)
|
|
54
|
+
except OSError:
|
|
55
|
+
shutil.copy2(src, dst) # cross-filesystem fallback
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def reuse_from_blobs(staged_dir, cache_root, repo_id: str, artifacts) -> List[str]:
    """Link blobs that are already cached into the staging directory.

    For each artifact whose ``sha256`` blob exists for this repo, the blob is
    hardlinked (or copied) to the artifact's path under ``staged_dir``.
    Intended to run before downloading so those files are not fetched again.

    Returns:
        The paths of the artifacts satisfied from the blob store, in input order.
    """
    staging_root = Path(staged_dir)
    hits: List[str] = []
    for artifact in artifacts:
        blob = blob_path(cache_root, repo_id, artifact["sha256"])
        if not blob.exists():
            continue
        _link_or_copy(blob, staging_root / artifact["path"])
        hits.append(artifact["path"])
    return hits
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def place_into_cache(staged_dir, cache_root, repo_id: str, revision: str, manifest=None) -> Path:
    """Move staged files into the repo's blob store and snapshot directory.

    Each staged file listed in the manifest is stored as a blob keyed by its
    sha256 and then linked into ``snapshots/<revision>/<path>``, replacing any
    file already at that snapshot path. Staged files not in the manifest are
    copied into the snapshot as-is unless something already exists there.
    Finally ``refs/main`` is written with the revision.

    Args:
        staged_dir: Directory holding the staged files.
        cache_root: Root of the cache.
        repo_id: Repo identifier used to derive the cache folder.
        revision: Snapshot name, also written to ``refs/main``.
        manifest: Optional manifest whose ``artifacts`` map paths to sha256.

    Returns:
        The snapshot directory.
    """
    target = snapshot_dir(cache_root, repo_id, revision)
    target.mkdir(parents=True, exist_ok=True)
    staged = Path(staged_dir)
    sha_by_path = {a["path"]: a["sha256"] for a in (manifest or {}).get("artifacts", [])}

    for src in staged.rglob("*"):
        if not src.is_file():
            continue
        rel = src.relative_to(staged)
        # Manifest paths use "/" separators. str(rel) uses backslashes on
        # Windows, so nested files would miss this lookup and skip the blob store.
        sha = sha_by_path.get(rel.as_posix())
        snap_dst = target / rel
        snap_dst.parent.mkdir(parents=True, exist_ok=True)
        if sha:
            bp = blob_path(cache_root, repo_id, sha)
            _link_or_copy(src, bp)  # store blob (no-op when it already exists)
            # Drop any stale snapshot entry so the path points at this blob.
            if snap_dst.exists():
                snap_dst.unlink()
            _link_or_copy(bp, snap_dst)  # snapshot hardlinks to blob
        elif not snap_dst.exists():
            shutil.copy2(src, snap_dst)

    refs = repo_dir(cache_root, repo_id) / "refs"
    refs.mkdir(parents=True, exist_ok=True)
    (refs / "main").write_text(revision, encoding="utf-8")
    return target
|