blastbox 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blastbox-0.1.0/LICENSE +21 -0
- blastbox-0.1.0/PKG-INFO +158 -0
- blastbox-0.1.0/README.md +120 -0
- blastbox-0.1.0/pyproject.toml +66 -0
- blastbox-0.1.0/setup.cfg +4 -0
- blastbox-0.1.0/src/blastbox/__init__.py +13 -0
- blastbox-0.1.0/src/blastbox/contract/__init__.py +30 -0
- blastbox-0.1.0/src/blastbox/contract/envelope.py +173 -0
- blastbox-0.1.0/src/blastbox/contract/leaf.py +68 -0
- blastbox-0.1.0/src/blastbox/contract/nodes.py +172 -0
- blastbox-0.1.0/src/blastbox/contract/walk.py +35 -0
- blastbox-0.1.0/src/blastbox/errors.py +75 -0
- blastbox-0.1.0/src/blastbox/host/__init__.py +0 -0
- blastbox-0.1.0/src/blastbox/host/cli.py +142 -0
- blastbox-0.1.0/src/blastbox/host/dispatch.py +632 -0
- blastbox-0.1.0/src/blastbox/host/ingress/__init__.py +0 -0
- blastbox-0.1.0/src/blastbox/host/ingress/app.py +561 -0
- blastbox-0.1.0/src/blastbox/host/ingress/middleware.py +139 -0
- blastbox-0.1.0/src/blastbox/host/jobs/__init__.py +17 -0
- blastbox-0.1.0/src/blastbox/host/jobs/base.py +110 -0
- blastbox-0.1.0/src/blastbox/host/jobs/memory.py +62 -0
- blastbox-0.1.0/src/blastbox/host/jobs/redis_store.py +131 -0
- blastbox-0.1.0/src/blastbox/host/jobs/retention.py +163 -0
- blastbox-0.1.0/src/blastbox/host/jobs/sql_store.py +309 -0
- blastbox-0.1.0/src/blastbox/host/pool.py +555 -0
- blastbox-0.1.0/src/blastbox/host/pool_config.py +105 -0
- blastbox-0.1.0/src/blastbox/host/runtime/__init__.py +42 -0
- blastbox-0.1.0/src/blastbox/host/runtime/docker.py +408 -0
- blastbox-0.1.0/src/blastbox/host/runtime/firecracker.py +1072 -0
- blastbox-0.1.0/src/blastbox/host/runtime/host_limits.py +233 -0
- blastbox-0.1.0/src/blastbox/host/trust.py +145 -0
- blastbox-0.1.0/src/blastbox/limits.py +106 -0
- blastbox-0.1.0/src/blastbox/observability/__init__.py +23 -0
- blastbox-0.1.0/src/blastbox/observability/logging.py +41 -0
- blastbox-0.1.0/src/blastbox/observability/metrics.py +131 -0
- blastbox-0.1.0/src/blastbox/worker/__init__.py +25 -0
- blastbox-0.1.0/src/blastbox/worker/engine.py +89 -0
- blastbox-0.1.0/src/blastbox/worker/fc_guest.py +167 -0
- blastbox-0.1.0/src/blastbox/worker/fc_warm.py +173 -0
- blastbox-0.1.0/src/blastbox/worker/harness.py +249 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/__init__.py +31 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/base.py +97 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/bwrap.py +423 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/container.py +303 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/detect.py +226 -0
- blastbox-0.1.0/src/blastbox/worker/sandbox/nsjail.py +339 -0
- blastbox-0.1.0/src/blastbox/worker/warm.py +370 -0
- blastbox-0.1.0/src/blastbox.egg-info/PKG-INFO +158 -0
- blastbox-0.1.0/src/blastbox.egg-info/SOURCES.txt +54 -0
- blastbox-0.1.0/src/blastbox.egg-info/dependency_links.txt +1 -0
- blastbox-0.1.0/src/blastbox.egg-info/entry_points.txt +2 -0
- blastbox-0.1.0/src/blastbox.egg-info/requires.txt +17 -0
- blastbox-0.1.0/src/blastbox.egg-info/top_level.txt +1 -0
- blastbox-0.1.0/tests/test_errors.py +131 -0
- blastbox-0.1.0/tests/test_limits.py +157 -0
- blastbox-0.1.0/tests/test_observability.py +189 -0
blastbox-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Will Metcalf
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
blastbox-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: blastbox
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Reusable detonation framework: run untrusted documents through disposable, hardened workers
|
|
5
|
+
Author: Will Metcalf
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/wmetcalf/blastbox
|
|
8
|
+
Project-URL: Repository, https://github.com/wmetcalf/blastbox
|
|
9
|
+
Project-URL: Issues, https://github.com/wmetcalf/blastbox/issues
|
|
10
|
+
Keywords: sandbox,malware,detonation,untrusted,document,isolation
|
|
11
|
+
Classifier: Development Status :: 4 - Beta
|
|
12
|
+
Classifier: Intended Audience :: Developers
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: POSIX :: Linux
|
|
15
|
+
Classifier: Programming Language :: Python :: 3
|
|
16
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
17
|
+
Classifier: Topic :: Security
|
|
18
|
+
Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
|
|
19
|
+
Requires-Python: >=3.12
|
|
20
|
+
Description-Content-Type: text/markdown
|
|
21
|
+
License-File: LICENSE
|
|
22
|
+
Requires-Dist: pydantic>=2.6.0
|
|
23
|
+
Provides-Extra: host
|
|
24
|
+
Requires-Dist: fastapi>=0.110; extra == "host"
|
|
25
|
+
Requires-Dist: uvicorn[standard]>=0.27; extra == "host"
|
|
26
|
+
Requires-Dist: python-multipart>=0.0.9; extra == "host"
|
|
27
|
+
Requires-Dist: structlog>=24.1; extra == "host"
|
|
28
|
+
Requires-Dist: prometheus-client>=0.20; extra == "host"
|
|
29
|
+
Requires-Dist: psycopg[binary]>=3.2; extra == "host"
|
|
30
|
+
Requires-Dist: redis>=5.0; extra == "host"
|
|
31
|
+
Provides-Extra: dev
|
|
32
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
33
|
+
Requires-Dist: mypy>=1.9.0; extra == "dev"
|
|
34
|
+
Requires-Dist: ruff>=0.3.0; extra == "dev"
|
|
35
|
+
Requires-Dist: fakeredis>=2.21; extra == "dev"
|
|
36
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
37
|
+
Dynamic: license-file
|
|
38
|
+
|
|
39
|
+
# blastbox
|
|
40
|
+
|
|
41
|
+
**A reusable framework for running untrusted/malicious documents through disposable, hardened
|
|
42
|
+
workers** — and turning their output into a typed, host-validated result you can trust.
|
|
43
|
+
|
|
44
|
+
Extracted from the common substrate of two production services (a LibreOffice document→image
|
|
45
|
+
rasterizer and a Tika recursive-extraction service) that had each grown a parallel, drifting copy of
|
|
46
|
+
the same machinery. blastbox is the single, audited-once core; each service becomes a thin *engine*.
|
|
47
|
+
|
|
48
|
+
- **Write one function.** An engine implements `detonate(input, outdir, limits) -> DetonationResult`.
|
|
49
|
+
The framework gives it ingress, a JobStore, a disposable hardened worker per job, output-trust
|
|
50
|
+
validation, artifact serving, optional warm pooling, metrics, and a CLI — for free.
|
|
51
|
+
- **Output is never trusted.** A worker processes a malicious document; the host **re-seals its
|
|
52
|
+
output from disk** (recomputing hashes/sizes, confining paths) before believing a byte of it.
|
|
53
|
+
- **One untrusted doc per disposable slot** — always. Warm pooling pre-pays startup in the
|
|
54
|
+
background; it never reuses a worker across documents.
|
|
55
|
+
- **Typed, engine-shaped output.** A shared node library (`Page`, `EmbeddedResource`, `ExtractedText`,
|
|
56
|
+
a generic `Record` floor, recursive) lets engines be as specific or generic as they need, while the
|
|
57
|
+
framework validates a fixed security envelope identically for everyone.
|
|
58
|
+
|
|
59
|
+
Proven end-to-end against two real, language-diverse engines: a **Python + LibreOffice** rasterizer
|
|
60
|
+
and a **JVM + Tika** recursive extractor.
|
|
61
|
+
|
|
62
|
+
## Architecture
|
|
63
|
+
|
|
64
|
+
```
|
|
65
|
+
blastbox/
|
|
66
|
+
├── contract/ typed node tree + security envelope + seal/validate (registry-aware at any depth)
|
|
67
|
+
├── host/ LAYER 1 — host orchestrator (engine-agnostic) — needs blastbox[host]
|
|
68
|
+
│ ├── ingress FastAPI API + CLI: upload, status, artifact serving, /metrics
|
|
69
|
+
│ ├── jobs/ JobStore protocol + memory / sql / redis backends + retention
|
|
70
|
+
│ ├── dispatch claim → launch disposable worker → validate output → serve (+ warm path)
|
|
71
|
+
│ ├── runtime/ runc/runsc selection (fail-closed) + hardened worker `docker run` argv
|
|
72
|
+
│ ├── pool warm slot pool (one-doc-per-slot, never-reuse)
|
|
73
|
+
│ └── trust output-trust validator — re-seals worker output from disk
|
|
74
|
+
├── observability/ logging + metrics (top-level package, used by the host)
|
|
75
|
+
└── worker/ LAYER 2 — worker SDK (runs inside the disposable worker) — lean core
|
|
76
|
+
├── engine the seam: detect / warmup / detonate
|
|
77
|
+
├── harness read input → detonate → seal → write metadata.json
|
|
78
|
+
├── sandbox/ in-process hardening self-check + env-stripped subprocess execution
|
|
79
|
+
└── warm service lifecycle: boot → warmup → one job → exit
|
|
80
|
+
```
|
|
81
|
+
|
|
82
|
+
The host never imports an engine; it depends only on the **contract**. An engine never handles
|
|
83
|
+
hashes/paths defensively; the worker SDK and host do that in audited code.
|
|
84
|
+
|
|
85
|
+
## Writing an engine
|
|
86
|
+
|
|
87
|
+
```python
|
|
88
|
+
from pathlib import Path
|
|
89
|
+
from blastbox import Engine, DetonationResult, run_detonation
|
|
90
|
+
from blastbox.contract import Page, DeclaredArtifact, Detection, ArtifactRef, Dimensions
|
|
91
|
+
from blastbox.limits import Limits
|
|
92
|
+
|
|
93
|
+
class MyEngine:
|
|
94
|
+
name = "myengine"
|
|
95
|
+
formats = frozenset({"pdf"})
|
|
96
|
+
|
|
97
|
+
def detonate(self, input: Path, outdir: Path, limits: Limits) -> DetonationResult:
|
|
98
|
+
# ... render/extract; write artifact files into outdir ...
|
|
99
|
+
(outdir / "page-001.png").write_bytes(png_bytes)
|
|
100
|
+
return DetonationResult(
|
|
101
|
+
payload=Page(index=0, dims=Dimensions(width=210, height=297, unit="mm"),
|
|
102
|
+
image=ArtifactRef(id="p0")),
|
|
103
|
+
artifacts=[DeclaredArtifact(id="p0", path="page-001.png", kind="image")],
|
|
104
|
+
detected=Detection(label="pdf", mime="application/pdf", confidence=1.0, source="myengine"),
|
|
105
|
+
)
|
|
106
|
+
|
|
107
|
+
# worker entrypoint:
|
|
108
|
+
if __name__ == "__main__":
|
|
109
|
+
import sys
|
|
110
|
+
from blastbox.worker.harness import main
|
|
111
|
+
sys.exit(main(MyEngine()))
|
|
112
|
+
```
|
|
113
|
+
|
|
114
|
+
The harness seals your declared artifacts (recomputing sha256/size from disk, confining paths,
|
|
115
|
+
resolving references) and writes `metadata.json`. The host's `validate_worker_output` re-validates it.
|
|
116
|
+
Complex engines build recursive `EmbeddedResource` trees (e.g. Tika's recursive metadata); simple
|
|
117
|
+
ones use `Page`/`Record`. Engine-specific node subtypes register via `contract.register_node_type`.
|
|
118
|
+
|
|
119
|
+
## Install
|
|
120
|
+
|
|
121
|
+
```sh
|
|
122
|
+
pip install blastbox # lean core — everything an ENGINE needs (pydantic only)
|
|
123
|
+
pip install blastbox[host] # + the host orchestrator (FastAPI, jobstores, observability)
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
## Run (host)
|
|
127
|
+
|
|
128
|
+
```sh
|
|
129
|
+
blastbox serve --host 127.0.0.1 --port 8000 # the ingress API
|
|
130
|
+
blastbox dispatch # the worker dispatcher loop
|
|
131
|
+
```
|
|
132
|
+
|
|
133
|
+
`POST /v1/jobs` (multipart `file` + `engine`) enqueues a job; the dispatcher launches a hardened
|
|
134
|
+
disposable worker for it; `GET /v1/jobs/{id}/artifacts/{artifact_id}` serves validated output.
|
|
135
|
+
|
|
136
|
+
## Security model
|
|
137
|
+
|
|
138
|
+
- Disposable worker per job: `--network=none --cap-drop=ALL --no-new-privileges --read-only`, runsc
|
|
139
|
+
preferred (fail-closed when a secure runtime is required); the input is deleted after conversion.
|
|
140
|
+
- The host re-seals worker output from disk — worker-reported hashes/sizes are never trusted; artifact
|
|
141
|
+
paths are confined; the input-SHA round-trip is checked; `metadata.json` must be a regular file.
|
|
142
|
+
- Ingress rejects oversized bodies before spooling, sanitizes filenames, serves artifacts by id under
|
|
143
|
+
a confined path, and (optionally) sits behind a bearer token / auth proxy.
|
|
144
|
+
- The contract bounds payload size/depth and validates every node; engine subtypes are validated
|
|
145
|
+
against their registered schema.
|
|
146
|
+
|
|
147
|
+
## Status
|
|
148
|
+
|
|
149
|
+
Core framework complete and adversarially tested: contract + full host orchestrator + worker SDK
|
|
150
|
+
(harness, container sandbox, warm protocol) + warm pool + warm dispatch. Proven end-to-end on two real
|
|
151
|
+
engines.
|
|
152
|
+
|
|
153
|
+
Roadmap: host-native `bwrap`/`nsjail` sandbox backends; Firecracker microVM + snapshot runtime; the
|
|
154
|
+
warm-pool burst/health loops.
|
|
155
|
+
|
|
156
|
+
## License
|
|
157
|
+
|
|
158
|
+
MIT.
|
blastbox-0.1.0/README.md
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
# blastbox
|
|
2
|
+
|
|
3
|
+
**A reusable framework for running untrusted/malicious documents through disposable, hardened
|
|
4
|
+
workers** — and turning their output into a typed, host-validated result you can trust.
|
|
5
|
+
|
|
6
|
+
Extracted from the common substrate of two production services (a LibreOffice document→image
|
|
7
|
+
rasterizer and a Tika recursive-extraction service) that had each grown a parallel, drifting copy of
|
|
8
|
+
the same machinery. blastbox is the single, audited-once core; each service becomes a thin *engine*.
|
|
9
|
+
|
|
10
|
+
- **Write one function.** An engine implements `detonate(input, outdir, limits) -> DetonationResult`.
|
|
11
|
+
The framework gives it ingress, a JobStore, a disposable hardened worker per job, output-trust
|
|
12
|
+
validation, artifact serving, optional warm pooling, metrics, and a CLI — for free.
|
|
13
|
+
- **Output is never trusted.** A worker processes a malicious document; the host **re-seals its
|
|
14
|
+
output from disk** (recomputing hashes/sizes, confining paths) before believing a byte of it.
|
|
15
|
+
- **One untrusted doc per disposable slot** — always. Warm pooling pre-pays startup in the
|
|
16
|
+
background; it never reuses a worker across documents.
|
|
17
|
+
- **Typed, engine-shaped output.** A shared node library (`Page`, `EmbeddedResource`, `ExtractedText`,
|
|
18
|
+
a generic `Record` floor, recursive) lets engines be as specific or generic as they need, while the
|
|
19
|
+
framework validates a fixed security envelope identically for everyone.
|
|
20
|
+
|
|
21
|
+
Proven end-to-end against two real, language-diverse engines: a **Python + LibreOffice** rasterizer
|
|
22
|
+
and a **JVM + Tika** recursive extractor.
|
|
23
|
+
|
|
24
|
+
## Architecture
|
|
25
|
+
|
|
26
|
+
```
|
|
27
|
+
blastbox/
|
|
28
|
+
├── contract/ typed node tree + security envelope + seal/validate (registry-aware at any depth)
|
|
29
|
+
├── host/ LAYER 1 — host orchestrator (engine-agnostic) — needs blastbox[host]
|
|
30
|
+
│ ├── ingress FastAPI API + CLI: upload, status, artifact serving, /metrics
|
|
31
|
+
│ ├── jobs/ JobStore protocol + memory / sql / redis backends + retention
|
|
32
|
+
│ ├── dispatch claim → launch disposable worker → validate output → serve (+ warm path)
|
|
33
|
+
│ ├── runtime/ runc/runsc selection (fail-closed) + hardened worker `docker run` argv
|
|
34
|
+
│ ├── pool warm slot pool (one-doc-per-slot, never-reuse)
|
|
35
|
+
│ └── trust output-trust validator — re-seals worker output from disk
|
|
36
|
+
├── observability/ logging + metrics (top-level package, used by the host)
|
|
37
|
+
└── worker/ LAYER 2 — worker SDK (runs inside the disposable worker) — lean core
|
|
38
|
+
├── engine the seam: detect / warmup / detonate
|
|
39
|
+
├── harness read input → detonate → seal → write metadata.json
|
|
40
|
+
├── sandbox/ in-process hardening self-check + env-stripped subprocess execution
|
|
41
|
+
└── warm service lifecycle: boot → warmup → one job → exit
|
|
42
|
+
```
|
|
43
|
+
|
|
44
|
+
The host never imports an engine; it depends only on the **contract**. An engine never handles
|
|
45
|
+
hashes/paths defensively; the worker SDK and host do that in audited code.
|
|
46
|
+
|
|
47
|
+
## Writing an engine
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
from pathlib import Path
|
|
51
|
+
from blastbox import Engine, DetonationResult, run_detonation
|
|
52
|
+
from blastbox.contract import Page, DeclaredArtifact, Detection, ArtifactRef, Dimensions
|
|
53
|
+
from blastbox.limits import Limits
|
|
54
|
+
|
|
55
|
+
class MyEngine:
|
|
56
|
+
name = "myengine"
|
|
57
|
+
formats = frozenset({"pdf"})
|
|
58
|
+
|
|
59
|
+
def detonate(self, input: Path, outdir: Path, limits: Limits) -> DetonationResult:
|
|
60
|
+
# ... render/extract; write artifact files into outdir ...
|
|
61
|
+
(outdir / "page-001.png").write_bytes(png_bytes)
|
|
62
|
+
return DetonationResult(
|
|
63
|
+
payload=Page(index=0, dims=Dimensions(width=210, height=297, unit="mm"),
|
|
64
|
+
image=ArtifactRef(id="p0")),
|
|
65
|
+
artifacts=[DeclaredArtifact(id="p0", path="page-001.png", kind="image")],
|
|
66
|
+
detected=Detection(label="pdf", mime="application/pdf", confidence=1.0, source="myengine"),
|
|
67
|
+
)
|
|
68
|
+
|
|
69
|
+
# worker entrypoint:
|
|
70
|
+
if __name__ == "__main__":
|
|
71
|
+
import sys
|
|
72
|
+
from blastbox.worker.harness import main
|
|
73
|
+
sys.exit(main(MyEngine()))
|
|
74
|
+
```
|
|
75
|
+
|
|
76
|
+
The harness seals your declared artifacts (recomputing sha256/size from disk, confining paths,
|
|
77
|
+
resolving references) and writes `metadata.json`. The host's `validate_worker_output` re-validates it.
|
|
78
|
+
Complex engines build recursive `EmbeddedResource` trees (e.g. Tika's recursive metadata); simple
|
|
79
|
+
ones use `Page`/`Record`. Engine-specific node subtypes register via `contract.register_node_type`.
|
|
80
|
+
|
|
81
|
+
## Install
|
|
82
|
+
|
|
83
|
+
```sh
|
|
84
|
+
pip install blastbox # lean core — everything an ENGINE needs (pydantic only)
|
|
85
|
+
pip install blastbox[host] # + the host orchestrator (FastAPI, jobstores, observability)
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Run (host)
|
|
89
|
+
|
|
90
|
+
```sh
|
|
91
|
+
blastbox serve --host 127.0.0.1 --port 8000 # the ingress API
|
|
92
|
+
blastbox dispatch # the worker dispatcher loop
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
`POST /v1/jobs` (multipart `file` + `engine`) enqueues a job; the dispatcher launches a hardened
|
|
96
|
+
disposable worker for it; `GET /v1/jobs/{id}/artifacts/{artifact_id}` serves validated output.
|
|
97
|
+
|
|
98
|
+
## Security model
|
|
99
|
+
|
|
100
|
+
- Disposable worker per job: `--network=none --cap-drop=ALL --no-new-privileges --read-only`, runsc
|
|
101
|
+
preferred (fail-closed when a secure runtime is required); the input is deleted after conversion.
|
|
102
|
+
- The host re-seals worker output from disk — worker-reported hashes/sizes are never trusted; artifact
|
|
103
|
+
paths are confined; the input-SHA round-trip is checked; `metadata.json` must be a regular file.
|
|
104
|
+
- Ingress rejects oversized bodies before spooling, sanitizes filenames, serves artifacts by id under
|
|
105
|
+
a confined path, and (optionally) sits behind a bearer token / auth proxy.
|
|
106
|
+
- The contract bounds payload size/depth and validates every node; engine subtypes are validated
|
|
107
|
+
against their registered schema.
|
|
108
|
+
|
|
109
|
+
## Status
|
|
110
|
+
|
|
111
|
+
Core framework complete and adversarially tested: contract + full host orchestrator + worker SDK
|
|
112
|
+
(harness, container sandbox, warm protocol) + warm pool + warm dispatch. Proven end-to-end on two real
|
|
113
|
+
engines.
|
|
114
|
+
|
|
115
|
+
Roadmap: host-native `bwrap`/`nsjail` sandbox backends; Firecracker microVM + snapshot runtime; the
|
|
116
|
+
warm-pool burst/health loops.
|
|
117
|
+
|
|
118
|
+
## License
|
|
119
|
+
|
|
120
|
+
MIT.
|
blastbox-0.1.0/pyproject.toml
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "blastbox"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Reusable detonation framework: run untrusted documents through disposable, hardened workers"
|
|
9
|
+
readme = "README.md"
|
|
10
|
+
requires-python = ">=3.12"
|
|
11
|
+
license = { text = "MIT" }
|
|
12
|
+
authors = [{ name = "Will Metcalf" }]
|
|
13
|
+
keywords = ["sandbox", "malware", "detonation", "untrusted", "document", "isolation"]
|
|
14
|
+
classifiers = [
|
|
15
|
+
"Development Status :: 4 - Beta",
|
|
16
|
+
"Intended Audience :: Developers",
|
|
17
|
+
"License :: OSI Approved :: MIT License",
|
|
18
|
+
"Operating System :: POSIX :: Linux",
|
|
19
|
+
"Programming Language :: Python :: 3",
|
|
20
|
+
"Programming Language :: Python :: 3.12",
|
|
21
|
+
"Topic :: Security",
|
|
22
|
+
"Topic :: Software Development :: Libraries :: Application Frameworks",
|
|
23
|
+
]
|
|
24
|
+
|
|
25
|
+
# Core depends on pydantic ONLY. This is everything an *engine* needs — the
|
|
26
|
+
# typed contract + the worker SDK (engine seam, harness, sandbox, warm
|
|
27
|
+
# protocol). An engine adapter installs `blastbox` and stays lean.
|
|
28
|
+
dependencies = [
|
|
29
|
+
"pydantic>=2.6.0",
|
|
30
|
+
]
|
|
31
|
+
|
|
32
|
+
[project.optional-dependencies]
|
|
33
|
+
# The host orchestrator (ingress API + CLI, dispatcher, pool, runtime,
|
|
34
|
+
# jobstores, observability). A host deployment installs `blastbox[host]`.
|
|
35
|
+
host = [
|
|
36
|
+
"fastapi>=0.110",
|
|
37
|
+
"uvicorn[standard]>=0.27",
|
|
38
|
+
"python-multipart>=0.0.9",
|
|
39
|
+
"structlog>=24.1",
|
|
40
|
+
"prometheus-client>=0.20",
|
|
41
|
+
"psycopg[binary]>=3.2",
|
|
42
|
+
"redis>=5.0",
|
|
43
|
+
]
|
|
44
|
+
dev = [
|
|
45
|
+
"pytest>=8.0.0",
|
|
46
|
+
"mypy>=1.9.0",
|
|
47
|
+
"ruff>=0.3.0",
|
|
48
|
+
"fakeredis>=2.21",
|
|
49
|
+
"httpx>=0.27",
|
|
50
|
+
]
|
|
51
|
+
|
|
52
|
+
[project.urls]
|
|
53
|
+
Homepage = "https://github.com/wmetcalf/blastbox"
|
|
54
|
+
Repository = "https://github.com/wmetcalf/blastbox"
|
|
55
|
+
Issues = "https://github.com/wmetcalf/blastbox/issues"
|
|
56
|
+
|
|
57
|
+
[project.scripts]
|
|
58
|
+
blastbox = "blastbox.host.cli:main"
|
|
59
|
+
|
|
60
|
+
[tool.setuptools.packages.find]
|
|
61
|
+
where = ["src"]
|
|
62
|
+
|
|
63
|
+
[tool.pytest.ini_options]
|
|
64
|
+
testpaths = ["tests"]
|
|
65
|
+
pythonpath = ["src"]
|
|
66
|
+
addopts = "-ra -q"
|
blastbox-0.1.0/setup.cfg
ADDED
|
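[setup.cfg hunk (+4 lines) is not present in this capture]
|
blastbox-0.1.0/src/blastbox/__init__.py
ADDED
|
@@ -0,0 +1,13 @@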
|
|
|
1
|
+
"""blastbox — reusable detonation framework for untrusted documents.
|
|
2
|
+
|
|
3
|
+
Engine authors need only the lean core (`pip install blastbox`): implement the
|
|
4
|
+
``Engine`` protocol's ``detonate()`` and return a ``DetonationResult``; the
|
|
5
|
+
host orchestrator (``blastbox[host]``) handles ingress, disposable-worker
|
|
6
|
+
launch, output-trust validation, and serving.
|
|
7
|
+
"""
|
|
8
|
+
from blastbox.worker.engine import DetonationResult, Engine
|
|
9
|
+
from blastbox.worker.harness import run_detonation
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
__all__ = ["Engine", "DetonationResult", "run_detonation", "__version__"]
|
blastbox-0.1.0/src/blastbox/contract/__init__.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Typed data contract for the detonation framework.
|
|
2
|
+
|
|
3
|
+
Engines emit a typed payload tree + declared artifacts; the worker SDK seals
|
|
4
|
+
them into an Envelope (hashes, sizes, path-confinement); the host re-validates.
|
|
5
|
+
"""
|
|
6
|
+
from .leaf import Hash, Detection, Warning, ArtifactRef, Dimensions, Lang
|
|
7
|
+
from .nodes import (
|
|
8
|
+
Record, ExtractedText, Page, EmbeddedResource,
|
|
9
|
+
parse_node, register_node_type, rebuild_node_union,
|
|
10
|
+
)
|
|
11
|
+
from .envelope import (
|
|
12
|
+
DeclaredArtifact, Artifact, Envelope,
|
|
13
|
+
seal_envelope, validate_envelope, envelope_from_json,
|
|
14
|
+
)
|
|
15
|
+
from .walk import iter_nodes, find_by_type
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def json_schema() -> dict:
    """Return the JSON Schema generated from the ``Envelope`` model.

    Meant for engines that are not written in Python and therefore need a
    language-neutral description of the envelope.
    """
    schema = Envelope.model_json_schema()
    return schema
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"Hash", "Detection", "Warning", "ArtifactRef", "Dimensions", "Lang",
|
|
25
|
+
"Record", "ExtractedText", "Page", "EmbeddedResource",
|
|
26
|
+
"parse_node", "register_node_type", "rebuild_node_union",
|
|
27
|
+
"DeclaredArtifact", "Artifact", "Envelope",
|
|
28
|
+
"seal_envelope", "validate_envelope", "envelope_from_json",
|
|
29
|
+
"iter_nodes", "find_by_type", "json_schema",
|
|
30
|
+
]
|
blastbox-0.1.0/src/blastbox/contract/envelope.py
ADDED
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""The security envelope: sealed by the worker SDK, re-validated by the host."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated, Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
from .leaf import Detection, Warning
|
|
11
|
+
from .nodes import ChildNode, _REBUILD_CALLBACKS, parse_node
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DeclaredArtifact(BaseModel):
    """An engine's declaration of one output file, prior to sealing.

    The SDK converts each declaration into a sealed ``Artifact``.
    """

    # Instances are immutable and reject unknown keys.
    model_config = ConfigDict(extra="forbid", frozen=True)

    # Identifier: 1-128 characters drawn from [A-Za-z0-9._-].
    id: str = Field(pattern=r"^[A-Za-z0-9._-]{1,128}$")
    # File location, relative to the output directory.
    path: str = Field(max_length=4096)
    # Engine-chosen category label, 1-64 characters.
    kind: str = Field(min_length=1, max_length=64)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Artifact(BaseModel):
    """A sealed artifact: a declared file plus its SHA-256 digest and size.

    ``id``, ``path`` and ``kind`` carry the same bounds as
    ``DeclaredArtifact``. Envelopes are also parsed from worker-written JSON,
    so these fields must not be looser on the sealed model than on the
    declaration it was built from.
    """

    # Instances are immutable and reject unknown keys.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Identifier: 1-128 characters drawn from [A-Za-z0-9._-].
    id: str = Field(pattern=r"^[A-Za-z0-9._-]{1,128}$")
    # File location, relative to the output directory.
    path: str = Field(max_length=4096)
    # Engine-chosen category label, 1-64 characters.
    kind: str = Field(min_length=1, max_length=64)
    # Lowercase hex SHA-256 of the file contents.
    sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    # File size in bytes.
    bytes: int = Field(ge=0)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Envelope(BaseModel):
    """The sealed job-result envelope: built by the worker SDK, re-validated by the host.

    ``payload`` is declared against ``ChildNode`` (the base union) with
    ``type`` as the discriminator. ``_rebuild_envelope()`` is registered in
    ``nodes._REBUILD_CALLBACKS``; when it runs it replaces
    ``model_fields["payload"].annotation`` with the current ``nodes.Node``
    union and calls ``Envelope.model_rebuild(force=True)``, so that node types
    registered through ``register_node_type()`` are accepted as payloads too.
    """

    # Unknown top-level keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Engine name, 1-64 characters.
    engine: str = Field(min_length=1, max_length=64)
    # Outcome of the job; defaults to "ok".
    status: Literal["ok", "rejected", "engine_error"] = "ok"
    # Lowercase hex SHA-256 of the input.
    input_sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    detected: Detection
    artifacts: list[Artifact] = Field(default_factory=list)
    warnings: list[Warning] = Field(default_factory=list)
    # Initial annotation uses ChildNode (the base union); _rebuild_envelope()
    # replaces model_fields["payload"].annotation with the live Node union after
    # each register_node_type() call so engine subtypes are also accepted.
    payload: Annotated[ChildNode, Field(discriminator="type")]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _rebuild_envelope() -> None:
    """Point ``Envelope.payload`` at the current ``Node`` union and rebuild.

    Invoked through ``_REBUILD_CALLBACKS`` (see the registration below).
    ``Node`` is read from the ``nodes`` module at call time, via an import
    kept inside the function, so each call picks up whatever union the module
    currently exposes.
    """
    import blastbox.contract.nodes as live_nodes

    payload_field = Envelope.model_fields["payload"]
    payload_field.annotation = live_nodes.Node  # type: ignore[assignment]
    Envelope.model_rebuild(force=True)


# Hook into the nodes module's rebuild callbacks so that every
# rebuild_node_union() call (triggered by register_node_type) refreshes the
# Envelope discriminated union as well.
_REBUILD_CALLBACKS.append(_rebuild_envelope)
# Run once right away so the initial union is in place.
_rebuild_envelope()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _collect_refs(node) -> set[str]:
    """Walk a node tree and collect every ``ArtifactRef.id`` it references.

    The walk is iterative (explicit stack). It descends into pydantic models
    through their declared fields, into list/tuple/set/frozenset items, and
    into dict values. Any other value is ignored.

    Args:
        node: Root of the tree; may be a model, a container, or a leaf value.

    Returns:
        The set of ids of all ``ArtifactRef`` instances reached.
    """
    from .leaf import ArtifactRef as _ArtifactRef

    refs: set[str] = set()
    stack: list = [node]
    while stack:
        v = stack.pop()
        if isinstance(v, _ArtifactRef):
            # Checked before the generic model case: a ref is a leaf here.
            refs.add(v.id)
        elif isinstance(v, BaseModel):
            stack.extend(getattr(v, name) for name in type(v).model_fields)
        elif isinstance(v, (list, tuple, set, frozenset)):
            # Sets are included so refs stored in set-typed fields are not
            # silently skipped by the unresolved-reference check.
            stack.extend(v)
        elif isinstance(v, dict):
            stack.extend(v.values())
    return refs
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def seal_envelope(*, engine: str, outdir: Path, input_sha256: str,
                  detected: Detection, declared: list[DeclaredArtifact],
                  warnings: list[Warning], payload: ChildNode,
                  status: Literal["ok", "rejected", "engine_error"] = "ok") -> Envelope:
    """Seal declared artifacts + payload into a validated Envelope.

    For each declared artifact this rejects duplicate ids, confines the
    resolved path under ``outdir``, requires a regular file, and computes the
    SHA-256 digest and byte count from disk. It then verifies that every
    ``ArtifactRef`` in the payload resolves to a declared id.

    Args:
        engine: Engine name recorded on the envelope.
        outdir: Directory that all artifact paths are relative to.
        input_sha256: Hex SHA-256 of the input, recorded on the envelope.
        detected: Detection result recorded on the envelope.
        declared: Artifacts the engine declared.
        warnings: Warnings recorded on the envelope.
        payload: Root of the payload node tree.
        status: Job outcome; defaults to ``"ok"``.

    Returns:
        The sealed ``Envelope``.

    Raises:
        ValueError: On a duplicate id, an unconfined path, a missing or
            non-regular file, or an unresolved ``ArtifactRef`` — the worker
            must not emit on failure.
    """
    chunk_size = 1024 * 1024
    outdir_resolved = outdir.resolve(strict=False)
    artifacts: list[Artifact] = []
    declared_ids: set[str] = set()
    for d in declared:
        if d.id in declared_ids:
            raise ValueError(f"duplicate artifact id: {d.id}")
        declared_ids.add(d.id)
        target = (outdir / d.path).resolve(strict=False)
        if outdir_resolved != target and outdir_resolved not in target.parents:
            raise ValueError(f"artifact path not confined to outdir: {d.path}")
        if not target.is_file():
            raise ValueError(f"declared artifact file missing or not a regular file: {d.path}")
        # Hash in chunks so a large artifact is never held in memory whole;
        # the size is counted from the same bytes that were hashed.
        digest = hashlib.sha256()
        size = 0
        with target.open("rb") as fh:
            while chunk := fh.read(chunk_size):
                digest.update(chunk)
                size += len(chunk)
        artifacts.append(Artifact(id=d.id, path=d.path, kind=d.kind,
                                  sha256=digest.hexdigest(),
                                  bytes=size))
    unresolved = _collect_refs(payload) - declared_ids
    if unresolved:
        raise ValueError(f"payload has unresolved ArtifactRef(s): {sorted(unresolved)}")
    return Envelope(engine=engine, status=status, input_sha256=input_sha256,
                    detected=detected, artifacts=artifacts, warnings=warnings,
                    payload=payload)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def validate_envelope(env: Envelope, *, outdir: Path, max_artifact_bytes: int,
                      max_total_bytes: int, max_artifacts: int) -> Envelope:
    """Host-side re-validation of an envelope against count and size limits.

    Each artifact's path is resolved under ``outdir`` and must stay inside
    it, must name a regular file, and its on-disk ``st_size`` must equal the
    ``bytes`` recorded in the envelope (so a tampered worker-reported size is
    caught). Per-artifact and cumulative sizes are then compared with the
    supplied limits.

    Returns the same ``env`` object when every check passes.
    Raises ValueError on any violation.
    """
    count = len(env.artifacts)
    if count > max_artifacts:
        raise ValueError(f"artifact count {count} exceeds {max_artifacts}")

    root = outdir.resolve(strict=False)
    running_total = 0
    for artifact in env.artifacts:
        resolved = (outdir / artifact.path).resolve(strict=False)
        confined = resolved == root or root in resolved.parents
        if not confined:
            raise ValueError(f"artifact path not confined to outdir: {artifact.path}")
        if not resolved.is_file():
            raise ValueError(f"artifact file missing or not a regular file: {artifact.path}")

        on_disk = resolved.stat().st_size
        if on_disk != artifact.bytes:
            raise ValueError(
                f"artifact {artifact.id} declared bytes={artifact.bytes} but on-disk size={on_disk}"
            )
        if on_disk > max_artifact_bytes:
            raise ValueError(f"artifact {artifact.id} bytes {on_disk} exceeds {max_artifact_bytes}")
        running_total += on_disk

    # The cumulative limit is only judged once every artifact has passed.
    if running_total > max_total_bytes:
        raise ValueError(f"total artifact bytes {running_total} exceeds {max_total_bytes}")
    return env
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def envelope_from_json(raw: bytes, *, max_bytes: int = 4 * 1024 * 1024) -> Envelope:
    """Parse a worker-emitted metadata.json into an Envelope (size-bounded).

    The raw document is rejected before parsing when it is longer than
    ``max_bytes``. The decoded value must be a JSON object with a non-null
    ``payload`` member; that member is converted with ``parse_node`` before
    the whole object is validated as an ``Envelope``.

    Raises ValueError when the document is oversized, is not valid JSON, is
    nested too deeply for the decoder, is not an object, or lacks a payload.
    """
    if len(raw) > max_bytes:
        raise ValueError(f"metadata json {len(raw)} bytes exceeds {max_bytes}")
    import json
    try:
        obj = json.loads(raw)
    except RecursionError as exc:
        # Pathologically nested input exhausts the decoder's recursion budget.
        # Report it as ValueError like every other malformed-document case so
        # callers that handle ValueError are not bypassed.
        raise ValueError("metadata json is nested too deeply") from exc
    if not isinstance(obj, dict):
        raise ValueError("envelope JSON must be a JSON object")
    payload_data = obj.get("payload")
    # A JSON null payload is treated the same as an absent one.
    if payload_data is None:
        raise ValueError("envelope JSON missing required 'payload' field")
    obj["payload"] = parse_node(payload_data)
    return Envelope.model_validate(obj)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Leaf types: the shared vocabulary every engine can reuse."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
8
|
+
|
|
9
|
+
# A non-empty run of hexadecimal digits, anchored at both ends.
_HEX_RE = re.compile(r"\A[0-9a-fA-F]+\Z")
# Artifact ids: 1-128 characters drawn from letters, digits, '.', '_' and '-'.
_SAFE_ID_RE = re.compile(r"\A[A-Za-z0-9._-]{1,128}\Z")
# Required number of hex characters for each hash algorithm; None means any
# positive hex length is accepted.
_HASH_HEXLEN: dict[str, int | None] = {
    "sha256": 64,
    "phash": 16,
    "dhash": 16,
    "ahash": 16,
    "colorhash": None,
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _Frozen(BaseModel):
    """Shared base for the leaf models in this module.

    Configures pydantic with ``frozen=True`` and ``extra="forbid"``, so
    subclasses inherit frozen instances and rejection of undeclared fields.
    """

    model_config = ConfigDict(
        extra="forbid",
        frozen=True,
    )
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Hash(_Frozen):
    """A digest string tagged with the algorithm that produced it.

    ``value`` must consist of hexadecimal digits. For algorithms that have a
    fixed length in ``_HASH_HEXLEN`` the digit count must match exactly. The
    stored value is lower-cased.
    """

    algo: Literal["sha256", "phash", "dhash", "ahash", "colorhash"]
    value: str

    @field_validator("value")
    @classmethod
    def _hex(cls, v: str, info) -> str:
        """Check ``value`` is hex of the length required for ``algo``; return it lower-cased."""
        if _HEX_RE.match(v) is None:
            raise ValueError("hash value must be hex")
        # ``algo`` is absent from info.data when it failed its own validation;
        # the lookup then yields None and no length is enforced here.
        required_len = _HASH_HEXLEN.get(info.data.get("algo"))
        actual_len = len(v)
        if required_len is None or actual_len == required_len:
            return v.lower()
        raise ValueError(f"expected {required_len} hex chars, got {actual_len}")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ArtifactRef(_Frozen):
    """A reference into the Envelope's artifact set by id (never a path)."""

    id: str

    @field_validator("id")
    @classmethod
    def _safe(cls, v: str) -> str:
        """Accept the id unchanged only when it matches ``_SAFE_ID_RE``."""
        if _SAFE_ID_RE.match(v) is not None:
            return v
        raise ValueError("artifact id must match [A-Za-z0-9._-]{1,128}")
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Detection(_Frozen):
    """A detection record: label, MIME string, confidence and source.

    Constraints: ``label`` is 1-64 characters, ``mime`` at most 255 (may be
    empty), ``confidence`` lies in the closed range [0.0, 1.0], and
    ``source`` is 1-32 characters.
    """

    label: str = Field(
        min_length=1,
        max_length=64,
    )
    mime: str = Field(
        max_length=255,
    )
    confidence: float = Field(
        ge=0.0,
        le=1.0,
    )
    source: str = Field(
        min_length=1,
        max_length=32,
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
# NOTE: within this module the class name shadows the builtin ``Warning``.
class Warning(_Frozen):
    """A warning entry: a short code, a message, and optional context.

    Constraints: ``code`` is 1-64 characters, ``message`` at most 2000
    (may be empty), and ``context`` is either None (the default) or a string
    of at most 255 characters.
    """

    code: str = Field(
        min_length=1,
        max_length=64,
    )
    message: str = Field(
        max_length=2000,
    )
    context: str | None = Field(
        default=None,
        max_length=255,
    )
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Dimensions(_Frozen):
    """A width/height pair with its unit.

    Both ``width`` and ``height`` must be strictly greater than zero;
    ``unit`` is one of ``"mm"``, ``"px"`` or ``"pt"``.
    """

    width: float = Field(
        gt=0,
    )
    height: float = Field(
        gt=0,
    )
    unit: Literal["mm", "px", "pt"]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class Lang(_Frozen):
    """A language identifier held as a string ``code`` of 2-64 characters.

    NOTE(review): the expected tag format is not enforced beyond the length
    bounds — confirm against producers.
    """

    code: str = Field(
        min_length=2,
        max_length=64,
    )
|