blastbox 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56) hide show
  1. blastbox-0.1.0/LICENSE +21 -0
  2. blastbox-0.1.0/PKG-INFO +158 -0
  3. blastbox-0.1.0/README.md +120 -0
  4. blastbox-0.1.0/pyproject.toml +66 -0
  5. blastbox-0.1.0/setup.cfg +4 -0
  6. blastbox-0.1.0/src/blastbox/__init__.py +13 -0
  7. blastbox-0.1.0/src/blastbox/contract/__init__.py +30 -0
  8. blastbox-0.1.0/src/blastbox/contract/envelope.py +173 -0
  9. blastbox-0.1.0/src/blastbox/contract/leaf.py +68 -0
  10. blastbox-0.1.0/src/blastbox/contract/nodes.py +172 -0
  11. blastbox-0.1.0/src/blastbox/contract/walk.py +35 -0
  12. blastbox-0.1.0/src/blastbox/errors.py +75 -0
  13. blastbox-0.1.0/src/blastbox/host/__init__.py +0 -0
  14. blastbox-0.1.0/src/blastbox/host/cli.py +142 -0
  15. blastbox-0.1.0/src/blastbox/host/dispatch.py +632 -0
  16. blastbox-0.1.0/src/blastbox/host/ingress/__init__.py +0 -0
  17. blastbox-0.1.0/src/blastbox/host/ingress/app.py +561 -0
  18. blastbox-0.1.0/src/blastbox/host/ingress/middleware.py +139 -0
  19. blastbox-0.1.0/src/blastbox/host/jobs/__init__.py +17 -0
  20. blastbox-0.1.0/src/blastbox/host/jobs/base.py +110 -0
  21. blastbox-0.1.0/src/blastbox/host/jobs/memory.py +62 -0
  22. blastbox-0.1.0/src/blastbox/host/jobs/redis_store.py +131 -0
  23. blastbox-0.1.0/src/blastbox/host/jobs/retention.py +163 -0
  24. blastbox-0.1.0/src/blastbox/host/jobs/sql_store.py +309 -0
  25. blastbox-0.1.0/src/blastbox/host/pool.py +555 -0
  26. blastbox-0.1.0/src/blastbox/host/pool_config.py +105 -0
  27. blastbox-0.1.0/src/blastbox/host/runtime/__init__.py +42 -0
  28. blastbox-0.1.0/src/blastbox/host/runtime/docker.py +408 -0
  29. blastbox-0.1.0/src/blastbox/host/runtime/firecracker.py +1072 -0
  30. blastbox-0.1.0/src/blastbox/host/runtime/host_limits.py +233 -0
  31. blastbox-0.1.0/src/blastbox/host/trust.py +145 -0
  32. blastbox-0.1.0/src/blastbox/limits.py +106 -0
  33. blastbox-0.1.0/src/blastbox/observability/__init__.py +23 -0
  34. blastbox-0.1.0/src/blastbox/observability/logging.py +41 -0
  35. blastbox-0.1.0/src/blastbox/observability/metrics.py +131 -0
  36. blastbox-0.1.0/src/blastbox/worker/__init__.py +25 -0
  37. blastbox-0.1.0/src/blastbox/worker/engine.py +89 -0
  38. blastbox-0.1.0/src/blastbox/worker/fc_guest.py +167 -0
  39. blastbox-0.1.0/src/blastbox/worker/fc_warm.py +173 -0
  40. blastbox-0.1.0/src/blastbox/worker/harness.py +249 -0
  41. blastbox-0.1.0/src/blastbox/worker/sandbox/__init__.py +31 -0
  42. blastbox-0.1.0/src/blastbox/worker/sandbox/base.py +97 -0
  43. blastbox-0.1.0/src/blastbox/worker/sandbox/bwrap.py +423 -0
  44. blastbox-0.1.0/src/blastbox/worker/sandbox/container.py +303 -0
  45. blastbox-0.1.0/src/blastbox/worker/sandbox/detect.py +226 -0
  46. blastbox-0.1.0/src/blastbox/worker/sandbox/nsjail.py +339 -0
  47. blastbox-0.1.0/src/blastbox/worker/warm.py +370 -0
  48. blastbox-0.1.0/src/blastbox.egg-info/PKG-INFO +158 -0
  49. blastbox-0.1.0/src/blastbox.egg-info/SOURCES.txt +54 -0
  50. blastbox-0.1.0/src/blastbox.egg-info/dependency_links.txt +1 -0
  51. blastbox-0.1.0/src/blastbox.egg-info/entry_points.txt +2 -0
  52. blastbox-0.1.0/src/blastbox.egg-info/requires.txt +17 -0
  53. blastbox-0.1.0/src/blastbox.egg-info/top_level.txt +1 -0
  54. blastbox-0.1.0/tests/test_errors.py +131 -0
  55. blastbox-0.1.0/tests/test_limits.py +157 -0
  56. blastbox-0.1.0/tests/test_observability.py +189 -0
blastbox-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Will Metcalf
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,158 @@
1
+ Metadata-Version: 2.4
2
+ Name: blastbox
3
+ Version: 0.1.0
4
+ Summary: Reusable detonation framework: run untrusted documents through disposable, hardened workers
5
+ Author: Will Metcalf
6
+ License: MIT
7
+ Project-URL: Homepage, https://github.com/wmetcalf/blastbox
8
+ Project-URL: Repository, https://github.com/wmetcalf/blastbox
9
+ Project-URL: Issues, https://github.com/wmetcalf/blastbox/issues
10
+ Keywords: sandbox,malware,detonation,untrusted,document,isolation
11
+ Classifier: Development Status :: 4 - Beta
12
+ Classifier: Intended Audience :: Developers
13
+ Classifier: License :: OSI Approved :: MIT License
14
+ Classifier: Operating System :: POSIX :: Linux
15
+ Classifier: Programming Language :: Python :: 3
16
+ Classifier: Programming Language :: Python :: 3.12
17
+ Classifier: Topic :: Security
18
+ Classifier: Topic :: Software Development :: Libraries :: Application Frameworks
19
+ Requires-Python: >=3.12
20
+ Description-Content-Type: text/markdown
21
+ License-File: LICENSE
22
+ Requires-Dist: pydantic>=2.6.0
23
+ Provides-Extra: host
24
+ Requires-Dist: fastapi>=0.110; extra == "host"
25
+ Requires-Dist: uvicorn[standard]>=0.27; extra == "host"
26
+ Requires-Dist: python-multipart>=0.0.9; extra == "host"
27
+ Requires-Dist: structlog>=24.1; extra == "host"
28
+ Requires-Dist: prometheus-client>=0.20; extra == "host"
29
+ Requires-Dist: psycopg[binary]>=3.2; extra == "host"
30
+ Requires-Dist: redis>=5.0; extra == "host"
31
+ Provides-Extra: dev
32
+ Requires-Dist: pytest>=8.0.0; extra == "dev"
33
+ Requires-Dist: mypy>=1.9.0; extra == "dev"
34
+ Requires-Dist: ruff>=0.3.0; extra == "dev"
35
+ Requires-Dist: fakeredis>=2.21; extra == "dev"
36
+ Requires-Dist: httpx>=0.27; extra == "dev"
37
+ Dynamic: license-file
38
+
39
+ # blastbox
40
+
41
+ **A reusable framework for running untrusted/malicious documents through disposable, hardened
42
+ workers** — and turning their output into a typed, host-validated result you can trust.
43
+
44
+ Extracted from the common substrate of two production services (a LibreOffice document→image
45
+ rasterizer and a Tika recursive-extraction service) that had each grown a parallel, drifting copy of
46
+ the same machinery. blastbox is the single, audited-once core; each service becomes a thin *engine*.
47
+
48
+ - **Write one function.** An engine implements `detonate(input, outdir, limits) -> DetonationResult`.
49
+ The framework gives it ingress, a JobStore, a disposable hardened worker per job, output-trust
50
+ validation, artifact serving, optional warm pooling, metrics, and a CLI — for free.
51
+ - **Output is never trusted.** A worker processes a malicious document; the host **re-seals its
52
+ output from disk** (recomputing hashes/sizes, confining paths) before believing a byte of it.
53
+ - **One untrusted doc per disposable slot** — always. Warm pooling pre-pays startup in the
54
+ background; it never reuses a worker across documents.
55
+ - **Typed, engine-shaped output.** A shared node library (`Page`, `EmbeddedResource`, `ExtractedText`,
56
+ a generic `Record` floor, recursive) lets engines be as specific or generic as they need, while the
57
+ framework validates a fixed security envelope identically for everyone.
58
+
59
+ Proven end-to-end against two real, language-diverse engines: a **Python + LibreOffice** rasterizer
60
+ and a **JVM + Tika** recursive extractor.
61
+
62
+ ## Architecture
63
+
64
+ ```
65
+ blastbox/
66
+ ├── contract/ typed node tree + security envelope + seal/validate (registry-aware at any depth)
67
+ ├── host/ LAYER 1 — host orchestrator (engine-agnostic) — needs blastbox[host]
68
+ │ ├── ingress FastAPI API + CLI: upload, status, artifact serving, /metrics
69
+ │ ├── jobs/ JobStore protocol + memory / sql / redis backends + retention
70
+ │ ├── dispatch claim → launch disposable worker → validate output → serve (+ warm path)
71
+ │ ├── runtime/ runc/runsc selection (fail-closed) + hardened worker `docker run` argv
72
+ │ ├── pool warm slot pool (one-doc-per-slot, never-reuse)
73
+ │ ├── trust output-trust validator — re-seals worker output from disk
74
+ │ └── observability
75
+ └── worker/ LAYER 2 — worker SDK (runs inside the disposable worker) — lean core
76
+ ├── engine the seam: detect / warmup / detonate
77
+ ├── harness read input → detonate → seal → write metadata.json
78
+ ├── sandbox/ in-process hardening self-check + env-stripped subprocess execution
79
+ └── warm service lifecycle: boot → warmup → one job → exit
80
+ ```
81
+
82
+ The host never imports an engine; it depends only on the **contract**. An engine never handles
83
+ hashes/paths defensively; the worker SDK and host do that in audited code.
84
+
85
+ ## Writing an engine
86
+
87
+ ```python
88
+ from pathlib import Path
89
+ from blastbox import Engine, DetonationResult, run_detonation
90
+ from blastbox.contract import Page, DeclaredArtifact, Detection, ArtifactRef, Dimensions
91
+ from blastbox.limits import Limits
92
+
93
+ class MyEngine:
94
+ name = "myengine"
95
+ formats = frozenset({"pdf"})
96
+
97
+ def detonate(self, input: Path, outdir: Path, limits: Limits) -> DetonationResult:
98
+ # ... render/extract; write artifact files into outdir ...
99
+ (outdir / "page-001.png").write_bytes(png_bytes)
100
+ return DetonationResult(
101
+ payload=Page(index=0, dims=Dimensions(width=210, height=297, unit="mm"),
102
+ image=ArtifactRef(id="p0")),
103
+ artifacts=[DeclaredArtifact(id="p0", path="page-001.png", kind="image")],
104
+ detected=Detection(label="pdf", mime="application/pdf", confidence=1.0, source="myengine"),
105
+ )
106
+
107
+ # worker entrypoint:
108
+ if __name__ == "__main__":
109
+ import sys
110
+ from blastbox.worker.harness import main
111
+ sys.exit(main(MyEngine()))
112
+ ```
113
+
114
+ The harness seals your declared artifacts (recomputing sha256/size from disk, confining paths,
115
+ resolving references) and writes `metadata.json`. The host's `validate_worker_output` re-validates it.
116
+ Complex engines build recursive `EmbeddedResource` trees (e.g. Tika's recursive metadata); simple
117
+ ones use `Page`/`Record`. Engine-specific node subtypes register via `contract.register_node_type`.
118
+
119
+ ## Install
120
+
121
+ ```sh
122
+ pip install blastbox # lean core — everything an ENGINE needs (pydantic only)
123
+ pip install blastbox[host] # + the host orchestrator (FastAPI, jobstores, observability)
124
+ ```
125
+
126
+ ## Run (host)
127
+
128
+ ```sh
129
+ blastbox serve --host 127.0.0.1 --port 8000 # the ingress API
130
+ blastbox dispatch # the worker dispatcher loop
131
+ ```
132
+
133
+ `POST /v1/jobs` (multipart `file` + `engine`) enqueues a job; the dispatcher launches a hardened
134
+ disposable worker for it; `GET /v1/jobs/{id}/artifacts/{artifact_id}` serves validated output.
135
+
136
+ ## Security model
137
+
138
+ - Disposable worker per job: `--network=none --cap-drop=ALL --no-new-privileges --read-only`, runsc
139
+ preferred (fail-closed when a secure runtime is required); the input is deleted after conversion.
140
+ - The host re-seals worker output from disk — worker-reported hashes/sizes are never trusted; artifact
141
+ paths are confined; the input-SHA round-trip is checked; `metadata.json` must be a regular file.
142
+ - Ingress rejects oversized bodies before spooling, sanitizes filenames, serves artifacts by id under
143
+ a confined path, and (optionally) sits behind a bearer token / auth proxy.
144
+ - The contract bounds payload size/depth and validates every node; engine subtypes are validated
145
+ against their registered schema.
146
+
147
+ ## Status
148
+
149
+ Core framework complete and adversarially tested: contract + full host orchestrator + worker SDK
150
+ (harness, container sandbox, warm protocol) + warm pool + warm dispatch. Proven end-to-end on two real
151
+ engines.
152
+
153
+ Roadmap: host-native `bwrap`/`nsjail` sandbox backends; Firecracker microVM + snapshot runtime; the
154
+ warm-pool burst/health loops.
155
+
156
+ ## License
157
+
158
+ MIT.
@@ -0,0 +1,120 @@
1
+ # blastbox
2
+
3
+ **A reusable framework for running untrusted/malicious documents through disposable, hardened
4
+ workers** — and turning their output into a typed, host-validated result you can trust.
5
+
6
+ Extracted from the common substrate of two production services (a LibreOffice document→image
7
+ rasterizer and a Tika recursive-extraction service) that had each grown a parallel, drifting copy of
8
+ the same machinery. blastbox is the single, audited-once core; each service becomes a thin *engine*.
9
+
10
+ - **Write one function.** An engine implements `detonate(input, outdir, limits) -> DetonationResult`.
11
+ The framework gives it ingress, a JobStore, a disposable hardened worker per job, output-trust
12
+ validation, artifact serving, optional warm pooling, metrics, and a CLI — for free.
13
+ - **Output is never trusted.** A worker processes a malicious document; the host **re-seals its
14
+ output from disk** (recomputing hashes/sizes, confining paths) before believing a byte of it.
15
+ - **One untrusted doc per disposable slot** — always. Warm pooling pre-pays startup in the
16
+ background; it never reuses a worker across documents.
17
+ - **Typed, engine-shaped output.** A shared node library (`Page`, `EmbeddedResource`, `ExtractedText`,
18
+ a generic `Record` floor, recursive) lets engines be as specific or generic as they need, while the
19
+ framework validates a fixed security envelope identically for everyone.
20
+
21
+ Proven end-to-end against two real, language-diverse engines: a **Python + LibreOffice** rasterizer
22
+ and a **JVM + Tika** recursive extractor.
23
+
24
+ ## Architecture
25
+
26
+ ```
27
+ blastbox/
28
+ ├── contract/ typed node tree + security envelope + seal/validate (registry-aware at any depth)
29
+ ├── host/ LAYER 1 — host orchestrator (engine-agnostic) — needs blastbox[host]
30
+ │ ├── ingress FastAPI API + CLI: upload, status, artifact serving, /metrics
31
+ │ ├── jobs/ JobStore protocol + memory / sql / redis backends + retention
32
+ │ ├── dispatch claim → launch disposable worker → validate output → serve (+ warm path)
33
+ │ ├── runtime/ runc/runsc selection (fail-closed) + hardened worker `docker run` argv
34
+ │ ├── pool warm slot pool (one-doc-per-slot, never-reuse)
35
+ │ ├── trust output-trust validator — re-seals worker output from disk
36
+ │ └── observability
37
+ └── worker/ LAYER 2 — worker SDK (runs inside the disposable worker) — lean core
38
+ ├── engine the seam: detect / warmup / detonate
39
+ ├── harness read input → detonate → seal → write metadata.json
40
+ ├── sandbox/ in-process hardening self-check + env-stripped subprocess execution
41
+ └── warm service lifecycle: boot → warmup → one job → exit
42
+ ```
43
+
44
+ The host never imports an engine; it depends only on the **contract**. An engine never handles
45
+ hashes/paths defensively; the worker SDK and host do that in audited code.
46
+
47
+ ## Writing an engine
48
+
49
+ ```python
50
+ from pathlib import Path
51
+ from blastbox import Engine, DetonationResult, run_detonation
52
+ from blastbox.contract import Page, DeclaredArtifact, Detection, ArtifactRef, Dimensions
53
+ from blastbox.limits import Limits
54
+
55
+ class MyEngine:
56
+ name = "myengine"
57
+ formats = frozenset({"pdf"})
58
+
59
+ def detonate(self, input: Path, outdir: Path, limits: Limits) -> DetonationResult:
60
+ # ... render/extract; write artifact files into outdir ...
61
+ (outdir / "page-001.png").write_bytes(png_bytes)
62
+ return DetonationResult(
63
+ payload=Page(index=0, dims=Dimensions(width=210, height=297, unit="mm"),
64
+ image=ArtifactRef(id="p0")),
65
+ artifacts=[DeclaredArtifact(id="p0", path="page-001.png", kind="image")],
66
+ detected=Detection(label="pdf", mime="application/pdf", confidence=1.0, source="myengine"),
67
+ )
68
+
69
+ # worker entrypoint:
70
+ if __name__ == "__main__":
71
+ import sys
72
+ from blastbox.worker.harness import main
73
+ sys.exit(main(MyEngine()))
74
+ ```
75
+
76
+ The harness seals your declared artifacts (recomputing sha256/size from disk, confining paths,
77
+ resolving references) and writes `metadata.json`. The host's `validate_worker_output` re-validates it.
78
+ Complex engines build recursive `EmbeddedResource` trees (e.g. Tika's recursive metadata); simple
79
+ ones use `Page`/`Record`. Engine-specific node subtypes register via `contract.register_node_type`.
80
+
81
+ ## Install
82
+
83
+ ```sh
84
+ pip install blastbox # lean core — everything an ENGINE needs (pydantic only)
85
+ pip install blastbox[host] # + the host orchestrator (FastAPI, jobstores, observability)
86
+ ```
87
+
88
+ ## Run (host)
89
+
90
+ ```sh
91
+ blastbox serve --host 127.0.0.1 --port 8000 # the ingress API
92
+ blastbox dispatch # the worker dispatcher loop
93
+ ```
94
+
95
+ `POST /v1/jobs` (multipart `file` + `engine`) enqueues a job; the dispatcher launches a hardened
96
+ disposable worker for it; `GET /v1/jobs/{id}/artifacts/{artifact_id}` serves validated output.
97
+
98
+ ## Security model
99
+
100
+ - Disposable worker per job: `--network=none --cap-drop=ALL --no-new-privileges --read-only`, runsc
101
+ preferred (fail-closed when a secure runtime is required); the input is deleted after conversion.
102
+ - The host re-seals worker output from disk — worker-reported hashes/sizes are never trusted; artifact
103
+ paths are confined; the input-SHA round-trip is checked; `metadata.json` must be a regular file.
104
+ - Ingress rejects oversized bodies before spooling, sanitizes filenames, serves artifacts by id under
105
+ a confined path, and (optionally) sits behind a bearer token / auth proxy.
106
+ - The contract bounds payload size/depth and validates every node; engine subtypes are validated
107
+ against their registered schema.
108
+
109
+ ## Status
110
+
111
+ Core framework complete and adversarially tested: contract + full host orchestrator + worker SDK
112
+ (harness, container sandbox, warm protocol) + warm pool + warm dispatch. Proven end-to-end on two real
113
+ engines.
114
+
115
+ Roadmap: host-native `bwrap`/`nsjail` sandbox backends; Firecracker microVM + snapshot runtime; the
116
+ warm-pool burst/health loops.
117
+
118
+ ## License
119
+
120
+ MIT.
@@ -0,0 +1,66 @@
1
+ [build-system]
2
+ requires = ["setuptools>=68", "wheel"]
3
+ build-backend = "setuptools.build_meta"
4
+
5
+ [project]
6
+ name = "blastbox"
7
+ version = "0.1.0"
8
+ description = "Reusable detonation framework: run untrusted documents through disposable, hardened workers"
9
+ readme = "README.md"
10
+ requires-python = ">=3.12"
11
+ license = { text = "MIT" }
12
+ authors = [{ name = "Will Metcalf" }]
13
+ keywords = ["sandbox", "malware", "detonation", "untrusted", "document", "isolation"]
14
+ classifiers = [
15
+ "Development Status :: 4 - Beta",
16
+ "Intended Audience :: Developers",
17
+ "License :: OSI Approved :: MIT License",
18
+ "Operating System :: POSIX :: Linux",
19
+ "Programming Language :: Python :: 3",
20
+ "Programming Language :: Python :: 3.12",
21
+ "Topic :: Security",
22
+ "Topic :: Software Development :: Libraries :: Application Frameworks",
23
+ ]
24
+
25
+ # Core depends on pydantic ONLY. This is everything an *engine* needs — the
26
+ # typed contract + the worker SDK (engine seam, harness, sandbox, warm
27
+ # protocol). An engine adapter installs `blastbox` and stays lean.
28
+ dependencies = [
29
+ "pydantic>=2.6.0",
30
+ ]
31
+
32
+ [project.optional-dependencies]
33
+ # The host orchestrator (ingress API + CLI, dispatcher, pool, runtime,
34
+ # jobstores, observability). A host deployment installs `blastbox[host]`.
35
+ host = [
36
+ "fastapi>=0.110",
37
+ "uvicorn[standard]>=0.27",
38
+ "python-multipart>=0.0.9",
39
+ "structlog>=24.1",
40
+ "prometheus-client>=0.20",
41
+ "psycopg[binary]>=3.2",
42
+ "redis>=5.0",
43
+ ]
44
+ dev = [
45
+ "pytest>=8.0.0",
46
+ "mypy>=1.9.0",
47
+ "ruff>=0.3.0",
48
+ "fakeredis>=2.21",
49
+ "httpx>=0.27",
50
+ ]
51
+
52
+ [project.urls]
53
+ Homepage = "https://github.com/wmetcalf/blastbox"
54
+ Repository = "https://github.com/wmetcalf/blastbox"
55
+ Issues = "https://github.com/wmetcalf/blastbox/issues"
56
+
57
+ [project.scripts]
58
+ blastbox = "blastbox.host.cli:main"
59
+
60
+ [tool.setuptools.packages.find]
61
+ where = ["src"]
62
+
63
+ [tool.pytest.ini_options]
64
+ testpaths = ["tests"]
65
+ pythonpath = ["src"]
66
+ addopts = "-ra -q"
@@ -0,0 +1,4 @@
1
+ [egg_info]
2
+ tag_build =
3
+ tag_date = 0
4
+
@@ -0,0 +1,13 @@
1
+ """blastbox — reusable detonation framework for untrusted documents.
2
+
3
+ Engine authors need only the lean core (`pip install blastbox`): implement the
4
+ ``Engine`` protocol's ``detonate()`` and return a ``DetonationResult``; the
5
+ host orchestrator (``blastbox[host]``) handles ingress, disposable-worker
6
+ launch, output-trust validation, and serving.
7
+ """
8
+ from blastbox.worker.engine import DetonationResult, Engine
9
+ from blastbox.worker.harness import run_detonation
10
+
11
+ __version__ = "0.1.0"
12
+
13
+ __all__ = ["Engine", "DetonationResult", "run_detonation", "__version__"]
@@ -0,0 +1,30 @@
1
+ """Typed data contract for the detonation framework.
2
+
3
+ Engines emit a typed payload tree + declared artifacts; the worker SDK seals
4
+ them into an Envelope (hashes, sizes, path-confinement); the host re-validates.
5
+ """
6
+ from .leaf import Hash, Detection, Warning, ArtifactRef, Dimensions, Lang
7
+ from .nodes import (
8
+ Record, ExtractedText, Page, EmbeddedResource,
9
+ parse_node, register_node_type, rebuild_node_union,
10
+ )
11
+ from .envelope import (
12
+ DeclaredArtifact, Artifact, Envelope,
13
+ seal_envelope, validate_envelope, envelope_from_json,
14
+ )
15
+ from .walk import iter_nodes, find_by_type
16
+
17
+
18
def json_schema() -> dict:
    """Return the Envelope's JSON Schema so non-Python engines can target it."""
    schema = Envelope.model_json_schema()
    return schema
21
+
22
+
23
+ __all__ = [
24
+ "Hash", "Detection", "Warning", "ArtifactRef", "Dimensions", "Lang",
25
+ "Record", "ExtractedText", "Page", "EmbeddedResource",
26
+ "parse_node", "register_node_type", "rebuild_node_union",
27
+ "DeclaredArtifact", "Artifact", "Envelope",
28
+ "seal_envelope", "validate_envelope", "envelope_from_json",
29
+ "iter_nodes", "find_by_type", "json_schema",
30
+ ]
@@ -0,0 +1,173 @@
1
+ """The security envelope: sealed by the worker SDK, re-validated by the host."""
2
+ from __future__ import annotations
3
+
4
+ import hashlib
5
+ from pathlib import Path
6
+ from typing import Annotated, Literal
7
+
8
+ from pydantic import BaseModel, ConfigDict, Field
9
+
10
+ from .leaf import Detection, Warning
11
+ from .nodes import ChildNode, _REBUILD_CALLBACKS, parse_node
12
+
13
+
14
class DeclaredArtifact(BaseModel):
    """An artifact as declared by an engine, before sealing.

    Carries no hash or size: ``seal_envelope`` computes those from disk and
    emits the corresponding sealed ``Artifact``. Instances are immutable and
    reject unknown fields.
    """
    model_config = ConfigDict(frozen=True, extra="forbid")
    # Identifier used to refer to this artifact: 1-128 chars of [A-Za-z0-9._-].
    id: str = Field(pattern=r"^[A-Za-z0-9._-]{1,128}$")
    path: str = Field(max_length=4096)  # outdir-relative
    # Artifact category label, 1-64 chars (e.g. "image").
    kind: str = Field(min_length=1, max_length=64)
20
+
21
+
22
class Artifact(BaseModel):
    """A sealed artifact: a declared artifact plus its sha256 and byte size.

    ``id``, ``path`` and ``kind`` carry the same bounds as ``DeclaredArtifact``.
    Envelopes are also parsed from worker-emitted JSON, so these fields must
    not be looser here than they are on the declaring side. Instances are
    immutable and reject unknown fields.
    """
    model_config = ConfigDict(frozen=True, extra="forbid")
    # Same identifier alphabet/length that DeclaredArtifact and ArtifactRef enforce.
    id: str = Field(pattern=r"^[A-Za-z0-9._-]{1,128}$")
    path: str = Field(max_length=4096)  # outdir-relative
    kind: str = Field(min_length=1, max_length=64)
    # Lowercase hex sha256 of the file content.
    sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    # File size in bytes.
    bytes: int = Field(ge=0)
29
+
30
+
31
class Envelope(BaseModel):
    """The sealed job result: engine, status, detection, artifacts, and payload.

    Built by ``seal_envelope`` and re-checked by ``validate_envelope``. Unknown
    fields are rejected. No cryptographic signature is involved; "sealed" means
    artifact hashes and sizes were computed from disk.

    The ``payload`` field is declared as ``Annotated[ChildNode, ...]`` at class
    definition time. ``_rebuild_envelope()`` runs once at import and again via
    ``nodes._REBUILD_CALLBACKS``; it points the field's annotation at the live
    ``nodes.Node`` union and calls ``Envelope.model_rebuild(force=True)`` so
    the discriminated-union validator is regenerated without a top-level
    circular import.
    """
    model_config = ConfigDict(extra="forbid")
    # Name of the engine that produced this result (1-64 chars).
    engine: str = Field(min_length=1, max_length=64)
    status: Literal["ok", "rejected", "engine_error"] = "ok"
    # Lowercase hex sha256 of the input document.
    input_sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    detected: Detection
    artifacts: list[Artifact] = Field(default_factory=list)
    warnings: list[Warning] = Field(default_factory=list)
    # Initial annotation uses ChildNode (the base union); _rebuild_envelope()
    # replaces model_fields["payload"].annotation with the live Node union after
    # each register_node_type() call so engine subtypes are also accepted.
    payload: Annotated[ChildNode, Field(discriminator="type")]
52
+
53
+
54
def _rebuild_envelope() -> None:
    """Rebuild Envelope against the current live Node union.

    Called by nodes.rebuild_node_union() via _REBUILD_CALLBACKS.
    Uses a lazy import to avoid a circular dependency at module-top level.
    Updates the ``payload`` field's annotation to the current live ``Node``
    union so pydantic regenerates the discriminated-union validator correctly.
    """
    # Look the union up on the module at call time so the value current at
    # the moment of the rebuild is used, not one captured at import.
    import blastbox.contract.nodes as _nodes
    Envelope.model_fields["payload"].annotation = _nodes.Node  # type: ignore[assignment]
    # force=True: rebuild even though the model was already built successfully.
    Envelope.model_rebuild(force=True)
65
+
66
+
67
+ # Register so every rebuild_node_union() call (triggered by register_node_type)
68
+ # also refreshes the Envelope discriminated union.
69
+ _REBUILD_CALLBACKS.append(_rebuild_envelope)
70
+ # Apply immediately so the initial union is in place.
71
+ _rebuild_envelope()
72
+
73
+
74
def _collect_refs(node) -> set[str]:
    """Gather the id of every ArtifactRef reachable from ``node``.

    Descends through pydantic model fields, lists, tuples, and dict values;
    any other kind of value is ignored.
    """
    from .leaf import ArtifactRef as _ArtifactRef

    found: set[str] = set()
    pending: list = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, _ArtifactRef):
            found.add(current.id)
            continue
        if isinstance(current, BaseModel):
            pending.extend(getattr(current, name) for name in type(current).model_fields)
        elif isinstance(current, (list, tuple)):
            pending.extend(current)
        elif isinstance(current, dict):
            pending.extend(current.values())
    return found
94
+
95
+
96
def _sha256_and_size(path: Path, *, chunk_size: int = 1 << 20) -> tuple[str, int]:
    """Return ``(hex sha256, byte count)`` for ``path``, read in fixed-size chunks."""
    digest = hashlib.sha256()
    size = 0
    with path.open("rb") as fh:
        while chunk := fh.read(chunk_size):
            digest.update(chunk)
            size += len(chunk)
    return digest.hexdigest(), size


def seal_envelope(*, engine: str, outdir: Path, input_sha256: str,
                  detected: Detection, declared: list[DeclaredArtifact],
                  warnings: list[Warning], payload: ChildNode,
                  status: Literal["ok", "rejected", "engine_error"] = "ok") -> Envelope:
    """Seal declared artifacts + payload into a validated Envelope.

    For each declared artifact, in order: rejects a repeated id, confines the
    resolved path under ``outdir``, requires a regular file, and computes
    sha256/bytes from disk. Files are hashed in chunks, so memory use does not
    grow with artifact size. Finally verifies that every ArtifactRef in the
    payload resolves to a declared id.

    Raises:
        ValueError: on a duplicate id, an unconfined path, a missing or
            non-regular file, or an unresolved ArtifactRef — the worker must
            not emit on failure.
    """
    outdir_resolved = outdir.resolve(strict=False)
    artifacts: list[Artifact] = []
    declared_ids: set[str] = set()
    for d in declared:
        if d.id in declared_ids:
            raise ValueError(f"duplicate artifact id: {d.id}")
        declared_ids.add(d.id)
        # Resolve first so "..", absolute paths and symlinks are judged by
        # where they really land, not by how they are spelled.
        target = (outdir / d.path).resolve(strict=False)
        if outdir_resolved != target and outdir_resolved not in target.parents:
            raise ValueError(f"artifact path not confined to outdir: {d.path}")
        if not target.is_file():
            raise ValueError(f"declared artifact file missing or not a regular file: {d.path}")
        # Hash and size come from the same read pass over the file.
        sha256_hex, size = _sha256_and_size(target)
        artifacts.append(Artifact(id=d.id, path=d.path, kind=d.kind,
                                  sha256=sha256_hex, bytes=size))
    unresolved = _collect_refs(payload) - declared_ids
    if unresolved:
        raise ValueError(f"payload has unresolved ArtifactRef(s): {sorted(unresolved)}")
    return Envelope(engine=engine, status=status, input_sha256=input_sha256,
                    detected=detected, artifacts=artifacts, warnings=warnings,
                    payload=payload)
128
+
129
+
130
def validate_envelope(env: Envelope, *, outdir: Path, max_artifact_bytes: int,
                      max_total_bytes: int, max_artifacts: int) -> Envelope:
    """Re-check an envelope's artifacts against the files on disk and the limits.

    Enforces the artifact count limit, then for each artifact confines its
    resolved path under ``outdir``, requires a regular file, compares the
    file's ``st_size`` with the recorded ``bytes``, and applies the
    per-artifact size limit. The summed size is checked against
    ``max_total_bytes`` after all artifacts pass. Returns ``env`` unchanged.

    Raises:
        ValueError: on the first violation encountered.
    """
    count = len(env.artifacts)
    if count > max_artifacts:
        raise ValueError(f"artifact count {count} exceeds {max_artifacts}")

    root = outdir.resolve(strict=False)
    running_total = 0
    for artifact in env.artifacts:
        resolved = (outdir / artifact.path).resolve(strict=False)
        confined = resolved == root or root in resolved.parents
        if not confined:
            raise ValueError(f"artifact path not confined to outdir: {artifact.path}")
        if not resolved.is_file():
            raise ValueError(f"artifact file missing or not a regular file: {artifact.path}")

        on_disk = resolved.stat().st_size
        if on_disk != artifact.bytes:
            raise ValueError(
                f"artifact {artifact.id} declared bytes={artifact.bytes} but on-disk size={on_disk}"
            )
        if on_disk > max_artifact_bytes:
            raise ValueError(f"artifact {artifact.id} bytes {on_disk} exceeds {max_artifact_bytes}")
        running_total += on_disk

    if running_total > max_total_bytes:
        raise ValueError(f"total artifact bytes {running_total} exceeds {max_total_bytes}")
    return env
159
+
160
+
161
def envelope_from_json(raw: bytes, *, max_bytes: int = 4 * 1024 * 1024) -> Envelope:
    """Parse a worker-emitted metadata.json into an Envelope (size-bounded).

    The payload is parsed with ``parse_node`` before the envelope as a whole
    is validated.

    Args:
        raw: The serialized JSON document.
        max_bytes: Upper bound on ``len(raw)``; larger input is rejected
            before any parsing.

    Raises:
        ValueError: if the input is too large, is not valid JSON, is nested
            too deeply to decode, is not a JSON object, or lacks a
            ``payload`` field.
    """
    import json

    if len(raw) > max_bytes:
        raise ValueError(f"metadata json {len(raw)} bytes exceeds {max_bytes}")
    try:
        obj = json.loads(raw)
    except RecursionError as exc:
        # A small document of pathologically nested brackets exhausts the
        # decoder's recursion limit. Report it like any other malformed
        # input instead of letting RecursionError escape.
        raise ValueError("metadata json is nested too deeply to parse") from exc
    if not isinstance(obj, dict):
        raise ValueError("envelope JSON must be a JSON object")
    payload_data = obj.get("payload")
    if payload_data is None:
        raise ValueError("envelope JSON missing required 'payload' field")
    obj["payload"] = parse_node(payload_data)
    return Envelope.model_validate(obj)
@@ -0,0 +1,68 @@
1
+ """Leaf types: the shared vocabulary every engine can reuse."""
2
+ from __future__ import annotations
3
+
4
+ import re
5
+ from typing import Literal
6
+
7
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
8
+
9
+ _HEX_RE = re.compile(r"\A[0-9a-fA-F]+\Z")
10
+ _SAFE_ID_RE = re.compile(r"\A[A-Za-z0-9._-]{1,128}\Z")
11
+ # Expected hex length per hash algorithm (None = any positive hex length).
12
+ _HASH_HEXLEN: dict[str, int | None] = {
13
+ "sha256": 64, "phash": 16, "dhash": 16, "ahash": 16, "colorhash": None,
14
+ }
15
+
16
+
17
class _Frozen(BaseModel):
    """Shared base for leaf types: instances are immutable and reject unknown fields."""
    model_config = ConfigDict(frozen=True, extra="forbid")
19
+
20
+
21
class Hash(_Frozen):
    """A tagged digest: an algorithm name plus its hex-encoded value."""
    algo: Literal["sha256", "phash", "dhash", "ahash", "colorhash"]
    value: str

    @field_validator("value")
    @classmethod
    def _hex(cls, v: str, info) -> str:
        """Require hex of the length expected for ``algo``; return it lowercased."""
        if _HEX_RE.match(v) is None:
            raise ValueError("hash value must be hex")
        # ``algo`` is absent from info.data when it failed its own validation;
        # the table lookup then yields None and no length is enforced.
        required_len = _HASH_HEXLEN.get(info.data.get("algo"))
        if required_len is None or len(v) == required_len:
            return v.lower()
        raise ValueError(f"expected {required_len} hex chars, got {len(v)}")
34
+
35
+
36
class ArtifactRef(_Frozen):
    """Points at an entry of the Envelope's artifact set by id, never by path."""
    id: str

    @field_validator("id")
    @classmethod
    def _safe(cls, v: str) -> str:
        """Accept only ids made of 1-128 characters from [A-Za-z0-9._-]."""
        if _SAFE_ID_RE.match(v) is not None:
            return v
        raise ValueError("artifact id must match [A-Za-z0-9._-]{1,128}")
46
+
47
+
48
class Detection(_Frozen):
    """What the input document was identified as, and how confidently."""
    # Short format label, 1-64 chars (e.g. "pdf").
    label: str = Field(min_length=1, max_length=64)
    # MIME type string, at most 255 chars; an empty string is accepted.
    mime: str = Field(max_length=255)
    # Confidence in the closed interval [0.0, 1.0].
    confidence: float = Field(ge=0.0, le=1.0)
    # Who produced the detection, 1-32 chars — presumably an engine or detector
    # name; confirm against callers.
    source: str = Field(min_length=1, max_length=32)
53
+
54
+
55
class Warning(_Frozen):
    """A non-fatal issue reported alongside a result.

    NOTE: this name shadows Python's builtin ``Warning`` in any module that
    imports it.
    """
    # Short identifier for the warning, 1-64 chars.
    code: str = Field(min_length=1, max_length=64)
    # Description text, at most 2000 chars.
    message: str = Field(max_length=2000)
    # Optional extra detail, at most 255 chars; defaults to None.
    context: str | None = Field(default=None, max_length=255)
59
+
60
+
61
class Dimensions(_Frozen):
    """A width/height pair with its unit of measure."""
    # Both extents must be strictly positive.
    width: float = Field(gt=0)
    height: float = Field(gt=0)
    # Unit the two extents are expressed in.
    unit: Literal["mm", "px", "pt"]
65
+
66
+
67
class Lang(_Frozen):
    """A language identifier."""
    # 2-64 chars; presumably a language tag such as "en" — the accepted
    # vocabulary is not constrained here, confirm against producers.
    code: str = Field(min_length=2, max_length=64)