blastbox 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- blastbox/__init__.py +13 -0
- blastbox/contract/__init__.py +30 -0
- blastbox/contract/envelope.py +173 -0
- blastbox/contract/leaf.py +68 -0
- blastbox/contract/nodes.py +172 -0
- blastbox/contract/walk.py +35 -0
- blastbox/errors.py +75 -0
- blastbox/host/__init__.py +0 -0
- blastbox/host/cli.py +142 -0
- blastbox/host/dispatch.py +632 -0
- blastbox/host/ingress/__init__.py +0 -0
- blastbox/host/ingress/app.py +561 -0
- blastbox/host/ingress/middleware.py +139 -0
- blastbox/host/jobs/__init__.py +17 -0
- blastbox/host/jobs/base.py +110 -0
- blastbox/host/jobs/memory.py +62 -0
- blastbox/host/jobs/redis_store.py +131 -0
- blastbox/host/jobs/retention.py +163 -0
- blastbox/host/jobs/sql_store.py +309 -0
- blastbox/host/pool.py +555 -0
- blastbox/host/pool_config.py +105 -0
- blastbox/host/runtime/__init__.py +42 -0
- blastbox/host/runtime/docker.py +408 -0
- blastbox/host/runtime/firecracker.py +1072 -0
- blastbox/host/runtime/host_limits.py +233 -0
- blastbox/host/trust.py +145 -0
- blastbox/limits.py +106 -0
- blastbox/observability/__init__.py +23 -0
- blastbox/observability/logging.py +41 -0
- blastbox/observability/metrics.py +131 -0
- blastbox/worker/__init__.py +25 -0
- blastbox/worker/engine.py +89 -0
- blastbox/worker/fc_guest.py +167 -0
- blastbox/worker/fc_warm.py +173 -0
- blastbox/worker/harness.py +249 -0
- blastbox/worker/sandbox/__init__.py +31 -0
- blastbox/worker/sandbox/base.py +97 -0
- blastbox/worker/sandbox/bwrap.py +423 -0
- blastbox/worker/sandbox/container.py +303 -0
- blastbox/worker/sandbox/detect.py +226 -0
- blastbox/worker/sandbox/nsjail.py +339 -0
- blastbox/worker/warm.py +370 -0
- blastbox-0.1.0.dist-info/METADATA +158 -0
- blastbox-0.1.0.dist-info/RECORD +48 -0
- blastbox-0.1.0.dist-info/WHEEL +5 -0
- blastbox-0.1.0.dist-info/entry_points.txt +2 -0
- blastbox-0.1.0.dist-info/licenses/LICENSE +21 -0
- blastbox-0.1.0.dist-info/top_level.txt +1 -0
blastbox/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""blastbox — reusable detonation framework for untrusted documents.
|
|
2
|
+
|
|
3
|
+
Engine authors need only the lean core (`pip install blastbox`): implement the
|
|
4
|
+
``Engine`` protocol's ``detonate()`` and return a ``DetonationResult``; the
|
|
5
|
+
host orchestrator (``blastbox[host]``) handles ingress, disposable-worker
|
|
6
|
+
launch, output-trust validation, and serving.
|
|
7
|
+
"""
|
|
8
|
+
from blastbox.worker.engine import DetonationResult, Engine
|
|
9
|
+
from blastbox.worker.harness import run_detonation
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
|
12
|
+
|
|
13
|
+
__all__ = ["Engine", "DetonationResult", "run_detonation", "__version__"]
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""Typed data contract for the detonation framework.
|
|
2
|
+
|
|
3
|
+
Engines emit a typed payload tree + declared artifacts; the worker SDK seals
|
|
4
|
+
them into an Envelope (hashes, sizes, path-confinement); the host re-validates.
|
|
5
|
+
"""
|
|
6
|
+
from .leaf import Hash, Detection, Warning, ArtifactRef, Dimensions, Lang
|
|
7
|
+
from .nodes import (
|
|
8
|
+
Record, ExtractedText, Page, EmbeddedResource,
|
|
9
|
+
parse_node, register_node_type, rebuild_node_union,
|
|
10
|
+
)
|
|
11
|
+
from .envelope import (
|
|
12
|
+
DeclaredArtifact, Artifact, Envelope,
|
|
13
|
+
seal_envelope, validate_envelope, envelope_from_json,
|
|
14
|
+
)
|
|
15
|
+
from .walk import iter_nodes, find_by_type
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def json_schema() -> dict:
    """Return the JSON Schema generated from the ``Envelope`` model.

    Meant for engines not written in Python that need the envelope contract.
    """
    schema = Envelope.model_json_schema()
    return schema
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"Hash", "Detection", "Warning", "ArtifactRef", "Dimensions", "Lang",
|
|
25
|
+
"Record", "ExtractedText", "Page", "EmbeddedResource",
|
|
26
|
+
"parse_node", "register_node_type", "rebuild_node_union",
|
|
27
|
+
"DeclaredArtifact", "Artifact", "Envelope",
|
|
28
|
+
"seal_envelope", "validate_envelope", "envelope_from_json",
|
|
29
|
+
"iter_nodes", "find_by_type", "json_schema",
|
|
30
|
+
]
|
|
@@ -0,0 +1,173 @@
|
|
|
1
|
+
"""The security envelope: sealed by the worker SDK, re-validated by the host."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import hashlib
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
from typing import Annotated, Literal
|
|
7
|
+
|
|
8
|
+
from pydantic import BaseModel, ConfigDict, Field
|
|
9
|
+
|
|
10
|
+
from .leaf import Detection, Warning
|
|
11
|
+
from .nodes import ChildNode, _REBUILD_CALLBACKS, parse_node
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
class DeclaredArtifact(BaseModel):
    """What an engine declares; the SDK turns it into a sealed Artifact."""

    # Instances are immutable and unknown keys are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    # Identifier: 1-128 characters drawn from letters, digits, '.', '_', '-'.
    id: str = Field(pattern=r"^[A-Za-z0-9._-]{1,128}$")
    # File location relative to the job output directory (outdir).
    path: str = Field(max_length=4096)
    # Artifact category label, 1-64 characters.
    kind: str = Field(min_length=1, max_length=64)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Artifact(BaseModel):
    # A declared artifact plus the digest and size that seal_envelope()
    # computes from the file on disk.

    # Instances are immutable and unknown keys are rejected.
    model_config = ConfigDict(frozen=True, extra="forbid")

    id: str
    path: str
    kind: str
    # Lowercase hex SHA-256 digest (exactly 64 characters).
    sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    # File size in bytes; never negative.
    bytes: int = Field(ge=0)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Envelope(BaseModel):
    """A sealed job-result envelope: built by the worker, re-validated by the host.

    Integrity information is carried by ``input_sha256`` and by the
    ``sha256``/``bytes`` of each entry in ``artifacts``; the model has no
    signature field.

    ``payload`` is declared against the base ``ChildNode`` union.
    ``_rebuild_envelope()`` is registered in ``nodes._REBUILD_CALLBACKS`` and
    therefore runs after every ``rebuild_node_union()`` call (including the one
    triggered by ``register_node_type()``). It overwrites
    ``model_fields["payload"].annotation`` with the live ``nodes.Node`` union
    and calls ``Envelope.model_rebuild(force=True)`` so registered engine node
    types are accepted as payloads too.
    """

    # Unknown top-level keys are rejected.
    model_config = ConfigDict(extra="forbid")

    # Name of the engine that produced the result (1-64 characters).
    engine: str = Field(min_length=1, max_length=64)
    status: Literal["ok", "rejected", "engine_error"] = "ok"
    # Lowercase hex SHA-256 of the input (exactly 64 characters).
    input_sha256: str = Field(pattern=r"^[0-9a-f]{64}$")
    detected: Detection
    artifacts: list[Artifact] = Field(default_factory=list)
    warnings: list[Warning] = Field(default_factory=list)
    # Starts as the base ChildNode union; _rebuild_envelope() swaps in the
    # live Node union so engine-registered subtypes validate as well.
    payload: Annotated[ChildNode, Field(discriminator="type")]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def _rebuild_envelope() -> None:
    """Point ``Envelope.payload`` at the current ``nodes.Node`` union and rebuild.

    The nodes module is imported inside the function and ``Node`` is read from
    it at call time, so the value picked up is whatever the module attribute
    holds when this runs rather than a name bound at import time.
    """
    import blastbox.contract.nodes as live_nodes

    payload_field = Envelope.model_fields["payload"]
    payload_field.annotation = live_nodes.Node  # type: ignore[assignment]
    # force=True regenerates the schema even though the model is already built.
    Envelope.model_rebuild(force=True)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# Register so every rebuild_node_union() call (triggered by register_node_type)
|
|
68
|
+
# also refreshes the Envelope discriminated union.
|
|
69
|
+
_REBUILD_CALLBACKS.append(_rebuild_envelope)
|
|
70
|
+
# Apply immediately so the initial union is in place.
|
|
71
|
+
_rebuild_envelope()
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
def _collect_refs(node) -> set[str]:
    """Return the ``id`` of every ``ArtifactRef`` reachable from *node*.

    Traversal is iterative and descends into pydantic model fields, lists,
    tuples and dict values; every other value is ignored.
    """
    from .leaf import ArtifactRef as _ArtifactRef

    found: set[str] = set()
    pending: list = [node]
    while pending:
        current = pending.pop()
        if isinstance(current, _ArtifactRef):
            # A reference is a leaf: record it and do not descend further.
            found.add(current.id)
            continue
        if isinstance(current, BaseModel):
            pending.extend(getattr(current, name) for name in type(current).model_fields)
        elif isinstance(current, (list, tuple)):
            pending.extend(current)
        elif isinstance(current, dict):
            pending.extend(current.values())
    return found
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
def seal_envelope(*, engine: str, outdir: Path, input_sha256: str,
                  detected: Detection, declared: list[DeclaredArtifact],
                  warnings: list[Warning], payload: ChildNode,
                  status: Literal["ok", "rejected", "engine_error"] = "ok") -> Envelope:
    """Seal declared artifacts + payload into a validated Envelope.

    For every declared artifact the file is located under *outdir*, hashed
    with SHA-256 and measured; the payload is then checked so that every
    ``ArtifactRef`` it contains names a declared artifact id.

    Args:
        engine: Engine name stored on the envelope.
        outdir: Directory that every artifact path must stay inside.
        input_sha256: Hex SHA-256 of the input, stored on the envelope.
        detected: Detection result stored on the envelope.
        declared: Artifacts announced by the engine (outdir-relative paths).
        warnings: Warnings stored on the envelope.
        payload: Root of the typed payload tree.
        status: Envelope status; defaults to ``"ok"``.

    Returns:
        The constructed ``Envelope``.

    Raises:
        ValueError: on a duplicate artifact id, a path that resolves outside
            *outdir*, a missing or non-regular file, or a payload
            ``ArtifactRef`` whose id was not declared.
    """
    outdir_resolved = outdir.resolve(strict=False)
    artifacts: list[Artifact] = []
    declared_ids: set[str] = set()
    for d in declared:
        if d.id in declared_ids:
            raise ValueError(f"duplicate artifact id: {d.id}")
        declared_ids.add(d.id)
        # Resolve before comparing so ".." segments and symlinks cannot
        # step outside outdir unnoticed.
        target = (outdir / d.path).resolve(strict=False)
        if outdir_resolved != target and outdir_resolved not in target.parents:
            raise ValueError(f"artifact path not confined to outdir: {d.path}")
        if not target.is_file():
            raise ValueError(f"declared artifact file missing or not a regular file: {d.path}")
        # Hash in fixed-size chunks so a large artifact is never held in
        # memory all at once; digest and byte count match a whole-file read.
        digest = hashlib.sha256()
        size = 0
        with target.open("rb") as fh:
            while chunk := fh.read(1024 * 1024):
                digest.update(chunk)
                size += len(chunk)
        artifacts.append(Artifact(id=d.id, path=d.path, kind=d.kind,
                                  sha256=digest.hexdigest(),
                                  bytes=size))
    unresolved = _collect_refs(payload) - declared_ids
    if unresolved:
        raise ValueError(f"payload has unresolved ArtifactRef(s): {sorted(unresolved)}")
    return Envelope(engine=engine, status=status, input_sha256=input_sha256,
                    detected=detected, artifacts=artifacts, warnings=warnings,
                    payload=payload)
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
def validate_envelope(env: Envelope, *, outdir: Path, max_artifact_bytes: int,
                      max_total_bytes: int, max_artifacts: int) -> Envelope:
    """Host-side check of an envelope's artifacts against disk and size limits.

    Each artifact path must resolve inside *outdir* and name a regular file
    whose ``st_size`` equals the ``bytes`` value recorded in the envelope.
    Limits are applied to the artifact count, each file's size and the
    combined size. File contents are not hashed here.

    Returns:
        *env* unchanged when every check passes.

    Raises:
        ValueError: on the first violated check.
    """
    count = len(env.artifacts)
    if count > max_artifacts:
        raise ValueError(f"artifact count {count} exceeds {max_artifacts}")

    root = outdir.resolve(strict=False)
    running_total = 0
    for art in env.artifacts:
        resolved = (outdir / art.path).resolve(strict=False)
        confined = resolved == root or root in resolved.parents
        if not confined:
            raise ValueError(f"artifact path not confined to outdir: {art.path}")
        if not resolved.is_file():
            raise ValueError(f"artifact file missing or not a regular file: {art.path}")
        on_disk = resolved.stat().st_size
        # The recorded size comes from the worker; trust the filesystem instead.
        if on_disk != art.bytes:
            raise ValueError(
                f"artifact {art.id} declared bytes={art.bytes} but on-disk size={on_disk}"
            )
        if on_disk > max_artifact_bytes:
            raise ValueError(f"artifact {art.id} bytes {on_disk} exceeds {max_artifact_bytes}")
        running_total += on_disk

    # The combined limit is checked once, after all files have been measured.
    if running_total > max_total_bytes:
        raise ValueError(f"total artifact bytes {running_total} exceeds {max_total_bytes}")
    return env
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def envelope_from_json(raw: bytes, *, max_bytes: int = 4 * 1024 * 1024) -> Envelope:
    """Decode worker-emitted envelope JSON into an ``Envelope``.

    The length check happens before any decoding. The ``payload`` entry is
    parsed with ``parse_node`` first and the whole object is then validated
    as an ``Envelope``.

    Raises:
        ValueError: if *raw* is longer than *max_bytes*, the decoded JSON is
            not an object, or ``payload`` is absent or null.
    """
    import json

    size = len(raw)
    if size > max_bytes:
        raise ValueError(f"metadata json {size} bytes exceeds {max_bytes}")

    document = json.loads(raw)
    if not isinstance(document, dict):
        raise ValueError("envelope JSON must be a JSON object")

    node_data = document.get("payload")
    if node_data is None:
        raise ValueError("envelope JSON missing required 'payload' field")

    document["payload"] = parse_node(node_data)
    return Envelope.model_validate(document)
|
|
@@ -0,0 +1,68 @@
|
|
|
1
|
+
"""Leaf types: the shared vocabulary every engine can reuse."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
import re
|
|
5
|
+
from typing import Literal
|
|
6
|
+
|
|
7
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
8
|
+
|
|
9
|
+
_HEX_RE = re.compile(r"\A[0-9a-fA-F]+\Z")
|
|
10
|
+
_SAFE_ID_RE = re.compile(r"\A[A-Za-z0-9._-]{1,128}\Z")
|
|
11
|
+
# Expected hex length per hash algorithm (None = any positive hex length).
|
|
12
|
+
_HASH_HEXLEN: dict[str, int | None] = {
|
|
13
|
+
"sha256": 64, "phash": 16, "dhash": 16, "ahash": 16, "colorhash": None,
|
|
14
|
+
}
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class _Frozen(BaseModel):
    # Shared base for the leaf types below: instances are immutable and
    # unknown keys are rejected.
    model_config = ConfigDict(extra="forbid", frozen=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class Hash(_Frozen):
    # A hash value tagged with the algorithm that produced it.
    algo: Literal["sha256", "phash", "dhash", "ahash", "colorhash"]
    value: str

    @field_validator("value")
    @classmethod
    def _hex(cls, v: str, info) -> str:
        # Accept only hex digits, enforce the per-algorithm length listed in
        # _HASH_HEXLEN (None or an unlisted algo means any length), and
        # normalise the stored value to lowercase.
        if _HEX_RE.match(v) is None:
            raise ValueError("hash value must be hex")
        algo = info.data.get("algo")
        wanted = _HASH_HEXLEN.get(algo)
        if wanted is None or len(v) == wanted:
            return v.lower()
        raise ValueError(f"expected {wanted} hex chars, got {len(v)}")
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class ArtifactRef(_Frozen):
    """A reference into the Envelope's artifact set by id (never a path)."""

    id: str

    @field_validator("id")
    @classmethod
    def _safe(cls, v: str) -> str:
        # Reject anything outside the safe-id alphabet / length window.
        if _SAFE_ID_RE.match(v) is None:
            raise ValueError("artifact id must match [A-Za-z0-9._-]{1,128}")
        return v
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class Detection(_Frozen):
    # Result of identifying the input.
    # label: 1-64 characters.
    label: str = Field(min_length=1, max_length=64)
    # mime: at most 255 characters (may be empty).
    mime: str = Field(max_length=255)
    # confidence: inclusive range 0.0 to 1.0.
    confidence: float = Field(ge=0.0, le=1.0)
    # source: 1-32 characters.
    source: str = Field(min_length=1, max_length=32)
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
class Warning(_Frozen):
    # NOTE: inside this module the class name shadows the builtin ``Warning``.
    # code: 1-64 characters.
    code: str = Field(min_length=1, max_length=64)
    # message: at most 2000 characters.
    message: str = Field(max_length=2000)
    # context: optional, at most 255 characters when present.
    context: str | None = Field(default=None, max_length=255)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Dimensions(_Frozen):
    # Width and height must both be strictly positive.
    width: float = Field(gt=0)
    height: float = Field(gt=0)
    # Unit the two numbers are expressed in.
    unit: Literal["mm", "px", "pt"]
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class Lang(_Frozen):
    # Language code string, 2-64 characters; no further format is enforced here.
    code: str = Field(min_length=2, max_length=64)
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Typed payload nodes: a recursive tree with a generic Record floor."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Annotated, Any, Callable, Literal, Union, get_args
|
|
5
|
+
|
|
6
|
+
from pydantic import BaseModel, BeforeValidator, ConfigDict, Field, TypeAdapter
|
|
7
|
+
|
|
8
|
+
from .leaf import ArtifactRef, Dimensions, Hash, Lang
|
|
9
|
+
|
|
10
|
+
Scalar = Union[str, int, float, bool, None]
|
|
11
|
+
# A Record field value is a scalar, a list of scalars, or a nested Record.
|
|
12
|
+
RecordValue = Union[Scalar, list[Scalar], "Record"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class _Node(BaseModel):
    # Common base for payload nodes: unknown keys are rejected, and fields
    # declared with an alias may also be populated by their attribute name.
    model_config = ConfigDict(populate_by_name=True, extra="forbid")
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def _parse_children(value: Any) -> Any:
    """Validate raw child dicts through the module's current node adapter.

    Used as the ``BeforeValidator`` of ``ChildList``. Looking up
    ``_NODE_ADAPTER`` at call time means children are parsed against whatever
    union the most recent ``rebuild_node_union()`` produced, including
    registered engine types.

    * A non-list *value* is returned untouched so the field's own validation
      reports the problem.
    * ``_Node`` instances are kept as they are.
    * ``dict`` items are validated with ``_NODE_ADAPTER``.
    * Any other item is passed through unchanged.
    """
    if not isinstance(value, list):
        return value
    if _NODE_ADAPTER is None:
        rebuild_node_union()
    adapter = _NODE_ADAPTER
    assert adapter is not None

    def _convert(item: Any) -> Any:
        # Only plain dicts need parsing; node instances and foreign values
        # are handed on as-is.
        if not isinstance(item, _Node) and isinstance(item, dict):
            return adapter.validate_python(item)
        return item

    return [_convert(item) for item in value]
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
# A children list whose items are parsed by the live registry-aware adapter
|
|
55
|
+
# (via _parse_children) rather than a static union. Typed list[Any] so pydantic
|
|
56
|
+
# does not re-narrow the already-validated _Node instances back to the 4 base
|
|
57
|
+
# types; max_length is still enforced because BeforeValidator runs first.
|
|
58
|
+
ChildList = Annotated[list[Any], BeforeValidator(_parse_children)]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
class Record(_Node):
    """The generic floor: a typed bag for engine data not worth a named type."""

    # Discriminator; carried under the key "_type" via the field alias.
    type: Literal["record"] = Field(default="record", alias="_type")
    # At most 4096 entries; each value is a scalar, a list of scalars, or a
    # nested Record (see RecordValue).
    fields: dict[str, RecordValue] = Field(default_factory=dict, max_length=4096)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class ExtractedText(_Node):
    # Discriminator; carried under the key "_type" via the field alias.
    type: Literal["extracted_text"] = Field(default="extracted_text", alias="_type")
    # Text content, capped at 10,000,000 characters.
    text: str = Field(max_length=10_000_000)
    # Non-negative count supplied by the producer; it is not cross-checked
    # against len(text) in this class.
    char_count: int = Field(ge=0)
    # Optional language tag.
    lang: Lang | None = None
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class Page(_Node):
    # Discriminator; carried under the key "_type" via the field alias.
    type: Literal["page"] = Field(default="page", alias="_type")
    # Non-negative page index.
    index: int = Field(ge=0)
    dims: Dimensions
    # Reference (by artifact id) to the page image.
    image: ArtifactRef
    # Up to 32 hash entries.
    hashes: list[Hash] = Field(default_factory=list, max_length=32)
    # Up to 10000 child nodes; dict items are parsed by _parse_children.
    children: ChildList = Field(default_factory=list, max_length=10000)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class EmbeddedResource(_Node):
    # Discriminator; carried under the key "_type" via the field alias.
    type: Literal["embedded_resource"] = Field(default="embedded_resource", alias="_type")
    # Path string of the embedded item, at most 4096 characters.
    embedded_path: str = Field(max_length=4096)
    # Content type string, at most 255 characters.
    content_type: str = Field(max_length=255)
    # Nesting depth, limited to the inclusive range 0-64.
    depth: int = Field(ge=0, le=64)
    # Optional free-form metadata record.
    metadata: Record | None = None
    # Up to 10000 child nodes; dict items are parsed by _parse_children.
    children: ChildList = Field(default_factory=list, max_length=10000)
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
# Base child union of the four built-in node types. Engine types are not added to this alias; rebuild_node_union() builds the live `Node` union that includes them.
|
|
93
|
+
ChildNode = Union[Page, EmbeddedResource, ExtractedText, Record]
|
|
94
|
+
|
|
95
|
+
# Module-level Node type and adapter; rebuilt by rebuild_node_union().
|
|
96
|
+
Node: Any = Annotated[ChildNode, Field(discriminator="type")]
|
|
97
|
+
_NODE_ADAPTER: TypeAdapter[Any] | None = None
|
|
98
|
+
|
|
99
|
+
_ENGINE_NODE_TYPES: list[type[_Node]] = []
|
|
100
|
+
|
|
101
|
+
# Callbacks invoked after every rebuild_node_union() — allows envelope and
|
|
102
|
+
# other modules to re-bind their own models to the live union without
|
|
103
|
+
# introducing a circular top-level import (they register lazily at their
|
|
104
|
+
# own module-init time).
|
|
105
|
+
_REBUILD_CALLBACKS: list[Callable[[], None]] = []
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def rebuild_node_union() -> None:
    """Recompute ``Node`` and ``_NODE_ADAPTER`` from built-in + engine types.

    Every member model is rebuilt first, then the discriminated union (keyed
    on ``type``) and its ``TypeAdapter`` replace the module globals. Finally
    each callable in ``_REBUILD_CALLBACKS`` is invoked, in list order.
    """
    global _NODE_ADAPTER, Node

    node_types: tuple[type[_Node], ...] = (
        Page, EmbeddedResource, ExtractedText, Record, *_ENGINE_NODE_TYPES
    )
    for node_cls in node_types:
        node_cls.model_rebuild()

    # Union[...] accepts the tuple of classes directly as its members.
    combined: Any = (
        Union[node_types]  # type: ignore[arg-type]
        if len(node_types) > 1
        else node_types[0]
    )
    Node = Annotated[combined, Field(discriminator="type")]
    _NODE_ADAPTER = TypeAdapter(Node)

    for hook in _REBUILD_CALLBACKS:
        hook()
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def parse_node(data: dict[str, Any]) -> ChildNode:
    """Validate *data* with the current node adapter and return the node.

    The adapter is built on demand if it does not exist yet.
    """
    if _NODE_ADAPTER is None:
        rebuild_node_union()
    adapter = _NODE_ADAPTER
    assert adapter is not None
    return adapter.validate_python(data)  # type: ignore[return-value]
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def _discriminator_value(cls: type[_Node]) -> Any:
    """Return the discriminator value declared on *cls*'s ``type`` field.

    Returns ``None`` when the class has no ``type`` field. Otherwise the first
    type argument of the field annotation is used (e.g. the value inside a
    ``Literal[...]``); if the annotation has no arguments, the field default
    is returned instead.
    """
    type_field = cls.model_fields.get("type")
    if type_field is None:
        return None
    literal_args = get_args(type_field.annotation)
    if literal_args:
        return literal_args[0]
    return type_field.default
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def register_node_type(cls: type[_Node]) -> type[_Node]:
    """Register an engine-specific node subclass into the parse union.

    The class is expected to carry a Literal ``type`` discriminator. After a
    successful registration the union is rebuilt so ``parse_node()`` accepts
    the new type.

    * Registering a class that is already present leaves the registry
      unchanged; the union is still rebuilt.
    * Registering a class whose discriminator value equals that of an
      already-registered engine type replaces that entry in place, so two
      engine classes never share one discriminator inside the union.
    * If the rebuild raises, the registry is restored to its previous
      contents and rebuilt again before the error propagates, so one bad
      registration does not leave later rebuilds failing.

    Returns:
        *cls*, which allows use as a class decorator.

    Raises:
        Whatever ``rebuild_node_union()`` raises for a union that cannot be
        built with *cls* in it.
    """
    if cls in _ENGINE_NODE_TYPES:
        rebuild_node_union()
        return cls

    disc = _discriminator_value(cls)
    # Snapshot taken before mutating, used for rollback below.
    previous = list(_ENGINE_NODE_TYPES)
    for i, existing in enumerate(_ENGINE_NODE_TYPES):
        if _discriminator_value(existing) == disc:
            _ENGINE_NODE_TYPES[i] = cls
            break
    else:
        _ENGINE_NODE_TYPES.append(cls)

    try:
        rebuild_node_union()
    except Exception:
        # Undo the registration in place (other references to the list stay
        # valid) and restore the previously working union and adapter.
        _ENGINE_NODE_TYPES[:] = previous
        rebuild_node_union()
        raise
    return cls
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
# Bootstrap: rebuild after all forward refs are defined.
|
|
172
|
+
rebuild_node_union()
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Generic, engine-agnostic walkers over the typed payload tree."""
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
|
|
4
|
+
from typing import Iterator, TypeVar
|
|
5
|
+
|
|
6
|
+
_T = TypeVar("_T")
|
|
7
|
+
|
|
8
|
+
_MAX_DEPTH = 128


def iter_nodes(root, *, _max_depth: int = _MAX_DEPTH) -> Iterator[object]:
    """Yield *root* followed by all of its descendants in pre-order.

    Descendants are discovered through each node's ``children`` attribute; a
    missing or empty attribute marks a leaf. The walk uses an explicit stack
    instead of recursion.

    Raises:
        ValueError: when a node sits more than *_max_depth* levels below the
            root (the root is level 0). Because this is a generator, the
            error surfaces during iteration.
    """
    pending: list[tuple[object, int]] = [(root, 0)]
    while pending:
        current, level = pending.pop()
        if level > _max_depth:
            raise ValueError(
                f"payload tree exceeds maximum nesting depth of {_max_depth}"
            )
        yield current
        kids = getattr(current, "children", None)
        if not kids:
            continue
        # Push right-to-left so the leftmost child is popped (visited) first.
        for kid in reversed(list(kids)):
            pending.append((kid, level + 1))
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def find_by_type(root, node_type: type[_T]) -> list[_T]:
    """Collect, in ``iter_nodes`` order, every node that is a *node_type* instance.

    Matching uses ``isinstance``, so instances of subclasses are included.
    """
    matches: list[_T] = []
    for candidate in iter_nodes(root):
        if isinstance(candidate, node_type):
            matches.append(candidate)
    return matches
|
blastbox/errors.py
ADDED
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
"""Exception hierarchy for the blastbox framework."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import re
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
# Strip internal filesystem paths from public-facing error messages.
|
|
9
|
+
# Root-agnostic: match any absolute POSIX path of two or more segments
|
|
10
|
+
# (``/var/lib/blastbox/x``, ``/etc/passwd``, ``/proc/self/mem``) rather
|
|
11
|
+
# than denylisting specific roots. A single ``/`` between words (``and/or``)
|
|
12
|
+
# is not a path and is left untouched because it lacks a trailing segment
|
|
13
|
+
# separator.
|
|
14
|
+
_INTERNAL_PATH_RE = re.compile(r"/(?:[A-Za-z0-9._+-]+/)+[A-Za-z0-9._+-]*")


def sanitize_public_error(msg: str) -> str:
    """Return *msg* with every ``_INTERNAL_PATH_RE`` match replaced by ``<path>``.

    A match needs a leading slash followed by at least one ``segment/`` group,
    so ``/etc/passwd`` is replaced while ``and/or`` and a bare ``/tmp`` are
    left alone.
    """
    redacted = re.sub(_INTERNAL_PATH_RE, "<path>", msg)
    return redacted
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class BlastboxError(Exception):
    """Root of the blastbox exception hierarchy."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class DetectionError(BlastboxError):
    """Raised when the detector rejects an input.

    The exception message is ``reason`` alone, or ``"reason: detail"`` when a
    non-empty ``detail`` is given. Both values stay available as attributes.
    """

    def __init__(self, reason: str, detail: str = "") -> None:
        text = reason if not detail else f"{reason}: {detail}"
        super().__init__(text)
        self.reason = reason
        self.detail = detail
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
class SandboxError(BlastboxError):
    """Raised when setting up or running the sandbox fails."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class SandboxTimeout(SandboxError):
    """Raised when a sandboxed process runs past its wall-clock timeout."""
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
class SandboxUnavailable(SandboxError):
    """Raised when this host offers no usable sandbox backend."""
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class EngineError(BlastboxError):
    """Failure specific to an engine (for example LibreOffice or Tika)."""
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class DetonationError(BlastboxError):
    """Top-level detonation failure that can carry the exception behind it.

    A supplied ``cause`` is stored on ``self.cause`` and also assigned to
    ``__cause__``, the same attribute ``raise ... from cause`` sets.
    """

    def __init__(self, message: str, cause: Exception | None = None) -> None:
        super().__init__(message)
        self.cause = cause
        if cause is None:
            return
        self.__cause__ = cause
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
# Keep ConversionError as an alias so engines can use the familiar name
|
|
62
|
+
# without importing engine-specific terms into the generic layer.
|
|
63
|
+
ConversionError = DetonationError
|
|
64
|
+
|
|
65
|
+
|
|
66
|
+
class ValidationError(BlastboxError):
    """Raised when envelope or metadata validation fails."""
|
|
68
|
+
|
|
69
|
+
|
|
70
|
+
class OutputTrustError(BlastboxError):
    """Raised when worker output does not pass host-side trust validation."""
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class WarmTimeout(BlastboxError):
    """Raised when a warm worker slot sees no job before its idle timeout ends."""
|
|
File without changes
|