datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,345 @@
|
|
|
1
|
+
"""Cross-field spec validation (04 §9).
|
|
2
|
+
|
|
3
|
+
Pydantic handles shape/type. This module enforces semantic rules that span
|
|
4
|
+
multiple parts of the spec — references, acyclicity, derived-vs-sampled
|
|
5
|
+
consistency — raising :class:`SpecValidationError` with a ``locator`` pointing
|
|
6
|
+
at the offending control.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from ..errors import SpecValidationError
|
|
12
|
+
from .models import (
|
|
13
|
+
BooleanFeature,
|
|
14
|
+
CategoricalFeature,
|
|
15
|
+
NumericFeature,
|
|
16
|
+
Spec,
|
|
17
|
+
TextFeature,
|
|
18
|
+
TimeseriesFeature,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
# Spec-format versions this validator accepts; `_check_version` rejects any
# `datadoom_version` that is not a member of this set.
SUPPORTED_SPEC_VERSIONS = {"1"}
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def validate_spec(spec: Spec) -> None:
    """Apply every cross-field check to ``spec``, stopping at the first failure.

    The checks run in a fixed order: version, features, causal, difficulty,
    failures, export. Each one raises on a violation, so later checks are
    skipped once one fails.
    """
    ordered_checks = (
        _check_version,
        _check_features,
        _check_causal,
        _check_difficulty,
        _check_failures,
        _check_export,
    )
    for check in ordered_checks:
        check(spec)
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
def _check_version(spec: Spec) -> None:
    """Reject a spec whose ``datadoom_version`` is not in the supported set."""
    if spec.datadoom_version in SUPPORTED_SPEC_VERSIONS:
        return
    raise SpecValidationError(
        f"unsupported datadoom_version {spec.datadoom_version!r} "
        f"(supported: {sorted(SUPPORTED_SPEC_VERSIONS)})",
        locator="datadoom_version",
    )
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def _check_features(spec: Spec) -> None:
    """Validate every declared feature against the rules for its type.

    * numeric: ``min <= max``; a named ``dist`` must exist in the registry and
      accept the feature's params; a feature without ``dist`` must be the
      target of a causal edge.
    * categorical: ``weights`` (when given) must be as long as ``categories``.
    * text: the generator must be ``"lorem"`` or a realistic generator; the
      locale is passed to ``resolve_locale`` (presumably raising on an unknown
      locale — confirm in ``dist.providers``).
    * time series: ``min <= max``; the AR coefficients must satisfy
      ``sum(|φ|) < 1``.

    Raises:
        SpecValidationError: on the first violation, with a locator under
            ``features.<name>``.
    """
    # Imported lazily so loading this module does not pull in the dist layer.
    from ..dist.builtins import REGISTRY
    from ..dist.providers import is_realistic_generator, resolve_locale

    causal_targets = _derived_names(spec)
    for feat_name, feature in spec.features.items():
        where = f"features.{feat_name}"

        if isinstance(feature, NumericFeature):
            bounded = feature.min is not None and feature.max is not None
            if bounded and feature.min > feature.max:
                raise SpecValidationError("min > max", locator=where)
            if feature.dist is None:
                # Neither sampled nor produced by the causal layer.
                if feat_name not in causal_targets:
                    raise SpecValidationError(
                        f"numeric feature {feat_name!r} has no 'dist' and is not a causal target "
                        "(it cannot be sampled or derived)",
                        locator=where,
                    )
                continue
            sampler = REGISTRY.get(feature.dist)
            if sampler is None:
                raise SpecValidationError(
                    f"unknown distribution {feature.dist!r}", locator=f"{where}.dist"
                )
            sampler.validate(feature.params, locator=f"{where}.params")
            continue

        if isinstance(feature, CategoricalFeature):
            weights = feature.weights
            if weights is not None and len(weights) != len(feature.categories):
                raise SpecValidationError(
                    "weights length must match categories length", locator=f"{where}.weights"
                )
            continue

        if isinstance(feature, TextFeature):
            recognised = feature.generator == "lorem" or is_realistic_generator(
                feature.generator
            )
            if not recognised:
                raise SpecValidationError(
                    f"unknown text generator {feature.generator!r}", locator=f"{where}.generator"
                )
            resolve_locale(feature.locale, locator=f"{where}.locale")
            continue

        if isinstance(feature, TimeseriesFeature):
            bounded = feature.min is not None and feature.max is not None
            if bounded and feature.min > feature.max:
                raise SpecValidationError("min > max", locator=where)
            # Σ|φᵢ| < 1 is a conservative sufficient condition for a bounded
            # AR recursion; anything else is rejected as non-stationary.
            if feature.ar:
                abs_total = sum(abs(c) for c in feature.ar)
                if abs_total >= 1.0:
                    raise SpecValidationError(
                        "time-series AR is non-stationary: sum(|ar coefficients|) must be "
                        f"< 1 (got {abs_total:.3f})",
                        locator=f"{where}.ar",
                    )
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _derived_names(spec: Spec) -> set[str]:
|
|
94
|
+
if spec.causal is None:
|
|
95
|
+
return set()
|
|
96
|
+
return {edge.dst for edge in spec.causal.edges}
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
def _check_causal(spec: Spec) -> None:
    """Validate the optional ``causal`` section: edges, targets, noise, interventions.

    Does nothing when ``spec.causal`` is ``None``. Otherwise the checks run in
    this order and stop at the first violation:

    1. every edge references declared features, names a known structural
       function, passes that function's own ``validate``, and has a parent of a
       compatible type (``map`` needs a categorical parent with a mapping for
       every category; all other functions need a numeric, boolean or
       time-series parent);
    2. every edge target is numeric or boolean, and a numeric target has no
       ``dist`` of its own;
    3. every noise entry names a declared feature and a known distribution
       (or ``None`` / ``"none"``);
    4. every intervention's ``do`` keys are declared features;
    5. the edge graph is acyclic.

    Raises:
        SpecValidationError: on the first violation, with a locator pointing
            at the offending part of the spec.
    """
    if spec.causal is None:
        return
    # Lazy imports keep the dist/causal layers optional at module load time.
    from ..causal.functions import STRUCTURAL_FNS
    from ..dist.builtins import REGISTRY

    feature_names = set(spec.features)
    targets: set[str] = set()
    # Every feature is a graph node, including ones that no edge touches.
    adjacency: dict[str, list[str]] = {n: [] for n in feature_names}

    for i, edge in enumerate(spec.causal.edges):
        loc = f"causal.edges[{i}]"
        if edge.src not in feature_names:
            raise SpecValidationError(f"unknown 'from' feature {edge.src!r}", locator=loc)
        if edge.dst not in feature_names:
            raise SpecValidationError(f"unknown 'to' feature {edge.dst!r}", locator=loc)
        fn = STRUCTURAL_FNS.get(edge.fn)
        if fn is None:
            raise SpecValidationError(
                f"unknown structural function {edge.fn!r}", locator=f"{loc}.fn"
            )
        # Function-specific parameter checks are delegated to the function itself.
        fn.validate(edge, locator=loc)
        # The structural function must be compatible with the parent's type, or
        # execution would hit a raw coercion error. `map` consumes a categorical
        # parent; the numeric fns need a numeric/boolean (float-coercible) parent.
        parent = spec.features[edge.src]
        if edge.fn == "map":
            if not isinstance(parent, CategoricalFeature):
                raise SpecValidationError(
                    f"map edge requires a categorical 'from' feature; {edge.src!r} is "
                    f"type {parent.type!r}",
                    locator=f"{loc}.fn",
                )
            # A missing (None) mapping is treated as empty, so every category is reported.
            missing = [c for c in parent.categories if c not in (edge.mapping or {})]
            if missing:
                raise SpecValidationError(
                    f"map edge is missing mappings for categories {missing}",
                    locator=f"{loc}.mapping",
                )
        elif not isinstance(parent, (NumericFeature, BooleanFeature, TimeseriesFeature)):
            raise SpecValidationError(
                f"{edge.fn!r} edge requires a numeric/boolean 'from' feature; {edge.src!r} "
                f"is type {parent.type!r} (use fn 'map' for categorical parents)",
                locator=f"{loc}.fn",
            )
        # Only fully validated edges enter the graph used for the cycle check.
        adjacency[edge.src].append(edge.dst)
        targets.add(edge.dst)

    for name in targets:
        feat = spec.features[name]
        # A feature that is both directly sampled and a causal target is ambiguous.
        if isinstance(feat, NumericFeature) and feat.dist is not None:
            raise SpecValidationError(
                f"feature {name!r} is both sampled (has dist) and derived (causal target)",
                locator=f"features.{name}",
            )
        # Only numeric/boolean targets can be derived by the SEM.
        if not isinstance(feat, (NumericFeature, BooleanFeature)):
            raise SpecValidationError(
                f"causal target {name!r} has type {feat.type!r}; only numeric and boolean "
                "features can be derived",
                locator=f"features.{name}",
            )

    # Per-node noise must reference a known distribution (or be 'none').
    for node, noise in spec.causal.noise.items():
        loc = f"causal.noise.{node}"
        if node not in feature_names:
            raise SpecValidationError(f"noise references unknown feature {node!r}", locator=loc)
        dist_name = noise.get("dist")
        if dist_name not in (None, "none") and dist_name not in REGISTRY:
            raise SpecValidationError(
                f"unknown noise distribution {dist_name!r}", locator=f"{loc}.dist"
            )

    # Interventions must reference real features.
    for i, item in enumerate(spec.causal.interventions):
        # An intervention without a "do" key contributes nothing to check.
        do = item.get("do", {})
        for node in do:
            if node not in feature_names:
                raise SpecValidationError(
                    f"intervention references unknown feature {node!r}",
                    locator=f"causal.interventions[{i}].do",
                )

    # Acyclicity is checked last, once every edge is known to be well-formed.
    _reject_cycles(adjacency)
|
|
186
|
+
|
|
187
|
+
|
|
188
|
+
def _reject_cycles(adjacency: dict[str, list[str]]) -> None:
|
|
189
|
+
"""Kahn's algorithm; if not all nodes are emitted, a cycle exists."""
|
|
190
|
+
indegree = {n: 0 for n in adjacency}
|
|
191
|
+
for _, children in sorted(adjacency.items()):
|
|
192
|
+
for child in children:
|
|
193
|
+
indegree[child] += 1
|
|
194
|
+
queue = sorted(n for n, d in indegree.items() if d == 0)
|
|
195
|
+
emitted = 0
|
|
196
|
+
while queue:
|
|
197
|
+
node = queue.pop(0)
|
|
198
|
+
emitted += 1
|
|
199
|
+
for child in sorted(adjacency[node]):
|
|
200
|
+
indegree[child] -= 1
|
|
201
|
+
if indegree[child] == 0:
|
|
202
|
+
queue.append(child)
|
|
203
|
+
queue.sort()
|
|
204
|
+
if emitted != len(adjacency):
|
|
205
|
+
raise SpecValidationError("causal graph is not acyclic", locator="causal.edges")
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _check_difficulty(spec: Spec) -> None:
|
|
209
|
+
if spec.difficulty is None:
|
|
210
|
+
return
|
|
211
|
+
# Lazy imports keep the difficulty layer optional at module load time.
|
|
212
|
+
from ..difficulty import PROBES, TIER_BANDS
|
|
213
|
+
from ..difficulty.knobs import ACTIVE_KNOBS
|
|
214
|
+
|
|
215
|
+
cfg = spec.difficulty
|
|
216
|
+
|
|
217
|
+
# The probe predicts the label; it must be a declared, classification-able
|
|
218
|
+
# feature. v0.1 calibrates binary-classification AUROC only.
|
|
219
|
+
label_feat = spec.features.get(cfg.label)
|
|
220
|
+
if label_feat is None:
|
|
221
|
+
raise SpecValidationError(
|
|
222
|
+
f"difficulty.label {cfg.label!r} is not a declared feature",
|
|
223
|
+
locator="difficulty.label",
|
|
224
|
+
)
|
|
225
|
+
if label_feat.emit is False:
|
|
226
|
+
raise SpecValidationError(
|
|
227
|
+
f"difficulty.label {cfg.label!r} is latent (emit: false) and is not shipped; "
|
|
228
|
+
"the probe can only predict an observable label",
|
|
229
|
+
locator="difficulty.label",
|
|
230
|
+
)
|
|
231
|
+
if isinstance(label_feat, BooleanFeature):
|
|
232
|
+
pass
|
|
233
|
+
elif isinstance(label_feat, CategoricalFeature):
|
|
234
|
+
if len(label_feat.categories) != 2:
|
|
235
|
+
raise SpecValidationError(
|
|
236
|
+
f"difficulty.label {cfg.label!r} is categorical with "
|
|
237
|
+
f"{len(label_feat.categories)} classes; v0.1 calibrates binary "
|
|
238
|
+
"classification only (use a boolean or 2-class categorical label)",
|
|
239
|
+
locator="difficulty.label",
|
|
240
|
+
)
|
|
241
|
+
else:
|
|
242
|
+
raise SpecValidationError(
|
|
243
|
+
f"difficulty.label {cfg.label!r} has type {label_feat.type!r}; v0.1 "
|
|
244
|
+
"supports binary-classification targets only (boolean or 2-class "
|
|
245
|
+
"categorical)",
|
|
246
|
+
locator="difficulty.label",
|
|
247
|
+
)
|
|
248
|
+
|
|
249
|
+
if cfg.probe not in PROBES:
|
|
250
|
+
raise SpecValidationError(
|
|
251
|
+
f"unknown difficulty probe {cfg.probe!r} (known: {sorted(PROBES)})",
|
|
252
|
+
locator="difficulty.probe",
|
|
253
|
+
)
|
|
254
|
+
|
|
255
|
+
# Target is either a named tier or an explicit band dict.
|
|
256
|
+
if isinstance(cfg.target, str):
|
|
257
|
+
if cfg.target not in TIER_BANDS:
|
|
258
|
+
raise SpecValidationError(
|
|
259
|
+
f"unknown difficulty tier {cfg.target!r} (known: {sorted(TIER_BANDS)})",
|
|
260
|
+
locator="difficulty.target",
|
|
261
|
+
)
|
|
262
|
+
elif isinstance(cfg.target, dict):
|
|
263
|
+
band = cfg.target.get("band")
|
|
264
|
+
if not (isinstance(band, (list, tuple)) and len(band) == 2):
|
|
265
|
+
raise SpecValidationError(
|
|
266
|
+
"difficulty.target must name a tier or carry a 'band': [lo, hi]",
|
|
267
|
+
locator="difficulty.target.band",
|
|
268
|
+
)
|
|
269
|
+
lo, hi = float(band[0]), float(band[1])
|
|
270
|
+
if not (0.0 <= lo <= hi <= 1.0):
|
|
271
|
+
raise SpecValidationError(
|
|
272
|
+
f"difficulty band must satisfy 0 <= lo <= hi <= 1 (got [{lo}, {hi}])",
|
|
273
|
+
locator="difficulty.target.band",
|
|
274
|
+
)
|
|
275
|
+
metric = cfg.target.get("metric", "auroc")
|
|
276
|
+
if metric != "auroc":
|
|
277
|
+
raise SpecValidationError(
|
|
278
|
+
f"unsupported difficulty metric {metric!r}; v0.1 supports 'auroc'",
|
|
279
|
+
locator="difficulty.target.metric",
|
|
280
|
+
)
|
|
281
|
+
else:
|
|
282
|
+
raise SpecValidationError(
|
|
283
|
+
"difficulty.target must be a tier name or an explicit-band object",
|
|
284
|
+
locator="difficulty.target",
|
|
285
|
+
)
|
|
286
|
+
|
|
287
|
+
# Only the actively-implemented knobs are accepted — no silently-ignored
|
|
288
|
+
# config. `causal` shrink and `imbalance` are planned (see status.md backlog).
|
|
289
|
+
unknown = [k for k in cfg.knobs if k not in ACTIVE_KNOBS]
|
|
290
|
+
if unknown:
|
|
291
|
+
raise SpecValidationError(
|
|
292
|
+
f"unsupported difficulty knob(s) {unknown}; v0.1 implements "
|
|
293
|
+
f"{list(ACTIVE_KNOBS)} (causal shrink and imbalance are planned)",
|
|
294
|
+
locator="difficulty.knobs",
|
|
295
|
+
)
|
|
296
|
+
|
|
297
|
+
|
|
298
|
+
def _check_failures(spec: Spec) -> None:
    """Validate each entry of ``spec.failures``.

    Every failure must name a known failure mode and pass that mode's own
    ``validate``. In addition, no parameter value (or element of a list-valued
    parameter) may be the name of a latent feature: failures are applied to the
    shipped frame, from which latent columns have already been dropped.

    Raises:
        SpecValidationError: on the first violation, located at
            ``failures[i]`` or ``failures[i].type``.
    """
    # Lazy import keeps the failure layer optional at module load time.
    from ..failure.modes import FAILURE_MODES

    hidden = spec.latent_names()
    for index, failure in enumerate(spec.failures):
        where = f"failures[{index}]"
        handler = FAILURE_MODES.get(failure.type)
        if handler is None:
            raise SpecValidationError(
                f"unknown failure type {failure.type!r} "
                f"(known: {sorted(FAILURE_MODES)})",
                locator=f"{where}.type",
            )

        # Everything except the discriminator is a parameter of the mode.
        options = {key: val for key, val in failure.model_dump().items() if key != "type"}
        handler.validate(options, spec.features, where)

        # Scan scalar and list-valued parameters alike for latent references.
        for value in tuple(options.values()):
            candidates = value if isinstance(value, list) else [value]
            offender = next(
                (c for c in candidates if isinstance(c, str) and c in hidden),
                None,
            )
            if offender is not None:
                raise SpecValidationError(
                    f"failure references latent feature {offender!r} (emit: false), "
                    "which is not shipped and cannot be corrupted",
                    locator=where,
                )
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
def _check_export(spec: Spec) -> None:
    """Validate the export section: known formats and split ratios.

    Each requested format must be a registered exporter, and ``splits`` (when
    given) must have ratios summing to 1.0 within an absolute tolerance of 1e-9.

    Raises:
        SpecValidationError: located at ``export.formats`` or ``export.splits``.
    """
    from ..export import EXPORTERS

    for fmt in spec.export.formats:
        if fmt in EXPORTERS:
            continue
        known = ", ".join(sorted(EXPORTERS))
        raise SpecValidationError(
            f"unknown export format {fmt!r}; known formats: {known}",
            locator="export.formats",
        )

    ratios = spec.export.splits
    if ratios is None:
        return
    total = sum(ratios.values())
    if abs(total - 1.0) > 1e-9:
        raise SpecValidationError(
            f"export.splits ratios must sum to 1.0 (got {total})", locator="export.splits"
        )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Time-series generation — additive decomposition (05 §6).
|
|
2
|
+
|
|
3
|
+
A time-series feature realizes an ordered series over the row index ``t = 0 … n−1``:
|
|
4
|
+
|
|
5
|
+
Xₜ = T(t) + S(t) + AR(p) + εₜ
|
|
6
|
+
T(t) = slope·t + intercept # linear trend
|
|
7
|
+
S(t) = Σ Aⱼ·sin(2π·t/periodⱼ + phaseⱼ) # (multi-)seasonality
|
|
8
|
+
AR(p): Yₜ = Σ_{i=1}^p φᵢ·Y_{t−i} + εₜ # autoregressive residual
|
|
9
|
+
εₜ ~ Normal(0, σ²) from RNG(noise:<series>)
|
|
10
|
+
|
|
11
|
+
The deterministic mean ``T(t)+S(t)`` is vectorised; the AR residual is an inherent
|
|
12
|
+
sequential recursion (each term depends on its predecessors) seeded with
|
|
13
|
+
``Y_{t<0} = 0`` — a fixed, reproducible warm-start (no hidden burn-in draws). With
|
|
14
|
+
no ``ar`` coefficients the residual is plain i.i.d. noise ``εₜ``.
|
|
15
|
+
|
|
16
|
+
This module is pure and framework-free; all randomness flows through the injected
|
|
17
|
+
``numpy.random.Generator`` so the series is byte-reproducible on the pinned path.
|
|
18
|
+
Multivariate / hierarchical series are deferred (plugin / post-v1), per 05 §6.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
from collections.abc import Sequence
|
|
24
|
+
from dataclasses import dataclass
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
@dataclass(frozen=True)
class Trend:
    """Linear trend component ``slope·t + intercept`` of a time series."""

    # Change in the trend per unit of the row index t.
    slope: float = 0.0
    # Value of the trend at t = 0.
    intercept: float = 0.0
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@dataclass(frozen=True)
class Seasonality:
    """One sinusoidal component ``amplitude·sin(2π·t/period + phase)``."""

    # Scale factor applied to the sine term.
    amplitude: float
    # Cycle length in row-index units; used as a divisor by generate_series.
    period: float
    # Offset added inside the sine, in radians.
    phase: float = 0.0
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def generate_series(
    rng: np.random.Generator,
    n: int,
    *,
    trend: Trend | None = None,
    seasonality: Sequence[Seasonality] = (),
    ar: Sequence[float] = (),
    noise_std: float = 1.0,
) -> np.ndarray:
    """Build one additive series ``trend + seasonality + residual`` of length ``n``.

    Args:
        rng: generator that supplies the Gaussian innovations.
        n: number of points (rows) to produce.
        trend: optional linear component ``slope·t + intercept``.
        seasonality: sinusoidal components; their contributions are summed.
        ar: coefficients ``[φ₁ … φ_p]`` of the autoregressive residual. When
            empty the residual is the raw innovations.
        noise_std: standard deviation of the innovations. A value ``<= 0``
            yields all-zero innovations and draws nothing from ``rng``.

    Returns:
        A float ``ndarray`` of length ``n``, ordered by row index.
    """
    steps = np.arange(n, dtype=float)

    # Deterministic mean: linear trend plus every seasonal sinusoid.
    baseline = np.zeros(n, dtype=float)
    if trend is not None:
        baseline += trend.slope * steps + trend.intercept
    for component in seasonality:
        angle = 2.0 * np.pi * steps / component.period + component.phase
        baseline += component.amplitude * np.sin(angle)

    if noise_std > 0:
        innovations = rng.normal(0.0, noise_std, size=n)
    else:
        innovations = np.zeros(n, dtype=float)

    phis = list(ar)
    if not phis:
        return baseline + innovations

    # AR(p) recursion with Y_{t<0} = 0: lags that reach before the start of the
    # series are skipped, and terms are accumulated lag by lag so the floating
    # point result does not depend on any vectorised summation order.
    residual = np.zeros(n, dtype=float)
    for idx in range(n):
        total = innovations[idx]
        for lag, phi in enumerate(phis[:idx], start=1):
            total += phi * residual[idx - lag]
        residual[idx] = total
    return baseline + residual
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Job execution layer (03 §3.3).
|
|
2
|
+
|
|
3
|
+
The default backend is an **in-process** thread-pool worker that drives
|
|
4
|
+
``engine.pipeline`` and fans out progress events to the WebSocket hub. It may
|
|
5
|
+
import ``engine`` and ``store`` but nothing from ``api``/``cli`` (the hub lives
|
|
6
|
+
here so ``api`` can subscribe without ``jobs`` reaching upward).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from .progress import EventHub, HubProgressEmitter, RunCancelled
|
|
12
|
+
from .worker import WorkerPool
|
|
13
|
+
|
|
14
|
+
# Public API of the jobs package, re-exported from `.progress` and `.worker`.
__all__ = ["EventHub", "HubProgressEmitter", "RunCancelled", "WorkerPool"]
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
"""Progress fan-out: engine ``ProgressEmitter`` -> WebSocket hub (08 §7).
|
|
2
|
+
|
|
3
|
+
The :class:`EventHub` is an in-process async pub/sub keyed by ``run_id``. The
|
|
4
|
+
worker thread publishes events; the API's WebSocket/SSE handlers subscribe. The
|
|
5
|
+
hub keeps a per-run **replay buffer** so a late subscriber (the browser opening
|
|
6
|
+
the tracker after the run started) receives the stages so far, then live updates.
|
|
7
|
+
|
|
8
|
+
Cross-thread safety: the worker runs in a thread pool, so ``publish`` marshals
|
|
9
|
+
queue writes onto the API event loop via ``call_soon_threadsafe``. With no loop
|
|
10
|
+
registered (library/test use) it still records history and serves replay.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import asyncio
|
|
16
|
+
import contextlib
|
|
17
|
+
import threading
|
|
18
|
+
from collections.abc import AsyncIterator
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
from datadoom.engine.progress import ProgressEmitter
|
|
22
|
+
|
|
23
|
+
# Canonical pipeline stages (08 §7 / 03 §4). Optional stages appear only when the
# spec enables them; in P1 the engine emits the headless subset.
# NOTE(review): presumably listed in execution order — confirm against
# engine.pipeline.
CANONICAL_STAGES = (
    "intake",
    "snapshot",
    "seed",
    "base_generation",
    "causal",
    "failure_injection",
    "difficulty",
    "compliance",
    "packaging",
)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
class RunCancelled(Exception):
    """Raised inside the pipeline when a cooperative cancel was requested.

    ``HubProgressEmitter.emit`` raises it, with the current stage name as the
    argument, when the run's cancel flag is set.
    """
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
# A progress event: a plain dict. The hub reads its "type" key to recognise
# terminal events ("completed", "failed", "cancelled").
Event = dict[str, Any]
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class EventHub:
    """Per-run pub/sub with replay. One instance per server process.

    Events are grouped by ``run_id``. Every published event is appended to that
    run's history, so a subscriber that arrives late first receives everything
    published so far and then live events. Shared state is guarded by a
    ``threading.Lock``; delivery to subscriber queues is marshalled onto the
    bound event loop with ``call_soon_threadsafe``, so ``publish`` may be called
    from a worker thread.

    Nothing in this class evicts history or cancel flags; they live as long as
    the hub does.
    """

    # Event types that end a run's stream.
    _TERMINAL_TYPES = frozenset({"completed", "failed", "cancelled"})

    def __init__(self) -> None:
        self._subscribers: dict[str, set[asyncio.Queue[Event]]] = {}
        self._history: dict[str, list[Event]] = {}
        self._terminal: dict[str, bool] = {}
        self._cancels: dict[str, threading.Event] = {}
        self._lock = threading.Lock()
        self._loop: asyncio.AbstractEventLoop | None = None

    # --- loop wiring (called once by the API on startup) --------------------------
    def bind_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Register the event loop on which subscriber queues are filled."""
        self._loop = loop

    # --- cancellation -------------------------------------------------------------
    def cancel_flag(self, run_id: str) -> threading.Event:
        """Return the cancel flag for ``run_id``, creating it on first use."""
        with self._lock:
            return self._cancels.setdefault(run_id, threading.Event())

    def request_cancel(self, run_id: str) -> None:
        """Set the cancel flag for ``run_id``."""
        self.cancel_flag(run_id).set()

    def is_cancelled(self, run_id: str) -> bool:
        """Report whether a cancel was requested for ``run_id``.

        A run without a flag is simply not cancelled; polling does not create
        one, so the flag table only grows for runs that were actually handed a
        flag or cancelled.
        """
        with self._lock:
            flag = self._cancels.get(run_id)
        return flag is not None and flag.is_set()

    # --- publishing ---------------------------------------------------------------
    def publish(self, run_id: str, event: Event) -> None:
        """Record ``event`` in the run's history and fan it out to subscribers."""
        with self._lock:
            self._history.setdefault(run_id, []).append(event)
            if event.get("type") in self._TERMINAL_TYPES:
                self._terminal[run_id] = True
            # Snapshot under the lock; deliver outside it.
            queues = list(self._subscribers.get(run_id, ()))
        for q in queues:
            self._enqueue(q, event)

    def _enqueue(self, q: asyncio.Queue[Event], event: Event) -> None:
        """Schedule ``event`` onto ``q`` via the bound loop, if there is one."""
        loop = self._loop
        if loop is None:
            # No API loop (library/test); history still records for replay.
            return
        # Best effort: scheduling raises RuntimeError once the loop is closed.
        with contextlib.suppress(RuntimeError):
            loop.call_soon_threadsafe(q.put_nowait, event)

    # --- subscription -------------------------------------------------------------
    async def subscribe(self, run_id: str) -> AsyncIterator[Event]:
        """Yield replay of events so far, then live events until a terminal one.

        The history snapshot and the queue registration happen under one lock
        acquisition, so an event is either in the replay or delivered live,
        never both and never neither. If the run already ended, the generator
        returns right after the replay.
        """
        q: asyncio.Queue[Event] = asyncio.Queue()
        with self._lock:
            replay = list(self._history.get(run_id, ()))
            already_terminal = self._terminal.get(run_id, False)
            self._subscribers.setdefault(run_id, set()).add(q)
        try:
            for ev in replay:
                yield ev
            if already_terminal:
                return
            while True:
                ev = await q.get()
                yield ev
                if ev.get("type") in self._TERMINAL_TYPES:
                    return
        finally:
            with self._lock:
                subs = self._subscribers.get(run_id)
                if subs is not None:
                    subs.discard(q)
                    # Drop the per-run entry with its last subscriber so the
                    # table does not keep an empty set for every run watched.
                    if not subs:
                        del self._subscribers[run_id]

    def history(self, run_id: str) -> list[Event]:
        """Return a copy of every event published so far for ``run_id``."""
        with self._lock:
            return list(self._history.get(run_id, ()))
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
class HubProgressEmitter(ProgressEmitter):
    """Progress sink that forwards engine progress to an :class:`EventHub`.

    Each ``emit(stage, pct, message)`` becomes a ``running`` stage event for
    ``stage``; when the stage differs from the previous one a ``done`` event
    for the previous stage is published first. A non-empty ``message`` is
    published as an ``info`` log event. Before anything is published the run's
    cancel flag is checked.
    """

    def __init__(self, hub: EventHub, run_id: str) -> None:
        self.hub = hub
        self.run_id = run_id
        # Name of the most recently emitted stage, or None before the first emit.
        self._prev_stage: str | None = None

    def _publish_stage(self, stage: str, status: str, pct: int) -> None:
        """Publish one stage event for this run."""
        self.hub.publish(
            self.run_id,
            {"type": "stage", "stage": stage, "status": status, "pct": pct},
        )

    def emit(self, stage: str, pct: int, message: str = "") -> None:
        """Publish progress for ``stage``.

        Raises:
            RunCancelled: if a cancel was requested for this run; nothing is
                published in that case.
        """
        if self.hub.is_cancelled(self.run_id):
            raise RunCancelled(stage)

        previous = self._prev_stage
        if previous is not None and previous != stage:
            # The previous stage is closed with the pct reported by the new one.
            self._publish_stage(previous, "done", pct)
        self._publish_stage(stage, "running", pct)
        if message:
            self.hub.publish(
                self.run_id, {"type": "log", "level": "info", "message": message}
            )
        self._prev_stage = stage

    def finish(self, pct: int = 100) -> None:
        """Publish ``done`` for the last emitted stage, if any stage was emitted."""
        last = self._prev_stage
        if last is None:
            return
        self._publish_stage(last, "done", pct)
|