datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,345 @@
1
+ """Cross-field spec validation (04 §9).
2
+
3
+ Pydantic handles shape/type. This module enforces semantic rules that span
4
+ multiple parts of the spec — references, acyclicity, derived-vs-sampled
5
+ consistency — raising :class:`SpecValidationError` with a ``locator`` pointing
6
+ at the offending control.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from ..errors import SpecValidationError
12
+ from .models import (
13
+ BooleanFeature,
14
+ CategoricalFeature,
15
+ NumericFeature,
16
+ Spec,
17
+ TextFeature,
18
+ TimeseriesFeature,
19
+ )
20
+
21
+ SUPPORTED_SPEC_VERSIONS = {"1"}
22
+
23
+
24
def validate_spec(spec: Spec) -> None:
    """Apply every cross-field check to ``spec``, stopping at the first failure.

    The checks run in a fixed order: version, features, causal, difficulty,
    failures, export. Each check raises on the first problem it finds, so the
    checks after a failing one are never executed.
    """
    ordered_checks = (
        _check_version,
        _check_features,
        _check_causal,
        _check_difficulty,
        _check_failures,
        _check_export,
    )
    for check in ordered_checks:
        check(spec)
32
+
33
+
34
def _check_version(spec: Spec) -> None:
    """Reject a spec whose ``datadoom_version`` is not in ``SUPPORTED_SPEC_VERSIONS``.

    Raises:
        SpecValidationError: located at ``datadoom_version``; the message
            lists the supported versions in sorted order.
    """
    declared = spec.datadoom_version
    if declared in SUPPORTED_SPEC_VERSIONS:
        return
    supported = sorted(SUPPORTED_SPEC_VERSIONS)
    raise SpecValidationError(
        f"unsupported datadoom_version {declared!r} (supported: {supported})",
        locator="datadoom_version",
    )
41
+
42
+
43
def _check_features(spec: Spec) -> None:
    """Validate each declared feature according to its concrete type.

    Rules enforced, per feature kind:

    * numeric: ``min <= max`` when both are set; a named ``dist`` must exist
      in the distribution registry and accept ``params``; a feature without a
      ``dist`` must be the destination of some causal edge.
    * categorical: ``weights``, when given, has one entry per category.
    * text: the generator is ``"lorem"`` or a known realistic generator, and
      the locale resolves.
    * time-series: ``min <= max`` when both are set; AR coefficients, when
      given, satisfy ``sum(|phi_i|) < 1``.

    Raises:
        SpecValidationError: on the first violated rule, with a locator under
            ``features.<name>``.
    """
    # Imported lazily so the dist layer stays optional at module load time.
    from ..dist.builtins import REGISTRY
    from ..dist.providers import is_realistic_generator, resolve_locale

    causal_targets = _derived_names(spec)
    for feat_name, feature in spec.features.items():
        where = f"features.{feat_name}"

        if isinstance(feature, NumericFeature):
            if (
                feature.min is not None
                and feature.max is not None
                and feature.min > feature.max
            ):
                raise SpecValidationError("min > max", locator=where)
            if feature.dist is None:
                # Neither sampled nor produced by the causal layer: there is
                # no way to obtain values for this column.
                if feat_name not in causal_targets:
                    raise SpecValidationError(
                        f"numeric feature {feat_name!r} has no 'dist' and is not a causal target "
                        "(it cannot be sampled or derived)",
                        locator=where,
                    )
                continue
            distribution = REGISTRY.get(feature.dist)
            if distribution is None:
                raise SpecValidationError(
                    f"unknown distribution {feature.dist!r}", locator=f"{where}.dist"
                )
            distribution.validate(feature.params, locator=f"{where}.params")
            continue

        if isinstance(feature, CategoricalFeature):
            weights = feature.weights
            if weights is not None and len(weights) != len(feature.categories):
                raise SpecValidationError(
                    "weights length must match categories length",
                    locator=f"{where}.weights",
                )
            continue

        if isinstance(feature, TextFeature):
            generator = feature.generator
            if generator != "lorem" and not is_realistic_generator(generator):
                raise SpecValidationError(
                    f"unknown text generator {generator!r}", locator=f"{where}.generator"
                )
            resolve_locale(feature.locale, locator=f"{where}.locale")
            continue

        if isinstance(feature, TimeseriesFeature):
            if (
                feature.min is not None
                and feature.max is not None
                and feature.min > feature.max
            ):
                raise SpecValidationError("min > max", locator=where)
            # AR stationarity: sum(|phi_i|) < 1 is a conservative sufficient
            # condition that keeps the recursion bounded; a unit-root or
            # explosive series drifts without bound, so it is rejected.
            if feature.ar:
                ar_mass = sum(abs(coef) for coef in feature.ar)
                if ar_mass >= 1.0:
                    raise SpecValidationError(
                        "time-series AR is non-stationary: sum(|ar coefficients|) must be "
                        f"< 1 (got {ar_mass:.3f})",
                        locator=f"{where}.ar",
                    )
91
+
92
+
93
+ def _derived_names(spec: Spec) -> set[str]:
94
+ if spec.causal is None:
95
+ return set()
96
+ return {edge.dst for edge in spec.causal.edges}
97
+
98
+
99
+ def _check_causal(spec: Spec) -> None:
100
+ if spec.causal is None:
101
+ return
102
+ # Lazy imports keep the dist/causal layers optional at module load time.
103
+ from ..causal.functions import STRUCTURAL_FNS
104
+ from ..dist.builtins import REGISTRY
105
+
106
+ feature_names = set(spec.features)
107
+ targets: set[str] = set()
108
+ adjacency: dict[str, list[str]] = {n: [] for n in feature_names}
109
+
110
+ for i, edge in enumerate(spec.causal.edges):
111
+ loc = f"causal.edges[{i}]"
112
+ if edge.src not in feature_names:
113
+ raise SpecValidationError(f"unknown 'from' feature {edge.src!r}", locator=loc)
114
+ if edge.dst not in feature_names:
115
+ raise SpecValidationError(f"unknown 'to' feature {edge.dst!r}", locator=loc)
116
+ fn = STRUCTURAL_FNS.get(edge.fn)
117
+ if fn is None:
118
+ raise SpecValidationError(
119
+ f"unknown structural function {edge.fn!r}", locator=f"{loc}.fn"
120
+ )
121
+ fn.validate(edge, locator=loc)
122
+ # The structural function must be compatible with the parent's type, or
123
+ # execution would hit a raw coercion error. `map` consumes a categorical
124
+ # parent; the numeric fns need a numeric/boolean (float-coercible) parent.
125
+ parent = spec.features[edge.src]
126
+ if edge.fn == "map":
127
+ if not isinstance(parent, CategoricalFeature):
128
+ raise SpecValidationError(
129
+ f"map edge requires a categorical 'from' feature; {edge.src!r} is "
130
+ f"type {parent.type!r}",
131
+ locator=f"{loc}.fn",
132
+ )
133
+ missing = [c for c in parent.categories if c not in (edge.mapping or {})]
134
+ if missing:
135
+ raise SpecValidationError(
136
+ f"map edge is missing mappings for categories {missing}",
137
+ locator=f"{loc}.mapping",
138
+ )
139
+ elif not isinstance(parent, (NumericFeature, BooleanFeature, TimeseriesFeature)):
140
+ raise SpecValidationError(
141
+ f"{edge.fn!r} edge requires a numeric/boolean 'from' feature; {edge.src!r} "
142
+ f"is type {parent.type!r} (use fn 'map' for categorical parents)",
143
+ locator=f"{loc}.fn",
144
+ )
145
+ adjacency[edge.src].append(edge.dst)
146
+ targets.add(edge.dst)
147
+
148
+ for name in targets:
149
+ feat = spec.features[name]
150
+ # A feature that is both directly sampled and a causal target is ambiguous.
151
+ if isinstance(feat, NumericFeature) and feat.dist is not None:
152
+ raise SpecValidationError(
153
+ f"feature {name!r} is both sampled (has dist) and derived (causal target)",
154
+ locator=f"features.{name}",
155
+ )
156
+ # Only numeric/boolean targets can be derived by the SEM.
157
+ if not isinstance(feat, (NumericFeature, BooleanFeature)):
158
+ raise SpecValidationError(
159
+ f"causal target {name!r} has type {feat.type!r}; only numeric and boolean "
160
+ "features can be derived",
161
+ locator=f"features.{name}",
162
+ )
163
+
164
+ # Per-node noise must reference a known distribution (or be 'none').
165
+ for node, noise in spec.causal.noise.items():
166
+ loc = f"causal.noise.{node}"
167
+ if node not in feature_names:
168
+ raise SpecValidationError(f"noise references unknown feature {node!r}", locator=loc)
169
+ dist_name = noise.get("dist")
170
+ if dist_name not in (None, "none") and dist_name not in REGISTRY:
171
+ raise SpecValidationError(
172
+ f"unknown noise distribution {dist_name!r}", locator=f"{loc}.dist"
173
+ )
174
+
175
+ # Interventions must reference real features.
176
+ for i, item in enumerate(spec.causal.interventions):
177
+ do = item.get("do", {})
178
+ for node in do:
179
+ if node not in feature_names:
180
+ raise SpecValidationError(
181
+ f"intervention references unknown feature {node!r}",
182
+ locator=f"causal.interventions[{i}].do",
183
+ )
184
+
185
+ _reject_cycles(adjacency)
186
+
187
+
188
+ def _reject_cycles(adjacency: dict[str, list[str]]) -> None:
189
+ """Kahn's algorithm; if not all nodes are emitted, a cycle exists."""
190
+ indegree = {n: 0 for n in adjacency}
191
+ for _, children in sorted(adjacency.items()):
192
+ for child in children:
193
+ indegree[child] += 1
194
+ queue = sorted(n for n, d in indegree.items() if d == 0)
195
+ emitted = 0
196
+ while queue:
197
+ node = queue.pop(0)
198
+ emitted += 1
199
+ for child in sorted(adjacency[node]):
200
+ indegree[child] -= 1
201
+ if indegree[child] == 0:
202
+ queue.append(child)
203
+ queue.sort()
204
+ if emitted != len(adjacency):
205
+ raise SpecValidationError("causal graph is not acyclic", locator="causal.edges")
206
+
207
+
208
+ def _check_difficulty(spec: Spec) -> None:
209
+ if spec.difficulty is None:
210
+ return
211
+ # Lazy imports keep the difficulty layer optional at module load time.
212
+ from ..difficulty import PROBES, TIER_BANDS
213
+ from ..difficulty.knobs import ACTIVE_KNOBS
214
+
215
+ cfg = spec.difficulty
216
+
217
+ # The probe predicts the label; it must be a declared, classification-able
218
+ # feature. v0.1 calibrates binary-classification AUROC only.
219
+ label_feat = spec.features.get(cfg.label)
220
+ if label_feat is None:
221
+ raise SpecValidationError(
222
+ f"difficulty.label {cfg.label!r} is not a declared feature",
223
+ locator="difficulty.label",
224
+ )
225
+ if label_feat.emit is False:
226
+ raise SpecValidationError(
227
+ f"difficulty.label {cfg.label!r} is latent (emit: false) and is not shipped; "
228
+ "the probe can only predict an observable label",
229
+ locator="difficulty.label",
230
+ )
231
+ if isinstance(label_feat, BooleanFeature):
232
+ pass
233
+ elif isinstance(label_feat, CategoricalFeature):
234
+ if len(label_feat.categories) != 2:
235
+ raise SpecValidationError(
236
+ f"difficulty.label {cfg.label!r} is categorical with "
237
+ f"{len(label_feat.categories)} classes; v0.1 calibrates binary "
238
+ "classification only (use a boolean or 2-class categorical label)",
239
+ locator="difficulty.label",
240
+ )
241
+ else:
242
+ raise SpecValidationError(
243
+ f"difficulty.label {cfg.label!r} has type {label_feat.type!r}; v0.1 "
244
+ "supports binary-classification targets only (boolean or 2-class "
245
+ "categorical)",
246
+ locator="difficulty.label",
247
+ )
248
+
249
+ if cfg.probe not in PROBES:
250
+ raise SpecValidationError(
251
+ f"unknown difficulty probe {cfg.probe!r} (known: {sorted(PROBES)})",
252
+ locator="difficulty.probe",
253
+ )
254
+
255
+ # Target is either a named tier or an explicit band dict.
256
+ if isinstance(cfg.target, str):
257
+ if cfg.target not in TIER_BANDS:
258
+ raise SpecValidationError(
259
+ f"unknown difficulty tier {cfg.target!r} (known: {sorted(TIER_BANDS)})",
260
+ locator="difficulty.target",
261
+ )
262
+ elif isinstance(cfg.target, dict):
263
+ band = cfg.target.get("band")
264
+ if not (isinstance(band, (list, tuple)) and len(band) == 2):
265
+ raise SpecValidationError(
266
+ "difficulty.target must name a tier or carry a 'band': [lo, hi]",
267
+ locator="difficulty.target.band",
268
+ )
269
+ lo, hi = float(band[0]), float(band[1])
270
+ if not (0.0 <= lo <= hi <= 1.0):
271
+ raise SpecValidationError(
272
+ f"difficulty band must satisfy 0 <= lo <= hi <= 1 (got [{lo}, {hi}])",
273
+ locator="difficulty.target.band",
274
+ )
275
+ metric = cfg.target.get("metric", "auroc")
276
+ if metric != "auroc":
277
+ raise SpecValidationError(
278
+ f"unsupported difficulty metric {metric!r}; v0.1 supports 'auroc'",
279
+ locator="difficulty.target.metric",
280
+ )
281
+ else:
282
+ raise SpecValidationError(
283
+ "difficulty.target must be a tier name or an explicit-band object",
284
+ locator="difficulty.target",
285
+ )
286
+
287
+ # Only the actively-implemented knobs are accepted — no silently-ignored
288
+ # config. `causal` shrink and `imbalance` are planned (see status.md backlog).
289
+ unknown = [k for k in cfg.knobs if k not in ACTIVE_KNOBS]
290
+ if unknown:
291
+ raise SpecValidationError(
292
+ f"unsupported difficulty knob(s) {unknown}; v0.1 implements "
293
+ f"{list(ACTIVE_KNOBS)} (causal shrink and imbalance are planned)",
294
+ locator="difficulty.knobs",
295
+ )
296
+
297
+
298
def _check_failures(spec: Spec) -> None:
    """Validate every entry of ``spec.failures``.

    Each failure must name a known failure mode, pass that mode's own
    parameter validation, and must not reference a latent feature in any of
    its parameter values (a string value, or a string inside a list value).

    Raises:
        SpecValidationError: on the first violated rule, with a locator under
            ``failures[<index>]``.
    """
    # Imported lazily so the failure layer stays optional at module load time.
    from ..failure.modes import FAILURE_MODES

    hidden = spec.latent_names()
    for index, failure in enumerate(spec.failures):
        where = f"failures[{index}]"
        handler = FAILURE_MODES.get(failure.type)
        if handler is None:
            raise SpecValidationError(
                f"unknown failure type {failure.type!r} "
                f"(known: {sorted(FAILURE_MODES)})",
                locator=f"{where}.type",
            )
        options = failure.model_dump()
        options.pop("type", None)
        handler.validate(options, spec.features, where)
        # Failures corrupt the *shipped* frame; a latent column was already
        # dropped, so referencing one would fail at runtime. Reject it early.
        for value in tuple(options.values()):
            candidates = value if isinstance(value, list) else [value]
            for candidate in candidates:
                if not isinstance(candidate, str):
                    continue
                if candidate in hidden:
                    raise SpecValidationError(
                        f"failure references latent feature {candidate!r} (emit: false), "
                        "which is not shipped and cannot be corrupted",
                        locator=where,
                    )
326
+
327
+
328
def _check_export(spec: Spec) -> None:
    """Validate the export section: known formats and split ratios summing to 1.

    Raises:
        SpecValidationError: located at ``export.formats`` for the first
            format that has no registered exporter, or at ``export.splits``
            when the ratios are given and differ from 1.0 by more than 1e-9.
    """
    from ..export import EXPORTERS

    for requested in spec.export.formats:
        if requested in EXPORTERS:
            continue
        known = ", ".join(sorted(EXPORTERS))
        raise SpecValidationError(
            f"unknown export format {requested!r}; known formats: {known}",
            locator="export.formats",
        )

    ratios = spec.export.splits
    if ratios is None:
        return
    total = sum(ratios.values())
    if abs(total - 1.0) > 1e-9:
        raise SpecValidationError(
            f"export.splits ratios must sum to 1.0 (got {total})", locator="export.splits"
        )
@@ -0,0 +1,88 @@
1
+ """Time-series generation — additive decomposition (05 §6).
2
+
3
+ A time-series feature realizes an ordered series over the row index ``t = 0 … n−1``:
4
+
5
+ Xₜ = T(t) + S(t) + AR(p) + εₜ
6
+ T(t) = slope·t + intercept # linear trend
7
+ S(t) = Σ Aⱼ·sin(2π·t/periodⱼ + phaseⱼ) # (multi-)seasonality
8
+ AR(p): Yₜ = Σ_{i=1}^p φᵢ·Y_{t−i} + εₜ # autoregressive residual
9
+ εₜ ~ Normal(0, σ²) from RNG(noise:<series>)
10
+
11
+ The deterministic mean ``T(t)+S(t)`` is vectorised; the AR residual is an inherent
12
+ sequential recursion (each term depends on its predecessors) seeded with
13
+ ``Y_{t<0} = 0`` — a fixed, reproducible warm-start (no hidden burn-in draws). With
14
+ no ``ar`` coefficients the residual is plain i.i.d. noise ``εₜ``.
15
+
16
+ This module is pure and framework-free; all randomness flows through the injected
17
+ ``numpy.random.Generator`` so the series is byte-reproducible on the pinned path.
18
+ Multivariate / hierarchical series are deferred (plugin / post-v1), per 05 §6.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ from collections.abc import Sequence
24
+ from dataclasses import dataclass
25
+
26
+ import numpy as np
27
+
28
+
29
@dataclass(frozen=True)
class Trend:
    """Immutable parameters of the linear trend ``T(t) = slope * t + intercept``."""

    # Change in the series mean per row-index step.
    slope: float = 0.0
    # Value of the trend at ``t = 0``.
    intercept: float = 0.0
33
+
34
+
35
@dataclass(frozen=True)
class Seasonality:
    """Immutable parameters of one sinusoidal component.

    The component evaluates to ``amplitude * sin(2*pi*t / period + phase)``
    over the row index ``t``.
    """

    # Peak magnitude of the sinusoid.
    amplitude: float
    # Length of one full cycle, measured in row-index steps.
    period: float
    # Offset added inside the sine, in radians.
    phase: float = 0.0
40
+
41
+
42
def generate_series(
    rng: np.random.Generator,
    n: int,
    *,
    trend: Trend | None = None,
    seasonality: Sequence[Seasonality] = (),
    ar: Sequence[float] = (),
    noise_std: float = 1.0,
) -> np.ndarray:
    """Build one additive series ``trend + seasonality + residual`` of length ``n``.

    Args:
        rng: generator that supplies the Gaussian innovations.
        n: number of points (one per row index ``t = 0 .. n-1``).
        trend: optional linear component ``slope * t + intercept``.
        seasonality: sinusoidal components; their contributions are summed.
        ar: autoregressive coefficients ``[phi_1 .. phi_p]`` applied to the
            residual; empty means the residual is the raw innovations.
        noise_std: standard deviation of the innovations. When it is not
            positive, no draw is made and the innovations are all zero.

    Returns:
        A float ``ndarray`` of length ``n`` in row order.
    """
    steps = np.arange(n, dtype=float)

    # Deterministic mean: trend plus every seasonal sinusoid.
    deterministic = np.zeros(n, dtype=float)
    if trend is not None:
        deterministic += trend.slope * steps + trend.intercept
    for component in seasonality:
        deterministic += component.amplitude * np.sin(
            2.0 * np.pi * steps / component.period + component.phase
        )

    if noise_std > 0:
        innovations = rng.normal(0.0, noise_std, size=n)
    else:
        innovations = np.zeros(n, dtype=float)

    phis = list(ar)
    if not phis:
        return deterministic + innovations

    # Zero-mean AR(p) residual, warm-started with zeros before t = 0: lags
    # that would reach before the start of the series contribute nothing.
    order = len(phis)
    residual = np.zeros(n, dtype=float)
    for step in range(n):
        total = innovations[step]
        for lag in range(1, min(order, step) + 1):
            total += phis[lag - 1] * residual[step - lag]
        residual[step] = total
    return deterministic + residual
@@ -0,0 +1,14 @@
1
+ """Job execution layer (03 §3.3).
2
+
3
+ The default backend is an **in-process** thread-pool worker that drives
4
+ ``engine.pipeline`` and fans out progress events to the WebSocket hub. It may
5
+ import ``engine`` and ``store`` but nothing from ``api``/``cli`` (the hub lives
6
+ here so ``api`` can subscribe without ``jobs`` reaching upward).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from .progress import EventHub, HubProgressEmitter, RunCancelled
12
+ from .worker import WorkerPool
13
+
14
+ __all__ = ["EventHub", "HubProgressEmitter", "RunCancelled", "WorkerPool"]
@@ -0,0 +1,155 @@
1
+ """Progress fan-out: engine ``ProgressEmitter`` -> WebSocket hub (08 §7).
2
+
3
+ The :class:`EventHub` is an in-process async pub/sub keyed by ``run_id``. The
4
+ worker thread publishes events; the API's WebSocket/SSE handlers subscribe. The
5
+ hub keeps a per-run **replay buffer** so a late subscriber (the browser opening
6
+ the tracker after the run started) receives the stages so far, then live updates.
7
+
8
+ Cross-thread safety: the worker runs in a thread pool, so ``publish`` marshals
9
+ queue writes onto the API event loop via ``call_soon_threadsafe``. With no loop
10
+ registered (library/test use) it still records history and serves replay.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import asyncio
16
+ import contextlib
17
+ import threading
18
+ from collections.abc import AsyncIterator
19
+ from typing import Any
20
+
21
+ from datadoom.engine.progress import ProgressEmitter
22
+
23
+ # Canonical pipeline stages (08 §7 / 03 §4). Optional stages appear only when the
24
+ # spec enables them; in P1 the engine emits the headless subset.
25
+ CANONICAL_STAGES = (
26
+ "intake",
27
+ "snapshot",
28
+ "seed",
29
+ "base_generation",
30
+ "causal",
31
+ "failure_injection",
32
+ "difficulty",
33
+ "compliance",
34
+ "packaging",
35
+ )
36
+
37
+
38
class RunCancelled(Exception):
    """Signal that a run stopped because a cooperative cancel was requested.

    Raised from inside the pipeline (via the progress emitter) rather than by
    the code that requests the cancel.
    """
40
+
41
+
42
+ Event = dict[str, Any]
43
+
44
+
45
class EventHub:
    """Per-run pub/sub with replay. One instance per server process.

    Events are plain dicts keyed by ``run_id``. Every published event is
    appended to that run's history, so a subscriber that arrives late first
    receives everything published so far and then the live events. All shared
    state is guarded by a ``threading.Lock`` because ``publish`` may be called
    from a worker thread while subscribers run on the event loop.
    """

    # Event types that end a run's stream; shared by ``publish`` and
    # ``subscribe`` so the two can never disagree.
    _TERMINAL_TYPES = frozenset({"completed", "failed", "cancelled"})

    def __init__(self) -> None:
        self._subscribers: dict[str, set[asyncio.Queue[Event]]] = {}
        self._history: dict[str, list[Event]] = {}
        self._terminal: dict[str, bool] = {}
        self._cancels: dict[str, threading.Event] = {}
        self._lock = threading.Lock()
        self._loop: asyncio.AbstractEventLoop | None = None

    # --- loop wiring (called once by the API on startup) --------------------------
    def bind_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Register the event loop that live events are delivered on."""
        self._loop = loop

    # --- cancellation -------------------------------------------------------------
    def cancel_flag(self, run_id: str) -> threading.Event:
        """Return the cancel flag for ``run_id``, creating it on first use."""
        with self._lock:
            return self._cancels.setdefault(run_id, threading.Event())

    def request_cancel(self, run_id: str) -> None:
        """Set the cancel flag for ``run_id``."""
        self.cancel_flag(run_id).set()

    def is_cancelled(self, run_id: str) -> bool:
        """Report whether a cancel was requested for ``run_id``.

        This is a pure query: it does not allocate a flag for a run that was
        never cancelled, so polling many run ids does not grow the flag map.
        """
        with self._lock:
            flag = self._cancels.get(run_id)
        return flag is not None and flag.is_set()

    # --- publishing ---------------------------------------------------------------
    def publish(self, run_id: str, event: Event) -> None:
        """Record ``event`` in the run's history and fan it out to subscribers."""
        with self._lock:
            self._history.setdefault(run_id, []).append(event)
            if event.get("type") in self._TERMINAL_TYPES:
                self._terminal[run_id] = True
            # Snapshot under the lock; deliver outside it.
            queues = list(self._subscribers.get(run_id, ()))
        for q in queues:
            self._enqueue(q, event)

    def _enqueue(self, q: asyncio.Queue[Event], event: Event) -> None:
        """Hand ``event`` to a subscriber queue via the bound loop, if any."""
        loop = self._loop
        if loop is None:
            # No API loop (library/test); history still records for replay.
            return
        # Best effort: scheduling on a closed loop raises RuntimeError, and a
        # subscriber on a dead loop can no longer receive anything anyway.
        with contextlib.suppress(RuntimeError):
            loop.call_soon_threadsafe(q.put_nowait, event)

    # --- subscription -------------------------------------------------------------
    async def subscribe(self, run_id: str) -> AsyncIterator[Event]:
        """Yield replay of events so far, then live events until a terminal one.

        This is an async generator, so registration happens when iteration
        starts. The history snapshot and the queue registration are taken
        under one lock acquisition, so no event published in between is lost
        or delivered twice.
        """
        q: asyncio.Queue[Event] = asyncio.Queue()
        with self._lock:
            replay = list(self._history.get(run_id, ()))
            already_terminal = self._terminal.get(run_id, False)
            self._subscribers.setdefault(run_id, set()).add(q)
        try:
            for ev in replay:
                yield ev
            if already_terminal:
                return
            while True:
                ev = await q.get()
                yield ev
                if ev.get("type") in self._TERMINAL_TYPES:
                    return
        finally:
            with self._lock:
                subs = self._subscribers.get(run_id)
                if subs is not None:
                    subs.discard(q)
                    if not subs:
                        # Drop the empty set so the map does not keep one
                        # entry per run that ever had a subscriber.
                        del self._subscribers[run_id]

    def history(self, run_id: str) -> list[Event]:
        """Return a copy of the events published so far for ``run_id``."""
        with self._lock:
            return list(self._history.get(run_id, ()))
116
+
117
+
118
class HubProgressEmitter(ProgressEmitter):
    """Progress sink that forwards engine stage updates to an :class:`EventHub`.

    Each ``emit(stage, pct, message)`` becomes a ``stage``/``running`` event
    (plus a ``log`` event when a message is given). When the stage differs
    from the previous one, a ``stage``/``done`` event for the previous stage
    is published first. The run's cancel flag is checked before anything is
    published.
    """

    def __init__(self, hub: EventHub, run_id: str) -> None:
        self.hub = hub
        self.run_id = run_id
        # Last stage passed to ``emit``; ``None`` until the first call.
        self._prev_stage: str | None = None

    def emit(self, stage: str, pct: int, message: str = "") -> None:
        """Publish progress for ``stage``.

        Raises:
            RunCancelled: if a cancel was requested for this run; nothing is
                published in that case.
        """
        hub, run_id = self.hub, self.run_id
        if hub.is_cancelled(run_id):
            raise RunCancelled(stage)

        previous = self._prev_stage
        stage_changed = previous is not None and previous != stage
        if stage_changed:
            # Close out the stage that was running; it carries the pct of
            # the current call.
            closing = {"type": "stage", "stage": previous, "status": "done", "pct": pct}
            hub.publish(run_id, closing)

        running = {"type": "stage", "stage": stage, "status": "running", "pct": pct}
        hub.publish(run_id, running)

        if message:
            log_line = {"type": "log", "level": "info", "message": message}
            hub.publish(run_id, log_line)

        self._prev_stage = stage

    def finish(self, pct: int = 100) -> None:
        """Publish ``done`` for the last emitted stage, if there was one."""
        last = self._prev_stage
        if last is None:
            return
        closing = {"type": "stage", "stage": last, "status": "done", "pct": pct}
        self.hub.publish(self.run_id, closing)