datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,327 @@
1
+ """Per-column data profile — exploratory analysis the engine can do for you.
2
+
3
+ A synthetic dataset is special: the engine knows the ground truth of every column
4
+ — its declared type, how a derived column was generated, and exactly which
5
+ failure modes corrupted it and by how much. This module turns that into a
6
+ **per-column report card** so an engineer or student opening the Results screen
7
+ gets, at a glance, what each column is, its summary statistics, and — crucially —
8
+ *what's wrong with it and how to handle that when building an ML model*.
9
+
10
+ Pure engine code: deterministic pandas aggregation on the realized frame plus a
11
+ static advice lookup (:mod:`datadoom.engine.advice`). No randomness, no model
12
+ fitting — same ``(spec_hash, seed)`` → identical profile (invariant #6).
13
+
14
+ The profile is computed from the **clean** shipped frame (the canonical
15
+ artifact); when an **injected** variant exists, each column also carries its
16
+ post-corruption missing rate / moments so the realized impact of the failures is
17
+ visible next to the pristine baseline.
18
+ """
19
+
20
+ from __future__ import annotations
21
+
22
+ from typing import TYPE_CHECKING, Any
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+
27
+ from .advice import build_issue, severity_rank
28
+
29
+ if TYPE_CHECKING:
30
+ from .spec.models import Spec
31
+
32
# Categorical columns with more distinct values than this are summarised by their
# top categories only (a full breakdown would be noise for the reader). Used as
# the ``head()`` limit when listing a discrete column's most frequent values.
_TOP_CATEGORIES = 12
# A target whose minority class is below this share is flagged as imbalanced.
# The comparison is strict (``minority_pct < _IMBALANCE_MINORITY``) and is only
# applied to the label column.
_IMBALANCE_MINORITY = 0.35
37
+
38
+
39
+ def _num(x: Any) -> float | None:
40
+ """Coerce to a JSON-safe float, mapping NaN/inf to ``None``."""
41
+ try:
42
+ v = float(x)
43
+ except (TypeError, ValueError):
44
+ return None
45
+ return v if np.isfinite(v) else None
46
+
47
+
48
+ def _parents_of(spec: Spec, col: str) -> list[str]:
49
+ """Causal parents (edge sources) feeding a derived column, in spec order."""
50
+ if spec.causal is None:
51
+ return []
52
+ return [e.src for e in spec.causal.edges if e.dst == col]
53
+
54
+
55
+ def _derived_names(spec: Spec) -> set[str]:
56
+ return set() if spec.causal is None else {e.dst for e in spec.causal.edges}
57
+
58
+
59
+ def _label_column(spec: Spec) -> str | None:
60
+ """Best guess at the target column.
61
+
62
+ Authoritative when a difficulty block names a ``label``; otherwise a
63
+ best-effort heuristic: a boolean/categorical causal *sink* (a derived column
64
+ that no other edge consumes) is almost always the prediction target. Returns
65
+ ``None`` when the guess is ambiguous (zero or several candidates).
66
+ """
67
+ if spec.difficulty is not None and getattr(spec.difficulty, "label", None):
68
+ return spec.difficulty.label
69
+ if spec.causal is None:
70
+ return None
71
+ sources = {e.src for e in spec.causal.edges}
72
+ candidates = [
73
+ e.dst
74
+ for e in spec.causal.edges
75
+ if e.dst not in sources
76
+ and (feat := spec.features.get(e.dst)) is not None
77
+ and feat.type in ("boolean", "categorical")
78
+ ]
79
+ unique = list(dict.fromkeys(candidates))
80
+ return unique[0] if len(unique) == 1 else None
81
+
82
+
83
def _numeric_stats(series: pd.Series) -> dict[str, Any]:
    """Summary statistics for a numeric column (NaN-aware, JSON-safe).

    Entries that cannot be parsed as numbers are coerced to NaN, and every
    non-finite value is dropped before aggregating.

    Returns an empty dict when no finite value remains. Otherwise the dict
    holds ``mean``, ``std`` (NumPy's default ``ndarray.std``), ``min``,
    ``p25``, ``median``, ``p75``, ``max`` and ``skew``; ``skew`` is ``None``
    unless more than two finite values are available.
    """
    # ``na_value`` is spelled out so nullable extension dtypes (e.g. ``Int64``)
    # containing ``pd.NA`` become NaN instead of raising on pandas releases
    # that refuse the implicit NA -> float conversion.
    values = pd.to_numeric(series, errors="coerce").to_numpy(dtype=float, na_value=np.nan)
    clean = values[np.isfinite(values)]
    if clean.size == 0:
        return {}
    q = np.quantile(clean, [0.25, 0.5, 0.75])
    return {
        "mean": _num(clean.mean()),
        "std": _num(clean.std()),
        "min": _num(clean.min()),
        "p25": _num(q[0]),
        "median": _num(q[1]),
        "p75": _num(q[2]),
        "max": _num(clean.max()),
        "skew": _num(pd.Series(clean).skew()) if clean.size > 2 else None,
    }
100
+
101
+
102
def _category_breakdown(series: pd.Series) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
    """Describe a discrete column: its most frequent values and its class balance.

    Missing values are ignored. The first element lists up to
    ``_TOP_CATEGORIES`` values as ``value`` / ``count`` / ``pct`` dicts in
    ``value_counts`` order; the second summarises the number of classes, the
    shares of the first and last entries of that ordering, and their ratio
    (``None`` when the last count is zero). An all-missing column yields
    ``([], None)``.
    """
    counts = series.dropna().value_counts()
    total = int(counts.sum())
    if total == 0:
        return [], None

    top: list[dict[str, Any]] = []
    for value, count in counts.head(_TOP_CATEGORIES).items():
        top.append({"value": _stringify(value), "count": int(count), "pct": count / total})

    largest = int(counts.iloc[0])
    smallest = int(counts.iloc[-1])
    ratio = largest / smallest if smallest else None
    summary = {
        "classes": int(counts.size),
        "majority_pct": largest / total,
        "minority_pct": smallest / total,
        "ratio": ratio,
    }
    return top, summary
121
+
122
+
123
+ def _stringify(val: Any) -> str:
124
+ if isinstance(val, (bool, np.bool_)):
125
+ return "true" if val else "false"
126
+ return str(val)
127
+
128
+
129
def _failures_by_column(diffs: list[dict[str, Any]] | None) -> dict[str, list[dict[str, Any]]]:
    """Regroup the per-mode failure diffs by the column each one touched.

    Every diff is dispatched on its ``type`` (falling back to ``mechanism``);
    modes not handled below are ignored. Each recorded entry is a dict with
    ``mode``, a human-readable ``magnitude``, a ``fraction`` (``None`` for modes
    that are not expressed as a share of rows) and a mode-specific ``detail``
    payload. A column's entries keep the order in which the diffs were given.
    """
    grouped: dict[str, list[dict[str, Any]]] = {}

    def record(
        column: str,
        mode_name: str,
        magnitude: str,
        fraction: float | None,
        detail: dict[str, Any],
    ) -> None:
        entry = {"mode": mode_name, "magnitude": magnitude, "fraction": fraction, "detail": detail}
        grouped.setdefault(column, []).append(entry)

    for diff in diffs or []:
        mode = str(diff.get("type") or diff.get("mechanism") or "")

        if mode == "mcar":
            # A single MCAR diff may cover several columns; each gets its own entry.
            per_column = diff.get("nullified_fraction") or {}
            for column, raw_rate in per_column.items():
                rate = _num(raw_rate) or 0.0
                record(column, "mcar", f"{rate * 100:.1f}% of values missing", rate, {"rate": rate})
            continue

        if mode in ("mar", "mnar"):
            column = str(diff.get("column"))
            rate = _num(diff.get("realized_rate")) or 0.0
            payload = {
                "rate": rate,
                "driver": diff.get("driver"),
                "self_dependent": diff.get("self_dependent"),
            }
            record(column, mode, f"{rate * 100:.1f}% of values missing", rate, payload)
            continue

        if mode == "label_noise":
            column = str(diff.get("column"))
            rate = _num(diff.get("flipped_fraction")) or 0.0
            record(column, "label_noise", f"{rate * 100:.1f}% of labels flipped", rate, {"rate": rate})
            continue

        if mode == "feature_noise":
            column = str(diff.get("column"))
            noise_std = _num(diff.get("realized_noise_std"))
            if noise_std is None:
                magnitude = "noise added"
            else:
                magnitude = f"σ≈{noise_std:.3g} noise added"
            payload = {"noise_std": noise_std, "mean_shift": _num(diff.get("realized_mean_shift"))}
            record(column, "feature_noise", magnitude, None, payload)
            continue

        if mode == "drift":
            column = str(diff.get("column"))
            total_shift = _num(diff.get("total_shift"))
            kind = diff.get("kind", "linear")
            if total_shift is None:
                magnitude = f"{kind} drift"
            else:
                magnitude = f"{total_shift:.3g} total {kind} shift"
            record(column, "drift", magnitude, None, {"total_shift": total_shift, "kind": kind})
            continue

        if mode == "covariate_shift":
            column = str(diff.get("column"))
            before = diff.get("before") or {}
            after = diff.get("after") or {}
            mean_before = _num(before.get("mean"))
            mean_after = _num(after.get("mean"))
            if mean_before is None or mean_after is None:
                magnitude = "distribution shifted"
            else:
                magnitude = f"mean {mean_before:.3g}→{mean_after:.3g}"
            record(column, "covariate_shift", magnitude, None, {"before": before, "after": after})
            continue

        if mode == "leakage":
            # The affected column is the planted proxy (``into``), not the target.
            column = str(diff.get("into"))
            correlation = _num(diff.get("realized_correlation"))
            if correlation is None:
                magnitude = "high-MI proxy"
            else:
                magnitude = f"corr={correlation:.3f} with {diff.get('target')}"
            payload = {"target": diff.get("target"), "correlation": correlation}
            record(column, "leakage", magnitude, None, payload)

    return grouped
191
+
192
+
193
+ def _role(name: str, derived: set[str], label: str | None, planted: bool) -> str:
194
+ """How the column functions for modelling: label / leakage proxy / derived / feature."""
195
+ if planted:
196
+ return "leakage_proxy"
197
+ if name == label:
198
+ return "label"
199
+ if name in derived:
200
+ return "derived"
201
+ return "feature"
202
+
203
+
204
def _column_profile(
    name: str,
    *,
    spec: Spec,
    clean: pd.DataFrame,
    injected: pd.DataFrame | None,
    derived: set[str],
    label: str | None,
    col_failures: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the report card for a single column.

    Args:
        name: Column to profile; must exist in ``clean`` or, for a planted
            column, in ``injected``.
        spec: Spec supplying the declared feature type, description and causal
            parents.
        clean: The clean frame; the baseline for every non-planted column.
        injected: The corrupted variant, if one was produced.
        derived: Names of causal-destination columns.
        label: Name of the target column, or ``None`` when unknown.
        col_failures: Failure entries recorded for this column, each with
            ``mode`` / ``magnitude`` / ``fraction`` / ``detail`` keys.

    Returns:
        A dict with identity and counting fields, ``stats`` (numeric columns)
        or ``categories`` / ``imbalance`` (all others), an ``injected``
        post-corruption snapshot (or ``None``), and ``issues`` ordered by
        descending ``severity_rank``.
    """
    planted = name not in clean.columns  # e.g. a leakage proxy lives only in injected
    base = injected if planted and injected is not None else clean
    series = base[name]
    feat = spec.features.get(name)
    # Columns the spec does not declare (planted ones) are tagged "synthetic".
    feature_type = feat.type if feat is not None else "synthetic"

    n = int(len(series))
    missing = int(series.isna().sum())
    profile: dict[str, Any] = {
        "name": name,
        "role": _role(name, derived, label, planted),
        "feature_type": feature_type,
        "dtype": str(series.dtype),
        "count": n,
        "missing": missing,
        "missing_pct": missing / n if n else 0.0,
        "unique": int(series.nunique(dropna=True)),
        "derived": name in derived,
        "parents": _parents_of(spec, name),
        "description": getattr(feat, "description", None),
    }

    # Booleans count as numeric dtypes in pandas but are profiled as categories.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        profile["stats"] = _numeric_stats(series)
        profile["categories"] = None
        profile["imbalance"] = None
    else:
        top, imbalance = _category_breakdown(series)
        profile["stats"] = None
        profile["categories"] = top
        profile["imbalance"] = imbalance

    # Post-corruption snapshot: how the column actually looks in the injected variant.
    if injected is not None and name in injected.columns and not planted:
        inj = injected[name]
        # Rate is taken over the injected column's own length so it stays a
        # valid share even if the injected frame's row count differs from clean.
        inj_n = int(len(inj))
        inj_missing = int(inj.isna().sum())
        post: dict[str, Any] = {"missing_pct": inj_missing / inj_n if inj_n else 0.0}
        if pd.api.types.is_numeric_dtype(inj) and not pd.api.types.is_bool_dtype(inj):
            # Explicit ``na_value`` keeps nullable extension dtypes holding
            # ``pd.NA`` (plausible after missingness injection) from raising.
            vals = pd.to_numeric(inj, errors="coerce").to_numpy(dtype=float, na_value=np.nan)
            vals = vals[np.isfinite(vals)]
            if vals.size:
                post["mean"] = _num(vals.mean())
                post["std"] = _num(vals.std())
        profile["injected"] = post
    else:
        profile["injected"] = None

    # Issues: failure-mode corruptions + (for the label) class imbalance.
    issues = [
        build_issue(f["mode"], magnitude=f["magnitude"], fraction=f["fraction"], detail=f["detail"]).to_dict()
        for f in col_failures
    ]
    imbalance = profile.get("imbalance")
    if name == label and imbalance and imbalance["minority_pct"] < _IMBALANCE_MINORITY:
        ratio = imbalance.get("ratio")
        mag = (
            f"{imbalance['majority_pct'] * 100:.1f}% / {imbalance['minority_pct'] * 100:.1f}%"
            + (f" ({ratio:.1f}:1)" if ratio else "")
        )
        issues.append(
            build_issue("class_imbalance", magnitude=mag, fraction=None, detail=imbalance).to_dict()
        )
    issues.sort(key=lambda i: severity_rank(i["severity"]), reverse=True)
    profile["issues"] = issues
    return profile
280
+
281
+
282
def build_profile(
    spec: Spec,
    clean: pd.DataFrame,
    *,
    injected: pd.DataFrame | None = None,
    failure_diffs: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Produce the per-column data profile shown on the Results screen.

    The result is a dict with two keys: ``summary`` (row/column counts, the
    detected label and issue tallies) and ``columns`` (one report card per
    column). Cards follow the column order of ``clean``; columns present only
    in ``injected`` (e.g. leakage proxies) come after them.
    """
    derived_cols = _derived_names(spec)
    target = _label_column(spec)
    failures = _failures_by_column(failure_diffs)

    ordered: list[str] = list(clean.columns)
    if injected is not None:
        for candidate in injected.columns:
            if candidate not in clean.columns:
                ordered.append(candidate)

    cards: list[dict[str, Any]] = []
    for column in ordered:
        card = _column_profile(
            column,
            spec=spec,
            clean=clean,
            injected=injected,
            derived=derived_cols,
            label=target,
            col_failures=failures.get(column, []),
        )
        cards.append(card)

    flagged = 0
    severities: list[Any] = []
    for card in cards:
        if card["issues"]:
            flagged += 1
        severities.extend(issue["severity"] for issue in card["issues"])

    summary = {
        "n_rows": int(len(clean)),
        "n_columns": len(cards),
        "label": target,
        "columns_with_issues": flagged,
        "total_issues": len(severities),
        "critical_issues": severities.count("critical"),
        "high_issues": severities.count("high"),
    }
    return {"summary": summary, "columns": cards}
@@ -0,0 +1,14 @@
1
+ """Progress emission contract.
2
+
3
+ The engine stays framework-free: it emits stage events to a sink. In P0 the sink
4
+ is a no-op; later phases wire a WebSocket hub behind the same interface.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class ProgressEmitter:
    """Progress sink that discards every event.

    Subclasses override :meth:`emit` to publish events elsewhere.
    """

    def emit(self, stage: str, pct: int, message: str = "") -> None:  # noqa: D401
        """Accept a stage event and do nothing with it."""
        return None
@@ -0,0 +1,338 @@
1
+ """Machine-readable spec capabilities manifest (the AI-authoring contract).
2
+
3
+ `build_capabilities()` returns a JSON-serializable dict that enumerates **every**
4
+ knob a DataDoom spec accepts, with its exact valid values and constraints. It is
5
+ built from the **live engine registries** (distributions, structural functions,
6
+ failure modes, exporters, text providers, difficulty tiers) so it is always in
7
+ sync with the running build *and* automatically reflects any registered plugin.
8
+
9
+ The authoritative *names* come from the registries; richer per-item *annotations*
10
+ (parameter domains, failure-mode fields, prose) are curated here and merged in by
11
+ name. An item with no annotation (e.g. a third-party plugin distribution) still
12
+ appears, carrying whatever the engine ABC exposes (required params, schema).
13
+
14
+ This is what you feed an LLM/agent so it can emit a valid `*.datadoom.yaml`
15
+ without guessing. The CLI surfaces it via ``datadoom spec-reference`` and the API
16
+ via ``GET /api/spec-reference``.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from typing import Any
22
+
23
+ from ..version import __version__
24
+
25
# --- curated annotations (merged onto registry-derived names by key) ----------------

# Per-distribution annotations keyed by distribution name. ``_distributions()``
# merges the matching entry onto each registry-derived record and uses key
# membership here to set that record's ``builtin`` flag.
_DIST_ANNOTATIONS: dict[str, dict[str, Any]] = {
    "normal": {
        "summary": "Symmetric bell curve over all real numbers.",
        "params": {"mean": "center (any real)", "std": "spread, must be > 0"},
    },
    "lognormal": {
        "summary": "Right-skewed, positive-only (income, prices). mu/sigma are of the underlying normal (log space).",
        "params": {"mu": "mean of ln(X) (any real)", "sigma": "std of ln(X), must be > 0"},
    },
    "uniform": {
        "summary": "Flat — every value in [low, high] equally likely.",
        "params": {"low": "lower bound", "high": "upper bound, must be > low"},
    },
    "exponential": {
        "summary": "Decaying, non-negative (waiting times). Mean == scale.",
        "params": {"scale": "mean of the distribution, must be > 0"},
    },
    "poisson": {
        "summary": "Discrete counts 0,1,2,… ; lam is the mean. Output is integer.",
        "params": {"lam": "mean count, must be > 0"},
        # Only entry carrying this flag; it marks integer-valued output.
        "discrete": True,
    },
    "pareto": {
        "summary": "Heavy-tailed power law; values are >= xm. Smaller alpha = heavier tail.",
        "params": {"alpha": "tail index, must be > 0", "xm": "minimum value (scale), must be > 0"},
    },
}
54
+
55
# Per-structural-function annotations keyed by function name. ``_structural_fns()``
# merges the matching entry onto each registered function's record and uses key
# membership here to set that record's ``builtin`` flag.
_FN_ANNOTATIONS: dict[str, dict[str, Any]] = {
    "linear": {
        "summary": "weight·parent + bias.",
        "fields": {"weight": "number (required)", "bias": "number (optional, default 0)"},
    },
    "logistic": {
        "summary": "1/(1+e^-(weight·parent+bias)) — squash a driver to 0..1; typically the last edge into a boolean target.",
        "fields": {"weight": "number (required)", "bias": "number (optional, default 0)"},
    },
    "polynomial": {
        "summary": "Σ coeffs[i]·parent^i — curved/non-linear effect.",
        "fields": {"coeffs": "non-empty list of numbers (required)"},
    },
    "map": {
        "summary": "Look up mapping[parent_category] — turns a categorical parent into a number. Must cover every category.",
        "fields": {"mapping": "object {category: number} covering all parent categories (required)"},
    },
    "identity": {"summary": "Pass the parent value through unchanged.", "fields": {}},
}
74
+
75
#: Failure modes: each is a list item under top-level ``failures`` with ``type`` + these fields.
#: Keyed by mode name; ``_failure_modes()`` merges the matching entry onto each
#: registered mode's record and uses key membership to set its ``builtin`` flag.
#: Every entry here carries ``category``, ``summary`` and ``fields``.
_FAILURE_MODES: dict[str, dict[str, Any]] = {
    "mcar": {
        "category": "missingness",
        "summary": "Missing Completely At Random — blanks chosen independently of the data.",
        "fields": {
            "column": "feature name (or use 'columns')",
            "columns": "list of feature names (alternative to 'column')",
            "rate": "fraction in [0,1] to blank (required)",
        },
    },
    "mar": {
        "category": "missingness",
        "summary": "Missing At Random — blank probability depends on another observed column.",
        "fields": {
            "column": "feature to blank (required)",
            "driver": "observed numeric/boolean feature that drives missingness (required)",
            "rate": "expected fraction blanked, calibrated (required)",
            "strength": "driver skew, number (optional, default 2.0)",
        },
    },
    "mnar": {
        "category": "missingness",
        "summary": "Missing Not At Random — blank probability depends on the column's own value.",
        "fields": {
            "column": "feature to blank (required)",
            "driver": "optional numeric/boolean driver (defaults to the column itself)",
            "rate": "expected fraction blanked, calibrated (required)",
            "strength": "skew, number (optional, default 2.0)",
        },
    },
    "label_noise": {
        "category": "noise",
        "summary": "Flip a boolean / reassign a categorical label to a different class.",
        "fields": {
            "column": "boolean or categorical feature (required)",
            "rate": "fraction in [0,1] to corrupt (required)",
        },
    },
    "feature_noise": {
        "category": "noise",
        "summary": "Additive noise on a numeric column: x' = x + ε.",
        "fields": {
            "column": "numeric feature (required)",
            "dist": "noise distribution name (required, e.g. normal)",
            "params": "params for that distribution (e.g. {mean: 0, std: 1})",
        },
    },
    "drift": {
        "category": "shift",
        "summary": "Gradually shift a numeric column across the row index (concept drift).",
        "fields": {
            "column": "numeric feature (required)",
            "schedule": "object: {kind: linear|step, magnitude: number (total shift) OR rate: per-row slope, at: 0..1 (step only, default 0.5)}",
        },
    },
    "covariate_shift": {
        "category": "shift",
        "summary": "Affine-rescale a numeric column to a target mean/std.",
        "fields": {
            "column": "numeric feature (required)",
            "target": "object {mean?: number, std?: number} (at least one required)",
        },
    },
    "leakage": {
        "category": "leakage",
        "summary": "Plant a NEW column that is a near-perfect proxy for a target.",
        "fields": {
            "target": "numeric/boolean feature to leak (required)",
            "into": "new column name, must differ from target (required)",
            "noise": "proxy noise level relative to target spread (optional, default 0.05; smaller = stronger leak)",
        },
    },
}
149
+
150
# Feature-type reference keyed by type name; each entry has a prose ``summary``
# and a ``fields`` map of accepted keys to their descriptions. Exposed verbatim
# as ``feature_types`` in the capabilities manifest.
_FEATURE_TYPES: dict[str, dict[str, Any]] = {
    "numeric": {
        "summary": "Numbers from a distribution, optionally clamped and/or rounded to int. Omit 'dist' to make it a causal-derived column.",
        "fields": {
            "dist": "distribution name (see 'distributions'); omit for a causal target",
            "params": "object of distribution parameters",
            "min": "lower clamp (optional)",
            "max": "upper clamp (optional)",
            "dtype": "'float' (default) or 'int' (rounds to whole numbers)",
        },
    },
    "categorical": {
        "summary": "One label per row from a fixed set.",
        "fields": {
            "categories": "non-empty list of strings (required)",
            "weights": "list of non-negative numbers, positionally matched; normalized (optional, default uniform)",
        },
    },
    "boolean": {
        "summary": "True/false column.",
        "fields": {"rate": "probability of true, in [0,1] (default 0.5)"},
    },
    "datetime": {
        "summary": "Timestamps drawn uniformly in a range.",
        "fields": {
            "start": "ISO date string, e.g. '2023-01-01' (required)",
            "end": "ISO date string >= start (required)",
            "granularity": "'second' | 'minute' | 'hour' | 'day' (default 'day')",
        },
    },
    "text": {
        "summary": "Strings: 'lorem' filler or a realistic provider (see 'text_generators'). Realistic providers are seeded/reproducible.",
        "fields": {
            "generator": "'lorem' (default) or a realistic provider key",
            "locale": "locale for realistic providers (default 'en')",
            "length": "object {min, max} word-count range — lorem only (default {min:5,max:30})",
        },
    },
    "timeseries": {
        "summary": "Ordered additive series Xt = trend + seasonality + AR(p) + noise over the row index. Row order is the time axis (preserved). May be a causal parent; never a causal target; not distribution-compliance assessed.",
        "fields": {
            "trend": "object {slope, intercept} — linear trend (optional)",
            "seasonality": "list of {amplitude, period (>0), phase} sinusoids, summed (optional)",
            "ar": "list of AR coefficients [phi1..phip]; sum(|phi|) must be < 1 (stationarity)",
            "noise_std": "sigma of Gaussian innovations, >= 0 (default 1.0)",
            "min": "lower clamp (optional)",
            "max": "upper clamp (optional)",
            "dtype": "'float' (default) or 'int'",
        },
    },
}
201
+
202
# Field descriptions exposed as ``shared_feature_fields`` in the capabilities
# manifest, alongside the per-type ``feature_types`` table.
_SHARED_FEATURE_FIELDS: dict[str, str] = {
    "description": "free-text doc (optional)",
    "emit": "boolean; false = latent (computed/drives the SEM but NOT exported, and excluded from probe/compliance/correlation). Default true.",
}
206
+
207
# Prose validity rules for a spec, exposed in order as ``rules`` in the
# capabilities manifest.
_RULES: list[str] = [
    "Top-level required keys: datadoom_version (always \"1\"), name (slug [A-Za-z0-9_-]+), rows (int >= 1), features.",
    "A causal-derived feature (numeric or boolean) is declared WITHOUT a 'dist'/'rate' and MUST be the 'to' of at least one causal edge.",
    "A feature cannot be both sampled (has dist) and a causal target.",
    "The causal graph must be acyclic. Only numeric/boolean features can be causal targets.",
    "'map' edges require a categorical parent and a mapping covering every category; other fns require a numeric/boolean/timeseries parent.",
    "A difficulty 'label' must be a boolean or 2-class categorical feature, and must not be latent (emit:false).",
    "difficulty.knobs ⊆ {noise, label_noise}. target is a named tier or {band:[a,b]}.",
    "Failures are an ordered list applied after the clean baseline is captured; export versions must include 'injected' to write the corrupted variant.",
    "A failure cannot reference a latent (emit:false) feature.",
    "export.splits ratios must sum to 1.0. export.formats must be known formats.",
    "time-series AR must satisfy sum(|coefficients|) < 1 (stationarity).",
    "Determinism: same (spec, seed) -> identical bytes. Seed is NOT part of the spec hash.",
]
221
+
222
+
223
def _distributions() -> list[dict[str, Any]]:
    """Describe every registered distribution, sorted by name.

    Each record holds the registry name, the distribution's
    ``required_params`` and a ``builtin`` flag (true when the name has a
    curated annotation), followed by the annotation's own keys and, when the
    distribution exposes a non-``None`` ``param_schema``, that schema.
    """
    from .dist.builtins import REGISTRY

    records: list[dict[str, Any]] = []
    for dist_name in sorted(REGISTRY):
        impl = REGISTRY[dist_name]
        record: dict[str, Any] = {
            "name": dist_name,
            "required_params": list(impl.required_params),
            "builtin": dist_name in _DIST_ANNOTATIONS,
            **_DIST_ANNOTATIONS.get(dist_name, {}),
        }
        if getattr(impl, "param_schema", None) is not None:
            record["param_schema"] = impl.param_schema
        records.append(record)
    return records
239
+
240
+
241
def _structural_fns() -> list[dict[str, Any]]:
    """Describe every registered structural function, sorted by name.

    Each record carries ``name`` and a ``builtin`` flag (true when the name has
    a curated annotation), followed by the annotation's own keys if any.
    """
    from .causal.functions import STRUCTURAL_FNS

    return [
        {
            "name": fn_name,
            "builtin": fn_name in _FN_ANNOTATIONS,
            **_FN_ANNOTATIONS.get(fn_name, {}),
        }
        for fn_name in sorted(STRUCTURAL_FNS)
    ]
250
+
251
+
252
def _failure_modes() -> list[dict[str, Any]]:
    """Describe every registered failure mode, sorted by name.

    Each record carries ``type`` and a ``builtin`` flag (true when the mode has
    a curated annotation), followed by the annotation's own keys if any.
    """
    from .failure import FAILURE_MODES

    return [
        {
            "type": mode_name,
            "builtin": mode_name in _FAILURE_MODES,
            **_FAILURE_MODES.get(mode_name, {}),
        }
        for mode_name in sorted(FAILURE_MODES)
    ]
261
+
262
+
263
def _difficulty() -> dict[str, Any]:
    """Describe the difficulty block: tier bands, probe names, knobs and field notes.

    ``tiers`` maps each name in ``TIER_BANDS`` to its band as a list and
    ``probes`` is the sorted contents of ``PROBES``; the remaining entries are
    fixed descriptions.
    """
    from .difficulty import PROBES, TIER_BANDS

    tiers: dict[Any, list[Any]] = {}
    for tier_name, band in TIER_BANDS.items():
        tiers[tier_name] = list(band)

    section: dict[str, Any] = {"tiers": tiers, "probes": sorted(PROBES)}
    section["knobs"] = ["noise", "label_noise"]
    section["target"] = "a named tier (e.g. 'advanced') or an explicit {band: [a, b]} of AUROC"
    section["label"] = "the boolean / 2-class categorical column the baseline probe predicts"
    section["max_iters"] = "calibration steps, int >= 1 (default 8)"
    return section
274
+
275
+
276
def _exporters() -> dict[str, Any]:
    """Describe the export block: sorted registered formats, versions and field notes."""
    from .export import EXPORTERS

    field_notes = {
        "formats": "list of output formats (default [csv]); parquet needs the optional extra",
        "versions": "subset of {clean, injected} (default [clean])",
        "splits": "object {name: ratio} whose ratios sum to 1.0 (optional)",
        "shuffle": "boolean (default true)",
        "metadata": "boolean — write metadata.json (default true)",
    }
    known_formats = sorted(EXPORTERS)
    return {
        "formats": known_formats,
        "versions": ["clean", "injected"],
        "fields": field_notes,
    }
290
+
291
+
292
def _text_generators() -> dict[str, Any]:
    """Describe the text generators: the lorem filler plus the sorted realistic providers."""
    from .dist.providers import REALISTIC_GENERATORS

    providers = sorted(REALISTIC_GENERATORS)
    return {"lorem": "filler words (uses 'length' {min,max})", "realistic": providers}
299
+
300
+
301
def build_capabilities() -> dict[str, Any]:
    """Return the full, JSON-serializable spec capabilities manifest.

    The manifest combines fixed descriptions of the top-level spec keys and the
    causal block with the curated feature-type tables and rules, plus the
    sections assembled by the module's helper functions (distributions,
    structural functions, failure modes, difficulty, export, text generators).

    The curated module-level tables are deep-copied into the result so a
    caller that mutates the returned manifest cannot alter what later calls
    return.
    """
    from copy import deepcopy

    return {
        "datadoom_version": "1",
        "package_version": __version__,
        "summary": (
            "DataDoom spec capabilities. A spec is a YAML/JSON document describing a "
            "reproducible synthetic dataset. Use the exact names/fields below; same "
            "(spec, seed) regenerates identical data."
        ),
        "top_level_keys": {
            "datadoom_version": 'required, always "1"',
            "name": "required, slug [A-Za-z0-9_-]+",
            "description": "optional string",
            "seed": "optional int (reproducibility; not part of the spec hash)",
            "rows": "required int >= 1",
            "features": "required object {name: feature} — see feature_types",
            "causal": "optional DAG {edges, noise, interventions}",
            "difficulty": "optional classification difficulty target",
            "failures": "optional ordered list of corruptions",
            "export": "optional output config",
            "meta": "optional free-form object (ignored by the engine)",
        },
        # Copies, not the module-level objects themselves (see docstring).
        "shared_feature_fields": deepcopy(_SHARED_FEATURE_FIELDS),
        "feature_types": deepcopy(_FEATURE_TYPES),
        "distributions": _distributions(),
        "structural_fns": _structural_fns(),
        "causal": {
            "edges": "list of {from, to, fn, ...fn params} — see structural_fns",
            "noise": "object {derived_node: {dist: <name|none>, params: {...}}}",
            "interventions": "list of {do: {feature: value}} — fix a node to a constant",
        },
        "failure_modes": _failure_modes(),
        "difficulty": _difficulty(),
        "export": _exporters(),
        "text_generators": _text_generators(),
        "rules": deepcopy(_RULES),
    }