datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,327 @@
|
|
|
1
|
+
"""Per-column data profile — exploratory analysis the engine can do for you.
|
|
2
|
+
|
|
3
|
+
A synthetic dataset is special: the engine knows the ground truth of every column
|
|
4
|
+
— its declared type, how a derived column was generated, and exactly which
|
|
5
|
+
failure modes corrupted it and by how much. This module turns that into a
|
|
6
|
+
**per-column report card** so an engineer or student opening the Results screen
|
|
7
|
+
gets, at a glance, what each column is, its summary statistics, and — crucially —
|
|
8
|
+
*what's wrong with it and how to handle that when building an ML model*.
|
|
9
|
+
|
|
10
|
+
Pure engine code: deterministic pandas aggregation on the realized frame plus a
|
|
11
|
+
static advice lookup (:mod:`datadoom.engine.advice`). No randomness, no model
|
|
12
|
+
fitting — same ``(spec_hash, seed)`` → identical profile (invariant #6).
|
|
13
|
+
|
|
14
|
+
The profile is computed from the **clean** shipped frame (the canonical
|
|
15
|
+
artifact); when an **injected** variant exists, each column also carries its
|
|
16
|
+
post-corruption missing rate / moments so the realized impact of the failures is
|
|
17
|
+
visible next to the pristine baseline.
|
|
18
|
+
"""
|
|
19
|
+
|
|
20
|
+
from __future__ import annotations
|
|
21
|
+
|
|
22
|
+
from typing import TYPE_CHECKING, Any
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
|
|
27
|
+
from .advice import build_issue, severity_rank
|
|
28
|
+
|
|
29
|
+
if TYPE_CHECKING:
|
|
30
|
+
from .spec.models import Spec
|
|
31
|
+
|
|
32
|
+
# Categorical columns with more distinct values than this are summarised by their
|
|
33
|
+
# top categories only (a full breakdown would be noise for the reader).
|
|
34
|
+
_TOP_CATEGORIES = 12
|
|
35
|
+
# A target whose minority class is below this share is flagged as imbalanced.
|
|
36
|
+
_IMBALANCE_MINORITY = 0.35
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _num(x: Any) -> float | None:
|
|
40
|
+
"""Coerce to a JSON-safe float, mapping NaN/inf to ``None``."""
|
|
41
|
+
try:
|
|
42
|
+
v = float(x)
|
|
43
|
+
except (TypeError, ValueError):
|
|
44
|
+
return None
|
|
45
|
+
return v if np.isfinite(v) else None
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _parents_of(spec: Spec, col: str) -> list[str]:
|
|
49
|
+
"""Causal parents (edge sources) feeding a derived column, in spec order."""
|
|
50
|
+
if spec.causal is None:
|
|
51
|
+
return []
|
|
52
|
+
return [e.src for e in spec.causal.edges if e.dst == col]
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def _derived_names(spec: Spec) -> set[str]:
|
|
56
|
+
return set() if spec.causal is None else {e.dst for e in spec.causal.edges}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _label_column(spec: Spec) -> str | None:
|
|
60
|
+
"""Best guess at the target column.
|
|
61
|
+
|
|
62
|
+
Authoritative when a difficulty block names a ``label``; otherwise a
|
|
63
|
+
best-effort heuristic: a boolean/categorical causal *sink* (a derived column
|
|
64
|
+
that no other edge consumes) is almost always the prediction target. Returns
|
|
65
|
+
``None`` when the guess is ambiguous (zero or several candidates).
|
|
66
|
+
"""
|
|
67
|
+
if spec.difficulty is not None and getattr(spec.difficulty, "label", None):
|
|
68
|
+
return spec.difficulty.label
|
|
69
|
+
if spec.causal is None:
|
|
70
|
+
return None
|
|
71
|
+
sources = {e.src for e in spec.causal.edges}
|
|
72
|
+
candidates = [
|
|
73
|
+
e.dst
|
|
74
|
+
for e in spec.causal.edges
|
|
75
|
+
if e.dst not in sources
|
|
76
|
+
and (feat := spec.features.get(e.dst)) is not None
|
|
77
|
+
and feat.type in ("boolean", "categorical")
|
|
78
|
+
]
|
|
79
|
+
unique = list(dict.fromkeys(candidates))
|
|
80
|
+
return unique[0] if len(unique) == 1 else None
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _numeric_stats(series: pd.Series) -> dict[str, Any]:
|
|
84
|
+
"""Summary statistics for a numeric column (NaN-aware, JSON-safe)."""
|
|
85
|
+
values = pd.to_numeric(series, errors="coerce").to_numpy(dtype=float)
|
|
86
|
+
clean = values[np.isfinite(values)]
|
|
87
|
+
if clean.size == 0:
|
|
88
|
+
return {}
|
|
89
|
+
q = np.quantile(clean, [0.25, 0.5, 0.75])
|
|
90
|
+
return {
|
|
91
|
+
"mean": _num(clean.mean()),
|
|
92
|
+
"std": _num(clean.std()),
|
|
93
|
+
"min": _num(clean.min()),
|
|
94
|
+
"p25": _num(q[0]),
|
|
95
|
+
"median": _num(q[1]),
|
|
96
|
+
"p75": _num(q[2]),
|
|
97
|
+
"max": _num(clean.max()),
|
|
98
|
+
"skew": _num(pd.Series(clean).skew()) if clean.size > 2 else None,
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
|
|
102
|
+
def _category_breakdown(series: pd.Series) -> tuple[list[dict[str, Any]], dict[str, Any] | None]:
|
|
103
|
+
"""Top categories (value/count/pct) and an imbalance summary for a discrete column."""
|
|
104
|
+
counts = series.dropna().value_counts()
|
|
105
|
+
n = int(counts.sum())
|
|
106
|
+
if n == 0:
|
|
107
|
+
return [], None
|
|
108
|
+
top = [
|
|
109
|
+
{"value": _stringify(val), "count": int(c), "pct": c / n}
|
|
110
|
+
for val, c in counts.head(_TOP_CATEGORIES).items()
|
|
111
|
+
]
|
|
112
|
+
majority = int(counts.iloc[0])
|
|
113
|
+
minority = int(counts.iloc[-1])
|
|
114
|
+
imbalance = {
|
|
115
|
+
"classes": int(counts.size),
|
|
116
|
+
"majority_pct": majority / n,
|
|
117
|
+
"minority_pct": minority / n,
|
|
118
|
+
"ratio": majority / minority if minority else None,
|
|
119
|
+
}
|
|
120
|
+
return top, imbalance
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
def _stringify(val: Any) -> str:
|
|
124
|
+
if isinstance(val, (bool, np.bool_)):
|
|
125
|
+
return "true" if val else "false"
|
|
126
|
+
return str(val)
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _failures_by_column(diffs: list[dict[str, Any]] | None) -> dict[str, list[dict[str, Any]]]:
|
|
130
|
+
"""Invert the per-mode failure diffs into a per-column list of (mode, magnitude).
|
|
131
|
+
|
|
132
|
+
Each entry carries the realized magnitude (authoritative — the engine measured
|
|
133
|
+
it) so :func:`datadoom.engine.advice.build_issue` can size severity and the UI
|
|
134
|
+
can show the concrete effect.
|
|
135
|
+
"""
|
|
136
|
+
out: dict[str, list[dict[str, Any]]] = {}
|
|
137
|
+
|
|
138
|
+
def add(col: str, mode: str, magnitude: str, fraction: float | None, detail: dict[str, Any]) -> None:
|
|
139
|
+
out.setdefault(col, []).append(
|
|
140
|
+
{"mode": mode, "magnitude": magnitude, "fraction": fraction, "detail": detail}
|
|
141
|
+
)
|
|
142
|
+
|
|
143
|
+
for d in diffs or []:
|
|
144
|
+
mode = str(d.get("type") or d.get("mechanism") or "")
|
|
145
|
+
if mode == "mcar":
|
|
146
|
+
for col, frac in (d.get("nullified_fraction") or {}).items():
|
|
147
|
+
f = _num(frac) or 0.0
|
|
148
|
+
add(col, "mcar", f"{f * 100:.1f}% of values missing", f, {"rate": f})
|
|
149
|
+
elif mode in ("mar", "mnar"):
|
|
150
|
+
col = str(d.get("column"))
|
|
151
|
+
f = _num(d.get("realized_rate")) or 0.0
|
|
152
|
+
detail = {"rate": f, "driver": d.get("driver"), "self_dependent": d.get("self_dependent")}
|
|
153
|
+
add(col, mode, f"{f * 100:.1f}% of values missing", f, detail)
|
|
154
|
+
elif mode == "label_noise":
|
|
155
|
+
col = str(d.get("column"))
|
|
156
|
+
f = _num(d.get("flipped_fraction")) or 0.0
|
|
157
|
+
add(col, "label_noise", f"{f * 100:.1f}% of labels flipped", f, {"rate": f})
|
|
158
|
+
elif mode == "feature_noise":
|
|
159
|
+
col = str(d.get("column"))
|
|
160
|
+
sd = _num(d.get("realized_noise_std"))
|
|
161
|
+
add(
|
|
162
|
+
col,
|
|
163
|
+
"feature_noise",
|
|
164
|
+
f"σ≈{sd:.3g} noise added" if sd is not None else "noise added",
|
|
165
|
+
None,
|
|
166
|
+
{"noise_std": sd, "mean_shift": _num(d.get("realized_mean_shift"))},
|
|
167
|
+
)
|
|
168
|
+
elif mode == "drift":
|
|
169
|
+
col = str(d.get("column"))
|
|
170
|
+
shift = _num(d.get("total_shift"))
|
|
171
|
+
kind = d.get("kind", "linear")
|
|
172
|
+
add(
|
|
173
|
+
col,
|
|
174
|
+
"drift",
|
|
175
|
+
f"{shift:.3g} total {kind} shift" if shift is not None else f"{kind} drift",
|
|
176
|
+
None,
|
|
177
|
+
{"total_shift": shift, "kind": kind},
|
|
178
|
+
)
|
|
179
|
+
elif mode == "covariate_shift":
|
|
180
|
+
col = str(d.get("column"))
|
|
181
|
+
before, after = d.get("before") or {}, d.get("after") or {}
|
|
182
|
+
bm, am = _num(before.get("mean")), _num(after.get("mean"))
|
|
183
|
+
mag = f"mean {bm:.3g}→{am:.3g}" if bm is not None and am is not None else "distribution shifted"
|
|
184
|
+
add(col, "covariate_shift", mag, None, {"before": before, "after": after})
|
|
185
|
+
elif mode == "leakage":
|
|
186
|
+
col = str(d.get("into"))
|
|
187
|
+
corr = _num(d.get("realized_correlation"))
|
|
188
|
+
mag = f"corr={corr:.3f} with {d.get('target')}" if corr is not None else "high-MI proxy"
|
|
189
|
+
add(col, "leakage", mag, None, {"target": d.get("target"), "correlation": corr})
|
|
190
|
+
return out
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
def _role(name: str, derived: set[str], label: str | None, planted: bool) -> str:
|
|
194
|
+
"""How the column functions for modelling: label / leakage proxy / derived / feature."""
|
|
195
|
+
if planted:
|
|
196
|
+
return "leakage_proxy"
|
|
197
|
+
if name == label:
|
|
198
|
+
return "label"
|
|
199
|
+
if name in derived:
|
|
200
|
+
return "derived"
|
|
201
|
+
return "feature"
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def _column_profile(
    name: str,
    *,
    spec: Spec,
    clean: pd.DataFrame,
    injected: pd.DataFrame | None,
    derived: set[str],
    label: str | None,
    col_failures: list[dict[str, Any]],
) -> dict[str, Any]:
    """Assemble the report card for a single column.

    Args:
        name: Column to profile.
        spec: Spec the frame was generated from; supplies the declared
            feature type, description and causal parents.
        clean: The clean frame; the baseline for every column it contains.
        injected: The corrupted variant, if one exists. Columns absent from
            ``clean`` are profiled from here.
        derived: Names of columns that are the target of a causal edge.
        label: Name of the label column, or ``None`` if unknown.
        col_failures: Failure entries recorded for this column, each with
            ``mode``, ``magnitude``, ``fraction`` and ``detail`` keys.

    Returns:
        A dict with identity and role fields, missing/unique counts, either
        numeric ``stats`` or ``categories``/``imbalance`` (the unused ones
        are ``None``), an ``injected`` snapshot (or ``None``) and the
        severity-sorted ``issues`` list.
    """
    planted = name not in clean.columns  # e.g. a leakage proxy lives only in injected
    base = injected if planted and injected is not None else clean
    series = base[name]
    feat = spec.features.get(name)
    feature_type = feat.type if feat is not None else "synthetic"

    n = int(len(series))
    missing = int(series.isna().sum())
    profile: dict[str, Any] = {
        "name": name,
        "role": _role(name, derived, label, planted),
        "feature_type": feature_type,
        "dtype": str(series.dtype),
        "count": n,
        "missing": missing,
        "missing_pct": missing / n if n else 0.0,
        "unique": int(series.nunique(dropna=True)),
        "derived": name in derived,
        "parents": _parents_of(spec, name),
        "description": getattr(feat, "description", None),
    }

    # Booleans count as numeric dtypes in pandas but are profiled as categories.
    if pd.api.types.is_numeric_dtype(series) and not pd.api.types.is_bool_dtype(series):
        profile["stats"] = _numeric_stats(series)
        profile["categories"] = None
        profile["imbalance"] = None
    else:
        top, imbalance = _category_breakdown(series)
        profile["stats"] = None
        profile["categories"] = top
        profile["imbalance"] = imbalance

    # Post-corruption snapshot: how the column actually looks in the injected variant.
    if injected is not None and name in injected.columns and not planted:
        inj = injected[name]
        # Use the injected column's own length as the denominator, so the
        # rate stays correct even if the two frames differ in row count.
        inj_n = int(len(inj))
        inj_missing = int(inj.isna().sum())
        post: dict[str, Any] = {"missing_pct": inj_missing / inj_n if inj_n else 0.0}
        if pd.api.types.is_numeric_dtype(inj) and not pd.api.types.is_bool_dtype(inj):
            vals = pd.to_numeric(inj, errors="coerce").to_numpy(dtype=float)
            vals = vals[np.isfinite(vals)]
            if vals.size:
                post["mean"] = _num(vals.mean())
                post["std"] = _num(vals.std())
        profile["injected"] = post
    else:
        profile["injected"] = None

    # Issues: failure-mode corruptions + (for the label) class imbalance.
    issues = [
        build_issue(f["mode"], magnitude=f["magnitude"], fraction=f["fraction"], detail=f["detail"]).to_dict()
        for f in col_failures
    ]
    imbalance = profile.get("imbalance")
    if name == label and imbalance and imbalance["minority_pct"] < _IMBALANCE_MINORITY:
        ratio = imbalance.get("ratio")
        mag = (
            f"{imbalance['majority_pct'] * 100:.1f}% / {imbalance['minority_pct'] * 100:.1f}%"
            + (f" ({ratio:.1f}:1)" if ratio else "")
        )
        issues.append(
            build_issue("class_imbalance", magnitude=mag, fraction=None, detail=imbalance).to_dict()
        )
    # Highest-ranked severity first.
    issues.sort(key=lambda i: severity_rank(i["severity"]), reverse=True)
    profile["issues"] = issues
    return profile
|
|
280
|
+
|
|
281
|
+
|
|
282
|
+
def build_profile(
    spec: Spec,
    clean: pd.DataFrame,
    *,
    injected: pd.DataFrame | None = None,
    failure_diffs: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Build the full per-column data profile for the Results screen.

    Args:
        spec: Spec the frames were generated from.
        clean: The clean frame; its column order drives the output order.
        injected: Optional corrupted variant. Columns that exist only here
            (e.g. leakage proxies) are appended after the clean columns.
        failure_diffs: Optional per-mode failure diffs, regrouped by column
            to populate each card's issues.

    Returns:
        ``{"summary": ..., "columns": [...]}``. ``columns`` holds one report
        card per column. ``summary`` holds the row and column counts, the
        label column, and issue totals (overall, ``critical`` and ``high``).
    """
    derived = _derived_names(spec)
    label = _label_column(spec)
    failures = _failures_by_column(failure_diffs)

    ordered: list[str] = list(clean.columns)
    if injected is not None:
        for extra in injected.columns:
            if extra not in clean.columns:
                ordered.append(extra)

    cards: list[dict[str, Any]] = []
    for column in ordered:
        card = _column_profile(
            column,
            spec=spec,
            clean=clean,
            injected=injected,
            derived=derived,
            label=label,
            col_failures=failures.get(column, []),
        )
        cards.append(card)

    flagged = 0
    severities: list[Any] = []
    for card in cards:
        if card["issues"]:
            flagged += 1
        for issue in card["issues"]:
            severities.append(issue["severity"])

    summary = {
        "n_rows": int(len(clean)),
        "n_columns": len(cards),
        "label": label,
        "columns_with_issues": flagged,
        "total_issues": len(severities),
        "critical_issues": sum(1 for level in severities if level == "critical"),
        "high_issues": sum(1 for level in severities if level == "high"),
    }
    return {"summary": summary, "columns": cards}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Progress emission contract.
|
|
2
|
+
|
|
3
|
+
The engine stays framework-free: it emits stage events to a sink. In P0 the sink
|
|
4
|
+
is a no-op; later phases wire a WebSocket hub behind the same interface.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class ProgressEmitter:
    """Progress sink that discards every event. Subclasses publish events elsewhere."""

    def emit(self, stage: str, pct: int, message: str = "") -> None:  # noqa: D401
        """Accept a stage event (stage name, percentage, optional message) and ignore it."""
        return
|
|
@@ -0,0 +1,338 @@
|
|
|
1
|
+
"""Machine-readable spec capabilities manifest (the AI-authoring contract).
|
|
2
|
+
|
|
3
|
+
`build_capabilities()` returns a JSON-serializable dict that enumerates **every**
|
|
4
|
+
knob a DataDoom spec accepts, with its exact valid values and constraints. It is
|
|
5
|
+
built from the **live engine registries** (distributions, structural functions,
|
|
6
|
+
failure modes, exporters, text providers, difficulty tiers) so it is always in
|
|
7
|
+
sync with the running build *and* automatically reflects any registered plugin.
|
|
8
|
+
|
|
9
|
+
The authoritative *names* come from the registries; richer per-item *annotations*
|
|
10
|
+
(parameter domains, failure-mode fields, prose) are curated here and merged in by
|
|
11
|
+
name. An item with no annotation (e.g. a third-party plugin distribution) still
|
|
12
|
+
appears, carrying whatever the engine ABC exposes (required params, schema).
|
|
13
|
+
|
|
14
|
+
This is what you feed an LLM/agent so it can emit a valid `*.datadoom.yaml`
|
|
15
|
+
without guessing. The CLI surfaces it via ``datadoom spec-reference`` and the API
|
|
16
|
+
via ``GET /api/spec-reference``.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
from ..version import __version__
|
|
24
|
+
|
|
25
|
+
# --- curated annotations (merged onto registry-derived names by key) ----------------
|
|
26
|
+
|
|
27
|
+
_DIST_ANNOTATIONS: dict[str, dict[str, Any]] = {
|
|
28
|
+
"normal": {
|
|
29
|
+
"summary": "Symmetric bell curve over all real numbers.",
|
|
30
|
+
"params": {"mean": "center (any real)", "std": "spread, must be > 0"},
|
|
31
|
+
},
|
|
32
|
+
"lognormal": {
|
|
33
|
+
"summary": "Right-skewed, positive-only (income, prices). mu/sigma are of the underlying normal (log space).",
|
|
34
|
+
"params": {"mu": "mean of ln(X) (any real)", "sigma": "std of ln(X), must be > 0"},
|
|
35
|
+
},
|
|
36
|
+
"uniform": {
|
|
37
|
+
"summary": "Flat — every value in [low, high] equally likely.",
|
|
38
|
+
"params": {"low": "lower bound", "high": "upper bound, must be > low"},
|
|
39
|
+
},
|
|
40
|
+
"exponential": {
|
|
41
|
+
"summary": "Decaying, non-negative (waiting times). Mean == scale.",
|
|
42
|
+
"params": {"scale": "mean of the distribution, must be > 0"},
|
|
43
|
+
},
|
|
44
|
+
"poisson": {
|
|
45
|
+
"summary": "Discrete counts 0,1,2,… ; lam is the mean. Output is integer.",
|
|
46
|
+
"params": {"lam": "mean count, must be > 0"},
|
|
47
|
+
"discrete": True,
|
|
48
|
+
},
|
|
49
|
+
"pareto": {
|
|
50
|
+
"summary": "Heavy-tailed power law; values are >= xm. Smaller alpha = heavier tail.",
|
|
51
|
+
"params": {"alpha": "tail index, must be > 0", "xm": "minimum value (scale), must be > 0"},
|
|
52
|
+
},
|
|
53
|
+
}
|
|
54
|
+
|
|
55
|
+
_FN_ANNOTATIONS: dict[str, dict[str, Any]] = {
|
|
56
|
+
"linear": {
|
|
57
|
+
"summary": "weight·parent + bias.",
|
|
58
|
+
"fields": {"weight": "number (required)", "bias": "number (optional, default 0)"},
|
|
59
|
+
},
|
|
60
|
+
"logistic": {
|
|
61
|
+
"summary": "1/(1+e^-(weight·parent+bias)) — squash a driver to 0..1; typically the last edge into a boolean target.",
|
|
62
|
+
"fields": {"weight": "number (required)", "bias": "number (optional, default 0)"},
|
|
63
|
+
},
|
|
64
|
+
"polynomial": {
|
|
65
|
+
"summary": "Σ coeffs[i]·parent^i — curved/non-linear effect.",
|
|
66
|
+
"fields": {"coeffs": "non-empty list of numbers (required)"},
|
|
67
|
+
},
|
|
68
|
+
"map": {
|
|
69
|
+
"summary": "Look up mapping[parent_category] — turns a categorical parent into a number. Must cover every category.",
|
|
70
|
+
"fields": {"mapping": "object {category: number} covering all parent categories (required)"},
|
|
71
|
+
},
|
|
72
|
+
"identity": {"summary": "Pass the parent value through unchanged.", "fields": {}},
|
|
73
|
+
}
|
|
74
|
+
|
|
75
|
+
#: Failure modes: each is a list item under top-level ``failures`` with ``type`` + these fields.
|
|
76
|
+
_FAILURE_MODES: dict[str, dict[str, Any]] = {
|
|
77
|
+
"mcar": {
|
|
78
|
+
"category": "missingness",
|
|
79
|
+
"summary": "Missing Completely At Random — blanks chosen independently of the data.",
|
|
80
|
+
"fields": {
|
|
81
|
+
"column": "feature name (or use 'columns')",
|
|
82
|
+
"columns": "list of feature names (alternative to 'column')",
|
|
83
|
+
"rate": "fraction in [0,1] to blank (required)",
|
|
84
|
+
},
|
|
85
|
+
},
|
|
86
|
+
"mar": {
|
|
87
|
+
"category": "missingness",
|
|
88
|
+
"summary": "Missing At Random — blank probability depends on another observed column.",
|
|
89
|
+
"fields": {
|
|
90
|
+
"column": "feature to blank (required)",
|
|
91
|
+
"driver": "observed numeric/boolean feature that drives missingness (required)",
|
|
92
|
+
"rate": "expected fraction blanked, calibrated (required)",
|
|
93
|
+
"strength": "driver skew, number (optional, default 2.0)",
|
|
94
|
+
},
|
|
95
|
+
},
|
|
96
|
+
"mnar": {
|
|
97
|
+
"category": "missingness",
|
|
98
|
+
"summary": "Missing Not At Random — blank probability depends on the column's own value.",
|
|
99
|
+
"fields": {
|
|
100
|
+
"column": "feature to blank (required)",
|
|
101
|
+
"driver": "optional numeric/boolean driver (defaults to the column itself)",
|
|
102
|
+
"rate": "expected fraction blanked, calibrated (required)",
|
|
103
|
+
"strength": "skew, number (optional, default 2.0)",
|
|
104
|
+
},
|
|
105
|
+
},
|
|
106
|
+
"label_noise": {
|
|
107
|
+
"category": "noise",
|
|
108
|
+
"summary": "Flip a boolean / reassign a categorical label to a different class.",
|
|
109
|
+
"fields": {
|
|
110
|
+
"column": "boolean or categorical feature (required)",
|
|
111
|
+
"rate": "fraction in [0,1] to corrupt (required)",
|
|
112
|
+
},
|
|
113
|
+
},
|
|
114
|
+
"feature_noise": {
|
|
115
|
+
"category": "noise",
|
|
116
|
+
"summary": "Additive noise on a numeric column: x' = x + ε.",
|
|
117
|
+
"fields": {
|
|
118
|
+
"column": "numeric feature (required)",
|
|
119
|
+
"dist": "noise distribution name (required, e.g. normal)",
|
|
120
|
+
"params": "params for that distribution (e.g. {mean: 0, std: 1})",
|
|
121
|
+
},
|
|
122
|
+
},
|
|
123
|
+
"drift": {
|
|
124
|
+
"category": "shift",
|
|
125
|
+
"summary": "Gradually shift a numeric column across the row index (concept drift).",
|
|
126
|
+
"fields": {
|
|
127
|
+
"column": "numeric feature (required)",
|
|
128
|
+
"schedule": "object: {kind: linear|step, magnitude: number (total shift) OR rate: per-row slope, at: 0..1 (step only, default 0.5)}",
|
|
129
|
+
},
|
|
130
|
+
},
|
|
131
|
+
"covariate_shift": {
|
|
132
|
+
"category": "shift",
|
|
133
|
+
"summary": "Affine-rescale a numeric column to a target mean/std.",
|
|
134
|
+
"fields": {
|
|
135
|
+
"column": "numeric feature (required)",
|
|
136
|
+
"target": "object {mean?: number, std?: number} (at least one required)",
|
|
137
|
+
},
|
|
138
|
+
},
|
|
139
|
+
"leakage": {
|
|
140
|
+
"category": "leakage",
|
|
141
|
+
"summary": "Plant a NEW column that is a near-perfect proxy for a target.",
|
|
142
|
+
"fields": {
|
|
143
|
+
"target": "numeric/boolean feature to leak (required)",
|
|
144
|
+
"into": "new column name, must differ from target (required)",
|
|
145
|
+
"noise": "proxy noise level relative to target spread (optional, default 0.05; smaller = stronger leak)",
|
|
146
|
+
},
|
|
147
|
+
},
|
|
148
|
+
}
|
|
149
|
+
|
|
150
|
+
_FEATURE_TYPES: dict[str, dict[str, Any]] = {
|
|
151
|
+
"numeric": {
|
|
152
|
+
"summary": "Numbers from a distribution, optionally clamped and/or rounded to int. Omit 'dist' to make it a causal-derived column.",
|
|
153
|
+
"fields": {
|
|
154
|
+
"dist": "distribution name (see 'distributions'); omit for a causal target",
|
|
155
|
+
"params": "object of distribution parameters",
|
|
156
|
+
"min": "lower clamp (optional)",
|
|
157
|
+
"max": "upper clamp (optional)",
|
|
158
|
+
"dtype": "'float' (default) or 'int' (rounds to whole numbers)",
|
|
159
|
+
},
|
|
160
|
+
},
|
|
161
|
+
"categorical": {
|
|
162
|
+
"summary": "One label per row from a fixed set.",
|
|
163
|
+
"fields": {
|
|
164
|
+
"categories": "non-empty list of strings (required)",
|
|
165
|
+
"weights": "list of non-negative numbers, positionally matched; normalized (optional, default uniform)",
|
|
166
|
+
},
|
|
167
|
+
},
|
|
168
|
+
"boolean": {
|
|
169
|
+
"summary": "True/false column.",
|
|
170
|
+
"fields": {"rate": "probability of true, in [0,1] (default 0.5)"},
|
|
171
|
+
},
|
|
172
|
+
"datetime": {
|
|
173
|
+
"summary": "Timestamps drawn uniformly in a range.",
|
|
174
|
+
"fields": {
|
|
175
|
+
"start": "ISO date string, e.g. '2023-01-01' (required)",
|
|
176
|
+
"end": "ISO date string >= start (required)",
|
|
177
|
+
"granularity": "'second' | 'minute' | 'hour' | 'day' (default 'day')",
|
|
178
|
+
},
|
|
179
|
+
},
|
|
180
|
+
"text": {
|
|
181
|
+
"summary": "Strings: 'lorem' filler or a realistic provider (see 'text_generators'). Realistic providers are seeded/reproducible.",
|
|
182
|
+
"fields": {
|
|
183
|
+
"generator": "'lorem' (default) or a realistic provider key",
|
|
184
|
+
"locale": "locale for realistic providers (default 'en')",
|
|
185
|
+
"length": "object {min, max} word-count range — lorem only (default {min:5,max:30})",
|
|
186
|
+
},
|
|
187
|
+
},
|
|
188
|
+
"timeseries": {
|
|
189
|
+
"summary": "Ordered additive series Xt = trend + seasonality + AR(p) + noise over the row index. Row order is the time axis (preserved). May be a causal parent; never a causal target; not distribution-compliance assessed.",
|
|
190
|
+
"fields": {
|
|
191
|
+
"trend": "object {slope, intercept} — linear trend (optional)",
|
|
192
|
+
"seasonality": "list of {amplitude, period (>0), phase} sinusoids, summed (optional)",
|
|
193
|
+
"ar": "list of AR coefficients [phi1..phip]; sum(|phi|) must be < 1 (stationarity)",
|
|
194
|
+
"noise_std": "sigma of Gaussian innovations, >= 0 (default 1.0)",
|
|
195
|
+
"min": "lower clamp (optional)",
|
|
196
|
+
"max": "upper clamp (optional)",
|
|
197
|
+
"dtype": "'float' (default) or 'int'",
|
|
198
|
+
},
|
|
199
|
+
},
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
_SHARED_FEATURE_FIELDS = {
|
|
203
|
+
"description": "free-text doc (optional)",
|
|
204
|
+
"emit": "boolean; false = latent (computed/drives the SEM but NOT exported, and excluded from probe/compliance/correlation). Default true.",
|
|
205
|
+
}
|
|
206
|
+
|
|
207
|
+
_RULES = [
|
|
208
|
+
"Top-level required keys: datadoom_version (always \"1\"), name (slug [A-Za-z0-9_-]+), rows (int >= 1), features.",
|
|
209
|
+
"A causal-derived feature (numeric or boolean) is declared WITHOUT a 'dist'/'rate' and MUST be the 'to' of at least one causal edge.",
|
|
210
|
+
"A feature cannot be both sampled (has dist) and a causal target.",
|
|
211
|
+
"The causal graph must be acyclic. Only numeric/boolean features can be causal targets.",
|
|
212
|
+
"'map' edges require a categorical parent and a mapping covering every category; other fns require a numeric/boolean/timeseries parent.",
|
|
213
|
+
"A difficulty 'label' must be a boolean or 2-class categorical feature, and must not be latent (emit:false).",
|
|
214
|
+
"difficulty.knobs ⊆ {noise, label_noise}. target is a named tier or {band:[a,b]}.",
|
|
215
|
+
"Failures are an ordered list applied after the clean baseline is captured; export versions must include 'injected' to write the corrupted variant.",
|
|
216
|
+
"A failure cannot reference a latent (emit:false) feature.",
|
|
217
|
+
"export.splits ratios must sum to 1.0. export.formats must be known formats.",
|
|
218
|
+
"time-series AR must satisfy sum(|coefficients|) < 1 (stationarity).",
|
|
219
|
+
"Determinism: same (spec, seed) -> identical bytes. Seed is NOT part of the spec hash.",
|
|
220
|
+
]
|
|
221
|
+
|
|
222
|
+
|
|
223
|
+
def _distributions() -> list[dict[str, Any]]:
    """Describe every registered distribution, ordered by name.

    Each entry carries the distribution's ``name``, a list copy of its
    ``required_params`` and a ``builtin`` flag that is true when the name has
    an entry in ``_DIST_ANNOTATIONS``. The annotation fields (if any) are then
    merged on top, and ``param_schema`` is added when the distribution object
    exposes a non-``None`` one.
    """
    from .dist.builtins import REGISTRY

    def describe(dist_name: str) -> dict[str, Any]:
        impl = REGISTRY[dist_name]
        record: dict[str, Any] = {
            "name": dist_name,
            "required_params": list(impl.required_params),
            "builtin": dist_name in _DIST_ANNOTATIONS,
            # Annotation keys are merged last so they win on a name clash.
            **_DIST_ANNOTATIONS.get(dist_name, {}),
        }
        schema = getattr(impl, "param_schema", None)
        if schema is not None:
            record["param_schema"] = schema
        return record

    return [describe(dist_name) for dist_name in sorted(REGISTRY)]
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _structural_fns() -> list[dict[str, Any]]:
    """Describe every registered structural function, ordered by name.

    Each entry has the function ``name`` and a ``builtin`` flag (true when the
    name has an entry in ``_FN_ANNOTATIONS``), with that annotation's fields
    merged on top.
    """
    from .causal.functions import STRUCTURAL_FNS

    return [
        {
            "name": fn_name,
            "builtin": fn_name in _FN_ANNOTATIONS,
            **_FN_ANNOTATIONS.get(fn_name, {}),
        }
        for fn_name in sorted(STRUCTURAL_FNS)
    ]
|
|
250
|
+
|
|
251
|
+
|
|
252
|
+
def _failure_modes() -> list[dict[str, Any]]:
    """Describe every registered failure mode, ordered by name.

    Each entry has the mode name under ``type`` and a ``builtin`` flag (true
    when the name has an entry in the module-level ``_FAILURE_MODES``
    annotations), with that annotation's fields merged on top.
    """
    from .failure import FAILURE_MODES

    return [
        {
            "type": mode,
            "builtin": mode in _FAILURE_MODES,
            **_FAILURE_MODES.get(mode, {}),
        }
        for mode in sorted(FAILURE_MODES)
    ]
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
def _difficulty() -> dict[str, Any]:
    """Build the ``difficulty`` section of the capabilities manifest.

    ``tiers`` maps each name in ``TIER_BANDS`` to its band converted to a
    list, and ``probes`` is the sorted ``PROBES`` names. The remaining keys
    are fixed documentation strings.
    """
    from .difficulty import PROBES, TIER_BANDS

    tiers: dict[str, Any] = {}
    for tier_name, bounds in TIER_BANDS.items():
        tiers[tier_name] = list(bounds)

    probe_names = sorted(PROBES)

    section: dict[str, Any] = {
        "tiers": tiers,
        "probes": probe_names,
        "knobs": ["noise", "label_noise"],
        "target": "a named tier (e.g. 'advanced') or an explicit {band: [a, b]} of AUROC",
        "label": "the boolean / 2-class categorical column the baseline probe predicts",
        "max_iters": "calibration steps, int >= 1 (default 8)",
    }
    return section
|
|
274
|
+
|
|
275
|
+
|
|
276
|
+
def _exporters() -> dict[str, Any]:
    """Build the ``export`` section of the capabilities manifest.

    ``formats`` is the sorted list of names in ``EXPORTERS``. ``versions``
    and the per-field documentation strings are fixed.
    """
    from .export import EXPORTERS

    known_formats = sorted(EXPORTERS)
    field_docs = {
        "formats": "list of output formats (default [csv]); parquet needs the optional extra",
        "versions": "subset of {clean, injected} (default [clean])",
        "splits": "object {name: ratio} whose ratios sum to 1.0 (optional)",
        "shuffle": "boolean (default true)",
        "metadata": "boolean — write metadata.json (default true)",
    }
    return {
        "formats": known_formats,
        "versions": ["clean", "injected"],
        "fields": field_docs,
    }
|
|
290
|
+
|
|
291
|
+
|
|
292
|
+
def _text_generators() -> dict[str, Any]:
    """Build the ``text_generators`` section of the capabilities manifest.

    ``lorem`` is a fixed description string; ``realistic`` is the sorted list
    of names in ``REALISTIC_GENERATORS``.
    """
    from .dist.providers import REALISTIC_GENERATORS

    section: dict[str, Any] = {}
    section["lorem"] = "filler words (uses 'length' {min,max})"
    section["realistic"] = sorted(REALISTIC_GENERATORS)
    return section
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def build_capabilities() -> dict[str, Any]:
    """Return the full, JSON-serializable spec capabilities manifest.

    The manifest combines fixed documentation strings with sections built by
    the private helpers in this module (distributions, structural functions,
    failure modes, difficulty, export, text generators).

    The module-level tables ``_SHARED_FEATURE_FIELDS``, ``_FEATURE_TYPES`` and
    ``_RULES`` are copied into the result rather than embedded by reference,
    so a caller that mutates the returned manifest cannot alter what later
    calls return.

    Returns:
        A new dict on every call; key order is stable.
    """
    import copy

    return {
        "datadoom_version": "1",
        "package_version": __version__,
        "summary": (
            "DataDoom spec capabilities. A spec is a YAML/JSON document describing a "
            "reproducible synthetic dataset. Use the exact names/fields below; same "
            "(spec, seed) regenerates identical data."
        ),
        "top_level_keys": {
            "datadoom_version": 'required, always "1"',
            "name": "required, slug [A-Za-z0-9_-]+",
            "description": "optional string",
            "seed": "optional int (reproducibility; not part of the spec hash)",
            "rows": "required int >= 1",
            "features": "required object {name: feature} — see feature_types",
            "causal": "optional DAG {edges, noise, interventions}",
            "difficulty": "optional classification difficulty target",
            "failures": "optional ordered list of corruptions",
            "export": "optional output config",
            "meta": "optional free-form object (ignored by the engine)",
        },
        # Deep copies: these tables are shared module state and may be nested.
        "shared_feature_fields": copy.deepcopy(_SHARED_FEATURE_FIELDS),
        "feature_types": copy.deepcopy(_FEATURE_TYPES),
        "distributions": _distributions(),
        "structural_fns": _structural_fns(),
        "causal": {
            "edges": "list of {from, to, fn, ...fn params} — see structural_fns",
            "noise": "object {derived_node: {dist: <name|none>, params: {...}}}",
            "interventions": "list of {do: {feature: value}} — fix a node to a constant",
        },
        "failure_modes": _failure_modes(),
        "difficulty": _difficulty(),
        "export": _exporters(),
        "text_generators": _text_generators(),
        # A shallow copy is enough here: the list holds only immutable strings.
        "rules": list(_RULES),
    }
|