datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,289 @@
|
|
|
1
|
+
"""ML-handling advice for the issues a column carries (exploratory guidance).
|
|
2
|
+
|
|
3
|
+
DataDoom knows *exactly* what it did to every column — which failure mode hit it,
|
|
4
|
+
the realized magnitude, and (for derived columns) how it was generated. This
|
|
5
|
+
module turns that ground truth into **actionable guidance for the engineer or
|
|
6
|
+
student who will model the data**: for each issue, a plain-language explanation,
|
|
7
|
+
the single best handling approach, and a short menu of concrete techniques.
|
|
8
|
+
|
|
9
|
+
This is static knowledge keyed on ``(mechanism, column type)`` plus the realized
|
|
10
|
+
magnitude — no randomness, no model fitting. It exists so the Results screen can
|
|
11
|
+
answer "what should I focus on, and how do I deal with it?" without the user
|
|
12
|
+
having to recognise an MNAR pattern from a histogram themselves.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
from dataclasses import dataclass, field
|
|
18
|
+
from typing import Any
|
|
19
|
+
|
|
20
|
+
# Severity ranks the urgency of an issue for someone building a model on the data.
# "critical" = will silently break the model (leakage); "high" = caps achievable
# performance or biases estimates if mishandled; "medium"/"low" = handle with care.
Severity = str  # "critical" | "high" | "medium" | "low"
# Numeric rank for each severity name; a larger number means a more urgent issue.
# Used both to sort issues and to step a severity up by whole tiers.
_SEVERITY_RANK: dict[str, int] = {"critical": 3, "high": 2, "medium": 1, "low": 0}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Issue:
    """A single modelling concern attached to a column, plus how to handle it."""

    # Failure mechanism ("mnar", "leakage", …) or "class_imbalance".
    mode: str
    # Short headline, e.g. "Missing not at random".
    title: str
    severity: Severity
    # Human-readable realized effect, e.g. "12.0% of values missing".
    magnitude: str
    # What the issue is, in plain language.
    explanation: str
    # The single best way to handle it.
    recommendation: str
    # Concrete options to pick from.
    techniques: list[str] = field(default_factory=list)
    # Raw numbers for the UI.
    detail: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the issue as a plain dict keyed by field name.

        The values are the attribute objects themselves; no copies are made.
        """
        exported = (
            "mode",
            "title",
            "severity",
            "magnitude",
            "explanation",
            "recommendation",
            "techniques",
            "detail",
        )
        return {key: getattr(self, key) for key in exported}
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
# --- the knowledge base --------------------------------------------------------------
# Each entry is the fixed guidance for a mechanism, keyed by the mechanism name.
# Every entry carries the same five keys: ``title`` (short headline),
# ``base_severity`` (a severity name), ``explanation`` and ``recommendation``
# (plain-language prose) and ``techniques`` (a list of concrete options).
# ``base_severity`` is escalated by realized magnitude in the builders below
# where it makes sense.

_GUIDE: dict[str, dict[str, Any]] = {
    "mcar": {
        "title": "Missing completely at random",
        "base_severity": "low",
        "explanation": (
            "Values are missing independently of every column, including their own "
            "value. This is the most benign kind of missingness: the observed rows "
            "stay a fair, unbiased sample of the data."
        ),
        "recommendation": (
            "Simple imputation is unbiased here. Use the median (numeric) or mode "
            "(categorical), or drop rows if the rate is small (<5%)."
        ),
        "techniques": [
            "Median/mean imputation for numeric columns",
            "Mode (most-frequent) imputation for categoricals",
            "Listwise row deletion — unbiased under MCAR when the rate is low",
        ],
    },
    "mar": {
        "title": "Missing at random",
        "base_severity": "medium",
        "explanation": (
            "Whether a value is missing depends on *other observed* columns, not on "
            "the missing value itself. Naive mean/median imputation biases the column "
            "because it ignores those drivers; conditioning on them fixes it."
        ),
        "recommendation": (
            "Use conditional, model-based imputation that learns from the observed "
            "drivers (e.g. IterativeImputer / MICE or KNN), not a global mean."
        ),
        "techniques": [
            "IterativeImputer (MICE) — regresses the column on the others",
            "KNN imputation on the correlated features",
            "Add a binary missingness-indicator feature alongside the imputed value",
            "Avoid plain mean/median imputation — it biases under MAR",
        ],
    },
    "mnar": {
        "title": "Missing not at random",
        "base_severity": "high",
        "explanation": (
            "Whether a value is missing depends on the *unobserved value itself* "
            "(e.g. high earners hide income). No imputation is unbiased without "
            "modelling the missingness mechanism — the missing rows are a skewed "
            "sample, so filling them in from the observed rows distorts the column."
        ),
        "recommendation": (
            "Treat missingness as informative: add an explicit missing-indicator "
            "feature (it is often predictive on its own) and avoid pretending the "
            "data is MCAR/MAR. Consider selection / pattern-mixture models."
        ),
        "techniques": [
            "Add a binary 'is-missing' indicator — frequently predictive itself",
            "Pattern-mixture or selection models for the missingness mechanism",
            "Domain-informed bounds instead of point imputation",
            "Run a sensitivity analysis over plausible imputed values",
        ],
    },
    "label_noise": {
        "title": "Label noise",
        "base_severity": "high",
        "explanation": (
            "A fraction of the target labels are wrong. This caps the accuracy any "
            "model can honestly reach and, with high-capacity models, gets memorised "
            "as if it were signal — hurting generalisation."
        ),
        "recommendation": (
            "Train with noise-robust losses and strong regularisation, and use "
            "confident-learning tools to find and prune likely-mislabelled rows."
        ),
        "techniques": [
            "Confident learning (e.g. cleanlab) to flag mislabelled rows",
            "Noise-robust losses (label smoothing, symmetric / MAE loss)",
            "Early stopping and regularisation to resist memorising errors",
            "Ensemble disagreement to surface suspect labels",
        ],
    },
    "feature_noise": {
        "title": "Feature measurement noise",
        "base_severity": "medium",
        "explanation": (
            "Additive measurement noise was injected into this feature, lowering its "
            "signal-to-noise ratio. Linear models will attenuate its coefficient "
            "(regression dilution); the feature looks weaker than it truly is."
        ),
        "recommendation": (
            "Prefer noise-tolerant models and regularisation; aggregate or smooth if "
            "repeated measurements exist for the same entity."
        ),
        "techniques": [
            "L2 regularisation to stabilise the attenuated coefficient",
            "Tree ensembles — more tolerant of feature noise than linear models",
            "Aggregate/denoise if multiple readings per entity are available",
            "Robust scaling so the noise does not dominate distance metrics",
        ],
    },
    "drift": {
        "title": "Distribution drift over row order",
        "base_severity": "medium",
        "explanation": (
            "This feature's distribution shifts across the dataset index (its "
            "early rows differ from its late rows). A random train/test split leaks "
            "the late regime into training and overstates real-world performance."
        ),
        "recommendation": (
            "Split by row/time order rather than randomly, and validate on the later "
            "regime. Detrend the feature or add the time index as a feature."
        ),
        "techniques": [
            "Time-ordered train/test split (do not shuffle)",
            "Detrend / difference the feature to remove the systematic shift",
            "Add the row index or timestamp as an explicit feature",
            "Rolling or periodic retraining for a deployed model",
        ],
    },
    "covariate_shift": {
        "title": "Covariate shift",
        "base_severity": "medium",
        "explanation": (
            "This feature's marginal distribution was moved away from the original "
            "(train ≠ deployment distribution) while its relationship to the label is "
            "preserved. Models tuned on the original distribution mis-calibrate."
        ),
        "recommendation": (
            "Re-weight training rows by the density ratio (importance weighting), or "
            "re-standardise using deployment statistics, and validate on the shifted "
            "distribution."
        ),
        "techniques": [
            "Importance weighting via density-ratio estimation",
            "Domain-adaptation methods",
            "Re-standardise the feature using deployment-time statistics",
            "Validate explicitly on the shifted distribution",
        ],
    },
    "leakage": {
        "title": "Target leakage",
        "base_severity": "critical",
        "explanation": (
            "This column is a near-perfect proxy for the target that would NOT be "
            "available at prediction time. It inflates offline metrics to look great "
            "and then collapses in production. This is leakage, not signal."
        ),
        "recommendation": (
            "Drop this column before training. If a single feature gives suspiciously "
            "high accuracy/AUROC, treat it as leakage until proven otherwise."
        ),
        "techniques": [
            "Remove the leaking column from the feature set",
            "Audit all features for ones derived from the target",
            "Be suspicious of any single feature with near-perfect predictive power",
        ],
    },
    "class_imbalance": {
        "title": "Class imbalance",
        "base_severity": "medium",
        "explanation": (
            "The target classes are far from balanced. A model can score high "
            "accuracy by predicting the majority class while being useless on the "
            "minority class that usually matters most."
        ),
        "recommendation": (
            "Stop using raw accuracy — evaluate with PR-AUC / F1 / recall on the "
            "minority class, and rebalance via class weights or resampling."
        ),
        "techniques": [
            "Class weights (e.g. class_weight='balanced')",
            "Resampling: SMOTE / oversample minority or undersample majority",
            "Stratified train/test splitting and cross-validation",
            "Evaluate with PR-AUC, F1, recall — not accuracy",
            "Tune the decision threshold rather than defaulting to 0.5",
        ],
    },
}
|
|
232
|
+
|
|
233
|
+
|
|
234
|
+
def _escalate(base: Severity, fraction: float | None) -> Severity:
    """Raise ``base`` according to how large the corruption rate is.

    A ``fraction`` of at least 0.30 bumps the severity two tiers and one of at
    least 0.15 bumps it one tier. A smaller rate, or ``None``, leaves ``base``
    unchanged.
    """
    if fraction is None:
        return base
    # Checked from the largest threshold down so the first match wins.
    for threshold, tiers in ((0.30, 2), (0.15, 1)):
        if fraction >= threshold:
            return _bump(base, tiers)
    return base
|
|
243
|
+
|
|
244
|
+
|
|
245
|
+
def _bump(sev: Severity, by: int) -> Severity:
    """Return the severity ``by`` tiers above ``sev``, capped at rank 3.

    A severity name missing from the rank table is treated as rank 1. When no
    name carries the resulting rank, ``sev`` is returned unchanged.
    """
    wanted = min(3, _SEVERITY_RANK.get(sev, 1) + by)
    matches = (label for label, level in _SEVERITY_RANK.items() if level == wanted)
    return next(matches, sev)
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
def build_issue(
    mode: str, *, magnitude: str, fraction: float | None = None, detail: dict[str, Any] | None = None
) -> Issue:
    """Build the :class:`Issue` describing ``mode`` at its realized magnitude.

    For a known mechanism the guidance comes from the knowledge base and its
    base severity is escalated by ``fraction`` (a corruption rate in [0, 1];
    pass ``None`` where a rate is irrelevant). A mechanism with no entry gets a
    generic medium-severity issue, so a new plugin failure mode still renders
    coherently; ``fraction`` is not consulted in that case.
    """
    entry = _GUIDE.get(mode)
    payload = detail or {}

    if entry is not None:
        return Issue(
            mode=mode,
            title=entry["title"],
            severity=_escalate(entry["base_severity"], fraction),
            magnitude=magnitude,
            explanation=entry["explanation"],
            recommendation=entry["recommendation"],
            # Copied so the issue never shares the knowledge base's list.
            techniques=list(entry["techniques"]),
            detail=payload,
        )

    # Unknown mechanism: derive a headline from the name itself.
    return Issue(
        mode=mode,
        title=mode.replace("_", " ").title(),
        severity="medium",
        magnitude=magnitude,
        explanation=f"Column was modified by the {mode!r} mechanism.",
        recommendation="Inspect the realized effect and handle accordingly.",
        techniques=[],
        detail=payload,
    )
|
|
285
|
+
|
|
286
|
+
|
|
287
|
+
def severity_rank(sev: Severity) -> int:
    """Map a severity name to its sort rank; a larger rank is more urgent.

    A name that is missing from the rank table is ranked 1.
    """
    if sev in _SEVERITY_RANK:
        return _SEVERITY_RANK[sev]
    return 1
|
datadoom/engine/audit.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
"""Human-readable audit report for a generation (bound into the export bundle).
|
|
2
|
+
|
|
3
|
+
Renders the full :class:`~datadoom.engine.reports.ReportBundle` — compliance, the
|
|
4
|
+
per-column guide (stats + data-quality issues + ML advice), injected failures,
|
|
5
|
+
causal truth, difficulty, and the determinism record — into a single Markdown
|
|
6
|
+
document, ``audit_report.md``, opened with a clickable **table of contents** so a
|
|
7
|
+
reader can jump straight to a section (or a specific column). It ships alongside
|
|
8
|
+
the data, ``metadata.json`` and the locked ``spec.resolved.yaml`` so a downloaded
|
|
9
|
+
bundle is self-describing.
|
|
10
|
+
|
|
11
|
+
Pure, deterministic, **timestamp-free** (invariant #6): the same
|
|
12
|
+
``(spec_hash, seed)`` renders byte-identical Markdown.
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import re
|
|
18
|
+
from typing import TYPE_CHECKING, Any
|
|
19
|
+
|
|
20
|
+
if TYPE_CHECKING:
|
|
21
|
+
from .reports import ReportBundle
|
|
22
|
+
from .spec.models import Spec
|
|
23
|
+
|
|
24
|
+
_SEVERITY_MARK = {"critical": "🔴", "high": "🟠", "medium": "🟡", "low": "⚪"}
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _slug(text: str) -> str:
|
|
28
|
+
"""GitHub-style heading anchor: lowercase, drop punctuation, spaces → hyphens.
|
|
29
|
+
|
|
30
|
+
Matches the id common Markdown renderers (GitHub, VS Code) derive from a
|
|
31
|
+
heading, so the table-of-contents links resolve. Headings in this document
|
|
32
|
+
are kept free of decorative characters so the mapping stays unambiguous.
|
|
33
|
+
"""
|
|
34
|
+
s = text.strip().lower().replace("`", "")
|
|
35
|
+
s = re.sub(r"[^\w\s-]", "", s) # keep word chars, whitespace, hyphens
|
|
36
|
+
return s.replace(" ", "-")
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _fmt(x: Any) -> str:
|
|
40
|
+
"""Compact, stable formatting for a number (or pass-through for non-numbers)."""
|
|
41
|
+
if x is None:
|
|
42
|
+
return "—"
|
|
43
|
+
if isinstance(x, bool):
|
|
44
|
+
return str(x)
|
|
45
|
+
if isinstance(x, int):
|
|
46
|
+
return str(x)
|
|
47
|
+
if isinstance(x, float):
|
|
48
|
+
if x != x: # NaN
|
|
49
|
+
return "—"
|
|
50
|
+
if abs(x) >= 1e5 or (x != 0 and abs(x) < 1e-3):
|
|
51
|
+
return f"{x:.3e}"
|
|
52
|
+
return f"{x:.4g}"
|
|
53
|
+
return str(x)
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
def _pct(x: float | None) -> str:
|
|
57
|
+
return "—" if x is None else f"{x * 100:.1f}%"
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
def render_audit_markdown(
    spec: Spec, report: ReportBundle, *, package_version: str | None = None
) -> str:
    """Render the audit report for ``report`` as a Markdown string.

    The document opens with a title and a table of contents that lists only
    the sections that will actually be rendered, followed by the overview
    table and then each section writer in turn.

    Args:
        spec: Supplies the name, description, row count, declared features and
            ``datadoom_version`` shown in the title and overview.
        report: Its ``to_dict()`` output provides every report section.
        package_version: When given, adds an "Engine version" overview row.

    Returns:
        The report lines joined with newlines.
    """
    r = report.to_dict()
    det = r.get("determinism") or {}
    dist = r.get("distribution")
    profile = r.get("profile")
    failures = r.get("failures")
    truth = r.get("causal_truth")
    difficulty = r.get("difficulty")

    lines: list[str] = []
    w = lines.append

    # --- title -------------------------------------------------------------------
    w(f"# DataDoom Audit Report — {spec.name}")
    w("")
    if spec.description:
        w(f"_{spec.description}_")
        w("")
    w("> Regenerate byte-identical data from the locked `spec.resolved.yaml` in this")
    w("> bundle. This report is deterministic — no timestamps or ambient state.")
    w("")

    # --- table of contents -------------------------------------------------------
    # (title, present?) — only list sections that are actually rendered. Each
    # condition mirrors the guard at the top of the matching section writer.
    toc: list[tuple[str, bool]] = [
        ("Overview", True),
        ("Distribution compliance", bool(dist and dist.get("features"))),
        ("Column guide", bool(profile and profile.get("columns"))),
        ("Injected failures", bool(failures and failures.get("modes"))),
        ("Causal truth", bool(truth and truth.get("edges"))),
        ("Difficulty calibration", bool(difficulty)),
        ("Determinism", bool(det.get("artifact_checksums"))),
    ]
    w("## Contents")
    w("")
    for title, present in toc:
        if not present:
            continue
        w(f"- [{title}](#{_slug(title)})")
        if title == "Column guide" and profile:
            for col in profile["columns"]:
                n_issues = len(col.get("issues") or [])
                suffix = f" — {n_issues} issue{'s' if n_issues != 1 else ''}" if n_issues else ""
                # Two-space indent: a sub-item must reach the parent item's
                # content column to nest under it instead of being a sibling.
                w(f"  - [{col['name']}](#{_slug(col['name'])}){suffix}")
    w("")

    # --- overview ----------------------------------------------------------------
    w("## Overview")
    w("")
    w("| Field | Value |")
    w("|---|---|")
    w(f"| Spec hash | `{det.get('spec_hash', '—')}` |")
    w(f"| Seed | `{det.get('seed', '—')}` |")
    w(f"| Rows | {spec.rows} |")
    w(f"| Features (declared) | {len(spec.features)} |")
    w(f"| datadoom_version | {spec.datadoom_version} |")
    if package_version:
        w(f"| Engine version | {package_version} |")
    score = r.get("compliance_score")
    if score is not None:
        w(f"| Compliance score | {_pct(score)} |")
    w("")

    # Each writer emits nothing when its section has no data.
    _compliance(w, dist)
    _column_guide(w, profile)
    _failures(w, failures)
    _causal(w, truth)
    _difficulty(w, difficulty)
    _determinism(w, det)

    w("---")
    w("_Generated by DataDoom — controllable, reproducible synthetic data._")
    w("")
    return "\n".join(lines)
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
def _compliance(w: Any, dist: dict[str, Any] | None) -> None:
    """Write the "Distribution compliance" section, one table row per feature.

    Nothing is written when ``dist`` is empty or carries no ``features``.
    ``w`` is called once per output line.
    """
    checks = dist.get("features") if dist else None
    if not checks:
        return

    intro = (
        "## Distribution compliance",
        "",
        "Honest fit of each realized column against its requested distribution (KS or",
        "chi-square goodness-of-fit). Parameters are never refit to the sample.",
        "",
        "| Feature | Distribution | Emp. mean | Emp. std | Clamped | p-value | Verdict |",
        "|---|---|---|---|---|---|---|",
    )
    for line in intro:
        w(line)

    for check in checks:
        empirical = check.get("empirical") or {}
        applicable = check.get("applicable", True)
        if not applicable:
            verdict = "n/a"
        elif check.get("passed"):
            verdict = "pass"
        else:
            verdict = "review"
        cells = [
            f"{check.get('feature')}",
            f"{check.get('dist')}",
            _fmt(empirical.get("mean")),
            _fmt(empirical.get("std")),
            _pct(check.get("clamped_fraction")),
            # The p-value is only meaningful when the test applies.
            _fmt(check.get("p_value")) if applicable else "—",
            verdict,
        ]
        w("| " + " | ".join(cells) + " |")
    w("")
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def _column_guide(w: Any, profile: dict[str, Any] | None) -> None:
    """Write the "Column guide" section: one sub-heading per profiled column.

    Each column gets a metadata line, a one-line statistics bullet, an optional
    class preview (first six categories) and one bullet per issue with its
    explanation, recommendation and techniques nested beneath it.

    Nothing is written when ``profile`` is empty or carries no ``columns``.
    ``w`` is called once per output line.
    """
    if not profile or not profile.get("columns"):
        return
    s = profile.get("summary") or {}
    w("## Column guide")
    w("")
    w(
        f"{s.get('n_columns', 0)} columns · {s.get('columns_with_issues', 0)} with "
        f"data-quality issues · {s.get('critical_issues', 0)} critical · "
        f"{s.get('high_issues', 0)} high severity."
    )
    if s.get("label"):
        w("")
        w(f"Detected target column: **{s['label']}**.")
    w("")
    for col in profile["columns"]:
        # Heading is the bare column name so its anchor is stable for the TOC; the
        # role/type/derivation goes on the line below.
        w(f"### {col['name']}")
        meta = f"_{col.get('role', 'feature')} · {col.get('feature_type')} ({col.get('dtype')})_"
        parents = col.get("parents") or []
        if parents:
            meta += f" · _derived from: {', '.join(parents)}_"
        w(meta)
        w("")
        bits = [
            f"rows {col.get('count')}",
            f"missing {_pct(col.get('missing_pct'))}",
            f"unique {col.get('unique')}",
        ]
        stats = col.get("stats")
        if stats:
            bits += [
                f"mean {_fmt(stats.get('mean'))}",
                f"std {_fmt(stats.get('std'))}",
                f"min {_fmt(stats.get('min'))}",
                f"median {_fmt(stats.get('median'))}",
                f"max {_fmt(stats.get('max'))}",
            ]
        imb = col.get("imbalance")
        if imb:
            bits.append(f"balance {_pct(imb.get('majority_pct'))}/{_pct(imb.get('minority_pct'))}")
        inj = col.get("injected")
        if inj and inj.get("missing_pct"):
            bits.append(f"missing after injection {_pct(inj.get('missing_pct'))}")
        w("- " + " · ".join(bits))

        cats = col.get("categories")
        if cats:
            preview = ", ".join(f"{c['value']} {_pct(c['pct'])}" for c in cats[:6])
            w(f"- classes: {preview}")

        for issue in col.get("issues") or []:
            mark = _SEVERITY_MARK.get(issue.get("severity", "medium"), "•")
            w("")
            w(f"- {mark} **{issue.get('title')}** ({issue.get('severity')}) — {issue.get('magnitude')}")
            # Sub-bullets are indented to the parent item's content column
            # (2, then 4 spaces) so Markdown nests them rather than listing
            # them as siblings of the issue bullet.
            w(f"  - {issue.get('explanation')}")
            w(f"  - **How to handle it:** {issue.get('recommendation')}")
            for t in issue.get("techniques") or []:
                w(f"    - {t}")
        w("")
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def _failures(w: Any, failures: dict[str, Any] | None) -> None:
    """Write the "Injected failures" section: one bullet per applied corruption.

    Each mode's headline names its type and target; the remaining keys of the
    mode record are listed as nested bullets beneath it.

    Nothing is written when ``failures`` is empty or carries no ``modes``.
    ``w`` is called once per output line.
    """
    if not failures or not failures.get("modes"):
        return
    w("## Injected failures")
    w("")
    w(f"{failures.get('count', 0)} corruption(s) applied to the injected variant.")
    w("")
    for m in failures["modes"]:
        # The target is the first of ``column``, ``into``, or the joined keys
        # of ``nullified_fraction`` that is non-empty.
        target = m.get("column") or m.get("into") or ", ".join((m.get("nullified_fraction") or {}).keys())
        w(f"- **{m.get('type')}** → `{target}`")
        for k, v in m.items():
            if k in ("index", "type", "mechanism"):
                continue
            # Two-space indent so the detail nests under the mode bullet
            # instead of rendering as a sibling list item.
            w(f"  - {k}: {_fmt(v) if isinstance(v, (int, float)) else v}")
    w("")
|
|
239
|
+
|
|
240
|
+
|
|
241
|
+
def _causal(w: Any, truth: dict[str, Any] | None) -> None:
    """Write the "Causal truth" section: the edge table plus any interventions.

    Nothing is written when ``truth`` is empty or carries no ``edges``.
    ``w`` is called once per output line.
    """
    edges = truth.get("edges") if truth else None
    if not edges:
        return

    intro = (
        "## Causal truth",
        "",
        "The true generating graph (structural equations). Edges into an intervened",
        "node are detached (`active: false`).",
        "",
        "| From | To | Function | Active |",
        "|---|---|---|---|",
    )
    for line in intro:
        w(line)

    for edge in edges:
        w(f"| {edge.get('from')} | {edge.get('to')} | {edge.get('fn')} | {edge.get('active', True)} |")

    forced = truth.get("interventions") or {}
    if forced:
        rendered = ", ".join(f"do({node}={_fmt(value)})" for node, value in forced.items())
        w("")
        w("Interventions: " + rendered)
    w("")
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
def _difficulty(w: Any, diff: dict[str, Any] | None) -> None:
    """Write the "Difficulty calibration" section as a single summary sentence.

    The sentence reports the achieved metric, the target band, whether the
    band was met, and the iteration count. Nothing is written when ``diff``
    is empty. ``w`` is called once per output line.
    """
    if not diff:
        return
    goal = diff.get("target") or {}
    band = goal.get("band") or [None, None]
    w("## Difficulty calibration")
    w("")

    achieved = _fmt(diff.get("achieved_metric"))
    metric = str(diff.get("metric_name", "")).upper()
    outcome = "in band" if diff.get("band_met") else "closest reached"
    summary = (
        f"Achieved **{achieved} {metric}** against target band "
        f"{_fmt(band[0])}–{_fmt(band[1])} ({outcome}) "
        f"in {diff.get('iterations', 0)} iteration(s)."
    )
    w(summary)
    w("")
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
def _determinism(w: Any, det: dict[str, Any]) -> None:
|
|
278
|
+
checks = det.get("artifact_checksums") or {}
|
|
279
|
+
if not checks:
|
|
280
|
+
return
|
|
281
|
+
w("## Determinism")
|
|
282
|
+
w("")
|
|
283
|
+
w("SHA-256 checksums of the data artifacts — the same `(spec_hash, seed)`")
|
|
284
|
+
w("reproduces these byte-for-byte.")
|
|
285
|
+
w("")
|
|
286
|
+
w("| File | Checksum |")
|
|
287
|
+
w("|---|---|")
|
|
288
|
+
for name, digest in checks.items():
|
|
289
|
+
w(f"| `{name}` | `{digest}` |")
|
|
290
|
+
w("")
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""Causal engine — DAG construction, structural functions, SEM execution (05 §3)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .execute import execute_causal, resolve_interventions
|
|
6
|
+
from .functions import STRUCTURAL_FNS, StructuralFn
|
|
7
|
+
from .graph import CausalDag
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"CausalDag",
|
|
11
|
+
"StructuralFn",
|
|
12
|
+
"STRUCTURAL_FNS",
|
|
13
|
+
"execute_causal",
|
|
14
|
+
"resolve_interventions",
|
|
15
|
+
]
|