datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,279 @@
1
+ """Repositories — the only way the app reads/writes metadata rows.
2
+
3
+ Each repository wraps a live :class:`~sqlalchemy.orm.Session`. They enforce the
4
+ domain invariants from doc 06 §5 — most importantly **spec immutability**: an
5
+ edit never updates a spec row, it creates a new version and repoints the
6
+ dataset's ``current_spec_id``.
7
+
8
+ UUID PKs are generated here with ``uuid4``. This is the persistence layer, NOT
9
+ the engine data path, so the determinism ban on ``uuid4`` does not apply (DB
10
+ identity has no bearing on reproducible artifact bytes).
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import uuid
16
+ from typing import Any
17
+
18
+ from sqlalchemy import func, select
19
+ from sqlalchemy.orm import Session
20
+
21
+ from .db import utcnow_iso
22
+ from .models import (
23
+ ArtifactRow,
24
+ DatasetRow,
25
+ GenerationRunRow,
26
+ ReportRow,
27
+ SpecRow,
28
+ )
29
+
30
+
31
+ def _uid() -> str:
32
+ return str(uuid.uuid4())
33
+
34
+
35
class DatasetRepository:
    """Read/write access to :class:`DatasetRow` records through one session.

    Inserting methods flush so the new row is sent to the database before the
    call returns. Nothing in this class commits; the transaction boundary
    stays with whoever owns the session.
    """

    # Escape character declared to the database for LIKE patterns that are
    # built from caller-supplied search text.
    _LIKE_ESCAPE = "\\"

    def __init__(self, session: Session) -> None:
        self.s = session

    @staticmethod
    def _like_pattern(q: str) -> str:
        """Return a ``%...%`` substring pattern with LIKE metacharacters escaped.

        ``%`` and ``_`` in *q* would otherwise act as wildcards, so a search
        for ``"50%"`` would match every name containing ``"50"``. The escape
        character itself is doubled first so it survives as a literal.
        """
        esc = DatasetRepository._LIKE_ESCAPE
        literal = (
            q.replace(esc, esc + esc)
            .replace("%", esc + "%")
            .replace("_", esc + "_")
        )
        return f"%{literal}%"

    def create(
        self, name: str, description: str | None = None, owner_id: str | None = None
    ) -> DatasetRow:
        """Insert a new dataset in ``"draft"`` status and return the flushed row."""
        # One timestamp for both columns so a new row has created == updated.
        now = utcnow_iso()
        row = DatasetRow(
            dataset_id=_uid(),
            name=name,
            description=description,
            status="draft",
            owner_id=owner_id,
            created_at=now,
            updated_at=now,
        )
        self.s.add(row)
        self.s.flush()
        return row

    def get(self, dataset_id: str) -> DatasetRow | None:
        """Return the dataset with this primary key, or ``None``."""
        return self.s.get(DatasetRow, dataset_id)

    def get_by_name(self, name: str, owner_id: str | None = None) -> DatasetRow | None:
        """Return the first dataset called *name* for *owner_id*, or ``None``.

        ``owner_id=None`` matches rows whose owner is NULL.
        """
        stmt = select(DatasetRow).where(
            DatasetRow.name == name,
            # ``==`` renders ``IS NULL`` for None and ``= :param`` for a value.
            # The previous ``is_(owner_id)`` emitted ``IS :param`` for a real
            # owner, which is not valid SQL on every backend.
            DatasetRow.owner_id == owner_id,
        )
        return self.s.scalars(stmt).first()

    def list(
        self,
        status: str | None = None,
        q: str | None = None,
        limit: int = 50,
        offset: int = 0,
    ) -> tuple[list[DatasetRow], int]:
        """Return one page of datasets plus the total number of matches.

        Args:
            status: Keep only rows with this exact status (ignored when falsy).
            q: Keep only rows whose name contains this text, matched literally
                (ignored when falsy).
            limit: Maximum number of rows in the page.
            offset: Number of matching rows to skip.

        Returns:
            ``(rows, total)`` where *rows* is ordered by ``updated_at``
            descending and *total* counts every match, not just this page.
        """
        filters = []
        if status:
            filters.append(DatasetRow.status == status)
        if q:
            filters.append(
                DatasetRow.name.like(self._like_pattern(q), escape=self._LIKE_ESCAPE)
            )

        stmt = select(DatasetRow)
        count_stmt = select(func.count()).select_from(DatasetRow)
        if filters:
            # The same predicates go on both statements so the total agrees
            # with the page contents.
            stmt = stmt.where(*filters)
            count_stmt = count_stmt.where(*filters)

        total = self.s.scalar(count_stmt) or 0
        stmt = stmt.order_by(DatasetRow.updated_at.desc()).limit(limit).offset(offset)
        return list(self.s.scalars(stmt).all()), int(total)

    def touch(self, row: DatasetRow) -> None:
        """Stamp *row* with the current time as its ``updated_at``."""
        row.updated_at = utcnow_iso()

    def update(
        self, row: DatasetRow, name: str | None = None, description: str | None = None
    ) -> DatasetRow:
        """Apply the given fields to *row*, refresh ``updated_at``, return it.

        ``None`` means "leave unchanged", so this method cannot clear a field.
        """
        if name is not None:
            row.name = name
        if description is not None:
            row.description = description
        self.touch(row)
        return row

    def set_status(self, row: DatasetRow, status: str) -> None:
        """Set the dataset status and refresh ``updated_at``."""
        row.status = status
        self.touch(row)

    def delete(self, row: DatasetRow) -> None:
        """Mark *row* for deletion in the session."""
        # ORM cascade removes specs/runs/artifacts/reports rows.
        self.s.delete(row)
105
+
106
+
107
class SpecRepository:
    """Append-only access to :class:`SpecRow` snapshots.

    No method here updates an existing spec row; an edit is recorded by adding
    a new version and moving the dataset's ``current_spec_id`` to it.
    """

    def __init__(self, session: Session) -> None:
        self.s = session

    def create_version(
        self,
        dataset: DatasetRow,
        body: dict[str, Any],
        spec_hash: str,
        datadoom_version: str,
    ) -> SpecRow:
        """Store *body* as the dataset's next spec version and make it current.

        The version number is one more than the highest version already stored
        for the dataset (so the first snapshot is version 1). The new row is
        flushed before the dataset is repointed.
        """
        highest_stmt = select(func.coalesce(func.max(SpecRow.version), 0)).where(
            SpecRow.dataset_id == dataset.dataset_id
        )
        highest = self.s.scalar(highest_stmt) or 0
        snapshot = SpecRow(
            spec_id=_uid(),
            dataset_id=dataset.dataset_id,
            spec_hash=spec_hash,
            body=body,
            datadoom_version=datadoom_version,
            version=highest + 1,
            created_at=utcnow_iso(),
        )
        self.s.add(snapshot)
        self.s.flush()

        # Repoint the dataset only after the snapshot row has been flushed.
        dataset.current_spec_id = snapshot.spec_id
        dataset.updated_at = utcnow_iso()
        return snapshot

    def get(self, spec_id: str) -> SpecRow | None:
        """Return the spec with this primary key, or ``None``."""
        return self.s.get(SpecRow, spec_id)

    def current(self, dataset: DatasetRow) -> SpecRow | None:
        """Return the spec *dataset* currently points at, or ``None`` if unset."""
        pointer = dataset.current_spec_id
        return None if pointer is None else self.s.get(SpecRow, pointer)

    def history(self, dataset_id: str) -> list[SpecRow]:
        """Return every spec version of the dataset, newest version first."""
        stmt = select(SpecRow).where(SpecRow.dataset_id == dataset_id)
        stmt = stmt.order_by(SpecRow.version.desc())
        return list(self.s.scalars(stmt).all())

    def by_version(self, dataset_id: str, version: int) -> SpecRow | None:
        """Return the dataset's spec with this version number, or ``None``."""
        matches = self.s.scalars(
            select(SpecRow).where(
                SpecRow.dataset_id == dataset_id, SpecRow.version == version
            )
        )
        return matches.first()
163
+
164
+
165
class RunRepository:
    """Create, look up, rename and delete :class:`GenerationRunRow` records."""

    def __init__(self, session: Session) -> None:
        self.s = session

    def create(
        self, dataset_id: str, spec_id: str, seed: int, name: str | None = None
    ) -> GenerationRunRow:
        """Insert a run in ``"queued"`` status at 0 % progress and return it flushed."""
        run = GenerationRunRow(
            run_id=_uid(),
            dataset_id=dataset_id,
            spec_id=spec_id,
            name=name,
            seed=seed,
            status="queued",
            progress_pct=0,
            created_at=utcnow_iso(),
        )
        self.s.add(run)
        self.s.flush()
        return run

    def get(self, run_id: str) -> GenerationRunRow | None:
        """Return the run with this primary key, or ``None``."""
        return self.s.get(GenerationRunRow, run_id)

    def set_name(self, row: GenerationRunRow, name: str) -> GenerationRunRow:
        """Assign *name* to the run and hand the same row back."""
        row.name = name
        return row

    def delete(self, row: GenerationRunRow) -> None:
        """Mark the run for deletion, clearing the dataset's pointer to it first."""
        # Clear the dataset's recorded latest-run pointer if it referenced this run
        # (it's a soft reference, so the cascade won't touch it).
        owner = self.s.get(DatasetRow, row.dataset_id)
        points_at_run = owner is not None and owner.latest_run_id == row.run_id
        if points_at_run:
            owner.latest_run_id = None
        # ORM cascade removes artifact/report rows.
        self.s.delete(row)

    def list_for_dataset(self, dataset_id: str) -> list[GenerationRunRow]:
        """Return the dataset's runs, most recently created first."""
        stmt = select(GenerationRunRow).where(
            GenerationRunRow.dataset_id == dataset_id
        )
        stmt = stmt.order_by(GenerationRunRow.created_at.desc())
        return list(self.s.scalars(stmt).all())

    def find_repro(self, spec_id: str, seed: int) -> GenerationRunRow | None:
        """Return the most recently created run with this spec and seed, or ``None``."""
        same_inputs = select(GenerationRunRow).where(
            GenerationRunRow.spec_id == spec_id, GenerationRunRow.seed == seed
        )
        newest_first = same_inputs.order_by(GenerationRunRow.created_at.desc())
        return self.s.scalars(newest_first).first()
217
+
218
+
219
class ArtifactRepository:
    """Record and look up :class:`ArtifactRow` entries for generation runs."""

    def __init__(self, session: Session) -> None:
        self.s = session

    def add(
        self,
        run_id: str,
        version: str,
        fmt: str,
        storage_uri: str,
        checksum_sha256: str,
        size_bytes: int,
        split: str | None = None,
    ) -> ArtifactRow:
        """Insert an artifact record for *run_id* and return the flushed row.

        *fmt* is stored in the row's ``format`` column; every other argument
        is stored under its own name.
        """
        artifact = ArtifactRow(
            artifact_id=_uid(),
            run_id=run_id,
            version=version,
            split=split,
            format=fmt,
            storage_uri=storage_uri,
            checksum_sha256=checksum_sha256,
            size_bytes=size_bytes,
            created_at=utcnow_iso(),
        )
        self.s.add(artifact)
        self.s.flush()
        return artifact

    def get(self, artifact_id: str) -> ArtifactRow | None:
        """Return the artifact with this primary key, or ``None``."""
        return self.s.get(ArtifactRow, artifact_id)

    def list_for_run(self, run_id: str) -> list[ArtifactRow]:
        """Return every artifact recorded for the run (no explicit ordering)."""
        rows = self.s.scalars(select(ArtifactRow).where(ArtifactRow.run_id == run_id))
        return list(rows.all())
254
+
255
+
256
class ReportRepository:
    """Store and fetch the :class:`ReportRow` attached to a generation run."""

    # Report columns copied from the ``sections`` mapping, in assignment order.
    _SECTION_NAMES = (
        "compliance_score",
        "distribution",
        "correlation",
        "mutual_information",
        "causal_truth",
        "difficulty",
        "failures",
        "profile",
        "determinism",
    )

    def __init__(self, session: Session) -> None:
        self.s = session

    def upsert(self, run_id: str, sections: dict[str, Any]) -> ReportRow:
        """Create or overwrite the run's report from *sections* and flush it.

        Every known section is assigned on each call, so a key missing from
        *sections* resets that column to ``None``.
        """
        report = self.get_for_run(run_id)
        if report is None:
            report = ReportRow(report_id=_uid(), run_id=run_id, created_at=utcnow_iso())
            self.s.add(report)
        for section in self._SECTION_NAMES:
            setattr(report, section, sections.get(section))
        self.s.flush()
        return report

    def get_for_run(self, run_id: str) -> ReportRow | None:
        """Return the first report stored for the run, or ``None``."""
        lookup = select(ReportRow).where(ReportRow.run_id == run_id)
        return self.s.scalars(lookup).first()
@@ -0,0 +1,239 @@
1
+ """Built-in domain templates (17 step 18, 09 §4.6).
2
+
3
+ A template is a curated, ready-to-run spec plus catalog metadata. The web gallery
4
+ and ``datadoom template`` surface them so a user can start from a realistic
5
+ domain dataset in one click. Templates are *data only* (no code); this module is
6
+ a thin loader that reads the bundled YAML via :mod:`importlib.resources`, so it
7
+ works the same from the source tree and an installed wheel.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ from dataclasses import dataclass
13
+ from importlib import resources
14
+ from typing import Any
15
+
16
+ import yaml
17
+
18
+
19
@dataclass(frozen=True)
class TemplateMeta:
    """Immutable catalog record describing one bundled template."""

    id: str
    name: str
    domain: str
    description: str
    tags: tuple[str, ...]
    filename: str
    features: tuple[str, ...] = ()  # showcased engine features (causal/failures/…)
    level: str = "starter"  # "starter" (learn one feature) | "hackathon" (full enterprise challenge)

    def to_summary(self) -> dict[str, Any]:
        """Return the catalog entry as a plain dict.

        ``features`` are appended to ``tags`` in the output, and ``filename``
        is left out.
        """
        merged_tags = [*self.tags, *self.features]
        summary: dict[str, Any] = {}
        summary["id"] = self.id
        summary["name"] = self.name
        summary["domain"] = self.domain
        summary["description"] = self.description
        summary["tags"] = merged_tags
        summary["level"] = self.level
        return summary
41
+
42
+
43
# The built-in template catalog, in display order: the four "hackathon"
# entries first, then the starters (which rely on the default level,
# "starter"). Each ``filename`` names the spec YAML that
# ``load_template_text`` reads from this package.
CATALOG: tuple[TemplateMeta, ...] = (
    # ── Hackathon flagships ──────────────────────────────────────────────────
    # Enterprise-grade ML challenges: each composes a deep causal DAG, a latent
    # confounder, a stacked data-quality failure profile and (where applicable)
    # a calibrated difficulty band — a realistic dataset to build a model on,
    # carrying a `meta.challenge` brief (target / metric / split / gotchas).
    TemplateMeta(
        id="credit-default-challenge",
        name="Credit default (challenge)",
        domain="Finance",
        description=(
            "Consumer credit default. Demographics and employment drive income, which "
            "feeds a latent risk score behind the default label — calibrated to the "
            "'advanced' AUROC band, then corrupted with MNAR income, drifting debt-to-"
            "income, a leaked collections proxy and label noise. Train/test split included."
        ),
        tags=("credit-risk", "classification"),
        features=("causal", "latent", "difficulty", "failure-injection", "leakage"),
        filename="credit_default_challenge.datadoom.yaml",
        level="hackathon",
    ),
    TemplateMeta(
        id="clinical-deterioration",
        name="Clinical deterioration (challenge)",
        domain="Healthcare",
        description=(
            "ICU early-warning with a hidden confounder: a latent illness severity drives "
            "both the observed vitals (heart rate, lactate, BP) and the outcome, so the "
            "vitals are confounded proxies. Calibrated to 'advanced', with realistic "
            "MNAR/MAR/MCAR clinical missingness."
        ),
        tags=("clinical", "classification"),
        features=("causal", "latent", "confounder", "difficulty", "missingness"),
        filename="clinical_deterioration.datadoom.yaml",
        level="hackathon",
    ),
    TemplateMeta(
        id="predictive-maintenance",
        name="Predictive maintenance (challenge)",
        domain="Industrial IoT",
        description=(
            "Turbine maintenance on multi-sensor time-series (vibration, bearing temp, oil "
            "pressure) plus load and component grade, driving a latent wear index behind a "
            "maintenance label. The load regime drifts, gains sensor noise and drops "
            "readings; a leaked alarm proxy is planted. Sequential — preserve row order."
        ),
        tags=("predictive-maintenance", "classification"),
        features=("time-series", "causal", "latent", "drift", "leakage"),
        filename="predictive_maintenance.datadoom.yaml",
        level="hackathon",
    ),
    TemplateMeta(
        id="telecom-churn-challenge",
        name="Telecom churn (challenge)",
        domain="Telecom",
        description=(
            "Customer churn with realistic records: believable identity fields (name, "
            "email, city) sit beside the real signal — tenure, charges, support load, "
            "contract and usage feed a latent dissatisfaction score. Calibrated to the hard "
            "'kaggle' AUROC band, with MNAR usage and noisy labels. Drop the identifiers."
        ),
        tags=("churn", "classification"),
        features=("causal", "latent", "difficulty", "realistic-text", "missingness"),
        filename="telecom_churn_challenge.datadoom.yaml",
        level="hackathon",
    ),
    # ── Starter templates (learn one capability at a time) ───────────────────
    TemplateMeta(
        id="fraud-detection",
        name="Transaction fraud",
        domain="Finance",
        description=(
            "Customer age and card type drive monthly spend, which drives a fraud-risk "
            "label — then realistic data-quality failures (under-reported spend, random "
            "gaps, mislabels) corrupt a copy so you can study robustness."
        ),
        tags=("classification",),
        features=("causal", "failure-injection"),
        filename="fraud_detection.datadoom.yaml",
    ),
    TemplateMeta(
        id="customer-churn",
        name="Customer churn",
        domain="SaaS",
        description=(
            "Tenure, monthly charges and support load feed a latent satisfaction score "
            "behind a churn label, calibrated down to an 'intermediate' baseline-AUROC "
            "band so the dataset is hard in a measured, honest way."
        ),
        tags=("classification",),
        features=("difficulty", "latent", "causal"),
        filename="customer_churn.datadoom.yaml",
    ),
    TemplateMeta(
        id="hospital-readmission",
        name="Hospital readmission",
        domain="Healthcare",
        description=(
            "Patient age, diagnosis count, length of stay and prior admissions drive a "
            "latent severity score behind a 30-day readmission label — a clean causal "
            "starter with a hidden confounder in the true graph."
        ),
        tags=("classification",),
        features=("causal", "latent"),
        filename="hospital_readmission.datadoom.yaml",
    ),
    TemplateMeta(
        id="ecommerce-orders",
        name="E-commerce orders",
        domain="E-commerce",
        description=(
            "A fast orders table — lognormal order value, basket quantity, product "
            "category, channel, order date and a return flag. Distribution-only, so it "
            "generates instantly."
        ),
        tags=("tabular",),
        features=("distributions", "datetime"),
        filename="ecommerce_orders.datadoom.yaml",
    ),
    TemplateMeta(
        id="iot-sensors",
        name="IoT sensor readings",
        domain="IoT",
        description=(
            "Hourly multi-sensor telemetry — temperature, humidity, pressure and battery "
            "by device, with bounded distributions that stay physically plausible."
        ),
        tags=("tabular",),
        features=("numeric", "datetime"),
        filename="iot_sensors.datadoom.yaml",
    ),
    TemplateMeta(
        id="people-directory",
        name="People directory",
        domain="People",
        description=(
            "Believable identities — names, emails, phones, companies, job titles, "
            "cities and countries — via deterministic mimesis providers. Great for demos "
            "and UIs that need realistic-looking records."
        ),
        tags=("tabular",),
        features=("realistic-text", "datetime"),
        filename="people_directory.datadoom.yaml",
    ),
    TemplateMeta(
        id="marketing-ab-test",
        name="Marketing A/B test",
        domain="Marketing",
        description=(
            "A web experiment — 50/50 control vs. treatment, session engagement "
            "(exponential dwell, Poisson pageviews), conversion and revenue. "
            "Distribution-only and instant."
        ),
        tags=("experiment",),
        features=("distributions",),
        filename="ab_test.datadoom.yaml",
    ),
    TemplateMeta(
        id="insurance-claims",
        name="Insurance claims",
        domain="Insurance",
        description=(
            "Claims with a heavy-tailed (Pareto) claim amount — most small, a few very "
            "large — plus policyholder age, region, prior-claim count and a fraud flag."
        ),
        tags=("tabular",),
        features=("heavy-tail",),
        filename="insurance_claims.datadoom.yaml",
    ),
)

# Lookup table: template id -> catalog entry. Built once at import time; if two
# entries ever shared an id, the later one would silently win.
_BY_ID = {t.id: t for t in CATALOG}
215
+
216
+
217
def list_templates() -> list[TemplateMeta]:
    """Return a new list holding every built-in template, in catalog order."""
    return [*CATALOG]
220
+
221
+
222
def get_template(template_id: str) -> TemplateMeta | None:
    """Return the catalog entry with this id, or ``None`` when there is none."""
    try:
        return _BY_ID[template_id]
    except KeyError:
        return None
224
+
225
+
226
def load_template_text(template_id: str) -> str:
    """Return the bundled spec YAML for *template_id* as raw UTF-8 text.

    The file is returned unparsed, so its comments are kept — good for
    `template show`.

    Raises:
        KeyError: If no built-in template has this id.
    """
    if template_id not in _BY_ID:
        raise KeyError(f"unknown template {template_id!r}")
    spec_file = resources.files(__package__).joinpath(_BY_ID[template_id].filename)
    return spec_file.read_text(encoding="utf-8")
232
+
233
+
234
def load_template_body(template_id: str) -> dict[str, Any]:
    """Load a template's spec YAML and return it as an unvalidated dict.

    Raises:
        KeyError: If no built-in template has this id.
        ValueError: If the YAML's top level is not a mapping.
    """
    parsed = yaml.safe_load(load_template_text(template_id))
    if isinstance(parsed, dict):
        return parsed
    raise ValueError(f"template {template_id!r} did not parse to a mapping")
@@ -0,0 +1,46 @@
1
+ datadoom_version: "1"
2
+ name: "marketing-ab-test"
3
+ description: >
4
+ Marketing A/B-test starter — a fast experiment table: a 50/50 control vs.
5
+ treatment assignment, session engagement (exponential dwell time, Poisson
6
+ pageviews), a conversion flag and order revenue. Distribution-only, so it
7
+ generates instantly.
8
+ seed: 27
9
+ rows: 9000
10
+
11
+ features:
12
+ variant:
13
+ type: categorical
14
+ categories: [control, treatment]
15
+ weights: [0.5, 0.5]
16
+ device:
17
+ type: categorical
18
+ categories: [desktop, mobile, tablet]
19
+ weights: [0.5, 0.42, 0.08]
20
+ session_minutes:
21
+ type: numeric
22
+ dist: exponential
23
+ params: { scale: 6.0 }
24
+ min: 0
25
+ pageviews:
26
+ type: numeric
27
+ dist: poisson
28
+ params: { lam: 5 }
29
+ min: 0
30
+ dtype: int
31
+ revenue:
32
+ type: numeric
33
+ dist: lognormal
34
+ params: { mu: 2.6, sigma: 1.1 }
35
+ min: 0
36
+ converted:
37
+ type: boolean
38
+ rate: 0.12
39
+
40
+ export:
41
+ formats: [csv]
42
+ versions: [clean]
43
+
44
+ meta:
45
+ problem_statement: "Web A/B experiment data (assignment, engagement, conversion)."
46
+ tags: [marketing, experiment, tabular]
@@ -0,0 +1,124 @@
1
+ datadoom_version: "1"
2
+ name: "clinical-deterioration"
3
+ description: >
4
+ HACKATHON — ICU clinical deterioration with a hidden confounder. A latent
5
+ illness `severity` (emit: false) is driven by age, comorbidity burden and
6
+ admission type, and severity in turn drives BOTH the observed vitals
7
+ (heart rate ↑, lactate ↑, systolic BP ↓) AND the `deterioration` label. Because
8
+ the true cause is unobserved, the vitals are only *proxies* — a textbook
9
+ latent-confounder problem. Calibrated to the 'advanced' baseline-AUROC band.
10
+ Realistic clinical missingness corrupts a copy: lactate is MNAR (extreme assay
11
+ values dropped), systolic BP is MAR (missing for older patients), heart-rate
12
+ telemetry is MCAR, and 2% of outcomes are mislabelled. `sex` is included but
13
+ carries no causal signal — a fairness/feature-selection check.
14
+ seed: 202
15
+ rows: 7000
16
+
17
+ features:
18
+ patient_age:
19
+ type: numeric
20
+ dist: normal
21
+ params: { mean: 63, std: 17 }
22
+ min: 18
23
+ max: 100
24
+ dtype: int
25
+ sex:
26
+ type: categorical
27
+ categories: [female, male]
28
+ weights: [0.49, 0.51]
29
+ admission_type:
30
+ type: categorical
31
+ categories: [elective, urgent, emergency]
32
+ weights: [0.4, 0.35, 0.25]
33
+ num_comorbidities:
34
+ type: numeric
35
+ dist: poisson
36
+ params: { lam: 2.6 }
37
+ min: 0
38
+ dtype: int
39
+ severity:
40
+ type: numeric # LATENT confounder — the true generative driver
41
+ dtype: float
42
+ emit: false # not shipped; drives both vitals and outcome
43
+ heart_rate:
44
+ type: numeric # derived: observed proxy of severity (bpm)
45
+ dtype: float
46
+ min: 0
47
+ lactate:
48
+ type: numeric # derived: observed proxy of severity (mmol/L)
49
+ dtype: float
50
+ min: 0
51
+ systolic_bp:
52
+ type: numeric # derived: observed proxy of severity (mmHg)
53
+ dtype: float
54
+ min: 0
55
+ deterioration:
56
+ type: boolean # label (derived): logistic of severity
57
+
58
+ causal:
59
+ edges:
60
+ # roots → latent severity
61
+ - { from: patient_age, to: severity, fn: linear, weight: 0.025 }
62
+ - { from: num_comorbidities, to: severity, fn: linear, weight: 0.55 }
63
+ - { from: admission_type, to: severity, fn: map, mapping: { elective: -1.0, urgent: 0.5, emergency: 2.0 } }
64
+ # latent severity → observed vitals (common cause of vitals and outcome → confounding)
65
+ - { from: severity, to: heart_rate, fn: linear, weight: 9.0, bias: 78 }
66
+ - { from: severity, to: lactate, fn: linear, weight: 0.6, bias: 1.4 }
67
+ - { from: severity, to: systolic_bp, fn: linear, weight: -7.0, bias: 124 }
68
+ # latent severity → outcome
69
+ - { from: severity, to: deterioration, fn: logistic, weight: 1.2, bias: -5.5 }
70
+ noise:
71
+ severity: { dist: normal, params: { mean: 0, std: 0.5 } }
72
+ heart_rate: { dist: normal, params: { mean: 0, std: 8 } }
73
+ lactate: { dist: normal, params: { mean: 0, std: 0.4 } }
74
+ systolic_bp: { dist: normal, params: { mean: 0, std: 10 } }
75
+ deterioration: { dist: none }
76
+
77
+ difficulty:
78
+ target: advanced # baseline ROC-AUC calibrated into [0.72, 0.80]
79
+ label: deterioration
80
+ probe: logreg
81
+ max_iters: 10
82
+ knobs: [noise, label_noise]
83
+
84
+ failures:
85
+ - type: mnar # extreme lactate values not recorded
86
+ column: lactate
87
+ rate: 0.12
88
+ strength: 2.0
89
+ - type: mar # BP missing more often for older patients
90
+ column: systolic_bp
91
+ driver: patient_age
92
+ rate: 0.10
93
+ strength: 2.0
94
+ - type: mcar # random telemetry gaps in heart rate
95
+ columns: [heart_rate]
96
+ rate: 0.05
97
+ - type: label_noise # 2% chart-coding errors
98
+ column: deterioration
99
+ rate: 0.02
100
+
101
+ export:
102
+ formats: [csv]
103
+ versions: [clean, injected]
104
+ splits: { train: 0.8, test: 0.2 }
105
+
106
+ meta:
107
+ level: hackathon
108
+ challenge:
109
+ title: "Early warning for clinical deterioration"
110
+ task: classification
111
+ target: deterioration
112
+ metric: ROC-AUC
113
+ difficulty: advanced
114
+ baseline_auroc_band: [0.72, 0.80]
115
+ train_test_split: "80 / 20"
116
+ hidden_structure: >
117
+ The true cause (severity) is latent. It drives the observed vitals AND the
118
+ outcome, so heart_rate / lactate / systolic_bp are correlated proxies, not
119
+ independent causes.
120
+ gotchas:
121
+ - "lactate is MNAR (extreme values missing) and systolic_bp is MAR (by age) — imputation choice matters."
122
+ - "sex carries no causal signal; including it should not help — watch for spurious importance."
123
+ - "Vitals are confounded proxies of a hidden severity, not independent predictors."
124
+ tags: [healthcare, clinical, causal, latent, confounder, difficulty, missingness, classification]