datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "credit-default-challenge"
|
|
3
|
+
description: >
|
|
4
|
+
HACKATHON FLAGSHIP — consumer credit default. A multi-hop causal economy:
|
|
5
|
+
applicant demographics and employment drive annual income, income + debt load +
|
|
6
|
+
repayment history feed a LATENT risk_score (emit: false — the true generative
|
|
7
|
+
driver, never shipped), and the risk_score drives the `defaulted` label. The
|
|
8
|
+
clean dataset is then calibrated down to the 'advanced' baseline-AUROC band
|
|
9
|
+
[0.72, 0.80] so the task is honestly hard for a vanilla model. On top, a realistic
|
|
10
|
+
data-quality stack corrupts a copy: income is MNAR (high earners under-report),
|
|
11
|
+
debt-to-income is MAR (missing for older applicants) and DRIFTS upward over time
|
|
12
|
+
(a simulated downturn), a `collections_flag` is a LEAKED near-proxy of the label
|
|
13
|
+
(students must detect and drop it), and 2% of labels are flipped. Train/test
|
|
14
|
+
split included.
|
|
15
|
+
seed: 101
|
|
16
|
+
rows: 9000
|
|
17
|
+
|
|
18
|
+
features:
|
|
19
|
+
applicant_age:
|
|
20
|
+
type: numeric
|
|
21
|
+
dist: normal
|
|
22
|
+
params: { mean: 41, std: 13 }
|
|
23
|
+
min: 18
|
|
24
|
+
max: 90
|
|
25
|
+
dtype: int
|
|
26
|
+
employment_type:
|
|
27
|
+
type: categorical
|
|
28
|
+
categories: [unemployed, part_time, full_time, self_employed]
|
|
29
|
+
weights: [0.08, 0.17, 0.6, 0.15]
|
|
30
|
+
education:
|
|
31
|
+
type: categorical
|
|
32
|
+
categories: [hs, college, grad]
|
|
33
|
+
weights: [0.45, 0.42, 0.13]
|
|
34
|
+
region:
|
|
35
|
+
type: categorical
|
|
36
|
+
categories: [northeast, midwest, south, west]
|
|
37
|
+
weights: [0.22, 0.24, 0.32, 0.22]
|
|
38
|
+
requested_amount:
|
|
39
|
+
type: numeric
|
|
40
|
+
dist: lognormal
|
|
41
|
+
params: { mu: 9.1, sigma: 0.55 } # median ≈ $8,950
|
|
42
|
+
min: 500
|
|
43
|
+
credit_history_months:
|
|
44
|
+
type: numeric
|
|
45
|
+
dist: normal
|
|
46
|
+
params: { mean: 96, std: 48 }
|
|
47
|
+
min: 0
|
|
48
|
+
dtype: int
|
|
49
|
+
num_open_accounts:
|
|
50
|
+
type: numeric
|
|
51
|
+
dist: poisson
|
|
52
|
+
params: { lam: 5 }
|
|
53
|
+
min: 0
|
|
54
|
+
dtype: int
|
|
55
|
+
num_late_payments:
|
|
56
|
+
type: numeric
|
|
57
|
+
dist: poisson
|
|
58
|
+
params: { lam: 1.1 }
|
|
59
|
+
min: 0
|
|
60
|
+
dtype: int
|
|
61
|
+
debt_to_income:
|
|
62
|
+
type: numeric
|
|
63
|
+
dist: normal
|
|
64
|
+
params: { mean: 0.34, std: 0.14 }
|
|
65
|
+
min: 0
|
|
66
|
+
annual_income:
|
|
67
|
+
type: numeric # derived (hop 1): demographics + employment → income
|
|
68
|
+
dtype: float
|
|
69
|
+
min: 0
|
|
70
|
+
risk_score:
|
|
71
|
+
type: numeric # derived LATENT (hop 2): the true default driver
|
|
72
|
+
dtype: float
|
|
73
|
+
emit: false # hidden — shapes the label, shows in the true graph, not shipped
|
|
74
|
+
defaulted:
|
|
75
|
+
type: boolean # label (hop 3): logistic of the latent risk_score
|
|
76
|
+
|
|
77
|
+
causal:
|
|
78
|
+
edges:
|
|
79
|
+
# hop 1 — what someone earns
|
|
80
|
+
- { from: applicant_age, to: annual_income, fn: linear, weight: 620, bias: 15000 }
|
|
81
|
+
- { from: education, to: annual_income, fn: map, mapping: { hs: 0, college: 18000, grad: 41000 } }
|
|
82
|
+
- { from: employment_type, to: annual_income, fn: map, mapping: { unemployed: -16000, part_time: -4000, full_time: 12000, self_employed: 6000 } }
|
|
83
|
+
# hop 2 — latent creditworthiness
|
|
84
|
+
- { from: annual_income, to: risk_score, fn: linear, weight: -0.000028 }
|
|
85
|
+
- { from: debt_to_income, to: risk_score, fn: linear, weight: 4.2 }
|
|
86
|
+
- { from: num_late_payments, to: risk_score, fn: linear, weight: 0.55 }
|
|
87
|
+
- { from: credit_history_months, to: risk_score, fn: linear, weight: -0.006 }
|
|
88
|
+
- { from: num_open_accounts, to: risk_score, fn: linear, weight: 0.06 }
|
|
89
|
+
# hop 3 — the label
|
|
90
|
+
- { from: risk_score, to: defaulted, fn: logistic, weight: 1.6, bias: -1.2 }
|
|
91
|
+
noise:
|
|
92
|
+
annual_income: { dist: normal, params: { mean: 0, std: 6500 } }
|
|
93
|
+
risk_score: { dist: none }
|
|
94
|
+
defaulted: { dist: none }
|
|
95
|
+
|
|
96
|
+
difficulty:
|
|
97
|
+
target: advanced # baseline ROC-AUC calibrated into [0.72, 0.80]
|
|
98
|
+
label: defaulted
|
|
99
|
+
probe: logreg
|
|
100
|
+
max_iters: 10
|
|
101
|
+
knobs: [noise, label_noise]
|
|
102
|
+
|
|
103
|
+
failures:
|
|
104
|
+
- type: mnar # high earners under-report income
|
|
105
|
+
column: annual_income
|
|
106
|
+
rate: 0.10
|
|
107
|
+
strength: 2.2
|
|
108
|
+
- type: mar # DTI missing for older applicants
|
|
109
|
+
column: debt_to_income
|
|
110
|
+
driver: applicant_age
|
|
111
|
+
rate: 0.08
|
|
112
|
+
strength: 2.0
|
|
113
|
+
- type: drift # a simulated downturn pushes DTI up over time
|
|
114
|
+
column: debt_to_income
|
|
115
|
+
schedule: { kind: linear, magnitude: 0.15 }
|
|
116
|
+
- type: leakage # collections_flag ≈ the label — a trap to detect and drop
|
|
117
|
+
target: defaulted
|
|
118
|
+
into: collections_flag
|
|
119
|
+
noise: 0.05
|
|
120
|
+
- type: label_noise # 2% mislabelled defaults
|
|
121
|
+
column: defaulted
|
|
122
|
+
rate: 0.02
|
|
123
|
+
|
|
124
|
+
export:
|
|
125
|
+
formats: [csv]
|
|
126
|
+
versions: [clean, injected]
|
|
127
|
+
splits: { train: 0.8, test: 0.2 }
|
|
128
|
+
|
|
129
|
+
meta:
|
|
130
|
+
level: hackathon
|
|
131
|
+
challenge:
|
|
132
|
+
title: "Predict consumer credit default"
|
|
133
|
+
task: classification
|
|
134
|
+
target: defaulted
|
|
135
|
+
metric: ROC-AUC
|
|
136
|
+
difficulty: advanced
|
|
137
|
+
baseline_auroc_band: [0.72, 0.80]
|
|
138
|
+
train_test_split: "80 / 20"
|
|
139
|
+
hidden_structure: >
|
|
140
|
+
risk_score is latent (never shipped). The signal flows
|
|
141
|
+
demographics+employment → annual_income → risk_score → defaulted.
|
|
142
|
+
gotchas:
|
|
143
|
+
- "annual_income is MNAR — missingness depends on the (high) value itself."
|
|
144
|
+
- "debt_to_income is MAR (missing for older applicants) and drifts upward over the row order."
|
|
145
|
+
- "collections_flag is a leaked proxy of the label — drop it or you'll 'win' dishonestly."
|
|
146
|
+
- "2% of labels are flipped; perfect accuracy is not the goal."
|
|
147
|
+
tags: [finance, credit-risk, causal, latent, difficulty, failure-injection, leakage, classification]
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "customer-churn"
|
|
3
|
+
description: >
|
|
4
|
+
Customer-churn starter. Tenure, monthly charges and support load feed a latent
|
|
5
|
+
satisfaction score (emit: false — it shapes the label and shows up in the true
|
|
6
|
+
causal graph, but is never shipped) behind a churn label. The dataset is
|
|
7
|
+
calibrated down to the 'intermediate' baseline-AUROC band, so a baseline model
|
|
8
|
+
lands in [0.62, 0.72] — hard in a measured, honest way.
|
|
9
|
+
seed: 23
|
|
10
|
+
rows: 7000
|
|
11
|
+
|
|
12
|
+
features:
|
|
13
|
+
tenure_months:
|
|
14
|
+
type: numeric
|
|
15
|
+
dist: normal
|
|
16
|
+
params: { mean: 28, std: 16 }
|
|
17
|
+
min: 0
|
|
18
|
+
dtype: int
|
|
19
|
+
monthly_charges:
|
|
20
|
+
type: numeric
|
|
21
|
+
dist: normal
|
|
22
|
+
params: { mean: 70, std: 25 }
|
|
23
|
+
min: 0
|
|
24
|
+
support_tickets:
|
|
25
|
+
type: numeric
|
|
26
|
+
dist: poisson
|
|
27
|
+
params: { lam: 2.5 }
|
|
28
|
+
min: 0
|
|
29
|
+
dtype: int
|
|
30
|
+
satisfaction:
|
|
31
|
+
type: numeric # latent: weighted combination of the drivers
|
|
32
|
+
dtype: float
|
|
33
|
+
emit: false # hidden — drives the label but is not shipped
|
|
34
|
+
churned:
|
|
35
|
+
type: boolean # label (derived): logistic of (low) satisfaction
|
|
36
|
+
|
|
37
|
+
causal:
|
|
38
|
+
edges:
|
|
39
|
+
- { from: tenure_months, to: satisfaction, fn: linear, weight: 0.04 }
|
|
40
|
+
- { from: monthly_charges, to: satisfaction, fn: linear, weight: -0.03 }
|
|
41
|
+
- { from: support_tickets, to: satisfaction, fn: linear, weight: -0.4 }
|
|
42
|
+
- { from: satisfaction, to: churned, fn: logistic, weight: -2.5, bias: 0.2 }
|
|
43
|
+
noise:
|
|
44
|
+
satisfaction: { dist: none }
|
|
45
|
+
churned: { dist: none }
|
|
46
|
+
|
|
47
|
+
difficulty:
|
|
48
|
+
target: intermediate # beginner | intermediate | advanced | kaggle
|
|
49
|
+
label: churned
|
|
50
|
+
probe: logreg
|
|
51
|
+
max_iters: 10
|
|
52
|
+
knobs: [noise, label_noise]
|
|
53
|
+
|
|
54
|
+
export:
|
|
55
|
+
formats: [csv]
|
|
56
|
+
versions: [clean]
|
|
57
|
+
|
|
58
|
+
meta:
|
|
59
|
+
problem_statement: "Predict churned from tenure, charges, support load (satisfaction is latent)."
|
|
60
|
+
tags: [churn, difficulty, latent, causal, classification]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "ecommerce-orders"
|
|
3
|
+
description: >
|
|
4
|
+
E-commerce orders starter — a fast, distribution-only table (no causal graph):
|
|
5
|
+
order value (heavy-ish lognormal), basket quantity, product category, channel,
|
|
6
|
+
order date and a return flag. Great for quickly populating a realistic-looking
|
|
7
|
+
orders dataset.
|
|
8
|
+
seed: 31
|
|
9
|
+
rows: 10000
|
|
10
|
+
|
|
11
|
+
features:
|
|
12
|
+
order_value:
|
|
13
|
+
type: numeric
|
|
14
|
+
dist: lognormal
|
|
15
|
+
params: { mu: 3.8, sigma: 0.6 }
|
|
16
|
+
min: 0
|
|
17
|
+
quantity:
|
|
18
|
+
type: numeric
|
|
19
|
+
dist: poisson
|
|
20
|
+
params: { lam: 2.2 }
|
|
21
|
+
min: 1
|
|
22
|
+
dtype: int
|
|
23
|
+
category:
|
|
24
|
+
type: categorical
|
|
25
|
+
categories: [electronics, apparel, home, books, toys, grocery]
|
|
26
|
+
weights: [0.22, 0.26, 0.18, 0.1, 0.1, 0.14]
|
|
27
|
+
channel:
|
|
28
|
+
type: categorical
|
|
29
|
+
categories: [web, mobile, store]
|
|
30
|
+
weights: [0.45, 0.4, 0.15]
|
|
31
|
+
order_date:
|
|
32
|
+
type: datetime
|
|
33
|
+
start: "2023-01-01"
|
|
34
|
+
end: "2024-12-31"
|
|
35
|
+
granularity: day
|
|
36
|
+
returned:
|
|
37
|
+
type: boolean
|
|
38
|
+
rate: 0.08
|
|
39
|
+
|
|
40
|
+
export:
|
|
41
|
+
formats: [csv]
|
|
42
|
+
versions: [clean]
|
|
43
|
+
|
|
44
|
+
meta:
|
|
45
|
+
problem_statement: "A realistic orders table for quick prototyping (distribution-only)."
|
|
46
|
+
tags: [ecommerce, tabular, datetime]
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "transaction-fraud"
|
|
3
|
+
description: >
|
|
4
|
+
Transaction fraud starter. Customer age and card type drive monthly spend, which
|
|
5
|
+
drives a fraud-risk label; a stack of realistic failures (under-reported spend,
|
|
6
|
+
random gaps in age, mislabelled cases) then corrupts a copy — the clean baseline
|
|
7
|
+
is always kept, so you can compare clean vs. injected in Results.
|
|
8
|
+
seed: 7
|
|
9
|
+
rows: 8000
|
|
10
|
+
|
|
11
|
+
features:
|
|
12
|
+
customer_age:
|
|
13
|
+
type: numeric
|
|
14
|
+
dist: normal
|
|
15
|
+
params: { mean: 42, std: 13 }
|
|
16
|
+
min: 18
|
|
17
|
+
max: 95
|
|
18
|
+
dtype: int
|
|
19
|
+
card_type:
|
|
20
|
+
type: categorical
|
|
21
|
+
categories: [standard, gold, platinum]
|
|
22
|
+
weights: [0.6, 0.3, 0.1]
|
|
23
|
+
monthly_spend:
|
|
24
|
+
type: numeric # derived
|
|
25
|
+
dtype: float
|
|
26
|
+
min: 0
|
|
27
|
+
is_fraud:
|
|
28
|
+
type: boolean # derived (target)
|
|
29
|
+
|
|
30
|
+
causal:
|
|
31
|
+
edges:
|
|
32
|
+
- { from: customer_age, to: monthly_spend, fn: linear, weight: 35, bias: 1200 }
|
|
33
|
+
- { from: card_type, to: monthly_spend, fn: map, mapping: { standard: 0, gold: 900, platinum: 2600 } }
|
|
34
|
+
- { from: monthly_spend, to: is_fraud, fn: logistic, weight: -0.0008, bias: 1.2 }
|
|
35
|
+
noise:
|
|
36
|
+
monthly_spend: { dist: normal, params: { mean: 0, std: 300 } }
|
|
37
|
+
is_fraud: { dist: none }
|
|
38
|
+
|
|
39
|
+
failures:
|
|
40
|
+
- type: mnar # big spenders under-report
|
|
41
|
+
column: monthly_spend
|
|
42
|
+
rate: 0.10
|
|
43
|
+
strength: 2.0
|
|
44
|
+
- type: mcar # random gaps in age
|
|
45
|
+
columns: [customer_age]
|
|
46
|
+
rate: 0.05
|
|
47
|
+
- type: label_noise # mislabelled fraud cases
|
|
48
|
+
column: is_fraud
|
|
49
|
+
rate: 0.03
|
|
50
|
+
|
|
51
|
+
export:
|
|
52
|
+
formats: [csv]
|
|
53
|
+
versions: [clean, injected]
|
|
54
|
+
|
|
55
|
+
meta:
|
|
56
|
+
problem_statement: "Predict is_fraud from customer_age, card_type, monthly_spend under realistic failures."
|
|
57
|
+
tags: [fraud, causal, failure-injection, classification]
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "hospital-readmission"
|
|
3
|
+
description: >
|
|
4
|
+
Hospital 30-day readmission starter. Patient age, diagnosis count, length of stay
|
|
5
|
+
and prior admissions drive a latent severity score (emit: false) behind a
|
|
6
|
+
readmission label. A clean causal dataset with a hidden mediator documented in
|
|
7
|
+
the true graph — a good base to add failures or a difficulty target onto.
|
|
8
|
+
seed: 11
|
|
9
|
+
rows: 6000
|
|
10
|
+
|
|
11
|
+
features:
|
|
12
|
+
patient_age:
|
|
13
|
+
type: numeric
|
|
14
|
+
dist: normal
|
|
15
|
+
params: { mean: 58, std: 17 }
|
|
16
|
+
min: 18
|
|
17
|
+
max: 100
|
|
18
|
+
dtype: int
|
|
19
|
+
num_diagnoses:
|
|
20
|
+
type: numeric
|
|
21
|
+
dist: poisson
|
|
22
|
+
params: { lam: 4 }
|
|
23
|
+
min: 0
|
|
24
|
+
dtype: int
|
|
25
|
+
length_of_stay:
|
|
26
|
+
type: numeric
|
|
27
|
+
dist: normal
|
|
28
|
+
params: { mean: 5, std: 3 }
|
|
29
|
+
min: 1
|
|
30
|
+
dtype: int
|
|
31
|
+
prior_admissions:
|
|
32
|
+
type: numeric
|
|
33
|
+
dist: poisson
|
|
34
|
+
params: { lam: 1.2 }
|
|
35
|
+
min: 0
|
|
36
|
+
dtype: int
|
|
37
|
+
severity:
|
|
38
|
+
type: numeric # latent: weighted combination of the drivers
|
|
39
|
+
dtype: float
|
|
40
|
+
emit: false # hidden mediator between the drivers and the label — not shipped
|
|
41
|
+
readmitted:
|
|
42
|
+
type: boolean # label (derived): logistic of severity
|
|
43
|
+
|
|
44
|
+
causal:
|
|
45
|
+
edges:
|
|
46
|
+
- { from: patient_age, to: severity, fn: linear, weight: 0.02 }
|
|
47
|
+
- { from: num_diagnoses, to: severity, fn: linear, weight: 0.25 }
|
|
48
|
+
- { from: length_of_stay, to: severity, fn: linear, weight: 0.18 }
|
|
49
|
+
- { from: prior_admissions, to: severity, fn: linear, weight: 0.6 }
|
|
50
|
+
- { from: severity, to: readmitted, fn: logistic, weight: 1.8, bias: -2.0 }
|
|
51
|
+
noise:
|
|
52
|
+
severity: { dist: none }
|
|
53
|
+
readmitted: { dist: none }
|
|
54
|
+
|
|
55
|
+
export:
|
|
56
|
+
formats: [csv]
|
|
57
|
+
versions: [clean]
|
|
58
|
+
|
|
59
|
+
meta:
|
|
60
|
+
problem_statement: "Predict readmitted from age, diagnoses, length of stay, prior admissions (severity is latent)."
|
|
61
|
+
tags: [healthcare, causal, latent, classification]
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "insurance-claims"
|
|
3
|
+
description: >
|
|
4
|
+
Insurance claims starter showcasing a heavy-tailed claim amount (Pareto) — most
|
|
5
|
+
claims are small, a few are very large. Policyholder age, region, prior-claim
|
|
6
|
+
count and a fraud flag round it out. Distribution-only, generates instantly.
|
|
7
|
+
seed: 13
|
|
8
|
+
rows: 8000
|
|
9
|
+
|
|
10
|
+
features:
|
|
11
|
+
policyholder_age:
|
|
12
|
+
type: numeric
|
|
13
|
+
dist: normal
|
|
14
|
+
params: { mean: 47, std: 15 }
|
|
15
|
+
min: 18
|
|
16
|
+
max: 90
|
|
17
|
+
dtype: int
|
|
18
|
+
region:
|
|
19
|
+
type: categorical
|
|
20
|
+
categories: [north, south, east, west]
|
|
21
|
+
weights: [0.25, 0.3, 0.2, 0.25]
|
|
22
|
+
claim_amount:
|
|
23
|
+
type: numeric
|
|
24
|
+
dist: pareto
|
|
25
|
+
params: { alpha: 2.5, xm: 800 }
|
|
26
|
+
min: 0
|
|
27
|
+
num_prior_claims:
|
|
28
|
+
type: numeric
|
|
29
|
+
dist: poisson
|
|
30
|
+
params: { lam: 1.1 }
|
|
31
|
+
min: 0
|
|
32
|
+
dtype: int
|
|
33
|
+
is_fraudulent:
|
|
34
|
+
type: boolean
|
|
35
|
+
rate: 0.05
|
|
36
|
+
|
|
37
|
+
export:
|
|
38
|
+
formats: [csv]
|
|
39
|
+
versions: [clean]
|
|
40
|
+
|
|
41
|
+
meta:
|
|
42
|
+
problem_statement: "Insurance claims with a heavy-tailed (Pareto) claim amount."
|
|
43
|
+
tags: [insurance, heavy-tail, tabular]
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "iot-sensor-readings"
|
|
3
|
+
description: >
|
|
4
|
+
IoT sensor telemetry starter — a fast, purely numeric table of environmental
|
|
5
|
+
readings (temperature, humidity, pressure, battery) tagged by device and
|
|
6
|
+
hourly timestamp. Humidity and battery are bounded so they stay physically plausible.
|
|
7
|
+
seed: 19
|
|
8
|
+
rows: 12000
|
|
9
|
+
|
|
10
|
+
features:
|
|
11
|
+
device_id:
|
|
12
|
+
type: categorical
|
|
13
|
+
categories: [sensor-a, sensor-b, sensor-c, sensor-d]
|
|
14
|
+
temperature_c:
|
|
15
|
+
type: numeric
|
|
16
|
+
dist: normal
|
|
17
|
+
params: { mean: 21.5, std: 4.0 }
|
|
18
|
+
humidity_pct:
|
|
19
|
+
type: numeric
|
|
20
|
+
dist: normal
|
|
21
|
+
params: { mean: 45, std: 12 }
|
|
22
|
+
min: 0
|
|
23
|
+
max: 100
|
|
24
|
+
pressure_hpa:
|
|
25
|
+
type: numeric
|
|
26
|
+
dist: normal
|
|
27
|
+
params: { mean: 1013, std: 8 }
|
|
28
|
+
battery_pct:
|
|
29
|
+
type: numeric
|
|
30
|
+
dist: uniform
|
|
31
|
+
params: { low: 20, high: 100 }
|
|
32
|
+
reading_time:
|
|
33
|
+
type: datetime
|
|
34
|
+
start: "2024-01-01"
|
|
35
|
+
end: "2024-03-31"
|
|
36
|
+
granularity: hour
|
|
37
|
+
|
|
38
|
+
export:
|
|
39
|
+
formats: [csv]
|
|
40
|
+
versions: [clean]
|
|
41
|
+
|
|
42
|
+
meta:
|
|
43
|
+
problem_statement: "Hourly multi-sensor telemetry for quick prototyping (distribution-only)."
|
|
44
|
+
tags: [iot, numeric, datetime, timeseries-ish]
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "people-directory"
|
|
3
|
+
description: >
|
|
4
|
+
People directory starter — realistic-but-deterministic identities via the
|
|
5
|
+
bundled mimesis providers (name, email, phone, company, job, city, country).
|
|
6
|
+
Same (spec_hash, seed) → byte-identical strings. Ideal for demos and UIs that
|
|
7
|
+
need believable names and contact details.
|
|
8
|
+
seed: 42
|
|
9
|
+
rows: 3000
|
|
10
|
+
|
|
11
|
+
features:
|
|
12
|
+
full_name:
|
|
13
|
+
type: text
|
|
14
|
+
generator: name
|
|
15
|
+
email:
|
|
16
|
+
type: text
|
|
17
|
+
generator: email
|
|
18
|
+
phone:
|
|
19
|
+
type: text
|
|
20
|
+
generator: phone
|
|
21
|
+
company:
|
|
22
|
+
type: text
|
|
23
|
+
generator: company
|
|
24
|
+
job_title:
|
|
25
|
+
type: text
|
|
26
|
+
generator: occupation
|
|
27
|
+
city:
|
|
28
|
+
type: text
|
|
29
|
+
generator: city
|
|
30
|
+
country:
|
|
31
|
+
type: text
|
|
32
|
+
generator: country
|
|
33
|
+
age:
|
|
34
|
+
type: numeric
|
|
35
|
+
dist: normal
|
|
36
|
+
params: { mean: 38, std: 12 }
|
|
37
|
+
min: 18
|
|
38
|
+
max: 85
|
|
39
|
+
dtype: int
|
|
40
|
+
plan:
|
|
41
|
+
type: categorical
|
|
42
|
+
categories: [free, pro, enterprise]
|
|
43
|
+
weights: [0.7, 0.25, 0.05]
|
|
44
|
+
signup_date:
|
|
45
|
+
type: datetime
|
|
46
|
+
start: "2021-01-01"
|
|
47
|
+
end: "2024-12-31"
|
|
48
|
+
granularity: day
|
|
49
|
+
|
|
50
|
+
export:
|
|
51
|
+
formats: [csv]
|
|
52
|
+
versions: [clean]
|
|
53
|
+
|
|
54
|
+
meta:
|
|
55
|
+
problem_statement: "Believable people records (names, emails, companies) for demos and UIs."
|
|
56
|
+
tags: [people, realistic-text, pii-like]
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "predictive-maintenance"
|
|
3
|
+
description: >
|
|
4
|
+
HACKATHON — turbine predictive maintenance on multi-sensor TIME-SERIES. Three
|
|
5
|
+
additive sensor streams (vibration, bearing temperature, oil pressure) run over
|
|
6
|
+
the row index (row order IS time): each carries trend + operational seasonality (vibration and bearing temperature only) +
|
|
7
|
+
AR(1) autocorrelation + noise. Together with the operating `load_factor` and the
|
|
8
|
+
`component_grade` they drive a LATENT `wear_index` (emit: false) behind a
|
|
9
|
+
`needs_maintenance` label. A realistic operations stack then corrupts a copy:
|
|
10
|
+
the load regime DRIFTS upward over time, the load sensor gains noise and drops
|
|
11
|
+
readings (MCAR), and a `maintenance_alarm` is planted as a LEAKED proxy of the
|
|
12
|
+
label. This is a sequential problem — preserve row order; do not shuffle the
|
|
13
|
+
series across the train/test boundary.
|
|
14
|
+
seed: 303
|
|
15
|
+
rows: 4500
|
|
16
|
+
|
|
17
|
+
features:
|
|
18
|
+
vibration:
|
|
19
|
+
type: timeseries # mm/s RMS — slow upward trend as the unit wears
|
|
20
|
+
trend: { slope: 0.0009, intercept: 2.0 }
|
|
21
|
+
seasonality:
|
|
22
|
+
- { amplitude: 0.45, period: 24, phase: 0.0 } # daily duty cycle
|
|
23
|
+
ar: [0.5]
|
|
24
|
+
noise_std: 0.2
|
|
25
|
+
dtype: float
|
|
26
|
+
bearing_temp:
|
|
27
|
+
type: timeseries # °C — daily + weekly cycle on a warming trend
|
|
28
|
+
trend: { slope: 0.0018, intercept: 64.0 }
|
|
29
|
+
seasonality:
|
|
30
|
+
- { amplitude: 4.0, period: 24, phase: 0.0 }
|
|
31
|
+
- { amplitude: 1.5, period: 168, phase: 1.0 }
|
|
32
|
+
ar: [0.6]
|
|
33
|
+
noise_std: 1.0
|
|
34
|
+
dtype: float
|
|
35
|
+
oil_pressure:
|
|
36
|
+
type: timeseries # bar — gently declining as seals degrade
|
|
37
|
+
trend: { slope: -0.0011, intercept: 45.0 }
|
|
38
|
+
ar: [0.4]
|
|
39
|
+
noise_std: 0.7
|
|
40
|
+
dtype: float
|
|
41
|
+
load_factor:
|
|
42
|
+
type: numeric # 0..1 operating load (root; the failures target this)
|
|
43
|
+
dist: normal
|
|
44
|
+
params: { mean: 0.68, std: 0.14 }
|
|
45
|
+
min: 0
|
|
46
|
+
max: 1
|
|
47
|
+
component_grade:
|
|
48
|
+
type: categorical
|
|
49
|
+
categories: [economy, standard, premium]
|
|
50
|
+
weights: [0.4, 0.4, 0.2]
|
|
51
|
+
wear_index:
|
|
52
|
+
type: numeric # LATENT degradation index — the true driver
|
|
53
|
+
dtype: float
|
|
54
|
+
emit: false
|
|
55
|
+
needs_maintenance:
|
|
56
|
+
type: boolean # label (derived): logistic of wear_index
|
|
57
|
+
|
|
58
|
+
causal:
|
|
59
|
+
edges:
|
|
60
|
+
- { from: vibration, to: wear_index, fn: linear, weight: 0.9 }
|
|
61
|
+
- { from: bearing_temp, to: wear_index, fn: linear, weight: 0.03 }
|
|
62
|
+
- { from: oil_pressure, to: wear_index, fn: linear, weight: -0.04 }
|
|
63
|
+
- { from: load_factor, to: wear_index, fn: linear, weight: 1.3 }
|
|
64
|
+
- { from: component_grade, to: wear_index, fn: map, mapping: { economy: 0.8, standard: 0.0, premium: -0.6 } }
|
|
65
|
+
- { from: wear_index, to: needs_maintenance, fn: logistic, weight: 1.1, bias: -6.8 }
|
|
66
|
+
noise:
|
|
67
|
+
wear_index: { dist: none }
|
|
68
|
+
needs_maintenance: { dist: none }
|
|
69
|
+
|
|
70
|
+
failures:
|
|
71
|
+
- type: drift # load regime creeps up over the campaign
|
|
72
|
+
column: load_factor
|
|
73
|
+
schedule: { kind: linear, magnitude: 0.18 }
|
|
74
|
+
- type: feature_noise # sensor noise on the load reading
|
|
75
|
+
column: load_factor
|
|
76
|
+
dist: normal
|
|
77
|
+
params: { mean: 0, std: 0.04 }
|
|
78
|
+
- type: mcar # dropped telemetry on load
|
|
79
|
+
columns: [load_factor]
|
|
80
|
+
rate: 0.05
|
|
81
|
+
- type: leakage # an alarm that is essentially the label — drop it
|
|
82
|
+
target: needs_maintenance
|
|
83
|
+
into: maintenance_alarm
|
|
84
|
+
noise: 0.06
|
|
85
|
+
|
|
86
|
+
export:
|
|
87
|
+
formats: [csv]
|
|
88
|
+
versions: [clean, injected]
|
|
89
|
+
|
|
90
|
+
meta:
|
|
91
|
+
level: hackathon
|
|
92
|
+
challenge:
|
|
93
|
+
title: "Predict turbine maintenance needs from sensor streams"
|
|
94
|
+
task: classification
|
|
95
|
+
target: needs_maintenance
|
|
96
|
+
metric: ROC-AUC / PR-AUC (class-imbalanced)
|
|
97
|
+
time_axis: row_order
|
|
98
|
+
hidden_structure: >
|
|
99
|
+
wear_index is latent and accumulates from the sensor streams + load + grade.
|
|
100
|
+
The series are autocorrelated — vibration/temp trend up, oil pressure trends
|
|
101
|
+
down as the unit degrades.
|
|
102
|
+
gotchas:
|
|
103
|
+
- "Row order is the time axis — do NOT shuffle across the train/test split."
|
|
104
|
+
- "load_factor drifts and is noisy/MCAR in the injected copy — a covariate-shift between early and late rows."
|
|
105
|
+
- "maintenance_alarm is a leaked proxy of the label — drop it."
|
|
106
|
+
- "Positive class is the minority — prefer PR-AUC over plain accuracy."
|
|
107
|
+
tags: [iot, predictive-maintenance, time-series, causal, latent, drift, leakage, classification]
|