datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122)
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,147 @@
1
+ datadoom_version: "1"
2
+ name: "credit-default-challenge"
3
+ description: >
4
+ HACKATHON FLAGSHIP — consumer credit default. A multi-hop causal economy:
5
+ applicant demographics and employment drive annual income, income + debt load +
6
+ repayment history feed a LATENT risk_score (emit: false — the true generative
7
+ driver, never shipped), and the risk_score drives the `defaulted` label. The
8
+ clean dataset is then calibrated down to the 'advanced' baseline-AUROC band
9
+ [0.72, 0.80] so a vanilla model is honestly hard. On top, a realistic
10
+ data-quality stack corrupts a copy: income is MNAR (high earners under-report),
11
+ debt-to-income is MAR (missing for older applicants) and DRIFTS upward over time
12
+ (a simulated downturn), a `collections_flag` is a LEAKED near-proxy of the label
13
+ (students must detect and drop it), and 2% of labels are flipped. Train/test
14
+ split included.
15
+ seed: 101
16
+ rows: 9000
17
+
18
+ features:
19
+ applicant_age:
20
+ type: numeric
21
+ dist: normal
22
+ params: { mean: 41, std: 13 }
23
+ min: 18
24
+ max: 90
25
+ dtype: int
26
+ employment_type:
27
+ type: categorical
28
+ categories: [unemployed, part_time, full_time, self_employed]
29
+ weights: [0.08, 0.17, 0.6, 0.15]
30
+ education:
31
+ type: categorical
32
+ categories: [hs, college, grad]
33
+ weights: [0.45, 0.42, 0.13]
34
+ region:
35
+ type: categorical
36
+ categories: [northeast, midwest, south, west]
37
+ weights: [0.22, 0.24, 0.32, 0.22]
38
+ requested_amount:
39
+ type: numeric
40
+ dist: lognormal
41
+ params: { mu: 9.1, sigma: 0.55 } # median ≈ $8,950
42
+ min: 500
43
+ credit_history_months:
44
+ type: numeric
45
+ dist: normal
46
+ params: { mean: 96, std: 48 }
47
+ min: 0
48
+ dtype: int
49
+ num_open_accounts:
50
+ type: numeric
51
+ dist: poisson
52
+ params: { lam: 5 }
53
+ min: 0
54
+ dtype: int
55
+ num_late_payments:
56
+ type: numeric
57
+ dist: poisson
58
+ params: { lam: 1.1 }
59
+ min: 0
60
+ dtype: int
61
+ debt_to_income:
62
+ type: numeric
63
+ dist: normal
64
+ params: { mean: 0.34, std: 0.14 }
65
+ min: 0
66
+ annual_income:
67
+ type: numeric # derived (hop 1): demographics + employment → income
68
+ dtype: float
69
+ min: 0
70
+ risk_score:
71
+ type: numeric # derived LATENT (hop 2): the true default driver
72
+ dtype: float
73
+ emit: false # hidden — shapes the label, shows in the true graph, not shipped
74
+ defaulted:
75
+ type: boolean # label (hop 3): logistic of the latent risk_score
76
+
77
+ causal:
78
+ edges:
79
+ # hop 1 — what someone earns
80
+ - { from: applicant_age, to: annual_income, fn: linear, weight: 620, bias: 15000 }
81
+ - { from: education, to: annual_income, fn: map, mapping: { hs: 0, college: 18000, grad: 41000 } }
82
+ - { from: employment_type, to: annual_income, fn: map, mapping: { unemployed: -16000, part_time: -4000, full_time: 12000, self_employed: 6000 } }
83
+ # hop 2 — latent creditworthiness
84
+ - { from: annual_income, to: risk_score, fn: linear, weight: -0.000028 }
85
+ - { from: debt_to_income, to: risk_score, fn: linear, weight: 4.2 }
86
+ - { from: num_late_payments, to: risk_score, fn: linear, weight: 0.55 }
87
+ - { from: credit_history_months, to: risk_score, fn: linear, weight: -0.006 }
88
+ - { from: num_open_accounts, to: risk_score, fn: linear, weight: 0.06 }
89
+ # hop 3 — the label
90
+ - { from: risk_score, to: defaulted, fn: logistic, weight: 1.6, bias: -1.2 }
91
+ noise:
92
+ annual_income: { dist: normal, params: { mean: 0, std: 6500 } }
93
+ risk_score: { dist: none }
94
+ defaulted: { dist: none }
95
+
96
+ difficulty:
97
+ target: advanced # baseline ROC-AUC calibrated into [0.72, 0.80]
98
+ label: defaulted
99
+ probe: logreg
100
+ max_iters: 10
101
+ knobs: [noise, label_noise]
102
+
103
+ failures:
104
+ - type: mnar # high earners under-report income
105
+ column: annual_income
106
+ rate: 0.10
107
+ strength: 2.2
108
+ - type: mar # DTI missing for older applicants
109
+ column: debt_to_income
110
+ driver: applicant_age
111
+ rate: 0.08
112
+ strength: 2.0
113
+ - type: drift # a simulated downturn pushes DTI up over time
114
+ column: debt_to_income
115
+ schedule: { kind: linear, magnitude: 0.15 }
116
+ - type: leakage # collections_flag ≈ the label — a trap to detect and drop
117
+ target: defaulted
118
+ into: collections_flag
119
+ noise: 0.05
120
+ - type: label_noise # 2% mislabelled defaults
121
+ column: defaulted
122
+ rate: 0.02
123
+
124
+ export:
125
+ formats: [csv]
126
+ versions: [clean, injected]
127
+ splits: { train: 0.8, test: 0.2 }
128
+
129
+ meta:
130
+ level: hackathon
131
+ challenge:
132
+ title: "Predict consumer credit default"
133
+ task: classification
134
+ target: defaulted
135
+ metric: ROC-AUC
136
+ difficulty: advanced
137
+ baseline_auroc_band: [0.72, 0.80]
138
+ train_test_split: "80 / 20"
139
+ hidden_structure: >
140
+ risk_score is latent (never shipped). The signal flows
141
+ demographics+employment → annual_income → risk_score → defaulted.
142
+ gotchas:
143
+ - "annual_income is MNAR — missingness depends on the (high) value itself."
144
+ - "debt_to_income is MAR (missing for older applicants) and drifts upward over the row order."
145
+ - "collections_flag is a leaked proxy of the label — drop it or you'll 'win' dishonestly."
146
+ - "2% of labels are flipped; perfect accuracy is not the goal."
147
+ tags: [finance, credit-risk, causal, latent, difficulty, failure-injection, leakage, classification]
@@ -0,0 +1,60 @@
1
+ datadoom_version: "1"
2
+ name: "customer-churn"
3
+ description: >
4
+ Customer-churn starter. Tenure, monthly charges and support load feed a latent
5
+ satisfaction score (emit: false — it shapes the label and shows up in the true
6
+ causal graph, but is never shipped) behind a churn label. The dataset is
7
+ calibrated down to the 'intermediate' baseline-AUROC band, so a baseline model
8
+ lands in [0.62, 0.72] — hard in a measured, honest way.
9
+ seed: 23
10
+ rows: 7000
11
+
12
+ features:
13
+ tenure_months:
14
+ type: numeric
15
+ dist: normal
16
+ params: { mean: 28, std: 16 }
17
+ min: 0
18
+ dtype: int
19
+ monthly_charges:
20
+ type: numeric
21
+ dist: normal
22
+ params: { mean: 70, std: 25 }
23
+ min: 0
24
+ support_tickets:
25
+ type: numeric
26
+ dist: poisson
27
+ params: { lam: 2.5 }
28
+ min: 0
29
+ dtype: int
30
+ satisfaction:
31
+ type: numeric # latent: weighted combination of the drivers
32
+ dtype: float
33
+ emit: false # hidden — drives the label but is not shipped
34
+ churned:
35
+ type: boolean # label (derived): logistic of (low) satisfaction
36
+
37
+ causal:
38
+ edges:
39
+ - { from: tenure_months, to: satisfaction, fn: linear, weight: 0.04 }
40
+ - { from: monthly_charges, to: satisfaction, fn: linear, weight: -0.03 }
41
+ - { from: support_tickets, to: satisfaction, fn: linear, weight: -0.4 }
42
+ - { from: satisfaction, to: churned, fn: logistic, weight: -2.5, bias: 0.2 }
43
+ noise:
44
+ satisfaction: { dist: none }
45
+ churned: { dist: none }
46
+
47
+ difficulty:
48
+ target: intermediate # beginner | intermediate | advanced | kaggle
49
+ label: churned
50
+ probe: logreg
51
+ max_iters: 10
52
+ knobs: [noise, label_noise]
53
+
54
+ export:
55
+ formats: [csv]
56
+ versions: [clean]
57
+
58
+ meta:
59
+ problem_statement: "Predict churned from tenure, charges, support load (satisfaction is latent)."
60
+ tags: [churn, difficulty, latent, causal, classification]
@@ -0,0 +1,46 @@
1
+ datadoom_version: "1"
2
+ name: "ecommerce-orders"
3
+ description: >
4
+ E-commerce orders starter — a fast, distribution-only table (no causal graph):
5
+ order value (heavy-ish lognormal), basket quantity, product category, channel,
6
+ order date and a return flag. Great for quickly populating a realistic-looking
7
+ orders dataset.
8
+ seed: 31
9
+ rows: 10000
10
+
11
+ features:
12
+ order_value:
13
+ type: numeric
14
+ dist: lognormal
15
+ params: { mu: 3.8, sigma: 0.6 }
16
+ min: 0
17
+ quantity:
18
+ type: numeric
19
+ dist: poisson
20
+ params: { lam: 2.2 }
21
+ min: 1
22
+ dtype: int
23
+ category:
24
+ type: categorical
25
+ categories: [electronics, apparel, home, books, toys, grocery]
26
+ weights: [0.22, 0.26, 0.18, 0.1, 0.1, 0.14]
27
+ channel:
28
+ type: categorical
29
+ categories: [web, mobile, store]
30
+ weights: [0.45, 0.4, 0.15]
31
+ order_date:
32
+ type: datetime
33
+ start: "2023-01-01"
34
+ end: "2024-12-31"
35
+ granularity: day
36
+ returned:
37
+ type: boolean
38
+ rate: 0.08
39
+
40
+ export:
41
+ formats: [csv]
42
+ versions: [clean]
43
+
44
+ meta:
45
+ problem_statement: "A realistic orders table for quick prototyping (distribution-only)."
46
+ tags: [ecommerce, tabular, datetime]
@@ -0,0 +1,57 @@
1
+ datadoom_version: "1"
2
+ name: "transaction-fraud"
3
+ description: >
4
+ Transaction fraud starter. Customer age and card type drive monthly spend, which
5
+ drives a fraud-risk label; a stack of realistic failures (under-reported spend,
6
+ random gaps in age, mislabelled cases) then corrupts a copy — the clean baseline
7
+ is always kept, so you can compare clean vs. injected in Results.
8
+ seed: 7
9
+ rows: 8000
10
+
11
+ features:
12
+ customer_age:
13
+ type: numeric
14
+ dist: normal
15
+ params: { mean: 42, std: 13 }
16
+ min: 18
17
+ max: 95
18
+ dtype: int
19
+ card_type:
20
+ type: categorical
21
+ categories: [standard, gold, platinum]
22
+ weights: [0.6, 0.3, 0.1]
23
+ monthly_spend:
24
+ type: numeric # derived
25
+ dtype: float
26
+ min: 0
27
+ is_fraud:
28
+ type: boolean # derived (target)
29
+
30
+ causal:
31
+ edges:
32
+ - { from: customer_age, to: monthly_spend, fn: linear, weight: 35, bias: 1200 }
33
+ - { from: card_type, to: monthly_spend, fn: map, mapping: { standard: 0, gold: 900, platinum: 2600 } }
34
+ - { from: monthly_spend, to: is_fraud, fn: logistic, weight: -0.0008, bias: 1.2 }
35
+ noise:
36
+ monthly_spend: { dist: normal, params: { mean: 0, std: 300 } }
37
+ is_fraud: { dist: none }
38
+
39
+ failures:
40
+ - type: mnar # big spenders under-report
41
+ column: monthly_spend
42
+ rate: 0.10
43
+ strength: 2.0
44
+ - type: mcar # random gaps in age
45
+ columns: [customer_age]
46
+ rate: 0.05
47
+ - type: label_noise # mislabelled fraud cases
48
+ column: is_fraud
49
+ rate: 0.03
50
+
51
+ export:
52
+ formats: [csv]
53
+ versions: [clean, injected]
54
+
55
+ meta:
56
+ problem_statement: "Predict is_fraud from customer_age, card_type, monthly_spend under realistic failures."
57
+ tags: [fraud, causal, failure-injection, classification]
@@ -0,0 +1,61 @@
1
+ datadoom_version: "1"
2
+ name: "hospital-readmission"
3
+ description: >
4
+ Hospital 30-day readmission starter. Patient age, diagnosis count, length of stay
5
+ and prior admissions drive a latent severity score (emit: false) behind a
6
+ readmission label. A clean causal dataset with a hidden latent mediator documented in
7
+ the true graph — a good base to add failures or a difficulty target onto.
8
+ seed: 11
9
+ rows: 6000
10
+
11
+ features:
12
+ patient_age:
13
+ type: numeric
14
+ dist: normal
15
+ params: { mean: 58, std: 17 }
16
+ min: 18
17
+ max: 100
18
+ dtype: int
19
+ num_diagnoses:
20
+ type: numeric
21
+ dist: poisson
22
+ params: { lam: 4 }
23
+ min: 0
24
+ dtype: int
25
+ length_of_stay:
26
+ type: numeric
27
+ dist: normal
28
+ params: { mean: 5, std: 3 }
29
+ min: 1
30
+ dtype: int
31
+ prior_admissions:
32
+ type: numeric
33
+ dist: poisson
34
+ params: { lam: 1.2 }
35
+ min: 0
36
+ dtype: int
37
+ severity:
38
+ type: numeric # latent: weighted combination of the drivers
39
+ dtype: float
40
+ emit: false # hidden latent mediator (drivers → severity → label) — not shipped
41
+ readmitted:
42
+ type: boolean # label (derived): logistic of severity
43
+
44
+ causal:
45
+ edges:
46
+ - { from: patient_age, to: severity, fn: linear, weight: 0.02 }
47
+ - { from: num_diagnoses, to: severity, fn: linear, weight: 0.25 }
48
+ - { from: length_of_stay, to: severity, fn: linear, weight: 0.18 }
49
+ - { from: prior_admissions, to: severity, fn: linear, weight: 0.6 }
50
+ - { from: severity, to: readmitted, fn: logistic, weight: 1.8, bias: -2.0 }
51
+ noise:
52
+ severity: { dist: none }
53
+ readmitted: { dist: none }
54
+
55
+ export:
56
+ formats: [csv]
57
+ versions: [clean]
58
+
59
+ meta:
60
+ problem_statement: "Predict readmitted from age, diagnoses, length of stay, prior admissions (severity is latent)."
61
+ tags: [healthcare, causal, latent, classification]
@@ -0,0 +1,43 @@
1
+ datadoom_version: "1"
2
+ name: "insurance-claims"
3
+ description: >
4
+ Insurance claims starter showcasing a heavy-tailed claim amount (Pareto) — most
5
+ claims are small, a few are very large. Policyholder age, region, prior-claim
6
+ count and a fraud flag round it out. Distribution-only, generates instantly.
7
+ seed: 13
8
+ rows: 8000
9
+
10
+ features:
11
+ policyholder_age:
12
+ type: numeric
13
+ dist: normal
14
+ params: { mean: 47, std: 15 }
15
+ min: 18
16
+ max: 90
17
+ dtype: int
18
+ region:
19
+ type: categorical
20
+ categories: [north, south, east, west]
21
+ weights: [0.25, 0.3, 0.2, 0.25]
22
+ claim_amount:
23
+ type: numeric
24
+ dist: pareto
25
+ params: { alpha: 2.5, xm: 800 }
26
+ min: 0
27
+ num_prior_claims:
28
+ type: numeric
29
+ dist: poisson
30
+ params: { lam: 1.1 }
31
+ min: 0
32
+ dtype: int
33
+ is_fraudulent:
34
+ type: boolean
35
+ rate: 0.05
36
+
37
+ export:
38
+ formats: [csv]
39
+ versions: [clean]
40
+
41
+ meta:
42
+ problem_statement: "Insurance claims with a heavy-tailed (Pareto) claim amount."
43
+ tags: [insurance, heavy-tail, tabular]
@@ -0,0 +1,44 @@
1
+ datadoom_version: "1"
2
+ name: "iot-sensor-readings"
3
+ description: >
4
+ IoT sensor telemetry starter — a fast table of numeric environmental
5
+ readings (temperature, humidity, pressure, battery) tagged by device and
6
+ hourly timestamp. Humidity and battery are range-bounded to stay physically plausible.
7
+ seed: 19
8
+ rows: 12000
9
+
10
+ features:
11
+ device_id:
12
+ type: categorical
13
+ categories: [sensor-a, sensor-b, sensor-c, sensor-d]
14
+ temperature_c:
15
+ type: numeric
16
+ dist: normal
17
+ params: { mean: 21.5, std: 4.0 }
18
+ humidity_pct:
19
+ type: numeric
20
+ dist: normal
21
+ params: { mean: 45, std: 12 }
22
+ min: 0
23
+ max: 100
24
+ pressure_hpa:
25
+ type: numeric
26
+ dist: normal
27
+ params: { mean: 1013, std: 8 }
28
+ battery_pct:
29
+ type: numeric
30
+ dist: uniform
31
+ params: { low: 20, high: 100 }
32
+ reading_time:
33
+ type: datetime
34
+ start: "2024-01-01"
35
+ end: "2024-03-31"
36
+ granularity: hour
37
+
38
+ export:
39
+ formats: [csv]
40
+ versions: [clean]
41
+
42
+ meta:
43
+ problem_statement: "Hourly multi-sensor telemetry for quick prototyping (distribution-only)."
44
+ tags: [iot, numeric, datetime, timeseries-ish]
@@ -0,0 +1,56 @@
1
+ datadoom_version: "1"
2
+ name: "people-directory"
3
+ description: >
4
+ People directory starter — realistic-but-deterministic identities via the
5
+ bundled mimesis providers (name, email, phone, company, job, city, country).
6
+ Same (spec_hash, seed) → byte-identical strings. Ideal for demos and UIs that
7
+ need believable names and contact details.
8
+ seed: 42
9
+ rows: 3000
10
+
11
+ features:
12
+ full_name:
13
+ type: text
14
+ generator: name
15
+ email:
16
+ type: text
17
+ generator: email
18
+ phone:
19
+ type: text
20
+ generator: phone
21
+ company:
22
+ type: text
23
+ generator: company
24
+ job_title:
25
+ type: text
26
+ generator: occupation
27
+ city:
28
+ type: text
29
+ generator: city
30
+ country:
31
+ type: text
32
+ generator: country
33
+ age:
34
+ type: numeric
35
+ dist: normal
36
+ params: { mean: 38, std: 12 }
37
+ min: 18
38
+ max: 85
39
+ dtype: int
40
+ plan:
41
+ type: categorical
42
+ categories: [free, pro, enterprise]
43
+ weights: [0.7, 0.25, 0.05]
44
+ signup_date:
45
+ type: datetime
46
+ start: "2021-01-01"
47
+ end: "2024-12-31"
48
+ granularity: day
49
+
50
+ export:
51
+ formats: [csv]
52
+ versions: [clean]
53
+
54
+ meta:
55
+ problem_statement: "Believable people records (names, emails, companies) for demos and UIs."
56
+ tags: [people, realistic-text, pii-like]
@@ -0,0 +1,107 @@
1
+ datadoom_version: "1"
2
+ name: "predictive-maintenance"
3
+ description: >
4
+ HACKATHON — turbine predictive maintenance on multi-sensor TIME-SERIES. Three
5
+ additive sensor streams (vibration, bearing temperature, oil pressure) run over
6
+ the row index (row order IS time): each carries trend + operational seasonality +
7
+ AR(1) autocorrelation + noise. Together with the operating `load_factor` and the
8
+ `component_grade` they drive a LATENT `wear_index` (emit: false) behind a
9
+ `needs_maintenance` label. A realistic operations stack then corrupts a copy:
10
+ the load regime DRIFTS upward over time, the load sensor gains noise and drops
11
+ readings (MCAR), and a `maintenance_alarm` is planted as a LEAKED proxy of the
12
+ label. This is a sequential problem — preserve row order; do not shuffle the
13
+ series across the train/test boundary.
14
+ seed: 303
15
+ rows: 4500
16
+
17
+ features:
18
+ vibration:
19
+ type: timeseries # mm/s RMS — slow upward trend as the unit wears
20
+ trend: { slope: 0.0009, intercept: 2.0 }
21
+ seasonality:
22
+ - { amplitude: 0.45, period: 24, phase: 0.0 } # daily duty cycle
23
+ ar: [0.5]
24
+ noise_std: 0.2
25
+ dtype: float
26
+ bearing_temp:
27
+ type: timeseries # °C — daily + weekly cycle on a warming trend
28
+ trend: { slope: 0.0018, intercept: 64.0 }
29
+ seasonality:
30
+ - { amplitude: 4.0, period: 24, phase: 0.0 }
31
+ - { amplitude: 1.5, period: 168, phase: 1.0 }
32
+ ar: [0.6]
33
+ noise_std: 1.0
34
+ dtype: float
35
+ oil_pressure:
36
+ type: timeseries # bar — gently declining as seals degrade
37
+ trend: { slope: -0.0011, intercept: 45.0 }
38
+ ar: [0.4]
39
+ noise_std: 0.7
40
+ dtype: float
41
+ load_factor:
42
+ type: numeric # 0..1 operating load (root; the failures target this)
43
+ dist: normal
44
+ params: { mean: 0.68, std: 0.14 }
45
+ min: 0
46
+ max: 1
47
+ component_grade:
48
+ type: categorical
49
+ categories: [economy, standard, premium]
50
+ weights: [0.4, 0.4, 0.2]
51
+ wear_index:
52
+ type: numeric # LATENT degradation index — the true driver
53
+ dtype: float
54
+ emit: false
55
+ needs_maintenance:
56
+ type: boolean # label (derived): logistic of wear_index
57
+
58
+ causal:
59
+ edges:
60
+ - { from: vibration, to: wear_index, fn: linear, weight: 0.9 }
61
+ - { from: bearing_temp, to: wear_index, fn: linear, weight: 0.03 }
62
+ - { from: oil_pressure, to: wear_index, fn: linear, weight: -0.04 }
63
+ - { from: load_factor, to: wear_index, fn: linear, weight: 1.3 }
64
+ - { from: component_grade, to: wear_index, fn: map, mapping: { economy: 0.8, standard: 0.0, premium: -0.6 } }
65
+ - { from: wear_index, to: needs_maintenance, fn: logistic, weight: 1.1, bias: -6.8 }
66
+ noise:
67
+ wear_index: { dist: none }
68
+ needs_maintenance: { dist: none }
69
+
70
+ failures:
71
+ - type: drift # load regime creeps up over the campaign
72
+ column: load_factor
73
+ schedule: { kind: linear, magnitude: 0.18 }
74
+ - type: feature_noise # sensor noise on the load reading
75
+ column: load_factor
76
+ dist: normal
77
+ params: { mean: 0, std: 0.04 }
78
+ - type: mcar # dropped telemetry on load
79
+ columns: [load_factor]
80
+ rate: 0.05
81
+ - type: leakage # an alarm that is essentially the label — drop it
82
+ target: needs_maintenance
83
+ into: maintenance_alarm
84
+ noise: 0.06
85
+
86
+ export:
87
+ formats: [csv]
88
+ versions: [clean, injected]
89
+
90
+ meta:
91
+ level: hackathon
92
+ challenge:
93
+ title: "Predict turbine maintenance needs from sensor streams"
94
+ task: classification
95
+ target: needs_maintenance
96
+ metric: ROC-AUC / PR-AUC (class-imbalanced)
97
+ time_axis: row_order
98
+ hidden_structure: >
99
+ wear_index is latent and accumulates from the sensor streams + load + grade.
100
+ The series are autocorrelated — vibration/temp trend up, oil pressure trends
101
+ down as the unit degrades.
102
+ gotchas:
103
+ - "Row order is the time axis — do NOT shuffle across the train/test split."
104
+ - "load_factor drifts and is noisy/MCAR in the injected copy — a covariate-shift between early and late rows."
105
+ - "maintenance_alarm is a leaked proxy of the label — drop it."
106
+ - "Positive class is the minority — prefer PR-AUC over plain accuracy."
107
+ tags: [iot, predictive-maintenance, time-series, causal, latent, drift, leakage, classification]