datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,125 @@
1
+ datadoom_version: "1"
2
+ name: "telecom-churn-challenge"
3
+ description: >
4
+ HACKATHON — telecom customer churn with realistic records and mixed feature
5
+ types. Believable identity fields (customer name, email, city, signup date —
6
+ generated by deterministic providers) sit alongside the real signal: tenure,
7
+ monthly charges, support-call load, contract type and data usage feed a LATENT
8
+ `dissatisfaction` score (emit: false) behind the `churned` label. Calibrated to
9
+ the demanding 'kaggle' baseline-AUROC band [0.62, 0.72] — a genuinely hard
10
+ churn problem. A light data-quality stack corrupts a copy: heavy-usage records
11
+ under-report (MNAR), monthly charges have random gaps (MCAR), and 3% of churn
12
+ labels are noisy. The identity columns are pure realism — they leak no signal
13
+ and serve as a feature-selection / PII-hygiene check (drop them).
14
+ seed: 404
15
+ rows: 8000
16
+
17
+ features:
18
+ customer_name:
19
+ type: text
20
+ generator: name
21
+ locale: en
22
+ email:
23
+ type: text
24
+ generator: email
25
+ locale: en
26
+ city:
27
+ type: text
28
+ generator: city
29
+ locale: en
30
+ signup_date:
31
+ type: datetime
32
+ start: "2019-01-01"
33
+ end: "2023-12-31"
34
+ granularity: day
35
+ plan:
36
+ type: categorical
37
+ categories: [prepaid, basic, plus, premium]
38
+ weights: [0.2, 0.35, 0.3, 0.15]
39
+ contract:
40
+ type: categorical
41
+ categories: [monthly, annual, two_year]
42
+ weights: [0.6, 0.25, 0.15]
43
+ tenure_months:
44
+ type: numeric
45
+ dist: normal
46
+ params: { mean: 30, std: 18 }
47
+ min: 0
48
+ dtype: int
49
+ monthly_charges:
50
+ type: numeric
51
+ dist: normal
52
+ params: { mean: 65, std: 25 }
53
+ min: 0
54
+ num_support_calls:
55
+ type: numeric
56
+ dist: poisson
57
+ params: { lam: 1.8 }
58
+ min: 0
59
+ dtype: int
60
+ data_usage_gb:
61
+ type: numeric
62
+ dist: lognormal
63
+ params: { mu: 2.4, sigma: 0.6 } # median ≈ 11 GB, right-skewed
64
+ min: 0
65
+ dissatisfaction:
66
+ type: numeric # LATENT churn propensity — the true driver
67
+ dtype: float
68
+ emit: false
69
+ churned:
70
+ type: boolean # label (derived): logistic of dissatisfaction
71
+
72
+ causal:
73
+ edges:
74
+ - { from: tenure_months, to: dissatisfaction, fn: linear, weight: -0.03 }
75
+ - { from: monthly_charges, to: dissatisfaction, fn: linear, weight: 0.02 }
76
+ - { from: num_support_calls, to: dissatisfaction, fn: linear, weight: 0.42 }
77
+ - { from: data_usage_gb, to: dissatisfaction, fn: linear, weight: -0.02 }
78
+ - { from: contract, to: dissatisfaction, fn: map, mapping: { monthly: 1.0, annual: -0.3, two_year: -1.0 } }
79
+ - { from: dissatisfaction, to: churned, fn: logistic, weight: 1.2, bias: -3.5 }
80
+ noise:
81
+ dissatisfaction: { dist: none }
82
+ churned: { dist: none }
83
+
84
+ difficulty:
85
+ target: kaggle # baseline ROC-AUC calibrated into [0.62, 0.72]
86
+ label: churned
87
+ probe: logreg
88
+ max_iters: 12
89
+ knobs: [noise, label_noise]
90
+
91
+ failures:
92
+ - type: mnar # heavy users under-report usage
93
+ column: data_usage_gb
94
+ rate: 0.08
95
+ strength: 2.0
96
+ - type: mcar # billing gaps
97
+ columns: [monthly_charges]
98
+ rate: 0.04
99
+ - type: label_noise # 3% noisy churn flags
100
+ column: churned
101
+ rate: 0.03
102
+
103
+ export:
104
+ formats: [csv]
105
+ versions: [clean, injected]
106
+ splits: { train: 0.7, test: 0.3 }
107
+
108
+ meta:
109
+ level: hackathon
110
+ challenge:
111
+ title: "Predict telecom customer churn"
112
+ task: classification
113
+ target: churned
114
+ metric: ROC-AUC
115
+ difficulty: kaggle
116
+ baseline_auroc_band: [0.62, 0.72]
117
+ train_test_split: "70 / 30"
118
+ hidden_structure: >
119
+ dissatisfaction is latent. tenure (−), charges (+), support calls (+),
120
+ usage (−) and contract type drive it; it drives churn.
121
+ gotchas:
122
+ - "customer_name / email / city / signup_date are realistic but carry NO signal — drop these identity columns."
123
+ - "data_usage_gb is MNAR (heavy users under-report) — missingness itself is informative."
124
+ - "Calibrated to the kaggle band — expect a hard ceiling around 0.72 AUROC; chasing higher means overfitting noise."
125
+ tags: [telecom, churn, causal, latent, difficulty, realistic-text, missingness, classification]
datadoom/version.py ADDED
@@ -0,0 +1,3 @@
1
+ """Single source of truth for the package version (read by hatchling)."""
2
+
3
+ __version__ = "0.1.0.dev0"