datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,125 @@
|
|
|
1
|
+
datadoom_version: "1"
|
|
2
|
+
name: "telecom-churn-challenge"
|
|
3
|
+
description: >
|
|
4
|
+
HACKATHON — telecom customer churn with realistic records and mixed feature
|
|
5
|
+
types. Believable identity fields (customer name, email, city, signup date —
|
|
6
|
+
generated by deterministic providers) sit alongside the real signal: tenure,
|
|
7
|
+
monthly charges, support-call load, contract type and data usage feed a LATENT
|
|
8
|
+
`dissatisfaction` score (emit: false) behind the `churned` label. Calibrated to
|
|
9
|
+
the demanding 'kaggle' baseline-AUROC band [0.62, 0.72] — a genuinely hard
|
|
10
|
+
churn problem. A light data-quality stack corrupts a copy: heavy-usage records
|
|
11
|
+
under-report (MNAR), monthly charges have random gaps (MCAR), and 3% of churn
|
|
12
|
+
labels are noisy. The identity columns are pure realism and leak no signal —
|
|
13
|
+
they're a feature-selection / PII-hygiene check (drop them).
|
|
14
|
+
seed: 404
|
|
15
|
+
rows: 8000
|
|
16
|
+
|
|
17
|
+
features:
|
|
18
|
+
customer_name:
|
|
19
|
+
type: text
|
|
20
|
+
generator: name
|
|
21
|
+
locale: en
|
|
22
|
+
email:
|
|
23
|
+
type: text
|
|
24
|
+
generator: email
|
|
25
|
+
locale: en
|
|
26
|
+
city:
|
|
27
|
+
type: text
|
|
28
|
+
generator: city
|
|
29
|
+
locale: en
|
|
30
|
+
signup_date:
|
|
31
|
+
type: datetime
|
|
32
|
+
start: "2019-01-01"
|
|
33
|
+
end: "2023-12-31"
|
|
34
|
+
granularity: day
|
|
35
|
+
plan:
|
|
36
|
+
type: categorical
|
|
37
|
+
categories: [prepaid, basic, plus, premium]
|
|
38
|
+
weights: [0.2, 0.35, 0.3, 0.15]
|
|
39
|
+
contract:
|
|
40
|
+
type: categorical
|
|
41
|
+
categories: [monthly, annual, two_year]
|
|
42
|
+
weights: [0.6, 0.25, 0.15]
|
|
43
|
+
tenure_months:
|
|
44
|
+
type: numeric
|
|
45
|
+
dist: normal
|
|
46
|
+
params: { mean: 30, std: 18 }
|
|
47
|
+
min: 0
|
|
48
|
+
dtype: int
|
|
49
|
+
monthly_charges:
|
|
50
|
+
type: numeric
|
|
51
|
+
dist: normal
|
|
52
|
+
params: { mean: 65, std: 25 }
|
|
53
|
+
min: 0
|
|
54
|
+
num_support_calls:
|
|
55
|
+
type: numeric
|
|
56
|
+
dist: poisson
|
|
57
|
+
params: { lam: 1.8 }
|
|
58
|
+
min: 0
|
|
59
|
+
dtype: int
|
|
60
|
+
data_usage_gb:
|
|
61
|
+
type: numeric
|
|
62
|
+
dist: lognormal
|
|
63
|
+
params: { mu: 2.4, sigma: 0.6 } # median ≈ 11 GB, right-skewed
|
|
64
|
+
min: 0
|
|
65
|
+
dissatisfaction:
|
|
66
|
+
type: numeric # LATENT churn propensity — the true driver
|
|
67
|
+
dtype: float
|
|
68
|
+
emit: false
|
|
69
|
+
churned:
|
|
70
|
+
type: boolean # label (derived): logistic of dissatisfaction
|
|
71
|
+
|
|
72
|
+
causal:
|
|
73
|
+
edges:
|
|
74
|
+
- { from: tenure_months, to: dissatisfaction, fn: linear, weight: -0.03 }
|
|
75
|
+
- { from: monthly_charges, to: dissatisfaction, fn: linear, weight: 0.02 }
|
|
76
|
+
- { from: num_support_calls, to: dissatisfaction, fn: linear, weight: 0.42 }
|
|
77
|
+
- { from: data_usage_gb, to: dissatisfaction, fn: linear, weight: -0.02 }
|
|
78
|
+
- { from: contract, to: dissatisfaction, fn: map, mapping: { monthly: 1.0, annual: -0.3, two_year: -1.0 } }
|
|
79
|
+
- { from: dissatisfaction, to: churned, fn: logistic, weight: 1.2, bias: -3.5 }
|
|
80
|
+
noise:
|
|
81
|
+
dissatisfaction: { dist: none }
|
|
82
|
+
churned: { dist: none }
|
|
83
|
+
|
|
84
|
+
difficulty:
|
|
85
|
+
target: kaggle # baseline ROC-AUC calibrated into [0.62, 0.72]
|
|
86
|
+
label: churned
|
|
87
|
+
probe: logreg
|
|
88
|
+
max_iters: 12
|
|
89
|
+
knobs: [noise, label_noise]
|
|
90
|
+
|
|
91
|
+
failures:
|
|
92
|
+
- type: mnar # heavy users under-report usage
|
|
93
|
+
column: data_usage_gb
|
|
94
|
+
rate: 0.08
|
|
95
|
+
strength: 2.0
|
|
96
|
+
- type: mcar # billing gaps
|
|
97
|
+
columns: [monthly_charges]
|
|
98
|
+
rate: 0.04
|
|
99
|
+
- type: label_noise # 3% noisy churn flags
|
|
100
|
+
column: churned
|
|
101
|
+
rate: 0.03
|
|
102
|
+
|
|
103
|
+
export:
|
|
104
|
+
formats: [csv]
|
|
105
|
+
versions: [clean, injected]
|
|
106
|
+
splits: { train: 0.7, test: 0.3 }
|
|
107
|
+
|
|
108
|
+
meta:
|
|
109
|
+
level: hackathon
|
|
110
|
+
challenge:
|
|
111
|
+
title: "Predict telecom customer churn"
|
|
112
|
+
task: classification
|
|
113
|
+
target: churned
|
|
114
|
+
metric: ROC-AUC
|
|
115
|
+
difficulty: kaggle
|
|
116
|
+
baseline_auroc_band: [0.62, 0.72]
|
|
117
|
+
train_test_split: "70 / 30"
|
|
118
|
+
hidden_structure: >
|
|
119
|
+
dissatisfaction is latent. tenure (−), charges (+), support calls (+),
|
|
120
|
+
usage (−) and contract type drive it; it drives churn.
|
|
121
|
+
gotchas:
|
|
122
|
+
- "customer_name / email / city / signup_date are realistic but carry NO signal — drop these identity columns."
|
|
123
|
+
- "data_usage_gb is MNAR (heavy users under-report) — missingness itself is informative."
|
|
124
|
+
- "Calibrated to the kaggle band — expect a hard ceiling around 0.72 AUROC; chasing higher means overfitting noise."
|
|
125
|
+
tags: [telecom, churn, causal, latent, difficulty, realistic-text, missingness, classification]
|
datadoom/version.py
ADDED