datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,171 @@
1
+ """The difficulty dial: knobs that make a dataset harder, baked into the frame.
2
+
3
+ The adaptive loop (``calibrate.py``) is a 1-D root-find — it can only bisect a
4
+ single monotone scalar. So the lean-default knob set is composed into **one
5
+ bisectable dial** ``d``:
6
+
7
+ d ∈ [0, 1): feature-observation noise η ramps 0 → η_max
8
+ d ∈ [1, 2]: η held at η_max, label-flip rate ρ ramps 0 → ρ_max
9
+
10
+ Feature noise is the primary lever — it adds Gaussian observation noise to the
11
+ numeric predictors while **leaving the authored causal graph untouched** (so the
12
+ ``causal_truth`` report stays honest: the label still depends on the true
13
+ signal, the features are just noisier observations of it). Label flipping is the
14
+ deep-end extension used when feature noise saturates before reaching a very low
15
+ target band.
16
+
17
+ Determinism: the per-column noise draws and the label-flip draws are taken
18
+ *once* (independent of ``d``) and merely *scaled* by the dial. This makes the
19
+ realized frame a continuous, monotone function of ``d`` and guarantees the
20
+ flipped-row set is **nested** as ρ grows — clean monotonicity for the bisection,
21
+ and byte-identical output for a given final dial (invariant #6).
22
+
23
+ `causal` (coefficient shrink) and `imbalance` are recognized knob *names* in the
24
+ spec but are not active levers in v0.1; the calibrator reports which knobs it
25
+ actually used. See the difficulty backlog in ``status.md``.
26
+ """
27
+
28
+ from __future__ import annotations
29
+
30
+ from dataclasses import dataclass
31
+ from typing import TYPE_CHECKING
32
+
33
+ import numpy as np
34
+ import pandas as pd
35
+
36
+ if TYPE_CHECKING:
37
+ from ..rng import RNGFactory
38
+
39
# At η = η_max each numeric predictor receives additive noise with std equal to
# η_max × the feature's own (clean) standard deviation — a 1.5σ observation
# blur, enough to wash most linear signal out of a strong feature.
FEATURE_NOISE_MAX: float = 1.5
# Label flips top out at 0.5 — at which point a binary label is pure coin-flip
# (AUROC → 0.5), the hardest any classification task can be.
LABEL_FLIP_MAX: float = 0.5
# The dial spans the feature-noise region [0,1) then the label-noise region [1,2].
DIAL_MAX: float = 2.0

# Knob names the dial actively implements; others are accepted-but-inactive.
# "noise" switches on the feature-noise lever and "label_noise" the label-flip
# lever (see ``DifficultyDial.__init__``).
ACTIVE_KNOBS: tuple[str, ...] = ("noise", "label_noise")
51
+
52
+
53
@dataclass
class KnobState:
    """Snapshot of the individual knob magnitudes behind one dial setting."""

    dial: float
    feature_noise: float  # realized η
    label_flip: float  # realized ρ

    def to_dict(self) -> dict[str, float]:
        """Return the three fields as a plain ``{name: value}`` mapping."""
        field_names = ("dial", "feature_noise", "label_flip")
        return {field: getattr(self, field) for field in field_names}
63
+
64
+
65
class DifficultyDial:
    """Pre-draws the perturbations once, then realizes any dial position cheaply.

    Holding the draws fixed and scaling by the dial is what makes ``μ(d)``
    monotone (nested label flips, proportional feature blur) so the calibrator's
    bisection is well-posed.

    Dial positions outside ``[0, DIAL_MAX]`` are clamped when they are turned
    into knob magnitudes, so η is never negative and ρ never exceeds
    ``LABEL_FLIP_MAX`` (a flip rate above one half would start inverting the
    label and break the monotonicity the bisection relies on).
    """

    def __init__(
        self,
        base: pd.DataFrame,
        label: str,
        rng: RNGFactory,
        knobs: list[str],
        used_namespaces: list[str],
    ) -> None:
        """Capture the clean frame and pre-draw every random perturbation.

        Args:
            base: The clean frame. ``realize`` works on a copy of it.
            label: Name of the label column; it is excluded from feature noise.
            rng: Factory whose ``difficulty(name)`` call returns the stream the
                draws are taken from (``standard_normal`` / ``random``).
            knobs: Requested knob names. Only ``"noise"`` and ``"label_noise"``
                activate a lever here.
            used_namespaces: Caller-owned list, appended to in place with one
                ``difficulty:...`` entry per stream actually drawn from.
        """
        self.base = base
        self.label = label
        self.n = len(base)
        self.use_feature_noise = "noise" in knobs
        self.use_label_noise = "label_noise" in knobs

        # Numeric predictors only (booleans/categoricals/datetimes are left to
        # the label-flip lever; the label column itself is never blurred).
        self.numeric_cols: list[str] = [
            c
            for c in base.columns
            if c != label
            and pd.api.types.is_numeric_dtype(base[c])
            and not pd.api.types.is_bool_dtype(base[c])
        ]
        self._z: dict[str, np.ndarray] = {}
        self._sd: dict[str, float] = {}
        self._is_int: dict[str, bool] = {}
        if self.use_feature_noise:
            for c in self.numeric_cols:
                col = base[c]
                # Clean-column spread; the noise std is a multiple of this.
                self._sd[c] = float(np.nanstd(col.to_numpy(dtype=float)))
                self._is_int[c] = pd.api.types.is_integer_dtype(col)
                # One standard-normal draw per row, taken once and reused for
                # every dial position.
                self._z[c] = rng.difficulty(f"feature:{c}").standard_normal(self.n)
                used_namespaces.append(f"difficulty:feature:{c}")

        self._u: np.ndarray | None = None
        if self.use_label_noise:
            # One uniform draw per row; a row flips once ρ exceeds its draw.
            self._u = rng.difficulty("label").random(self.n)
            used_namespaces.append("difficulty:label")

    # --- dial → knob magnitudes -----------------------------------------------------
    def feature_noise_at(self, dial: float) -> float:
        """Return η for ``dial``: a linear ramp over [0, 1], flat outside it."""
        if not self.use_feature_noise:
            return 0.0
        # Lower clamp keeps a negative dial from reporting a negative η.
        return min(max(dial, 0.0), 1.0) * FEATURE_NOISE_MAX

    def label_flip_at(self, dial: float) -> float:
        """Return ρ for ``dial``: zero up to 1, then a linear ramp to ``DIAL_MAX``."""
        if not self.use_label_noise:
            return 0.0
        # Upper clamp keeps ρ at or below LABEL_FLIP_MAX for dial > DIAL_MAX.
        return max(0.0, min(dial, DIAL_MAX) - 1.0) * LABEL_FLIP_MAX

    # --- realize a frame at a dial position -----------------------------------------
    def realize(self, dial: float) -> tuple[pd.DataFrame, KnobState]:
        """Build the perturbed frame for ``dial``.

        Returns:
            ``(frame, state)`` where ``frame`` is a perturbed copy of the clean
            base and ``state`` records the dial as passed together with the η
            and ρ actually applied.
        """
        eta = self.feature_noise_at(dial)
        rho = self.label_flip_at(dial)
        frame = self.base.copy()

        if eta > 0.0:
            for c in self.numeric_cols:
                sd = self._sd[c]
                if not sd > 0.0:
                    # Constant column, or an all-NaN column whose std is NaN —
                    # nothing to blur either way.
                    continue
                noisy = frame[c].to_numpy(dtype=float) + eta * sd * self._z[c]
                if self._is_int[c]:
                    # Round back so an integer column stays integer-valued.
                    frame[c] = np.rint(noisy).astype("int64")
                else:
                    frame[c] = noisy

        if rho > 0.0 and self._u is not None:
            flip = self._u < rho  # nested as ρ grows → monotone
            frame[self.label] = _flip_label(frame[self.label], flip)

        return frame, KnobState(dial=dial, feature_noise=eta, label_flip=rho)

    def noise_to_signal(self, eta: float) -> float:
        """Var(ε)/Var(signal) for the feature noise (05 §5.4): equals η²."""
        return float(eta * eta) if self.numeric_cols else 0.0
149
+
150
+
151
+ def _flip_label(series: pd.Series, flip: np.ndarray) -> pd.Series:
152
+ """Flip the selected label rows to a *different* class.
153
+
154
+ Boolean → logical NOT; binary categorical → swap to the other category;
155
+ k-ary categorical → rotate to the next category (deterministic, always a
156
+ genuine change). Mirrors the ``label_noise`` failure mode's "flip to a
157
+ different class" guarantee.
158
+ """
159
+ if pd.api.types.is_bool_dtype(series):
160
+ out = series.to_numpy().copy()
161
+ out[flip] = ~out[flip]
162
+ return pd.Series(out, index=series.index)
163
+
164
+ out = series.to_numpy().copy()
165
+ categories = sorted(pd.unique(series.dropna()))
166
+ if len(categories) < 2:
167
+ return series # nothing to flip into
168
+ nxt = {c: categories[(i + 1) % len(categories)] for i, c in enumerate(categories)}
169
+ flipped = np.array([nxt.get(v, v) for v in out[flip]], dtype=out.dtype)
170
+ out[flip] = flipped
171
+ return pd.Series(out, index=series.index)
@@ -0,0 +1,181 @@
1
+ """Baseline probe models for difficulty calibration (05 §5.1, 17 step 15).
2
+
3
+ Difficulty is defined *operationally*: a dataset is "as hard as" the score a
4
+ standard baseline model achieves on it. A :class:`ProbeModel` trains a baseline
5
+ classifier on a seeded train split and reports the task metric on the holdout —
6
+ AUROC for binary classification. The probe never refits or "helps" the data; it
7
+ just measures how separable the label is from the features (honest statistics,
8
+ invariant #3).
9
+
10
+ scikit-learn powers the estimators. The probe metric drives the adaptive loop's
11
+ knob selection, so it sits on the determinism-critical path — every estimator is
12
+ either intrinsically deterministic (lbfgs logistic regression) or seeded
13
+ (decision tree, train/test split). Same `(spec_hash, seed)` on the pinned
14
+ environment → identical metric → identical calibrated bytes (invariant #6).
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ from abc import ABC, abstractmethod
20
+ from collections.abc import Mapping
21
+ from dataclasses import dataclass
22
+ from typing import Any
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+ from sklearn.linear_model import LogisticRegression
27
+ from sklearn.metrics import accuracy_score, roc_auc_score
28
+ from sklearn.model_selection import train_test_split
29
+ from sklearn.preprocessing import StandardScaler
30
+ from sklearn.tree import DecisionTreeClassifier
31
+
32
# Object columns with more distinct values than this are treated as free-text /
# id-like and dropped from the probe design matrix (one-hot would explode).
_MAX_CATEGORY_CARDINALITY: int = 50
# Fraction of rows held out for scoring; passed as ``test_size`` to
# ``train_test_split`` in ``evaluate``.
_TEST_SIZE: float = 0.3
36
+
37
+
38
@dataclass
class ProbeResult:
    """Measurements a probe took on a single realized frame."""

    metric: float  # the task metric the band targets (AUROC for binary)
    metric_name: str  # "auroc"
    task: str  # "classification"
    linear_separability: float  # holdout accuracy of a linear probe (05 §5.4)
    class_balance: float  # fraction of the positive class
    n_features: int  # design-matrix width actually fed to the probe

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain dict, in declaration order."""
        field_names = (
            "metric",
            "metric_name",
            "task",
            "linear_separability",
            "class_balance",
            "n_features",
        )
        return {field: getattr(self, field) for field in field_names}
58
+
59
+
60
class ProbeModel(ABC):
    """Abstract baseline learner; its holdout score is the difficulty measure."""

    name: str
    # JSON-schema fragment describing probe options (09 §6). Built-in probes
    # keep the ``None`` default.
    param_schema: Mapping[str, Any] | None = None

    @abstractmethod
    def estimator(self, seed: int) -> Any:
        """Build a new, unfitted scikit-learn classifier offering ``predict_proba``.

        Args:
            seed: Seed for estimators that consume randomness.
        """
70
+
71
+
72
class LogRegProbe(ProbeModel):
    """Logistic-regression baseline registered under the name ``"logreg"``."""

    name = "logreg"

    def estimator(self, seed: int) -> Any:
        """Return an unfitted ``LogisticRegression`` capped at 1000 iterations."""
        # ``seed`` is deliberately unused: lbfgs is deterministic given the
        # data, so no random_state is needed.
        classifier = LogisticRegression(max_iter=1000)
        return classifier
78
+
79
+
80
class TreeProbe(ProbeModel):
    """Decision-tree baseline registered under the name ``"tree"``."""

    name = "tree"

    def estimator(self, seed: int) -> Any:
        """Return an unfitted depth-8 ``DecisionTreeClassifier`` seeded with ``seed``."""
        # The depth cap keeps the tree a *baseline* rather than a memorizer;
        # the seed makes its fit reproducible.
        settings = {"max_depth": 8, "random_state": seed}
        return DecisionTreeClassifier(**settings)
86
+
87
+
88
# Built-in probes keyed by their ``name`` attribute ("logreg", "tree"); each
# value is one shared instance created at import time.
PROBES: dict[str, ProbeModel] = {p.name: p for p in (LogRegProbe(), TreeProbe())}
89
+
90
+
91
+ def _encode_label(series: pd.Series) -> tuple[np.ndarray, int]:
92
+ """Map a classification label to 0/1 codes; return (codes, n_classes).
93
+
94
+ The positive class is the lexicographically larger category (so booleans map
95
+ True→1) — a stable convention so AUROC orientation is reproducible.
96
+ """
97
+ if pd.api.types.is_bool_dtype(series):
98
+ return series.to_numpy().astype(int), 2
99
+ codes, uniques = pd.factorize(series, sort=True)
100
+ return codes.astype(int), len(uniques)
101
+
102
+
103
+ def _design_matrix(frame: pd.DataFrame, label: str) -> tuple[np.ndarray, list[str]]:
104
+ """Build a numeric feature matrix from every column except the label.
105
+
106
+ Numeric/boolean columns pass through (median-imputed); low-cardinality
107
+ categoricals are one-hot encoded; datetimes become ordinals; free-text /
108
+ high-cardinality columns are dropped (uninformative to a baseline probe).
109
+ """
110
+ parts: list[pd.DataFrame] = []
111
+ for col in frame.columns:
112
+ if col == label:
113
+ continue
114
+ s = frame[col]
115
+ if pd.api.types.is_bool_dtype(s):
116
+ parts.append(s.astype(float).to_frame(col))
117
+ elif pd.api.types.is_numeric_dtype(s):
118
+ filled = s.astype(float)
119
+ median = filled.median()
120
+ parts.append(filled.fillna(0.0 if pd.isna(median) else median).to_frame(col))
121
+ elif pd.api.types.is_datetime64_any_dtype(s):
122
+ parts.append(s.astype("int64").astype(float).to_frame(col))
123
+ else: # object / categorical
124
+ if s.nunique(dropna=True) <= _MAX_CATEGORY_CARDINALITY:
125
+ dummies = pd.get_dummies(s, prefix=col, dummy_na=False, dtype=float)
126
+ if dummies.shape[1] > 0:
127
+ parts.append(dummies)
128
+ if not parts:
129
+ return np.empty((len(frame), 0), dtype=float), []
130
+ design = pd.concat(parts, axis=1)
131
+ return design.to_numpy(dtype=float), list(design.columns)
132
+
133
+
134
def evaluate(
    probe: ProbeModel,
    frame: pd.DataFrame,
    label: str,
    *,
    split_seed: int,
    est_seed: int,
) -> ProbeResult:
    """Train ``probe`` on a seeded split and score it on the holdout (05 §5.1).

    Binary classification only in v0.1: returns holdout AUROC. Degenerate cases
    (no usable features, a single realized class) score at chance (0.5) — honest
    "no signal" rather than a misleading number or a crash.

    Rows whose label is missing are left out of the split, the score and the
    reported class balance.

    Args:
        probe: Supplies the estimator that is fitted and scored.
        frame: The realized frame, label column included.
        label: Name of the label column.
        split_seed: ``random_state`` for the train/holdout split.
        est_seed: Seed handed to ``probe.estimator``.

    Returns:
        A :class:`ProbeResult` carrying the AUROC, a logistic-regression
        holdout accuracy, the positive-class fraction and the feature count.

    NOTE(review): a label with more than two classes goes straight to
    ``roc_auc_score`` — confirm callers only pass binary labels.
    """
    X, _ = _design_matrix(frame, label)
    y, _ = _encode_label(frame[label])

    # ``pd.factorize`` marks a missing label with a negative sentinel code. Such
    # rows have no target to learn, and a negative code would make
    # ``np.bincount`` below raise, so drop them before measuring anything.
    has_label = y >= 0
    if not has_label.all():
        X, y = X[has_label], y[has_label]

    n_features = X.shape[1]
    positive_rate = float(np.mean(y == 1)) if len(y) else 0.0
    chance = ProbeResult(0.5, "auroc", "classification", 0.5, positive_rate, n_features)

    classes = np.unique(y)
    if n_features == 0 or classes.size < 2:
        # Nothing to learn from, or label collapsed to one class → chance.
        return chance

    # Stratify only when every class can appear on both sides of the split.
    counts = np.bincount(y)
    stratify = y if counts.min() >= 2 else None
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=_TEST_SIZE, random_state=split_seed, stratify=stratify
    )
    if np.unique(y_te).size < 2 or np.unique(y_tr).size < 2:
        # One side of the split holds a single class, so AUROC is undefined.
        return chance

    # Fit the scaler on the training rows only, so the holdout stays unseen.
    scaler = StandardScaler().fit(X_tr)
    X_tr_s = scaler.transform(X_tr)
    X_te_s = scaler.transform(X_te)

    model = probe.estimator(est_seed)
    model.fit(X_tr_s, y_tr)
    proba = model.predict_proba(X_te_s)[:, 1]
    auroc = float(roc_auc_score(y_te, proba))

    # Linear-separability reference (05 §5.4): a plain logistic probe's holdout
    # accuracy, regardless of which probe scored the AUROC.
    linear = LogisticRegression(max_iter=1000).fit(X_tr_s, y_tr)
    lin_acc = float(accuracy_score(y_te, linear.predict(X_te_s)))

    return ProbeResult(auroc, "auroc", "classification", lin_acc, positive_rate, n_features)
@@ -0,0 +1,35 @@
1
+ """Distributions + honest compliance reporting."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import Distribution
6
+ from .builtins import (
7
+ REGISTRY,
8
+ sample_boolean,
9
+ sample_categorical,
10
+ sample_datetime,
11
+ sample_text,
12
+ )
13
+ from .compliance import ComplianceReport, FeatureCompliance, assess_numeric
14
+ from .providers import (
15
+ REALISTIC_GENERATORS,
16
+ is_realistic_generator,
17
+ resolve_locale,
18
+ sample_provider,
19
+ )
20
+
21
# Public API of the ``dist`` package; every name listed here is imported above
# from ``.base``, ``.builtins``, ``.compliance`` or ``.providers``.
__all__ = [
    "Distribution",
    "REGISTRY",
    "ComplianceReport",
    "FeatureCompliance",
    "assess_numeric",
    "sample_boolean",
    "sample_categorical",
    "sample_datetime",
    "sample_text",
    "sample_provider",
    "is_realistic_generator",
    "resolve_locale",
    "REALISTIC_GENERATORS",
]
@@ -0,0 +1,46 @@
1
+ """Distribution ABC (05 §2).
2
+
3
+ A :class:`Distribution` knows how to *sample* from a target ``D(theta)`` using an
4
+ injected RNG, *validate* its parameters, and expose its theoretical *cdf* so the
5
+ compliance layer can run a KS test. Sampling is correct by construction — we
6
+ never refit parameters to the realized sample (05 §2.3).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from abc import ABC, abstractmethod
12
+ from typing import Mapping
13
+
14
+ import numpy as np
15
+
16
+ from ..errors import SpecValidationError
17
+
18
+
19
class Distribution(ABC):
    """Abstract target distribution: sampling, a theoretical CDF and validation."""

    name: str
    required_params: tuple[str, ...] = ()
    # Optional JSON-schema fragment for the feature `params`. Built-ins leave this
    # ``None`` (the Canvas renders their native controls); plugins declare one so
    # the UI can render config controls with no frontend work (09 §6).
    param_schema: Mapping[str, object] | None = None

    @abstractmethod
    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Return ``n`` draws from the target distribution, taken from ``rng``."""

    @abstractmethod
    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Return the theoretical CDF F(x; params), used for KS reporting."""

    def validate(self, params: Mapping[str, float], locator: str | None = None) -> None:
        """Verify ``params`` has every required key, then check its domain.

        Args:
            params: Parameter mapping to check.
            locator: Optional location string attached to the raised error.

        Raises:
            SpecValidationError: If a name in ``required_params`` is absent, or
                if ``_validate_domain`` rejects a value.
        """
        absent = [key for key in self.required_params if key not in params]
        if not absent:
            # Domain checks run only once every required key is known to exist.
            self._validate_domain(params, locator)
            return
        raise SpecValidationError(
            f"distribution {self.name!r} missing params: {absent}", locator=locator
        )

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Hook for per-distribution constraints (e.g. std > 0); no-op by default."""
@@ -0,0 +1,172 @@
1
+ """Built-in distributions and feature samplers (04 §4, 05 §2).
2
+
3
+ Numeric distributions referenced by ``dist:`` live in :data:`REGISTRY`. Sampling
4
+ flows entirely through the injected ``numpy.random.Generator`` so results are
5
+ deterministic on the pinned path. Non-numeric feature kinds (categorical,
6
+ boolean, datetime, text) have dedicated samplers used by the pipeline.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from typing import Mapping
12
+
13
+ import numpy as np
14
+ from scipy import stats
15
+
16
+ from ..errors import SpecValidationError
17
+ from .base import Distribution
18
+
19
+
20
+ def _require_positive(params: Mapping[str, float], keys: tuple[str, ...], locator: str | None) -> None:
21
+ for k in keys:
22
+ if k in params and params[k] <= 0:
23
+ raise SpecValidationError(f"param {k!r} must be > 0", locator=locator)
24
+
25
+
26
class Normal(Distribution):
    """Gaussian distribution parameterized by ``mean`` and ``std``."""

    name = "normal"
    required_params = ("mean", "std")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require a strictly positive ``std``."""
        _require_positive(params, ("std",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values with ``rng.normal``."""
        mean, std = params["mean"], params["std"]
        return rng.normal(mean, std, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the normal CDF at ``x`` via ``scipy.stats.norm``."""
        mean, std = params["mean"], params["std"]
        return stats.norm.cdf(x, loc=mean, scale=std)
38
+
39
+
40
class LogNormal(Distribution):
    """Log-normal distribution parameterized by the log-scale ``mu`` and ``sigma``."""

    name = "lognormal"
    required_params = ("mu", "sigma")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require a strictly positive ``sigma``."""
        _require_positive(params, ("sigma",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values with ``rng.lognormal``."""
        mu, sigma = params["mu"], params["sigma"]
        return rng.lognormal(mu, sigma, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the log-normal CDF at ``x`` via ``scipy.stats.lognorm``."""
        # scipy's shape ``s`` is sigma and its ``scale`` is exp(mu).
        shape = params["sigma"]
        scale = np.exp(params["mu"])
        return stats.lognorm.cdf(x, s=shape, scale=scale)
52
+
53
+
54
class Poisson(Distribution):
    """Poisson distribution parameterized by the rate ``lam``."""

    name = "poisson"
    required_params = ("lam",)

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require a strictly positive ``lam``."""
        _require_positive(params, ("lam",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` counts with ``rng.poisson``."""
        rate = params["lam"]
        return rng.poisson(rate, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the Poisson CDF at ``x`` via ``scipy.stats.poisson``."""
        rate = params["lam"]
        return stats.poisson.cdf(x, mu=rate)
66
+
67
+
68
class Pareto(Distribution):
    """Classical (type I) Pareto with shape ``alpha`` and minimum ``xm``."""

    name = "pareto"
    required_params = ("alpha", "xm")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require strictly positive ``alpha`` and ``xm``."""
        _require_positive(params, ("alpha", "xm"), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values, shifting and scaling numpy's Lomax draw."""
        # numpy draws Lomax (Pareto II); classical Pareto I = (lomax + 1) * xm.
        lomax = rng.pareto(params["alpha"], size=n)
        return (lomax + 1.0) * params["xm"]

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the Pareto CDF at ``x`` via ``scipy.stats.pareto``."""
        shape, minimum = params["alpha"], params["xm"]
        return stats.pareto.cdf(x, b=shape, scale=minimum)
81
+
82
+
83
class Uniform(Distribution):
    """Continuous uniform distribution between ``low`` and ``high``."""

    name = "uniform"
    required_params = ("low", "high")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Reject bounds where ``low`` is not below ``high``."""
        low, high = params["low"], params["high"]
        if low >= high:
            raise SpecValidationError("uniform requires low < high", locator=locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values with ``rng.uniform``."""
        low, high = params["low"], params["high"]
        return rng.uniform(low, high, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the uniform CDF at ``x`` via ``scipy.stats.uniform``."""
        # scipy parameterizes by start (``loc``) and width (``scale``).
        low = params["low"]
        width = params["high"] - params["low"]
        return stats.uniform.cdf(x, loc=low, scale=width)
96
+
97
+
98
class Exponential(Distribution):
    """Exponential distribution parameterized by ``scale``."""

    name = "exponential"
    required_params = ("scale",)

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require a strictly positive ``scale``."""
        _require_positive(params, ("scale",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values with ``rng.exponential``."""
        scale = params["scale"]
        return rng.exponential(scale, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the exponential CDF at ``x`` via ``scipy.stats.expon``."""
        scale = params["scale"]
        return stats.expon.cdf(x, scale=scale)
110
+
111
+
112
# Built-in numeric distributions keyed by their ``name`` attribute; each value
# is one shared instance created at import time.
REGISTRY: dict[str, Distribution] = {
    d.name: d
    for d in (Normal(), LogNormal(), Poisson(), Pareto(), Uniform(), Exponential())
}
116
+
117
+
118
+ # --- Non-numeric feature samplers ----------------------------------------------------
119
+
120
+
121
def sample_categorical(
    rng: np.random.Generator, n: int, categories: list[str], weights: list[float] | None
) -> np.ndarray:
    """Draw ``n`` categories, uniformly or in proportion to ``weights``.

    Args:
        rng: Generator the draw is taken from.
        n: Number of values to draw.
        categories: Candidate values.
        weights: Relative weights, normalized here to sum to one; ``None``
            gives every category equal probability.

    Returns:
        An object array of ``n`` sampled categories.
    """
    probabilities = None
    if weights is not None:
        total = float(sum(weights))
        probabilities = [weight / total for weight in weights]
    pool = np.asarray(categories, dtype=object)
    return rng.choice(pool, size=n, p=probabilities)
130
+
131
+
132
def sample_boolean(rng: np.random.Generator, n: int, rate: float) -> np.ndarray:
    """Draw ``n`` booleans, each true when its uniform draw falls below ``rate``."""
    uniform_draws = rng.random(size=n)
    return uniform_draws < rate
134
+
135
+
136
def sample_datetime(
    rng: np.random.Generator,
    n: int,
    start: str,
    end: str,
    granularity: str,
) -> np.ndarray:
    """Draw ``n`` timestamps uniformly from [start, end] on a fixed grid.

    Args:
        rng: Generator the offsets are drawn from.
        n: Number of timestamps to draw.
        start: First allowed timestamp, parsed by ``np.datetime64``.
        end: Last allowed timestamp (inclusive), parsed the same way.
        granularity: One of ``"second"``, ``"minute"``, ``"hour"``, ``"day"``.

    Raises:
        KeyError: If ``granularity`` is not one of the four supported names.
        SpecValidationError: If ``end`` lies before ``start``.
    """
    unit_codes = {"second": "s", "minute": "m", "hour": "h", "day": "D"}
    unit = unit_codes[granularity]
    stamp_dtype = f"datetime64[{unit}]"

    # Both endpoints share one unit, so their difference is a whole number of
    # steps — no unit mismatch, and the result stays fully deterministic.
    first = np.datetime64(start).astype(stamp_dtype)
    last = np.datetime64(end).astype(stamp_dtype)
    n_steps = int((last - first).astype("int64"))
    if n_steps < 0:
        raise SpecValidationError("datetime end must be >= start")

    # The upper bound is exclusive, hence +1 to make ``end`` reachable.
    steps = rng.integers(0, n_steps + 1, size=n)
    return first + steps.astype(f"timedelta64[{unit}]")
154
+
155
+
156
+ _LOREM = (
157
+ "lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor "
158
+ "incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud "
159
+ "exercitation ullamco laboris nisi aliquip ex ea commodo consequat"
160
+ ).split()
161
+
162
+
163
+ def sample_text(
164
+ rng: np.random.Generator, n: int, min_len: int, max_len: int
165
+ ) -> np.ndarray:
166
+ words = np.asarray(_LOREM, dtype=object)
167
+ lengths = rng.integers(min_len, max_len + 1, size=n)
168
+ out = np.empty(n, dtype=object)
169
+ for i in range(n):
170
+ idx = rng.integers(0, len(words), size=int(lengths[i]))
171
+ out[i] = " ".join(words[idx])
172
+ return out