datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,171 @@
|
|
|
1
|
+
"""The difficulty dial: knobs that make a dataset harder, baked into the frame.
|
|
2
|
+
|
|
3
|
+
The adaptive loop (``calibrate.py``) is a 1-D root-find — it can only bisect a
|
|
4
|
+
single monotone scalar. So the lean-default knob set is composed into **one
|
|
5
|
+
bisectable dial** ``d``:
|
|
6
|
+
|
|
7
|
+
d ∈ [0, 1): feature-observation noise η ramps 0 → η_max
|
|
8
|
+
d ∈ [1, 2]: η held at η_max, label-flip rate ρ ramps 0 → ρ_max
|
|
9
|
+
|
|
10
|
+
Feature noise is the primary lever — it adds Gaussian observation noise to the
|
|
11
|
+
numeric predictors while **leaving the authored causal graph untouched** (so the
|
|
12
|
+
``causal_truth`` report stays honest: the label still depends on the true
|
|
13
|
+
signal, the features are just noisier observations of it). Label flipping is the
|
|
14
|
+
deep-end extension used when feature noise saturates before reaching a very low
|
|
15
|
+
target band.
|
|
16
|
+
|
|
17
|
+
Determinism: the per-column noise draws and the label-flip draws are taken
|
|
18
|
+
*once* (independent of ``d``) and merely *scaled* by the dial. This makes the
|
|
19
|
+
realized frame a continuous, monotone function of ``d`` and guarantees the
|
|
20
|
+
flipped-row set is **nested** as ρ grows — clean monotonicity for the bisection,
|
|
21
|
+
and byte-identical output for a given final dial (invariant #6).
|
|
22
|
+
|
|
23
|
+
`causal` (coefficient shrink) and `imbalance` are recognized knob *names* in the
|
|
24
|
+
spec but are not active levers in v0.1; the calibrator reports which knobs it
|
|
25
|
+
actually used. See the difficulty backlog in ``status.md``.
|
|
26
|
+
"""
|
|
27
|
+
|
|
28
|
+
from __future__ import annotations
|
|
29
|
+
|
|
30
|
+
from dataclasses import dataclass
|
|
31
|
+
from typing import TYPE_CHECKING
|
|
32
|
+
|
|
33
|
+
import numpy as np
|
|
34
|
+
import pandas as pd
|
|
35
|
+
|
|
36
|
+
if TYPE_CHECKING:
|
|
37
|
+
from ..rng import RNGFactory
|
|
38
|
+
|
|
39
|
+
# At η = η_max each numeric predictor receives additive noise with std equal to
|
|
40
|
+
# η_max × the feature's own (clean) standard deviation — a 1.5σ observation
|
|
41
|
+
# blur, enough to wash most linear signal out of a strong feature.
|
|
42
|
+
FEATURE_NOISE_MAX = 1.5
|
|
43
|
+
# Label flips top out at 0.5 — at which point a binary label is pure coin-flip
|
|
44
|
+
# (AUROC → 0.5), the hardest any classification task can be.
|
|
45
|
+
LABEL_FLIP_MAX = 0.5
|
|
46
|
+
# The dial spans the feature-noise region [0,1) then the label-noise region [1,2].
|
|
47
|
+
DIAL_MAX = 2.0
|
|
48
|
+
|
|
49
|
+
# Knob names the dial actively implements; others are accepted-but-inactive.
|
|
50
|
+
ACTIVE_KNOBS = ("noise", "label_noise")
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@dataclass
class KnobState:
    """Snapshot of the individual knob magnitudes produced by one dial setting."""

    dial: float  # dial position this state was realized at
    feature_noise: float  # realized η
    label_flip: float  # realized ρ

    def to_dict(self) -> dict[str, float]:
        """Return the three fields as a plain ``{field_name: value}`` mapping."""
        field_names = ("dial", "feature_noise", "label_flip")
        return {field: getattr(self, field) for field in field_names}
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
class DifficultyDial:
    """Pre-draw every perturbation once, then realize any dial position cheaply.

    The Gaussian draws for each numeric predictor and the uniform draws used
    for label flipping are taken in ``__init__`` and never redrawn; a dial
    position only *scales* (feature noise) or *thresholds* (label flips) them.
    Because the draws are fixed, the flipped-row set at a smaller ρ is a subset
    of the set at a larger ρ, and the feature blur grows proportionally with η.
    """

    def __init__(
        self,
        base: pd.DataFrame,
        label: str,
        rng: RNGFactory,
        knobs: list[str],
        used_namespaces: list[str],
    ) -> None:
        """Record the clean frame and pre-draw the perturbations.

        Args:
            base: The clean frame; it is copied, never mutated, by ``realize``.
            label: Name of the label column (excluded from feature noise).
            rng: Factory whose ``difficulty(name)`` returns a generator exposing
                ``standard_normal`` / ``random`` (presumably a NumPy
                ``Generator`` -- confirm against ``RNGFactory``).
            knobs: Knob names requested by the spec; only ``"noise"`` and
                ``"label_noise"`` activate a lever here.
            used_namespaces: Mutated in place -- one entry is appended per RNG
                stream actually drawn from.
        """
        self.base = base
        self.label = label
        self.n = len(base)
        self.use_feature_noise = "noise" in knobs
        self.use_label_noise = "label_noise" in knobs

        # Numeric predictors only (booleans/categoricals/datetimes are left to
        # the label-flip lever; the label column itself is never blurred).
        self.numeric_cols: list[str] = [
            c
            for c in base.columns
            if c != label
            and pd.api.types.is_numeric_dtype(base[c])
            and not pd.api.types.is_bool_dtype(base[c])
        ]
        self._z: dict[str, np.ndarray] = {}
        self._sd: dict[str, float] = {}
        self._is_int: dict[str, bool] = {}
        if self.use_feature_noise:
            for c in self.numeric_cols:
                col = base[c]
                # Clean standard deviation, ignoring NaNs; the noise is scaled by it.
                self._sd[c] = float(np.nanstd(col.to_numpy(dtype=float)))
                self._is_int[c] = pd.api.types.is_integer_dtype(col)
                self._z[c] = rng.difficulty(f"feature:{c}").standard_normal(self.n)
                used_namespaces.append(f"difficulty:feature:{c}")

        self._u: np.ndarray | None = None
        if self.use_label_noise:
            self._u = rng.difficulty("label").random(self.n)
            used_namespaces.append("difficulty:label")

    # --- dial → knob magnitudes -----------------------------------------------------
    def feature_noise_at(self, dial: float) -> float:
        """Return η for ``dial``: a linear ramp over [0, 1], flat afterwards.

        The dial is clamped to [0, 1] first, so a negative dial yields 0 rather
        than a negative noise level.
        """
        if not self.use_feature_noise:
            return 0.0
        return min(max(dial, 0.0), 1.0) * FEATURE_NOISE_MAX

    def label_flip_at(self, dial: float) -> float:
        """Return ρ for ``dial``: 0 up to dial 1, then a linear ramp over [1, 2].

        The ramp is capped at its end point, so a dial past 2 can never push
        the flip rate above ``LABEL_FLIP_MAX``.
        """
        if not self.use_label_noise:
            return 0.0
        return min(max(0.0, dial - 1.0), 1.0) * LABEL_FLIP_MAX

    # --- realize a frame at a dial position -----------------------------------------
    def realize(self, dial: float) -> tuple[pd.DataFrame, KnobState]:
        """Build the perturbed frame for ``dial`` and report the knob values used.

        Returns:
            ``(frame, state)`` where ``frame`` is a perturbed copy of ``base``
            and ``state`` carries the dial as passed plus the realized η and ρ.
        """
        eta = self.feature_noise_at(dial)
        rho = self.label_flip_at(dial)
        frame = self.base.copy()

        if eta > 0.0:
            for c in self.numeric_cols:
                sd = self._sd[c]
                # Skip constant columns (sd == 0) and columns whose std is NaN
                # (np.nanstd of an all-NaN column) -- there is nothing to blur.
                if not sd > 0.0:
                    continue
                noisy = frame[c].to_numpy(dtype=float) + eta * sd * self._z[c]
                if self._is_int[c]:
                    # Round back to whole numbers; the result is stored as int64.
                    frame[c] = np.rint(noisy).astype("int64")
                else:
                    frame[c] = noisy

        if rho > 0.0 and self._u is not None:
            flip = self._u < rho  # nested as ρ grows → monotone
            frame[self.label] = _flip_label(frame[self.label], flip)

        return frame, KnobState(dial=dial, feature_noise=eta, label_flip=rho)

    def noise_to_signal(self, eta: float) -> float:
        """Var(ε)/Var(signal) for the feature noise (05 §5.4): equals η².

        Returns 0.0 when the frame has no numeric predictors to blur.
        """
        return float(eta * eta) if self.numeric_cols else 0.0
|
|
149
|
+
|
|
150
|
+
|
|
151
|
+
def _flip_label(series: pd.Series, flip: np.ndarray) -> pd.Series:
    """Flip the selected label rows to a *different* class.

    Boolean → logical NOT; binary categorical → swap to the other category;
    k-ary categorical → rotate to the next category in sorted order
    (deterministic, always a genuine change). Missing values are left as they
    are because they have no successor category.

    Args:
        series: The label column.
        flip: Boolean mask, one entry per row, marking the rows to flip.

    Returns:
        A new Series with the same index and name as ``series``. A categorical
        input keeps its categorical dtype. When fewer than two distinct
        non-missing classes exist the input is returned unchanged.
    """
    values = series.to_numpy().copy()

    if pd.api.types.is_bool_dtype(series):
        values[flip] = ~values[flip]
        return pd.Series(values, index=series.index, name=series.name)

    categories = sorted(pd.unique(series.dropna()))
    if len(categories) < 2:
        return series  # nothing to flip into

    # Each category maps to the next one, wrapping around at the end.
    successor = {
        c: categories[(i + 1) % len(categories)] for i, c in enumerate(categories)
    }
    values[flip] = np.array(
        [successor.get(v, v) for v in values[flip]], dtype=values.dtype
    )

    # ``to_numpy`` turns a categorical column into a plain array; restore the
    # original categorical dtype so the label keeps its type after a flip.
    restored = series.dtype if isinstance(series.dtype, pd.CategoricalDtype) else None
    return pd.Series(values, index=series.index, name=series.name, dtype=restored)
|
|
@@ -0,0 +1,181 @@
|
|
|
1
|
+
"""Baseline probe models for difficulty calibration (05 §5.1, 17 step 15).
|
|
2
|
+
|
|
3
|
+
Difficulty is defined *operationally*: a dataset is "as hard as" the score a
|
|
4
|
+
standard baseline model achieves on it. A :class:`ProbeModel` trains a baseline
|
|
5
|
+
classifier on a seeded train split and reports the task metric on the holdout —
|
|
6
|
+
AUROC for binary classification. The probe never refits or "helps" the data; it
|
|
7
|
+
just measures how separable the label is from the features (honest statistics,
|
|
8
|
+
invariant #3).
|
|
9
|
+
|
|
10
|
+
scikit-learn powers the estimators. The probe metric drives the adaptive loop's
|
|
11
|
+
knob selection, so it sits on the determinism-critical path — every estimator is
|
|
12
|
+
either intrinsically deterministic (lbfgs logistic regression) or seeded
|
|
13
|
+
(decision tree, train/test split). Same `(spec_hash, seed)` on the pinned
|
|
14
|
+
environment → identical metric → identical calibrated bytes (invariant #6).
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
from abc import ABC, abstractmethod
|
|
20
|
+
from collections.abc import Mapping
|
|
21
|
+
from dataclasses import dataclass
|
|
22
|
+
from typing import Any
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
from sklearn.linear_model import LogisticRegression
|
|
27
|
+
from sklearn.metrics import accuracy_score, roc_auc_score
|
|
28
|
+
from sklearn.model_selection import train_test_split
|
|
29
|
+
from sklearn.preprocessing import StandardScaler
|
|
30
|
+
from sklearn.tree import DecisionTreeClassifier
|
|
31
|
+
|
|
32
|
+
# Object columns with more distinct values than this are treated as free-text /
|
|
33
|
+
# id-like and dropped from the probe design matrix (one-hot would explode).
|
|
34
|
+
_MAX_CATEGORY_CARDINALITY = 50
|
|
35
|
+
_TEST_SIZE = 0.3
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
@dataclass
class ProbeResult:
    """Measurements a probe produced for a single realized frame."""

    metric: float  # the task metric the band targets (AUROC for binary)
    metric_name: str  # "auroc"
    task: str  # "classification"
    linear_separability: float  # holdout accuracy of a linear probe (05 §5.4)
    class_balance: float  # fraction of the positive class
    n_features: int  # design-matrix width actually fed to the probe

    def to_dict(self) -> dict[str, Any]:
        """Return every field as a plain ``{field_name: value}`` mapping."""
        field_names = (
            "metric",
            "metric_name",
            "task",
            "linear_separability",
            "class_balance",
            "n_features",
        )
        return {field: getattr(self, field) for field in field_names}
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
class ProbeModel(ABC):
    """Abstract baseline model; its holdout score is what *defines* difficulty.

    Subclasses set ``name`` (the key the probe is looked up by) and implement
    :meth:`estimator`.
    """

    name: str
    # Optional JSON-schema fragment for probe options (09 §6); ``None`` for built-ins.
    param_schema: Mapping[str, Any] | None = None

    @abstractmethod
    def estimator(self, seed: int) -> Any:
        """Return a fresh scikit-learn classifier exposing ``predict_proba``.

        ``seed`` is offered so stochastic estimators can be made reproducible.
        """
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class LogRegProbe(ProbeModel):
    """Logistic-regression baseline probe, registered under ``"logreg"``."""

    name = "logreg"

    def estimator(self, seed: int) -> Any:
        """Return a new, unfitted ``LogisticRegression`` capped at 1000 iterations.

        ``seed`` is accepted for interface parity but not used: lbfgs is
        deterministic given the data, so no ``random_state`` is needed.
        """
        classifier = LogisticRegression(max_iter=1000)
        return classifier
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class TreeProbe(ProbeModel):
    """Decision-tree baseline probe, registered under ``"tree"``."""

    name = "tree"

    def estimator(self, seed: int) -> Any:
        """Return a new, unfitted depth-8 ``DecisionTreeClassifier`` seeded with ``seed``.

        The depth cap keeps the tree a *baseline* rather than a memorizer; the
        seed makes its fit reproducible.
        """
        classifier = DecisionTreeClassifier(max_depth=8, random_state=seed)
        return classifier
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
# Built-in probe instances, keyed by each probe's ``name`` attribute.
PROBES: dict[str, ProbeModel] = {
    probe.name: probe for probe in [LogRegProbe(), TreeProbe()]
}
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
def _encode_label(series: pd.Series) -> tuple[np.ndarray, int]:
    """Turn a classification label into integer codes; return ``(codes, n_classes)``.

    Boolean labels become 0/1 with True→1. Any other label is factorized over
    its sorted distinct values, so for a binary label the larger category is
    the positive class (code 1) — a stable convention that keeps the AUROC
    orientation reproducible.
    """
    if not pd.api.types.is_bool_dtype(series):
        codes, uniques = pd.factorize(series, sort=True)
        return codes.astype(int), len(uniques)

    as_int = series.to_numpy().astype(int)
    return as_int, 2
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def _design_matrix(frame: pd.DataFrame, label: str) -> tuple[np.ndarray, list[str]]:
    """Build a numeric feature matrix from every column except the label.

    Boolean columns are cast to float; numeric columns are cast to float and
    median-imputed; datetimes become integer ordinals with missing timestamps
    median-imputed the same way; low-cardinality object/categorical columns are
    one-hot encoded; free-text / high-cardinality columns are dropped
    (uninformative to a baseline probe).

    Args:
        frame: The realized frame, label column included.
        label: Name of the label column to leave out of the matrix.

    Returns:
        ``(matrix, column_names)``. With no usable feature the matrix has shape
        ``(len(frame), 0)`` and the name list is empty.
    """

    def _median_filled(values: pd.Series) -> pd.Series:
        # Replace NaN with the column median, or 0.0 when the column is all-NaN.
        median = values.median()
        return values.fillna(0.0 if pd.isna(median) else median)

    parts: list[pd.DataFrame] = []
    for col in frame.columns:
        if col == label:
            continue
        s = frame[col]
        if pd.api.types.is_bool_dtype(s):
            parts.append(s.astype(float).to_frame(col))
        elif pd.api.types.is_numeric_dtype(s):
            parts.append(_median_filled(s.astype(float)).to_frame(col))
        elif pd.api.types.is_datetime64_any_dtype(s):
            ordinal = s.astype("int64").astype(float)
            # The int64 cast maps NaT to a huge negative sentinel; mask those
            # rows back to NaN so they are imputed instead of treated as real.
            ordinal = ordinal.where(s.notna().to_numpy())
            parts.append(_median_filled(ordinal).to_frame(col))
        else:  # object / categorical
            if s.nunique(dropna=True) <= _MAX_CATEGORY_CARDINALITY:
                dummies = pd.get_dummies(s, prefix=col, dummy_na=False, dtype=float)
                if dummies.shape[1] > 0:
                    parts.append(dummies)
    if not parts:
        return np.empty((len(frame), 0), dtype=float), []
    design = pd.concat(parts, axis=1)
    return design.to_numpy(dtype=float), list(design.columns)
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def evaluate(
    probe: ProbeModel,
    frame: pd.DataFrame,
    label: str,
    *,
    split_seed: int,
    est_seed: int,
) -> ProbeResult:
    """Train ``probe`` on a seeded split and score it on the holdout (05 §5.1).

    Binary classification only in v0.1: returns holdout AUROC. Degenerate cases
    (no usable features, a single realized class on either side of the split)
    score at chance (0.5) — honest "no signal" rather than a misleading number
    or a crash. Rows whose label is missing are left out of the evaluation.

    Args:
        probe: Supplies the estimator that is fitted and scored.
        frame: Realized frame containing the features and the label column.
        label: Name of the label column.
        split_seed: ``random_state`` for the train/holdout split.
        est_seed: Seed handed to ``probe.estimator``.

    Returns:
        A :class:`ProbeResult` with the AUROC, a logistic-regression holdout
        accuracy, the fraction of rows encoded as class 1, and the feature count.
    """
    y, _ = _encode_label(frame[label])
    # ``pd.factorize`` encodes a missing label as -1. Such rows carry no class,
    # and a negative code would make ``np.bincount`` below raise, so drop them.
    observed = y >= 0
    if not observed.all():
        frame = frame.loc[observed]
        y = y[observed]

    X, _feat_names = _design_matrix(frame, label)
    n_features = X.shape[1]
    positive_rate = float(np.mean(y == 1)) if len(y) else 0.0
    chance = ProbeResult(0.5, "auroc", "classification", 0.5, positive_rate, n_features)

    classes = np.unique(y)
    if n_features == 0 or classes.size < 2:
        # Nothing to learn from, or label collapsed to one class → chance.
        return chance

    # Stratify only when every class can appear on both sides of the split.
    counts = np.bincount(y)
    stratify = y if counts.min() >= 2 else None
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=_TEST_SIZE, random_state=split_seed, stratify=stratify
    )
    if np.unique(y_te).size < 2 or np.unique(y_tr).size < 2:
        return chance

    # Fit the scaler on the training rows only so the holdout stays unseen.
    scaler = StandardScaler().fit(X_tr)
    X_tr_s = scaler.transform(X_tr)
    X_te_s = scaler.transform(X_te)

    model = probe.estimator(est_seed)
    model.fit(X_tr_s, y_tr)
    proba = model.predict_proba(X_te_s)[:, 1]
    auroc = float(roc_auc_score(y_te, proba))

    # Linear-separability reference (05 §5.4): a plain logistic probe's holdout
    # accuracy, regardless of which probe scored the AUROC.
    linear = LogisticRegression(max_iter=1000).fit(X_tr_s, y_tr)
    lin_acc = float(accuracy_score(y_te, linear.predict(X_te_s)))

    return ProbeResult(auroc, "auroc", "classification", lin_acc, positive_rate, n_features)
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""Distributions + honest compliance reporting."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import Distribution
|
|
6
|
+
from .builtins import (
|
|
7
|
+
REGISTRY,
|
|
8
|
+
sample_boolean,
|
|
9
|
+
sample_categorical,
|
|
10
|
+
sample_datetime,
|
|
11
|
+
sample_text,
|
|
12
|
+
)
|
|
13
|
+
from .compliance import ComplianceReport, FeatureCompliance, assess_numeric
|
|
14
|
+
from .providers import (
|
|
15
|
+
REALISTIC_GENERATORS,
|
|
16
|
+
is_realistic_generator,
|
|
17
|
+
resolve_locale,
|
|
18
|
+
sample_provider,
|
|
19
|
+
)
|
|
20
|
+
|
|
21
|
+
__all__ = [
|
|
22
|
+
"Distribution",
|
|
23
|
+
"REGISTRY",
|
|
24
|
+
"ComplianceReport",
|
|
25
|
+
"FeatureCompliance",
|
|
26
|
+
"assess_numeric",
|
|
27
|
+
"sample_boolean",
|
|
28
|
+
"sample_categorical",
|
|
29
|
+
"sample_datetime",
|
|
30
|
+
"sample_text",
|
|
31
|
+
"sample_provider",
|
|
32
|
+
"is_realistic_generator",
|
|
33
|
+
"resolve_locale",
|
|
34
|
+
"REALISTIC_GENERATORS",
|
|
35
|
+
]
|
|
@@ -0,0 +1,46 @@
|
|
|
1
|
+
"""Distribution ABC (05 §2).
|
|
2
|
+
|
|
3
|
+
A :class:`Distribution` knows how to *sample* from a target ``D(theta)`` using an
|
|
4
|
+
injected RNG, *validate* its parameters, and expose its theoretical *cdf* so the
|
|
5
|
+
compliance layer can run a KS test. Sampling is correct by construction — we
|
|
6
|
+
never refit parameters to the realized sample (05 §2.3).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from typing import Mapping
|
|
13
|
+
|
|
14
|
+
import numpy as np
|
|
15
|
+
|
|
16
|
+
from ..errors import SpecValidationError
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class Distribution(ABC):
    """Abstract numeric distribution: sampling, theoretical CDF and validation.

    Subclasses set ``name``, list their mandatory parameter keys in
    ``required_params`` and implement :meth:`sample` and :meth:`cdf`.
    """

    name: str
    required_params: tuple[str, ...] = ()
    # Optional JSON-schema fragment for the feature `params`. Built-ins leave this
    # ``None`` (the Canvas renders their native controls); plugins declare one so
    # the UI can render config controls with no frontend work (09 §6).
    param_schema: Mapping[str, object] | None = None

    @abstractmethod
    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` values from the target distribution using ``rng``."""

    @abstractmethod
    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Evaluate the theoretical CDF F(x; params) — used for KS reporting."""

    def validate(self, params: Mapping[str, float], locator: str | None = None) -> None:
        """Check that required params are present, then apply domain constraints.

        Raises:
            SpecValidationError: when a key listed in ``required_params`` is
                absent from ``params``.
        """
        missing = []
        for key in self.required_params:
            if key not in params:
                missing.append(key)
        if missing:
            raise SpecValidationError(
                f"distribution {self.name!r} missing params: {missing}", locator=locator
            )
        self._validate_domain(params, locator)

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Hook for per-distribution constraints (e.g. std > 0); no-op by default."""
        return None
|
|
@@ -0,0 +1,172 @@
|
|
|
1
|
+
"""Built-in distributions and feature samplers (04 §4, 05 §2).
|
|
2
|
+
|
|
3
|
+
Numeric distributions referenced by ``dist:`` live in :data:`REGISTRY`. Sampling
|
|
4
|
+
flows entirely through the injected ``numpy.random.Generator`` so results are
|
|
5
|
+
deterministic on the pinned path. Non-numeric feature kinds (categorical,
|
|
6
|
+
boolean, datetime, text) have dedicated samplers used by the pipeline.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from typing import Mapping
|
|
12
|
+
|
|
13
|
+
import numpy as np
|
|
14
|
+
from scipy import stats
|
|
15
|
+
|
|
16
|
+
from ..errors import SpecValidationError
|
|
17
|
+
from .base import Distribution
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def _require_positive(params: Mapping[str, float], keys: tuple[str, ...], locator: str | None) -> None:
    """Raise unless every listed param that is present is strictly positive.

    Keys missing from ``params`` are skipped; presence is checked elsewhere.

    Raises:
        SpecValidationError: for the first listed key whose value is zero,
            negative or NaN.
    """
    for k in keys:
        # ``not (v > 0)`` instead of ``v <= 0`` so that NaN, which compares
        # False against everything, is rejected rather than accepted.
        if k in params and not params[k] > 0:
            raise SpecValidationError(f"param {k!r} must be > 0", locator=locator)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class Normal(Distribution):
    """Gaussian distribution parameterized by ``mean`` and ``std``."""

    name = "normal"
    required_params = ("mean", "std")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require ``std`` to be positive."""
        _require_positive(params, ("std",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` normal variates from ``rng``."""
        mean, std = params["mean"], params["std"]
        return rng.normal(mean, std, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Normal CDF with location ``mean`` and scale ``std``."""
        mean, std = params["mean"], params["std"]
        return stats.norm.cdf(x, loc=mean, scale=std)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class LogNormal(Distribution):
    """Log-normal distribution parameterized by ``mu`` and ``sigma``."""

    name = "lognormal"
    required_params = ("mu", "sigma")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require ``sigma`` to be positive."""
        _require_positive(params, ("sigma",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` log-normal variates from ``rng``."""
        mu, sigma = params["mu"], params["sigma"]
        return rng.lognormal(mu, sigma, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Log-normal CDF: SciPy shape ``s`` is ``sigma`` and ``scale`` is ``exp(mu)``."""
        shape = params["sigma"]
        scale = np.exp(params["mu"])
        return stats.lognorm.cdf(x, s=shape, scale=scale)
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
class Poisson(Distribution):
    """Poisson distribution parameterized by the rate ``lam``."""

    name = "poisson"
    required_params = ("lam",)

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require ``lam`` to be positive."""
        _require_positive(params, ("lam",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` Poisson counts from ``rng``."""
        rate = params["lam"]
        return rng.poisson(rate, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Poisson CDF with mean ``lam``."""
        rate = params["lam"]
        return stats.poisson.cdf(x, mu=rate)
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
class Pareto(Distribution):
    """Classical (type I) Pareto with shape ``alpha`` and minimum ``xm``."""

    name = "pareto"
    required_params = ("alpha", "xm")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require both ``alpha`` and ``xm`` to be positive."""
        _require_positive(params, ("alpha", "xm"), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` Pareto I variates from ``rng``."""
        # numpy draws Lomax (Pareto II); classical Pareto I = (lomax + 1) * xm.
        lomax = rng.pareto(params["alpha"], size=n)
        return (lomax + 1.0) * params["xm"]

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Pareto CDF: SciPy shape ``b`` is ``alpha`` and ``scale`` is ``xm``."""
        shape, minimum = params["alpha"], params["xm"]
        return stats.pareto.cdf(x, b=shape, scale=minimum)
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class Uniform(Distribution):
    """Continuous uniform distribution between ``low`` and ``high``."""

    name = "uniform"
    required_params = ("low", "high")

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require ``low`` to be strictly below ``high``.

        Raises:
            SpecValidationError: when the bounds are equal, reversed, or either
                one is NaN.
        """
        # ``not (low < high)`` instead of ``low >= high`` so a NaN bound, which
        # compares False against everything, is rejected rather than accepted.
        if not params["low"] < params["high"]:
            raise SpecValidationError("uniform requires low < high", locator=locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` uniform variates from ``rng``."""
        return rng.uniform(params["low"], params["high"], size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Uniform CDF: SciPy takes ``loc=low`` and ``scale=high - low``."""
        return stats.uniform.cdf(x, loc=params["low"], scale=params["high"] - params["low"])
|
|
96
|
+
|
|
97
|
+
|
|
98
|
+
class Exponential(Distribution):
    """Exponential distribution parameterized by ``scale``."""

    name = "exponential"
    required_params = ("scale",)

    def _validate_domain(self, params: Mapping[str, float], locator: str | None) -> None:
        """Require ``scale`` to be positive."""
        _require_positive(params, ("scale",), locator)

    def sample(self, rng: np.random.Generator, n: int, params: Mapping[str, float]) -> np.ndarray:
        """Draw ``n`` exponential variates from ``rng``."""
        scale = params["scale"]
        return rng.exponential(scale, size=n)

    def cdf(self, x: np.ndarray, params: Mapping[str, float]) -> np.ndarray:
        """Exponential CDF with the given ``scale``."""
        scale = params["scale"]
        return stats.expon.cdf(x, scale=scale)
|
|
110
|
+
|
|
111
|
+
|
|
112
|
+
# One shared instance of every built-in numeric distribution, keyed by ``name``.
_BUILTIN_DISTRIBUTIONS = [
    Normal(),
    LogNormal(),
    Poisson(),
    Pareto(),
    Uniform(),
    Exponential(),
]
REGISTRY: dict[str, Distribution] = {
    dist.name: dist for dist in _BUILTIN_DISTRIBUTIONS
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
# --- Non-numeric feature samplers ----------------------------------------------------
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def sample_categorical(
    rng: np.random.Generator, n: int, categories: list[str], weights: list[float] | None
) -> np.ndarray:
    """Draw ``n`` categories, uniformly or in proportion to ``weights``.

    ``weights`` are normalized by their sum before use; ``None`` means every
    category is equally likely. The result is an object-dtype array.
    """
    if weights is not None:
        total = float(sum(weights))
        probs = [w / total for w in weights]
    else:
        probs = None
    pool = np.asarray(categories, dtype=object)
    return rng.choice(pool, size=n, p=probs)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
def sample_boolean(rng: np.random.Generator, n: int, rate: float) -> np.ndarray:
    """Draw ``n`` booleans, each True when a uniform [0, 1) draw falls below ``rate``."""
    uniform_draws = rng.random(size=n)
    return uniform_draws < rate
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def sample_datetime(
    rng: np.random.Generator,
    n: int,
    start: str,
    end: str,
    granularity: str,
) -> np.ndarray:
    """Draw ``n`` timestamps uniformly from [start, end], stepped by ``granularity``.

    ``granularity`` is one of ``"second"``, ``"minute"``, ``"hour"`` or
    ``"day"``; both endpoints are included.

    Raises:
        SpecValidationError: when ``end`` falls before ``start``.
    """
    unit_codes = {"second": "s", "minute": "m", "hour": "h", "day": "D"}
    unit = unit_codes[granularity]
    # Expressing both endpoints in the chosen unit makes their difference a
    # whole number of steps — no unit mismatch, fully deterministic.
    stamp_dtype = f"datetime64[{unit}]"
    first = np.datetime64(start).astype(stamp_dtype)
    last = np.datetime64(end).astype(stamp_dtype)
    span_steps = int((last - first).astype("int64"))
    if span_steps < 0:
        raise SpecValidationError("datetime end must be >= start")
    steps = rng.integers(0, span_steps + 1, size=n)
    return first + steps.astype(f"timedelta64[{unit}]")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
# Word pool for placeholder text: the lorem-ipsum passage, one entry per word.
_LOREM = [
    word
    for line in (
        "lorem ipsum dolor sit amet consectetur adipiscing elit sed do eiusmod tempor ",
        "incididunt ut labore et dolore magna aliqua ut enim ad minim veniam quis nostrud ",
        "exercitation ullamco laboris nisi aliquip ex ea commodo consequat",
    )
    for word in line.split()
]


def sample_text(
    rng: np.random.Generator, n: int, min_len: int, max_len: int
) -> np.ndarray:
    """Build ``n`` strings of lorem-ipsum words joined by single spaces.

    Each string gets a word count drawn uniformly from [min_len, max_len];
    the words themselves are drawn with replacement from ``_LOREM``. All
    counts are drawn first, then the words row by row.
    """
    vocabulary = np.asarray(_LOREM, dtype=object)
    word_counts = rng.integers(min_len, max_len + 1, size=n)
    texts = np.empty(n, dtype=object)
    for row, count in enumerate(word_counts):
        picks = rng.integers(0, len(vocabulary), size=int(count))
        texts[row] = " ".join(vocabulary[picks])
    return texts
|