datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,344 @@
|
|
|
1
|
+
"""Honest statistical compliance reporting (05 §2.2-2.3, §7).
|
|
2
|
+
|
|
3
|
+
We report how well the realized sample matches the *requested* distribution. We
|
|
4
|
+
deliberately do NOT refit parameters to the sample: the ~alpha fraction of
|
|
5
|
+
"failures" at significance alpha is expected sampling variance, not a defect.
|
|
6
|
+
Refitting would make the data match itself rather than the user's request.
|
|
7
|
+
|
|
8
|
+
**Two complementary tests, picked by feature shape.**
|
|
9
|
+
|
|
10
|
+
*Continuous, untransformed targets* → a one-sample **Kolmogorov-Smirnov** test
|
|
11
|
+
against the requested CDF. This is the right tool when the realized data really
|
|
12
|
+
is a clean draw from a continuous distribution (e.g. ``normal``/``lognormal``
|
|
13
|
+
with ``dtype: float`` and no clamping).
|
|
14
|
+
|
|
15
|
+
*Integer, discrete, or clamped targets* → a **chi-square goodness-of-fit** test
|
|
16
|
+
against the **effective** PMF (the distribution actually realized after the
|
|
17
|
+
transform). A KS test is invalid here: ``dtype: int`` discretizes a continuous
|
|
18
|
+
draw, a discrete distribution (poisson) lives on the integers, and ``min``/
|
|
19
|
+
``max`` clamping piles point masses at the bounds — so the realized data is no
|
|
20
|
+
longer a clean draw from the continuous CDF, and at large *n* a KS test rejects
|
|
21
|
+
on the *transform artifact*, not on any defect. The GoF test instead compares
|
|
22
|
+
binned counts to the effective PMF, where the end bins absorb the (possibly
|
|
23
|
+
clamped) tail mass:
|
|
24
|
+
|
|
25
|
+
* interior integer bin ``k`` → ``P = F(k + ½) − F(k − ½)``
|
|
26
|
+
* min bin → ``P = F(kmin + ½)`` (absorbs the lower tail)
|
|
27
|
+
* max bin → ``P = 1 − F(kmax − ½)`` (absorbs the upper tail)
|
|
28
|
+
|
|
29
|
+
For a discrete CDF the ``±½`` edges coincide with the integer steps, so the same
|
|
30
|
+
formula yields the exact PMF (``F(k) − F(k−1)``). Bins whose expected count falls
|
|
31
|
+
below :data:`MIN_EXPECTED_COUNT` are merged with a neighbour (Cochran's rule) so
|
|
32
|
+
the chi-square approximation holds. Degrees of freedom are ``bins − 1`` — we
|
|
33
|
+
subtract **nothing** for fitted parameters because the parameters come from the
|
|
34
|
+
spec, not from the data.
|
|
35
|
+
|
|
36
|
+
This turns the previous honest *abstention* (``applicable: False``, scored
|
|
37
|
+
``n/a``) into an actual validated pass/fail for the most common real-world
|
|
38
|
+
feature shapes — ages, counts, bounded scores — while never penalizing a correct
|
|
39
|
+
generator for a transform we deliberately applied.
|
|
40
|
+
"""
|
|
41
|
+
|
|
42
|
+
from __future__ import annotations
|
|
43
|
+
|
|
44
|
+
from dataclasses import dataclass, field
|
|
45
|
+
from typing import Any
|
|
46
|
+
|
|
47
|
+
import numpy as np
|
|
48
|
+
from scipy import stats
|
|
49
|
+
|
|
50
|
+
from .base import Distribution
|
|
51
|
+
from .builtins import REGISTRY
|
|
52
|
+
|
|
53
|
+
# Significance level used by ``assess_numeric`` when the caller passes none.
DEFAULT_ALPHA = 0.05

# Distributions whose support is discrete: a continuous KS test does not apply,
# so these are always routed to the chi-square goodness-of-fit path.
DISCRETE_DISTS = {"poisson"}

# Cochran's rule of thumb: every chi-square cell needs an expected count at or
# above this value. Sparser bins are merged with a neighbour so the asymptotic
# chi-square distribution remains a usable approximation.
MIN_EXPECTED_COUNT = 5.0

# Number of equal-width interior bins for the clamped-continuous
# goodness-of-fit test (before sparse-bin merging).
_CONTINUOUS_INTERIOR_BINS = 24

# Guard: refuse to enumerate an absurd integer range (degenerate spec). The
# integer goodness-of-fit test abstains when the observed span exceeds this.
_MAX_INTEGER_BINS = 200_000
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
@dataclass
class FeatureCompliance:
    """Result of checking one realized feature against its requested distribution.

    ``p_value`` and ``passed`` belong to the test named by ``test``; the KS
    statistic is stored alongside them whichever test produced the verdict.
    """

    feature: str
    dist: str
    target_params: dict[str, float]
    empirical: dict[str, float]
    ks_statistic: float
    p_value: float
    passed: bool | None
    clamped_fraction: float = 0.0
    applicable: bool = True
    note: str | None = None
    # Test behind ``p_value``/``passed``: "ks", "chi2_gof", or "none" (no valid
    # test could be formed, i.e. an honest abstention).
    test: str = "ks"
    # Chi-square goodness-of-fit detail; set only when ``test == "chi2_gof"``.
    gof: dict[str, float] | None = None

    def to_dict(self) -> dict[str, Any]:
        """Return the record as a plain dict keyed by field name.

        Values are passed through as-is (no copies), in declaration order.
        """
        exported = (
            "feature",
            "dist",
            "target_params",
            "empirical",
            "ks_statistic",
            "p_value",
            "passed",
            "clamped_fraction",
            "applicable",
            "note",
            "test",
            "gof",
        )
        return {key: getattr(self, key) for key in exported}
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
@dataclass
class ComplianceReport:
    """Per-feature compliance results gathered at significance level ``alpha``."""

    alpha: float
    features: list[FeatureCompliance] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Share of *assessable* features whose fit test passed.

        Only features flagged ``applicable`` count: a feature for which no valid
        test could be formed abstains and is left out, so a correct generator is
        never penalized for it. When nothing is assessable there is nothing to
        contradict and the score is 1.0.
        """
        assessable = 0
        passing = 0
        for item in self.features:
            if not item.applicable:
                continue
            assessable += 1
            if item.passed:
                passing += 1
        if assessable == 0:
            return 1.0
        return passing / assessable

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report, including the score and every feature record."""
        applicable_count = len([item for item in self.features if item.applicable])
        rows = [item.to_dict() for item in self.features]
        return {
            "alpha": self.alpha,
            "compliance_score": self.score,
            "applicable_features": applicable_count,
            "assessed_features": len(self.features),
            "features": rows,
        }
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
def _needs_gof(dist_name: str, dtype: str, clamped_fraction: float) -> tuple[bool, list[str]]:
    """Report whether a continuous KS test is invalid for this feature.

    Returns ``(needs_gof, reasons)``. Each reason names one transform that
    breaks the KS assumptions; an empty list means KS is valid.
    """
    checks = (
        (dist_name in DISCRETE_DISTS, "discrete distribution"),
        (dtype == "int", "integer discretization"),
        (clamped_fraction > 0, f"clamping ({clamped_fraction:.1%})"),
    )
    reasons: list[str] = [label for triggered, label in checks if triggered]
    return bool(reasons), reasons
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _merge_sparse_bins(
    expected_p: np.ndarray, observed: np.ndarray, n: int
) -> tuple[np.ndarray, np.ndarray]:
    """Pool adjacent bins, scanning left to right, until every pooled bin has an
    expected count of at least :data:`MIN_EXPECTED_COUNT`.

    A sparse run left over at the right end is folded into the last closed
    group (or becomes the only group when none was closed). The result depends
    only on the inputs.
    """
    merged_prob: list[float] = []
    merged_obs: list[float] = []
    acc_prob = 0.0
    acc_obs = 0.0
    for prob, count in zip(expected_p.tolist(), observed.tolist()):
        acc_prob += prob
        acc_obs += count
        if not (acc_prob * n >= MIN_EXPECTED_COUNT):
            continue  # still too sparse: keep accumulating
        merged_prob.append(acc_prob)
        merged_obs.append(acc_obs)
        acc_prob = 0.0
        acc_obs = 0.0

    has_remainder = acc_prob > 0 or acc_obs > 0
    if has_remainder and merged_prob:
        merged_prob[-1] += acc_prob
        merged_obs[-1] += acc_obs
    elif has_remainder:
        merged_prob.append(acc_prob)
        merged_obs.append(acc_obs)
    return np.asarray(merged_prob, dtype=float), np.asarray(merged_obs, dtype=float)
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def _chi_square(expected_p: np.ndarray, observed: np.ndarray, n: int) -> dict[str, float] | None:
    """Chi-square goodness-of-fit on pre-binned (expected prob, observed count).

    Expected probabilities are renormalized, sparse bins are merged, and the
    statistic ``Σ (O − E)² / E`` is referred to a chi-square distribution with
    ``bins − 1`` degrees of freedom (no parameters were fit). Returns ``None``
    when the expected mass is not positive or fewer than two bins remain.
    """
    mass = float(expected_p.sum())
    if mass <= 0:
        return None

    # Dividing by the total guards against tiny float drift so Σp == 1.
    probs, counts = _merge_sparse_bins(expected_p / mass, observed, n)
    if probs.size < 2:
        return None

    expected = probs * n
    residuals = counts - expected
    statistic = float(np.sum(residuals ** 2 / expected))
    dof = int(probs.size - 1)
    return {
        "statistic": statistic,
        "dof": float(dof),
        "bins": float(probs.size),
        "p_value": float(stats.chi2.sf(statistic, dof)),
    }
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def _integer_gof(dist: Distribution, params: dict[str, float], data: np.ndarray, n: int) -> dict[str, float] | None:
    """Goodness-of-fit for an integer-valued target (int dtype or discrete dist).

    One bin per integer between the observed minimum and maximum; bin ``k``
    gets ``F(k + ½) − F(k − ½)``, with the two end bins absorbing the tails.
    Returns ``None`` when the observed span exceeds ``_MAX_INTEGER_BINS``.
    """
    rounded = np.rint(data).astype(np.int64)
    lowest, highest = int(rounded.min()), int(rounded.max())
    if highest - lowest + 1 > _MAX_INTEGER_BINS:
        return None

    support = np.arange(lowest, highest + 1)
    counts = np.bincount(rounded - lowest, minlength=support.size).astype(float)

    cdf_above = np.asarray(dist.cdf(support + 0.5, params), dtype=float)  # F(k + ½)
    cdf_below = np.asarray(dist.cdf(support - 0.5, params), dtype=float)  # F(k − ½)
    probs = cdf_above - cdf_below
    probs[0] = cdf_above[0]  # min bin absorbs the lower tail
    probs[-1] = 1.0 - cdf_below[-1]  # max bin absorbs the upper tail
    return _chi_square(np.clip(probs, 0.0, None), counts, n)
|
|
215
|
+
|
|
216
|
+
|
|
217
|
+
def _clamped_continuous_gof(
    dist: Distribution,
    params: dict[str, float],
    data: np.ndarray,
    n: int,
    clamp_min: float | None,
    clamp_max: float | None,
) -> dict[str, float] | None:
    """GoF for a continuous (float) target whose only transform is clamping.

    Clamping turns the bounds into point masses: ``P(min) = F(min)`` and
    ``P(max) = 1 − F(max)``. The open interior is split into
    ``_CONTINUOUS_INTERIOR_BINS`` equal-width bins. On a side with no clamp the
    interior runs to the sample extreme and the outermost interior bin absorbs
    that open tail.

    Args:
        dist: Distribution object exposing ``cdf(x, params)``.
        params: Requested distribution parameters, passed through to ``cdf``.
        data: Realized sample as a float array.
        n: Sample size used to turn probabilities into expected counts.
        clamp_min: Lower clamp bound, or ``None`` when the lower side is open.
        clamp_max: Upper clamp bound, or ``None`` when the upper side is open.

    Returns:
        The chi-square result dict, or ``None`` when the interior is empty
        (``hi <= lo``) or the chi-square step cannot form a valid test.
    """
    bins_p: list[float] = []
    bins_o: list[float] = []

    interior_lo = clamp_min if clamp_min is not None else float(data.min())
    interior_hi = clamp_max if clamp_max is not None else float(data.max())
    if not interior_hi > interior_lo:
        return None

    if clamp_min is not None:  # lower point mass P(min) = F(min)
        bins_p.append(float(dist.cdf(np.asarray([clamp_min]), params)[0]))
        bins_o.append(float(np.count_nonzero(data <= clamp_min)))

    edges = np.linspace(interior_lo, interior_hi, _CONTINUOUS_INTERIOR_BINS + 1)
    cdf_edges = np.asarray(dist.cdf(edges, params), dtype=float)
    if clamp_min is None:
        cdf_edges[0] = 0.0  # bottom interior bin absorbs the open lower tail
    if clamp_max is None:
        cdf_edges[-1] = 1.0  # top interior bin absorbs the open upper tail

    # Observations sitting exactly on a clamp bound are already counted in that
    # bound's point-mass bin, so they are excluded from the interior. With no
    # lower clamp the lower edge is the sample minimum itself: it has no
    # point-mass bin and must stay in the first interior bin, otherwise it is
    # counted nowhere and the observed total falls short of ``n``.
    if clamp_min is not None:
        interior_mask = data > interior_lo
    else:
        interior_mask = data >= interior_lo
    if clamp_max is not None:
        interior_mask &= data < interior_hi
    interior_o = np.histogram(data[interior_mask], bins=edges)[0].astype(float)
    bins_p.extend(np.diff(cdf_edges).tolist())
    bins_o.extend(interior_o.tolist())

    if clamp_max is not None:  # upper point mass P(max) = 1 − F(max)
        bins_p.append(1.0 - float(dist.cdf(np.asarray([clamp_max]), params)[0]))
        bins_o.append(float(np.count_nonzero(data >= clamp_max)))

    expected_p = np.clip(np.asarray(bins_p, dtype=float), 0.0, None)
    observed = np.asarray(bins_o, dtype=float)
    return _chi_square(expected_p, observed, n)
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
def assess_numeric(
    feature: str,
    dist_name: str,
    params: dict[str, float],
    values: np.ndarray,
    clamped_fraction: float = 0.0,
    alpha: float = DEFAULT_ALPHA,
    dtype: str = "float",
    clamp_min: float | None = None,
    clamp_max: float | None = None,
) -> FeatureCompliance:
    """Judge a realized numeric sample against the distribution the spec asked for.

    A continuous, untransformed target is decided by a one-sample KS test. An
    integer, discrete, or clamped target is decided by a chi-square
    goodness-of-fit against the effective PMF; when no such test can be formed
    the feature abstains (``applicable=False``, ``passed=None``). The KS
    statistic is reported in every case for transparency.
    """
    dist = REGISTRY[dist_name]
    data = np.asarray(values, dtype=float)

    ks_stat, ks_p = stats.kstest(data, lambda x: dist.cdf(x, params))
    spread = float(np.std(data, ddof=1)) if data.size > 1 else 0.0
    empirical = {
        "mean": float(np.mean(data)),
        "std": spread,
        "min": float(np.min(data)),
        "max": float(np.max(data)),
    }

    needs_gof, reasons = _needs_gof(dist_name, dtype, clamped_fraction)

    # Start from the KS verdict; the goodness-of-fit branch overrides it.
    p_value = float(ks_p)
    passed: bool | None = bool(ks_p > alpha)
    applicable = True
    test = "ks"
    note: str | None = None
    gof: dict[str, float] | None = None

    if needs_gof:
        why = ", ".join(reasons)
        # KS is not a valid signal here; test against the effective
        # (discretized/clamped) PMF instead.
        integer_valued = dtype == "int" or dist_name in DISCRETE_DISTS
        if integer_valued:
            gof = _integer_gof(dist, params, data, data.size)
        else:  # continuous float whose only transform is clamping
            gof = _clamped_continuous_gof(dist, params, data, data.size, clamp_min, clamp_max)

        if gof is not None:
            p_value = float(gof["p_value"])
            passed = bool(gof["p_value"] > alpha)
            test = "chi2_gof"
            note = (
                f"chi-square goodness-of-fit vs the effective PMF "
                f"({int(gof['bins'])} bins, dof {int(gof['dof'])}); "
                f"KS not applicable ({why})"
            )
        else:
            # No valid test could be formed (near-constant / too few bins):
            # abstain honestly rather than emit a meaningless verdict.
            passed = None
            applicable = False
            test = "none"
            note = (
                f"continuous KS not applicable ({why}); goodness-of-fit "
                "abstained (too few distinct values to bin)"
            )

    return FeatureCompliance(
        feature=feature,
        dist=dist_name,
        target_params=dict(params),
        empirical=empirical,
        ks_statistic=float(ks_stat),
        p_value=p_value,
        passed=passed,
        clamped_fraction=float(clamped_fraction),
        applicable=applicable,
        note=note,
        test=test,
        gof=gof,
    )
|
|
@@ -0,0 +1,117 @@
|
|
|
1
|
+
"""Realistic text providers (names, emails, addresses, …) backed by *mimesis*.
|
|
2
|
+
|
|
3
|
+
The default ``text`` generator is ``lorem`` (see :func:`sample_text`), which emits
|
|
4
|
+
filler words. These providers make text *genuine-looking* — ``"Maria Alvarez"``
|
|
5
|
+
instead of ``"lorem ipsum dolor"`` — **without sacrificing determinism**, which is
|
|
6
|
+
DataDoom's headline guarantee (CLAUDE.md invariant #1).
|
|
7
|
+
|
|
8
|
+
How determinism is preserved
|
|
9
|
+
----------------------------
|
|
10
|
+
mimesis is a pure, offline library that draws from an *isolated*, seeded
|
|
11
|
+
``random.Random`` instance — it never touches global random state. We seed that
|
|
12
|
+
instance from the feature's own ``numpy`` generator (which is itself keyed by
|
|
13
|
+
``sha256(spec_hash || seed || namespace)``), so:
|
|
14
|
+
|
|
15
|
+
* the same ``(spec_hash, seed)`` reproduces byte-identical text, and
|
|
16
|
+
* each feature draws from an independent stream — adding one never perturbs
|
|
17
|
+
another (invariant #1).
|
|
18
|
+
|
|
19
|
+
Byte-reproducibility holds *on the pinned path*: a different mimesis version may
|
|
20
|
+
emit different strings for the same seed, exactly like the numpy pin for numeric
|
|
21
|
+
draws (invariant #6). mimesis is therefore a pinned core dependency.
|
|
22
|
+
"""
|
|
23
|
+
|
|
24
|
+
from __future__ import annotations
|
|
25
|
+
|
|
26
|
+
from typing import TYPE_CHECKING, Callable
|
|
27
|
+
|
|
28
|
+
import numpy as np
|
|
29
|
+
|
|
30
|
+
from ..errors import SpecValidationError
|
|
31
|
+
|
|
32
|
+
if TYPE_CHECKING:
|
|
33
|
+
from mimesis import Generic
|
|
34
|
+
from mimesis.locales import Locale
|
|
35
|
+
|
|
36
|
+
# A curated provider catalog. Each entry maps a spec ``generator`` key to a
# callable that pulls one value from a seeded mimesis facade. Keep the keys
# stable: they are part of the spec surface (invariant #5, additive only).
# Every callable takes the facade as its only argument and makes exactly one
# provider call on it.
_PROVIDERS: dict[str, Callable[[Generic], object]] = {
    # people
    "name": lambda g: g.person.full_name(),
    "first_name": lambda g: g.person.first_name(),
    "last_name": lambda g: g.person.last_name(),
    "email": lambda g: g.person.email(),
    "username": lambda g: g.person.username(),
    "phone": lambda g: g.person.phone_number(),
    "occupation": lambda g: g.person.occupation(),
    "title": lambda g: g.person.title(),
    "nationality": lambda g: g.person.nationality(),
    # places
    "address": lambda g: g.address.address(),
    "street": lambda g: g.address.street_name(),
    "city": lambda g: g.address.city(),
    "state": lambda g: g.address.state(),
    "country": lambda g: g.address.country(),
    "postal_code": lambda g: g.address.postal_code(),
    # business / finance
    "company": lambda g: g.finance.company(),
    "currency": lambda g: g.finance.currency_iso_code(),
    "price": lambda g: g.finance.price(),
    # internet
    "url": lambda g: g.internet.url(),
    "hostname": lambda g: g.internet.hostname(),
    "ipv4": lambda g: g.internet.ip_v4(),
    # generic text
    "word": lambda g: g.text.word(),
    "sentence": lambda g: g.text.sentence(),
    "color": lambda g: g.text.color(),
}

#: Generator keys served by mimesis (``lorem`` is handled by ``sample_text``).
#: Frozen snapshot of the catalog keys taken at import time.
REALISTIC_GENERATORS: frozenset[str] = frozenset(_PROVIDERS)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def is_realistic_generator(name: str) -> bool:
    """Tell whether ``name`` is one of the mimesis-backed provider keys."""
    known = name in _PROVIDERS
    return known
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def resolve_locale(locale: str, *, locator: str | None = None) -> "Locale":
    """Translate a spec locale string (e.g. ``"en"``) into a mimesis ``Locale``.

    Raises ``SpecValidationError`` (carrying ``locator``) listing the known
    locale values when ``locale`` is not one of them.
    """
    from mimesis.locales import Locale

    try:
        member = Locale(locale)
    except ValueError as exc:
        valid = sorted(item.value for item in Locale)
        message = f"unknown locale {locale!r} (known: {valid})"
        raise SpecValidationError(message, locator=locator) from exc
    return member
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def sample_provider(
    rng: np.random.Generator, n: int, generator: str, locale: str = "en"
) -> np.ndarray:
    """Produce ``n`` realistic strings for ``generator``, seeded from ``rng``.

    One integer in ``[0, 2**32)`` is pulled off ``rng`` and used to seed the
    mimesis facade, so the output follows from the state of ``rng`` alone.
    Raises ``SpecValidationError`` for an unknown ``generator`` or ``locale``.
    """
    if generator not in _PROVIDERS:
        known = sorted(REALISTIC_GENERATORS) + ["lorem"]
        raise SpecValidationError(
            f"unknown text generator {generator!r} "
            f"(known: {known})"
        )
    draw = _PROVIDERS[generator]

    from mimesis import Generic

    # Resolve the locale before touching ``rng`` so a bad locale does not
    # advance the stream.
    resolved = resolve_locale(locale)
    facade = Generic(locale=resolved, seed=int(rng.integers(0, 2**32)))

    values = np.empty(n, dtype=object)
    for slot in range(n):
        values[slot] = str(draw(facade))
    return values
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
"""Engine-level exceptions.
|
|
2
|
+
|
|
3
|
+
These are framework-free and carry a ``locator`` so the UI/CLI can point the
|
|
4
|
+
user at the offending control (a feature name, an edge, a list index).
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class DataDoomError(Exception):
    """Base class for all engine errors."""


class SpecValidationError(DataDoomError):
    """A spec failed structural or cross-field validation.

    ``locator`` identifies *where* the problem is (e.g. ``features.age.params``,
    ``causal.edges[2]``) so a caller can highlight it. ``message`` holds the
    bare description; ``str(exc)`` is ``"<locator>: <message>"`` when a locator
    is given and just ``message`` otherwise.
    """

    def __init__(self, message: str, locator: str | None = None) -> None:
        self.locator = locator
        self.message = message
        super().__init__(f"{locator}: {message}" if locator else message)

    def __reduce__(self):
        # ``args`` holds only the pre-formatted string, so the default
        # pickle/copy path would rebuild the error with the prefixed text as
        # ``message`` and no ``locator``. Rebuild from the original arguments.
        return (type(self), (self.message, self.locator), dict(self.__dict__))


class DistributionError(DataDoomError):
    """An unknown distribution or invalid distribution parameters."""


class ReproducibilityError(DataDoomError):
    """A determinism/verification check failed (checksum mismatch)."""
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
"""Export adapters + metadata + checksums."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .base import ArtifactInfo, Exporter
|
|
6
|
+
from .checksums import sha256_bytes, sha256_file
|
|
7
|
+
from .csv_exporter import CsvExporter
|
|
8
|
+
from .json_exporter import JsonExporter
|
|
9
|
+
from .metadata import build_metadata, write_metadata
|
|
10
|
+
from .parquet_exporter import ParquetExporter
|
|
11
|
+
|
|
12
|
+
# Registry of the built-in exporters, one shared instance each, keyed by the
# exporter's own ``format`` attribute.
EXPORTERS: dict[str, Exporter] = {
    exporter.format: exporter
    for exporter in (CsvExporter(), JsonExporter(), ParquetExporter())
}

# Names re-exported as the public surface of this package.
__all__ = [
    "ArtifactInfo",
    "Exporter",
    "CsvExporter",
    "JsonExporter",
    "ParquetExporter",
    "EXPORTERS",
    "sha256_bytes",
    "sha256_file",
    "build_metadata",
    "write_metadata",
]
|
|
@@ -0,0 +1,49 @@
|
|
|
1
|
+
"""Exporter ABC (04 §8).
|
|
2
|
+
|
|
3
|
+
An exporter serializes a frame to a byte-stable file and returns its checksum
|
|
4
|
+
metadata. Byte-stability is essential: the same ``(spec_hash, seed)`` must yield
|
|
5
|
+
identical file bytes on the pinned path.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
from collections.abc import Mapping
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass
class ArtifactInfo:
    """Checksum metadata describing one written artifact file."""

    path: str
    format: str
    checksum_sha256: str
    size_bytes: int
    version: str = "clean"

    def to_dict(self) -> dict[str, object]:
        """Return the record as a plain dict keyed by field name."""
        exported = ("path", "format", "checksum_sha256", "size_bytes", "version")
        return {key: getattr(self, key) for key in exported}
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class Exporter(ABC):
    """Abstract base for writers that serialize a frame to an artifact file."""

    format: str
    # Artifact file extension; an empty value means "fall back to ``format``".
    extension: str = ""
    # Optional JSON-schema fragment for exporter options (09 §6); ``None`` for built-ins.
    param_schema: Mapping[str, object] | None = None

    @property
    def ext(self) -> str:
        """File extension to use: ``extension`` when set, otherwise ``format``."""
        if self.extension:
            return self.extension
        return self.format

    @abstractmethod
    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` to ``path`` deterministically and return its info."""
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""SHA256 helpers — the bitwise-reproducibility anchor (05 §8)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import hashlib
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def sha256_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of ``data``."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def sha256_file(path: str | Path, chunk_size: int = 1 << 20) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is opened in binary mode and fed to the hash in pieces of at
    most ``chunk_size`` bytes (1 MiB by default), so it is never held in
    memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        # An empty read marks end of file and ends the loop.
        while block := stream.read(chunk_size):
            digest.update(block)
    return digest.hexdigest()
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
"""Byte-stable CSV writer (17 step 4).
|
|
2
|
+
|
|
3
|
+
We render to a string with an explicit ``\\n`` line terminator and write raw
|
|
4
|
+
UTF-8 bytes ourselves, bypassing OS-specific newline translation so the output
|
|
5
|
+
is identical on Windows, macOS and Linux. Column order is fixed by the caller.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from .base import ArtifactInfo, Exporter
|
|
15
|
+
from .checksums import sha256_bytes
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class CsvExporter(Exporter):
    """Exporter that writes a frame as UTF-8 CSV with ``\\n`` line endings."""

    format = "csv"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Serialize ``df`` to CSV at ``path`` and return its checksum metadata.

        The index is omitted. Missing parent directories are created. The
        checksum and size are computed from the exact bytes written.
        """
        payload = df.to_csv(index=False, lineterminator="\n").encode("utf-8")
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Binary write => no newline translation; bytes are exactly `payload`.
        target.write_bytes(payload)
        return ArtifactInfo(
            path=str(target),
            format=self.format,
            checksum_sha256=sha256_bytes(payload),
            size_bytes=len(payload),
        )
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Byte-stable JSON writer (17 step 18, 09 §8).
|
|
2
|
+
|
|
3
|
+
Emits a records array (``[{col: value, …}, …]``) in stable column order, with no
|
|
4
|
+
timestamps/ambient state, so the same ``(spec_hash, seed)`` yields identical bytes
|
|
5
|
+
on the pinned path (invariant #6). Values are normalized so the output round-trips
|
|
6
|
+
through ``pandas.read_json(orient="records")``: numpy scalars become Python
|
|
7
|
+
scalars, ``NaN``/``NaT`` become ``null``, and datetimes become ISO-8601 strings.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import math
|
|
14
|
+
from pathlib import Path
|
|
15
|
+
from typing import Any
|
|
16
|
+
|
|
17
|
+
import numpy as np
|
|
18
|
+
import pandas as pd
|
|
19
|
+
|
|
20
|
+
from .base import ArtifactInfo, Exporter
|
|
21
|
+
from .checksums import sha256_bytes
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def _normalize(value: Any) -> Any:
|
|
25
|
+
if value is None:
|
|
26
|
+
return None
|
|
27
|
+
if isinstance(value, float) and math.isnan(value):
|
|
28
|
+
return None
|
|
29
|
+
if isinstance(value, np.floating):
|
|
30
|
+
f = float(value)
|
|
31
|
+
return None if math.isnan(f) else f
|
|
32
|
+
if isinstance(value, np.integer):
|
|
33
|
+
return int(value)
|
|
34
|
+
if isinstance(value, np.bool_):
|
|
35
|
+
return bool(value)
|
|
36
|
+
if isinstance(value, (pd.Timestamp,)):
|
|
37
|
+
return None if pd.isna(value) else value.isoformat()
|
|
38
|
+
if value is pd.NaT:
|
|
39
|
+
return None
|
|
40
|
+
if isinstance(value, np.datetime64):
|
|
41
|
+
ts = pd.Timestamp(value)
|
|
42
|
+
return None if pd.isna(ts) else ts.isoformat()
|
|
43
|
+
return value
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class JsonExporter(Exporter):
    """Exporter that writes a frame as a compact UTF-8 JSON records array."""

    format = "json"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Serialize ``df`` as JSON records at ``path`` and return its metadata.

        Each row becomes one object keyed by column, in the frame's column
        order, with every cell passed through ``_normalize``. Missing parent
        directories are created. The checksum and size are computed from the
        exact bytes written.
        """
        names = list(df.columns)
        rows = []
        for cells in df.itertuples(index=False, name=None):
            # strict=True: a row whose width differs from the header is an error.
            rows.append(dict(zip(names, map(_normalize, cells), strict=True)))
        # Compact + sorted-by-column-order; LF newlines only (json never emits CRLF).
        payload = json.dumps(rows, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload)
        return ArtifactInfo(
            path=str(target),
            format=self.format,
            checksum_sha256=sha256_bytes(payload),
            size_bytes=len(payload),
        )
|