datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,344 @@
1
+ """Honest statistical compliance reporting (05 §2.2-2.3, §7).
2
+
3
+ We report how well the realized sample matches the *requested* distribution. We
4
+ deliberately do NOT refit parameters to the sample: the ~alpha fraction of
5
+ "failures" at significance alpha is expected sampling variance, not a defect.
6
+ Refitting would make the data match itself rather than the user's request.
7
+
8
+ **Two complementary tests, picked by feature shape.**
9
+
10
+ *Continuous, untransformed targets* → a one-sample **Kolmogorov-Smirnov** test
11
+ against the requested CDF. This is the right tool when the realized data really
12
+ is a clean draw from a continuous distribution (e.g. ``normal``/``lognormal``
13
+ with ``dtype: float`` and no clamping).
14
+
15
+ *Integer, discrete, or clamped targets* → a **chi-square goodness-of-fit** test
16
+ against the **effective** PMF (the distribution actually realized after the
17
+ transform). A KS test is invalid here: ``dtype: int`` discretizes a continuous
18
+ draw, a discrete distribution (poisson) lives on the integers, and ``min``/
19
+ ``max`` clamping piles point masses at the bounds — so the realized data is no
20
+ longer a clean draw from the continuous CDF, and at large *n* a KS test rejects
21
+ on the *transform artifact*, not on any defect. The GoF test instead compares
22
+ binned counts to the effective PMF, where the end bins absorb the (possibly
23
+ clamped) tail mass:
24
+
25
+ * interior integer bin ``k`` → ``P = F(k + ½) − F(k − ½)``
26
+ * min bin → ``P = F(kmin + ½)`` (absorbs the lower tail)
27
+ * max bin → ``P = 1 − F(kmax − ½)`` (absorbs the upper tail)
28
+
29
+ For a discrete CDF the ``±½`` edges coincide with the integer steps, so the same
30
+ formula yields the exact PMF (``F(k) − F(k−1)``). Bins whose expected count falls
31
+ below :data:`MIN_EXPECTED_COUNT` are merged with a neighbour (Cochran's rule) so
32
+ the chi-square approximation holds. Degrees of freedom are ``bins − 1`` — we
33
+ subtract **nothing** for fitted parameters because the parameters come from the
34
+ spec, not from the data.
35
+
36
+ This turns the previous honest *abstention* (``applicable: False``, scored
37
+ ``n/a``) into an actual validated pass/fail for the most common real-world
38
+ feature shapes — ages, counts, bounded scores — while never penalizing a correct
39
+ generator for a transform we deliberately applied.
40
+ """
41
+
42
+ from __future__ import annotations
43
+
44
+ from dataclasses import dataclass, field
45
+ from typing import Any
46
+
47
+ import numpy as np
48
+ from scipy import stats
49
+
50
+ from .base import Distribution
51
+ from .builtins import REGISTRY
52
+
53
# Significance level used by ``assess_numeric`` when the caller passes none.
DEFAULT_ALPHA = 0.05

# Distributions supported on the integers only; a continuous KS test is not
# a valid check for them.
DISCRETE_DISTS = {"poisson"}

# Cochran's rule of thumb: sparse chi-square cells are merged until every
# expected count reaches this floor, so the asymptotic distribution holds.
MIN_EXPECTED_COUNT = 5.0

# How many equal-width interior bins the clamped-continuous goodness-of-fit
# test uses.
_CONTINUOUS_INTERIOR_BINS = 24

# Upper bound on the integer range we are willing to enumerate; anything wider
# is treated as a degenerate spec.
_MAX_INTEGER_BINS = 200_000
67
+
68
+
69
+ @dataclass
70
+ class FeatureCompliance:
71
+ feature: str
72
+ dist: str
73
+ target_params: dict[str, float]
74
+ empirical: dict[str, float]
75
+ ks_statistic: float
76
+ p_value: float
77
+ passed: bool | None
78
+ clamped_fraction: float = 0.0
79
+ applicable: bool = True
80
+ note: str | None = None
81
+ # Which test produced ``p_value``/``passed``: "ks", "chi2_gof", or "none"
82
+ # (the last meaning no valid test could be formed — an honest abstention).
83
+ test: str = "ks"
84
+ # Chi-square goodness-of-fit detail (only when ``test == "chi2_gof"``).
85
+ gof: dict[str, float] | None = None
86
+
87
+ def to_dict(self) -> dict[str, Any]:
88
+ return {
89
+ "feature": self.feature,
90
+ "dist": self.dist,
91
+ "target_params": self.target_params,
92
+ "empirical": self.empirical,
93
+ "ks_statistic": self.ks_statistic,
94
+ "p_value": self.p_value,
95
+ "passed": self.passed,
96
+ "clamped_fraction": self.clamped_fraction,
97
+ "applicable": self.applicable,
98
+ "note": self.note,
99
+ "test": self.test,
100
+ "gof": self.gof,
101
+ }
102
+
103
+
104
@dataclass
class ComplianceReport:
    """A set of per-feature compliance results plus the ``alpha`` they use."""

    alpha: float
    features: list[FeatureCompliance] = field(default_factory=list)

    @property
    def score(self) -> float:
        """Share of *assessable* features whose fit test passed.

        Only features with ``applicable`` set count: abstentions are left out
        of both numerator and denominator, so a correct generator is never
        penalized for a test that could not be formed. When nothing is
        assessable there is nothing to contradict and the score is 1.0.
        """
        assessable = [feat for feat in self.features if feat.applicable]
        if assessable:
            passing = [feat for feat in assessable if feat.passed]
            return len(passing) / len(assessable)
        return 1.0

    def to_dict(self) -> dict[str, Any]:
        """Serialize the report, including the derived score and counts."""
        applicable_count = len([feat for feat in self.features if feat.applicable])
        return dict(
            alpha=self.alpha,
            compliance_score=self.score,
            applicable_features=applicable_count,
            assessed_features=len(self.features),
            features=[feat.to_dict() for feat in self.features],
        )
132
+
133
+
134
def _needs_gof(dist_name: str, dtype: str, clamped_fraction: float) -> tuple[bool, list[str]]:
    """Tell whether the continuous KS test is invalid for this feature.

    Returns ``(needs_gof, reasons)``. ``reasons`` lists, in a fixed order, each
    condition that rules KS out (discrete distribution, ``int`` dtype, a
    non-zero clamped fraction); it is empty exactly when KS is valid.
    """
    checks = (
        (dist_name in DISCRETE_DISTS, "discrete distribution"),
        (dtype == "int", "integer discretization"),
        (clamped_fraction > 0, f"clamping ({clamped_fraction:.1%})"),
    )
    reasons: list[str] = [label for triggered, label in checks if triggered]
    return len(reasons) > 0, reasons
147
+
148
+
149
def _merge_sparse_bins(
    expected_p: np.ndarray, observed: np.ndarray, n: int
) -> tuple[np.ndarray, np.ndarray]:
    """Pool adjacent bins, scanning left to right, until every pooled bin has
    an expected count of at least :data:`MIN_EXPECTED_COUNT`.

    A group is closed as soon as its accumulated expected count reaches the
    floor. Whatever is still open after the last bin is added to the most
    recently closed group, or becomes the only group when none was closed.
    The result depends only on the inputs.
    """
    merged_p: list[float] = []
    merged_o: list[float] = []
    acc_p = 0.0
    acc_o = 0.0
    for prob, count in zip(expected_p.tolist(), observed.tolist()):
        acc_p += prob
        acc_o += count
        if acc_p * n >= MIN_EXPECTED_COUNT:
            # The running group is dense enough: close it and start afresh.
            merged_p.append(acc_p)
            merged_o.append(acc_o)
            acc_p = 0.0
            acc_o = 0.0

    has_leftover = acc_p > 0 or acc_o > 0
    if has_leftover and merged_p:
        # Sparse tail: fold it into the last closed group.
        merged_p[-1] += acc_p
        merged_o[-1] += acc_o
    elif has_leftover:
        # Nothing ever reached the floor: everything is a single group.
        merged_p.append(acc_p)
        merged_o.append(acc_o)
    return np.asarray(merged_p, dtype=float), np.asarray(merged_o, dtype=float)
176
+
177
+
178
def _chi_square(expected_p: np.ndarray, observed: np.ndarray, n: int) -> dict[str, float] | None:
    """Chi-square goodness-of-fit on pre-binned data.

    ``expected_p`` holds the expected probability of each bin and ``observed``
    the matching counts. The probabilities are renormalized to sum to one,
    sparse bins are merged, and ``Σ (O − E)² / E`` is evaluated with
    ``dof = bins − 1`` (nothing is subtracted because no parameter was fit).

    Returns a dict with ``statistic``, ``dof``, ``bins`` and ``p_value``, or
    ``None`` when the expected mass is not positive or fewer than two bins
    remain after merging.
    """
    mass = float(expected_p.sum())
    if mass <= 0:
        return None

    # Renormalize so tiny float drift cannot leave Σp != 1.
    probs, counts = _merge_sparse_bins(expected_p / mass, observed, n)
    n_bins = probs.size
    if n_bins < 2:
        return None

    expected = probs * n
    residual = counts - expected
    chi2 = float(np.sum(residual ** 2 / expected))
    dof = int(n_bins - 1)
    return {
        "statistic": chi2,
        "dof": float(dof),
        "bins": float(n_bins),
        "p_value": float(stats.chi2.sf(chi2, dof)),
    }
197
+
198
+
199
def _integer_gof(dist: Distribution, params: dict[str, float], data: np.ndarray, n: int) -> dict[str, float] | None:
    """Goodness-of-fit for an integer-valued target (``int`` dtype or a
    discrete distribution).

    The sample is rounded to integers and tallied over every integer between
    its minimum and maximum. Bin ``k`` is given probability
    ``F(k + ½) − F(k − ½)``; the first and last bins additionally absorb the
    lower and upper tails. Returns ``None`` when the integer range exceeds
    ``_MAX_INTEGER_BINS``; otherwise the result of ``_chi_square``.
    """
    rounded = np.rint(data).astype(np.int64)
    lowest = int(rounded.min())
    highest = int(rounded.max())
    if highest - lowest + 1 > _MAX_INTEGER_BINS:
        return None

    support = np.arange(lowest, highest + 1)
    counts = np.bincount(rounded - lowest, minlength=support.size).astype(float)

    cdf_above = np.asarray(dist.cdf(support + 0.5, params), dtype=float)  # F(k + ½)
    cdf_below = np.asarray(dist.cdf(support - 0.5, params), dtype=float)  # F(k − ½)
    probs = cdf_above - cdf_below
    probs[0] = float(cdf_above[0])  # lowest bin takes the whole lower tail
    probs[-1] = 1.0 - float(cdf_below[-1])  # highest bin takes the whole upper tail
    return _chi_square(np.clip(probs, 0.0, None), counts, n)
215
+
216
+
217
def _clamped_continuous_gof(
    dist: Distribution,
    params: dict[str, float],
    data: np.ndarray,
    n: int,
    clamp_min: float | None,
    clamp_max: float | None,
) -> dict[str, float] | None:
    """GoF for a continuous (float) target whose only transform is clamping.

    Clamping collapses each clamped tail onto its bound as a point mass:
    ``P(min) = F(min)`` and ``P(max) = 1 − F(max)``. The interior between the
    bounds is split into ``_CONTINUOUS_INTERIOR_BINS`` equal-width bins. On a
    side with no clamp there is no point mass: the sample extreme is used as
    the outer bin edge and the outermost interior bin absorbs that open tail.

    Returns the result of ``_chi_square``, or ``None`` when the interior is
    empty (upper edge not strictly above the lower edge).
    """
    bins_p: list[float] = []
    bins_o: list[float] = []

    interior_lo = clamp_min if clamp_min is not None else float(data.min())
    interior_hi = clamp_max if clamp_max is not None else float(data.max())
    if not interior_hi > interior_lo:
        return None

    if clamp_min is not None:  # lower point mass P(min) = F(min)
        bins_p.append(float(dist.cdf(np.asarray([clamp_min]), params)[0]))
        bins_o.append(float(np.count_nonzero(data <= clamp_min)))

    edges = np.linspace(interior_lo, interior_hi, _CONTINUOUS_INTERIOR_BINS + 1)
    cdf_edges = np.asarray(dist.cdf(edges, params), dtype=float)
    if clamp_min is None:
        cdf_edges[0] = 0.0  # bottom interior bin absorbs the open lower tail
    if clamp_max is None:
        cdf_edges[-1] = 1.0  # top interior bin absorbs the open upper tail

    # Keep a bound out of the interior only when it is already counted as a
    # point mass. With no lower clamp the lower edge is the sample minimum
    # itself, and those observations belong to the bottom interior bin —
    # excluding them would silently drop them from the observed counts.
    if clamp_min is not None:
        interior_mask = data > interior_lo
    else:
        interior_mask = np.ones(data.shape, dtype=bool)
    if clamp_max is not None:
        interior_mask &= data < interior_hi
    interior_o = np.histogram(data[interior_mask], bins=edges)[0].astype(float)
    bins_p.extend(np.diff(cdf_edges).tolist())
    bins_o.extend(interior_o.tolist())

    if clamp_max is not None:  # upper point mass P(max) = 1 − F(max)
        bins_p.append(1.0 - float(dist.cdf(np.asarray([clamp_max]), params)[0]))
        bins_o.append(float(np.count_nonzero(data >= clamp_max)))

    expected_p = np.clip(np.asarray(bins_p, dtype=float), 0.0, None)
    observed = np.asarray(bins_o, dtype=float)
    return _chi_square(expected_p, observed, n)
263
+
264
+
265
def assess_numeric(
    feature: str,
    dist_name: str,
    params: dict[str, float],
    values: np.ndarray,
    clamped_fraction: float = 0.0,
    alpha: float = DEFAULT_ALPHA,
    dtype: str = "float",
    clamp_min: float | None = None,
    clamp_max: float | None = None,
) -> FeatureCompliance:
    """Judge a realized numeric sample against the distribution requested.

    A KS test against the requested CDF is always run and its statistic is
    always reported. It also decides the verdict unless ``_needs_gof`` rules
    it out (discrete distribution, ``int`` dtype, or clamping). In that case a
    chi-square goodness-of-fit test decides instead; if that test cannot be
    formed the feature abstains (``passed=None``, ``applicable=False``,
    ``test="none"``) and ``p_value`` keeps the KS value.
    """
    dist = REGISTRY[dist_name]
    data = np.asarray(values, dtype=float)

    def target_cdf(x):
        return dist.cdf(x, params)

    ks_stat, ks_p = stats.kstest(data, target_cdf)
    empirical = {
        "mean": float(np.mean(data)),
        "std": float(np.std(data, ddof=1)) if data.size > 1 else 0.0,
        "min": float(np.min(data)),
        "max": float(np.max(data)),
    }

    needs_gof, reasons = _needs_gof(dist_name, dtype, clamped_fraction)

    # Start from the KS verdict; the goodness-of-fit branch overrides it.
    p_value = float(ks_p)
    passed: bool | None = bool(ks_p > alpha)
    applicable = True
    test = "ks"
    note: str | None = None
    gof: dict[str, float] | None = None

    if needs_gof:
        why = ", ".join(reasons)
        integer_valued = dtype == "int" or dist_name in DISCRETE_DISTS
        if integer_valued:
            gof = _integer_gof(dist, params, data, data.size)
        else:
            # Continuous float target whose only transform is clamping.
            gof = _clamped_continuous_gof(dist, params, data, data.size, clamp_min, clamp_max)

        if gof is not None:
            p_value = float(gof["p_value"])
            passed = bool(gof["p_value"] > alpha)
            test = "chi2_gof"
            note = (
                f"chi-square goodness-of-fit vs the effective PMF "
                f"({int(gof['bins'])} bins, dof {int(gof['dof'])}); "
                f"KS not applicable ({why})"
            )
        else:
            # No valid test could be formed: abstain rather than emit a
            # meaningless verdict.
            passed = None
            applicable = False
            test = "none"
            note = (
                f"continuous KS not applicable ({why}); goodness-of-fit "
                "abstained (too few distinct values to bin)"
            )

    return FeatureCompliance(
        feature=feature,
        dist=dist_name,
        target_params=dict(params),
        empirical=empirical,
        ks_statistic=float(ks_stat),
        p_value=p_value,
        passed=passed,
        clamped_fraction=float(clamped_fraction),
        applicable=applicable,
        note=note,
        test=test,
        gof=gof,
    )
@@ -0,0 +1,117 @@
1
+ """Realistic text providers (names, emails, addresses, …) backed by *mimesis*.
2
+
3
+ The default ``text`` generator is ``lorem`` (see :func:`sample_text`), which emits
4
+ filler words. These providers make text *genuine-looking* — ``"Maria Alvarez"``
5
+ instead of ``"lorem ipsum dolor"`` — **without sacrificing determinism**, which is
6
+ DataDoom's headline guarantee (CLAUDE.md invariant #1).
7
+
8
+ How determinism is preserved
9
+ ----------------------------
10
+ mimesis is a pure, offline library that draws from an *isolated*, seeded
11
+ ``random.Random`` instance — it never touches global random state. We seed that
12
+ instance from the feature's own ``numpy`` generator (which is itself keyed by
13
+ ``sha256(spec_hash || seed || namespace)``), so:
14
+
15
+ * the same ``(spec_hash, seed)`` reproduces byte-identical text, and
16
+ * each feature draws from an independent stream — adding one never perturbs
17
+ another (invariant #1).
18
+
19
+ Byte-reproducibility holds *on the pinned path*: a different mimesis version may
20
+ emit different strings for the same seed, exactly like the numpy pin for numeric
21
+ draws (invariant #6). mimesis is therefore a pinned core dependency.
22
+ """
23
+
24
+ from __future__ import annotations
25
+
26
+ from typing import TYPE_CHECKING, Callable
27
+
28
+ import numpy as np
29
+
30
+ from ..errors import SpecValidationError
31
+
32
+ if TYPE_CHECKING:
33
+ from mimesis import Generic
34
+ from mimesis.locales import Locale
35
+
36
# Curated catalog of realistic-text providers. Every key is a spec
# ``generator`` name and every value takes a seeded mimesis facade and returns
# one value from it. The keys are part of the spec surface, so keep them stable
# (invariant #5, additive only).
_PROVIDERS: dict[str, Callable[[Generic], object]] = {
    # --- people ---
    "name": lambda facade: facade.person.full_name(),
    "first_name": lambda facade: facade.person.first_name(),
    "last_name": lambda facade: facade.person.last_name(),
    "email": lambda facade: facade.person.email(),
    "username": lambda facade: facade.person.username(),
    "phone": lambda facade: facade.person.phone_number(),
    "occupation": lambda facade: facade.person.occupation(),
    "title": lambda facade: facade.person.title(),
    "nationality": lambda facade: facade.person.nationality(),
    # --- places ---
    "address": lambda facade: facade.address.address(),
    "street": lambda facade: facade.address.street_name(),
    "city": lambda facade: facade.address.city(),
    "state": lambda facade: facade.address.state(),
    "country": lambda facade: facade.address.country(),
    "postal_code": lambda facade: facade.address.postal_code(),
    # --- business / finance ---
    "company": lambda facade: facade.finance.company(),
    "currency": lambda facade: facade.finance.currency_iso_code(),
    "price": lambda facade: facade.finance.price(),
    # --- internet ---
    "url": lambda facade: facade.internet.url(),
    "hostname": lambda facade: facade.internet.hostname(),
    "ipv4": lambda facade: facade.internet.ip_v4(),
    # --- generic text ---
    "word": lambda facade: facade.text.word(),
    "sentence": lambda facade: facade.text.sentence(),
    "color": lambda facade: facade.text.color(),
}

#: Generator keys served by mimesis (``lorem`` is handled by ``sample_text``).
REALISTIC_GENERATORS: frozenset[str] = frozenset(_PROVIDERS)
73
+
74
+
75
def is_realistic_generator(name: str) -> bool:
    """Report whether ``name`` is one of the keys in the provider catalog."""
    is_known = name in _PROVIDERS
    return is_known
78
+
79
+
80
def resolve_locale(locale: str, *, locator: str | None = None) -> "Locale":
    """Turn a spec locale string (e.g. ``"en"``) into a mimesis ``Locale``.

    Raises ``SpecValidationError`` — carrying ``locator`` and the sorted list
    of valid locale values — when ``locale`` is not a ``Locale`` value.
    """
    from mimesis.locales import Locale

    try:
        resolved = Locale(locale)
    except ValueError as exc:
        known_codes = sorted(member.value for member in Locale)
        message = f"unknown locale {locale!r} (known: {known_codes})"
        raise SpecValidationError(message, locator=locator) from exc
    return resolved
91
+
92
+
93
def sample_provider(
    rng: np.random.Generator, n: int, generator: str, locale: str = "en"
) -> np.ndarray:
    """Produce ``n`` realistic values for ``generator`` as an object array.

    A single integer in ``[0, 2**32)`` is drawn from ``rng`` and used to seed
    the mimesis facade, so the values are a function of ``rng``'s state. Each
    value is converted with ``str``.

    Raises ``SpecValidationError`` for an unknown ``generator`` or ``locale``.
    """
    draw = _PROVIDERS.get(generator)
    if draw is None:
        known = sorted(REALISTIC_GENERATORS) + ['lorem']
        raise SpecValidationError(
            f"unknown text generator {generator!r} "
            f"(known: {known})"
        )

    from mimesis import Generic

    # Resolve the locale first so an invalid one fails before ``rng`` is used.
    facade = Generic(
        locale=resolve_locale(locale),
        seed=int(rng.integers(0, 2**32)),
    )

    values = np.empty(n, dtype=object)
    for slot in range(n):
        values[slot] = str(draw(facade))
    return values
@@ -0,0 +1,32 @@
1
+ """Engine-level exceptions.
2
+
3
+ These are framework-free and carry a ``locator`` so the UI/CLI can point the
4
+ user at the offending control (a feature name, an edge, a list index).
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+
10
class DataDoomError(Exception):
    """Root of the engine's exception hierarchy; every engine error derives from it."""
12
+
13
+
14
class SpecValidationError(DataDoomError):
    """Raised when a spec fails structural or cross-field validation.

    ``locator`` says *where* the problem lies (e.g. ``features.age.params``,
    ``causal.edges[2]``) so a caller can highlight it. The raw text stays
    available as ``message``; ``str()`` of the error is ``"<locator>: <message>"``
    when a locator is given and just the message otherwise.
    """

    def __init__(self, message: str, locator: str | None = None) -> None:
        self.locator = locator
        self.message = message
        if locator:
            rendered = f"{locator}: {message}"
        else:
            rendered = message
        super().__init__(rendered)
25
+
26
+
27
class DistributionError(DataDoomError):
    """Raised for an unknown distribution or for invalid distribution parameters."""
29
+
30
+
31
class ReproducibilityError(DataDoomError):
    """Raised when a determinism/verification check fails (checksum mismatch)."""
@@ -0,0 +1,27 @@
1
+ """Export adapters + metadata + checksums."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from .base import ArtifactInfo, Exporter
6
+ from .checksums import sha256_bytes, sha256_file
7
+ from .csv_exporter import CsvExporter
8
+ from .json_exporter import JsonExporter
9
+ from .metadata import build_metadata, write_metadata
10
+ from .parquet_exporter import ParquetExporter
11
+
12
# Built-in exporters, keyed by the ``format`` each instance declares.
EXPORTERS: dict[str, Exporter] = {}
for _exporter in (CsvExporter(), JsonExporter(), ParquetExporter()):
    EXPORTERS[_exporter.format] = _exporter
del _exporter

__all__ = [
    "ArtifactInfo",
    "Exporter",
    "CsvExporter",
    "JsonExporter",
    "ParquetExporter",
    "EXPORTERS",
    "sha256_bytes",
    "sha256_file",
    "build_metadata",
    "write_metadata",
]
@@ -0,0 +1,49 @@
1
+ """Exporter ABC (04 §8).
2
+
3
+ An exporter serializes a frame to a byte-stable file and returns its checksum
4
+ metadata. Byte-stability is essential: the same ``(spec_hash, seed)`` must yield
5
+ identical file bytes on the pinned path.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+ from collections.abc import Mapping
12
+ from dataclasses import dataclass
13
+ from pathlib import Path
14
+
15
+ import pandas as pd
16
+
17
+
18
@dataclass
class ArtifactInfo:
    """Checksum metadata describing one written artifact file."""

    path: str
    format: str
    checksum_sha256: str
    size_bytes: int
    version: str = "clean"

    def to_dict(self) -> dict[str, object]:
        """Return the five fields as a plain dict, keyed by field name."""
        return dict(
            path=self.path,
            format=self.format,
            checksum_sha256=self.checksum_sha256,
            size_bytes=self.size_bytes,
            version=self.version,
        )
34
+
35
+
36
class Exporter(ABC):
    """Abstract base for exporters that serialize a frame to a file."""

    format: str
    # Artifact file extension; an empty value means "use ``format``".
    extension: str = ""
    # Optional JSON-schema fragment for exporter options (09 §6); ``None`` for built-ins.
    param_schema: Mapping[str, object] | None = None

    @property
    def ext(self) -> str:
        """The file extension to use: ``extension`` if set, else ``format``."""
        if self.extension:
            return self.extension
        return self.format

    @abstractmethod
    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` to ``path`` deterministically and return its info."""
@@ -0,0 +1,18 @@
1
+ """SHA256 helpers — the bitwise-reproducibility anchor (05 §8)."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ from pathlib import Path
7
+
8
+
9
def sha256_bytes(data: bytes) -> str:
    """Return the SHA-256 of ``data`` as a lowercase hex string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
11
+
12
+
13
def sha256_file(path: str | Path, chunk_size: int = 1 << 20) -> str:
    """Return the SHA-256 hex digest of the file at ``path``.

    The file is read in binary mode, ``chunk_size`` bytes at a time, so it is
    never held in memory as a whole.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while chunk := stream.read(chunk_size):
            digest.update(chunk)
    return digest.hexdigest()
@@ -0,0 +1,34 @@
1
+ """Byte-stable CSV writer (17 step 4).
2
+
3
+ We render to a string with an explicit ``\\n`` line terminator and write raw
4
+ UTF-8 bytes ourselves, bypassing OS-specific newline translation so the output
5
+ is identical on Windows, macOS and Linux. Column order is fixed by the caller.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ import pandas as pd
13
+
14
+ from .base import ArtifactInfo, Exporter
15
+ from .checksums import sha256_bytes
16
+
17
+
18
class CsvExporter(Exporter):
    """Exporter for the ``csv`` format."""

    format = "csv"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` as UTF-8 CSV (no index, ``\\n`` line endings) to ``path``.

        Missing parent directories are created. The returned checksum and size
        are computed from the exact bytes written.
        """
        payload = df.to_csv(index=False, lineterminator="\n").encode("utf-8")
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Binary write: no newline translation, so the file is exactly `payload`.
        target.write_bytes(payload)
        return ArtifactInfo(
            path=str(target),
            format=self.format,
            checksum_sha256=sha256_bytes(payload),
            size_bytes=len(payload),
        )
@@ -0,0 +1,67 @@
1
+ """Byte-stable JSON writer (17 step 18, 09 §8).
2
+
3
+ Emits a records array (``[{col: value, …}, …]``) in stable column order, with no
4
+ timestamps/ambient state, so the same ``(spec_hash, seed)`` yields identical bytes
5
+ on the pinned path (invariant #6). Values are normalized so the output round-trips
6
+ through ``pandas.read_json(orient="records")``: numpy scalars become Python
7
+ scalars, ``NaN``/``NaT`` become ``null``, and datetimes become ISO-8601 strings.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import math
14
+ from pathlib import Path
15
+ from typing import Any
16
+
17
+ import numpy as np
18
+ import pandas as pd
19
+
20
+ from .base import ArtifactInfo, Exporter
21
+ from .checksums import sha256_bytes
22
+
23
+
24
def _normalize(value: Any) -> Any:
    """Convert one cell into a value ``json.dumps`` can serialize.

    * ``None``, ``pd.NA``, ``pd.NaT`` and any NaN float become ``None``
      (JSON ``null``).
    * numpy float / integer / bool scalars become the matching Python scalar.
    * ``pd.Timestamp`` and ``np.datetime64`` become ISO-8601 strings
      (``None`` when they are NaT).
    * Anything else is returned unchanged.
    """
    # Missing-value singletons are matched by identity. ``pd.NA`` (used by the
    # nullable extension dtypes) is neither a float nor a numpy scalar, so
    # without this check it would reach ``json.dumps`` and raise ``TypeError``.
    if value is None or value is pd.NA or value is pd.NaT:
        return None
    if isinstance(value, (float, np.floating)):
        as_float = float(value)
        return None if math.isnan(as_float) else as_float
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, (pd.Timestamp, np.datetime64)):
        stamp = pd.Timestamp(value)
        return None if pd.isna(stamp) else stamp.isoformat()
    return value
44
+
45
+
46
class JsonExporter(Exporter):
    """Exporter for the ``json`` format (one records array per file)."""

    format = "json"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` to ``path`` as a compact UTF-8 JSON array of records.

        Each row becomes an object keyed by column name, in column order, with
        every value passed through ``_normalize``. Missing parent directories
        are created. The returned checksum and size are computed from the
        exact bytes written.
        """
        names = list(df.columns)
        records = []
        for row in df.itertuples(index=False, name=None):
            records.append(dict(zip(names, map(_normalize, row), strict=True)))

        # Compact separators and no ASCII escaping; json never emits CRLF.
        payload = json.dumps(records, ensure_ascii=False, separators=(",", ":")).encode("utf-8")
        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        target.write_bytes(payload)
        return ArtifactInfo(
            path=str(target),
            format=self.format,
            checksum_sha256=sha256_bytes(payload),
            size_bytes=len(payload),
        )