datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,434 @@
1
+ """Plugin authoring workflow: scaffolder + contract checker (09 §9, 17 step 17).
2
+
3
+ ``scaffold_plugin`` writes a ready-to-publish ``datadoom-plugin-*`` package (entry
4
+ point, base-class stub, contract test, README) so a contributor starts from a
5
+ working, deterministic plugin. ``check_object`` / ``check_plugin`` run the plugin
6
+ contract tests (13 §5): interface completeness, schema validity, determinism, and
7
+ a static RNG-hygiene scan — the same checks that gate the ecosystem.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import importlib.util
13
+ import inspect
14
+ import io
15
+ import re
16
+ import sys
17
+ import tempfile
18
+ import tokenize
19
+ from dataclasses import dataclass, field
20
+ from pathlib import Path
21
+ from typing import Any
22
+
23
+ import numpy as np
24
+ import pandas as pd
25
+
26
+ from .contracts import KEY_ATTR, PLUGIN_BASES
27
+ from .registry import PluginError, resolve_kind, resolve_kind_class, validate_param_schema
28
+
29
# stdlib/global sources of non-reproducible randomness, banned in the data path (09 §5).
# Alternatives, in order: NumPy's module-level RNG (`np.random.` / `numpy.random.`),
# the stdlib `random.` module (the lookbehind rejects a match preceded by a word
# character or a dot, e.g. `rng.random.`), `uuid4`, `time.time`, `datetime.now`,
# and anything from the `secrets.` module.
_BANNED_RNG = re.compile(
    r"\b(np|numpy)\.random\.|(?<![\w.])random\.|\buuid4\b|\btime\.time\b|"
    r"\bdatetime\.now\b|\bsecrets\.",
)
34
+
35
# Suffix appended to the CamelCased plugin name to build the generated class name,
# keyed by plugin kind (e.g. kind "distribution", name "weibull" -> "WeibullDistribution").
_KIND_SUFFIX = {
    "distribution": "Distribution",
    "structural_fn": "StructuralFn",
    "failure_mode": "FailureMode",
    "exporter": "Exporter",
    "probe_model": "ProbeModel",
}
42
+
43
+
44
+ # --- contract checking ---------------------------------------------------------------
45
+
46
+
47
@dataclass
class ObjectCheck:
    """Outcome of applying the plugin contract checks to one plugin instance.

    ``results`` accumulates ``(check, status, detail)`` triples in the order
    in which they were recorded.
    """

    name: str
    kind: str | None
    results: list[tuple[str, str, str]] = field(default_factory=list)  # (check, status, detail)

    @property
    def ok(self) -> bool:
        """Whether none of the recorded checks has the status ``"fail"``."""
        for _check, status, _detail in self.results:
            if status == "fail":
                return False
        return True

    def add(self, check: str, status: str, detail: str = "") -> None:
        """Record the outcome of a single check."""
        entry = (check, status, detail)
        self.results.append(entry)

    def summary(self) -> str:
        """Render a heading line followed by one line per recorded check."""
        rendered = [f"{self.name} ({self.kind or 'unknown kind'})"]
        for check, status, detail in self.results:
            line = f" [{status.upper():4}] {check}"
            if detail:
                line += f" - {detail}"
            rendered.append(line)
        return "\n".join(rendered)
69
+
70
+
71
def check_object(obj: object) -> ObjectCheck:
    """Apply the interface, schema, determinism and RNG-hygiene checks to ``obj``.

    Returns an :class:`ObjectCheck` holding one entry per check. When the kind
    of ``obj`` cannot be resolved, only a failed ``interface`` entry is recorded
    and the remaining checks are not run.
    """
    kind = resolve_kind(obj)
    report = ObjectCheck(name=type(obj).__name__, kind=kind)
    if kind is None:
        report.add("interface", "fail", "does not subclass a known plugin base")
        return report

    # Interface: the kind-specific key attribute must be a non-empty string.
    key_attr = KEY_ATTR[kind]
    key = getattr(obj, key_attr, None)
    if isinstance(key, str) and key:
        report.add("interface", "pass", f"{key_attr}={key!r}")
    else:
        report.add("interface", "fail", f"missing non-empty '{key_attr}'")

    # Schema: optional; validated only when the plugin declares one.
    schema = getattr(obj, "param_schema", None)
    if schema is not None:
        try:
            validate_param_schema(schema)
        except PluginError as exc:
            report.add("schema", "fail", str(exc))
        else:
            report.add("schema", "pass", "valid JSON-schema fragment")
    else:
        report.add("schema", "skip", "no param_schema (uses native UI controls)")

    _check_determinism(obj, kind, report)
    _check_rng_hygiene(obj, report)
    return report
99
+
100
+
101
def _check_determinism(obj: object, kind: str, report: ObjectCheck) -> None:
    """Record a ``determinism`` entry on ``report`` for the plugin ``obj``.

    Exporters are asked to write the same small frame twice and the two files
    are compared byte for byte. Distributions are sampled twice from
    identically seeded generators using ``obj.example_params``; without a dict
    there the check is skipped. Every other kind is skipped. Any exception
    raised while exercising the plugin is recorded as a failure.
    """
    label = "determinism"
    try:
        if kind == "exporter":
            frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
            with tempfile.TemporaryDirectory() as workdir:
                first = Path(workdir) / "1"
                second = Path(workdir) / "2"
                obj.write(frame, first)  # type: ignore[attr-defined]
                obj.write(frame, second)  # type: ignore[attr-defined]
                stable = first.read_bytes() == second.read_bytes()
            report.add(label, "pass" if stable else "fail", "byte-stable on two writes")
            return

        if kind != "distribution":
            report.add(label, "skip", f"{kind} determinism is covered by its engine tests")
            return

        params = getattr(obj, "example_params", None)
        if not isinstance(params, dict):
            report.add(label, "skip", "set `example_params` for an auto-check")
            return
        draw_one = obj.sample(np.random.default_rng(0), 256, params)  # type: ignore[attr-defined]
        draw_two = obj.sample(np.random.default_rng(0), 256, params)  # type: ignore[attr-defined]
        same = np.array_equal(np.asarray(draw_one), np.asarray(draw_two))
        report.add(label, "pass" if same else "fail", "256 draws, two seeded RNGs")
    except Exception as exc:  # noqa: BLE001
        report.add(label, "fail", f"raised {type(exc).__name__}: {exc}")
126
+
127
+
128
def _strip_noncode(source: str) -> str:
    """Blank out comments and string literals, preserving layout, so the RNG-hygiene
    scan never trips on the word "random" in a docstring or comment.

    Every character of a comment or string token is replaced by a space;
    newlines are kept, so line and column positions of the remaining code are
    unchanged. On Python 3.12+ the literal text of f-strings is blanked as
    well, while the code inside ``{...}`` replacement fields stays visible.

    Returns ``source`` unchanged when it cannot be tokenized (for example an
    indented or truncated snippet).
    """
    # Split rows exactly as the tokenizer receives them (on "\n" only).
    # ``str.splitlines`` also breaks on form feeds and Unicode line separators,
    # which would shift every later row away from the tokenizer's row numbers.
    rows = [list(line) for line in io.StringIO(source).readlines()]

    blanked = {tokenize.COMMENT, tokenize.STRING}
    # Python 3.12+ splits f-strings into several tokens; the literal text
    # between replacement fields is reported as FSTRING_MIDDLE.
    fstring_middle = getattr(tokenize, "FSTRING_MIDDLE", None)
    if fstring_middle is not None:
        blanked.add(fstring_middle)

    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type not in blanked:
                continue
            (start_row, start_col), (end_row, end_col) = tok.start, tok.end
            for row_no in range(start_row, min(end_row, len(rows)) + 1):
                row = rows[row_no - 1]
                first = start_col if row_no == start_row else 0
                last = end_col if row_no == end_row else len(row)
                for col in range(first, min(last, len(row))):
                    if row[col] != "\n":
                        row[col] = " "
    except (tokenize.TokenError, SyntaxError):
        # SyntaxError covers IndentationError and the errors newer tokenizers raise.
        return source
    return "".join("".join(row) for row in rows)
147
+
148
+
149
def _check_rng_hygiene(obj: object, report: ObjectCheck) -> None:
    """Record an ``rng_hygiene`` entry after scanning the class source of ``obj``.

    The source of ``type(obj)`` is stripped of comments and string literals and
    searched for the banned randomness patterns. The check is skipped when the
    source cannot be retrieved.
    """
    try:
        source = inspect.getsource(type(obj))
    except (OSError, TypeError):
        report.add("rng_hygiene", "skip", "source unavailable")
        return

    code_only = _strip_noncode(source)
    found = {match.group(0) for match in _BANNED_RNG.finditer(code_only)}
    if not found:
        report.add("rng_hygiene", "pass", "uses only the injected rng")
        return
    report.add(
        "rng_hygiene", "fail", "use the injected rng only; found " + ", ".join(sorted(found))
    )
162
+
163
+
164
def check_plugin(target: str | Path) -> list[ObjectCheck]:
    """Check every plugin class defined by a ``.py`` file or found under a directory.

    A directory is searched recursively for ``*.py`` files, skipping anything
    inside a ``build``, ``dist``, ``.venv``, ``tests`` or ``__pycache__``
    directory *below the target*. Each file is imported in isolation, and every
    plugin class defined in it is instantiated without arguments and checked.

    Args:
        target: Path to a ``.py`` file or to a directory.

    Returns:
        One :class:`ObjectCheck` per plugin class found. A class that cannot be
        instantiated yields a report with a failed ``interface`` entry.

    Raises:
        PluginError: If ``target`` is neither a ``.py`` file nor a directory,
            if a file fails to import, or if no plugin class is found.
    """
    skipped_dirs = {"build", "dist", ".venv", "tests", "__pycache__"}
    path = Path(target)
    if path.suffix == ".py" and path.is_file():
        files = [path]
    elif path.is_dir():
        # Only components below the target count: a plugin checked out under
        # e.g. ``~/build/my-plugin`` must not have all of its files filtered out.
        files = [
            p
            for p in sorted(path.rglob("*.py"))
            if skipped_dirs.isdisjoint(p.relative_to(path).parts)
        ]
    else:
        raise PluginError(f"nothing to check at {target!r} (expected a .py file or directory)")

    reports: list[ObjectCheck] = []
    seen: set[str] = set()
    for file in files:
        module = _import_file(file)
        for _, member in inspect.getmembers(module, inspect.isclass):
            # Ignore classes merely imported into the module.
            if member.__module__ != module.__name__:
                continue
            kind = resolve_kind_class(member)
            if kind is None:
                continue
            if member.__qualname__ in seen:
                continue
            seen.add(member.__qualname__)
            try:
                instance = member()
            except Exception as exc:  # noqa: BLE001
                bad = ObjectCheck(name=member.__name__, kind=kind)
                bad.add("interface", "fail", f"could not instantiate: {exc}")
                reports.append(bad)
                continue
            reports.append(check_object(instance))
    if not reports:
        raise PluginError(f"found no plugin classes under {target!r}")
    return reports
205
+
206
+
207
def _import_file(path: Path) -> Any:
    """Import the file at ``path`` under a generated module name and return the module.

    The module is registered in ``sys.modules`` before it is executed and is
    removed again if execution fails.

    Raises:
        PluginError: If no import spec/loader can be built for ``path`` or the
            module raises while being executed.
    """
    mod_name = f"datadoom_check_{abs(hash(str(path)))}"
    spec = importlib.util.spec_from_file_location(mod_name, path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise PluginError(f"could not import {path}")

    module = importlib.util.module_from_spec(spec)
    sys.modules[mod_name] = module
    try:
        loader.exec_module(module)
    except Exception as exc:  # noqa: BLE001
        sys.modules.pop(mod_name, None)
        raise PluginError(f"{path.name} failed to import: {exc}") from exc
    return module
220
+
221
+
222
+ # --- scaffolding ---------------------------------------------------------------------
223
+
224
+
225
def _camel(name: str) -> str:
    """Join the ``-``/``_``/whitespace-separated words of ``name`` in CamelCase.

    Only the first character of each word is upper-cased; the rest of the word
    is kept as written.
    """
    pieces = []
    for word in re.split(r"[-_\s]+", name.strip()):
        if word:
            pieces.append(word[0].upper() + word[1:])
    return "".join(pieces)
228
+
229
+
230
def scaffold_plugin(kind: str, name: str, dest: str | Path = ".") -> Path:
    """Write a ``datadoom-plugin-<name>`` package skeleton; return its root directory.

    The skeleton consists of ``pyproject.toml``, ``src/<module>/__init__.py``,
    ``tests/test_contract.py`` and ``README.md``, all written as UTF-8.

    Raises:
        PluginError: If ``kind`` is unknown, ``name`` is not a lowercase
            identifier, or the target directory already exists.
    """
    if kind not in PLUGIN_BASES:
        raise PluginError(f"unknown kind {kind!r}; choose one of {', '.join(PLUGIN_BASES)}")
    if re.fullmatch(r"[a-z][a-z0-9_]*", name) is None:
        raise PluginError(
            f"invalid plugin name {name!r}; use a lowercase identifier (e.g. 'weibull')"
        )

    module = f"datadoom_plugin_{name}"
    dist_name = "datadoom-plugin-" + name.replace("_", "-")
    class_name = _camel(name) + _KIND_SUFFIX[kind]
    root = Path(dest) / dist_name
    if root.exists():
        raise PluginError(f"{root} already exists")

    pkg = root / "src" / module
    tests = root / "tests"
    for directory in (pkg, tests):
        directory.mkdir(parents=True)

    def emit(target: Path, template: str, **fields: str) -> None:
        # Fill one template and write it out.
        target.write_text(template.format(**fields), encoding="utf-8")

    emit(root / "pyproject.toml", _PYPROJECT, dist=dist_name, module=module, name=name, cls=class_name)
    emit(pkg / "__init__.py", _STUBS[kind], cls=class_name, name=name)
    emit(tests / "test_contract.py", _TEST_STUB, module=module, cls=class_name)
    emit(
        root / "README.md",
        _README,
        dist=dist_name,
        kind=kind,
        name=name,
        cls=class_name,
        module=module,
    )
    return root
265
+
266
+
267
+ _PYPROJECT = '''\
268
+ [build-system]
269
+ requires = ["hatchling"]
270
+ build-backend = "hatchling.build"
271
+
272
+ [project]
273
+ name = "{dist}"
274
+ version = "0.1.0"
275
+ description = "A DataDoom plugin."
276
+ requires-python = ">=3.11"
277
+ dependencies = ["datadoom"]
278
+
279
+ # Discovered by DataDoom's plugin loader at startup (09 §3).
280
+ [project.entry-points."datadoom.plugins"]
281
+ {name} = "{module}:{cls}"
282
+
283
+ [tool.hatch.build.targets.wheel]
284
+ packages = ["src/{module}"]
285
+ '''
286
+
287
+ _TEST_STUB = '''\
288
+ """Plugin contract test — runs the same checks as `datadoom plugin check`."""
289
+
290
+ from datadoom.plugins.scaffold import check_object
291
+ from {module} import {cls}
292
+
293
+
294
+ def test_contract() -> None:
295
+ report = check_object({cls}())
296
+ assert report.ok, "\\n" + report.summary()
297
+ '''
298
+
299
+ _README = """\
300
+ # {dist}
301
+
302
+ A DataDoom **{kind}** plugin contributing `{name}` ({cls}).
303
+
304
+ ## Develop
305
+
306
+ ```bash
307
+ pip install -e . # the plugin appears in `datadoom` and the web UI
308
+ datadoom plugin check . # run the contract tests (interface/schema/determinism/RNG)
309
+ pytest # the bundled contract test
310
+ ```
311
+
312
+ Implement the method bodies in `src/{module}/__init__.py`, using **only** the
313
+ injected `rng` for randomness (stdlib `random`, `np.random.*` globals, `uuid4`,
314
+ `time` are banned — they break reproducibility and fail the contract check).
315
+ """
316
+
317
+
318
+ _STUBS = {
319
+ "distribution": '''\
320
+ """A DataDoom distribution plugin."""
321
+
322
+ from __future__ import annotations
323
+
324
+ import numpy as np
325
+ from datadoom.plugin import Distribution, schema
326
+
327
+
328
+ class {cls}(Distribution):
329
+ name = "{name}"
330
+ required_params = ("scale",)
331
+ # Rendered by the Canvas wherever a distribution is selectable (09 §6).
332
+ param_schema = schema({{"scale": {{"type": "number", "minimum": 0, "title": "Scale"}}}})
333
+ # Used by `datadoom plugin check` for the automated determinism check.
334
+ example_params = {{"scale": 2.0}}
335
+
336
+ def sample(self, rng, n, params):
337
+ # MUST use the injected rng (a numpy Generator) — never global random.
338
+ return params["scale"] * rng.standard_exponential(size=n)
339
+
340
+ def cdf(self, x, params): # enables KS compliance reporting
341
+ return 1.0 - np.exp(-np.asarray(x, dtype=float) / params["scale"])
342
+ ''',
343
+ "structural_fn": '''\
344
+ """A DataDoom structural-function plugin (a causal/SEM edge)."""
345
+
346
+ from __future__ import annotations
347
+
348
+ import numpy as np
349
+ from datadoom.plugin import StructuralFn, schema
350
+
351
+
352
+ class {cls}(StructuralFn):
353
+ name = "{name}"
354
+ # Structural fns read the fixed CausalEdge fields (weight/bias/coeffs/mapping);
355
+ # here `weight` is the slope and `bias` is the saturation cap.
356
+ param_schema = schema({{
357
+ "weight": {{"type": "number", "title": "Weight (slope)"}},
358
+ "bias": {{"type": "number", "title": "Saturation cap"}},
359
+ }})
360
+
361
+ def contribution(self, parent, edge):
362
+ weight = edge.weight if edge.weight is not None else 1.0
363
+ cap = edge.bias if edge.bias is not None else float("inf")
364
+ return np.minimum(weight * np.asarray(parent, dtype=float), cap)
365
+ ''',
366
+ "failure_mode": '''\
367
+ """A DataDoom failure-mode plugin (a corruption transform)."""
368
+
369
+ from __future__ import annotations
370
+
371
+ import numpy as np
372
+ from datadoom.plugin import FailureMode, schema
373
+
374
+
375
+ class {cls}(FailureMode):
376
+ name = "{name}"
377
+ param_schema = schema({{
378
+ "column": {{"type": "string", "title": "Column"}},
379
+ "rate": {{"type": "number", "minimum": 0, "maximum": 1, "title": "Rate"}},
380
+ }})
381
+
382
+ def apply(self, rng, frame, params, features):
383
+ col = params["column"]
384
+ rate = float(params.get("rate", 0.1))
385
+ mask = rng.random(size=len(frame)) < rate
386
+ frame.loc[mask, col] = np.nan # corrupt the working (injected) copy in place
387
+ return {{"column": col, "nulled_fraction": float(mask.mean())}}
388
+ ''',
389
+ "exporter": '''\
390
+ """A DataDoom exporter plugin (an output format)."""
391
+
392
+ from __future__ import annotations
393
+
394
+ from pathlib import Path
395
+
396
+ from datadoom.engine.export.checksums import sha256_bytes
397
+ from datadoom.plugin import Exporter
398
+ from datadoom.engine.export.base import ArtifactInfo
399
+
400
+
401
+ class {cls}(Exporter):
402
+ format = "{name}"
403
+
404
+ def write(self, df, path):
405
+ path = Path(path)
406
+ # Write deterministically — no timestamps/ambient state (invariant #6).
407
+ payload = df.to_json(orient="records", indent=2).encode("utf-8")
408
+ path.write_bytes(payload)
409
+ return ArtifactInfo(
410
+ path=str(path),
411
+ format=self.format,
412
+ checksum_sha256=sha256_bytes(payload),
413
+ size_bytes=len(payload),
414
+ )
415
+ ''',
416
+ "probe_model": '''\
417
+ """A DataDoom probe-model plugin (a difficulty baseline)."""
418
+
419
+ from __future__ import annotations
420
+
421
+ from datadoom.plugin import ProbeModel
422
+
423
+
424
+ class {cls}(ProbeModel):
425
+ name = "{name}"
426
+
427
+ def estimator(self, seed):
428
+ # Return a fresh scikit-learn classifier exposing predict_proba; seed any
429
+ # randomness so the probe metric is reproducible (it drives calibration).
430
+ from sklearn.ensemble import RandomForestClassifier
431
+
432
+ return RandomForestClassifier(n_estimators=50, max_depth=6, random_state=seed)
433
+ ''',
434
+ }
@@ -0,0 +1,47 @@
1
+ """Persistence layer — metadata DB (SQLAlchemy/SQLite) + artifact storage.
2
+
3
+ `store/` sits beside the engine: it persists engine outputs but imports nothing
4
+ from `jobs`, `api`, or `cli` (enforced by import-linter). The rest of the app
5
+ talks to it only through the repositories and the :class:`ArtifactStore`.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .artifacts import ArtifactStore, LocalArtifactStore
11
+ from .db import Database, init_database, utcnow_iso
12
+ from .models import (
13
+ ArtifactRow,
14
+ Base,
15
+ DatasetRow,
16
+ GenerationRunRow,
17
+ PluginRow,
18
+ ReportRow,
19
+ SpecRow,
20
+ )
21
+ from .repositories import (
22
+ ArtifactRepository,
23
+ DatasetRepository,
24
+ ReportRepository,
25
+ RunRepository,
26
+ SpecRepository,
27
+ )
28
+
29
# Public API of the package, grouped by the submodule each name is imported from.
__all__ = [
    # .db
    "Database",
    "init_database",
    "utcnow_iso",
    # .models
    "Base",
    "DatasetRow",
    "SpecRow",
    "GenerationRunRow",
    "ArtifactRow",
    "ReportRow",
    "PluginRow",
    # .repositories
    "DatasetRepository",
    "SpecRepository",
    "RunRepository",
    "ArtifactRepository",
    "ReportRepository",
    # .artifacts
    "ArtifactStore",
    "LocalArtifactStore",
]
@@ -0,0 +1,67 @@
1
+ """Artifact storage adapters (03 §3.6, 06 §3.4).
2
+
3
+ Local filesystem by default: ``<artifacts_dir>/<dataset_id>/<run_id>/...``.
4
+ The interface keeps the rest of the app storage-agnostic so an S3 adapter can
5
+ drop in for team mode without touching callers.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import shutil
11
+ from abc import ABC, abstractmethod
12
+ from pathlib import Path
13
+
14
+
15
class ArtifactStore(ABC):
    """Abstract home for a run's output files; callers treat URIs as opaque."""

    @abstractmethod
    def run_dir(self, dataset_id: str, run_id: str) -> Path:
        """Return the artifact directory of a run, creating it when missing."""

    @abstractmethod
    def to_uri(self, path: Path) -> str:
        """Return the stable storage URI that is recorded in the Artifact row."""

    @abstractmethod
    def open_uri(self, uri: str) -> Path:
        """Map a stored URI back to a readable local path (used for downloads)."""

    @abstractmethod
    def remove_dataset(self, dataset_id: str) -> None:
        """Remove every artifact belonging to a dataset (cascade delete)."""

    @abstractmethod
    def remove_run(self, dataset_id: str, run_id: str) -> None:
        """Remove every artifact belonging to one run."""
37
+
38
+
39
class LocalArtifactStore(ArtifactStore):
    """Filesystem store laid out as ``<root>/<dataset_id>/<run_id>/...``.

    Dataset and run ids are used verbatim as directory names, so each must be
    a single, non-empty path component. Anything else raises ``ValueError``
    instead of silently addressing the root, a parent directory or an
    unrelated absolute path.
    """

    def __init__(self, root: Path) -> None:
        self.root = Path(root)

    @staticmethod
    def _segment(value: str) -> str:
        """Return ``value`` if it is usable as one directory name, else raise."""
        # An empty id would address the parent level (``root / ""`` is ``root``),
        # and ``..`` or a value containing separators would escape it.
        if not value or value in {".", ".."} or Path(value).name != value:
            raise ValueError(f"invalid artifact path segment: {value!r}")
        return value

    def run_dir(self, dataset_id: str, run_id: str) -> Path:
        """Return the directory for a run's artifacts, creating it if needed."""
        d = self.root / self._segment(dataset_id) / self._segment(run_id)
        d.mkdir(parents=True, exist_ok=True)
        return d

    def to_uri(self, path: Path) -> str:
        """Return ``file:<path relative to the root>`` for ``path``.

        Raises ``ValueError`` (from ``Path.relative_to``) when ``path`` is not
        located under the root.
        """
        # Record paths relative to the artifact root so the DB stays portable if
        # the root moves; downloads resolve back through `open_uri`.
        rel = Path(path).resolve().relative_to(self.root.resolve())
        return f"file:{rel.as_posix()}"

    def open_uri(self, uri: str) -> Path:
        """Resolve a ``file:`` URI against the root; return any other URI as a path."""
        if uri.startswith("file:"):
            return (self.root / uri[len("file:") :]).resolve()
        return Path(uri)

    def remove_dataset(self, dataset_id: str) -> None:
        """Delete the dataset's directory tree, ignoring removal errors."""
        d = self.root / self._segment(dataset_id)
        if d.exists():
            shutil.rmtree(d, ignore_errors=True)

    def remove_run(self, dataset_id: str, run_id: str) -> None:
        """Delete one run's directory tree, ignoring removal errors."""
        d = self.root / self._segment(dataset_id) / self._segment(run_id)
        if d.exists():
            shutil.rmtree(d, ignore_errors=True)
datadoom/store/db.py ADDED
@@ -0,0 +1,104 @@
1
+ """Database engine/session management (07 §4-5).
2
+
3
+ SQLite by default with the pragmas doc 07 mandates (WAL, FK on, NORMAL sync).
4
+ On startup we run ``alembic upgrade head`` against the on-disk DB so users never
5
+ run migrations by hand. For in-memory/test databases (where Alembic's separate
6
+ connection cannot see a ``:memory:`` schema) we fall back to ``create_all`` —
7
+ the migration is asserted to match the models by a dedicated test.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import datetime as _dt
13
+ from collections.abc import Iterator
14
+ from contextlib import contextmanager
15
+ from pathlib import Path
16
+
17
+ from sqlalchemy import Engine, create_engine, event
18
+ from sqlalchemy.orm import Session, sessionmaker
19
+
20
+ from .models import Base
21
+
22
+
23
+ def utcnow_iso() -> str:
24
+ """Current time as an ISO-8601 UTC string (the on-disk timestamp format)."""
25
+ return _dt.datetime.now(_dt.UTC).replace(microsecond=0).isoformat()
26
+
27
+
28
def _is_sqlite(url: str) -> bool:
    """Return whether ``url`` begins with ``sqlite`` (plain or driver-qualified)."""
    prefix = "sqlite"
    return url[: len(prefix)] == prefix
30
+
31
+
32
def _is_memory(url: str) -> bool:
    """Return whether ``url`` names an in-memory SQLite database.

    True when the URL contains ``:memory:`` or is a SQLite URL with nothing
    after ``://`` (``sqlite://``), including driver-qualified spellings such as
    ``sqlite+pysqlite://``.
    """
    if ":memory:" in url:
        return True
    scheme, sep, rest = url.partition("://")
    # Drop an optional "+driver" suffix before comparing the dialect name.
    dialect = scheme.split("+", 1)[0]
    return bool(sep) and rest == "" and dialect == "sqlite"
34
+
35
+
36
def _install_sqlite_pragmas(engine: Engine) -> None:
    """Register a ``connect`` listener on ``engine`` that issues the SQLite pragmas.

    Each new DBAPI connection runs, in order: WAL journal mode, foreign keys
    on, and NORMAL synchronous mode.
    """
    statements = (
        "PRAGMA journal_mode=WAL",
        "PRAGMA foreign_keys=ON",
        "PRAGMA synchronous=NORMAL",
    )

    @event.listens_for(engine, "connect")
    def _set_pragmas(dbapi_conn, _record):  # noqa: ANN001
        cursor = dbapi_conn.cursor()
        for statement in statements:
            cursor.execute(statement)
        cursor.close()
44
+
45
+
46
class Database:
    """Holds the SQLAlchemy engine and session factory for a single DB URL."""

    def __init__(self, url: str) -> None:
        self.url = url
        sqlite = _is_sqlite(url)
        # SQLite connections are created with same-thread checking disabled.
        connect_args = {"check_same_thread": False} if sqlite else {}
        self.engine: Engine = create_engine(url, future=True, connect_args=connect_args)
        if sqlite:
            _install_sqlite_pragmas(self.engine)
        self._session_factory = sessionmaker(self.engine, expire_on_commit=False, future=True)

    @contextmanager
    def session(self) -> Iterator[Session]:
        """Yield a session; commit when the block succeeds, roll back and re-raise on error.

        The session is always closed on exit.
        """
        active = self._session_factory()
        try:
            yield active
            active.commit()
        except Exception:
            active.rollback()
            raise
        finally:
            active.close()

    def create_all(self) -> None:
        """Build the schema straight from the ORM metadata (test/in-memory path)."""
        Base.metadata.create_all(self.engine)

    def dispose(self) -> None:
        """Dispose of the underlying engine."""
        self.engine.dispose()
76
+
77
+
78
def _alembic_config(url: str):  # noqa: ANN202
    """Build an in-code Alembic ``Config`` pointing at the bundled migrations and ``url``.

    Alembic stores main options in a ``ConfigParser`` with ``%`` interpolation,
    so literal percent signs (for instance a percent-encoded password in the
    URL) are doubled here; reading the option back yields the original text.
    """
    from alembic.config import Config

    def _escape(value: str) -> str:
        # "%" starts an interpolation token in ConfigParser; "%%" is a literal "%".
        return value.replace("%", "%%")

    migrations_dir = Path(__file__).parent / "migrations"
    cfg = Config()
    cfg.set_main_option("script_location", _escape(str(migrations_dir)))
    cfg.set_main_option("sqlalchemy.url", _escape(url))
    return cfg
86
+
87
+
88
def init_database(url: str) -> Database:
    """Open the DB and bring its schema up to head (Alembic), creating dirs.

    For an on-disk SQLite URL the parent directory of the database file is
    created first. In-memory databases get their schema from ``create_all``
    instead of Alembic. If the migration fails, the engine is disposed before
    the error is re-raised.

    Args:
        url: SQLAlchemy database URL.

    Returns:
        The ready-to-use :class:`Database`.
    """
    in_memory = _is_memory(url)
    if _is_sqlite(url) and not in_memory:
        # Parse the URL rather than stripping a literal "sqlite:///" prefix, so
        # driver-qualified URLs ("sqlite+pysqlite:///...") yield the real file path.
        from sqlalchemy.engine import make_url

        database = make_url(url).database
        # SQLite "file:" URIs are not plain filesystem paths; leave them alone.
        if database and not database.startswith("file:"):
            Path(database).parent.mkdir(parents=True, exist_ok=True)

    db = Database(url)
    if in_memory:
        # Alembic uses its own connection; it can't see a private :memory: schema.
        db.create_all()
        return db

    from alembic import command

    try:
        command.upgrade(_alembic_config(url), "head")
    except Exception:
        # Do not leak the engine's connection pool when the migration fails.
        db.dispose()
        raise
    return db
File without changes