datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,434 @@
|
|
|
1
|
+
"""Plugin authoring workflow: scaffolder + contract checker (09 §9, 17 step 17).
|
|
2
|
+
|
|
3
|
+
``scaffold_plugin`` writes a ready-to-publish ``datadoom-plugin-*`` package (entry
|
|
4
|
+
point, base-class stub, contract test, README) so a contributor starts from a
|
|
5
|
+
working, deterministic plugin. ``check_object`` / ``check_plugin`` run the plugin
|
|
6
|
+
contract tests (13 §5): interface completeness, schema validity, determinism, and
|
|
7
|
+
a static RNG-hygiene scan — the same checks that gate the ecosystem.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import importlib.util
|
|
13
|
+
import inspect
|
|
14
|
+
import io
|
|
15
|
+
import re
|
|
16
|
+
import sys
|
|
17
|
+
import tempfile
|
|
18
|
+
import tokenize
|
|
19
|
+
from dataclasses import dataclass, field
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
import numpy as np
|
|
24
|
+
import pandas as pd
|
|
25
|
+
|
|
26
|
+
from .contracts import KEY_ATTR, PLUGIN_BASES
|
|
27
|
+
from .registry import PluginError, resolve_kind, resolve_kind_class, validate_param_schema
|
|
28
|
+
|
|
29
|
+
# stdlib/global sources of non-reproducible randomness, banned in the data path (09 §5).
# The alternatives are, in order: the `np.random.` / `numpy.random.` namespace; a
# bare `random.` (the lookbehind rejects a match that is preceded by a word
# character or a dot, i.e. an attribute such as `obj.random.`); `uuid4`;
# `time.time`; `datetime.now`; and anything accessed through `secrets.`.
_BANNED_RNG = re.compile(
    r"\b(np|numpy)\.random\.|(?<![\w.])random\.|\buuid4\b|\btime\.time\b|"
    r"\bdatetime\.now\b|\bsecrets\.",
)
|
|
34
|
+
|
|
35
|
+
# Plugin kind -> suffix appended to the CamelCased plugin name to form the name of
# the generated stub class (e.g. kind "distribution" + name "weibull" yields
# "WeibullDistribution").
_KIND_SUFFIX = {
    "distribution": "Distribution",
    "structural_fn": "StructuralFn",
    "failure_mode": "FailureMode",
    "exporter": "Exporter",
    "probe_model": "ProbeModel",
}
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
# --- contract checking ---------------------------------------------------------------
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
@dataclass
class ObjectCheck:
    """Outcome of the plugin contract checks for one plugin instance.

    Each executed check is stored as a ``(check, status, detail)`` tuple in
    ``results``, in the order it was recorded.
    """

    name: str
    kind: str | None
    results: list[tuple[str, str, str]] = field(default_factory=list)  # (check, status, detail)

    @property
    def ok(self) -> bool:
        """True unless at least one recorded check has the status ``"fail"``."""
        return not any(entry[1] == "fail" for entry in self.results)

    def add(self, check: str, status: str, detail: str = "") -> None:
        """Record one check outcome; ``detail`` is optional free text."""
        self.results.append((check, status, detail))

    def summary(self) -> str:
        """Render a heading line followed by one line per recorded check."""
        rendered = [f"{self.name} ({self.kind or 'unknown kind'})"]
        for check, status, detail in self.results:
            # The detail is only appended when it is non-empty.
            suffix = f" - {detail}" if detail else ""
            rendered.append(f" [{status.upper():4}] {check}" + suffix)
        return "\n".join(rendered)
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def check_object(obj: object) -> ObjectCheck:
    """Run the interface, schema, determinism and RNG-hygiene checks on ``obj``.

    Returns an :class:`ObjectCheck` holding one entry per check. When the kind of
    ``obj`` cannot be resolved, only a failing ``interface`` entry is recorded and
    the remaining checks are not run.
    """
    kind = resolve_kind(obj)
    report = ObjectCheck(name=type(obj).__name__, kind=kind)
    if kind is None:
        report.add("interface", "fail", "does not subclass a known plugin base")
        return report

    # Interface: the kind-specific key attribute must be a non-empty string.
    key_attr = KEY_ATTR[kind]
    key = getattr(obj, key_attr, None)
    if isinstance(key, str) and key:
        report.add("interface", "pass", f"{key_attr}={key!r}")
    else:
        report.add("interface", "fail", f"missing non-empty '{key_attr}'")

    # Schema: optional; validated only when the plugin declares one.
    schema = getattr(obj, "param_schema", None)
    if schema is not None:
        try:
            validate_param_schema(schema)
        except PluginError as exc:
            report.add("schema", "fail", str(exc))
        else:
            report.add("schema", "pass", "valid JSON-schema fragment")
    else:
        report.add("schema", "skip", "no param_schema (uses native UI controls)")

    _check_determinism(obj, kind, report)
    _check_rng_hygiene(obj, report)
    return report
|
|
99
|
+
|
|
100
|
+
|
|
101
|
+
def _check_determinism(obj: object, kind: str, report: ObjectCheck) -> None:
    """Record a ``determinism`` entry on ``report`` for the plugin ``obj``.

    Distributions are sampled twice from identically seeded generators and the
    draws compared; exporters write the same small frame twice and the bytes are
    compared. Every other kind is recorded as skipped. Any exception raised while
    checking is recorded as a failure rather than propagated.
    """
    try:
        if kind == "distribution":
            params = getattr(obj, "example_params", None)
            if not isinstance(params, dict):
                report.add("determinism", "skip", "set `example_params` for an auto-check")
                return
            # Two independent generators with the same seed must yield equal draws.
            first, second = (
                obj.sample(np.random.default_rng(0), 256, params)  # type: ignore[attr-defined]
                for _ in range(2)
            )
            same = np.array_equal(np.asarray(first), np.asarray(second))
            status = "pass" if same else "fail"
            report.add("determinism", status, "256 draws, two seeded RNGs")
        elif kind == "exporter":
            frame = pd.DataFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]})
            with tempfile.TemporaryDirectory() as tmp:
                base = Path(tmp)
                targets = (base / "1", base / "2")
                for target in targets:
                    obj.write(frame, target)  # type: ignore[attr-defined]
                same = targets[0].read_bytes() == targets[1].read_bytes()
            status = "pass" if same else "fail"
            report.add("determinism", status, "byte-stable on two writes")
        else:
            report.add(
                "determinism", "skip", f"{kind} determinism is covered by its engine tests"
            )
    except Exception as exc:  # noqa: BLE001
        report.add("determinism", "fail", f"raised {type(exc).__name__}: {exc}")
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def _strip_noncode(source: str) -> str:
    """Blank out comments and string literals while preserving the layout.

    Every character of a comment or string token is replaced by a space (newlines
    are kept), so the RNG-hygiene scan never trips on the word "random" in a
    docstring or comment. The result has the same length and line structure as
    ``source``.

    On interpreters that tokenize f-strings (and template strings) into separate
    pieces, the literal text pieces are blanked as well, while the embedded
    expressions are left in place so that they are still scanned.

    If ``source`` cannot be tokenized it is returned unchanged.
    """
    # Token types that carry literal/comment text rather than code. The *_MIDDLE
    # types only exist on newer interpreters, hence the getattr lookups.
    noncode = {tokenize.COMMENT, tokenize.STRING}
    for optional in ("FSTRING_MIDDLE", "TSTRING_MIDDLE"):
        tok_type = getattr(tokenize, optional, None)
        if tok_type is not None:
            noncode.add(tok_type)

    # Split the text exactly the way the tokenizer's readline does, so that the
    # row numbers reported by the tokens index into `lines` (str.splitlines also
    # breaks on form feeds and other separators, which would shift the rows).
    lines = [list(line) for line in io.StringIO(source).readlines()]
    try:
        for tok in tokenize.generate_tokens(io.StringIO(source).readline):
            if tok.type not in noncode:
                continue
            (sr, sc), (er, ec) = tok.start, tok.end
            for r in range(sr, min(er, len(lines)) + 1):
                row = lines[r - 1]
                # Only the first/last row of a multi-line token is partially covered.
                c0 = sc if r == sr else 0
                c1 = ec if r == er else len(row)
                for c in range(c0, min(c1, len(row))):
                    if row[c] != "\n":
                        row[c] = " "
    except (tokenize.TokenError, IndentationError):
        return source
    return "".join("".join(row) for row in lines)
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
def _check_rng_hygiene(obj: object, report: ObjectCheck) -> None:
    """Record an ``rng_hygiene`` entry based on a static scan of the class source.

    The source of ``type(obj)`` is stripped of comments and string literals and
    searched for banned randomness sources. When the source cannot be retrieved
    the check is recorded as skipped.
    """
    try:
        source = inspect.getsource(type(obj))
    except (OSError, TypeError):
        report.add("rng_hygiene", "skip", "source unavailable")
        return

    code_only = _strip_noncode(source)
    # De-duplicate and sort so the message is stable across runs.
    banned = sorted({match.group(0) for match in _BANNED_RNG.finditer(code_only)})
    if not banned:
        report.add("rng_hygiene", "pass", "uses only the injected rng")
        return
    report.add(
        "rng_hygiene", "fail", "use the injected rng only; found " + ", ".join(banned)
    )
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def check_plugin(target: str | Path) -> list[ObjectCheck]:
    """Check every plugin class defined by a ``.py`` file or found under a directory.

    A directory is searched recursively for ``*.py`` files; files located in a
    ``build``, ``dist``, ``.venv``, ``tests`` or ``__pycache__`` directory *below
    the target* are ignored. Each remaining file is imported in isolation and every
    plugin class it defines itself (not merely imports) is instantiated without
    arguments and checked.

    Returns one :class:`ObjectCheck` per plugin class. A class that cannot be
    instantiated yields a report with a failing ``interface`` entry.

    Raises ``PluginError`` when ``target`` is neither a ``.py`` file nor a
    directory, when a file fails to import, or when no plugin class is found.
    """
    path = Path(target)
    if path.suffix == ".py" and path.is_file():
        files = [path]
    elif path.is_dir():
        skipped = {"build", "dist", ".venv", "tests", "__pycache__"}
        # Only the components below the target are considered: the target itself
        # may legitimately live under a directory called e.g. "build" or "tests".
        depth = len(path.parts)
        files = [
            p
            for p in sorted(path.rglob("*.py"))
            if skipped.isdisjoint(p.parts[depth:])
        ]
    else:
        raise PluginError(f"nothing to check at {target!r} (expected a .py file or directory)")

    reports: list[ObjectCheck] = []
    seen: set[str] = set()
    for file in files:
        module = _import_file(file)
        for _, member in inspect.getmembers(module, inspect.isclass):
            # Skip classes that were only imported into the module.
            if member.__module__ != module.__name__:
                continue
            if resolve_kind_class(member) is None:
                continue
            # A qualified name is checked once, even if several files define it.
            if member.__qualname__ in seen:
                continue
            seen.add(member.__qualname__)
            try:
                instance = member()
            except Exception as exc:  # noqa: BLE001
                bad = ObjectCheck(name=member.__name__, kind=resolve_kind_class(member))
                bad.add("interface", "fail", f"could not instantiate: {exc}")
                reports.append(bad)
                continue
            reports.append(check_object(instance))
    if not reports:
        raise PluginError(f"found no plugin classes under {target!r}")
    return reports
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def _import_file(path: Path) -> Any:
    """Import the Python file at ``path`` under a synthetic module name.

    The module is registered in ``sys.modules`` before it is executed and is
    removed again if execution fails. Raises ``PluginError`` when no import spec
    can be built for the file or when executing it raises.
    """
    synthetic = f"datadoom_check_{abs(hash(str(path)))}"
    spec = importlib.util.spec_from_file_location(synthetic, path)
    loader = None if spec is None else spec.loader
    if loader is None:
        raise PluginError(f"could not import {path}")

    loaded = importlib.util.module_from_spec(spec)
    sys.modules[synthetic] = loaded
    try:
        loader.exec_module(loaded)
    except Exception as exc:  # noqa: BLE001
        # Do not leave a half-initialised module registered.
        sys.modules.pop(synthetic, None)
        raise PluginError(f"{path.name} failed to import: {exc}") from exc
    return loaded
|
|
220
|
+
|
|
221
|
+
|
|
222
|
+
# --- scaffolding ---------------------------------------------------------------------
|
|
223
|
+
|
|
224
|
+
|
|
225
|
+
def _camel(name: str) -> str:
    """Convert a dash/underscore/whitespace separated name to CamelCase.

    Only the first character of each word is upper-cased; the remaining
    characters are kept as they are.
    """
    words = filter(None, re.split(r"[-_\s]+", name.strip()))
    return "".join(word[0].upper() + word[1:] for word in words)
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def scaffold_plugin(kind: str, name: str, dest: str | Path = ".") -> Path:
    """Write a ``datadoom-plugin-<name>`` package skeleton; return its root directory.

    The skeleton consists of ``pyproject.toml``, ``src/<module>/__init__.py``,
    ``tests/test_contract.py`` and ``README.md``, all written as UTF-8.

    Raises ``PluginError`` when ``kind`` is unknown, when ``name`` is not a
    lowercase identifier, or when the target directory already exists.
    """
    if kind not in PLUGIN_BASES:
        raise PluginError(f"unknown kind {kind!r}; choose one of {', '.join(PLUGIN_BASES)}")
    if re.fullmatch(r"[a-z][a-z0-9_]*", name) is None:
        raise PluginError(
            f"invalid plugin name {name!r}; use a lowercase identifier (e.g. 'weibull')"
        )

    # Derived names: importable module, distribution name and stub class name.
    module = f"datadoom_plugin_{name}"
    dist_name = f"datadoom-plugin-{name.replace('_', '-')}"
    class_name = _camel(name) + _KIND_SUFFIX[kind]

    root = Path(dest) / dist_name
    if root.exists():
        raise PluginError(f"{root} already exists")

    package_dir = root / "src" / module
    tests_dir = root / "tests"
    for directory in (package_dir, tests_dir):
        directory.mkdir(parents=True)

    # (target file, template, placeholder values), rendered and written in order.
    outputs = (
        (
            root / "pyproject.toml",
            _PYPROJECT,
            {"dist": dist_name, "module": module, "name": name, "cls": class_name},
        ),
        (package_dir / "__init__.py", _STUBS[kind], {"cls": class_name, "name": name}),
        (tests_dir / "test_contract.py", _TEST_STUB, {"module": module, "cls": class_name}),
        (
            root / "README.md",
            _README,
            {
                "dist": dist_name,
                "kind": kind,
                "name": name,
                "cls": class_name,
                "module": module,
            },
        ),
    )
    for target, template, values in outputs:
        target.write_text(template.format(**values), encoding="utf-8")
    return root
|
|
265
|
+
|
|
266
|
+
|
|
267
|
+
_PYPROJECT = '''\
|
|
268
|
+
[build-system]
|
|
269
|
+
requires = ["hatchling"]
|
|
270
|
+
build-backend = "hatchling.build"
|
|
271
|
+
|
|
272
|
+
[project]
|
|
273
|
+
name = "{dist}"
|
|
274
|
+
version = "0.1.0"
|
|
275
|
+
description = "A DataDoom plugin."
|
|
276
|
+
requires-python = ">=3.11"
|
|
277
|
+
dependencies = ["datadoom"]
|
|
278
|
+
|
|
279
|
+
# Discovered by DataDoom's plugin loader at startup (09 §3).
|
|
280
|
+
[project.entry-points."datadoom.plugins"]
|
|
281
|
+
{name} = "{module}:{cls}"
|
|
282
|
+
|
|
283
|
+
[tool.hatch.build.targets.wheel]
|
|
284
|
+
packages = ["src/{module}"]
|
|
285
|
+
'''
|
|
286
|
+
|
|
287
|
+
_TEST_STUB = '''\
|
|
288
|
+
"""Plugin contract test — runs the same checks as `datadoom plugin check`."""
|
|
289
|
+
|
|
290
|
+
from datadoom.plugins.scaffold import check_object
|
|
291
|
+
from {module} import {cls}
|
|
292
|
+
|
|
293
|
+
|
|
294
|
+
def test_contract() -> None:
|
|
295
|
+
report = check_object({cls}())
|
|
296
|
+
assert report.ok, "\\n" + report.summary()
|
|
297
|
+
'''
|
|
298
|
+
|
|
299
|
+
_README = """\
|
|
300
|
+
# {dist}
|
|
301
|
+
|
|
302
|
+
A DataDoom **{kind}** plugin contributing `{name}` ({cls}).
|
|
303
|
+
|
|
304
|
+
## Develop
|
|
305
|
+
|
|
306
|
+
```bash
|
|
307
|
+
pip install -e . # the plugin appears in `datadoom` and the web UI
|
|
308
|
+
datadoom plugin check . # run the contract tests (interface/schema/determinism/RNG)
|
|
309
|
+
pytest # the bundled contract test
|
|
310
|
+
```
|
|
311
|
+
|
|
312
|
+
Implement the method bodies in `src/{module}/__init__.py`, using **only** the
|
|
313
|
+
injected `rng` for randomness (stdlib `random`, `np.random.*` globals, `uuid4`,
|
|
314
|
+
`time` are banned — they break reproducibility and fail the contract check).
|
|
315
|
+
"""
|
|
316
|
+
|
|
317
|
+
|
|
318
|
+
_STUBS = {
|
|
319
|
+
"distribution": '''\
|
|
320
|
+
"""A DataDoom distribution plugin."""
|
|
321
|
+
|
|
322
|
+
from __future__ import annotations
|
|
323
|
+
|
|
324
|
+
import numpy as np
|
|
325
|
+
from datadoom.plugin import Distribution, schema
|
|
326
|
+
|
|
327
|
+
|
|
328
|
+
class {cls}(Distribution):
|
|
329
|
+
name = "{name}"
|
|
330
|
+
required_params = ("scale",)
|
|
331
|
+
# Rendered by the Canvas wherever a distribution is selectable (09 §6).
|
|
332
|
+
param_schema = schema({{"scale": {{"type": "number", "minimum": 0, "title": "Scale"}}}})
|
|
333
|
+
# Used by `datadoom plugin check` for the automated determinism check.
|
|
334
|
+
example_params = {{"scale": 2.0}}
|
|
335
|
+
|
|
336
|
+
def sample(self, rng, n, params):
|
|
337
|
+
# MUST use the injected rng (a numpy Generator) — never global random.
|
|
338
|
+
return params["scale"] * rng.standard_exponential(size=n)
|
|
339
|
+
|
|
340
|
+
def cdf(self, x, params): # enables KS compliance reporting
|
|
341
|
+
return 1.0 - np.exp(-np.asarray(x, dtype=float) / params["scale"])
|
|
342
|
+
''',
|
|
343
|
+
"structural_fn": '''\
|
|
344
|
+
"""A DataDoom structural-function plugin (a causal/SEM edge)."""
|
|
345
|
+
|
|
346
|
+
from __future__ import annotations
|
|
347
|
+
|
|
348
|
+
import numpy as np
|
|
349
|
+
from datadoom.plugin import StructuralFn, schema
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
class {cls}(StructuralFn):
|
|
353
|
+
name = "{name}"
|
|
354
|
+
# Structural fns read the fixed CausalEdge fields (weight/bias/coeffs/mapping);
|
|
355
|
+
# here `weight` is the slope and `bias` is the saturation cap.
|
|
356
|
+
param_schema = schema({{
|
|
357
|
+
"weight": {{"type": "number", "title": "Weight (slope)"}},
|
|
358
|
+
"bias": {{"type": "number", "title": "Saturation cap"}},
|
|
359
|
+
}})
|
|
360
|
+
|
|
361
|
+
def contribution(self, parent, edge):
|
|
362
|
+
weight = edge.weight if edge.weight is not None else 1.0
|
|
363
|
+
cap = edge.bias if edge.bias is not None else float("inf")
|
|
364
|
+
return np.minimum(weight * np.asarray(parent, dtype=float), cap)
|
|
365
|
+
''',
|
|
366
|
+
"failure_mode": '''\
|
|
367
|
+
"""A DataDoom failure-mode plugin (a corruption transform)."""
|
|
368
|
+
|
|
369
|
+
from __future__ import annotations
|
|
370
|
+
|
|
371
|
+
import numpy as np
|
|
372
|
+
from datadoom.plugin import FailureMode, schema
|
|
373
|
+
|
|
374
|
+
|
|
375
|
+
class {cls}(FailureMode):
|
|
376
|
+
name = "{name}"
|
|
377
|
+
param_schema = schema({{
|
|
378
|
+
"column": {{"type": "string", "title": "Column"}},
|
|
379
|
+
"rate": {{"type": "number", "minimum": 0, "maximum": 1, "title": "Rate"}},
|
|
380
|
+
}})
|
|
381
|
+
|
|
382
|
+
def apply(self, rng, frame, params, features):
|
|
383
|
+
col = params["column"]
|
|
384
|
+
rate = float(params.get("rate", 0.1))
|
|
385
|
+
mask = rng.random(size=len(frame)) < rate
|
|
386
|
+
frame.loc[mask, col] = np.nan # corrupt the working (injected) copy in place
|
|
387
|
+
return {{"column": col, "nulled_fraction": float(mask.mean())}}
|
|
388
|
+
''',
|
|
389
|
+
"exporter": '''\
|
|
390
|
+
"""A DataDoom exporter plugin (an output format)."""
|
|
391
|
+
|
|
392
|
+
from __future__ import annotations
|
|
393
|
+
|
|
394
|
+
from pathlib import Path
|
|
395
|
+
|
|
396
|
+
from datadoom.engine.export.checksums import sha256_bytes
|
|
397
|
+
from datadoom.plugin import Exporter
|
|
398
|
+
from datadoom.engine.export.base import ArtifactInfo
|
|
399
|
+
|
|
400
|
+
|
|
401
|
+
class {cls}(Exporter):
|
|
402
|
+
format = "{name}"
|
|
403
|
+
|
|
404
|
+
def write(self, df, path):
|
|
405
|
+
path = Path(path)
|
|
406
|
+
# Write deterministically — no timestamps/ambient state (invariant #6).
|
|
407
|
+
payload = df.to_json(orient="records", indent=2).encode("utf-8")
|
|
408
|
+
path.write_bytes(payload)
|
|
409
|
+
return ArtifactInfo(
|
|
410
|
+
path=str(path),
|
|
411
|
+
format=self.format,
|
|
412
|
+
checksum_sha256=sha256_bytes(payload),
|
|
413
|
+
size_bytes=len(payload),
|
|
414
|
+
)
|
|
415
|
+
''',
|
|
416
|
+
"probe_model": '''\
|
|
417
|
+
"""A DataDoom probe-model plugin (a difficulty baseline)."""
|
|
418
|
+
|
|
419
|
+
from __future__ import annotations
|
|
420
|
+
|
|
421
|
+
from datadoom.plugin import ProbeModel
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
class {cls}(ProbeModel):
|
|
425
|
+
name = "{name}"
|
|
426
|
+
|
|
427
|
+
def estimator(self, seed):
|
|
428
|
+
# Return a fresh scikit-learn classifier exposing predict_proba; seed any
|
|
429
|
+
# randomness so the probe metric is reproducible (it drives calibration).
|
|
430
|
+
from sklearn.ensemble import RandomForestClassifier
|
|
431
|
+
|
|
432
|
+
return RandomForestClassifier(n_estimators=50, max_depth=6, random_state=seed)
|
|
433
|
+
''',
|
|
434
|
+
}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
"""Persistence layer — metadata DB (SQLAlchemy/SQLite) + artifact storage.
|
|
2
|
+
|
|
3
|
+
`store/` sits beside the engine: it persists engine outputs but imports nothing
|
|
4
|
+
from `jobs`, `api`, or `cli` (enforced by import-linter). The rest of the app
|
|
5
|
+
talks to it only through the repositories and the :class:`ArtifactStore`.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .artifacts import ArtifactStore, LocalArtifactStore
|
|
11
|
+
from .db import Database, init_database, utcnow_iso
|
|
12
|
+
from .models import (
|
|
13
|
+
ArtifactRow,
|
|
14
|
+
Base,
|
|
15
|
+
DatasetRow,
|
|
16
|
+
GenerationRunRow,
|
|
17
|
+
PluginRow,
|
|
18
|
+
ReportRow,
|
|
19
|
+
SpecRow,
|
|
20
|
+
)
|
|
21
|
+
from .repositories import (
|
|
22
|
+
ArtifactRepository,
|
|
23
|
+
DatasetRepository,
|
|
24
|
+
ReportRepository,
|
|
25
|
+
RunRepository,
|
|
26
|
+
SpecRepository,
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
# Public API of the package: every name re-exported by the imports above, grouped
# as database helpers, ORM rows, repositories and artifact stores.
__all__ = [
    "Database",
    "init_database",
    "utcnow_iso",
    "Base",
    "DatasetRow",
    "SpecRow",
    "GenerationRunRow",
    "ArtifactRow",
    "ReportRow",
    "PluginRow",
    "DatasetRepository",
    "SpecRepository",
    "RunRepository",
    "ArtifactRepository",
    "ReportRepository",
    "ArtifactStore",
    "LocalArtifactStore",
]
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""Artifact storage adapters (03 §3.6, 06 §3.4).
|
|
2
|
+
|
|
3
|
+
Local filesystem by default: ``<artifacts_dir>/<dataset_id>/<run_id>/...``.
|
|
4
|
+
The interface keeps the rest of the app storage-agnostic so an S3 adapter can
|
|
5
|
+
drop in for team mode without touching callers.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import shutil
|
|
11
|
+
from abc import ABC, abstractmethod
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class ArtifactStore(ABC):
    """Abstract home of a run's output files; callers treat the URIs as opaque."""

    @abstractmethod
    def run_dir(self, dataset_id: str, run_id: str) -> Path:
        """Return the artifact directory of a run, creating it when missing."""

    @abstractmethod
    def to_uri(self, path: Path) -> str:
        """Return the stable storage URI that is recorded in the Artifact row."""

    @abstractmethod
    def open_uri(self, uri: str) -> Path:
        """Turn a stored URI back into a readable local path (used for downloads)."""

    @abstractmethod
    def remove_run(self, dataset_id: str, run_id: str) -> None:
        """Delete every artifact that belongs to a single run."""

    @abstractmethod
    def remove_dataset(self, dataset_id: str) -> None:
        """Delete every artifact of a dataset (cascade delete)."""
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class LocalArtifactStore(ArtifactStore):
    """Artifact store on the local filesystem: ``<root>/<dataset_id>/<run_id>/``.

    Dataset and run ids are used as directory names and therefore have to be a
    single, non-empty path component. Anything else raises ``ValueError``, because
    an empty id or ``..`` would otherwise address the artifact root itself or a
    directory outside of it (which the remove methods would then delete).
    """

    def __init__(self, root: Path) -> None:
        self.root = Path(root)

    @staticmethod
    def _segment(value: str, label: str) -> str:
        """Return ``value`` if it is usable as one directory name, else raise ValueError."""
        if not value or value in {".", ".."} or "/" in value or "\\" in value:
            raise ValueError(f"invalid {label} {value!r}: expected a single path component")
        return value

    def run_dir(self, dataset_id: str, run_id: str) -> Path:
        """Return the directory for a run's artifacts, creating it if needed."""
        d = self.root / self._segment(dataset_id, "dataset_id") / self._segment(run_id, "run_id")
        d.mkdir(parents=True, exist_ok=True)
        return d

    def to_uri(self, path: Path) -> str:
        """Return ``file:<posix path relative to the root>`` for ``path``.

        Raises ``ValueError`` (from ``Path.relative_to``) when ``path`` is not
        located under the artifact root.
        """
        # Record paths relative to the artifact root so the DB stays portable if
        # the root moves; downloads resolve back through `open_uri`.
        rel = Path(path).resolve().relative_to(self.root.resolve())
        return f"file:{rel.as_posix()}"

    def open_uri(self, uri: str) -> Path:
        """Resolve a stored URI to a local path.

        ``file:`` URIs are interpreted relative to the artifact root; an absolute
        path or a ``..`` component inside such a URI raises ``ValueError``. Any
        other string is returned as a plain ``Path``.
        """
        if uri.startswith("file:"):
            rel = Path(uri[len("file:") :])
            # Joining an absolute path would discard the root; ".." would leave it.
            if rel.is_absolute() or ".." in rel.parts:
                raise ValueError(f"artifact URI escapes the artifact root: {uri!r}")
            return (self.root / rel).resolve()
        return Path(uri)

    def remove_dataset(self, dataset_id: str) -> None:
        """Delete the dataset's directory tree, if present (best effort)."""
        d = self.root / self._segment(dataset_id, "dataset_id")
        if d.exists():
            shutil.rmtree(d, ignore_errors=True)

    def remove_run(self, dataset_id: str, run_id: str) -> None:
        """Delete the run's directory tree, if present (best effort)."""
        d = self.root / self._segment(dataset_id, "dataset_id") / self._segment(run_id, "run_id")
        if d.exists():
            shutil.rmtree(d, ignore_errors=True)
|
datadoom/store/db.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
"""Database engine/session management (07 §4-5).
|
|
2
|
+
|
|
3
|
+
SQLite by default with the pragmas doc 07 mandates (WAL, FK on, NORMAL sync).
|
|
4
|
+
On startup we run ``alembic upgrade head`` against the on-disk DB so users never
|
|
5
|
+
run migrations by hand. For in-memory/test databases (where Alembic's separate
|
|
6
|
+
connection cannot see a ``:memory:`` schema) we fall back to ``create_all`` —
|
|
7
|
+
the migration is asserted to match the models by a dedicated test.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import datetime as _dt
|
|
13
|
+
from collections.abc import Iterator
|
|
14
|
+
from contextlib import contextmanager
|
|
15
|
+
from pathlib import Path
|
|
16
|
+
|
|
17
|
+
from sqlalchemy import Engine, create_engine, event
|
|
18
|
+
from sqlalchemy.orm import Session, sessionmaker
|
|
19
|
+
|
|
20
|
+
from .models import Base
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def utcnow_iso() -> str:
    """Return the current UTC time as an ISO-8601 string with second precision.

    This is the timestamp format used on disk.
    """
    now = _dt.datetime.now(tz=_dt.timezone.utc)
    return now.replace(microsecond=0).isoformat()
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _is_sqlite(url: str) -> bool:
    """Tell whether ``url`` begins with the ``sqlite`` scheme prefix."""
    prefix = "sqlite"
    return url[: len(prefix)] == prefix
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _is_memory(url: str) -> bool:
    """Tell whether ``url`` denotes an in-memory database.

    That is the case for the bare ``sqlite://`` URL and for any URL that contains
    ``:memory:``.
    """
    if url == "sqlite://":
        return True
    return ":memory:" in url
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _install_sqlite_pragmas(engine: Engine) -> None:
    """Register a ``connect`` listener on ``engine`` that issues the SQLite pragmas.

    For every new DB-API connection the listener executes, in this order,
    ``journal_mode=WAL``, ``foreign_keys=ON`` and ``synchronous=NORMAL``.
    """
    statements = (
        "PRAGMA journal_mode=WAL",
        "PRAGMA foreign_keys=ON",
        "PRAGMA synchronous=NORMAL",
    )

    @event.listens_for(engine, "connect")
    def _apply_pragmas(dbapi_conn, _record):  # noqa: ANN001
        cursor = dbapi_conn.cursor()
        for statement in statements:
            cursor.execute(statement)
        cursor.close()
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
class Database:
    """Holds the SQLAlchemy engine and the session factory for a single DB URL."""

    def __init__(self, url: str) -> None:
        self.url = url
        sqlite = _is_sqlite(url)
        # SQLite connections are allowed to be used from more than one thread.
        connect_args = {}
        if sqlite:
            connect_args["check_same_thread"] = False
        self.engine: Engine = create_engine(url, future=True, connect_args=connect_args)
        if sqlite:
            _install_sqlite_pragmas(self.engine)
        self._session_factory = sessionmaker(self.engine, expire_on_commit=False, future=True)

    @contextmanager
    def session(self) -> Iterator[Session]:
        """Yield a session; commit when the block succeeds, roll back when it raises.

        The session is closed in either case and the exception is re-raised.
        """
        active = self._session_factory()
        try:
            yield active
            active.commit()
        except Exception:
            active.rollback()
            raise
        finally:
            active.close()

    def create_all(self) -> None:
        """Build the schema straight from the ORM metadata (test/in-memory path)."""
        Base.metadata.create_all(self.engine)

    def dispose(self) -> None:
        """Dispose of the underlying engine."""
        self.engine.dispose()
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _alembic_config(url: str):  # noqa: ANN202
    """Build an in-code Alembic ``Config`` for the bundled migrations and ``url``.

    The script location is the ``migrations`` directory next to this file. Percent
    signs in both values are doubled, because the options are stored in a
    ConfigParser that treats ``%`` as the start of an interpolation and rejects a
    raw one (e.g. in a URL-encoded password or an unusual directory name).
    """
    from alembic.config import Config

    def _escape(value: str) -> str:
        # "%%" is read back as a literal "%" by the config parser.
        return value.replace("%", "%%")

    migrations_dir = Path(__file__).parent / "migrations"
    cfg = Config()
    cfg.set_main_option("script_location", _escape(str(migrations_dir)))
    cfg.set_main_option("sqlalchemy.url", _escape(url))
    return cfg
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def init_database(url: str) -> Database:
    """Open the DB and bring its schema up to head (Alembic), creating dirs.

    For a file-backed SQLite URL the parent directory of the database file is
    created first. In-memory databases get their schema from ``create_all``; every
    other database is upgraded with ``alembic upgrade head``.

    Returns the ready-to-use :class:`Database`.
    """
    if _is_sqlite(url) and not _is_memory(url):
        # Let SQLAlchemy parse the URL instead of stripping a literal prefix: this
        # also handles driver-qualified URLs ("sqlite+pysqlite:///...") and query
        # strings, and yields no path at all for URLs that do not name a file.
        from sqlalchemy.engine import make_url

        database = make_url(url).database
        # "file:" databases use SQLite's own URI syntax and are not plain paths.
        if database and not database.startswith("file:"):
            # Ensure the parent directory for the .db file exists.
            Path(database).parent.mkdir(parents=True, exist_ok=True)

    db = Database(url)
    if _is_memory(url):
        # Alembic uses its own connection; it can't see a private :memory: schema.
        db.create_all()
        return db

    from alembic import command

    command.upgrade(_alembic_config(url), "head")
    return db
|
|
File without changes
|