datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,206 @@
|
|
|
1
|
+
"""Report assembly (03 §4 stage 8, 06 §3.5, 05 §7).
|
|
2
|
+
|
|
3
|
+
Builds the sectioned ``Report`` payload the Results screen and the ``reports``
|
|
4
|
+
table bind to. Pure engine code — no DB/web imports. The sections the engine can
|
|
5
|
+
honestly produce are populated:
|
|
6
|
+
|
|
7
|
+
* ``compliance_score`` / ``distribution`` — from the KS compliance pass.
|
|
8
|
+
* ``correlation`` — Pearson matrix over numeric columns (cheap, honest).
|
|
9
|
+
* ``mutual_information`` — pairwise MI (nats) over discretized columns (05 §7).
|
|
10
|
+
* ``causal_truth`` — the true generating DAG + interventions (P2).
|
|
11
|
+
* ``failures`` — per-mode realized diffs when failures were injected (P3).
|
|
12
|
+
* ``difficulty`` — target band, achieved metric, probe, iterations, knobs (P4).
|
|
13
|
+
* ``determinism`` — spec_hash, seed, per-namespace key digests, checksums.
|
|
14
|
+
|
|
15
|
+
Sections the engine cannot honestly produce for a given run stay ``None``; the
|
|
16
|
+
schema is stable so the UI is coherent from day one.
|
|
17
|
+
"""
|
|
18
|
+
|
|
19
|
+
from __future__ import annotations
|
|
20
|
+
|
|
21
|
+
from dataclasses import asdict, dataclass, field
|
|
22
|
+
from typing import TYPE_CHECKING, Any
|
|
23
|
+
|
|
24
|
+
import numpy as np
|
|
25
|
+
import pandas as pd
|
|
26
|
+
|
|
27
|
+
from .causal.execute import resolve_interventions
|
|
28
|
+
from .dist import ComplianceReport
|
|
29
|
+
from .profile import build_profile
|
|
30
|
+
|
|
31
|
+
if TYPE_CHECKING:
|
|
32
|
+
from .causal.graph import CausalDag
|
|
33
|
+
from .spec.models import CausalGraph, Spec
|
|
34
|
+
|
|
35
|
+
# Columns with more distinct values than this are treated as free-text / id-like
# and excluded from the mutual-information matrix (binning would be meaningless).
_MI_MAX_CARDINALITY = 50
# Upper bound on the number of quantile bins requested when a numeric/datetime
# column is discretized for mutual information.
_MI_NUMERIC_BINS = 10
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
@dataclass
class ReportBundle:
    """Sectioned report payload for one run.

    ``compliance_score`` and ``distribution`` are always required. Every other
    section is optional and defaults to ``None`` (``determinism`` defaults to
    an empty dict), so the set of keys is the same for every run.
    """

    compliance_score: float
    distribution: dict[str, Any]
    correlation: dict[str, Any] | None = None
    mutual_information: dict[str, Any] | None = None
    causal_truth: dict[str, Any] | None = None
    difficulty: dict[str, Any] | None = None
    failures: dict[str, Any] | None = None
    profile: dict[str, Any] | None = None
    determinism: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the bundle as a plain nested dict (``dataclasses.asdict``)."""
        payload: dict[str, Any] = asdict(self)
        return payload
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
def correlation_pearson(frame: pd.DataFrame) -> dict[str, Any] | None:
    """Compute a JSON-serializable Pearson correlation matrix.

    Only numeric columns take part. With fewer than two of them there is no
    meaningful matrix and ``None`` is returned. Cells that pandas reports as
    NaN (for example when a column is constant) are emitted as ``None``.
    """
    numeric_only = frame.select_dtypes(include=[np.number])
    if numeric_only.shape[1] < 2:
        return None

    corr = numeric_only.corr(method="pearson")
    rows = []
    for raw_row in corr.to_numpy():
        # NaN is not valid JSON, so it is mapped to None cell by cell.
        rows.append([float(cell) if not pd.isna(cell) else None for cell in raw_row])
    return {"method": "pearson", "columns": list(corr.columns), "matrix": rows}
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def _discretize(series: pd.Series) -> np.ndarray | None:
    """Map a column to integer codes for MI; ``None`` if it isn't usable.

    * Columns with fewer than two distinct non-null values are skipped.
    * Boolean / object / categorical columns are factorized, unless their
      cardinality exceeds ``_MI_MAX_CARDINALITY`` (free-text / id-like).
    * Numeric and datetime columns with at most ``_MI_NUMERIC_BINS`` distinct
      values are factorized too, so each value is its own bin.
    * Other numeric / datetime columns are quantile-binned into
      ``_MI_NUMERIC_BINS`` bins.

    Missing values receive the code ``-1`` from both ``pd.factorize`` and
    ``.cat.codes``.
    """
    nunique = series.nunique(dropna=True)
    if nunique < 2:
        return None

    is_label_like = (
        pd.api.types.is_bool_dtype(series)
        or pd.api.types.is_object_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )
    if is_label_like:
        if nunique > _MI_MAX_CARDINALITY:
            return None
        return pd.factorize(series, sort=True)[0]

    is_datetime = pd.api.types.is_datetime64_any_dtype(series)
    if not (is_datetime or pd.api.types.is_numeric_dtype(series)):
        return None

    if nunique <= _MI_NUMERIC_BINS:
        # Quantile edges of a skewed low-cardinality column (e.g. a 0/1 label
        # with a minority class) coincide; duplicates="drop" would then merge
        # everything into a single bin and the column would be discarded.
        # Factorizing keeps every distinct value as its own bin.
        return pd.factorize(series, sort=True)[0]

    # Datetimes are binned on their integer representation.
    values = series.astype("int64") if is_datetime else series
    codes = pd.qcut(values, q=_MI_NUMERIC_BINS, duplicates="drop").cat.codes
    return codes.to_numpy() if codes.nunique() >= 2 else None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _mutual_information(a: np.ndarray, b: np.ndarray) -> float:
|
|
98
|
+
"""MI in nats between two integer-code arrays via the joint histogram."""
|
|
99
|
+
contingency = pd.crosstab(a, b).to_numpy(dtype=float)
|
|
100
|
+
n = contingency.sum()
|
|
101
|
+
if n == 0:
|
|
102
|
+
return 0.0
|
|
103
|
+
pxy = contingency / n
|
|
104
|
+
px = pxy.sum(axis=1, keepdims=True)
|
|
105
|
+
py = pxy.sum(axis=0, keepdims=True)
|
|
106
|
+
with np.errstate(divide="ignore", invalid="ignore"):
|
|
107
|
+
terms = pxy * (np.log(pxy) - np.log(px) - np.log(py))
|
|
108
|
+
return float(np.nansum(np.where(pxy > 0, terms, 0.0)))
|
|
109
|
+
|
|
110
|
+
|
|
111
|
+
def mutual_information_matrix(frame: pd.DataFrame) -> dict[str, Any] | None:
    """Pairwise mutual information (nats) over discretizable columns (05 §7).

    Each column is passed through ``_discretize``; columns it rejects are left
    out. ``None`` is returned when fewer than two columns remain. Only the
    upper triangle is computed and mirrored, so the matrix is symmetric; the
    diagonal holds ``I(X;X)``, i.e. the entropy of the discretized column.
    """
    usable: dict[str, np.ndarray] = {}
    for name in frame.columns:
        encoded = _discretize(frame[name])
        if encoded is None:
            continue
        usable[name] = encoded

    names = list(usable)
    count = len(names)
    if count < 2:
        return None

    grid = [[0.0 for _ in range(count)] for _ in range(count)]
    for row, left in enumerate(names):
        for col in range(row, count):
            value = _mutual_information(usable[left], usable[names[col]])
            grid[row][col] = value
            grid[col][row] = value
    return {"method": "histogram", "units": "nats", "columns": names, "matrix": grid}
|
|
133
|
+
|
|
134
|
+
|
|
135
|
+
def causal_truth(
    causal: CausalGraph | None, dag: CausalDag | None
) -> dict[str, Any] | None:
    """Describe the true generating graph for the report.

    Returns ``None`` when the spec has no causal section. Otherwise the result
    lists every edge (``from``/``to``/``fn`` plus whichever of ``weight``,
    ``bias``, ``coeffs``, ``mapping`` are set), the resolved interventions and
    the DAG's topological order (``None`` when no DAG is supplied).

    An edge whose destination is intervened on is flagged ``active: false`` —
    an intervention ``do(X)`` detaches X's incoming edges (05 §3.1).
    """
    if causal is None:
        return None

    interventions = resolve_interventions(causal.interventions)

    def describe(e: Any) -> dict[str, Any]:
        # Key order: endpoints, function, optional parameters, then "active".
        entry: dict[str, Any] = {"from": e.src, "to": e.dst, "fn": e.fn}
        optional = {
            "weight": e.weight,
            "bias": e.bias,
            "coeffs": e.coeffs,
            "mapping": e.mapping,
        }
        entry.update({k: v for k, v in optional.items() if v is not None})
        entry["active"] = e.dst not in interventions
        return entry

    edges: list[dict[str, Any]] = [describe(e) for e in causal.edges]
    has_dag = dag is not None
    return {
        "nodes": dag.topological_order() if has_dag else None,
        "edges": edges,
        "interventions": interventions,
        "topological_order": dag.topological_order() if has_dag else None,
    }
|
|
165
|
+
|
|
166
|
+
|
|
167
|
+
def failures_section(diffs: list[dict[str, Any]] | None) -> dict[str, Any] | None:
    """Build the report's ``failures`` section from per-mode diffs.

    An absent or empty ``diffs`` yields ``None``, which lets the UI tell
    "no corruption" apart from "corruption with empty effect". Otherwise the
    diffs are returned under ``modes`` together with their ``count``.
    """
    if diffs:
        return {"count": len(diffs), "modes": diffs}
    return None
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
def build_report(
    *,
    compliance: ComplianceReport,
    frame: pd.DataFrame,
    determinism: dict[str, Any],
    spec: Spec | None = None,
    causal: CausalGraph | None = None,
    causal_dag: CausalDag | None = None,
    failures: list[dict[str, Any]] | None = None,
    injected: pd.DataFrame | None = None,
    difficulty: dict[str, Any] | None = None,
) -> ReportBundle:
    """Assemble the report bundle from the realized frame and compliance pass.

    The ``profile`` section is only built when a ``spec`` is supplied; the
    correlation and mutual-information sections are computed from ``frame``;
    ``difficulty`` and ``determinism`` are passed through unchanged.
    """
    profile = None
    if spec is not None:
        profile = build_profile(spec, frame, injected=injected, failure_diffs=failures)

    sections: dict[str, Any] = {
        "compliance_score": compliance.score,
        "distribution": compliance.to_dict(),
        "correlation": correlation_pearson(frame),
        "mutual_information": mutual_information_matrix(frame),
        "causal_truth": causal_truth(causal, causal_dag),
        "failures": failures_section(failures),
        "profile": profile,
        "difficulty": difficulty,
        "determinism": determinism,
    }
    return ReportBundle(**sections)
|
datadoom/engine/rng.py
ADDED
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
"""Seeded RNG factory — the determinism invariant (05 §1.2).
|
|
2
|
+
|
|
3
|
+
All randomness in the engine MUST flow through an :class:`RNGFactory`. No stdlib
|
|
4
|
+
``random``, ``uuid4``, ``time``, or ``np.random.*`` *global* calls are allowed in
|
|
5
|
+
the data path. Each logical stream gets its own independent generator keyed by
|
|
6
|
+
``namespace`` so that adding a feature never perturbs another's draws.
|
|
7
|
+
|
|
8
|
+
key(ns) = SHA256(spec_hash || ":" || seed || ":" || ns)[:8] -> uint64
|
|
9
|
+
RNG(ns) = numpy.random.Generator(PCG64(key(ns)))
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import hashlib
|
|
15
|
+
|
|
16
|
+
import numpy as np
|
|
17
|
+
|
|
18
|
+
# Stable namespace prefixes (documented in 05 §1.2). Helpers below build the
# full namespace string so call sites stay consistent.
# Most helpers append ":<name>" (or ":<index>") to the prefix; the shuffle
# stream uses its prefix as the whole namespace.
NS_FEATURE = "feature"
NS_NOISE = "noise"
NS_FAILURE = "failure"
NS_PROBE = "probe"
NS_DIFFICULTY = "difficulty"
NS_SHUFFLE = "shuffle"
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def _derive_key(spec_hash: str, seed: int, namespace: str) -> int:
|
|
29
|
+
"""Derive a uint64 PCG64 key from (spec_hash, seed, namespace)."""
|
|
30
|
+
payload = f"{spec_hash}:{seed}:{namespace}".encode()
|
|
31
|
+
digest = hashlib.sha256(payload).digest()
|
|
32
|
+
# First 8 bytes, big-endian, as an unsigned 64-bit integer.
|
|
33
|
+
return int.from_bytes(digest[:8], byteorder="big", signed=False)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
class RNGFactory:
    """Factory of deterministic, per-namespace random generators.

    Every generator is seeded from a key derived from
    ``(spec_hash, seed, namespace)``, so two factories built with the same
    ``(spec_hash, seed)`` reproduce the same draws for a given namespace,
    while each namespace gets its own separately keyed stream.
    """

    def __init__(self, spec_hash: str, seed: int) -> None:
        self.spec_hash = spec_hash
        self.seed = int(seed)

    def key(self, namespace: str) -> int:
        """Return the raw uint64 key for a namespace (used in determinism reports)."""
        return _derive_key(self.spec_hash, self.seed, namespace)

    def generator(self, namespace: str) -> np.random.Generator:
        """Return a ``numpy.random.Generator`` (PCG64) keyed by ``namespace``."""
        bit_generator = np.random.PCG64(self.key(namespace))
        return np.random.Generator(bit_generator)

    def _stream(self, prefix: str, name: object) -> np.random.Generator:
        """Generator for the namespace ``"<prefix>:<name>"``."""
        return self.generator(f"{prefix}:{name}")

    # Convenience namespace builders -------------------------------------------------

    def feature(self, name: str) -> np.random.Generator:
        """Stream under the ``feature`` prefix for the given name."""
        return self._stream(NS_FEATURE, name)

    def noise(self, name: str) -> np.random.Generator:
        """Stream under the ``noise`` prefix for the given name."""
        return self._stream(NS_NOISE, name)

    def failure(self, index: int) -> np.random.Generator:
        """Stream under the ``failure`` prefix, keyed by an integer index."""
        return self._stream(NS_FAILURE, index)

    def difficulty(self, name: str) -> np.random.Generator:
        """Stream for difficulty-calibration perturbations (feature/label noise)."""
        return self._stream(NS_DIFFICULTY, name)

    def probe(self, name: str) -> np.random.Generator:
        """Stream for difficulty probe-model seeds (train/test split, estimator)."""
        return self._stream(NS_PROBE, name)

    def shuffle(self) -> np.random.Generator:
        """Stream for the bare ``shuffle`` namespace (no name suffix)."""
        return self.generator(NS_SHUFFLE)

    def key_digests(self, namespaces: list[str]) -> dict[str, str]:
        """Hex key digests for the determinism report section (06 §3.5).

        Each key is rendered as 16 zero-padded lowercase hex digits.
        """
        return {ns: f"{self.key(ns):016x}" for ns in namespaces}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Spec parsing, validation, canonicalization and hashing."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from typing import Any
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from ..errors import SpecValidationError
|
|
10
|
+
from .hashing import canonical_json, spec_hash
|
|
11
|
+
from .models import Spec
|
|
12
|
+
from .validate import validate_spec
|
|
13
|
+
|
|
14
|
+
# Names re-exported as the public API of the spec package.
__all__ = [
    "Spec",
    "validate_spec",
    "canonical_json",
    "spec_hash",
    "load_spec",
    "parse_spec",
]
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def parse_spec(data: dict[str, Any]) -> Spec:
    """Turn a raw dict into a validated :class:`Spec`.

    Shape validation is done by the pydantic model; cross-field validation by
    ``validate_spec``. When the model rejects the input, only the first error
    is reported, as a ``SpecValidationError`` whose ``locator`` is the dotted
    path to the offending field.
    """
    from pydantic import ValidationError

    try:
        parsed = Spec.model_validate(data)
    except ValidationError as err:
        # Surface the first error with a dotted locator the UI/CLI can use.
        head = err.errors()[0]
        dotted = ".".join(map(str, head["loc"]))
        raise SpecValidationError(head["msg"], locator=dotted) from err

    validate_spec(parsed)
    return parsed
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def load_spec(path: str) -> Spec:
    """Read a YAML (or JSON) spec file and return the validated spec.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``. Anything
    other than a top-level mapping (including an empty file) raises
    ``SpecValidationError``.
    """
    with open(path, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    if isinstance(document, dict):
        return parse_spec(document)
    raise SpecValidationError("spec file must be a mapping at the top level")
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"""Canonical serialization & spec hashing (05 §1.1).
|
|
2
|
+
|
|
3
|
+
The spec hash is the identity of a dataset's *design*. It must be stable across
|
|
4
|
+
machines and runs, so we define a strict canonical JSON form:
|
|
5
|
+
|
|
6
|
+
- object keys sorted lexicographically,
|
|
7
|
+
- no insignificant whitespace,
|
|
8
|
+
- numbers normalized (integral floats collapse to ints; shortest float repr),
|
|
9
|
+
- arrays preserve author order (order is semantic),
|
|
10
|
+
- the ``seed`` field is excluded (seed is not part of the design identity).
|
|
11
|
+
|
|
12
|
+
spec_hash = SHA256(canonical_json(spec_without_seed)) # hex
|
|
13
|
+
"""
|
|
14
|
+
|
|
15
|
+
from __future__ import annotations
|
|
16
|
+
|
|
17
|
+
import hashlib
|
|
18
|
+
import json
|
|
19
|
+
from typing import Any
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def _normalize(value: Any) -> Any:
|
|
23
|
+
"""Recursively normalize numbers and drop nothing else.
|
|
24
|
+
|
|
25
|
+
Integral floats (``1.0``) become ints (``1``) so that a value authored as an
|
|
26
|
+
int and one authored as a float-but-whole hash identically.
|
|
27
|
+
"""
|
|
28
|
+
if isinstance(value, bool):
|
|
29
|
+
# bool is a subclass of int — keep it as-is (True/False -> JSON booleans).
|
|
30
|
+
return value
|
|
31
|
+
if isinstance(value, float):
|
|
32
|
+
if value.is_integer():
|
|
33
|
+
return int(value)
|
|
34
|
+
return value
|
|
35
|
+
if isinstance(value, dict):
|
|
36
|
+
return {k: _normalize(v) for k, v in value.items()}
|
|
37
|
+
if isinstance(value, (list, tuple)):
|
|
38
|
+
return [_normalize(v) for v in value]
|
|
39
|
+
return value
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
def canonical_json(spec_body: dict[str, Any]) -> str:
    """Return the canonical JSON string for a spec dict (``seed`` removed).

    The top-level ``seed`` key is dropped, numbers are normalized, keys are
    sorted, separators carry no whitespace, non-ASCII text is written as-is,
    and NaN/Infinity are rejected by ``json.dumps``.
    """
    seedless = {key: item for key, item in spec_body.items() if key != "seed"}
    return json.dumps(
        _normalize(seedless),
        allow_nan=False,
        ensure_ascii=False,
        separators=(",", ":"),
        sort_keys=True,
    )
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def spec_hash(spec_body: dict[str, Any]) -> str:
    """Return the hex SHA-256 digest of the canonical (seed-excluded) spec."""
    canonical_bytes = canonical_json(spec_body).encode("utf-8")
    digest = hashlib.sha256(canonical_bytes)
    return digest.hexdigest()
|
|
@@ -0,0 +1,238 @@
|
|
|
1
|
+
"""Pydantic v2 spec models — the parsed, validated form of doc 04.
|
|
2
|
+
|
|
3
|
+
These are pure (no DB/framework imports). Shape/type validation happens here;
|
|
4
|
+
cross-entity semantic checks (acyclicity, references) live in ``validate.py``.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from typing import Annotated, Any, Literal
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, ConfigDict, Field, field_validator
|
|
12
|
+
|
|
13
|
+
from .hashing import canonical_json, spec_hash
|
|
14
|
+
|
|
15
|
+
# Identifier-style feature names: a letter or underscore, then letters, digits
# or underscores.
FEATURE_NAME_PATTERN = r"^[A-Za-z_][A-Za-z0-9_]*$"
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class _Model(BaseModel):
    """Base class for spec models; keys not declared on the model are rejected."""

    model_config = ConfigDict(extra="forbid")
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
# --- Feature definitions (04 §4) -----------------------------------------------------
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
class _FeatureBase(_Model):
    """Fields shared by every feature type.

    ``emit=False`` marks a **latent** feature: it is sampled / computed and may
    drive the causal SEM (and appears in the true causal graph), but it is **not
    shipped** — excluded from the output CSV, the difficulty probe, compliance,
    and correlation/MI. It models hidden variables (unobserved confounders, a
    latent score that generates an observed label). The default is ``None`` ==
    emitted; only an explicit ``False`` is canonicalized, so adding the field
    never changes the hash of an existing spec (invariant #5/#6).
    """

    # Optional free-text description of the feature.
    description: str | None = None
    # None (the default) and True both mean "emitted"; only False marks a latent.
    emit: bool | None = None
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class NumericFeature(_FeatureBase):
    """Numeric column: an optional named distribution with float parameters,
    optional ``min`` / ``max`` bounds and an ``int`` / ``float`` output dtype.

    NOTE(review): how the bounds are applied is decided outside this model.
    """

    type: Literal["numeric"]
    dist: str | None = None  # None => derived via causal
    params: dict[str, float] = Field(default_factory=dict)
    min: float | None = None
    max: float | None = None
    dtype: Literal["int", "float"] = "float"
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class CategoricalFeature(_FeatureBase):
    """Categorical column: a non-empty list of categories with optional weights.

    Only the sign of the weights is checked here; no comparison against
    ``categories`` is made in this model.
    """

    type: Literal["categorical"]
    categories: list[str] = Field(min_length=1)
    weights: list[float] | None = None

    @field_validator("weights")
    @classmethod
    def _weights_nonneg(cls, v: list[float] | None) -> list[float] | None:
        """Reject any negative weight; ``None`` (no weights) is accepted as-is."""
        if v is None:
            return v
        for weight in v:
            if weight < 0:
                raise ValueError("categorical weights must be non-negative")
        return v
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class BooleanFeature(_FeatureBase):
    """Boolean column with a ``rate`` constrained to ``[0, 1]`` (default 0.5)."""

    type: Literal["boolean"]
    rate: float = 0.5

    @field_validator("rate")
    @classmethod
    def _rate_in_unit(cls, v: float) -> float:
        """Accept ``v`` only when it lies in the closed unit interval."""
        # NaN fails the chained comparison and is therefore rejected as well.
        if 0.0 <= v <= 1.0:
            return v
        raise ValueError("boolean rate must be in [0, 1]")
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
class DatetimeFeature(_FeatureBase):
    """Datetime column between ``start`` and ``end`` at a fixed granularity.

    NOTE(review): ``start`` / ``end`` are plain strings here; their accepted
    format is enforced elsewhere — confirm against the sampler/validator.
    """

    type: Literal["datetime"]
    start: str
    end: str
    granularity: Literal["second", "minute", "hour", "day"] = "day"
    dist: str = "uniform"
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
class TextFeature(_FeatureBase):
    """Text column produced by a named generator for a given locale."""

    type: Literal["text"]
    # "lorem" emits filler words; any realistic provider key (name, email,
    # address, company, …) is served by mimesis. See dist/providers.py.
    generator: str = "lorem"
    locale: str = "en"
    # Defaults to {"min": 5, "max": 30}; the unit of these bounds is not
    # defined in this model.
    length: dict[str, int] = Field(default_factory=lambda: {"min": 5, "max": 30})
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
class TrendSpec(_Model):
    """Linear trend ``slope·t + intercept`` over the row index."""

    # Both coefficients default to 0.0, i.e. no trend contribution.
    slope: float = 0.0
    intercept: float = 0.0
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
class SeasonalitySpec(_Model):
    """One sinusoidal season ``amplitude·sin(2π·t/period + phase)``."""

    amplitude: float
    # Must be strictly positive (it is the divisor in the formula above).
    period: float = Field(gt=0.0)
    phase: float = 0.0
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
class TimeseriesFeature(_FeatureBase):
    """An ordered series ``Xₜ = T(t) + S(t) + AR(p) + εₜ`` (05 §6).

    Generated over the row index (row order is preserved end-to-end). It is a
    *root* feature — it may be a causal **parent** but is never a causal target,
    and it is not assessed by distribution compliance (KS/GoF don't apply to a
    trended, autocorrelated series).
    """

    type: Literal["timeseries"]
    # Optional linear trend; None means no trend is configured.
    trend: TrendSpec | None = None
    # Zero or more sinusoidal seasons (empty by default).
    seasonality: list[SeasonalitySpec] = Field(default_factory=list)
    ar: list[float] = Field(default_factory=list)  # AR coefficients φ₁…φ_p
    noise_std: float = Field(default=1.0, ge=0.0)  # σ of εₜ ~ N(0, σ²)
    min: float | None = None
    max: float | None = None
    dtype: Literal["int", "float"] = "float"
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Discriminated union of all feature models; the ``type`` field selects which
# model a given feature document is validated against.
Feature = Annotated[
    NumericFeature
    | CategoricalFeature
    | BooleanFeature
    | DatetimeFeature
    | TextFeature
    | TimeseriesFeature,
    Field(discriminator="type"),
]
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
# --- Causal graph (04 §5) ------------------------------------------------------------
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
class CausalEdge(_Model):
    """One directed causal edge ``src -> dst`` computed by the function ``fn``.

    The endpoints are aliased to ``from`` / ``to``; ``populate_by_name`` also
    lets them be supplied as ``src`` / ``dst``. Unknown keys are rejected.
    """

    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    src: str = Field(alias="from")
    dst: str = Field(alias="to")
    fn: str  # linear | logistic | polynomial | map | identity | <plugin>
    # Function parameters; all optional. Which ones a given ``fn`` requires is
    # not checked in this model.
    weight: float | None = None
    bias: float | None = None
    coeffs: list[float] | None = None
    mapping: dict[str, float] | None = None
|
|
150
|
+
|
|
151
|
+
|
|
152
|
+
class CausalGraph(_Model):
    """Causal section of a spec: edges, per-key noise settings, interventions.

    ``noise`` and ``interventions`` are loosely typed dicts here; their inner
    schema is not validated by this model.
    """

    edges: list[CausalEdge] = Field(default_factory=list)
    noise: dict[str, dict[str, Any]] = Field(default_factory=dict)
    interventions: list[dict[str, Any]] = Field(default_factory=list)
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
# --- Difficulty / failures / export (04 §6-8) ----------------------------------------
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
class Difficulty(_Model):
    """Difficulty-calibration settings: target, label, probe, budget and knobs."""

    # Either a string or a dict; the accepted values are not constrained here.
    target: str | dict[str, Any]
    label: str
    probe: str = "logreg"
    # Iteration budget; must be at least 1.
    max_iters: int = Field(default=8, ge=1)
    # Lean-default knobs: feature-observation noise (primary) + label flips
    # (deep end). `causal` shrink / `imbalance` are planned (status.md backlog).
    knobs: list[str] = Field(default_factory=lambda: ["noise", "label_noise"])
|
|
169
|
+
|
|
170
|
+
|
|
171
|
+
class Failure(BaseModel):
    """One failure-injection entry, identified by ``type``.

    Unlike the other spec models this one allows extra keys, so type-specific
    fields pass through unvalidated at this layer.
    """

    # type-specific fields are validated by the FailureMode handler (P3).
    model_config = ConfigDict(extra="allow")

    type: str
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# The dataset versions an export may request.
ExportVersion = Literal["clean", "injected"]
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
def _default_versions() -> list[ExportVersion]:
    """Return a fresh default ``versions`` list containing only ``"clean"``."""
    return ["clean"]
|
|
183
|
+
|
|
184
|
+
|
|
185
|
+
class ExportSpec(_Model):
    """Export settings: formats, dataset versions, optional splits and flags."""

    # Defaults to CSV only; format names are not constrained in this model.
    formats: list[str] = Field(default_factory=lambda: ["csv"])
    # Defaults to ["clean"].
    versions: list[ExportVersion] = Field(default_factory=_default_versions)
    # Optional mapping of split name to a float; None means no splits.
    splits: dict[str, float] | None = None
    shuffle: bool = True
    metadata: bool = True
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
# --- Top-level spec (04 §2) ----------------------------------------------------------
|
|
194
|
+
|
|
195
|
+
|
|
196
|
+
class Spec(_Model):
    """Top-level dataset spec (04 §2).

    Holds the declared features plus the optional causal, difficulty, failure
    and export sections, and exposes the canonical form and hash that identify
    the design. ``seed`` is carried on the model but excluded from the hash.
    """

    datadoom_version: str
    name: str
    description: str | None = None
    seed: int | None = None
    rows: int = Field(ge=1)
    features: dict[str, Feature]
    causal: CausalGraph | None = None
    difficulty: Difficulty | None = None
    failures: list[Failure] = Field(default_factory=list)
    export: ExportSpec = Field(default_factory=ExportSpec)
    meta: dict[str, Any] = Field(default_factory=dict)

    @field_validator("name")
    @classmethod
    def _name_slug(cls, v: str) -> str:
        """Require ``name`` to consist solely of ``[A-Za-z0-9_-]`` characters.

        Raises:
            ValueError: if ``name`` is empty or contains any other character.
        """
        import re

        # fullmatch rather than match with a "$" anchor: "$" also matches just
        # before a trailing newline, which would accept a name ending in "\n".
        if not re.fullmatch(r"[A-Za-z0-9_-]+", v):
            raise ValueError("name must be slug-friendly ([A-Za-z0-9_-]+)")
        return v

    # --- Canonical form & identity (05 §1) -------------------------------------------

    def body(self) -> dict[str, Any]:
        """Serializable spec document (aliases applied, None fields dropped)."""
        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

    def canonical(self) -> str:
        """Canonical JSON string (seed excluded) used for hashing."""
        return canonical_json(self.body())

    def spec_hash(self) -> str:
        """sha256 of the canonical, seed-excluded spec."""
        return spec_hash(self.body())

    def latent_names(self) -> set[str]:
        """Feature names marked ``emit: false`` (computed but not shipped)."""
        return {name for name, feat in self.features.items() if feat.emit is False}

    def emitted_names(self) -> list[str]:
        """Feature names that are shipped, in spec order."""
        return [name for name, feat in self.features.items() if feat.emit is not False]
|