datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,206 @@
1
+ """Report assembly (03 §4 stage 8, 06 §3.5, 05 §7).
2
+
3
+ Builds the sectioned ``Report`` payload the Results screen and the ``reports``
4
+ table bind to. Pure engine code — no DB/web imports. The sections the engine can
5
+ honestly produce are populated:
6
+
7
+ * ``compliance_score`` / ``distribution`` — from the KS compliance pass.
8
+ * ``correlation`` — Pearson matrix over numeric columns (cheap, honest).
9
+ * ``mutual_information`` — pairwise MI (nats) over discretized columns (05 §7).
10
+ * ``causal_truth`` — the true generating DAG + interventions (P2).
11
+ * ``failures`` — per-mode realized diffs when failures were injected (P3).
12
+ * ``difficulty`` — target band, achieved metric, probe, iterations, knobs (P4).
13
+ * ``determinism`` — spec_hash, seed, per-namespace key digests, checksums.
14
+
15
+ Sections the engine cannot honestly produce for a given run stay ``None``; the
16
+ schema is stable so the UI is coherent from day one.
17
+ """
18
+
19
+ from __future__ import annotations
20
+
21
+ from dataclasses import asdict, dataclass, field
22
+ from typing import TYPE_CHECKING, Any
23
+
24
+ import numpy as np
25
+ import pandas as pd
26
+
27
+ from .causal.execute import resolve_interventions
28
+ from .dist import ComplianceReport
29
+ from .profile import build_profile
30
+
31
+ if TYPE_CHECKING:
32
+ from .causal.graph import CausalDag
33
+ from .spec.models import CausalGraph, Spec
34
+
35
+ # Columns with more distinct values than this are treated as free-text / id-like
36
+ # and excluded from the mutual-information matrix (binning would be meaningless).
37
+ _MI_MAX_CARDINALITY = 50
38
+ _MI_NUMERIC_BINS = 10
39
+
40
+
41
@dataclass
class ReportBundle:
    """Sectioned report payload assembled by ``build_report``.

    ``compliance_score`` and ``distribution`` are required. Every other
    section defaults to ``None`` (``determinism`` defaults to an empty dict),
    so the set of keys is the same for every run.
    """

    # Required sections.
    compliance_score: float
    distribution: dict[str, Any]
    # Optional sections; ``None`` means the section was not produced.
    correlation: dict[str, Any] | None = None
    mutual_information: dict[str, Any] | None = None
    causal_truth: dict[str, Any] | None = None
    difficulty: dict[str, Any] | None = None
    failures: dict[str, Any] | None = None
    profile: dict[str, Any] | None = None
    # default_factory gives each instance its own dict.
    determinism: dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> dict[str, Any]:
        """Return the bundle as a plain dict via ``dataclasses.asdict``."""
        return asdict(self)
55
+
56
+
57
def correlation_pearson(frame: pd.DataFrame) -> dict[str, Any] | None:
    """Build a JSON-friendly Pearson correlation matrix for numeric columns.

    Only columns selected by ``select_dtypes(include=[np.number])`` take part.
    With fewer than two such columns there is nothing meaningful to report and
    ``None`` is returned. Undefined coefficients (NaN, e.g. for a constant
    column) are emitted as ``None`` so the payload serializes cleanly.
    """
    numeric_cols = frame.select_dtypes(include=[np.number])
    if numeric_cols.shape[1] < 2:
        return None

    pearson = numeric_cols.corr(method="pearson")
    rows: list[list[float | None]] = []
    for raw_row in pearson.to_numpy():
        rows.append([float(cell) if not pd.isna(cell) else None for cell in raw_row])

    return {"method": "pearson", "columns": list(pearson.columns), "matrix": rows}
69
+
70
+
71
def _discretize(series: pd.Series) -> np.ndarray | None:
    """Map a column to integer codes for MI; ``None`` if it isn't usable.

    * Columns with fewer than two distinct non-null values are skipped.
    * Label-like columns (boolean, ``object``, dedicated string dtypes and
      categoricals) are factorized, unless their cardinality exceeds
      ``_MI_MAX_CARDINALITY`` (free-text / id-like), in which case they are
      skipped.
    * Datetime columns are viewed as ``int64`` and, like numeric columns,
      quantile-binned into at most ``_MI_NUMERIC_BINS`` bins.
    * Anything else is skipped.

    Missing values receive the code ``-1`` from both ``pd.factorize`` and
    ``Categorical.codes``; downstream they act as their own category.
    """
    distinct = series.nunique(dropna=True)
    if distinct < 2:
        return None

    types = pd.api.types
    # ``is_string_dtype`` covers dedicated string dtypes, which are not
    # ``object`` and would otherwise fall through every branch and be dropped.
    label_like = (
        types.is_bool_dtype(series)
        or types.is_object_dtype(series)
        or types.is_string_dtype(series)
        or isinstance(series.dtype, pd.CategoricalDtype)
    )
    if label_like:
        if distinct > _MI_MAX_CARDINALITY:
            return None
        return pd.factorize(series, sort=True)[0]

    values = series
    if types.is_datetime64_any_dtype(series):
        values = series.astype("int64")
    if types.is_numeric_dtype(values):
        bins = min(_MI_NUMERIC_BINS, int(distinct))
        # duplicates="drop" may merge bins; re-check that >= 2 codes survive.
        codes = pd.qcut(values, q=bins, duplicates="drop").cat.codes
        return codes.to_numpy() if codes.nunique() >= 2 else None
    return None
95
+
96
+
97
def _mutual_information(a: np.ndarray, b: np.ndarray) -> float:
    """MI in nats between two integer-code arrays via the joint histogram.

    The joint distribution comes from ``pd.crosstab(a, b)``; every distinct
    code (including ``-1``) is its own cell. Returns ``0.0`` for an empty
    table. The result is clamped at zero: mutual information is non-negative,
    but the floating-point sum can come out as ``-0.0`` or a tiny negative
    number for (near-)independent inputs.
    """
    joint_counts = pd.crosstab(a, b).to_numpy(dtype=float)
    total = joint_counts.sum()
    if total == 0:
        return 0.0

    pxy = joint_counts / total
    px = pxy.sum(axis=1, keepdims=True)
    py = pxy.sum(axis=0, keepdims=True)
    # log(0) on empty cells is expected; those cells are masked out below.
    with np.errstate(divide="ignore", invalid="ignore"):
        terms = pxy * (np.log(pxy) - np.log(px) - np.log(py))
    mi = float(np.nansum(np.where(pxy > 0, terms, 0.0)))
    # A conditional (not max()) so that -0.0 is normalized to 0.0 as well.
    return mi if mi > 0.0 else 0.0
109
+
110
+
111
def mutual_information_matrix(frame: pd.DataFrame) -> dict[str, Any] | None:
    """Pairwise mutual information (nats) over discretizable columns (05 §7).

    Columns that ``_discretize`` rejects are left out. If fewer than two
    columns remain the result is ``None``. Only the upper triangle is computed
    and mirrored, so the matrix is symmetric; the diagonal holds each column's
    entropy ``H(X) = I(X;X)``.
    """
    usable: dict[str, np.ndarray] = {}
    for name in frame.columns:
        encoded = _discretize(frame[name])
        if encoded is None:
            continue
        usable[name] = encoded

    names = list(usable)
    n_cols = len(names)
    if n_cols < 2:
        return None

    grid = [[0.0] * n_cols for _ in range(n_cols)]
    for row, left in enumerate(names):
        for col in range(row, n_cols):
            value = _mutual_information(usable[left], usable[names[col]])
            grid[row][col] = value
            grid[col][row] = value

    return {"method": "histogram", "units": "nats", "columns": names, "matrix": grid}
133
+
134
+
135
def causal_truth(
    causal: CausalGraph | None, dag: CausalDag | None
) -> dict[str, Any] | None:
    """The true generating graph: edges (with params), interventions, topo order.

    Returns ``None`` when the spec has no causal section. Each edge carries
    ``from``/``to``/``fn``, whichever of ``weight``/``bias``/``coeffs``/
    ``mapping`` are set, and an ``active`` flag. Edges whose destination is
    intervened are reported ``active: false`` — an intervention ``do(X)``
    detaches X's incoming edges (05 §3.1).

    ``nodes`` and ``topological_order`` carry the same ordering (or ``None``
    when no DAG is supplied).
    """
    if causal is None:
        return None

    interventions = resolve_interventions(causal.interventions)
    optional_params = ("weight", "bias", "coeffs", "mapping")

    edges: list[dict[str, Any]] = []
    for e in causal.edges:
        edge: dict[str, Any] = {"from": e.src, "to": e.dst, "fn": e.fn}
        for key in optional_params:
            val = getattr(e, key)
            if val is not None:
                edge[key] = val
        edge["active"] = e.dst not in interventions
        edges.append(edge)

    # Compute the ordering once; both keys report the same value.
    order = dag.topological_order() if dag is not None else None
    return {
        "nodes": order,
        "edges": edges,
        "interventions": interventions,
        "topological_order": order,
    }
165
+
166
+
167
def failures_section(diffs: list[dict[str, Any]] | None) -> dict[str, Any] | None:
    """Package per-mode failure diffs as the report's ``failures`` section.

    A missing or empty ``diffs`` yields ``None`` (no failures were injected);
    otherwise the section holds the number of diffs and the diffs themselves.
    """
    if diffs:
        return {"count": len(diffs), "modes": diffs}
    return None
176
+
177
+
178
def build_report(
    *,
    compliance: ComplianceReport,
    frame: pd.DataFrame,
    determinism: dict[str, Any],
    spec: Spec | None = None,
    causal: CausalGraph | None = None,
    causal_dag: CausalDag | None = None,
    failures: list[dict[str, Any]] | None = None,
    injected: pd.DataFrame | None = None,
    difficulty: dict[str, Any] | None = None,
) -> ReportBundle:
    """Assemble the report bundle from the realized frame and compliance pass.

    The ``profile`` section is built only when a ``spec`` is supplied; the
    remaining sections are derived from ``frame``, ``compliance``, the causal
    inputs and the failure diffs, or passed through unchanged
    (``difficulty``, ``determinism``).
    """
    profile = None
    if spec is not None:
        profile = build_profile(spec, frame, injected=injected, failure_diffs=failures)

    sections: dict[str, Any] = {
        "compliance_score": compliance.score,
        "distribution": compliance.to_dict(),
        "correlation": correlation_pearson(frame),
        "mutual_information": mutual_information_matrix(frame),
        "causal_truth": causal_truth(causal, causal_dag),
        "failures": failures_section(failures),
        "profile": profile,
        "difficulty": difficulty,
        "determinism": determinism,
    }
    return ReportBundle(**sections)
datadoom/engine/rng.py ADDED
@@ -0,0 +1,79 @@
1
+ """Seeded RNG factory — the determinism invariant (05 §1.2).
2
+
3
+ All randomness in the engine MUST flow through an :class:`RNGFactory`. No stdlib
4
+ ``random``, ``uuid4``, ``time``, or ``np.random.*`` *global* calls are allowed in
5
+ the data path. Each logical stream gets its own independent generator keyed by
6
+ ``namespace`` so that adding a feature never perturbs another's draws.
7
+
8
+ key(ns) = SHA256(spec_hash || ":" || seed || ":" || ns)[:8] -> uint64
9
+ RNG(ns) = numpy.random.Generator(PCG64(key(ns)))
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import hashlib
15
+
16
+ import numpy as np
17
+
18
+ # Stable namespace prefixes (documented in 05 §1.2). Helpers below build the
19
+ # full namespace string so call sites stay consistent.
20
+ NS_FEATURE = "feature"
21
+ NS_NOISE = "noise"
22
+ NS_FAILURE = "failure"
23
+ NS_PROBE = "probe"
24
+ NS_DIFFICULTY = "difficulty"
25
+ NS_SHUFFLE = "shuffle"
26
+
27
+
28
def _derive_key(spec_hash: str, seed: int, namespace: str) -> int:
    """Hash ``spec_hash:seed:namespace`` into an unsigned 64-bit key.

    The key is the first eight bytes of the SHA-256 digest, read big-endian.
    """
    material = f"{spec_hash}:{seed}:{namespace}"
    leading_bytes = hashlib.sha256(material.encode()).digest()[:8]
    return int.from_bytes(leading_bytes, "big", signed=False)
34
+
35
+
36
class RNGFactory:
    """Hands out one deterministic generator per namespace.

    Each namespace is hashed together with ``(spec_hash, seed)`` into its own
    key, so two factories built from the same pair reproduce the same draws
    for a given namespace, while different namespaces are seeded from
    different keys.
    """

    def __init__(self, spec_hash: str, seed: int) -> None:
        self.spec_hash = spec_hash
        self.seed = int(seed)

    def key(self, namespace: str) -> int:
        """Raw uint64 key for ``namespace`` (also shown in determinism reports)."""
        return _derive_key(self.spec_hash, self.seed, namespace)

    def generator(self, namespace: str) -> np.random.Generator:
        """Fresh ``numpy.random.Generator`` backed by PCG64 seeded with the key."""
        bit_generator = np.random.PCG64(self.key(namespace))
        return np.random.Generator(bit_generator)

    def _scoped(self, prefix: str, suffix: object) -> np.random.Generator:
        """Generator for the ``prefix:suffix`` namespace."""
        return self.generator(f"{prefix}:{suffix}")

    # Convenience namespace builders -------------------------------------------------

    def feature(self, name: str) -> np.random.Generator:
        """Stream under the ``feature`` prefix for ``name``."""
        return self._scoped(NS_FEATURE, name)

    def noise(self, name: str) -> np.random.Generator:
        """Stream under the ``noise`` prefix for ``name``."""
        return self._scoped(NS_NOISE, name)

    def failure(self, index: int) -> np.random.Generator:
        """Stream under the ``failure`` prefix for the given index."""
        return self._scoped(NS_FAILURE, index)

    def difficulty(self, name: str) -> np.random.Generator:
        """Stream for difficulty-calibration perturbations (feature/label noise)."""
        return self._scoped(NS_DIFFICULTY, name)

    def probe(self, name: str) -> np.random.Generator:
        """Stream for difficulty probe-model seeds (train/test split, estimator)."""
        return self._scoped(NS_PROBE, name)

    def shuffle(self) -> np.random.Generator:
        """Stream for the bare ``shuffle`` namespace."""
        return self.generator(NS_SHUFFLE)

    def key_digests(self, namespaces: list[str]) -> dict[str, str]:
        """Zero-padded 16-digit hex keys per namespace, for the determinism report (06 §3.5)."""
        digests: dict[str, str] = {}
        for ns in namespaces:
            digests[ns] = f"{self.key(ns):016x}"
        return digests
@@ -0,0 +1,45 @@
1
+ """Spec parsing, validation, canonicalization and hashing."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+ import yaml
8
+
9
+ from ..errors import SpecValidationError
10
+ from .hashing import canonical_json, spec_hash
11
+ from .models import Spec
12
+ from .validate import validate_spec
13
+
14
+ __all__ = [
15
+ "Spec",
16
+ "validate_spec",
17
+ "canonical_json",
18
+ "spec_hash",
19
+ "load_spec",
20
+ "parse_spec",
21
+ ]
22
+
23
+
24
def parse_spec(data: dict[str, Any]) -> Spec:
    """Turn a raw dict into a validated :class:`Spec`.

    Shape/type validation is done by the pydantic model; the cross-field
    checks then run through ``validate_spec``. A pydantic failure is re-raised
    as ``SpecValidationError`` carrying the first error's message and a dotted
    locator built from its ``loc`` path.
    """
    from pydantic import ValidationError

    try:
        parsed = Spec.model_validate(data)
    except ValidationError as exc:
        # Only the first error is surfaced, with a locator the UI/CLI can use.
        head = exc.errors()[0]
        dotted = ".".join(map(str, head["loc"]))
        raise SpecValidationError(head["msg"], locator=dotted) from exc

    validate_spec(parsed)
    return parsed
37
+
38
+
39
def load_spec(path: str) -> Spec:
    """Read a YAML (or JSON) spec file and return the validated spec.

    The file is read as UTF-8 and parsed with ``yaml.safe_load``. Anything
    other than a top-level mapping is rejected with ``SpecValidationError``.
    """
    with open(path, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    if isinstance(document, dict):
        return parse_spec(document)
    raise SpecValidationError("spec file must be a mapping at the top level")
@@ -0,0 +1,57 @@
1
+ """Canonical serialization & spec hashing (05 §1.1).
2
+
3
+ The spec hash is the identity of a dataset's *design*. It must be stable across
4
+ machines and runs, so we define a strict canonical JSON form:
5
+
6
+ - object keys sorted lexicographically,
7
+ - no insignificant whitespace,
8
+ - numbers normalized (integral floats collapse to ints; shortest float repr),
9
+ - arrays preserve author order (order is semantic),
10
+ - the ``seed`` field is excluded (seed is not part of the design identity).
11
+
12
+ spec_hash = SHA256(canonical_json(spec_without_seed)) # hex
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import hashlib
18
+ import json
19
+ from typing import Any
20
+
21
+
22
def _normalize(value: Any) -> Any:
    """Recursively collapse whole-number floats to ints; keep everything else.

    Dicts are rebuilt with normalized values, lists and tuples become lists of
    normalized items, and a float such as ``1.0`` becomes ``1`` so that a value
    authored as an int and one authored as a whole float hash identically.
    Booleans are not floats, so they pass through untouched and stay JSON
    booleans.
    """
    if isinstance(value, dict):
        return {key: _normalize(item) for key, item in value.items()}
    if isinstance(value, (list, tuple)):
        return [_normalize(item) for item in value]
    if isinstance(value, float) and value.is_integer():
        return int(value)
    return value
40
+
41
+
42
def canonical_json(spec_body: dict[str, Any]) -> str:
    """Serialize a spec dict to its canonical JSON string.

    The top-level ``seed`` key is left out, numbers are normalized, keys are
    sorted, separators carry no whitespace, non-ASCII text is kept verbatim,
    and NaN/Infinity are refused (``allow_nan=False``).
    """
    design = dict(spec_body)
    design.pop("seed", None)
    return json.dumps(
        _normalize(design),
        sort_keys=True,
        separators=(",", ":"),
        ensure_ascii=False,
        allow_nan=False,
    )
53
+
54
+
55
def spec_hash(spec_body: dict[str, Any]) -> str:
    """Return the hex SHA-256 digest of the canonical (seed-excluded) spec."""
    encoded = canonical_json(spec_body).encode("utf-8")
    return hashlib.sha256(encoded).hexdigest()
@@ -0,0 +1,238 @@
1
+ """Pydantic v2 spec models — the parsed, validated form of doc 04.
2
+
3
+ These are pure (no DB/framework imports). Shape/type validation happens here;
4
+ cross-entity semantic checks (acyclicity, references) live in ``validate.py``.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from typing import Annotated, Any, Literal
10
+
11
+ from pydantic import BaseModel, ConfigDict, Field, field_validator
12
+
13
+ from .hashing import canonical_json, spec_hash
14
+
15
+ FEATURE_NAME_PATTERN = r"^[A-Za-z_][A-Za-z0-9_]*$"
16
+
17
+
18
class _Model(BaseModel):
    """Shared base for spec models; ``extra="forbid"`` rejects unknown keys."""

    model_config = ConfigDict(extra="forbid")
20
+
21
+
22
+ # --- Feature definitions (04 §4) -----------------------------------------------------
23
+
24
+
25
class _FeatureBase(_Model):
    """Fields shared by every feature type.

    ``emit=False`` marks a **latent** feature: it is sampled / computed and may
    drive the causal SEM (and appears in the true causal graph), but it is **not
    shipped** — excluded from the output CSV, the difficulty probe, compliance,
    and correlation/MI. It models hidden variables (unobserved confounders, a
    latent score that generates an observed label). The default is ``None`` ==
    emitted; only an explicit ``False`` is canonicalized, so adding the field
    never changes the hash of an existing spec (invariant #5/#6).
    """

    # Optional free-text description of the feature.
    description: str | None = None
    # Tri-state: None (default, emitted), True (emitted), False (latent).
    emit: bool | None = None
39
+
40
+
41
class NumericFeature(_FeatureBase):
    """Numeric feature: a named distribution with params, or causally derived."""

    type: Literal["numeric"]
    dist: str | None = None  # None => derived via causal
    # Distribution parameters by name; empty by default.
    params: dict[str, float] = Field(default_factory=dict)
    # Optional bounds; ``None`` means no bound is declared here.
    min: float | None = None
    max: float | None = None
    dtype: Literal["int", "float"] = "float"
48
+
49
+
50
class CategoricalFeature(_FeatureBase):
    """Categorical feature over a non-empty list of categories.

    ``weights`` is optional; when given, every weight must be a finite,
    non-negative number.
    """

    type: Literal["categorical"]
    categories: list[str] = Field(min_length=1)
    weights: list[float] | None = None

    @field_validator("weights")
    @classmethod
    def _weights_nonneg(cls, v: list[float] | None) -> list[float] | None:
        """Reject NaN/inf and negative weights; ``None`` passes through."""
        if v is None:
            return v
        import math

        # NaN slips past a plain ``w < 0`` test (every comparison with NaN is
        # False), and NaN/inf cannot be written to the canonical JSON used for
        # hashing (``allow_nan=False``), so fail here with a clear message.
        if not all(math.isfinite(w) for w in v):
            raise ValueError("categorical weights must be finite numbers")
        if any(w < 0 for w in v):
            raise ValueError("categorical weights must be non-negative")
        return v
61
+
62
+
63
class BooleanFeature(_FeatureBase):
    """Boolean feature with a ``rate`` constrained to ``[0, 1]`` (default 0.5)."""

    type: Literal["boolean"]
    rate: float = 0.5

    @field_validator("rate")
    @classmethod
    def _rate_in_unit(cls, v: float) -> float:
        """Accept ``v`` only when it lies in the closed unit interval."""
        if 0.0 <= v <= 1.0:
            return v
        raise ValueError("boolean rate must be in [0, 1]")
73
+
74
+
75
class DatetimeFeature(_FeatureBase):
    """Datetime feature between ``start`` and ``end`` at a fixed granularity."""

    type: Literal["datetime"]
    # Range bounds are kept as strings; they are not parsed in this model.
    start: str
    end: str
    granularity: Literal["second", "minute", "hour", "day"] = "day"
    dist: str = "uniform"
81
+
82
+
83
class TextFeature(_FeatureBase):
    """Text feature produced by a named generator in a given locale."""

    type: Literal["text"]
    # "lorem" emits filler words; any realistic provider key (name, email,
    # address, company, …) is served by mimesis. See dist/providers.py.
    generator: str = "lorem"
    locale: str = "en"
    # Length bounds as a {"min", "max"} mapping; a fresh dict per instance.
    length: dict[str, int] = Field(default_factory=lambda: {"min": 5, "max": 30})
90
+
91
+
92
class TrendSpec(_Model):
    """Linear trend ``slope·t + intercept`` over the row index."""

    # Both coefficients default to zero.
    slope: float = 0.0
    intercept: float = 0.0
97
+
98
+
99
class SeasonalitySpec(_Model):
    """One sinusoidal season ``amplitude·sin(2π·t/period + phase)``."""

    amplitude: float
    # Strictly positive, since it is the divisor in the formula above.
    period: float = Field(gt=0.0)
    phase: float = 0.0
105
+
106
+
107
class TimeseriesFeature(_FeatureBase):
    """An ordered series ``Xₜ = T(t) + S(t) + AR(p) + εₜ`` (05 §6).

    Generated over the row index (row order is preserved end-to-end). It is a
    *root* feature — it may be a causal **parent** but is never a causal target,
    and it is not assessed by distribution compliance (KS/GoF don't apply to a
    trended, autocorrelated series).
    """

    type: Literal["timeseries"]
    # Optional trend component; None when no trend is specified.
    trend: TrendSpec | None = None
    # Zero or more seasonal components.
    seasonality: list[SeasonalitySpec] = Field(default_factory=list)
    ar: list[float] = Field(default_factory=list)  # AR coefficients φ₁…φ_p
    noise_std: float = Field(default=1.0, ge=0.0)  # σ of εₜ ~ N(0, σ²)
    # Optional bounds; ``None`` means no bound is declared here.
    min: float | None = None
    max: float | None = None
    dtype: Literal["int", "float"] = "float"
124
+
125
+
126
+ Feature = Annotated[
127
+ NumericFeature
128
+ | CategoricalFeature
129
+ | BooleanFeature
130
+ | DatetimeFeature
131
+ | TextFeature
132
+ | TimeseriesFeature,
133
+ Field(discriminator="type"),
134
+ ]
135
+
136
+
137
+ # --- Causal graph (04 §5) ------------------------------------------------------------
138
+
139
+
140
class CausalEdge(_Model):
    """One directed edge of the causal graph, with its function and params.

    The endpoints are written ``from`` / ``to`` in the spec document and are
    exposed as ``src`` / ``dst`` in Python; ``populate_by_name=True`` also
    accepts the Python names on input.
    """

    model_config = ConfigDict(extra="forbid", populate_by_name=True)

    src: str = Field(alias="from")
    dst: str = Field(alias="to")
    fn: str  # linear | logistic | polynomial | map | identity | <plugin>
    # Function parameters; all optional.
    weight: float | None = None
    bias: float | None = None
    coeffs: list[float] | None = None
    mapping: dict[str, float] | None = None
150
+
151
+
152
class CausalGraph(_Model):
    """Causal section of a spec: edges, noise settings and interventions."""

    edges: list[CausalEdge] = Field(default_factory=list)
    # Noise and interventions are free-form dicts at this layer; their
    # contents are not shape-checked by this model.
    noise: dict[str, dict[str, Any]] = Field(default_factory=dict)
    interventions: list[dict[str, Any]] = Field(default_factory=list)
156
+
157
+
158
+ # --- Difficulty / failures / export (04 §6-8) ----------------------------------------
159
+
160
+
161
class Difficulty(_Model):
    """Difficulty-calibration settings: target, label, probe, budget, knobs."""

    # Either a string or a free-form mapping; not interpreted in this model.
    target: str | dict[str, Any]
    label: str
    probe: str = "logreg"
    # Iteration budget; must be at least 1.
    max_iters: int = Field(default=8, ge=1)
    # Lean-default knobs: feature-observation noise (primary) + label flips
    # (deep end). `causal` shrink / `imbalance` are planned (status.md backlog).
    knobs: list[str] = Field(default_factory=lambda: ["noise", "label_noise"])
169
+
170
+
171
class Failure(BaseModel):
    """One failure-mode entry; only ``type`` is declared here.

    Unlike the other spec models this one sets ``extra="allow"``, so extra
    keys are accepted at this layer.
    """

    # type-specific fields are validated by the FailureMode handler (P3).
    model_config = ConfigDict(extra="allow")

    type: str
176
+
177
+
178
+ ExportVersion = Literal["clean", "injected"]
179
+
180
+
181
def _default_versions() -> list[ExportVersion]:
    """Return a fresh default list of export versions: ``["clean"]``."""
    return ["clean"]
183
+
184
+
185
class ExportSpec(_Model):
    """Export settings: formats, versions, optional splits, shuffle, metadata."""

    # Defaults to CSV only.
    formats: list[str] = Field(default_factory=lambda: ["csv"])
    # Defaults to the clean version only.
    versions: list[ExportVersion] = Field(default_factory=_default_versions)
    # Optional mapping of split name to a float; None means no splits.
    splits: dict[str, float] | None = None
    shuffle: bool = True
    metadata: bool = True
191
+
192
+
193
+ # --- Top-level spec (04 §2) ----------------------------------------------------------
194
+
195
+
196
class Spec(_Model):
    """Top-level spec document (04 §2) with its canonical-form helpers.

    ``seed`` is part of the document but is left out of the canonical form
    and therefore out of ``spec_hash``.
    """

    datadoom_version: str
    name: str
    description: str | None = None
    seed: int | None = None
    rows: int = Field(ge=1)
    features: dict[str, Feature]
    causal: CausalGraph | None = None
    difficulty: Difficulty | None = None
    failures: list[Failure] = Field(default_factory=list)
    export: ExportSpec = Field(default_factory=ExportSpec)
    meta: dict[str, Any] = Field(default_factory=dict)

    @field_validator("name")
    @classmethod
    def _name_slug(cls, v: str) -> str:
        """Require ``name`` to consist solely of ``[A-Za-z0-9_-]`` characters."""
        import re

        # fullmatch rather than match + "$": "$" also matches just before a
        # trailing newline, which would let "name\n" through.
        if re.fullmatch(r"[A-Za-z0-9_-]+", v) is None:
            raise ValueError("name must be slug-friendly ([A-Za-z0-9_-]+)")
        return v

    # --- Canonical form & identity (05 §1) -------------------------------------------

    def body(self) -> dict[str, Any]:
        """Serializable spec document (aliases applied, None fields dropped)."""
        return self.model_dump(mode="json", by_alias=True, exclude_none=True)

    def canonical(self) -> str:
        """Canonical JSON string (seed excluded) used for hashing."""
        return canonical_json(self.body())

    def spec_hash(self) -> str:
        """sha256 of the canonical, seed-excluded spec."""
        return spec_hash(self.body())

    def latent_names(self) -> set[str]:
        """Feature names marked ``emit: false`` (computed but not shipped)."""
        return {name for name, feat in self.features.items() if feat.emit is False}

    def emitted_names(self) -> list[str]:
        """Feature names that are shipped, in spec order."""
        return [name for name, feat in self.features.items() if feat.emit is not False]