datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,418 @@
1
+ """Minimal deterministic generation pipeline (03, 05, 17 step 5).
2
+
3
+ This module runs the canonical 9-stage pipeline headlessly:
4
+
5
+ intake -> snapshot -> seed -> base_generation -> causal -> difficulty -> failure_injection -> compliance -> packaging
6
+
7
+ The causal, difficulty, and failure_injection stages run only when the spec configures them. The single
8
+ entry point :func:`generate` is what the CLI, API, and ``datadoom.generate()``
9
+ all call — generation logic is never duplicated.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ import os
15
+ from dataclasses import dataclass, field
16
+ from pathlib import Path
17
+ from typing import Any
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from ..version import __version__
23
+ from .dist import (
24
+ REGISTRY,
25
+ ComplianceReport,
26
+ assess_numeric,
27
+ sample_boolean,
28
+ sample_categorical,
29
+ sample_datetime,
30
+ sample_provider,
31
+ sample_text,
32
+ )
33
+ from .dist.compliance import DEFAULT_ALPHA
34
+ from .errors import SpecValidationError
35
+ from .export import EXPORTERS, ArtifactInfo, build_metadata, write_metadata
36
+ from .progress import ProgressEmitter
37
+ from .reports import ReportBundle, build_report
38
+ from .rng import RNGFactory
39
+ from .spec import Spec, validate_spec
40
+ from .spec.models import (
41
+ BooleanFeature,
42
+ CategoricalFeature,
43
+ DatetimeFeature,
44
+ NumericFeature,
45
+ TextFeature,
46
+ TimeseriesFeature,
47
+ )
48
+ from .timeseries import Seasonality, Trend, generate_series
49
+
50
# Stage identifiers in pipeline order. ``generate`` passes these same names to
# ``progress.emit`` as it enters each stage.
STAGES = (
    "intake", "snapshot", "seed",
    "base_generation", "causal", "difficulty",
    "failure_injection", "compliance", "packaging",
)
61
+
62
+
63
@dataclass
class RunContext:
    """Mutable state shared by the pipeline stages of a single run.

    ``generate`` builds one instance after seed resolution and hands it to the
    sampling, causal, difficulty, failure, and packaging steps.
    """

    # The spec being executed.
    spec: Spec
    # Value returned by ``spec.spec_hash()`` for this run.
    spec_hash: str
    # Effective seed, as chosen by ``resolve_seed``.
    seed: int
    # RNG factory constructed from ``spec_hash`` and ``seed``.
    rng: RNGFactory
    # Named frames; ``generate`` stores "clean" and, when failures are
    # configured, "injected".
    frames: dict[str, pd.DataFrame] = field(default_factory=dict)
    # Named reports; ``generate`` stores the compliance report under "compliance".
    reports: dict[str, Any] = field(default_factory=dict)
    # Progress sink for stage updates.
    progress: ProgressEmitter = field(default_factory=ProgressEmitter)
    # RNG namespaces drawn from so far ("feature:<name>" / "noise:<name>").
    used_namespaces: list[str] = field(default_factory=list)
73
+
74
+
75
@dataclass
class RunResult:
    """Everything ``generate`` returns to its caller for one completed run."""

    # Hash of the spec that produced the run.
    spec_hash: str
    # Effective seed used for the run.
    seed: int
    # The clean (post-difficulty, latent columns dropped) frame.
    frame: pd.DataFrame
    # Distribution-fit assessment of the clean frame.
    compliance: ComplianceReport
    # Metadata built during packaging; empty dict when nothing was written.
    metadata: dict[str, Any]
    # Artifacts written to disk; empty list when ``out_dir`` was not given.
    artifacts: list[ArtifactInfo]
    # String form of the output directory, or None when nothing was written.
    out_dir: str | None
    # Report bundle assembled by ``build_report``.
    report: ReportBundle
    # Failure-injected copy of the frame, when the spec configures failures.
    injected: pd.DataFrame | None = None
    # Difficulty calibration report, when the spec configures difficulty.
    difficulty: dict[str, Any] | None = None
87
+
88
+
89
def resolve_seed(spec: Spec, seed_override: int | None) -> int:
    """Pick the effective seed for a run.

    Precedence: an explicit ``seed_override`` wins, then ``spec.seed``; when
    both are ``None`` a fresh 63-bit value is drawn from ``os.urandom``. This is
    the only place OS entropy is consulted, and callers can record the returned
    value so the run can be repeated with the same seed.

    Returns:
        The chosen seed as an ``int``.
    """
    chosen = seed_override if seed_override is not None else spec.seed
    if chosen is None:
        # Mask off the top bit so the value fits in a signed 64-bit integer.
        return int.from_bytes(os.urandom(8), "big") & 0x7FFF_FFFF_FFFF_FFFF
    return int(chosen)
102
+
103
+
104
+ def _derived_features(spec: Spec) -> set[str]:
105
+ """Feature names produced by the causal layer (any edge destination)."""
106
+ if spec.causal is None:
107
+ return set()
108
+ return {edge.dst for edge in spec.causal.edges}
109
+
110
+
111
+ def _clamp_and_cast(
112
+ values: np.ndarray, lo: float | None, hi: float | None, dtype: str, n: int
113
+ ) -> tuple[np.ndarray, float]:
114
+ """Apply optional ``min``/``max`` clamping and int rounding; report clamped fraction."""
115
+ clamped_fraction = 0.0
116
+ if lo is not None or hi is not None:
117
+ low = -np.inf if lo is None else lo
118
+ high = np.inf if hi is None else hi
119
+ mask = (values < low) | (values > high)
120
+ clamped_fraction = float(np.mean(mask)) if n else 0.0
121
+ values = np.clip(values, low, high)
122
+ if dtype == "int":
123
+ values = np.rint(values).astype("int64")
124
+ return values, clamped_fraction
125
+
126
+
127
def _sample_feature(name: str, feat: Any, ctx: RunContext) -> tuple[np.ndarray, float]:
    """Draw ``ctx.spec.rows`` values for one feature.

    The RNG namespace used is recorded on ``ctx.used_namespaces``. Numeric and
    timeseries features go through ``_clamp_and_cast``; every other feature
    type reports a clamped fraction of ``0.0``.

    Returns:
        ``(values, clamped_fraction)``.

    Raises:
        SpecValidationError: For a numeric feature without a distribution, or
            for a feature type this function does not handle.
    """
    row_count = ctx.spec.rows
    feature_rng = ctx.rng.feature(name)
    ctx.used_namespaces.append(f"feature:{name}")

    if isinstance(feat, NumericFeature):
        if feat.dist is None:
            raise SpecValidationError(
                f"feature {name!r} has no distribution; derived (causal) features "
                "require the causal engine, which arrives in a later phase",
                locator=f"features.{name}",
            )
        drawn = REGISTRY[feat.dist].sample(feature_rng, row_count, feat.params)
        return _clamp_and_cast(drawn, feat.min, feat.max, feat.dtype, row_count)

    if isinstance(feat, TimeseriesFeature):
        # Series noise is drawn from the noise namespace, so the entry recorded
        # above is overwritten with the namespace that is actually used.
        noise_rng = ctx.rng.noise(name)
        ctx.used_namespaces[-1] = f"noise:{name}"
        trend = Trend(feat.trend.slope, feat.trend.intercept) if feat.trend else None
        seasonal_terms = [
            Seasonality(term.amplitude, term.period, term.phase) for term in feat.seasonality
        ]
        series = generate_series(
            noise_rng,
            row_count,
            trend=trend,
            seasonality=seasonal_terms,
            ar=feat.ar,
            noise_std=feat.noise_std,
        )
        return _clamp_and_cast(series, feat.min, feat.max, feat.dtype, row_count)

    # Remaining feature types are never clamped.
    if isinstance(feat, CategoricalFeature):
        sampled = sample_categorical(feature_rng, row_count, feat.categories, feat.weights)
    elif isinstance(feat, BooleanFeature):
        sampled = sample_boolean(feature_rng, row_count, feat.rate)
    elif isinstance(feat, DatetimeFeature):
        sampled = sample_datetime(feature_rng, row_count, feat.start, feat.end, feat.granularity)
    elif isinstance(feat, TextFeature):
        if feat.generator == "lorem":
            sampled = sample_text(
                feature_rng,
                row_count,
                feat.length.get("min", 5),
                feat.length.get("max", 30),
            )
        else:
            sampled = sample_provider(feature_rng, row_count, feat.generator, feat.locale)
    else:
        raise SpecValidationError(
            f"unsupported feature type for {name!r}", locator=f"features.{name}"
        )
    return sampled, 0.0
171
+
172
+
173
def _sample_root_features(
    ctx: RunContext, skip: set[str]
) -> tuple[dict[str, np.ndarray], dict[str, float]]:
    """Sample every spec feature not named in ``skip``; return (columns, clamp fractions)."""
    columns: dict[str, np.ndarray] = {}
    clamp_fractions: dict[str, float] = {}
    for fname, feat in ctx.spec.features.items():
        if fname in skip:
            continue
        sampled, clamped = _sample_feature(fname, feat, ctx)
        columns[fname] = sampled
        clamp_fractions[fname] = clamped
    return columns, clamp_fractions


def _assess_compliance(
    spec: Spec,
    frame: pd.DataFrame,
    clamp_fractions: dict[str, float],
    alpha: float,
) -> ComplianceReport:
    """Assess each emitted numeric feature that declares a distribution."""
    report = ComplianceReport(alpha=alpha)
    for fname, feat in spec.features.items():
        if feat.emit is False:
            continue  # latent features are dropped from the frame, nothing to assess
        if not isinstance(feat, NumericFeature) or feat.dist is None:
            continue
        assessment = assess_numeric(
            fname,
            feat.dist,
            feat.params,
            frame[fname].to_numpy(),
            clamped_fraction=clamp_fractions[fname],
            alpha=alpha,
            dtype=feat.dtype,
            clamp_min=feat.min,
            clamp_max=feat.max,
        )
        report.features.append(assessment)
    return report


def generate(
    spec: Spec,
    *,
    seed: int | None = None,
    out_dir: str | Path | None = None,
    progress: ProgressEmitter | None = None,
    alpha: float = DEFAULT_ALPHA,
) -> RunResult:
    """Run the pipeline for ``spec`` and, when ``out_dir`` is given, write artifacts.

    Args:
        spec: The spec to execute; it is validated first.
        seed: Optional seed override, resolved through ``resolve_seed``.
        out_dir: Directory to write the data, metadata, resolved spec, and audit
            report into. When ``None`` nothing is written to disk.
        progress: Receiver of stage updates; a fresh ``ProgressEmitter`` is used
            when none is supplied.
        alpha: Passed to the compliance report and to each numeric assessment.

    Returns:
        A ``RunResult`` with the clean frame, reports, and any written artifacts.
    """
    progress = progress or ProgressEmitter()

    # Stage 1: intake and validation.
    progress.emit("intake", 0, "validating spec")
    validate_spec(spec)

    # Stage 2: snapshot and hash.
    progress.emit("snapshot", 10, "hashing spec")
    spec_hash = spec.spec_hash()

    # Stage 3: seed resolution and run context.
    resolved_seed = resolve_seed(spec, seed)
    ctx = RunContext(
        spec=spec,
        spec_hash=spec_hash,
        seed=resolved_seed,
        rng=RNGFactory(spec_hash, resolved_seed),
        progress=progress,
    )
    progress.emit("seed", 20, f"seed={resolved_seed}")

    # Stage 4: sample the root features. Causal edge destinations are skipped
    # here and filled in by the causal stage instead.
    progress.emit("base_generation", 30, "sampling features")
    columns, clamp_fractions = _sample_root_features(ctx, _derived_features(spec))

    # Stage 5: causal execution, only when the spec has a causal section.
    causal_dag = None
    if spec.causal is not None:
        progress.emit("causal", 55, "executing structural equations")
        from .causal import execute_causal

        causal_dag = execute_causal(ctx, columns)

    frame = pd.DataFrame(columns, columns=list(spec.features))

    # Latent features took part in sampling and the causal stage but are not
    # shipped: drop them before any later stage can see them.
    latent = spec.latent_names()
    if latent:
        hidden = [col for col in latent if col in frame.columns]
        frame = frame.drop(columns=hidden)
    ctx.frames["clean"] = frame

    # Difficulty calibration replaces the clean frame with the calibrated one.
    difficulty_report: dict[str, Any] | None = None
    if spec.difficulty is not None:
        progress.emit("difficulty", 58, "calibrating difficulty")
        from .difficulty import calibrate_difficulty

        diff_result, frame = calibrate_difficulty(ctx, frame)
        difficulty_report = diff_result.to_dict()
        ctx.frames["clean"] = frame

    # Stage 6: failure injection yields a separate frame; ``frame`` stays clean.
    injected: pd.DataFrame | None = None
    failure_diffs: list[dict[str, Any]] | None = None
    if spec.failures:
        progress.emit("failure_injection", 62, "injecting failures")
        from .failure import apply_failures

        injected, failure_diffs = apply_failures(ctx, frame)
        ctx.frames["injected"] = injected

    # Stage 7: compliance, assessed on the clean frame.
    progress.emit("compliance", 70, "assessing distribution fit")
    report = _assess_compliance(spec, frame, clamp_fractions, alpha)
    ctx.reports["compliance"] = report

    # Stage 8: packaging.
    progress.emit("packaging", 90, "writing artifacts")
    determinism: dict[str, Any] = {
        "spec_hash": ctx.spec_hash,
        "seed": ctx.seed,
        "namespace_key_digests": ctx.rng.key_digests(sorted(set(ctx.used_namespaces))),
        "artifact_checksums": {},
    }
    artifacts: list[ArtifactInfo] = []
    metadata: dict[str, Any] = {}
    out_path: str | None = None
    write_to_disk = out_dir is not None
    if write_to_disk:
        out_path = str(out_dir)
        artifacts, metadata, checksums = _package(
            ctx, frame, report, determinism, Path(out_dir), injected, failure_diffs
        )
        determinism["artifact_checksums"] = checksums

    report_bundle = build_report(
        compliance=report,
        frame=frame,
        determinism=determinism,
        spec=spec,
        causal=spec.causal,
        causal_dag=causal_dag,
        failures=failure_diffs,
        injected=injected,
        difficulty=difficulty_report,
    )

    # The audit report is rendered from the finished bundle, so it is written
    # after packaging and appended to the artifact list on its own.
    if write_to_disk:
        from .audit import render_audit_markdown

        audit_md = render_audit_markdown(spec, report_bundle, package_version=__version__)
        audit_path = Path(out_dir) / "audit_report.md"
        artifacts.append(_write_audit_report(audit_md, audit_path))

    progress.emit("packaging", 100, "done")
    return RunResult(
        spec_hash=spec_hash,
        seed=resolved_seed,
        frame=frame,
        compliance=report,
        metadata=metadata,
        artifacts=artifacts,
        out_dir=out_path,
        report=report_bundle,
        injected=injected,
        difficulty=difficulty_report,
    )
325
+
326
+
327
def _package(
    ctx: RunContext,
    frame: pd.DataFrame,
    report: ComplianceReport,
    determinism: dict[str, Any],
    out_dir: Path,
    injected: pd.DataFrame | None = None,
    failure_diffs: list[dict[str, Any]] | None = None,
) -> tuple[list[ArtifactInfo], dict[str, Any], dict[str, str]]:
    """Write data files, ``metadata.json``, and the resolved spec into ``out_dir``.

    Args:
        ctx: Run context supplying the spec, spec hash, and seed.
        frame: The clean frame, always exported.
        report: Compliance report embedded in the metadata.
        determinism: Determinism record; it is copied, not mutated, with the
            data-file checksums filled in.
        out_dir: Target directory, created if missing.
        injected: Failure-injected frame; exported only when present and the
            spec's export versions include ``"injected"``.
        failure_diffs: Failure records passed through to the metadata.

    Returns:
        ``(all_artifacts, metadata, data_file_checksums)``.
    """
    out_dir.mkdir(parents=True, exist_ok=True)

    export_cfg = ctx.spec.export
    # Fall back to CSV when the spec lists no formats.
    formats = export_cfg.formats or ["csv"]
    variants: list[tuple[str, pd.DataFrame]] = [("clean", frame)]
    if injected is not None and "injected" in export_cfg.versions:
        variants.append(("injected", injected))

    # One file per (format, variant) pair.
    data_artifacts: list[ArtifactInfo] = []
    for fmt in formats:
        exporter = EXPORTERS[fmt]
        for version, variant in variants:
            stem = "data.injected" if version != "clean" else "data"
            filename = f"{stem}.{exporter.ext}"
            written = exporter.write(variant, out_dir / filename)
            # Record the bare file name so the path is relative to out_dir.
            written.path = filename
            written.version = version
            data_artifacts.append(written)

    # Only data files enter the checksum map.
    checksums = {item.path: item.checksum_sha256 for item in data_artifacts}
    metadata = build_metadata(
        spec_body=ctx.spec.body(),
        spec_hash=ctx.spec_hash,
        seed=ctx.seed,
        rows=ctx.spec.rows,
        package_version=__version__,
        artifacts=data_artifacts,
        compliance=report.to_dict(),
        determinism={**determinism, "artifact_checksums": checksums},
        failures=failure_diffs,
    )
    meta_artifact = write_metadata(metadata, out_dir / "metadata.json")
    meta_artifact.path = "metadata.json"

    # The spec body with the effective seed pinned, so the bundle records what
    # produced the run. It is a tracked artifact but is not added to the
    # checksum map above.
    resolved_spec = dict(ctx.spec.body())
    resolved_spec["seed"] = ctx.seed
    spec_artifact = _write_resolved_spec(resolved_spec, out_dir / "spec.resolved.yaml")

    return [*data_artifacts, meta_artifact, spec_artifact], metadata, checksums
385
+
386
+
387
def _write_audit_report(markdown: str, path: Path) -> ArtifactInfo:
    """Persist the audit markdown as UTF-8 and describe it as an artifact.

    The returned ``ArtifactInfo`` carries the bare file name, format ``"md"``,
    the SHA-256 of the written bytes, their length, and version ``"audit"``.
    """
    from .export import sha256_bytes

    payload = markdown.encode("utf-8")
    path.write_bytes(payload)
    return ArtifactInfo(
        path=path.name,
        format="md",
        checksum_sha256=sha256_bytes(payload),
        size_bytes=len(payload),
        version="audit",
    )
401
+
402
+
403
def _write_resolved_spec(spec_body: dict[str, Any], path: Path) -> ArtifactInfo:
    """Dump ``spec_body`` as key-sorted block-style YAML and describe the file.

    The returned ``ArtifactInfo`` carries the bare file name, format ``"yaml"``,
    the SHA-256 of the written bytes, their length, and version ``"spec"``.
    """
    import yaml

    from .export import sha256_bytes

    payload = yaml.safe_dump(spec_body, sort_keys=True, default_flow_style=False).encode("utf-8")
    path.write_bytes(payload)
    return ArtifactInfo(
        path=path.name,
        format="yaml",
        checksum_sha256=sha256_bytes(payload),
        size_bytes=len(payload),
        version="spec",
    )