datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
@@ -0,0 +1,58 @@
1
+ """Deterministic ``metadata.json`` writer (04 §8, 06 §3.5).
2
+
3
+ The metadata document is intentionally free of timestamps or other ambient state
4
+ so that it is itself reproducible: the same ``(spec_hash, seed)`` produces an
5
+ identical metadata file. Human-facing run timestamps live in the persistence
6
+ layer, not in the reproducible artifact bundle.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+ from pathlib import Path
13
+ from typing import Any
14
+
15
+ from .base import ArtifactInfo
16
+ from .checksums import sha256_bytes
17
+
18
+
19
def build_metadata(
    *,
    spec_body: dict[str, Any],
    spec_hash: str,
    seed: int,
    rows: int,
    package_version: str,
    artifacts: list[ArtifactInfo],
    compliance: dict[str, Any],
    determinism: dict[str, Any],
    failures: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Assemble the ``metadata.json`` document for one run.

    Every argument is keyword-only. Each entry of ``artifacts`` is serialised
    through its ``to_dict()`` method; all other values are stored exactly as
    passed in (no copies are made). Nothing ambient (clock, host, ...) is
    read here, so the result depends only on the arguments.

    The ``"failures"`` key is present only when ``failures`` is not ``None``;
    an empty list is therefore still recorded.
    """
    artifact_entries = [artifact.to_dict() for artifact in artifacts]
    document: dict[str, Any] = dict(
        datadoom_package_version=package_version,
        spec_hash=spec_hash,
        seed=seed,
        rows=rows,
        spec=spec_body,
        artifacts=artifact_entries,
        compliance=compliance,
        determinism=determinism,
    )
    if failures is None:
        return document
    document["failures"] = failures
    return document
44
+
45
+
46
def write_metadata(metadata: dict[str, Any], path: str | Path) -> ArtifactInfo:
    """Serialise ``metadata`` to ``path`` and describe the written file.

    The document is rendered as JSON with sorted keys, a two-space indent and
    non-ASCII characters left unescaped, followed by a single trailing
    newline, then encoded as UTF-8. Missing parent directories are created.
    The returned ``ArtifactInfo`` carries the checksum and size of exactly the
    bytes that were written.
    """
    rendered = json.dumps(metadata, sort_keys=True, indent=2, ensure_ascii=False)
    payload = f"{rendered}\n".encode("utf-8")
    target = Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(payload)
    return ArtifactInfo(
        path=str(target),
        format="json",
        checksum_sha256=sha256_bytes(payload),
        size_bytes=len(payload),
    )
@@ -0,0 +1,45 @@
1
+ """Parquet writer (17 step 18, 09 §8).
2
+
3
+ Parquet is a columnar format ML tooling consumes directly (pandas/Polars/Spark,
4
+ ``datasets``). ``pyarrow`` is an **optional** dependency (``pip install
5
+ datadoom[parquet]``) — it's a large wheel and the core stays light — so the import
6
+ is deferred to ``write`` with an actionable error if it is missing. Within a
7
+ pinned environment the same ``(spec_hash, seed)`` yields identical bytes; the only
8
+ embedded ambient value is pyarrow's constant ``created_by`` build string.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from pathlib import Path
14
+
15
+ import pandas as pd
16
+
17
+ from ..errors import DataDoomError
18
+ from .base import ArtifactInfo, Exporter
19
+ from .checksums import sha256_file
20
+
21
+
22
class ParquetExporter(Exporter):
    """Exporter that writes a frame to a single Parquet file using pyarrow."""

    format = "parquet"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` to ``path`` as Parquet and describe the resulting file.

        pyarrow is imported here rather than at module import time; if it is
        not installed a ``DataDoomError`` with an install hint is raised,
        chained to the original ``ImportError``. Missing parent directories
        are created. The frame's index is not written.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as exc:  # pragma: no cover - exercised only without the extra
            message = (
                "the 'parquet' export format needs pyarrow; install it with "
                "`pip install datadoom[parquet]`"
            )
            raise DataDoomError(message) from exc

        destination = Path(path)
        destination.parent.mkdir(parents=True, exist_ok=True)
        # Compression codec and format version are pinned rather than left to
        # pyarrow's defaults, so the output does not drift with those defaults.
        pq.write_table(
            pa.Table.from_pandas(df, preserve_index=False),
            destination,
            compression="snappy",
            version="2.6",
        )
        return ArtifactInfo(
            path=str(destination),
            format=self.format,
            checksum_sha256=sha256_file(destination),
            size_bytes=destination.stat().st_size,
        )
@@ -0,0 +1,18 @@
1
+ """Failure injection — deterministic corruption transforms (05 §4, 04 §7).
2
+
3
+ A clean baseline is captured before injection and always preserved alongside the
4
+ injected variant. Each mode draws from ``RNG(failure:i)`` so the injected frame
5
+ is itself reproducible on the pinned path.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from .apply import apply_failures
11
+ from .base import FailureMode
12
+ from .modes import FAILURE_MODES
13
+
14
+ __all__ = [
15
+ "FailureMode",
16
+ "FAILURE_MODES",
17
+ "apply_failures",
18
+ ]
@@ -0,0 +1,37 @@
1
+ """Failure-injection orchestration (03 §4 stage 6, 05 §4).
2
+
3
+ Captures the clean baseline, then applies the spec's ordered failure list to a
4
+ copy, each mode drawing from its own ``RNG(failure:i)``. Returns the injected
5
+ frame and the per-mode diff summaries. The clean frame is never mutated.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ import pandas as pd
13
+
14
+ from .modes import FAILURE_MODES
15
+
16
+ if TYPE_CHECKING: # avoid a runtime import cycle with pipeline
17
+ from ..pipeline import RunContext
18
+
19
+
20
def apply_failures(ctx: RunContext, clean: pd.DataFrame) -> tuple[pd.DataFrame, list[dict[str, Any]]]:
    """Run every failure in ``ctx.spec.failures``, in order, on a deep copy of ``clean``.

    For the failure at position ``i`` the mode registered under its ``type``
    is looked up in ``FAILURE_MODES``, a generator is taken from
    ``ctx.rng.failure(i)``, ``"failure:i"`` is appended to
    ``ctx.used_namespaces``, and the mode is applied to the working copy with
    the failure's dumped fields minus ``type``.

    Returns ``(injected_frame, diffs)``. Each diff is the mode's own summary
    preceded by ``index`` and ``type`` keys; a summary key with either of
    those names takes precedence. ``clean`` itself is only copied here.
    """
    working = clean.copy(deep=True)
    summaries: list[dict[str, Any]] = []
    for position, entry in enumerate(ctx.spec.failures):
        handler = FAILURE_MODES[entry.type]
        # Everything except the discriminator is handed to the mode as config.
        options = {
            field: value
            for field, value in entry.model_dump().items()
            if field != "type"
        }
        generator = ctx.rng.failure(position)
        ctx.used_namespaces.append(f"failure:{position}")
        effect = handler.apply(generator, working, options, ctx.spec.features)
        summaries.append({"index": position, "type": entry.type, **effect})
    return working, summaries
@@ -0,0 +1,116 @@
1
+ """FailureMode ABC + shared helpers (05 §4, 04 §7).
2
+
3
+ A failure mode is a deterministic corruption transform applied *after* the clean
4
+ baseline is captured. Each mode reads its config (the failure spec entry minus
5
+ ``type``), mutates the working ``injected`` frame in place, and returns a **diff
6
+ summary** describing what it changed (fraction nullified, realized rate, shift
7
+ magnitude, leakage correlation…). The clean baseline is never touched.
8
+
9
+ All randomness flows through an injected ``numpy.random.Generator`` (``RNG(failure:i)``)
10
+ so the injected variant is itself reproducible on the pinned path.
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ from abc import ABC, abstractmethod
16
+ from collections.abc import Mapping
17
+ from typing import Any
18
+
19
+ import numpy as np
20
+ import pandas as pd
21
+
22
+ from ..errors import SpecValidationError
23
+ from ..spec.models import Feature
24
+
25
+
26
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically-stable logistic function ``1 / (1 + exp(-z))``.

    ``np.where`` evaluates both of its branch arguments for every element, so
    writing the two classic forms with ``exp(-z)`` and ``exp(z)`` still
    overflows (and produces ``inf / inf``) on the branch that is later
    discarded, emitting RuntimeWarnings for large ``|z|``. Here a single
    ``exp(-|z|)`` is shared by both branches: its argument is never positive,
    so it cannot overflow. On the selected branch it equals ``exp(-z)`` for
    ``z >= 0`` and ``exp(z)`` for ``z < 0``, so the returned values are
    unchanged.
    """
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
29
+
30
+
31
def standardize(values: np.ndarray) -> np.ndarray:
    """Return the z-scores of ``values`` as a float array.

    The input is converted to float, so booleans become 0/1. Mean and standard
    deviation are computed ignoring NaNs. If the standard deviation is zero or
    not finite, an all-zero array of the same shape is returned. Otherwise NaN
    entries score 0 instead of propagating.
    """
    arr = np.asarray(values, dtype=float)
    spread = float(np.nanstd(arr))
    if np.isfinite(spread) and spread != 0.0:
        centre = float(np.nanmean(arr))
        scores = (arr - centre) / spread
        return np.nan_to_num(scores, nan=0.0)
    return np.zeros_like(arr)
41
+
42
+
43
def calibrate_logistic_intercept(scores: np.ndarray, target_rate: float) -> float:
    """Bisect for the intercept ``a`` with ``mean(sigmoid(a + scores))`` at ``target_rate``.

    A target at or below 0 returns ``-inf`` and one at or above 1 returns
    ``+inf`` without searching. Otherwise the bracket ``[-60, 60]`` is halved
    80 times: the lower bound moves up while the realized mean is strictly
    below the target, else the upper bound moves down. The midpoint of the
    final bracket is returned.
    """
    if target_rate <= 0.0:
        return float("-inf")
    if target_rate >= 1.0:
        return float("inf")

    lower, upper = -60.0, 60.0
    remaining = 80
    while remaining > 0:
        candidate = 0.5 * (lower + upper)
        realized = float(np.mean(sigmoid(candidate + scores)))
        # Keep the strict "<" test: any other outcome shrinks from above.
        if realized < target_rate:
            lower = candidate
        else:
            upper = candidate
        remaining -= 1
    return 0.5 * (lower + upper)
63
+
64
+
65
+ # --- validation helpers --------------------------------------------------------------
66
+
67
+
68
def require_rate(params: Mapping[str, Any], locator: str) -> float:
    """Return ``params["rate"]`` as a float in ``[0, 1]``.

    Raises :class:`SpecValidationError`, located at ``<locator>.rate``, when
    the value is absent or ``None``, cannot be converted to a float, or falls
    outside ``[0, 1]`` (NaN fails the range check as well).
    """
    rate_locator = f"{locator}.rate"
    raw = params.get("rate")
    if raw is None:
        raise SpecValidationError("missing required 'rate'", locator=rate_locator)
    try:
        rate = float(raw)
    except (TypeError, ValueError, OverflowError) as exc:
        # A non-numeric rate is a spec problem; report it with its locator
        # instead of letting the bare conversion error escape.
        raise SpecValidationError("rate must be a number", locator=rate_locator) from exc
    if not 0.0 <= rate <= 1.0:
        raise SpecValidationError("rate must be in [0, 1]", locator=rate_locator)
    return rate
76
+
77
+
78
def require_feature(
    params: Mapping[str, Any],
    key: str,
    features: Mapping[str, Feature],
    locator: str,
) -> str:
    """Return the feature name stored under ``params[key]``.

    Raises :class:`SpecValidationError`, located at ``<locator>.<key>``, when
    the entry is absent or not a string, or when it names something that is
    not a key of ``features``.
    """
    where = f"{locator}.{key}"
    name = params.get(key)
    if isinstance(name, str):
        if name in features:
            return name
        raise SpecValidationError(
            f"{key} {name!r} is not a declared feature", locator=where
        )
    raise SpecValidationError(f"missing required '{key}'", locator=where)
92
+
93
+
94
class FailureMode(ABC):
    """Abstract base for one corruption transform (05 §4).

    Subclasses must implement :meth:`apply`; :meth:`validate` is an optional
    hook whose default accepts any config.
    """

    # Identifier of the mode; annotated only, so subclasses supply the value.
    name: str
    # JSON-schema fragment describing the failure params (09 §6), if any.
    # Built-in modes leave this as ``None``.
    param_schema: Mapping[str, Any] | None = None

    def validate(
        self,
        params: Mapping[str, Any],
        features: Mapping[str, Feature],
        locator: str,
    ) -> None:
        """Check that ``params`` has what this mode needs and names real features.

        Overrides should raise :class:`SpecValidationError` on the first
        problem found. The base implementation performs no checks.
        """

    @abstractmethod
    def apply(
        self,
        rng: np.random.Generator,
        frame: pd.DataFrame,
        params: Mapping[str, Any],
        features: Mapping[str, Feature],
    ) -> dict[str, Any]:
        """Mutate ``frame`` in place and return a diff summary."""