datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
"""Deterministic ``metadata.json`` writer (04 §8, 06 §3.5).
|
|
2
|
+
|
|
3
|
+
The metadata document is intentionally free of timestamps or other ambient state
|
|
4
|
+
so that it is itself reproducible: the same ``(spec_hash, seed)`` produces an
|
|
5
|
+
identical metadata file. Human-facing run timestamps live in the persistence
|
|
6
|
+
layer, not in the reproducible artifact bundle.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
from .base import ArtifactInfo
|
|
16
|
+
from .checksums import sha256_bytes
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
def build_metadata(
    *,
    spec_body: dict[str, Any],
    spec_hash: str,
    seed: int,
    rows: int,
    package_version: str,
    artifacts: list[ArtifactInfo],
    compliance: dict[str, Any],
    determinism: dict[str, Any],
    failures: list[dict[str, Any]] | None = None,
) -> dict[str, Any]:
    """Assemble the ``metadata.json`` document for one run.

    All arguments are keyword-only. The inputs are stored as given (no copies
    are taken); each artifact is converted through its ``to_dict()`` method.
    Nothing ambient (clock, hostname, ...) is read here, so the result depends
    only on the arguments.

    Args:
        spec_body: Spec content, stored under ``"spec"``.
        spec_hash: Stored under ``"spec_hash"``.
        seed: Stored under ``"seed"``.
        rows: Stored under ``"rows"``.
        package_version: Stored under ``"datadoom_package_version"``.
        artifacts: Exported artifacts; serialized under ``"artifacts"``.
        compliance: Stored under ``"compliance"``.
        determinism: Stored under ``"determinism"``.
        failures: Optional failure summaries. The ``"failures"`` key is only
            emitted when this is not ``None`` (an empty list is still emitted).

    Returns:
        The metadata mapping, ready for :func:`write_metadata`.
    """
    artifact_entries = [info.to_dict() for info in artifacts]
    document: dict[str, Any] = dict(
        datadoom_package_version=package_version,
        spec_hash=spec_hash,
        seed=seed,
        rows=rows,
        spec=spec_body,
        artifacts=artifact_entries,
        compliance=compliance,
        determinism=determinism,
    )
    if failures is None:
        return document
    document["failures"] = failures
    return document
|
|
44
|
+
|
|
45
|
+
|
|
46
|
+
def write_metadata(metadata: dict[str, Any], path: str | Path) -> ArtifactInfo:
    """Serialize ``metadata`` to ``path`` as JSON and describe the written file.

    The document is dumped with sorted keys, a two-space indent, non-ASCII
    characters left unescaped, and a single trailing newline, then encoded as
    UTF-8. Missing parent directories are created. The checksum and size in
    the returned record are computed from the exact bytes that were written.

    Args:
        metadata: JSON-serializable metadata mapping.
        path: Destination file.

    Returns:
        An ``ArtifactInfo`` with format ``"json"`` for the written file.
    """
    target = Path(path)
    # Serialize first: if the document is not JSON-serializable, nothing is
    # created on disk.
    rendered = json.dumps(metadata, sort_keys=True, indent=2, ensure_ascii=False)
    payload = f"{rendered}\n".encode("utf-8")
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(payload)
    return ArtifactInfo(
        path=str(target),
        format="json",
        checksum_sha256=sha256_bytes(payload),
        size_bytes=len(payload),
    )
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
"""Parquet writer (17 step 18, 09 §8).
|
|
2
|
+
|
|
3
|
+
Parquet is a columnar format ML tooling consumes directly (pandas/Polars/Spark,
|
|
4
|
+
``datasets``). ``pyarrow`` is an **optional** dependency (``pip install
|
|
5
|
+
datadoom[parquet]``) — it's a large wheel and the core stays light — so the import
|
|
6
|
+
is deferred to ``write`` with an actionable error if it is missing. Within a
|
|
7
|
+
pinned environment the same ``(spec_hash, seed)`` yields identical bytes; the only
|
|
8
|
+
embedded ambient value is pyarrow's constant ``created_by`` build string.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
import pandas as pd
|
|
16
|
+
|
|
17
|
+
from ..errors import DataDoomError
|
|
18
|
+
from .base import ArtifactInfo, Exporter
|
|
19
|
+
from .checksums import sha256_file
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ParquetExporter(Exporter):
    """Exporter that writes a DataFrame as a Parquet file through pyarrow.

    pyarrow is imported lazily inside :meth:`write`, so this module can be
    imported without the optional ``parquet`` extra installed.
    """

    format = "parquet"

    def write(self, df: pd.DataFrame, path: str | Path) -> ArtifactInfo:
        """Write ``df`` to ``path`` as Parquet and describe the written file.

        Args:
            df: Frame to export; its index is not written.
            path: Destination file; missing parent directories are created.

        Returns:
            An ``ArtifactInfo`` carrying the path, this exporter's format, and
            the SHA-256 checksum and size of the file on disk.

        Raises:
            DataDoomError: If pyarrow cannot be imported.
        """
        try:
            import pyarrow as pa
            import pyarrow.parquet as pq
        except ImportError as exc:  # pragma: no cover - exercised only without the extra
            message = (
                "the 'parquet' export format needs pyarrow; install it with "
                "`pip install datadoom[parquet]`"
            )
            raise DataDoomError(message) from exc

        target = Path(path)
        target.parent.mkdir(parents=True, exist_ok=True)
        # Compression codec and format version are pinned rather than left to
        # pyarrow's defaults, so the output bytes stay reproducible on the
        # pinned path.
        pq.write_table(
            pa.Table.from_pandas(df, preserve_index=False),
            target,
            compression="snappy",
            version="2.6",
        )
        return ArtifactInfo(
            path=str(target),
            format=self.format,
            checksum_sha256=sha256_file(target),
            size_bytes=target.stat().st_size,
        )
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
"""Failure injection — deterministic corruption transforms (05 §4, 04 §7).
|
|
2
|
+
|
|
3
|
+
A clean baseline is captured before injection and always preserved alongside the
|
|
4
|
+
injected variant. Each mode draws from ``RNG(failure:i)`` so the injected frame
|
|
5
|
+
is itself reproducible on the pinned path.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from .apply import apply_failures
|
|
11
|
+
from .base import FailureMode
|
|
12
|
+
from .modes import FAILURE_MODES
|
|
13
|
+
|
|
14
|
+
__all__ = [
|
|
15
|
+
"FailureMode",
|
|
16
|
+
"FAILURE_MODES",
|
|
17
|
+
"apply_failures",
|
|
18
|
+
]
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Failure-injection orchestration (03 §4 stage 6, 05 §4).
|
|
2
|
+
|
|
3
|
+
Captures the clean baseline, then applies the spec's ordered failure list to a
|
|
4
|
+
copy, each mode drawing from its own ``RNG(failure:i)``. Returns the injected
|
|
5
|
+
frame and the per-mode diff summaries. The clean frame is never mutated.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
import pandas as pd
|
|
13
|
+
|
|
14
|
+
from .modes import FAILURE_MODES
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING: # avoid a runtime import cycle with pipeline
|
|
17
|
+
from ..pipeline import RunContext
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def apply_failures(ctx: RunContext, clean: pd.DataFrame) -> tuple[pd.DataFrame, list[dict[str, Any]]]:
    """Run every failure listed in the spec, in order, against a copy of ``clean``.

    ``clean`` itself is left untouched: all modes operate on one deep copy,
    so later modes see the effects of earlier ones. For the failure at
    position ``i`` the generator comes from ``ctx.rng.failure(i)`` and
    ``"failure:{i}"`` is appended to ``ctx.used_namespaces``.

    Args:
        ctx: Run context supplying ``spec.failures``, ``spec.features``,
            ``rng`` and ``used_namespaces``.
        clean: The clean baseline frame.

    Returns:
        ``(injected_frame, diffs)``. Each diff holds the failure's ``index``
        and ``type`` merged with the summary returned by the mode's ``apply``.
    """
    working = clean.copy(deep=True)
    summaries: list[dict[str, Any]] = []
    for position, entry in enumerate(ctx.spec.failures):
        handler = FAILURE_MODES[entry.type]
        # The mode receives its config without the discriminating ``type`` key.
        config = {
            field: value
            for field, value in entry.model_dump().items()
            if field != "type"
        }
        stream = ctx.rng.failure(position)
        ctx.used_namespaces.append(f"failure:{position}")
        effect = handler.apply(stream, working, config, ctx.spec.features)
        summaries.append({"index": position, "type": entry.type, **effect})
    return working, summaries
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
"""FailureMode ABC + shared helpers (05 §4, 04 §7).
|
|
2
|
+
|
|
3
|
+
A failure mode is a deterministic corruption transform applied *after* the clean
|
|
4
|
+
baseline is captured. Each mode reads its config (the failure spec entry minus
|
|
5
|
+
``type``), mutates the working ``injected`` frame in place, and returns a **diff
|
|
6
|
+
summary** describing what it changed (fraction nullified, realized rate, shift
|
|
7
|
+
magnitude, leakage correlation…). The clean baseline is never touched.
|
|
8
|
+
|
|
9
|
+
All randomness flows through an injected ``numpy.random.Generator`` (``RNG(failure:i)``)
|
|
10
|
+
so the injected variant is itself reproducible on the pinned path.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from abc import ABC, abstractmethod
|
|
16
|
+
from collections.abc import Mapping
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
import numpy as np
|
|
20
|
+
import pandas as pd
|
|
21
|
+
|
|
22
|
+
from ..errors import SpecValidationError
|
|
23
|
+
from ..spec.models import Feature
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def sigmoid(z: np.ndarray) -> np.ndarray:
    """Numerically-stable logistic function ``1 / (1 + exp(-z))``.

    ``np.where`` evaluates both of its branches for every element, so
    exponentiating ``-z`` and ``z`` separately overflows for large ``|z|``
    (and yields ``inf / inf``) even though those values are discarded. That
    emits RuntimeWarnings and raises under ``np.errstate(over="raise")``.
    Only ``exp(-|z|)`` is evaluated here; it always lies in ``[0, 1]`` and
    equals the exponential the selected branch needs, so results are unchanged.

    Args:
        z: Logits (array or scalar).

    Returns:
        Element-wise logistic of ``z``.
    """
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def standardize(values: np.ndarray) -> np.ndarray:
    """Return the z-scores of a numeric (or boolean) driver column.

    Mean and standard deviation are computed ignoring NaNs. A column whose
    standard deviation is zero or non-finite maps to all zeros. NaN entries
    in the input come out as a 0 score instead of propagating.

    Args:
        values: Driver values; converted to a float array.

    Returns:
        A float array with the same shape as ``values``.
    """
    column = np.asarray(values, dtype=float)
    spread = float(np.nanstd(column))
    usable = np.isfinite(spread) and spread != 0.0
    if not usable:
        return np.zeros_like(column)
    centered = column - float(np.nanmean(column))
    return np.nan_to_num(centered / spread, nan=0.0)
|
|
41
|
+
|
|
42
|
+
|
|
43
|
+
def calibrate_logistic_intercept(scores: np.ndarray, target_rate: float) -> float:
    """Solve for the intercept ``a`` with ``mean(sigmoid(a + scores)) == target_rate``.

    The mean logistic response grows monotonically with the intercept, so the
    root is located by bisection: 80 halvings of the bracket ``[-60, 60]``.
    MAR/MNAR use this to honor the requested *expected* missing rate while the
    per-row probability still depends on the driver/value.

    Args:
        scores: Per-row score added to the intercept.
        target_rate: Desired mean probability.

    Returns:
        The calibrated intercept; ``-inf`` when ``target_rate <= 0`` and
        ``+inf`` when ``target_rate >= 1``.
    """
    if target_rate <= 0.0:
        return float("-inf")
    if target_rate >= 1.0:
        return float("inf")

    lower, upper = -60.0, 60.0
    remaining = 80
    while remaining > 0:
        remaining -= 1
        candidate = 0.5 * (lower + upper)
        realized = float(np.mean(sigmoid(candidate + scores)))
        if realized < target_rate:
            # Too few positives: the intercept has to move up.
            lower = candidate
        else:
            upper = candidate
    return 0.5 * (lower + upper)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
# --- validation helpers --------------------------------------------------------------
|
|
66
|
+
|
|
67
|
+
|
|
68
|
+
def require_rate(params: Mapping[str, Any], locator: str) -> float:
    """Return the validated ``rate`` entry of a failure config as a float.

    Args:
        params: Failure parameters (the spec entry without ``type``).
        locator: Path of the failure entry; ``".rate"`` is appended to it in
            error locators.

    Returns:
        The rate, guaranteed to lie in ``[0, 1]``.

    Raises:
        SpecValidationError: If ``rate`` is absent/``None``, cannot be
            converted to a float, or falls outside ``[0, 1]`` (NaN included).
    """
    where = f"{locator}.rate"
    raw = params.get("rate")
    if raw is None:
        raise SpecValidationError("missing required 'rate'", locator=where)
    try:
        rate = float(raw)
    except (TypeError, ValueError) as exc:
        # Report a non-numeric rate as a located validation error rather than
        # letting a bare ValueError/TypeError escape.
        raise SpecValidationError("rate must be a number", locator=where) from exc
    if not 0.0 <= rate <= 1.0:
        raise SpecValidationError("rate must be in [0, 1]", locator=where)
    return rate
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def require_feature(
    params: Mapping[str, Any],
    key: str,
    features: Mapping[str, Feature],
    locator: str,
) -> str:
    """Return the feature name stored under ``params[key]`` after validating it.

    Args:
        params: Failure parameters (the spec entry without ``type``).
        key: Name of the parameter that should hold a feature reference.
        features: Declared features, keyed by name.
        locator: Path of the failure entry; ``".{key}"`` is appended to it in
            error locators.

    Returns:
        The referenced feature name.

    Raises:
        SpecValidationError: If the entry is absent or not a string, or if it
            names a feature that is not in ``features``.
    """
    where = f"{locator}.{key}"
    name = params.get(key)
    if isinstance(name, str):
        if name in features:
            return name
        raise SpecValidationError(
            f"{key} {name!r} is not a declared feature", locator=where
        )
    raise SpecValidationError(f"missing required '{key}'", locator=where)
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class FailureMode(ABC):
    """Abstract base for a corruption transform (05 §4).

    Subclasses must implement :meth:`apply`; :meth:`validate` is an optional
    hook whose default accepts any configuration.
    """

    # Identifier of the mode; annotated only, so subclasses provide the value.
    name: str
    # Optional JSON-schema fragment for the failure params (09 §6); ``None`` for built-ins.
    param_schema: Mapping[str, Any] | None = None

    def validate(
        self,
        params: Mapping[str, Any],
        features: Mapping[str, Feature],
        locator: str,
    ) -> None:
        """Verify that ``params`` carries what this mode needs.

        Overrides should check required fields and feature references and raise
        :class:`SpecValidationError` on the first problem. The base
        implementation performs no checks.

        Args:
            params: Failure parameters (the spec entry without ``type``).
            features: Declared features, keyed by name.
            locator: Path of the failure entry, for error reporting.
        """
        return

    @abstractmethod
    def apply(
        self,
        rng: np.random.Generator,
        frame: pd.DataFrame,
        params: Mapping[str, Any],
        features: Mapping[str, Feature],
    ) -> dict[str, Any]:
        """Corrupt ``frame`` in place and return a diff summary of the change.

        Args:
            rng: Generator that all of the mode's randomness must come from.
            frame: Working (injected) frame to mutate.
            params: Failure parameters (the spec entry without ``type``).
            features: Declared features, keyed by name.

        Returns:
            A mapping describing the realized effect of the mode.
        """
|