datadoom 0.1.0.dev0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datadoom/__init__.py +23 -0
- datadoom/adapters/__init__.py +29 -0
- datadoom/adapters/frameworks.py +94 -0
- datadoom/adapters/loaders.py +72 -0
- datadoom/api/__init__.py +11 -0
- datadoom/api/app.py +109 -0
- datadoom/api/deps.py +30 -0
- datadoom/api/errors.py +89 -0
- datadoom/api/estimate.py +82 -0
- datadoom/api/routes/__init__.py +7 -0
- datadoom/api/routes/artifacts.py +147 -0
- datadoom/api/routes/datasets.py +180 -0
- datadoom/api/routes/meta.py +45 -0
- datadoom/api/routes/plugins.py +22 -0
- datadoom/api/routes/runs.py +144 -0
- datadoom/api/routes/specs.py +73 -0
- datadoom/api/routes/templates.py +30 -0
- datadoom/api/schemas.py +230 -0
- datadoom/api/serializers.py +143 -0
- datadoom/api/state.py +24 -0
- datadoom/api/store_helpers.py +56 -0
- datadoom/api/ws.py +72 -0
- datadoom/cli/__init__.py +1 -0
- datadoom/cli/main.py +313 -0
- datadoom/config.py +108 -0
- datadoom/engine/__init__.py +38 -0
- datadoom/engine/advice.py +289 -0
- datadoom/engine/audit.py +290 -0
- datadoom/engine/causal/__init__.py +15 -0
- datadoom/engine/causal/execute.py +116 -0
- datadoom/engine/causal/functions.py +116 -0
- datadoom/engine/causal/graph.py +54 -0
- datadoom/engine/difficulty/__init__.py +36 -0
- datadoom/engine/difficulty/calibrate.py +235 -0
- datadoom/engine/difficulty/knobs.py +171 -0
- datadoom/engine/difficulty/probes.py +181 -0
- datadoom/engine/dist/__init__.py +35 -0
- datadoom/engine/dist/base.py +46 -0
- datadoom/engine/dist/builtins.py +172 -0
- datadoom/engine/dist/compliance.py +344 -0
- datadoom/engine/dist/providers.py +117 -0
- datadoom/engine/errors.py +32 -0
- datadoom/engine/export/__init__.py +27 -0
- datadoom/engine/export/base.py +49 -0
- datadoom/engine/export/checksums.py +18 -0
- datadoom/engine/export/csv_exporter.py +34 -0
- datadoom/engine/export/json_exporter.py +67 -0
- datadoom/engine/export/metadata.py +58 -0
- datadoom/engine/export/parquet_exporter.py +45 -0
- datadoom/engine/failure/__init__.py +18 -0
- datadoom/engine/failure/apply.py +37 -0
- datadoom/engine/failure/base.py +116 -0
- datadoom/engine/failure/modes.py +442 -0
- datadoom/engine/pipeline.py +418 -0
- datadoom/engine/profile.py +327 -0
- datadoom/engine/progress.py +14 -0
- datadoom/engine/reference.py +338 -0
- datadoom/engine/reports.py +206 -0
- datadoom/engine/rng.py +79 -0
- datadoom/engine/spec/__init__.py +45 -0
- datadoom/engine/spec/hashing.py +57 -0
- datadoom/engine/spec/models.py +238 -0
- datadoom/engine/spec/validate.py +345 -0
- datadoom/engine/timeseries.py +88 -0
- datadoom/jobs/__init__.py +14 -0
- datadoom/jobs/progress.py +155 -0
- datadoom/jobs/worker.py +162 -0
- datadoom/plugin.py +35 -0
- datadoom/plugins/__init__.py +47 -0
- datadoom/plugins/contracts.py +72 -0
- datadoom/plugins/loader.py +125 -0
- datadoom/plugins/registry.py +214 -0
- datadoom/plugins/scaffold.py +434 -0
- datadoom/store/__init__.py +47 -0
- datadoom/store/artifacts.py +67 -0
- datadoom/store/db.py +104 -0
- datadoom/store/migrations/__init__.py +0 -0
- datadoom/store/migrations/env.py +53 -0
- datadoom/store/migrations/script.py.mako +24 -0
- datadoom/store/migrations/versions/0001_init.py +149 -0
- datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
- datadoom/store/migrations/versions/0003_run_name.py +23 -0
- datadoom/store/migrations/versions/0004_report_profile.py +24 -0
- datadoom/store/models.py +170 -0
- datadoom/store/repositories.py +279 -0
- datadoom/templates/__init__.py +239 -0
- datadoom/templates/ab_test.datadoom.yaml +46 -0
- datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
- datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
- datadoom/templates/customer_churn.datadoom.yaml +60 -0
- datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
- datadoom/templates/fraud_detection.datadoom.yaml +57 -0
- datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
- datadoom/templates/insurance_claims.datadoom.yaml +43 -0
- datadoom/templates/iot_sensors.datadoom.yaml +44 -0
- datadoom/templates/people_directory.datadoom.yaml +56 -0
- datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
- datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
- datadoom/version.py +3 -0
- datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
- datadoom/webdist/assets/index-doRjyG5s.css +1 -0
- datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
- datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
- datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
- datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
- datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
- datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
- datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
- datadoom/webdist/index.html +15 -0
- datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
- datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
- datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
- datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
- datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
datadoom/__init__.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
"""DataDoom — local-first engine for reproducible synthetic data.
|
|
2
|
+
|
|
3
|
+
Public API:
|
|
4
|
+
|
|
5
|
+
>>> import datadoom
|
|
6
|
+
>>> spec = datadoom.load_spec("dataset.datadoom.yaml")
|
|
7
|
+
>>> result = datadoom.generate(spec, seed=42)
|
|
8
|
+
>>> result.frame.head()
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
from .engine import Spec, generate, load_spec, parse_spec, validate_spec
|
|
14
|
+
from .version import __version__
|
|
15
|
+
|
|
16
|
+
__all__ = [
|
|
17
|
+
"Spec",
|
|
18
|
+
"generate",
|
|
19
|
+
"load_spec",
|
|
20
|
+
"parse_spec",
|
|
21
|
+
"validate_spec",
|
|
22
|
+
"__version__",
|
|
23
|
+
]
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Framework adapters — load a generated DataDoom run into ML frameworks (18.4).
|
|
2
|
+
|
|
3
|
+
A run directory (the ``--out`` of ``datadoom run``) holds ``data.csv`` (and any
|
|
4
|
+
``data.injected.*`` / other formats). These helpers turn it into the in-memory
|
|
5
|
+
object each framework expects:
|
|
6
|
+
|
|
7
|
+
* :func:`load_dataframe` — a **pandas** ``DataFrame`` (no extra needed; pandas is
|
|
8
|
+
a core dep). Auto-detects csv / parquet / json and the clean/injected variant.
|
|
9
|
+
* :func:`to_torch_dataset` — a ``torch.utils.data.TensorDataset`` (extra: ``torch``).
|
|
10
|
+
* :func:`to_tf_dataset` — a ``tf.data.Dataset`` (extra: ``tf``).
|
|
11
|
+
* :func:`to_hf_dataset` — a HuggingFace ``datasets.Dataset`` (extra: ``hf``).
|
|
12
|
+
|
|
13
|
+
The framework loaders **lazy-import** their backend and raise a clear install
|
|
14
|
+
hint if it is missing, so the core install stays light. This package depends only
|
|
15
|
+
on the engine (for nothing heavyweight) + pandas; the engine never imports it.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
from .frameworks import to_hf_dataset, to_tf_dataset, to_torch_dataset
|
|
21
|
+
from .loaders import load_dataframe, numeric_feature_columns
|
|
22
|
+
|
|
23
|
+
__all__ = [
|
|
24
|
+
"load_dataframe",
|
|
25
|
+
"numeric_feature_columns",
|
|
26
|
+
"to_torch_dataset",
|
|
27
|
+
"to_tf_dataset",
|
|
28
|
+
"to_hf_dataset",
|
|
29
|
+
]
|
|
@@ -0,0 +1,94 @@
|
|
|
1
|
+
"""Convert a pandas DataFrame into torch / tensorflow / HuggingFace datasets.
|
|
2
|
+
|
|
3
|
+
Each converter lazy-imports its backend so the core install stays light; a
|
|
4
|
+
missing backend raises an actionable install hint naming the right extra.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from importlib import import_module
|
|
10
|
+
from typing import TYPE_CHECKING, Any
|
|
11
|
+
|
|
12
|
+
import numpy as np
|
|
13
|
+
import pandas as pd
|
|
14
|
+
|
|
15
|
+
from .loaders import numeric_feature_columns
|
|
16
|
+
|
|
17
|
+
if TYPE_CHECKING: # pragma: no cover - typing only
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def _require(module: str, extra: str) -> Any:
    """Import an optional backend module, or fail with an install hint.

    Args:
        module: importable name of the backend (e.g. ``"torch"``).
        extra: name of the ``datadoom`` extra that provides that backend.

    Returns:
        The imported module object.

    Raises:
        ImportError: if ``module`` cannot be imported. The message names the
            ``pip install 'datadoom[<extra>]'`` command and the original
            import failure is chained as the cause.
    """
    try:
        backend = import_module(module)
    except ImportError as exc:  # pragma: no cover - exercised only without the extra
        hint = (
            f"{module!r} is required for this adapter. Install it with: "
            f"pip install 'datadoom[{extra}]'"
        )
        raise ImportError(hint) from exc
    return backend
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
def _feature_matrix(
    df: pd.DataFrame, feature_columns: list[str] | None, target: str | None
) -> tuple[list[str], np.ndarray]:
    """Pick the feature columns of ``df`` and return them as a float32 matrix.

    Args:
        df: source frame.
        feature_columns: explicit column names to use. When empty or ``None``
            the numeric/boolean columns are auto-detected instead.
        target: optional label column, excluded from auto-detection.

    Returns:
        ``(columns, matrix)`` — the selected column names and the
        corresponding values converted with ``dtype="float32"``.

    Raises:
        ValueError: if no feature column could be selected.
    """
    selected = feature_columns
    if not selected:
        excluded = [target] if target else None
        selected = numeric_feature_columns(df, exclude=excluded)
    if not selected:
        raise ValueError(
            "no numeric/boolean feature columns found; pass feature_columns explicitly "
            "or encode categorical/text columns first"
        )
    matrix = df[selected].to_numpy(dtype="float32")
    return selected, matrix
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def to_torch_dataset(
    df: pd.DataFrame,
    *,
    target: str | None = None,
    feature_columns: list[str] | None = None,
) -> Any:
    """Convert ``df`` into a ``torch.utils.data.TensorDataset`` (extra: ``torch``).

    Args:
        df: source frame.
        target: optional label column; when given, its values become the
            second tensor of the dataset.
        feature_columns: explicit feature columns; defaults to the
            numeric/boolean columns of ``df`` (minus ``target``).

    Returns:
        A ``TensorDataset`` holding the float32 feature tensor and, if
        ``target`` is set, the label tensor.

    Raises:
        ImportError: if ``torch`` is not installed.
        ValueError: if no feature columns are available.
    """
    torch = _require("torch", "torch")
    _, features = _feature_matrix(df, feature_columns, target)
    tensors = [torch.as_tensor(features)]
    if target is not None:
        labels = df[target].to_numpy()
        # Non-object labels are cast to float32; object arrays are handed to
        # torch unchanged.
        if labels.dtype != object:
            labels = labels.astype("float32")
        tensors.append(torch.as_tensor(labels))
    return torch.utils.data.TensorDataset(*tensors)
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def to_tf_dataset(
    df: pd.DataFrame,
    *,
    target: str | None = None,
    feature_columns: list[str] | None = None,
    batch_size: int | None = None,
) -> Any:
    """Convert ``df`` into a ``tf.data.Dataset`` (extra: ``tf``).

    Args:
        df: source frame.
        target: optional label column; when set, elements are
            ``(features, label)`` pairs instead of bare feature rows.
        feature_columns: explicit feature columns; defaults to the
            numeric/boolean columns of ``df`` (minus ``target``).
        batch_size: when truthy, the dataset is batched with this size.

    Returns:
        The (optionally batched) ``tf.data.Dataset``.

    Raises:
        ImportError: if ``tensorflow`` is not installed.
        ValueError: if no feature columns are available.
    """
    tf = _require("tensorflow", "tf")
    _, features = _feature_matrix(df, feature_columns, target)
    slices = features if target is None else (features, df[target].to_numpy())
    dataset = tf.data.Dataset.from_tensor_slices(slices)
    return dataset.batch(batch_size) if batch_size else dataset
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def to_hf_dataset(df: pd.DataFrame) -> Any:
    """Convert ``df`` into a HuggingFace ``datasets.Dataset`` (extra: ``hf``).

    All columns are passed through (no numeric-only filtering); the pandas
    index is not preserved as a column.

    Raises:
        ImportError: if the ``datasets`` package is not installed.
    """
    hf_datasets = _require("datasets", "hf")
    dataset_cls = hf_datasets.Dataset
    return dataset_cls.from_pandas(df, preserve_index=False)
|
|
@@ -0,0 +1,72 @@
|
|
|
1
|
+
"""Read generated run artifacts into a pandas DataFrame."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import pandas as pd
|
|
8
|
+
|
|
9
|
+
# Preferred read order: CSV is the canonical artifact, then parquet, then json.
|
|
10
|
+
_READERS: list[tuple[str, str]] = [("csv", "csv"), ("parquet", "parquet"), ("json", "json")]
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _read(path: Path, kind: str) -> pd.DataFrame:
    """Read ``path`` with the pandas reader matching ``kind``.

    ``"csv"`` and ``"parquet"`` use their dedicated readers; every other
    ``kind`` is read as record-oriented JSON.
    """
    direct_readers = {"csv": pd.read_csv, "parquet": pd.read_parquet}
    reader = direct_readers.get(kind)
    if reader is not None:
        return reader(path)
    return pd.read_json(path, orient="records")
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def load_dataframe(
    run_dir: str | Path,
    *,
    version: str = "clean",
    split: str | None = None,
) -> pd.DataFrame:
    """Load a generated dataset variant into a pandas ``DataFrame``.

    The artifact file name is ``data[.injected][.<split>].<ext>``; extensions
    are probed in the order given by ``_READERS`` and the first existing file
    is read.

    Args:
        run_dir: the run output directory (``datadoom run --out <dir>``).
        version: ``"clean"`` (default) or ``"injected"`` (the corrupted variant).
        split: optional split name (e.g. ``"train"``) if split files were written.

    Returns:
        The dataset as a ``DataFrame``, columns in spec order.

    Raises:
        ValueError: if ``version`` is neither ``"clean"`` nor ``"injected"``.
        FileNotFoundError: if no matching data artifact exists in ``run_dir``.
    """
    stems = {"clean": "data", "injected": "data.injected"}
    # Reject unknown versions explicitly: previously any value other than
    # "clean" (including a typo such as "Clean") silently selected the
    # corrupted variant.
    if version not in stems:
        raise ValueError(
            f"unknown version {version!r}; expected 'clean' or 'injected'"
        )

    base = Path(run_dir)
    stem = stems[version]
    if split:
        stem = f"{stem}.{split}"

    for ext, kind in _READERS:
        candidate = base / f"{stem}.{ext}"
        if candidate.exists():
            return _read(candidate, kind)

    tried = ", ".join(f"{stem}.{ext}" for ext, _ in _READERS)
    raise FileNotFoundError(
        f"no data artifact for version={version!r}"
        + (f", split={split!r}" if split else "")
        + f" in {base} (looked for: {tried})"
    )
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def numeric_feature_columns(df: pd.DataFrame, *, exclude: list[str] | None = None) -> list[str]:
    """List the numeric and boolean columns of ``df``, in frame order.

    Columns of any other dtype (categorical, text, datetime, ...) are left
    out; encode them yourself if you need them as features.

    Args:
        df: frame to inspect.
        exclude: column names to omit from the result, e.g. the target.

    Returns:
        Names of the model-ready columns.
    """
    skipped = set(exclude) if exclude else set()
    dtype_api = pd.api.types

    def _is_feature(series) -> bool:  # noqa: ANN001
        return dtype_api.is_numeric_dtype(series) or dtype_api.is_bool_dtype(series)

    return [
        column
        for column in df.columns
        if column not in skipped and _is_feature(df[column])
    ]
|
datadoom/api/__init__.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""DataDoom HTTP API (FastAPI). Thin layer over ``jobs`` + ``store`` + ``engine``.
|
|
2
|
+
|
|
3
|
+
``create_app()`` is the entry point; ``datadoom serve`` (CLI) runs it under
|
|
4
|
+
uvicorn.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from .app import create_app
|
|
10
|
+
|
|
11
|
+
__all__ = ["create_app"]
|
datadoom/api/app.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
"""FastAPI application factory (08, 17 step 9).
|
|
2
|
+
|
|
3
|
+
Assembles config -> store -> jobs -> api into one app: opens the DB (running
|
|
4
|
+
Alembic to head), wires the worker + event hub, mounts the REST routes and the
|
|
5
|
+
WebSocket/SSE transport, installs the error envelope, and serves the bundled SPA
|
|
6
|
+
from ``webdist/`` (so ``datadoom serve`` is a complete app with no Node needed).
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import asyncio
|
|
12
|
+
from contextlib import asynccontextmanager
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from fastapi import FastAPI
|
|
16
|
+
from fastapi.responses import FileResponse, JSONResponse
|
|
17
|
+
from fastapi.staticfiles import StaticFiles
|
|
18
|
+
|
|
19
|
+
from datadoom.config import Config, load_config
|
|
20
|
+
from datadoom.jobs import EventHub, WorkerPool
|
|
21
|
+
from datadoom.plugins import load_plugins
|
|
22
|
+
from datadoom.store import LocalArtifactStore, init_database
|
|
23
|
+
from datadoom.version import __version__
|
|
24
|
+
|
|
25
|
+
from . import ws
|
|
26
|
+
from .errors import install_error_handlers
|
|
27
|
+
from .routes import artifacts, datasets, meta, plugins, runs, specs, templates
|
|
28
|
+
from .state import AppState
|
|
29
|
+
|
|
30
|
+
WEBDIST = Path(__file__).resolve().parent.parent / "webdist"
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def create_app(config: Config | None = None) -> FastAPI:
    """Assemble the DataDoom FastAPI application.

    Wires config -> store -> jobs -> api: ensures the configured directories
    exist, loads plugins, initialises the database and artifact store, creates
    the event hub and worker pool, registers error handlers and all routers,
    and finally mounts the bundled SPA.

    Args:
        config: configuration to use; when omitted (or falsy) it is loaded
            with ``load_config()``.

    Returns:
        The configured ``FastAPI`` instance, with the shared ``AppState``
        available as ``app.state.dd``.
    """
    config = config or load_config()
    config.ensure_dirs()

    # Discover plugins (entry points + local dir) into the engine's lookup tables;
    # conflicts fail loudly here rather than silently shadowing a capability (09 §3).
    load_plugins(local_dir=config.home / "plugins")

    db = init_database(config.db_url)
    artifact_store = LocalArtifactStore(config.artifacts_dir)
    hub = EventHub()
    worker = WorkerPool(db, artifact_store, hub, __version__)
    state = AppState(
        config=config, db=db, artifacts=artifact_store, hub=hub, worker=worker
    )

    @asynccontextmanager
    async def lifespan(app: FastAPI):  # noqa: ANN202
        # Bind the running loop so worker threads can marshal WS events onto it.
        hub.bind_loop(asyncio.get_running_loop())
        try:
            yield
        finally:
            # Teardown must run even when the context is left through an
            # exception or cancellation, and the database must be disposed
            # even if stopping the workers fails.
            try:
                worker.shutdown()
            finally:
                db.dispose()

    app = FastAPI(
        title="DataDoom",
        version=__version__,
        description="Local-first engine for controllable, reproducible synthetic data.",
        openapi_url="/api/openapi.json",
        docs_url="/api/docs",
        redoc_url=None,
        lifespan=lifespan,
    )
    app.state.dd = state

    install_error_handlers(app)

    for module in (meta, specs, datasets, runs, artifacts, templates, plugins):
        app.include_router(module.router)
    app.include_router(ws.router)

    # Mounted last: the SPA registers a catch-all route, so the API routers
    # above have to be registered before it.
    _mount_spa(app)
    return app
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
def _mount_spa(app: FastAPI) -> None:
    """Serve the built SPA at ``/`` with client-side-routing fallback.

    If ``webdist/`` is absent (dev before the frontend is built), ``/`` returns a
    friendly JSON pointer instead of 404 so the API is still usable.

    Otherwise ``/assets`` is mounted as static files (when the directory
    exists) and a catch-all GET route serves real files from ``webdist/``,
    falling back to ``index.html`` for everything else. Requested paths are
    resolved and must stay inside ``webdist/``; anything escaping it (``..``
    segments, absolute paths, symlinks pointing outside) gets the SPA entry
    instead of the file.

    Args:
        app: the application to register the routes on.
    """
    index_file = WEBDIST / "index.html"
    if not index_file.exists():

        @app.get("/", include_in_schema=False)
        async def _no_spa() -> JSONResponse:  # noqa: ANN202
            return JSONResponse(
                {
                    "status": "ok",
                    "message": "DataDoom API is running. The web UI is not built; "
                    "run `cd frontend && npm install && npm run build`.",
                    "docs": "/api/docs",
                }
            )

        return

    assets = WEBDIST / "assets"
    if assets.exists():
        app.mount("/assets", StaticFiles(directory=assets), name="assets")

    # Resolved once; every requested file must live underneath this directory.
    webdist_root = WEBDIST.resolve()

    def _safe_file(full_path: str) -> Path | None:
        """Return the file for ``full_path`` if it exists inside ``webdist/``."""
        if not full_path:
            return None
        try:
            candidate = (WEBDIST / full_path).resolve()
            # Raises ValueError when the resolved path is outside webdist/,
            # which is how traversal attempts are rejected.
            candidate.relative_to(webdist_root)
        except (OSError, RuntimeError, ValueError):
            return None
        return candidate if candidate.is_file() else None

    @app.get("/{full_path:path}", include_in_schema=False)
    async def _spa(full_path: str) -> FileResponse:  # noqa: ANN202
        # Serve real files when they exist; otherwise the SPA entry (client routing).
        target = _safe_file(full_path)
        if target is not None:
            return FileResponse(target)
        return FileResponse(index_file)
|
datadoom/api/deps.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
"""FastAPI dependencies: app state, DB sessions, and (no-op local) auth.
|
|
2
|
+
|
|
3
|
+
Auth is a no-op dependency in local mode (08 §1); team mode swaps in a real
|
|
4
|
+
bearer-token dependency without changing any route signature.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from collections.abc import Iterator
|
|
10
|
+
|
|
11
|
+
from fastapi import Request
|
|
12
|
+
from sqlalchemy.orm import Session
|
|
13
|
+
|
|
14
|
+
from .state import AppState
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def get_state(request: Request) -> AppState:
    """Return the shared application state stored on the app as ``app.state.dd``."""
    application = request.app
    return application.state.dd
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def get_session(request: Request) -> Iterator[Session]:
    """Yield one database session for the duration of the request.

    The session comes from the ``db.session()`` context manager of the app
    state, so it is opened before the yield and closed by that context
    manager once the consumer is done with it.
    """
    database = request.app.state.dd.db
    with database.session() as active_session:
        yield active_session
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def current_owner() -> None:
    """Auth placeholder for local mode: always yields ``None`` as the owner.

    Local mode has a single implicit owner (``owner_id = None``), so no
    credentials are inspected here.
    """
    return None
|
datadoom/api/errors.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
"""Consistent error envelope + exception handlers (08 §1, §12).
|
|
2
|
+
|
|
3
|
+
Every error response is ``{ "error": { code, message, locator? } }``. Spec
|
|
4
|
+
validation failures map to 422 with the offending field's ``locator``; other
|
|
5
|
+
engine errors to 400; anything unexpected to 500 (traceback logged, not leaked).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import logging
|
|
11
|
+
|
|
12
|
+
from fastapi import FastAPI, Request
|
|
13
|
+
from fastapi.exceptions import RequestValidationError
|
|
14
|
+
from fastapi.responses import JSONResponse
|
|
15
|
+
from starlette.exceptions import HTTPException as StarletteHTTPException
|
|
16
|
+
|
|
17
|
+
from datadoom.engine.errors import (
|
|
18
|
+
DataDoomError,
|
|
19
|
+
DistributionError,
|
|
20
|
+
SpecValidationError,
|
|
21
|
+
)
|
|
22
|
+
|
|
23
|
+
log = logging.getLogger("datadoom.api")
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _envelope(code: str, message: str, locator: str | None = None) -> dict:
    """Build the standard error body ``{"error": {code, message, locator?}}``.

    ``locator`` is only included when it is not ``None``.
    """
    optional = {} if locator is None else {"locator": locator}
    return {"error": {"code": code, "message": message, **optional}}
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Status-code -> default error code for bare HTTPExceptions.
|
|
34
|
+
_CODE_FOR_STATUS = {400: "bad_request", 404: "not_found", 409: "conflict", 422: "validation_error"}
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def http_error(status: int, code: str, message: str, locator: str | None = None):  # noqa: ANN201
    """Create (not raise) an ``HTTPException`` carrying the envelope fields.

    The exception's ``detail`` is a dict with ``code``, ``message`` and
    ``locator`` keys, which the HTTP exception handler turns into the
    standard error envelope.

    Args:
        status: HTTP status code of the response.
        code: machine-readable error code.
        message: human-readable description.
        locator: optional pointer to the offending field.

    Returns:
        The ``fastapi.HTTPException`` for the caller to raise.
    """
    from fastapi import HTTPException

    payload = {"code": code, "message": message, "locator": locator}
    return HTTPException(status_code=status, detail=payload)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def install_error_handlers(app: FastAPI) -> None:
    """Register the handlers that render every failure as the error envelope.

    Status mapping:
        * ``SpecValidationError``      -> 422 ``validation_error``
        * ``DistributionError``        -> 422 ``distribution_error``
        * any other ``DataDoomError``  -> 400 ``error``
        * HTTP exceptions              -> their own status; the code comes from
          the detail dict or from ``_CODE_FOR_STATUS``
        * request validation failures  -> 422 ``validation_error`` (first error only)
        * anything else                -> 500 ``internal_error`` (logged, not leaked)
    """

    def _reply(status: int, code: str, message: str, locator: str | None = None) -> JSONResponse:
        # Single place where the envelope is turned into a response.
        return JSONResponse(status_code=status, content=_envelope(code, message, locator))

    @app.exception_handler(SpecValidationError)
    async def _spec_invalid(_req: Request, exc: SpecValidationError):  # noqa: ANN202
        return _reply(422, "validation_error", str(exc), getattr(exc, "locator", None))

    @app.exception_handler(DistributionError)
    async def _dist_error(_req: Request, exc: DistributionError):  # noqa: ANN202
        return _reply(422, "distribution_error", str(exc), getattr(exc, "locator", None))

    @app.exception_handler(DataDoomError)
    async def _domain_error(_req: Request, exc: DataDoomError):  # noqa: ANN202
        return _reply(400, "error", str(exc), getattr(exc, "locator", None))

    @app.exception_handler(StarletteHTTPException)
    async def _http_exc(_req: Request, exc: StarletteHTTPException):  # noqa: ANN202
        detail = exc.detail
        # Details carrying a "code" key are already envelope-shaped; anything
        # else is a bare HTTPException.
        if isinstance(detail, dict) and "code" in detail:
            return _reply(
                exc.status_code,
                detail["code"],
                detail.get("message", ""),
                detail.get("locator"),
            )
        fallback_code = _CODE_FOR_STATUS.get(exc.status_code, "error")
        return _reply(exc.status_code, fallback_code, str(detail))

    @app.exception_handler(RequestValidationError)
    async def _req_invalid(_req: Request, exc: RequestValidationError):  # noqa: ANN202
        problems = exc.errors()
        first = problems[0] if problems else {}
        # Drop the leading "body" marker so the locator reads like a field path.
        segments = [str(part) for part in first.get("loc", []) if part != "body"]
        locator = ".".join(segments)
        return _reply(
            422, "validation_error", first.get("msg", "invalid request"), locator or None
        )

    @app.exception_handler(Exception)
    async def _unexpected(_req: Request, exc: Exception):  # noqa: ANN202
        log.exception("unhandled server error")
        return _reply(500, "internal_error", "internal server error")
|
datadoom/api/estimate.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
1
|
+
"""Resource estimator (doc 12) — heuristic runtime / RAM / output-size guess.
|
|
2
|
+
|
|
3
|
+
Local-first means **no cost, no GPU, no quotas** — we estimate only so the UI can
|
|
4
|
+
warn before a heavy run. Pure function of the spec + fixed calibration constants,
|
|
5
|
+
so it is deterministic and reproducible (doc 12 §9). Never blocks a run.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from dataclasses import dataclass
|
|
11
|
+
|
|
12
|
+
from datadoom.engine import Spec
|
|
13
|
+
|
|
14
|
+
# Per-cell byte estimates by type (doc 12 §3).
|
|
15
|
+
_BYTES_NUMERIC = 8
|
|
16
|
+
_BYTES_BOOL = 1
|
|
17
|
+
_BYTES_DATETIME = 19 # ISO-8601 string in CSV
|
|
18
|
+
_CSV_FORMAT_FACTOR = 1.2
|
|
19
|
+
|
|
20
|
+
# Calibrated throughput constants (rows/sec-ish) — reference-laptop defaults (doc 12 §6).
|
|
21
|
+
_KAPPA_SAMPLE = 4_000_000.0 # vectorized sampling cells/sec
|
|
22
|
+
_KAPPA_IO = 80_000_000.0 # bytes/sec write
|
|
23
|
+
_T_FIXED = 0.15 # process/setup overhead seconds
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
@dataclass
class Estimate:
    """Heuristic resource estimate for a single run of a spec."""

    # Predicted wall-clock duration of the run, in seconds.
    estimated_runtime_seconds: float
    # Predicted peak memory, in MiB.
    estimated_ram_mb: float
    # Predicted total size of all written artifacts, in bytes.
    estimated_size_bytes: int
    # Number of features declared by the spec.
    features: int
    # Number of causal edges declared by the spec (0 without a causal section).
    edges: int
    # Whether a GPU is needed; defaults to False.
    gpu_required: bool = False
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
def _avg_bytes(feat) -> float:  # noqa: ANN001 — duck-typed over the feature union
    """Estimate the average serialized size of one cell of ``feat``, in bytes.

    Args:
        feat: a feature object exposing ``type`` and, depending on the type,
            ``categories`` (categorical) or ``length`` (text).

    Returns:
        A per-type constant for numeric/boolean/datetime features, the mean
        label length plus one for categoricals, and ~5 bytes per token for
        text. Unknown types are sized like a numeric cell.
    """
    t = feat.type
    if t == "numeric":
        return _BYTES_NUMERIC
    if t == "boolean":
        return _BYTES_BOOL
    if t == "datetime":
        return _BYTES_DATETIME
    if t == "categorical":
        labels = feat.categories or [""]
        # Measure the textual form so non-string labels (e.g. integer codes)
        # are sized instead of raising TypeError; the estimator must never
        # block a run.
        return sum(len(str(c)) for c in labels) / len(labels) + 1
    if t == "text":
        # NOTE(review): assumes ``length`` is a mapping with optional
        # "min"/"max" token counts — confirm against the spec model.
        length = getattr(feat, "length", {}) or {}
        avg_tokens = (length.get("min", 5) + length.get("max", 30)) / 2
        return avg_tokens * 5  # ~5 bytes/token incl. spaces
    return _BYTES_NUMERIC
|
|
52
|
+
|
|
53
|
+
|
|
54
|
+
def estimate(spec: Spec) -> Estimate:
    """Estimate runtime, RAM and output size for generating ``spec``.

    Pure function of the spec and the module's calibration constants:

    * size    = rows x bytes-per-row x CSV factor x versions x formats
    * runtime = fixed overhead + sampling time (numeric cells) + write time
    * RAM     = rows x features x 8 bytes x (2 frames, +1 when an
      ``"injected"`` version is exported)

    Args:
        spec: the parsed dataset spec.

    Returns:
        The populated ``Estimate``; ``gpu_required`` is always ``False``.
    """
    rows = spec.rows
    feature_list = list(spec.features.values())
    feature_count = len(feature_list)
    edge_count = len(spec.causal.edges) if spec.causal else 0

    # Output size: every version/format is sized like the CSV artifact.
    row_bytes = sum(_avg_bytes(feature) for feature in feature_list)
    version_count = len(spec.export.versions) or 1
    format_count = len(spec.export.formats) or 1
    total_bytes = int(
        rows * row_bytes * _CSV_FORMAT_FACTOR * version_count * format_count
    )

    # Runtime: only numeric features count towards sampling (at least one).
    numeric_count = len([feature for feature in feature_list if feature.type == "numeric"])
    sampling_seconds = (rows * max(numeric_count, 1)) / _KAPPA_SAMPLE
    io_seconds = total_bytes / _KAPPA_IO
    runtime_seconds = round(_T_FIXED + sampling_seconds + io_seconds, 3)

    # One float64 working frame, with clean (+possible injected) copies (doc 12 §4).
    frame_copies = 3 if "injected" in spec.export.versions else 2
    ram_bytes = rows * max(feature_count, 1) * 8 * frame_copies
    ram_mb = round(ram_bytes / (1024 * 1024), 2)

    return Estimate(
        estimated_runtime_seconds=runtime_seconds,
        estimated_ram_mb=ram_mb,
        estimated_size_bytes=total_bytes,
        features=feature_count,
        edges=edge_count,
        gpu_required=False,
    )
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
"""API route modules (doc 08). Each exposes an ``APIRouter`` named ``router``."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from . import artifacts, datasets, meta, plugins, runs, specs, templates
|
|
6
|
+
|
|
7
|
+
__all__ = ["artifacts", "datasets", "meta", "plugins", "runs", "specs", "templates"]
|