datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. datadoom/__init__.py +23 -0
  2. datadoom/adapters/__init__.py +29 -0
  3. datadoom/adapters/frameworks.py +94 -0
  4. datadoom/adapters/loaders.py +72 -0
  5. datadoom/api/__init__.py +11 -0
  6. datadoom/api/app.py +109 -0
  7. datadoom/api/deps.py +30 -0
  8. datadoom/api/errors.py +89 -0
  9. datadoom/api/estimate.py +82 -0
  10. datadoom/api/routes/__init__.py +7 -0
  11. datadoom/api/routes/artifacts.py +147 -0
  12. datadoom/api/routes/datasets.py +180 -0
  13. datadoom/api/routes/meta.py +45 -0
  14. datadoom/api/routes/plugins.py +22 -0
  15. datadoom/api/routes/runs.py +144 -0
  16. datadoom/api/routes/specs.py +73 -0
  17. datadoom/api/routes/templates.py +30 -0
  18. datadoom/api/schemas.py +230 -0
  19. datadoom/api/serializers.py +143 -0
  20. datadoom/api/state.py +24 -0
  21. datadoom/api/store_helpers.py +56 -0
  22. datadoom/api/ws.py +72 -0
  23. datadoom/cli/__init__.py +1 -0
  24. datadoom/cli/main.py +313 -0
  25. datadoom/config.py +108 -0
  26. datadoom/engine/__init__.py +38 -0
  27. datadoom/engine/advice.py +289 -0
  28. datadoom/engine/audit.py +290 -0
  29. datadoom/engine/causal/__init__.py +15 -0
  30. datadoom/engine/causal/execute.py +116 -0
  31. datadoom/engine/causal/functions.py +116 -0
  32. datadoom/engine/causal/graph.py +54 -0
  33. datadoom/engine/difficulty/__init__.py +36 -0
  34. datadoom/engine/difficulty/calibrate.py +235 -0
  35. datadoom/engine/difficulty/knobs.py +171 -0
  36. datadoom/engine/difficulty/probes.py +181 -0
  37. datadoom/engine/dist/__init__.py +35 -0
  38. datadoom/engine/dist/base.py +46 -0
  39. datadoom/engine/dist/builtins.py +172 -0
  40. datadoom/engine/dist/compliance.py +344 -0
  41. datadoom/engine/dist/providers.py +117 -0
  42. datadoom/engine/errors.py +32 -0
  43. datadoom/engine/export/__init__.py +27 -0
  44. datadoom/engine/export/base.py +49 -0
  45. datadoom/engine/export/checksums.py +18 -0
  46. datadoom/engine/export/csv_exporter.py +34 -0
  47. datadoom/engine/export/json_exporter.py +67 -0
  48. datadoom/engine/export/metadata.py +58 -0
  49. datadoom/engine/export/parquet_exporter.py +45 -0
  50. datadoom/engine/failure/__init__.py +18 -0
  51. datadoom/engine/failure/apply.py +37 -0
  52. datadoom/engine/failure/base.py +116 -0
  53. datadoom/engine/failure/modes.py +442 -0
  54. datadoom/engine/pipeline.py +418 -0
  55. datadoom/engine/profile.py +327 -0
  56. datadoom/engine/progress.py +14 -0
  57. datadoom/engine/reference.py +338 -0
  58. datadoom/engine/reports.py +206 -0
  59. datadoom/engine/rng.py +79 -0
  60. datadoom/engine/spec/__init__.py +45 -0
  61. datadoom/engine/spec/hashing.py +57 -0
  62. datadoom/engine/spec/models.py +238 -0
  63. datadoom/engine/spec/validate.py +345 -0
  64. datadoom/engine/timeseries.py +88 -0
  65. datadoom/jobs/__init__.py +14 -0
  66. datadoom/jobs/progress.py +155 -0
  67. datadoom/jobs/worker.py +162 -0
  68. datadoom/plugin.py +35 -0
  69. datadoom/plugins/__init__.py +47 -0
  70. datadoom/plugins/contracts.py +72 -0
  71. datadoom/plugins/loader.py +125 -0
  72. datadoom/plugins/registry.py +214 -0
  73. datadoom/plugins/scaffold.py +434 -0
  74. datadoom/store/__init__.py +47 -0
  75. datadoom/store/artifacts.py +67 -0
  76. datadoom/store/db.py +104 -0
  77. datadoom/store/migrations/__init__.py +0 -0
  78. datadoom/store/migrations/env.py +53 -0
  79. datadoom/store/migrations/script.py.mako +24 -0
  80. datadoom/store/migrations/versions/0001_init.py +149 -0
  81. datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
  82. datadoom/store/migrations/versions/0003_run_name.py +23 -0
  83. datadoom/store/migrations/versions/0004_report_profile.py +24 -0
  84. datadoom/store/models.py +170 -0
  85. datadoom/store/repositories.py +279 -0
  86. datadoom/templates/__init__.py +239 -0
  87. datadoom/templates/ab_test.datadoom.yaml +46 -0
  88. datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
  89. datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
  90. datadoom/templates/customer_churn.datadoom.yaml +60 -0
  91. datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
  92. datadoom/templates/fraud_detection.datadoom.yaml +57 -0
  93. datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
  94. datadoom/templates/insurance_claims.datadoom.yaml +43 -0
  95. datadoom/templates/iot_sensors.datadoom.yaml +44 -0
  96. datadoom/templates/people_directory.datadoom.yaml +56 -0
  97. datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
  98. datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
  99. datadoom/version.py +3 -0
  100. datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
  101. datadoom/webdist/assets/index-doRjyG5s.css +1 -0
  102. datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
  103. datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
  104. datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
  105. datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
  106. datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
  107. datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
  108. datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
  109. datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
  110. datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
  111. datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
  112. datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
  113. datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
  114. datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
  115. datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
  116. datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
  117. datadoom/webdist/index.html +15 -0
  118. datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
  119. datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
  120. datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
  121. datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
  122. datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0
datadoom/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """DataDoom — local-first engine for reproducible synthetic data.
2
+
3
+ Public API:
4
+
5
+ >>> import datadoom
6
+ >>> spec = datadoom.load_spec("dataset.datadoom.yaml")
7
+ >>> result = datadoom.generate(spec, seed=42)
8
+ >>> result.frame.head()
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ from .engine import Spec, generate, load_spec, parse_spec, validate_spec
14
+ from .version import __version__
15
+
16
+ __all__ = [
17
+ "Spec",
18
+ "generate",
19
+ "load_spec",
20
+ "parse_spec",
21
+ "validate_spec",
22
+ "__version__",
23
+ ]
@@ -0,0 +1,29 @@
1
+ """Framework adapters — load a generated DataDoom run into ML frameworks (18.4).
2
+
3
+ A run directory (the ``--out`` of ``datadoom run``) holds ``data.csv`` (and any
4
+ ``data.injected.*`` / other formats). These helpers turn it into the in-memory
5
+ object each framework expects:
6
+
7
+ * :func:`load_dataframe` — a **pandas** ``DataFrame`` (no extra needed; pandas is
8
+ a core dep). Auto-detects csv / parquet / json and the clean/injected variant.
9
+ * :func:`to_torch_dataset` — a ``torch.utils.data.TensorDataset`` (extra: ``torch``).
10
+ * :func:`to_tf_dataset` — a ``tf.data.Dataset`` (extra: ``tf``).
11
+ * :func:`to_hf_dataset` — a HuggingFace ``datasets.Dataset`` (extra: ``hf``).
12
+
13
+ The framework loaders **lazy-import** their backend and raise a clear install
14
+ hint if it is missing, so the core install stays light. This package depends only
15
+ on the engine (for nothing heavyweight) + pandas; the engine never imports it.
16
+ """
17
+
18
+ from __future__ import annotations
19
+
20
+ from .frameworks import to_hf_dataset, to_tf_dataset, to_torch_dataset
21
+ from .loaders import load_dataframe, numeric_feature_columns
22
+
23
+ __all__ = [
24
+ "load_dataframe",
25
+ "numeric_feature_columns",
26
+ "to_torch_dataset",
27
+ "to_tf_dataset",
28
+ "to_hf_dataset",
29
+ ]
@@ -0,0 +1,94 @@
1
+ """Convert a pandas DataFrame into torch / tensorflow / HuggingFace datasets.
2
+
3
+ Each converter lazy-imports its backend so the core install stays light; a
4
+ missing backend raises an actionable install hint naming the right extra.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from importlib import import_module
10
+ from typing import TYPE_CHECKING, Any
11
+
12
+ import numpy as np
13
+ import pandas as pd
14
+
15
+ from .loaders import numeric_feature_columns
16
+
17
+ if TYPE_CHECKING: # pragma: no cover - typing only
18
+ pass
19
+
20
+
21
def _require(module: str, extra: str) -> Any:
    """Import ``module`` on demand and return the imported module object.

    Args:
        module: dotted name of the backend module to import.
        extra: name of the ``datadoom`` extra quoted in the install hint.

    Raises:
        ImportError: if ``module`` cannot be imported; the message tells the
            user which ``pip install 'datadoom[...]'`` extra to install, and
            the original error is chained as the cause.
    """
    try:
        backend = import_module(module)
    except ImportError as exc:  # pragma: no cover - exercised only without the extra
        hint = (
            f"{module!r} is required for this adapter. Install it with: "
            f"pip install 'datadoom[{extra}]'"
        )
        raise ImportError(hint) from exc
    return backend
29
+
30
+
31
def _feature_matrix(
    df: pd.DataFrame, feature_columns: list[str] | None, target: str | None
) -> tuple[list[str], np.ndarray]:
    """Select the feature columns of ``df`` and convert them to a float32 matrix.

    Args:
        df: source frame.
        feature_columns: explicit columns to use; when empty or ``None`` the
            columns are chosen by ``numeric_feature_columns``.
        target: column left out of the automatic selection, if given.

    Returns:
        ``(columns, matrix)`` where ``matrix`` is ``df[columns]`` as float32.

    Raises:
        ValueError: if no feature columns end up selected.
    """
    if feature_columns:
        selected = feature_columns
    else:
        excluded = [target] if target else None
        selected = numeric_feature_columns(df, exclude=excluded)

    if not selected:
        raise ValueError(
            "no numeric/boolean feature columns found; pass feature_columns explicitly "
            "or encode categorical/text columns first"
        )

    return selected, df[selected].to_numpy(dtype="float32")
42
+
43
+
44
def to_torch_dataset(
    df: pd.DataFrame,
    *,
    target: str | None = None,
    feature_columns: list[str] | None = None,
) -> Any:
    """Wrap ``df`` in a ``torch.utils.data.TensorDataset`` (extra: ``torch``).

    The first tensor holds the float32 feature matrix built from
    ``feature_columns`` (or the automatically selected columns). When
    ``target`` is given, that column is appended as a second tensor; it is
    cast to float32 unless its NumPy dtype is ``object``, in which case the
    array is handed to torch unchanged.
    """
    torch = _require("torch", "torch")
    _, features = _feature_matrix(df, feature_columns, target)

    tensors = [torch.as_tensor(features)]
    if target is not None:
        labels = df[target].to_numpy()
        if labels.dtype != object:
            labels = labels.astype("float32")
        tensors.append(torch.as_tensor(labels))

    return torch.utils.data.TensorDataset(*tensors)
63
+
64
+
65
def to_tf_dataset(
    df: pd.DataFrame,
    *,
    target: str | None = None,
    feature_columns: list[str] | None = None,
    batch_size: int | None = None,
) -> Any:
    """Wrap ``df`` in a ``tf.data.Dataset`` (extra: ``tf``).

    Elements are feature rows, or ``(features, label)`` pairs when ``target``
    is set. The dataset is batched only when ``batch_size`` is truthy.
    """
    tf = _require("tensorflow", "tf")
    _, features = _feature_matrix(df, feature_columns, target)

    slices = features if target is None else (features, df[target].to_numpy())
    dataset = tf.data.Dataset.from_tensor_slices(slices)

    return dataset.batch(batch_size) if batch_size else dataset
86
+
87
+
88
def to_hf_dataset(df: pd.DataFrame) -> Any:
    """Convert ``df`` to a HuggingFace ``datasets.Dataset`` (extra: ``hf``).

    All columns are passed through as-is; the pandas index is not preserved.
    """
    hf_datasets = _require("datasets", "hf")
    dataset_cls = hf_datasets.Dataset
    return dataset_cls.from_pandas(df, preserve_index=False)
@@ -0,0 +1,72 @@
1
+ """Read generated run artifacts into a pandas DataFrame."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import pandas as pd
8
+
9
+ # Preferred read order: CSV is the canonical artifact, then parquet, then json.
10
+ _READERS: list[tuple[str, str]] = [("csv", "csv"), ("parquet", "parquet"), ("json", "json")]
11
+
12
+
13
def _read(path: Path, kind: str) -> pd.DataFrame:
    """Read ``path`` with the pandas reader that matches ``kind``.

    ``"csv"`` and ``"parquet"`` use their dedicated readers; every other
    ``kind`` is read as record-oriented JSON.
    """
    if kind == "parquet":
        return pd.read_parquet(path)
    if kind != "csv":
        # Fallback reader: anything that is neither csv nor parquet.
        return pd.read_json(path, orient="records")
    return pd.read_csv(path)
19
+
20
+
21
def load_dataframe(
    run_dir: str | Path,
    *,
    version: str = "clean",
    split: str | None = None,
) -> pd.DataFrame:
    """Load a generated dataset variant into a pandas ``DataFrame``.

    Args:
        run_dir: the run output directory (``datadoom run --out <dir>``).
        version: ``"clean"`` (default) or ``"injected"`` (the corrupted variant).
        split: optional split name (e.g. ``"train"``) if split files were written.

    Returns:
        The first artifact found, in ``_READERS`` order, read into a ``DataFrame``.

    Raises:
        ValueError: if ``version`` is neither ``"clean"`` nor ``"injected"``.
        FileNotFoundError: if no matching data artifact exists in ``run_dir``.
    """
    # Validate up front: previously any unrecognised value (a typo, wrong case)
    # silently fell through to the *injected* (corrupted) variant.
    stems = {"clean": "data", "injected": "data.injected"}
    if version not in stems:
        raise ValueError(
            f"unknown version {version!r}; expected 'clean' or 'injected'"
        )

    base = Path(run_dir)
    stem = stems[version]
    if split:
        stem = f"{stem}.{split}"

    for ext, kind in _READERS:
        candidate = base / f"{stem}.{ext}"
        if candidate.exists():
            return _read(candidate, kind)

    tried = ", ".join(f"{stem}.{ext}" for ext, _ in _READERS)
    raise FileNotFoundError(
        f"no data artifact for version={version!r}"
        + (f", split={split!r}" if split else "")
        + f" in {base} (looked for: {tried})"
    )
56
+
57
+
58
def numeric_feature_columns(df: pd.DataFrame, *, exclude: list[str] | None = None) -> list[str]:
    """List the columns of ``df`` whose dtype is numeric or boolean.

    Columns are returned in frame order. Names listed in ``exclude`` (e.g. the
    target column) are left out; other dtypes are skipped, so encode those
    columns yourself if you need them as features.
    """
    skipped = set(exclude) if exclude else set()
    is_numeric = pd.api.types.is_numeric_dtype
    is_bool = pd.api.types.is_bool_dtype
    return [
        name
        for name in df.columns
        if name not in skipped and (is_numeric(df[name]) or is_bool(df[name]))
    ]
@@ -0,0 +1,11 @@
1
+ """DataDoom HTTP API (FastAPI). Thin layer over ``jobs`` + ``store`` + ``engine``.
2
+
3
+ ``create_app()`` is the entry point; ``datadoom serve`` (CLI) runs it under
4
+ uvicorn.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from .app import create_app
10
+
11
+ __all__ = ["create_app"]
datadoom/api/app.py ADDED
@@ -0,0 +1,109 @@
1
+ """FastAPI application factory (08, 17 step 9).
2
+
3
+ Assembles config -> store -> jobs -> api into one app: opens the DB (running
4
+ Alembic to head), wires the worker + event hub, mounts the REST routes and the
5
+ WebSocket/SSE transport, installs the error envelope, and serves the bundled SPA
6
+ from ``webdist/`` (so ``datadoom serve`` is a complete app with no Node needed).
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import asyncio
12
+ from contextlib import asynccontextmanager
13
+ from pathlib import Path
14
+
15
+ from fastapi import FastAPI
16
+ from fastapi.responses import FileResponse, JSONResponse
17
+ from fastapi.staticfiles import StaticFiles
18
+
19
+ from datadoom.config import Config, load_config
20
+ from datadoom.jobs import EventHub, WorkerPool
21
+ from datadoom.plugins import load_plugins
22
+ from datadoom.store import LocalArtifactStore, init_database
23
+ from datadoom.version import __version__
24
+
25
+ from . import ws
26
+ from .errors import install_error_handlers
27
+ from .routes import artifacts, datasets, meta, plugins, runs, specs, templates
28
+ from .state import AppState
29
+
30
+ WEBDIST = Path(__file__).resolve().parent.parent / "webdist"
31
+
32
+
33
def create_app(config: Config | None = None) -> FastAPI:
    """Build the FastAPI application and wire its shared state.

    Args:
        config: configuration to use; when omitted, ``load_config()`` is called.

    Returns:
        The app with the shared ``AppState`` stored on ``app.state.dd``, the
        error handlers installed, all route modules and the ``ws`` router
        included, and the SPA (or its JSON placeholder) mounted.
    """
    config = config or load_config()
    config.ensure_dirs()

    # Discover plugins (entry points + local dir) into the engine's lookup tables;
    # conflicts fail loudly here rather than silently shadowing a capability (09 §3).
    load_plugins(local_dir=config.home / "plugins")

    db = init_database(config.db_url)
    artifact_store = LocalArtifactStore(config.artifacts_dir)
    hub = EventHub()
    worker = WorkerPool(db, artifact_store, hub, __version__)
    state = AppState(
        config=config, db=db, artifacts=artifact_store, hub=hub, worker=worker
    )

    @asynccontextmanager
    async def lifespan(app: FastAPI):  # noqa: ANN202
        # Bind the running loop so worker threads can marshal WS events onto it.
        hub.bind_loop(asyncio.get_running_loop())
        try:
            yield
        finally:
            # Always release resources, even if the lifespan is torn down by an
            # exception, and dispose the DB even if the worker shutdown fails.
            try:
                worker.shutdown()
            finally:
                db.dispose()

    app = FastAPI(
        title="DataDoom",
        version=__version__,
        description="Local-first engine for controllable, reproducible synthetic data.",
        openapi_url="/api/openapi.json",
        docs_url="/api/docs",
        redoc_url=None,
        lifespan=lifespan,
    )
    app.state.dd = state

    install_error_handlers(app)

    for module in (meta, specs, datasets, runs, artifacts, templates, plugins):
        app.include_router(module.router)
    app.include_router(ws.router)

    # Mounted last: the SPA registers a catch-all route, so it must come after
    # the API and WebSocket routers.
    _mount_spa(app)
    return app
76
+
77
+
78
def _mount_spa(app: FastAPI) -> None:
    """Serve the built SPA at ``/`` with client-side-routing fallback.

    If ``webdist/index.html`` is absent (dev before the frontend is built), ``/``
    returns a friendly JSON pointer instead of 404 so the API is still usable.

    Otherwise ``/assets`` is mounted as static files (when that directory
    exists) and a catch-all route serves real files found under ``webdist/``,
    falling back to ``index.html`` for everything else. The requested path is
    client-controlled, so it is resolved and must stay inside ``webdist/``
    before any file is served.
    """
    index = WEBDIST / "index.html"
    if not index.exists():

        @app.get("/", include_in_schema=False)
        async def _no_spa() -> JSONResponse:  # noqa: ANN202
            return JSONResponse(
                {
                    "status": "ok",
                    "message": "DataDoom API is running. The web UI is not built; "
                    "run `cd frontend && npm install && npm run build`.",
                    "docs": "/api/docs",
                }
            )

        return

    assets = WEBDIST / "assets"
    if assets.exists():
        app.mount("/assets", StaticFiles(directory=assets), name="assets")

    # Resolve once so the containment check below compares canonical paths.
    root = WEBDIST.resolve()

    @app.get("/{full_path:path}", include_in_schema=False)
    async def _spa(full_path: str) -> FileResponse:  # noqa: ANN202
        # Serve real files when they exist; otherwise the SPA entry (client routing).
        if full_path:
            try:
                candidate = (root / full_path).resolve()
            except (OSError, ValueError):
                # Unresolvable path (e.g. embedded NUL): treat as a client route.
                candidate = None
            # Reject ``..`` segments and absolute paths: joining an absolute
            # ``full_path`` would otherwise discard ``root`` entirely.
            if (
                candidate is not None
                and root in candidate.parents
                and candidate.is_file()
            ):
                return FileResponse(candidate)
        return FileResponse(index)
datadoom/api/deps.py ADDED
@@ -0,0 +1,30 @@
1
+ """FastAPI dependencies: app state, DB sessions, and (no-op local) auth.
2
+
3
+ Auth is a no-op dependency in local mode (08 §1); team mode swaps in a real
4
+ bearer-token dependency without changing any route signature.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from collections.abc import Iterator
10
+
11
+ from fastapi import Request
12
+ from sqlalchemy.orm import Session
13
+
14
+ from .state import AppState
15
+
16
+
17
def get_state(request: Request) -> AppState:
    """Return the shared application state stored on ``app.state.dd``."""
    app = request.app
    return app.state.dd
19
+
20
+
21
def get_session(request: Request) -> Iterator[Session]:
    """Yield one session from ``state.db.session()`` for the request.

    The session is obtained by entering the context manager returned by
    ``db.session()``, which is exited when the dependency is finalised
    (commit/rollback is presumably handled there — see the store layer).
    """
    dd: AppState = request.app.state.dd
    session_scope = dd.db.session()
    with session_scope as db_session:
        yield db_session
26
+
27
+
28
def current_owner() -> None:
    """Auth placeholder for local mode: always resolves to ``None``.

    Local mode has a single implicit owner, represented as ``owner_id = None``.
    """
    owner_id = None  # the single implicit local owner
    return owner_id
datadoom/api/errors.py ADDED
@@ -0,0 +1,89 @@
1
+ """Consistent error envelope + exception handlers (08 §1, §12).
2
+
3
+ Every error response is ``{ "error": { code, message, locator? } }``. Spec
4
+ validation failures map to 422 with the offending field's ``locator``; other
5
+ engine errors to 400; anything unexpected to 500 (traceback logged, not leaked).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import logging
11
+
12
+ from fastapi import FastAPI, Request
13
+ from fastapi.exceptions import RequestValidationError
14
+ from fastapi.responses import JSONResponse
15
+ from starlette.exceptions import HTTPException as StarletteHTTPException
16
+
17
+ from datadoom.engine.errors import (
18
+ DataDoomError,
19
+ DistributionError,
20
+ SpecValidationError,
21
+ )
22
+
23
+ log = logging.getLogger("datadoom.api")
24
+
25
+
26
def _envelope(code: str, message: str, locator: str | None = None) -> dict:
    """Build the ``{"error": {...}}`` response body.

    The inner object always carries ``code`` and ``message``; ``locator`` is
    included only when it is not ``None``.
    """
    payload: dict = {"code": code, "message": message}
    if locator is None:
        return {"error": payload}
    return {"error": {**payload, "locator": locator}}
31
+
32
+
33
+ # Status-code -> default error code for bare HTTPExceptions.
34
+ _CODE_FOR_STATUS = {400: "bad_request", 404: "not_found", 409: "conflict", 422: "validation_error"}
35
+
36
+
37
def http_error(status: int, code: str, message: str, locator: str | None = None):  # noqa: ANN201
    """Return (not raise) an ``HTTPException`` whose detail holds the envelope fields.

    The detail dict always contains ``code``, ``message`` and ``locator``
    (``locator`` may be ``None``). ``HTTPException`` is imported inside the
    function.
    """
    from fastapi import HTTPException

    detail = {"code": code, "message": message, "locator": locator}
    return HTTPException(status_code=status, detail=detail)
42
+
43
+
44
def install_error_handlers(app: FastAPI) -> None:
    """Register the exception handlers that produce the error envelope on ``app``.

    Mapping (status code -> envelope ``code``):

    * ``SpecValidationError`` -> 422 ``validation_error``
    * ``DistributionError`` -> 422 ``distribution_error``
    * ``DataDoomError`` -> 400 ``error``
    * ``HTTPException`` -> the exception's own status; the code comes from an
      envelope-shaped ``detail`` or from ``_CODE_FOR_STATUS``
    * ``RequestValidationError`` -> 422 ``validation_error`` (first error only)
    * any other ``Exception`` -> 500 ``internal_error`` (logged, not leaked)
    """

    @app.exception_handler(SpecValidationError)
    async def _spec_invalid(_req: Request, exc: SpecValidationError):  # noqa: ANN202
        return JSONResponse(
            status_code=422,
            content=_envelope("validation_error", str(exc), getattr(exc, "locator", None)),
        )

    @app.exception_handler(DistributionError)
    async def _dist_error(_req: Request, exc: DistributionError):  # noqa: ANN202
        return JSONResponse(
            status_code=422,
            content=_envelope("distribution_error", str(exc), getattr(exc, "locator", None)),
        )

    @app.exception_handler(DataDoomError)
    async def _domain_error(_req: Request, exc: DataDoomError):  # noqa: ANN202
        return JSONResponse(
            status_code=400, content=_envelope("error", str(exc), getattr(exc, "locator", None))
        )

    @app.exception_handler(StarletteHTTPException)
    async def _http_exc(_req: Request, exc: StarletteHTTPException):  # noqa: ANN202
        detail = exc.detail
        if isinstance(detail, dict) and "code" in detail:
            # Raised via ``http_error``: detail already carries the envelope fields.
            content = _envelope(detail["code"], detail.get("message", ""), detail.get("locator"))
        else:
            code = _CODE_FOR_STATUS.get(exc.status_code, "error")
            content = _envelope(code, str(detail))
        # Forward any headers attached to the exception (e.g. ``Allow``,
        # ``WWW-Authenticate``), as the framework's default handler does.
        return JSONResponse(
            status_code=exc.status_code,
            content=content,
            headers=getattr(exc, "headers", None),
        )

    @app.exception_handler(RequestValidationError)
    async def _req_invalid(_req: Request, exc: RequestValidationError):  # noqa: ANN202
        errors = exc.errors()
        first = errors[0] if errors else {}
        # Drop the leading "body" segment so the locator names the field itself.
        locator = ".".join(str(p) for p in first.get("loc", []) if p != "body")
        return JSONResponse(
            status_code=422,
            content=_envelope("validation_error", first.get("msg", "invalid request"), locator or None),
        )

    @app.exception_handler(Exception)
    async def _unexpected(_req: Request, exc: Exception):  # noqa: ANN202
        # Called while the exception is being handled, so the traceback is logged.
        log.exception("unhandled server error")
        return JSONResponse(
            status_code=500, content=_envelope("internal_error", "internal server error")
        )
@@ -0,0 +1,82 @@
1
+ """Resource estimator (doc 12) — heuristic runtime / RAM / output-size guess.
2
+
3
+ Local-first means **no cost, no GPU, no quotas** — we estimate only so the UI can
4
+ warn before a heavy run. Pure function of the spec + fixed calibration constants,
5
+ so it is deterministic and reproducible (doc 12 §9). Never blocks a run.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from dataclasses import dataclass
11
+
12
+ from datadoom.engine import Spec
13
+
14
+ # Per-cell byte estimates by type (doc 12 §3).
15
+ _BYTES_NUMERIC = 8
16
+ _BYTES_BOOL = 1
17
+ _BYTES_DATETIME = 19 # ISO-8601 string in CSV
18
+ _CSV_FORMAT_FACTOR = 1.2
19
+
20
+ # Calibrated throughput constants (cells/sec, bytes/sec) and fixed overhead — reference-laptop defaults (doc 12 §6).
21
+ _KAPPA_SAMPLE = 4_000_000.0 # vectorized sampling cells/sec
22
+ _KAPPA_IO = 80_000_000.0 # bytes/sec write
23
+ _T_FIXED = 0.15 # process/setup overhead seconds
24
+
25
+
26
@dataclass
class Estimate:
    """Heuristic resource estimate for a single spec."""

    # Predicted wall-clock duration of the run, in seconds.
    estimated_runtime_seconds: float
    # Predicted working memory, in mebibytes.
    estimated_ram_mb: float
    # Predicted total size of the written artifacts, in bytes.
    estimated_size_bytes: int
    # Number of features declared in the spec.
    features: int
    # Number of causal edges declared in the spec (0 when there is no graph).
    edges: int
    # Defaults to False.
    gpu_required: bool = False
34
+
35
+
36
def _avg_bytes(feat) -> float:  # noqa: ANN001 — duck-typed over the feature union
    """Estimate the average serialized size, in bytes, of one cell of ``feat``.

    Text features use the midpoint of ``length["min"]``/``length["max"]``
    (defaults 5 and 30) times 5 bytes per token; categorical features use the
    mean label length plus one; boolean and datetime use fixed sizes; numeric
    and any unrecognised type use the numeric size.
    """
    kind = feat.type

    if kind == "text":
        bounds = getattr(feat, "length", {}) or {}
        low = bounds.get("min", 5)
        high = bounds.get("max", 30)
        return (low + high) / 2 * 5  # ~5 bytes/token incl. spaces

    if kind == "categorical":
        # An empty category list is treated as a single empty label.
        names = feat.categories or [""]
        total_chars = sum(len(name) for name in names)
        return total_chars / len(names) + 1

    if kind == "datetime":
        return _BYTES_DATETIME
    if kind == "boolean":
        return _BYTES_BOOL
    # "numeric" and every unknown type share the numeric estimate.
    return _BYTES_NUMERIC
52
+
53
+
54
def estimate(spec: Spec) -> Estimate:
    """Compute the heuristic runtime / RAM / output-size estimate for ``spec``.

    Output size is rows x per-row bytes x the CSV factor, multiplied by the
    number of export versions and formats (each counted as at least 1).
    Runtime is the fixed overhead plus sampling time (rows x numeric features,
    at least 1) plus write time. RAM assumes 8-byte cells and two frame
    copies, or three when the ``"injected"`` version is exported.
    """
    rows = spec.rows
    features = list(spec.features.values())
    feature_count = len(features)
    edge_count = 0 if not spec.causal else len(spec.causal.edges)

    # --- output size -------------------------------------------------------
    export = spec.export
    row_bytes = sum(_avg_bytes(feature) for feature in features)
    version_count = len(export.versions) or 1
    format_count = len(export.formats) or 1
    clean_bytes = rows * row_bytes * _CSV_FORMAT_FACTOR
    total_bytes = int(clean_bytes * version_count * format_count)

    # --- runtime -----------------------------------------------------------
    numeric_count = len([feature for feature in features if feature.type == "numeric"])
    sampling_seconds = (rows * max(numeric_count, 1)) / _KAPPA_SAMPLE
    io_seconds = total_bytes / _KAPPA_IO
    runtime_seconds = round(_T_FIXED + sampling_seconds + io_seconds, 3)

    # --- memory ------------------------------------------------------------
    # One float64 working frame, with clean (+possible injected) copies (doc 12 §4).
    copies = 3 if "injected" in export.versions else 2
    ram_mb = round((rows * max(feature_count, 1) * 8 * copies) / (1024 * 1024), 2)

    return Estimate(
        estimated_runtime_seconds=runtime_seconds,
        estimated_ram_mb=ram_mb,
        estimated_size_bytes=total_bytes,
        features=feature_count,
        edges=edge_count,
        gpu_required=False,
    )
@@ -0,0 +1,7 @@
1
+ """API route modules (doc 08). Each exposes an ``APIRouter`` named ``router``."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from . import artifacts, datasets, meta, plugins, runs, specs, templates
6
+
7
+ __all__ = ["artifacts", "datasets", "meta", "plugins", "runs", "specs", "templates"]