PyPI - datadoom - Versions diffs - 0.1.0.dev0__py3-none-any.whl - Mend

datadoom 0.1.0.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (122) hide show

datadoom/__init__.py +23 -0
datadoom/adapters/__init__.py +29 -0
datadoom/adapters/frameworks.py +94 -0
datadoom/adapters/loaders.py +72 -0
datadoom/api/__init__.py +11 -0
datadoom/api/app.py +109 -0
datadoom/api/deps.py +30 -0
datadoom/api/errors.py +89 -0
datadoom/api/estimate.py +82 -0
datadoom/api/routes/__init__.py +7 -0
datadoom/api/routes/artifacts.py +147 -0
datadoom/api/routes/datasets.py +180 -0
datadoom/api/routes/meta.py +45 -0
datadoom/api/routes/plugins.py +22 -0
datadoom/api/routes/runs.py +144 -0
datadoom/api/routes/specs.py +73 -0
datadoom/api/routes/templates.py +30 -0
datadoom/api/schemas.py +230 -0
datadoom/api/serializers.py +143 -0
datadoom/api/state.py +24 -0
datadoom/api/store_helpers.py +56 -0
datadoom/api/ws.py +72 -0
datadoom/cli/__init__.py +1 -0
datadoom/cli/main.py +313 -0
datadoom/config.py +108 -0
datadoom/engine/__init__.py +38 -0
datadoom/engine/advice.py +289 -0
datadoom/engine/audit.py +290 -0
datadoom/engine/causal/__init__.py +15 -0
datadoom/engine/causal/execute.py +116 -0
datadoom/engine/causal/functions.py +116 -0
datadoom/engine/causal/graph.py +54 -0
datadoom/engine/difficulty/__init__.py +36 -0
datadoom/engine/difficulty/calibrate.py +235 -0
datadoom/engine/difficulty/knobs.py +171 -0
datadoom/engine/difficulty/probes.py +181 -0
datadoom/engine/dist/__init__.py +35 -0
datadoom/engine/dist/base.py +46 -0
datadoom/engine/dist/builtins.py +172 -0
datadoom/engine/dist/compliance.py +344 -0
datadoom/engine/dist/providers.py +117 -0
datadoom/engine/errors.py +32 -0
datadoom/engine/export/__init__.py +27 -0
datadoom/engine/export/base.py +49 -0
datadoom/engine/export/checksums.py +18 -0
datadoom/engine/export/csv_exporter.py +34 -0
datadoom/engine/export/json_exporter.py +67 -0
datadoom/engine/export/metadata.py +58 -0
datadoom/engine/export/parquet_exporter.py +45 -0
datadoom/engine/failure/__init__.py +18 -0
datadoom/engine/failure/apply.py +37 -0
datadoom/engine/failure/base.py +116 -0
datadoom/engine/failure/modes.py +442 -0
datadoom/engine/pipeline.py +418 -0
datadoom/engine/profile.py +327 -0
datadoom/engine/progress.py +14 -0
datadoom/engine/reference.py +338 -0
datadoom/engine/reports.py +206 -0
datadoom/engine/rng.py +79 -0
datadoom/engine/spec/__init__.py +45 -0
datadoom/engine/spec/hashing.py +57 -0
datadoom/engine/spec/models.py +238 -0
datadoom/engine/spec/validate.py +345 -0
datadoom/engine/timeseries.py +88 -0
datadoom/jobs/__init__.py +14 -0
datadoom/jobs/progress.py +155 -0
datadoom/jobs/worker.py +162 -0
datadoom/plugin.py +35 -0
datadoom/plugins/__init__.py +47 -0
datadoom/plugins/contracts.py +72 -0
datadoom/plugins/loader.py +125 -0
datadoom/plugins/registry.py +214 -0
datadoom/plugins/scaffold.py +434 -0
datadoom/store/__init__.py +47 -0
datadoom/store/artifacts.py +67 -0
datadoom/store/db.py +104 -0
datadoom/store/migrations/__init__.py +0 -0
datadoom/store/migrations/env.py +53 -0
datadoom/store/migrations/script.py.mako +24 -0
datadoom/store/migrations/versions/0001_init.py +149 -0
datadoom/store/migrations/versions/0002_report_mutual_information.py +23 -0
datadoom/store/migrations/versions/0003_run_name.py +23 -0
datadoom/store/migrations/versions/0004_report_profile.py +24 -0
datadoom/store/models.py +170 -0
datadoom/store/repositories.py +279 -0
datadoom/templates/__init__.py +239 -0
datadoom/templates/ab_test.datadoom.yaml +46 -0
datadoom/templates/clinical_deterioration.datadoom.yaml +124 -0
datadoom/templates/credit_default_challenge.datadoom.yaml +147 -0
datadoom/templates/customer_churn.datadoom.yaml +60 -0
datadoom/templates/ecommerce_orders.datadoom.yaml +46 -0
datadoom/templates/fraud_detection.datadoom.yaml +57 -0
datadoom/templates/hospital_readmission.datadoom.yaml +61 -0
datadoom/templates/insurance_claims.datadoom.yaml +43 -0
datadoom/templates/iot_sensors.datadoom.yaml +44 -0
datadoom/templates/people_directory.datadoom.yaml +56 -0
datadoom/templates/predictive_maintenance.datadoom.yaml +107 -0
datadoom/templates/telecom_churn_challenge.datadoom.yaml +125 -0
datadoom/version.py +3 -0
datadoom/webdist/assets/index-V8VAuTJG.js +445 -0
datadoom/webdist/assets/index-doRjyG5s.css +1 -0
datadoom/webdist/assets/inter-cyrillic-ext-wght-normal-BOeWTOD4.woff2 +0 -0
datadoom/webdist/assets/inter-cyrillic-wght-normal-DqGufNeO.woff2 +0 -0
datadoom/webdist/assets/inter-greek-ext-wght-normal-DlzME5K_.woff2 +0 -0
datadoom/webdist/assets/inter-greek-wght-normal-CkhJZR-_.woff2 +0 -0
datadoom/webdist/assets/inter-latin-ext-wght-normal-DO1Apj_S.woff2 +0 -0
datadoom/webdist/assets/inter-latin-wght-normal-Dx4kXJAl.woff2 +0 -0
datadoom/webdist/assets/inter-vietnamese-wght-normal-CBcvBZtf.woff2 +0 -0
datadoom/webdist/assets/jetbrains-mono-cyrillic-wght-normal-D73BlboJ.woff2 +0 -0
datadoom/webdist/assets/jetbrains-mono-greek-wght-normal-Bw9x6K1M.woff2 +0 -0
datadoom/webdist/assets/jetbrains-mono-latin-ext-wght-normal-DBQx-q_a.woff2 +0 -0
datadoom/webdist/assets/jetbrains-mono-latin-wght-normal-B9CIFXIH.woff2 +0 -0
datadoom/webdist/assets/jetbrains-mono-vietnamese-wght-normal-Bt-aOZkq.woff2 +0 -0
datadoom/webdist/assets/space-grotesk-latin-ext-wght-normal-D9tNdqV9.woff2 +0 -0
datadoom/webdist/assets/space-grotesk-latin-wght-normal-BhU9QXUp.woff2 +0 -0
datadoom/webdist/assets/space-grotesk-vietnamese-wght-normal-D0rl6rjA.woff2 +0 -0
datadoom/webdist/index.html +15 -0
datadoom-0.1.0.dev0.dist-info/METADATA +143 -0
datadoom-0.1.0.dev0.dist-info/RECORD +122 -0
datadoom-0.1.0.dev0.dist-info/WHEEL +4 -0
datadoom-0.1.0.dev0.dist-info/entry_points.txt +2 -0
datadoom-0.1.0.dev0.dist-info/licenses/LICENSE +202 -0

datadoom/api/schemas.py ADDED Viewed

@@ -0,0 +1,230 @@
+"""Request/response models — the typed surface FastAPI turns into OpenAPI.
+The frontend generates its API client from ``/api/openapi.json``, so these
+shapes ARE the contract (doc 08). Spec bodies travel as open ``dict`` payloads
+(the authoritative validation lives in ``engine.spec``); these models describe
+the persistence/metadata envelope around them.
+"""
+from __future__ import annotations
+from typing import Any
+from pydantic import BaseModel, Field
+# Alias for a spec body travelling as an open JSON-object payload; its keys are
+# not modelled here (the module docstring places validation in ``engine.spec``).
+SpecBody = dict[str, Any]
+# --- errors -------------------------------------------------------------------
+class ErrorDetail(BaseModel):
+    """One API error: a code, a message, and an optional locator."""
+    code: str
+    message: str
+    # Optional, defaults to None. Presumably identifies where in the request or
+    # spec the problem lies — TODO confirm against the code that builds errors.
+    locator: str | None = None
+class ErrorResponse(BaseModel):
+    """Error envelope: a single :class:`ErrorDetail` nested under ``error``."""
+    error: ErrorDetail
+# --- specs (stateless helpers) ------------------------------------------------
+class ValidateResponse(BaseModel):
+    """Response shape for spec validation: validity flag, spec hash, warnings."""
+    # Defaults to True when the caller does not set it.
+    valid: bool = True
+    spec_hash: str
+    # ``default_factory`` gives every instance its own empty list.
+    warnings: list[str] = Field(default_factory=list)
+class ParseTextRequest(BaseModel):
+    """Raw spec text (YAML or JSON) for the web 'New from YAML' import flow."""
+    # The unparsed document; required, no default.
+    text: str
+class ParseResponse(BaseModel):
+    """Response shape for spec parsing: validity flag, spec hash, parsed body."""
+    # Defaults to True when the caller does not set it.
+    valid: bool = True
+    spec_hash: str
+    spec: SpecBody  # the parsed, validated spec body (JSON form)
+class HashResponse(BaseModel):
+    """Response carrying only a spec hash."""
+    spec_hash: str
+class EstimateResponse(BaseModel):
+    """Resource estimate; units follow the field-name suffixes (seconds, MB, bytes)."""
+    estimated_runtime_seconds: float
+    estimated_ram_mb: float
+    estimated_size_bytes: int
+    # Presumably the number of features and of causal-graph edges in the spec —
+    # TODO confirm against the estimator that fills these in.
+    features: int
+    edges: int
+    # Defaults to False when not set.
+    gpu_required: bool = False
+# --- specs of a dataset -------------------------------------------------------
+class SpecSummary(BaseModel):
+    """Metadata of one stored spec, without the spec body itself."""
+    spec_id: str
+    spec_hash: str
+    version: int
+    datadoom_version: str
+    # Timestamp carried as a string; the format is not fixed by this model
+    # (presumably ISO 8601 — TODO confirm against the store layer).
+    created_at: str
+class SpecDetail(SpecSummary):
+    """:class:`SpecSummary` fields plus the full spec body."""
+    body: SpecBody
+class SaveSpecResponse(BaseModel):
+    """Identifiers of a saved spec: its id, hash and version number."""
+    spec_id: str
+    spec_hash: str
+    version: int
+# --- runs ---------------------------------------------------------------------
+class RunSummary(BaseModel):
+    """Identity, progress and outcome fields of one generation run.
+
+    Only the ids, ``seed``, ``status`` and ``created_at`` are required; every
+    other field has a default.
+    """
+    run_id: str
+    dataset_id: str
+    spec_id: str
+    spec_hash: str | None = None
+    name: str | None = None
+    seed: int
+    status: str
+    stage: str | None = None
+    # Integer percentage; defaults to 0.
+    progress_pct: int = 0
+    compliance_score: float | None = None
+    # Free-form dicts; their keys are not modelled here.
+    error: dict[str, Any] | None = None
+    metrics: dict[str, Any] | None = None
+    # Timestamps carried as strings (format not fixed by this model).
+    started_at: str | None = None
+    finished_at: str | None = None
+    created_at: str
+class CreateRunRequest(BaseModel):
+    """Request body for creating a run; both fields are optional."""
+    # None presumably lets the server choose the seed — TODO confirm in the
+    # runs route (``CreateRunResponse.seed`` is always an int).
+    seed: int | None = None
+    name: str | None = None
+class UpdateRunRequest(BaseModel):
+    """Request body for updating a run; carries only the required ``name``."""
+    name: str
+class CreateRunResponse(BaseModel):
+    """Response returned after a run is created."""
+    run_id: str
+    status: str
+    seed: int
+    # Presumably the WebSocket URL/path for this run's live progress — TODO
+    # confirm against the runs route that builds this value.
+    ws: str
+class CancelResponse(BaseModel):
+    """Response to a cancel request; carries a single ``status`` string."""
+    status: str
+# --- datasets -----------------------------------------------------------------
+class LatestRun(BaseModel):
+    """Compact run reference embedded in :class:`Dataset` as ``latest_run``."""
+    run_id: str
+    status: str
+    compliance_score: float | None = None
+class DatasetSummary(BaseModel):
+    """List-view shape of a dataset (the item type of :class:`DatasetList`)."""
+    dataset_id: str
+    name: str
+    description: str | None = None
+    status: str
+    # Optional size figures; None when not available.
+    rows: int | None = None
+    features: int | None = None
+    compliance_score: float | None = None
+    # Timestamps carried as strings (format not fixed by this model).
+    created_at: str
+    updated_at: str
+class DatasetList(BaseModel):
+    """A list of dataset summaries together with a total count."""
+    items: list[DatasetSummary]
+    # Presumably the overall number of datasets, which may exceed len(items)
+    # if the listing is paginated — TODO confirm in the datasets route.
+    total: int
+class Dataset(BaseModel):
+    """Detail-view shape of a dataset, with its current spec and latest run."""
+    dataset_id: str
+    name: str
+    description: str | None = None
+    status: str
+    # Both nested objects are optional and default to None.
+    current_spec: SpecDetail | None = None
+    latest_run: LatestRun | None = None
+    # Timestamps carried as strings (format not fixed by this model).
+    created_at: str
+    updated_at: str
+class CreateDatasetRequest(BaseModel):
+    """Request body for creating a dataset; only ``name`` is required."""
+    name: str
+    description: str | None = None
+    # Optional initial spec body; None means no spec is supplied.
+    spec: SpecBody | None = None
+class UpdateDatasetRequest(BaseModel):
+    """Request body for updating a dataset; both fields are optional."""
+    # None presumably means "leave unchanged" — TODO confirm in the datasets
+    # route that applies the update.
+    name: str | None = None
+    description: str | None = None
+# --- artifacts & reports ------------------------------------------------------
+class Artifact(BaseModel):
+    """Metadata of one stored artifact belonging to a run."""
+    artifact_id: str
+    run_id: str
+    version: str
+    # Optional; defaults to None.
+    split: str | None = None
+    format: str
+    filename: str
+    size_bytes: int
+    checksum_sha256: str
+    # Timestamp carried as a string (format not fixed by this model).
+    created_at: str
+class Report(BaseModel):
+    """Report attached to a run: an optional score plus optional sections.
+
+    Each section is a free-form dict whose keys are not modelled here; every
+    section defaults to None.
+    """
+    report_id: str
+    run_id: str
+    compliance_score: float | None = None
+    distribution: dict[str, Any] | None = None
+    correlation: dict[str, Any] | None = None
+    mutual_information: dict[str, Any] | None = None
+    causal_truth: dict[str, Any] | None = None
+    difficulty: dict[str, Any] | None = None
+    failures: dict[str, Any] | None = None
+    profile: dict[str, Any] | None = None
+    determinism: dict[str, Any] | None = None
+class PreviewResponse(BaseModel):
+    """Tabular preview: column names, rows as lists of values, and a total."""
+    columns: list[str]
+    # One inner list per row; cell values are untyped.
+    rows: list[list[Any]]
+    # Presumably the full row count of the previewed data, not len(rows) —
+    # TODO confirm in the artifacts route.
+    total: int
+# --- templates & plugins & meta ----------------------------------------------
+class TemplateSummary(BaseModel):
+    """Catalogue entry for a bundled template, without its spec."""
+    id: str
+    name: str
+    domain: str
+    description: str
+    # ``default_factory`` gives every instance its own empty list.
+    tags: list[str] = Field(default_factory=list)
+    level: str = "starter"  # "starter" | "hackathon"
+class TemplateDetail(TemplateSummary):
+    """:class:`TemplateSummary` fields plus the template's spec as a dict."""
+    spec: dict[str, Any]
+class PluginInfo(BaseModel):
+    """Description of one registered plugin."""
+    name: str
+    kind: str
+    version: str | None = None
+    # Stored as ``schema_`` but aliased to ``schema``, so the wire name is
+    # ``schema`` whenever the model is read or dumped by alias.
+    # NOTE(review): whether ``PluginInfo(schema_=...)`` is accepted depends on
+    # the model's populate-by-name config, which is not set here — confirm that
+    # callers construct it with ``schema=...``.
+    schema_: dict[str, Any] | None = Field(default=None, alias="schema")
+    source: str = "builtin"  # builtin | entrypoint | local
+    builtin: bool = True
+    enabled: bool = True
+class HealthResponse(BaseModel):
+    """Health-check payload; ``status`` defaults to ``"ok"``."""
+    status: str = "ok"
+class VersionResponse(BaseModel):
+    """Version and runtime information, all as required strings."""
+    # NOTE(review): how ``version`` differs from ``datadoom_version`` is not
+    # visible here — confirm in the meta route that fills them.
+    version: str
+    datadoom_version: str
+    python: str
+    platform: str

datadoom/api/serializers.py ADDED Viewed

@@ -0,0 +1,143 @@
+"""ORM row -> API schema converters (keeps route bodies thin)."""
+from __future__ import annotations
+from typing import Any
+from datadoom.store import (
+    ArtifactRow,
+    DatasetRow,
+    GenerationRunRow,
+    ReportRow,
+    SpecRow,
+)
+from . import schemas
+def spec_summary(row: SpecRow) -> schemas.SpecSummary:
+    """Convert a spec row to ``SpecSummary``, copying its metadata columns.
+
+    The spec body is deliberately not included; see ``spec_detail`` for that.
+    """
+    return schemas.SpecSummary(
+        spec_id=row.spec_id,
+        spec_hash=row.spec_hash,
+        version=row.version,
+        datadoom_version=row.datadoom_version,
+        created_at=row.created_at,
+    )
+def spec_detail(row: SpecRow) -> schemas.SpecDetail:
+    """Convert a spec row to ``SpecDetail``: metadata columns plus the body."""
+    return schemas.SpecDetail(
+        spec_id=row.spec_id,
+        spec_hash=row.spec_hash,
+        version=row.version,
+        datadoom_version=row.datadoom_version,
+        created_at=row.created_at,
+        # Shallow copy, so the schema holds a plain dict of its own.
+        body=dict(row.body),
+    )
+def run_summary(row: GenerationRunRow) -> schemas.RunSummary:
+    """Convert a run row to ``RunSummary``.
+
+    ``compliance_score`` is read from ``row.metrics["compliance_score"]`` and
+    stays None when ``metrics`` is empty/None or lacks that key. ``spec_hash``
+    comes from the related ``row.spec`` and is None when no spec is attached.
+    """
+    compliance = None
+    if row.metrics:
+        compliance = row.metrics.get("compliance_score")
+    return schemas.RunSummary(
+        run_id=row.run_id,
+        dataset_id=row.dataset_id,
+        spec_id=row.spec_id,
+        spec_hash=row.spec.spec_hash if row.spec is not None else None,
+        name=row.name,
+        seed=row.seed,
+        status=row.status,
+        stage=row.stage,
+        progress_pct=row.progress_pct,
+        compliance_score=compliance,
+        error=row.error,
+        metrics=row.metrics,
+        started_at=row.started_at,
+        finished_at=row.finished_at,
+        created_at=row.created_at,
+    )
+def latest_run(row: GenerationRunRow | None) -> schemas.LatestRun | None:
+    """Convert a run row to the compact ``LatestRun``; None passes through."""
+    if row is None:
+        return None
+    # Score lives inside the metrics dict; absent metrics or key yields None.
+    compliance = row.metrics.get("compliance_score") if row.metrics else None
+    return schemas.LatestRun(
+        run_id=row.run_id, status=row.status, compliance_score=compliance
+    )
+def _spec_stats(body: dict[str, Any] | None) -> tuple[int | None, int | None]:
+    """Return ``(rows, feature_count)`` read from a spec body.
+
+    A missing or empty body yields ``(None, None)``. ``rows`` is whatever the
+    body stores under ``"rows"``, passed through without a type check. The
+    feature count is ``len(body["features"])`` only when that value is a dict,
+    otherwise None.
+    """
+    if not body:
+        return None, None
+    rows = body.get("rows")
+    features = body.get("features")
+    return rows, (len(features) if isinstance(features, dict) else None)
+def dataset_summary(
+    row: DatasetRow, current_spec: SpecRow | None, latest: GenerationRunRow | None
+) -> schemas.DatasetSummary:
+    """Convert a dataset row to ``DatasetSummary``.
+
+    ``rows`` and ``features`` are derived from the current spec's body via
+    ``_spec_stats`` (both None without a spec). ``compliance_score`` is read
+    from the latest run's metrics (None without a run or without metrics).
+    """
+    rows, features = _spec_stats(current_spec.body if current_spec else None)
+    compliance = latest.metrics.get("compliance_score") if latest and latest.metrics else None
+    return schemas.DatasetSummary(
+        dataset_id=row.dataset_id,
+        name=row.name,
+        description=row.description,
+        status=row.status,
+        rows=rows,
+        features=features,
+        compliance_score=compliance,
+        created_at=row.created_at,
+        updated_at=row.updated_at,
+    )
+def dataset(
+    row: DatasetRow, current_spec: SpecRow | None, latest: GenerationRunRow | None
+) -> schemas.Dataset:
+    """Convert a dataset row to the detail-view ``Dataset``.
+
+    The current spec is embedded through ``spec_detail`` and the latest run
+    through ``latest_run``; either is None when the corresponding row is absent.
+    """
+    return schemas.Dataset(
+        dataset_id=row.dataset_id,
+        name=row.name,
+        description=row.description,
+        status=row.status,
+        current_spec=spec_detail(current_spec) if current_spec else None,
+        latest_run=latest_run(latest),
+        created_at=row.created_at,
+        updated_at=row.updated_at,
+    )
+def artifact(row: ArtifactRow) -> schemas.Artifact:
+    """Convert an artifact row to ``Artifact``, deriving ``filename`` from its URI."""
+    # The real on-disk filename is the basename of the storage URI — the
+    # authoritative name (data.csv, data.injected.csv, metadata.json, …) so the
+    # UI never has to guess clean-vs-injected from version/format.
+    # Backslashes are normalised to "/" first so Windows-style paths split the
+    # same way; a URI with no separator is used whole.
+    filename = row.storage_uri.replace("\\", "/").rsplit("/", 1)[-1]
+    return schemas.Artifact(
+        artifact_id=row.artifact_id,
+        run_id=row.run_id,
+        version=row.version,
+        split=row.split,
+        format=row.format,
+        filename=filename,
+        size_bytes=row.size_bytes,
+        checksum_sha256=row.checksum_sha256,
+        created_at=row.created_at,
+    )
+def report(row: ReportRow) -> schemas.Report:
+    """Convert a report row to ``Report``, copying every column one-to-one."""
+    return schemas.Report(
+        report_id=row.report_id,
+        run_id=row.run_id,
+        compliance_score=row.compliance_score,
+        distribution=row.distribution,
+        correlation=row.correlation,
+        mutual_information=row.mutual_information,
+        causal_truth=row.causal_truth,
+        difficulty=row.difficulty,
+        failures=row.failures,
+        profile=row.profile,
+        determinism=row.determinism,
+    )

datadoom/api/state.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""Shared application state, assembled by the app factory and hung on
+``app.state.dd``. Holds the singletons the routes need: config, the database,
+the artifact store, the event hub, and the worker pool.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from datadoom.config import Config
+from datadoom.jobs import EventHub, WorkerPool
+from datadoom.store import ArtifactStore, Database
+@dataclass
+class AppState:
+    """Container for the per-application singletons listed in the module docstring.
+
+    The five leading fields are required; ``idempotency`` starts as an empty
+    dict that is created separately for each instance.
+    """
+    config: Config
+    db: Database
+    artifacts: ArtifactStore
+    hub: EventHub
+    worker: WorkerPool
+    # In-process idempotency map: (dataset_id, key) -> run_id (08 §1). Sufficient
+    # for the single-process local server; team mode would persist this.
+    idempotency: dict[tuple[str, str], str] = field(default_factory=dict)

datadoom/api/store_helpers.py ADDED Viewed

@@ -0,0 +1,56 @@
+"""Thin helpers shared by routes: dataset loading + latest-run lookup.
+Re-exports the store repositories so route modules import them from one place,
+and centralizes the "404 if missing" and "latest run" patterns.
+"""
+from __future__ import annotations
+from sqlalchemy.orm import Session
+from datadoom.store import (
+    ArtifactRepository,
+    DatasetRepository,
+    DatasetRow,
+    GenerationRunRow,
+    ReportRepository,
+    RunRepository,
+    SpecRepository,
+)
+from .errors import http_error
+# Public surface of this module: the five re-exported repositories followed by
+# the three helpers defined here. ``DatasetRow`` and ``GenerationRunRow`` are
+# imported for annotations only and are not re-exported.
+__all__ = [
+    "DatasetRepository",
+    "SpecRepository",
+    "RunRepository",
+    "ArtifactRepository",
+    "ReportRepository",
+    "load_dataset",
+    "load_run",
+    "latest_run_row",
+]
+def load_dataset(session: Session, dataset_id: str) -> DatasetRow:
+    """Return the dataset row for ``dataset_id``.
+
+    When the repository returns None, raises the exception produced by
+    ``http_error(404, "not_found", ...)``.
+    """
+    row = DatasetRepository(session).get(dataset_id)
+    if row is None:
+        raise http_error(404, "not_found", f"dataset {dataset_id} not found")
+    return row
+def load_run(session: Session, run_id: str) -> GenerationRunRow:
+    """Return the run row for ``run_id``.
+
+    When the repository returns None, raises the exception produced by
+    ``http_error(404, "not_found", ...)``.
+    """
+    row = RunRepository(session).get(run_id)
+    if row is None:
+        raise http_error(404, "not_found", f"run {run_id} not found")
+    return row
+def latest_run_row(runs: RunRepository, dataset: DatasetRow) -> GenerationRunRow | None:
+    """Prefer the dataset's recorded latest run, else the most recent by time.
+
+    Returns None when the dataset has no runs at all.
+    """
+    if dataset.latest_run_id:
+        found = runs.get(dataset.latest_run_id)
+        # A recorded id whose run no longer resolves falls through to the
+        # listing below instead of returning None.
+        if found is not None:
+            return found
+    # NOTE(review): taking rows[0] assumes list_for_dataset returns newest
+    # first — confirm against RunRepository.
+    rows = runs.list_for_dataset(dataset.dataset_id)
+    return rows[0] if rows else None

datadoom/api/ws.py ADDED Viewed

@@ -0,0 +1,72 @@
+"""Live progress transport (08 §7): WebSocket primary, SSE fallback.
+Both subscribe to the :class:`~datadoom.jobs.progress.EventHub`, which replays
+the stage events so far to a late subscriber, then streams live updates until a
+terminal event (``completed`` / ``failed`` / ``cancelled``). The WS channel also
+accepts ``{"type":"cancel"}`` from the client.
+"""
+from __future__ import annotations
+import asyncio
+import contextlib
+import json
+from fastapi import APIRouter, Request, WebSocket, WebSocketDisconnect
+from fastapi.responses import StreamingResponse
+from .state import AppState
+router = APIRouter(tags=["ws"])
+# Event ``type`` values after which both transports below stop streaming.
+_TERMINAL = {"completed", "failed", "cancelled"}
+@router.websocket("/api/ws/runs/{run_id}")
+async def ws_run(websocket: WebSocket, run_id: str) -> None:
+    await websocket.accept()
+    state: AppState = websocket.app.state.dd
+    hub = state.hub
+    async def pump_client() -> None:
+        # Listen for client -> server messages (only "cancel" is meaningful).
+        try:
+            while True:
+                msg = await websocket.receive_text()
+                try:
+                    data = json.loads(msg)
+                except ValueError:
+                    continue
+                if data.get("type") == "cancel":
+                    hub.request_cancel(run_id)
+        except WebSocketDisconnect:
+            return
+    client_task = asyncio.create_task(pump_client())
+    try:
+        async for event in hub.subscribe(run_id):
+            await websocket.send_json(event)
+            if event.get("type") in _TERMINAL:
+                break
+    except WebSocketDisconnect:
+        pass
+    finally:
+        client_task.cancel()
+        with contextlib.suppress(RuntimeError):
+            await websocket.close()
+@router.get("/api/runs/{run_id}/events")
+async def sse_run(run_id: str, request: Request) -> StreamingResponse:
+    """Stream the hub's events for ``run_id`` as Server-Sent Events.
+
+    Each event is emitted as one ``data: <json>`` frame followed by a blank
+    line. The stream ends after the first event whose ``type`` is in
+    ``_TERMINAL``, or once the client is found to be disconnected.
+    """
+    state: AppState = request.app.state.dd
+    hub = state.hub
+    async def event_stream():  # noqa: ANN202
+        async for event in hub.subscribe(run_id):
+            # The disconnect check only runs when the hub yields an event, so a
+            # departed client is noticed at the next event, not immediately.
+            if await request.is_disconnected():
+                break
+            yield f"data: {json.dumps(event)}\n\n"
+            if event.get("type") in _TERMINAL:
+                break
+    return StreamingResponse(event_stream(), media_type="text/event-stream")

datadoom/cli/__init__.py ADDED Viewed

	@@ -0,0 +1 @@
1	+ """DataDoom command-line interface."""