PyPI - evaldata - Versions diffs - 0.1.0__py3-none-any.whl - Mend

evaldata 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (38)

evaldata/__init__.py +35 -0
evaldata/cli.py +191 -0
evaldata/core/__init__.py +5 -0
evaldata/core/runner.py +60 -0
evaldata/equivalence/__init__.py +10 -0
evaldata/equivalence/columns.py +49 -0
evaldata/equivalence/compare.py +78 -0
evaldata/loaders/__init__.py +5 -0
evaldata/loaders/python.py +69 -0
evaldata/platforms/__init__.py +46 -0
evaldata/platforms/base.py +145 -0
evaldata/platforms/databricks.py +152 -0
evaldata/platforms/duckdb.py +78 -0
evaldata/platforms/postgres.py +86 -0
evaldata/platforms/registry.py +159 -0
evaldata/py.typed +0 -0
evaldata/pytest_plugin/__init__.py +1 -0
evaldata/pytest_plugin/plugin.py +89 -0
evaldata/reporting/__init__.py +5 -0
evaldata/reporting/collector.py +64 -0
evaldata/reporting/terminal.py +177 -0
evaldata/scorers/__init__.py +9 -0
evaldata/scorers/base.py +17 -0
evaldata/scorers/context.py +16 -0
evaldata/scorers/expectation_suite.py +235 -0
evaldata/scorers/query.py +134 -0
evaldata/scorers/result_set_equivalence.py +479 -0
evaldata/scorers/sql.py +764 -0
evaldata/solvers/__init__.py +27 -0
evaldata/solvers/base.py +14 -0
evaldata/solvers/callable.py +24 -0
evaldata/solvers/prompt.py +178 -0
evaldata/types.py +510 -0
evaldata-0.1.0.dist-info/METADATA +113 -0
evaldata-0.1.0.dist-info/RECORD +38 -0
evaldata-0.1.0.dist-info/WHEEL +4 -0
evaldata-0.1.0.dist-info/entry_points.txt +6 -0
evaldata-0.1.0.dist-info/licenses/LICENSE +201 -0

evaldata/__init__.py ADDED Viewed

@@ -0,0 +1,35 @@
+"""evaldata — AI evals framework for data and analytics engineering teams."""
+from typing import TYPE_CHECKING, Any
+from evaldata.core import assert_eval
+from evaldata.loaders import eval_case
+from evaldata.scorers import ExpectationSuiteScorer, ResultSetEquivalence
+from evaldata.solvers import CallableSolver
+from evaldata.types import EvalCase, PlatformRef
+if TYPE_CHECKING:
+    from evaldata.solvers import PromptSolver as PromptSolver
+__all__ = [
+    "CallableSolver",
+    "EvalCase",
+    "ExpectationSuiteScorer",
+    "PlatformRef",
+    "ResultSetEquivalence",
+    "assert_eval",
+    "eval_case",
+]
+def __getattr__(name: str) -> Any:
+    if name == "PromptSolver":
+        from evaldata.solvers import PromptSolver
+        return PromptSolver
+    msg = f"module {__name__!r} has no attribute {name!r}"
+    raise AttributeError(msg)
+def __dir__() -> list[str]:
+    return sorted([*globals(), "PromptSolver"])

evaldata/cli.py ADDED Viewed

@@ -0,0 +1,191 @@
+"""The `evaldata` command-line interface."""
+import subprocess
+import sys
+from pathlib import Path
+import typer
+from rich.console import Console
+from rich.table import Table
+from rich.text import Text
+from evaldata.platforms.registry import (
+    close_all,
+    databricks_platform,
+    duckdb_platform,
+    postgres_platform,
+    resolve,
+)
+from evaldata.types import PlatformRef
+app = typer.Typer(help="AI evals for data & analytics engineering teams.", no_args_is_help=True)  # root CLI app; the `run` and `doctor` commands below register on it via @app.command
+@app.command(context_settings={"allow_extra_args": True, "ignore_unknown_options": True})
+def run(
+    ctx: typer.Context,
+    path: str | None = typer.Argument(None, help="Path or test id to run; omit to use pytest's testpaths."),
+    json_path: Path | None = typer.Option(
+        None,
+        "--json",
+        metavar="PATH",
+        help="Also write the structured evaldata results JSON to PATH (off by default).",
+    ),
+) -> None:
+    """Run the eval suite via pytest, forwarding any extra pytest arguments verbatim.
+    Args:
+        ctx: The Typer context; its extra args are forwarded straight to pytest.
+        path: A path or test id to run; omit to use pytest's `testpaths`.
+        json_path: If given, also write the structured results JSON to this path.
+    Raises:
+        Exit: Always, carrying pytest's return code as the process exit code.
+    """
+    cmd = [sys.executable, "-m", "pytest"]
+    if path is not None:
+        cmd.append(path)
+    if json_path is not None:
+        cmd.append(f"--evaldata-json={json_path}")
+    cmd.extend(ctx.args)
+    completed = subprocess.run(cmd)  # noqa: PLW1510 - exit code is forwarded, not raised on
+    raise typer.Exit(completed.returncode)
+def _build_refs(
+    *,
+    duckdb: str | None,
+    postgres: str | None,
+    databricks_server_hostname: str | None = None,
+    databricks_http_path: str | None = None,
+) -> list[PlatformRef]:
+    """Build a `PlatformRef` for each platform flag that was provided.
+    Each branch routes through the typed registry builder, so a flag can only ever name a
+    real `PlatformKind`. The Databricks ref is built only when both its server hostname and
+    HTTP path are given (it has no single-value form).
+    Args:
+        duckdb: A DuckDB database path, or `None` if the flag was not given.
+        postgres: A PostgreSQL conninfo, or `None` if the flag was not given.
+        databricks_server_hostname: A Databricks workspace hostname, or `None`.
+        databricks_http_path: A Databricks SQL Warehouse HTTP path, or `None`.
+    Returns:
+        One `PlatformRef` per platform whose flag(s) were provided, in flag order.
+    """
+    refs: list[PlatformRef] = []
+    if duckdb is not None:
+        refs.append(duckdb_platform(name="duckdb", path=duckdb))
+    if postgres is not None:
+        refs.append(postgres_platform(name="postgres", conninfo=postgres))
+    if databricks_server_hostname is not None and databricks_http_path is not None:
+        refs.append(
+            databricks_platform(
+                name="databricks",
+                server_hostname=databricks_server_hostname,
+                http_path=databricks_http_path,
+            )
+        )
+    return refs
+def _probe(ref: PlatformRef) -> tuple[bool, str]:
+    """Resolve `ref` to a live adapter and run `SELECT 1`.
+    Catches broadly on purpose: adapter construction can raise (e.g. psycopg fails to
+    connect, or an optional driver is missing), and `doctor` must report that as a FAIL
+    rather than crash. A query that fails as a value (`ExecutionResult.error`) is a FAIL
+    too.
+    Args:
+        ref: The platform reference to probe.
+    Returns:
+        A tuple `(ok, detail)`: `ok` is whether the probe succeeded, and `detail` is a
+        human-readable status or error message.
+    """
+    try:
+        result = resolve(ref).execute("SELECT 1")
+    except Exception as e:  # noqa: BLE001 - diagnostics: any failure is a reported FAIL
+        return False, str(e)
+    if result.error is not None:
+        return False, result.error
+    return True, "connected"
+@app.command()
+def doctor(
+    duckdb: str | None = typer.Option(
+        None, "--duckdb", metavar="PATH", envvar="EVALDATA_DUCKDB_PATH", help="DuckDB database path to check."
+    ),
+    postgres: str | None = typer.Option(
+        None,
+        "--postgres",
+        metavar="CONNINFO",
+        envvar="EVALDATA_POSTGRES_CONNINFO",
+        help='PostgreSQL libpq conninfo to check (empty "" uses PG* env vars / libpq defaults).',
+    ),
+    databricks_server_hostname: str | None = typer.Option(
+        None,
+        "--databricks-server-hostname",
+        metavar="HOST",
+        envvar="DATABRICKS_SERVER_HOSTNAME",
+        help="Databricks workspace hostname to check (paired with --databricks-http-path).",
+    ),
+    databricks_http_path: str | None = typer.Option(
+        None,
+        "--databricks-http-path",
+        metavar="PATH",
+        envvar="DATABRICKS_HTTP_PATH",
+        help="Databricks SQL Warehouse HTTP path to check (paired with --databricks-server-hostname).",
+    ),
+) -> None:
+    """Check that the given platform connections work (one --<kind> flag per platform).
+    Args:
+        duckdb: A DuckDB database path to check (also read from `EVALDATA_DUCKDB_PATH`).
+        postgres: A PostgreSQL conninfo to check (also read from
+            `EVALDATA_POSTGRES_CONNINFO`).
+        databricks_server_hostname: A Databricks workspace hostname to check (also read from
+            `DATABRICKS_SERVER_HOSTNAME`); required together with `databricks_http_path`.
+        databricks_http_path: A Databricks SQL Warehouse HTTP path to check (also read from
+            `DATABRICKS_HTTP_PATH`); required together with `databricks_server_hostname`.
+    Raises:
+        BadParameter: If no platform flag is provided, or only one of the two Databricks flags is.
+        Exit: With code 1 if any platform connection fails.
+    """
+    if (databricks_server_hostname is None) != (databricks_http_path is None):
+        msg = "--databricks-server-hostname and --databricks-http-path must be given together"
+        raise typer.BadParameter(msg)
+    refs = _build_refs(
+        duckdb=duckdb,
+        postgres=postgres,
+        databricks_server_hostname=databricks_server_hostname,
+        databricks_http_path=databricks_http_path,
+    )
+    if not refs:
+        msg = "specify at least one platform, e.g. --duckdb PATH or --postgres CONNINFO"
+        raise typer.BadParameter(msg)
+    console = Console()
+    table = Table(title="evaldata doctor", title_justify="left")
+    table.add_column("platform")
+    table.add_column("kind")
+    table.add_column("status")
+    all_ok = True
+    try:
+        for ref in refs:
+            ok, detail = _probe(ref)
+            all_ok = all_ok and ok
+            mark = "OK" if ok else "FAIL"
+            # Text (not markup) so bracketed driver messages render verbatim.
+            table.add_row(ref.name, ref.kind, Text(f"{mark} {detail}", style="green" if ok else "red"))
+    finally:
+        close_all()  # this CLI invocation owns the adapters it resolved
+    console.print(table)
+    if not all_ok:
+        raise typer.Exit(1)

evaldata/core/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Core orchestration: the runner and the pytest-facing `assert_eval`."""
+from evaldata.core.runner import assert_eval
+__all__ = ["assert_eval"]

evaldata/core/runner.py ADDED Viewed

@@ -0,0 +1,60 @@
+"""Eval orchestration and the pytest-facing `assert_eval`."""
+from collections.abc import Sequence
+from evaldata.platforms.base import PlatformAdapter
+from evaldata.platforms.registry import resolve
+from evaldata.reporting.collector import CaseReport, record
+from evaldata.reporting.terminal import render_failure, render_solver_error
+from evaldata.scorers.base import Scorer
+from evaldata.scorers.context import ScoreContext
+from evaldata.scorers.query import QueryRunner
+from evaldata.solvers.base import Solver
+from evaldata.types import EvalCase
+def assert_eval(
+    case: EvalCase,
+    solver: Solver,
+    *,
+    scorers: Sequence[Scorer],
+    adapter: PlatformAdapter | None = None,
+) -> None:
+    """Run `case` through `solver` + a platform adapter + `scorers`; raise on any failure.
+    Solves the case, executes the produced SQL, and scores the result with each scorer.
+    The adapter is the explicitly passed `adapter` if given, otherwise resolved (and
+    session-cached) from `case.platform`. Execution is bounded by `case.cost_budget`'s
+    `max_seconds`: an overrunning query is cancelled and scored as an execution failure.
+    Args:
+        case: The eval case to run.
+        solver: The solver that produces SQL for the case.
+        scorers: Scorers applied to the execution result; all must pass.
+        adapter: A platform adapter to execute against. If omitted, one is resolved and
+            session-cached from `case.platform`.
+    Raises:
+        AssertionError: If the solver fails or any scorer fails, carrying a composed
+            diagnostic. Raising is pytest's failure protocol.
+    """
+    output = solver.solve(case)
+    if output.error is not None:
+        error = output.error
+        record(CaseReport(id=case.id, input=case.input, passed=False, error=f"solver error [{error.kind}]"))
+        raise AssertionError(render_solver_error(case, error))
+    sql = output.output
+    if sql is None:  # pragma: no cover - unreachable: SolverOutput's validator guarantees output XOR error
+        msg = f"evaldata case {case.id!r}: solver returned neither output nor error"
+        raise AssertionError(msg)
+    live = adapter if adapter is not None else resolve(case.platform)
+    max_seconds = case.cost_budget.max_seconds if case.cost_budget is not None else None
+    dialect = case.platform.dialect or case.platform.kind
+    queries = QueryRunner(live, sql, dialect, max_seconds)
+    result = queries.run(sql)
+    context = ScoreContext(queries=queries)
+    scores = [scorer.score(case, output, result, context=context) for scorer in scorers]
+    failures = [s for s in scores if not s.passed]
+    record(CaseReport(id=case.id, input=case.input, passed=not failures, scores=list(scores)))
+    if failures:
+        raise AssertionError(render_failure(case, output, result, failures))

evaldata/equivalence/__init__.py ADDED Viewed

@@ -0,0 +1,10 @@
+"""Result-set equivalence engine: column reconciliation plus the pure `build_result_set_diff` assembly seam."""
+from evaldata.equivalence.columns import ColumnReconciliation, reconcile_columns
+from evaldata.equivalence.compare import build_result_set_diff
+__all__ = [
+    "ColumnReconciliation",
+    "build_result_set_diff",
+    "reconcile_columns",
+]

evaldata/equivalence/columns.py ADDED Viewed

@@ -0,0 +1,49 @@
+"""Column reconciliation between actual and expected schemas."""
+from typing import Literal, NamedTuple
+class ColumnReconciliation(NamedTuple):
+    """The outcome of reconciling actual against expected column names.
+    Attributes:
+        in_both: Columns present in both, in expected order; the columns compared on.
+        missing: Columns expected but absent from actual, in expected order.
+        unexpected: Columns present in actual but not expected, in actual order.
+        order_mismatch: `True` only when `column_order == "strict"` and the sequences
+            differ positionally.
+    """
+    in_both: list[str]
+    missing: list[str]
+    unexpected: list[str]
+    order_mismatch: bool
+def reconcile_columns(
+    actual: list[str],
+    expected: list[str],
+    column_order: Literal["ignore", "strict"],
+) -> ColumnReconciliation:
+    """Reconcile actual against expected column-name sequences.
+    Row comparison is always keyed by name (rows are dicts), so the order signal is a
+    separate assertion rather than a constraint on row matching.
+    Args:
+        actual: Column names from the actual result set.
+        expected: Column names from the expected result set.
+        column_order: `"strict"` to flag a positional order difference, `"ignore"` to
+            disregard ordering.
+    Returns:
+        A `ColumnReconciliation`. The `in_both`/`missing`/`unexpected` lists preserve
+        source order by construction (the sets are membership lookups only).
+    """
+    actual_set = set(actual)
+    expected_set = set(expected)
+    in_both = [c for c in expected if c in actual_set]
+    missing = [c for c in expected if c not in actual_set]
+    unexpected = [c for c in actual if c not in expected_set]
+    order_mismatch = column_order == "strict" and actual != expected
+    return ColumnReconciliation(in_both, missing, unexpected, order_mismatch)

evaldata/equivalence/compare.py ADDED Viewed

@@ -0,0 +1,78 @@
+"""Pure assembly of a `ResultSetDiff` from precomputed counts, samples, and column signals."""
+from typing import Any
+from evaldata.equivalence.columns import ColumnReconciliation
+from evaldata.types import ColumnMismatch, ResultSetDiff, TypeMismatch
+def build_result_set_diff(
+    *,
+    expected_row_count: int,
+    actual_row_count: int,
+    missing_row_count: int,
+    extra_row_count: int,
+    sample_missing_rows: list[dict[str, Any]],
+    sample_extra_rows: list[dict[str, Any]],
+    columns: ColumnReconciliation,
+    type_mismatches: list[TypeMismatch],
+    column_mismatches: list[ColumnMismatch],
+) -> ResultSetDiff | None:
+    """Assemble a `ResultSetDiff` from already-computed diff signals.
+    Warehouse-free: the row counts/samples are computed by the engine and the column/type
+    signals in Python, then passed here. `column_mismatches` is populated only by the keyed
+    `FULL OUTER JOIN` path (empty for the keyless `EXCEPT ALL` path).
+    Args:
+        expected_row_count: The number of expected rows.
+        actual_row_count: The number of actual rows.
+        missing_row_count: Rows present in expected but absent from actual.
+        extra_row_count: Rows present in actual but absent from expected.
+        sample_missing_rows: A bounded sample of the missing rows.
+        sample_extra_rows: A bounded sample of the extra rows.
+        columns: The reconciliation of actual against expected column names.
+        type_mismatches: Per-column type differences over the shared columns.
+        column_mismatches: Per-column counts of key-matched rows whose value differs;
+            empty for the keyless path.
+    Returns:
+        `None` if the assembled diff records no differences (the result sets are equal),
+        else the populated `ResultSetDiff`.
+    """
+    diff = ResultSetDiff(
+        expected_row_count=expected_row_count,
+        actual_row_count=actual_row_count,
+        missing_row_count=missing_row_count,
+        extra_row_count=extra_row_count,
+        sample_missing_rows=sample_missing_rows,
+        sample_extra_rows=sample_extra_rows,
+        missing_columns=columns.missing,
+        unexpected_columns=columns.unexpected,
+        type_mismatches=type_mismatches,
+        column_mismatches=column_mismatches,
+        column_order_mismatch=columns.order_mismatch,
+    )
+    if _is_equal(diff):
+        return None
+    return diff
+def _is_equal(d: ResultSetDiff) -> bool:
+    """Whether the diff records no differences — i.e. the result sets are equal.
+    Args:
+        d: The diff to inspect.
+    Returns:
+        `True` if there are no row, column, type, or ordering differences.
+    """
+    return (
+        d.missing_row_count == 0
+        and d.extra_row_count == 0
+        and not d.missing_columns
+        and not d.unexpected_columns
+        and not d.type_mismatches
+        and not d.column_mismatches
+        and not d.column_order_mismatch
+    )

evaldata/loaders/__init__.py ADDED Viewed

@@ -0,0 +1,5 @@
+"""Loaders: build `EvalCase`s from authoring surfaces (Python decorator first; YAML in v1.x)."""
+from evaldata.loaders.python import eval_case, read_eval_case
+__all__ = ["eval_case", "read_eval_case"]

evaldata/loaders/python.py ADDED Viewed

@@ -0,0 +1,69 @@
+"""`@eval_case`: the Python authoring decorator for test cases."""
+from collections.abc import Callable
+from typing import Any, TypeVar
+from weakref import WeakKeyDictionary
+from pydantic import TypeAdapter
+from evaldata.types import ComparisonConfig, CostBudget, EvalCase, Expected, PlatformRef
+_TestFn = TypeVar("_TestFn", bound=Callable[..., Any])  # lets `eval_case`'s decorator return the decorated callable with its type unchanged
+# Built once; validates a dict into the discriminated `Expected` union.
+_EXPECTED_ADAPTER: TypeAdapter[Expected] = TypeAdapter(Expected)
+# Weak keys so a collected test function that goes away releases its entry; identity
+# lookup matches what pytest passes as `request.function`.
+_CASES: WeakKeyDictionary[Callable[..., Any], EvalCase] = WeakKeyDictionary()
+def eval_case(
+    *,
+    input: str,
+    expected: dict[str, Any] | Expected,
+    platform: PlatformRef,
+    id: str | None = None,
+    metadata: dict[str, Any] | None = None,
+    comparison: ComparisonConfig | None = None,
+    cost_budget: CostBudget | None = None,
+) -> Callable[[_TestFn], _TestFn]:
+    """Attach an `EvalCase` to a test function for the `case` fixture to inject.
+    Args:
+        input: The natural-language question / instruction under test.
+        expected: The expected outcome — a typed `Expected` or a dict coerced to one.
+        platform: A `PlatformRef` (build one with `duckdb_platform` / `postgres_platform`).
+        id: Case identifier; defaults to the decorated function's name.
+        metadata: Optional free-form tags/owner/source metadata.
+        comparison: Optional result-set comparison rules; defaults to `ComparisonConfig()`.
+        cost_budget: Optional ceiling on platform resource consumption for the case.
+    Returns:
+        A decorator that records the case and returns the function unchanged.
+    """
+    coerced: Expected = _EXPECTED_ADAPTER.validate_python(expected) if isinstance(expected, dict) else expected
+    def decorator(func: _TestFn) -> _TestFn:
+        extra: dict[str, Any] = {}
+        if metadata is not None:
+            extra["metadata"] = metadata
+        if comparison is not None:
+            extra["comparison"] = comparison
+        if cost_budget is not None:
+            extra["cost_budget"] = cost_budget
+        _CASES[func] = EvalCase(
+            id=id or getattr(func, "__name__", ""),
+            input=input,
+            expected=coerced,
+            platform=platform,
+            **extra,
+        )
+        return func
+    return decorator
+def read_eval_case(func: Callable[..., Any]) -> EvalCase | None:
+    """Return the `EvalCase` attached to `func` by `@eval_case`, or `None`."""
+    return _CASES.get(func)

evaldata/platforms/__init__.py ADDED Viewed

@@ -0,0 +1,46 @@
+"""Platform adapters: per-platform integrations that execute SQL against a data platform."""
+import importlib
+from typing import TYPE_CHECKING, Any
+from evaldata.platforms.base import PlatformAdapter
+from evaldata.platforms.duckdb import DuckDBAdapter
+from evaldata.platforms.registry import databricks_platform, duckdb_platform, postgres_platform, resolve
+if TYPE_CHECKING:
+    from evaldata.platforms.databricks import DatabricksAdapter
+    from evaldata.platforms.postgres import PostgresAdapter
+__all__ = [
+    "DatabricksAdapter",
+    "DuckDBAdapter",
+    "PlatformAdapter",
+    "PostgresAdapter",
+    "databricks_platform",
+    "duckdb_platform",
+    "postgres_platform",
+    "resolve",
+]
+_LAZY_ADAPTERS = {  # exported adapter name -> (module imported on first access, evaldata extra named in the ImportError hint)
+    "PostgresAdapter": ("evaldata.platforms.postgres", "postgres"),
+    "DatabricksAdapter": ("evaldata.platforms.databricks", "databricks"),
+}
+def __getattr__(name: str) -> Any:
+    lazy = _LAZY_ADAPTERS.get(name)
+    if lazy is not None:
+        module_path, extra = lazy
+        try:
+            module = importlib.import_module(module_path)
+        except ImportError as e:
+            msg = f"{name} requires the {extra!r} extra: install evaldata[{extra}]"
+            raise ImportError(msg) from e
+        return getattr(module, name)
+    msg = f"module {__name__!r} has no attribute {name!r}"
+    raise AttributeError(msg)
+def __dir__() -> list[str]:
+    return sorted([*globals(), *_LAZY_ADAPTERS])