PyPI - selfevals - Versions diffs - 0.2.2__py3-none-any.whl - Mend

selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
selfevals/__init__.py +19 -0
selfevals/_errors.py +44 -0
selfevals/_internal/__init__.py +0 -0
selfevals/_internal/hashing.py +23 -0
selfevals/_internal/ids.py +65 -0
selfevals/_internal/time.py +17 -0
selfevals/analysis/__init__.py +23 -0
selfevals/analysis/bundle.py +162 -0
selfevals/analysis/hypothesis.py +26 -0
selfevals/analysis/ingest.py +185 -0
selfevals/analysis/schemas.py +119 -0
selfevals/analysis/staging.py +34 -0
selfevals/api/__init__.py +24 -0
selfevals/api/__main__.py +47 -0
selfevals/api/app.py +351 -0
selfevals/api/broker.py +210 -0
selfevals/api/broker_bridge.py +29 -0
selfevals/api/queries.py +447 -0
selfevals/api/schemas.py +151 -0
selfevals/api/sse.py +114 -0
selfevals/cli/__init__.py +15 -0
selfevals/cli/_friendly.py +180 -0
selfevals/cli/_help.py +55 -0
selfevals/cli/analyze_commands.py +169 -0
selfevals/cli/commands.py +615 -0
selfevals/cli/main.py +409 -0
selfevals/decision/__init__.py +34 -0
selfevals/decision/matrix.py +185 -0
selfevals/examples/__init__.py +8 -0
selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
selfevals/examples/pingpong.py +21 -0
selfevals/graders/__init__.py +46 -0
selfevals/graders/base.py +54 -0
selfevals/graders/calibration.py +145 -0
selfevals/graders/deterministic.py +143 -0
selfevals/graders/llm_judge.py +187 -0
selfevals/graders/registry.py +66 -0
selfevals/optimization/__init__.py +47 -0
selfevals/optimization/aggregator.py +246 -0
selfevals/optimization/loop.py +432 -0
selfevals/optimization/proposers.py +202 -0
selfevals/py.typed +0 -0
selfevals/repo/__init__.py +28 -0
selfevals/repo/loader.py +276 -0
selfevals/reporter/__init__.py +21 -0
selfevals/reporter/_metrics.py +114 -0
selfevals/reporter/compare.py +221 -0
selfevals/reporter/json_report.py +105 -0
selfevals/reporter/markdown.py +232 -0
selfevals/runner/__init__.py +42 -0
selfevals/runner/adapters.py +268 -0
selfevals/runner/executor.py +234 -0
selfevals/runner/otlp_receiver.py +343 -0
selfevals/runner/otlp_to_recorder.py +180 -0
selfevals/runner/sandbox.py +46 -0
selfevals/schemas/__init__.py +213 -0
selfevals/schemas/_base.py +82 -0
selfevals/schemas/annotation.py +55 -0
selfevals/schemas/dataset.py +111 -0
selfevals/schemas/enums.py +324 -0
selfevals/schemas/eval_case.py +189 -0
selfevals/schemas/experiment.py +367 -0
selfevals/schemas/failure_mode.py +76 -0
selfevals/schemas/fleet.py +111 -0
selfevals/schemas/grader_card.py +112 -0
selfevals/schemas/iteration.py +219 -0
selfevals/schemas/registry.py +125 -0
selfevals/schemas/tool.py +43 -0
selfevals/schemas/trace.py +384 -0
selfevals/schemas/workspace.py +69 -0
selfevals/sdk/__init__.py +24 -0
selfevals/sdk/auto_instrument.py +165 -0
selfevals/sdk/context.py +45 -0
selfevals/sdk/exporter.py +50 -0
selfevals/sdk/facade.py +203 -0
selfevals/skills/__init__.py +61 -0
selfevals/storage/__init__.py +53 -0
selfevals/storage/errors.py +66 -0
selfevals/storage/filesystem.py +137 -0
selfevals/storage/interface.py +135 -0
selfevals/storage/migrations/__init__.py +80 -0
selfevals/storage/migrations/m0001_initial.py +57 -0
selfevals/storage/seed.py +199 -0
selfevals/storage/sqlite.py +232 -0
selfevals/trace/__init__.py +31 -0
selfevals/trace/otel_importer.py +455 -0
selfevals/trace/payload_router.py +106 -0
selfevals/trace/recorder.py +540 -0
selfevals/version.py +1 -0
selfevals-0.2.2.dist-info/METADATA +283 -0
selfevals-0.2.2.dist-info/RECORD +96 -0
selfevals-0.2.2.dist-info/WHEEL +4 -0
selfevals-0.2.2.dist-info/entry_points.txt +2 -0
selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0

selfevals/cli/__init__.py ADDED Viewed

@@ -0,0 +1,15 @@
+"""SelfEvals CLI.
+A thin argparse-based command surface over the rest of the library.
+Zero new runtime dependencies — Typer/Click would be friendlier but
+each pulls a dep tree we don't need yet.
+Entry point declared in `pyproject.toml`:
+    selfevals = "selfevals.cli.main:app"
+"""
+from __future__ import annotations
+from selfevals.cli.main import app
+# `app` (imported from `selfevals.cli.main` above) is the only public name
+# of this package.
+__all__ = ["app"]

selfevals/cli/_friendly.py ADDED Viewed

@@ -0,0 +1,180 @@
+"""Translate low-level exceptions into actionable CLI errors.
+This module is the single chokepoint between "what the runtime raised"
+and "what the user sees on stderr". The rule is:
+* If the failure is something a user can fix by changing inputs,
+  surfaces, or configuration, we wrap the underlying exception in a
+  :class:`SelfEvalsUserError` with a tight, file-relative message and
+  (when possible) a concrete hint.
+* If it's an internal invariant violation, we re-raise so the traceback
+  reaches the user.
+Adding a new friendly-error path: pick a function below or add one. Do
+**not** sprinkle ``except FooError`` blocks across the CLI — keeping
+the translation table here is the whole point.
+"""
+from __future__ import annotations
+import difflib
+import sqlite3
+from pathlib import Path
+from urllib.error import HTTPError, URLError
+import yaml
+from selfevals._errors import SelfEvalsUserError
+from selfevals.repo.loader import LoaderError, load_experiment_spec
+from selfevals.runner.adapters import AdapterError
+# `ExperimentSpec` is referenced only by the `load_spec` return annotation;
+# the `if False:` guard means this import never executes at runtime.
+# NOTE(review): `selfevals.repo.loader` is already imported at runtime a few
+# lines above, so the "no runtime cycle" rationale looks stale — confirm, and
+# consider the `typing.TYPE_CHECKING` guard that type checkers document.
+if False:  # for type checkers only, no runtime cycle.
+    from selfevals.repo.loader import ExperimentSpec
+def load_spec(path: str | Path, *, workspace_id: str | None = None) -> ExperimentSpec:
+    """Load a YAML experiment spec with friendly error messages.
+    Wraps :func:`selfevals.repo.loader.load_experiment_spec`. Catches
+    raw YAML parser errors and `LoaderError` and re-raises them as
+    :class:`SelfEvalsUserError` so the CLI prints a clean single line.
+    The loader already constructs nice messages for *most* failure
+    modes; this wrapper exists so callers don't have to know about
+    `LoaderError` and so the few classes of error the loader does not
+    label (a `yaml.YAMLError` leaking through, a vanished file race)
+    get the same one-line treatment.
+    """
+    spec_path = Path(path)
+    try:
+        return load_experiment_spec(spec_path, workspace_id=workspace_id)
+    except LoaderError as exc:
+        # `LoaderError` is the loader's friendly umbrella, but the dataset
+        # branch deserves the special "did you mean ..." treatment so we
+        # intercept it before falling through to the generic hint table.
+        dataset = _missing_dataset_path(exc)
+        if dataset is not None:
+            err = dataset_not_found(dataset)
+            raise err from exc
+        raise SelfEvalsUserError(str(exc), hint=_yaml_hint_if_relevant(spec_path, exc)) from exc
+    except yaml.YAMLError as exc:  # pragma: no cover - loader already wraps this
+        raise SelfEvalsUserError(
+            f"could not parse YAML {spec_path}: {exc}",
+            hint="check indentation and quoting; run `yamllint` for a line-by-line view",
+        ) from exc
+    except FileNotFoundError as exc:  # pragma: no cover - loader already handles
+        raise SelfEvalsUserError(f"experiment spec not found: {spec_path}") from exc
+def dataset_not_found(path: Path) -> SelfEvalsUserError:
+    """Build a `Dataset not found` error with a fuzzy-match suggestion.
+    Returns the exception; the caller raises (lets the caller pick
+    `raise ... from exc` to preserve a stacktrace if it has one).
+    """
+    parent = path.parent if path.parent.exists() else Path()
+    candidates: list[str] = []
+    if parent.exists():
+        for entry in parent.iterdir():
+            if entry.is_file() and entry.suffix in {".jsonl", ".json", ".yaml", ".yml"}:
+                candidates.append(entry.name)
+    closest = difflib.get_close_matches(path.name, candidates, n=1, cutoff=0.6)
+    hint: str | None = None
+    if closest:
+        hint = f"did you mean {parent / closest[0]}?"
+    return SelfEvalsUserError(f"dataset path {str(path)!r} not found", hint=hint)
+def unknown_grader(name: str, available: list[str]) -> SelfEvalsUserError:
+    """`Grader 'foo' not registered. Available: ...`."""
+    available_str = ", ".join(sorted(available)) if available else "(none)"
+    closest = difflib.get_close_matches(name, available, n=1, cutoff=0.6)
+    hint: str | None = None
+    if closest:
+        hint = f"did you mean {closest[0]!r}?"
+    return SelfEvalsUserError(
+        f"grader {name!r} not registered; available: {available_str}",
+        hint=hint,
+    )
+def wrap_adapter_error(exc: Exception, *, url: str | None = None) -> SelfEvalsUserError:
+    """Convert an `AdapterError` / `URLError` / `HTTPError` into a user error.
+    `url` is the endpoint the adapter was POSTing to, when known. The
+    message format is stable so docs/troubleshooting.md can cite it.
+    """
+    target = f" to {url}" if url else ""
+    if isinstance(exc, HTTPError):
+        return SelfEvalsUserError(
+            f"HTTP adapter got {exc.code} {exc.reason}{target}",
+            hint="check the endpoint returns 2xx with a JSON body",
+        )
+    if isinstance(exc, URLError):
+        reason = getattr(exc, "reason", exc)
+        return SelfEvalsUserError(
+            f"HTTP adapter could not reach{target} ({reason})",
+            hint="confirm the endpoint is running and reachable from this host",
+        )
+    if isinstance(exc, TimeoutError):
+        return SelfEvalsUserError(
+            f"HTTP adapter timed out{target}",
+            hint="increase timeout_seconds or check endpoint responsiveness",
+        )
+    # `AdapterError` covers contract violations (bad JSON, non-dict, etc.).
+    return SelfEvalsUserError(f"adapter error{target}: {exc}")
+def wrap_sqlite_error(exc: sqlite3.Error, *, db_path: Path | str) -> SelfEvalsUserError:
+    """Translate a raw sqlite exception into an actionable user error.
+    Classification is by case-insensitive substring match on the
+    exception text: lock/busy conditions first, then corruption, and a
+    generic "sqlite error" message (no hint) for everything else.
+    """
+    lowered = str(exc).lower()
+    if any(token in lowered for token in ("locked", "busy")):
+        message = f"sqlite database {db_path} is locked"
+        hint = "another selfevals process is using it; try `--db <new-path>` or wait"
+        return SelfEvalsUserError(message, hint=hint)
+    if any(token in lowered for token in ("malformed", "corrupt", "not a database")):
+        message = f"sqlite database {db_path} is corrupted or not a valid selfevals db"
+        hint = "back up the file and re-run with `--db <new-path>` to start clean"
+        return SelfEvalsUserError(message, hint=hint)
+    return SelfEvalsUserError(f"sqlite error at {db_path}: {exc}")
+def _missing_dataset_path(exc: LoaderError) -> Path | None:
+    """If the LoaderError comes from `_read_jsonl`'s 'dataset file not found',
+    return the missing path so the caller can add a fuzzy hint."""
+    msg = str(exc)
+    marker = "dataset file not found: "
+    if marker not in msg:
+        return None
+    # Format: "dataset file not found: <path>"
+    return Path(msg.split(marker, 1)[1].strip())
+def _yaml_hint_if_relevant(spec_path: Path, exc: LoaderError) -> str | None:
+    msg = str(exc).lower()
+    if "could not parse yaml" in msg:
+        return (
+            f"open {spec_path} and check indentation and unclosed brackets; "
+            "yaml errors usually point at the line just *after* the mistake"
+        )
+    if "workspace_id missing" in msg:
+        return "add `workspace: ws_<id>` at the top of the file or pass --workspace"
+    if "missing or non-mapping `experiment:`" in msg:
+        return "the YAML must have an `experiment:` key with the experiment block"
+    if "dataset" in msg and "not found" in msg:
+        return "check `dataset.cases_path` is relative to the YAML file"
+    if "entrypoint" in msg:
+        return "format must be 'package.module:callable_name' (note the colon)"
+    return None
+# Public names of this module. `AdapterError` is imported above rather than
+# defined here; listing it re-exports it alongside the helpers in this file.
+__all__ = [
+    "AdapterError",
+    "dataset_not_found",
+    "load_spec",
+    "unknown_grader",
+    "wrap_adapter_error",
+    "wrap_sqlite_error",
+]

selfevals/cli/_help.py ADDED Viewed

@@ -0,0 +1,55 @@
+"""Help-text helpers for the CLI.
+Centralises the epilog formatting so every subcommand renders examples
+the same way and so `tests/cli/test_help_texts.py` can assert a single
+convention ("Example:" line) across the board.
+Keep this module pure text. No business logic.
+"""
+from __future__ import annotations
+import argparse
+import textwrap
+from collections.abc import Iterable
+def epilog(*examples: str) -> str:
+    """Render one or more shell examples as an argparse epilog.
+    Each example is a single command line. The first is labelled
+    ``Example:``; any additional ones are stacked underneath without a
+    second label so the help text stays compact.
+    """
+    if not examples:
+        raise ValueError("epilog() requires at least one example")
+    lines = ["Example:"]
+    lines.extend(f"  {ex}" for ex in examples)
+    return "\n".join(lines)
+def make_subparser(
+    subparsers: argparse._SubParsersAction[argparse.ArgumentParser],
+    name: str,
+    *,
+    help_text: str,
+    description: str | None = None,
+    examples: Iterable[str] = (),
+) -> argparse.ArgumentParser:
+    """Add a subparser with a normalised description + epilog.
+    - ``help_text`` is the one-liner shown in the parent ``--help`` listing.
+    - ``description`` defaults to ``help_text`` and is shown at the top of
+      the subcommand's own ``--help``.
+    - ``examples`` becomes the epilog. Use the
+      :class:`argparse.RawDescriptionHelpFormatter` so indentation
+      survives.
+    """
+    example_list = list(examples)
+    return subparsers.add_parser(
+        name,
+        help=help_text,
+        description=textwrap.dedent(description or help_text).strip(),
+        epilog=epilog(*example_list) if example_list else None,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )

selfevals/cli/analyze_commands.py ADDED Viewed

@@ -0,0 +1,169 @@
+"""CLI commands for error analysis: `analyze pull/push` and `failuremode *`.
+These implement the handshake (design §4) and the human promotion gate (§6).
+`analyze pull` emits an AnalysisBundle as JSON on stdout; `analyze push` reads
+an AnalysisResult as JSON on stdin. The `failuremode` family manages the
+taxonomy: list, promote (candidate→official), retire, merge, edit.
+"""
+from __future__ import annotations
+import argparse
+import json
+import sys
+from pathlib import Path
+from selfevals._errors import SelfEvalsUserError
+from selfevals.analysis import build_bundle, ingest_result
+from selfevals.analysis.ingest import AnalysisIngestError
+from selfevals.analysis.schemas import AnalysisResult
+from selfevals.cli.commands import _require_entity, _storage
+from selfevals.schemas.enums import FailureModeStatus
+from selfevals.schemas.failure_mode import FailureMode
+from selfevals.storage.filesystem import FilesystemObjectStore
+from selfevals.storage.interface import ListFilter
+def _object_store(args: argparse.Namespace) -> FilesystemObjectStore:
+    """Object store rooted next to the db, for payload-routed quotes."""
+    return FilesystemObjectStore(Path(args.db).parent / "objects")
+def cmd_analyze_pull(args: argparse.Namespace) -> int:
+    storage = _storage(args)
+    try:
+        bundle = build_bundle(
+            storage,
+            workspace_id=args.workspace_id,
+            experiment_id=args.experiment_id,
+            iteration=args.iteration,
+            only_failed=not args.all,
+        )
+    finally:
+        storage.close()
+    print(json.dumps(bundle.model_dump(mode="json"), indent=2))
+    return 0
+def cmd_analyze_push(args: argparse.Namespace) -> int:
+    raw = sys.stdin.read()
+    if not raw.strip():
+        raise SelfEvalsUserError("analyze push expects an AnalysisResult JSON on stdin")
+    try:
+        result = AnalysisResult.model_validate_json(raw)
+    except ValueError as exc:
+        raise SelfEvalsUserError(f"invalid AnalysisResult JSON: {exc}") from exc
+    storage = _storage(args)
+    try:
+        summary = ingest_result(
+            storage,
+            workspace_id=args.workspace_id,
+            experiment_id=args.experiment_id,
+            result=result,
+            proposed_by=args.by,
+            object_store=_object_store(args),
+        )
+    except AnalysisIngestError as exc:
+        raise SelfEvalsUserError(str(exc)) from exc
+    finally:
+        storage.close()
+    print(f"assignments applied : {summary.assignments_applied}")
+    print(f"candidates created  : {len(summary.created_candidates)}")
+    print(f"candidates re-seen  : {len(summary.updated_candidates)}")
+    print(f"hypotheses recorded : {summary.hypotheses_recorded}")
+    if summary.created_candidates:
+        print("\nnew candidates (promote with `selfevals failuremode promote <id>`):")
+        for fm_id in summary.created_candidates:
+            print(f"  {fm_id}")
+    return 0
+def cmd_failuremode_list(args: argparse.Namespace) -> int:
+    storage = _storage(args)
+    try:
+        with storage.open(args.workspace_id) as scope:
+            modes = [
+                m
+                for m in scope.list_entities(FailureMode, ListFilter())
+                if isinstance(m, FailureMode)
+            ]
+    finally:
+        storage.close()
+    if args.status:
+        modes = [m for m in modes if str(m.status) == args.status]
+    if not modes:
+        print("(no failure modes)")
+        return 0
+    for m in sorted(modes, key=lambda x: (str(x.status), x.slug)):
+        marker = "*" if m.status == FailureModeStatus.OFFICIAL else " "
+        print(f"{marker} {m.id}  [{m.status}]  {m.slug}  ({len(m.examples)} ex)")
+    return 0
+def _load_mode(args: argparse.Namespace, fm_id: str) -> FailureMode:
+    storage = _storage(args)
+    try:
+        with storage.open(args.workspace_id) as scope:
+            fm = _require_entity(scope, FailureMode, fm_id)
+    finally:
+        storage.close()
+    assert isinstance(fm, FailureMode)
+    return fm
+def _save_mode(args: argparse.Namespace, fm: FailureMode) -> None:
+    storage = _storage(args)
+    try:
+        with storage.open(args.workspace_id) as scope:
+            scope.put_entity(fm)
+    finally:
+        storage.close()
+def cmd_failuremode_promote(args: argparse.Namespace) -> int:
+    fm = _load_mode(args, args.failure_mode_id)
+    if fm.status == FailureModeStatus.OFFICIAL:
+        print(f"{fm.id} is already official")
+        return 0
+    fm.status = FailureModeStatus.OFFICIAL
+    _save_mode(args, fm)
+    print(f"promoted {fm.id} ({fm.slug}) → official")
+    return 0
+def cmd_failuremode_retire(args: argparse.Namespace) -> int:
+    fm = _load_mode(args, args.failure_mode_id)
+    fm.status = FailureModeStatus.RETIRED
+    _save_mode(args, fm)
+    print(f"retired {fm.id} ({fm.slug})")
+    return 0
+def cmd_failuremode_merge(args: argparse.Namespace) -> int:
+    src = _load_mode(args, args.failure_mode_id)
+    dst = _load_mode(args, args.into)
+    if src.id == dst.id:
+        raise SelfEvalsUserError("cannot merge a mode into itself")
+    # Move examples to the destination, retire the source, set the back-pointer.
+    dst.examples = [*dst.examples, *src.examples]
+    src.superseded_by = dst.id
+    src.status = FailureModeStatus.RETIRED
+    _save_mode(args, dst)
+    _save_mode(args, src)
+    print(f"merged {src.id} ({src.slug}) → {dst.id} ({dst.slug}); source retired")
+    return 0
+def cmd_failuremode_edit(args: argparse.Namespace) -> int:
+    fm = _load_mode(args, args.failure_mode_id)
+    if args.title is None and args.definition is None:
+        raise SelfEvalsUserError("nothing to edit: pass --title and/or --definition")
+    if args.title is not None:
+        fm.title = args.title
+    if args.definition is not None:
+        fm.definition = args.definition
+    _save_mode(args, fm)
+    print(f"edited {fm.id} ({fm.slug})")
+    return 0