PyPI - selfevals - Versions diffs - 0.2.2__py3-none-any.whl - Mend

selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (96) hide show

selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
selfevals/__init__.py +19 -0
selfevals/_errors.py +44 -0
selfevals/_internal/__init__.py +0 -0
selfevals/_internal/hashing.py +23 -0
selfevals/_internal/ids.py +65 -0
selfevals/_internal/time.py +17 -0
selfevals/analysis/__init__.py +23 -0
selfevals/analysis/bundle.py +162 -0
selfevals/analysis/hypothesis.py +26 -0
selfevals/analysis/ingest.py +185 -0
selfevals/analysis/schemas.py +119 -0
selfevals/analysis/staging.py +34 -0
selfevals/api/__init__.py +24 -0
selfevals/api/__main__.py +47 -0
selfevals/api/app.py +351 -0
selfevals/api/broker.py +210 -0
selfevals/api/broker_bridge.py +29 -0
selfevals/api/queries.py +447 -0
selfevals/api/schemas.py +151 -0
selfevals/api/sse.py +114 -0
selfevals/cli/__init__.py +15 -0
selfevals/cli/_friendly.py +180 -0
selfevals/cli/_help.py +55 -0
selfevals/cli/analyze_commands.py +169 -0
selfevals/cli/commands.py +615 -0
selfevals/cli/main.py +409 -0
selfevals/decision/__init__.py +34 -0
selfevals/decision/matrix.py +185 -0
selfevals/examples/__init__.py +8 -0
selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
selfevals/examples/pingpong.py +21 -0
selfevals/graders/__init__.py +46 -0
selfevals/graders/base.py +54 -0
selfevals/graders/calibration.py +145 -0
selfevals/graders/deterministic.py +143 -0
selfevals/graders/llm_judge.py +187 -0
selfevals/graders/registry.py +66 -0
selfevals/optimization/__init__.py +47 -0
selfevals/optimization/aggregator.py +246 -0
selfevals/optimization/loop.py +432 -0
selfevals/optimization/proposers.py +202 -0
selfevals/py.typed +0 -0
selfevals/repo/__init__.py +28 -0
selfevals/repo/loader.py +276 -0
selfevals/reporter/__init__.py +21 -0
selfevals/reporter/_metrics.py +114 -0
selfevals/reporter/compare.py +221 -0
selfevals/reporter/json_report.py +105 -0
selfevals/reporter/markdown.py +232 -0
selfevals/runner/__init__.py +42 -0
selfevals/runner/adapters.py +268 -0
selfevals/runner/executor.py +234 -0
selfevals/runner/otlp_receiver.py +343 -0
selfevals/runner/otlp_to_recorder.py +180 -0
selfevals/runner/sandbox.py +46 -0
selfevals/schemas/__init__.py +213 -0
selfevals/schemas/_base.py +82 -0
selfevals/schemas/annotation.py +55 -0
selfevals/schemas/dataset.py +111 -0
selfevals/schemas/enums.py +324 -0
selfevals/schemas/eval_case.py +189 -0
selfevals/schemas/experiment.py +367 -0
selfevals/schemas/failure_mode.py +76 -0
selfevals/schemas/fleet.py +111 -0
selfevals/schemas/grader_card.py +112 -0
selfevals/schemas/iteration.py +219 -0
selfevals/schemas/registry.py +125 -0
selfevals/schemas/tool.py +43 -0
selfevals/schemas/trace.py +384 -0
selfevals/schemas/workspace.py +69 -0
selfevals/sdk/__init__.py +24 -0
selfevals/sdk/auto_instrument.py +165 -0
selfevals/sdk/context.py +45 -0
selfevals/sdk/exporter.py +50 -0
selfevals/sdk/facade.py +203 -0
selfevals/skills/__init__.py +61 -0
selfevals/storage/__init__.py +53 -0
selfevals/storage/errors.py +66 -0
selfevals/storage/filesystem.py +137 -0
selfevals/storage/interface.py +135 -0
selfevals/storage/migrations/__init__.py +80 -0
selfevals/storage/migrations/m0001_initial.py +57 -0
selfevals/storage/seed.py +199 -0
selfevals/storage/sqlite.py +232 -0
selfevals/trace/__init__.py +31 -0
selfevals/trace/otel_importer.py +455 -0
selfevals/trace/payload_router.py +106 -0
selfevals/trace/recorder.py +540 -0
selfevals/version.py +1 -0
selfevals-0.2.2.dist-info/METADATA +283 -0
selfevals-0.2.2.dist-info/RECORD +96 -0
selfevals-0.2.2.dist-info/WHEEL +4 -0
selfevals-0.2.2.dist-info/entry_points.txt +2 -0
selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0

selfevals/analysis/ingest.py ADDED Viewed

@@ -0,0 +1,185 @@
+"""Ingest an AnalysisResult: persist assignments, candidates, hypotheses.
+This is the push half of the handshake (design §4). It enforces the two
+invariants that keep the taxonomy trustworthy:
+  1. Each assignment targets exactly one of an existing `mode_id` (classify) or
+     a `new_mode_slug` (propose) — the XOR, validated on the wire model and
+     re-checked here against what actually exists.
+  2. Classify-don't-rename: an assignment may reference an existing mode but can
+     never edit its title/definition. New modes arrive only as candidates.
+     Renaming is a separate human action. This is what keeps mode identity
+     stable across analysis runs.
+New modes are created idempotently on slug (a repeated slug reuses the existing
+mode and appends to its examples rather than creating a duplicate). Hypotheses
+are stored as `HypothesisRecord` seeds linked to the experiment; they are not
+auto-run.
+"""
+from __future__ import annotations
+from dataclasses import dataclass, field
+from typing import TYPE_CHECKING
+from selfevals.analysis.schemas import AnalysisResult
+from selfevals.schemas.enums import FailureModeStatus
+from selfevals.schemas.failure_mode import FailureMode, FailureModeExample
+from selfevals.schemas.trace import GraderResult, Trace
+if TYPE_CHECKING:
+    from selfevals.storage.interface import ObjectStoreInterface, WorkspaceScope
+    from selfevals.storage.sqlite import SQLiteStorage
+class AnalysisIngestError(ValueError):
+    """Raised when an AnalysisResult cannot be applied (unknown ids, etc.).
+    `ingest_result` raises it when an assignment names a `mode_id` that is not
+    in the workspace, or a `new_mode_slug` that is neither declared in
+    `proposed_modes` nor already known. It subclasses `ValueError`.
+    """
+@dataclass
+class IngestSummary:
+    """Tally of what a single `ingest_result` call did; returned to the caller."""
+    # Ids of the candidate FailureModes created during this ingest.
+    created_candidates: list[str] = field(default_factory=list)  # fm ids
+    # Ids of already-stored modes whose slug was proposed again.
+    updated_candidates: list[str] = field(default_factory=list)  # fm ids (slug re-seen)
+    # Incremented once for every assignment that was applied.
+    assignments_applied: int = 0
+    # Set from the count returned by `_record_hypotheses`.
+    hypotheses_recorded: int = 0
+def ingest_result(
+    storage: SQLiteStorage,
+    *,
+    workspace_id: str,
+    experiment_id: str,
+    result: AnalysisResult,
+    proposed_by: str = "agent:unknown",
+    object_store: ObjectStoreInterface | None = None,
+) -> IngestSummary:
+    """Apply an AnalysisResult to the workspace. Best-effort transactional:
+    everything is validated before any write, so a bad result rejects whole."""
+    summary = IngestSummary()
+    with storage.open(workspace_id) as scope:
+        existing = [fm for fm in scope.list_entities(FailureMode) if isinstance(fm, FailureMode)]
+        by_id = {fm.id: fm for fm in existing}
+        by_slug = {fm.slug: fm for fm in existing}
+        for a in result.assignments:
+            if a.mode_id is not None and a.mode_id not in by_id:
+                raise AnalysisIngestError(f"assignment references unknown mode_id {a.mode_id!r}")
+        proposed_slugs = {p.slug for p in result.proposed_modes}
+        for a in result.assignments:
+            # An assignment can name a new slug only if it is declared in
+            # proposed_modes (so it has a definition) or already known.
+            if (
+                a.new_mode_slug is not None
+                and a.new_mode_slug not in proposed_slugs
+                and a.new_mode_slug not in by_slug
+            ):
+                raise AnalysisIngestError(
+                    f"assignment proposes new_mode_slug {a.new_mode_slug!r} "
+                    "but it is neither in proposed_modes nor already known"
+                )
+        slug_to_mode: dict[str, FailureMode] = dict(by_slug)
+        for p in result.proposed_modes:
+            if p.slug in by_slug:
+                # Slug re-seen: keep the existing mode (classify-don't-rename),
+                # only note that the agent re-proposed it.
+                summary.updated_candidates.append(by_slug[p.slug].id)
+                slug_to_mode[p.slug] = by_slug[p.slug]
+                continue
+            parent_id = by_slug[p.parent_slug].id if p.parent_slug in by_slug else None
+            mode = FailureMode(
+                id=FailureMode.make_id(),
+                workspace_id=workspace_id,
+                slug=p.slug,
+                title=p.title,
+                definition=p.definition,
+                status=FailureModeStatus.CANDIDATE,
+                parent_mode_id=parent_id,
+                proposed_by=proposed_by,
+            )
+            scope.put_entity(mode)
+            slug_to_mode[p.slug] = mode
+            by_id[mode.id] = mode
+            summary.created_candidates.append(mode.id)
+        for a in result.assignments:
+            resolved_id = (
+                a.mode_id if a.mode_id is not None else slug_to_mode[a.new_mode_slug].id  # type: ignore[index]
+            )
+            trace = scope.get_entity(Trace, a.trace_id)
+            assert isinstance(trace, Trace)
+            _stamp_mode_on_trace(trace, resolved_id, grader="error_analysis")
+            scope.put_entity(trace)
+            # Append example evidence to the mode (payload-route the quote).
+            mode = by_id[resolved_id]
+            quote_pointer = None
+            quote_hash = None
+            if a.quote and object_store is not None:
+                from selfevals.trace.payload_router import PayloadRouter
+                router = PayloadRouter(object_store, workspace_id=workspace_id)
+                routed = router.route_value(f"fm_example:{a.trace_id}", a.quote)
+                quote_pointer = routed.pointer
+                quote_hash = routed.content_hash
+            mode.examples.append(
+                FailureModeExample(
+                    trace_id=a.trace_id,
+                    quote_pointer=quote_pointer,
+                    quote_hash=quote_hash,
+                    note=a.open_note,
+                )
+            )
+            scope.put_entity(mode)
+            summary.assignments_applied += 1
+        summary.hypotheses_recorded = _record_hypotheses(
+            scope, workspace_id=workspace_id, experiment_id=experiment_id, result=result
+        )
+    return summary
+def _stamp_mode_on_trace(trace: Trace, mode_id: str, *, grader: str) -> None:
+    """Add `mode_id` to the trace's grader results without duplicating.
+    If an error-analysis GraderResult already exists, extend it; otherwise add
+    one carrying the trace's worst label so the link has context.
+    """
+    for gr in trace.grader_results:
+        if gr.grader == grader:
+            if mode_id not in gr.failure_modes:
+                gr.failure_modes = [*gr.failure_modes, mode_id]
+            return
+    worst = "fail"
+    for gr in trace.grader_results:
+        if gr.label in {"error", "fail", "partial"}:
+            worst = gr.label
+            break
+    trace.grader_results.append(GraderResult(grader=grader, label=worst, failure_modes=[mode_id]))
+def _record_hypotheses(
+    scope: WorkspaceScope,
+    *,
+    workspace_id: str,
+    experiment_id: str,
+    result: AnalysisResult,
+) -> int:
+    """Persist hypotheses as HypothesisRecord seeds for the proposer.
+    Kept as a thin entity so the proposer (and a future llm_proposer) can
+    consult them. We do not run them here.
+    """
+    from selfevals.analysis.hypothesis import HypothesisRecord
+    count = 0
+    for h in result.hypotheses:
+        scope.put_entity(
+            HypothesisRecord(
+                id=HypothesisRecord.make_id(),
+                workspace_id=workspace_id,
+                experiment_id=experiment_id,
+                targets_mode_slug=h.targets_mode_slug,
+                statement=h.statement,
+                suggested_parameters=dict(h.suggested_parameters),
+            )
+        )
+        count += 1
+    return count

selfevals/analysis/schemas.py ADDED Viewed

@@ -0,0 +1,119 @@
+"""Wire schemas for the error-analysis handshake.
+These are the contract between selfevals and an external coding agent (the
+`error-analysis` skill). selfevals emits an `AnalysisBundle` (pull) and ingests
+an `AnalysisResult` (push). Get these right and any agent can honour the
+protocol. See docs/spec/error_analysis_design.md §4.
+They are plain `SelfEvalsModel`s (not entities) — transport shapes, not stored
+rows. The persistence happens by translating an `AnalysisResult` into
+`FailureMode` entities and `GraderResult` updates in `ingest.py`.
+"""
+from __future__ import annotations
+from typing import Any
+from pydantic import Field, model_validator
+from selfevals.schemas._base import NonEmptyStr, SelfEvalsModel
+# Default `schema_version` of both AnalysisBundle and AnalysisResult.
+ANALYSIS_SCHEMA_VERSION = "1.0.0"
+class TaxonomyEntry(SelfEvalsModel):
+    """A live failure mode the agent must classify AGAINST (never rename).
+    Listed in `AnalysisBundle.taxonomy`.
+    """
+    # `id` and `slug` are declared `NonEmptyStr`; the other fields are plain `str`.
+    id: NonEmptyStr
+    slug: NonEmptyStr
+    title: str
+    definition: str
+    # NOTE(review): presumably a FailureModeStatus value carried as text — confirm.
+    status: str
+class BundleGrade(SelfEvalsModel):
+    """Grade attached to a bundled trace (the type of `BundleTrace.grade`)."""
+    # Required grade label.
+    label: str
+    # Optional numeric score; None when absent.
+    score: float | None = None
+    # Defaults to an empty list.
+    # NOTE(review): presumably mode ids/slugs set by deterministic graders — confirm.
+    deterministic_modes: list[str] = Field(default_factory=list)
+    # Optional free text; presumably the LLM judge's explanation — confirm.
+    judge_reason: str | None = None
+class BundleMessage(SelfEvalsModel):
+    """One entry of `BundleTrace.transcript`: a role and its text content."""
+    role: str
+    content: str
+class BundleErrorSpan(SelfEvalsModel):
+    """Summary of an errored span (the type of `BundleTrace.first_error_span`)."""
+    kind: str
+    name: str
+    # Optional error text; None when the span carries none.
+    error: str | None = None
+class BundleTrace(SelfEvalsModel):
+    """One failed trace the agent needs to code.
+    Listed in `AnalysisBundle.traces`.
+    """
+    # Required identifiers.
+    trace_id: NonEmptyStr
+    run_id: NonEmptyStr
+    # Optional identifiers; None when not set.
+    thread_id: str | None = None
+    eval_case_id: str | None = None
+    # Required grade for this trace.
+    grade: BundleGrade
+    # Messages of the trace; defaults to an empty list.
+    transcript: list[BundleMessage] = Field(default_factory=list)
+    # None when no error span is attached.
+    first_error_span: BundleErrorSpan | None = None
+class AnalysisBundle(SelfEvalsModel):
+    """Payload handed to the external agent: the taxonomy to classify against
+    plus the traces to code, for one experiment in one workspace."""
+    # Defaults to ANALYSIS_SCHEMA_VERSION.
+    schema_version: str = ANALYSIS_SCHEMA_VERSION
+    workspace_id: NonEmptyStr
+    experiment_id: NonEmptyStr
+    # Optional iteration number; None when not set.
+    iteration: int | None = None
+    # Both lists default to empty.
+    taxonomy: list[TaxonomyEntry] = Field(default_factory=list)
+    traces: list[BundleTrace] = Field(default_factory=list)
+    # Reference to the instructions the agent should follow; defaults to the
+    # error-analysis skill URI.
+    instructions_ref: str = "skill://error-analysis"
+class Assignment(SelfEvalsModel):
+    """Trace → failure mode. Either an existing `mode_id` (classify) XOR a
+    `new_mode_slug` (propose). Never both, never neither — enforced here and
+    again transactionally in ingest."""
+    trace_id: NonEmptyStr
+    mode_id: str | None = None
+    new_mode_slug: str | None = None
+    open_note: str | None = None
+    quote: str | None = None
+    confidence: float | None = Field(default=None, ge=0.0, le=1.0)
+    @model_validator(mode="after")
+    def _exactly_one_target(self) -> Assignment:
+        has_id = self.mode_id is not None
+        has_slug = self.new_mode_slug is not None
+        if has_id == has_slug:
+            raise ValueError(
+                "assignment must set exactly one of mode_id (classify) or "
+                "new_mode_slug (propose) — never both, never neither"
+            )
+        return self
+class ProposedMode(SelfEvalsModel):
+    """A new candidate mode discovered during axial coding."""
+    # Slug, title and definition are all required.
+    slug: NonEmptyStr
+    title: NonEmptyStr
+    definition: NonEmptyStr
+    # Optional slug of a parent mode. Ingest links the parent only when this
+    # slug matches a mode already stored in the workspace; otherwise the new
+    # mode gets no parent.
+    parent_slug: str | None = None
+class Hypothesis(SelfEvalsModel):
+    """A testable change targeting a mode, fed to the proposer (not auto-run)."""
+    # Slug of the failure mode this hypothesis is aimed at.
+    targets_mode_slug: NonEmptyStr
+    # The hypothesis text itself.
+    statement: NonEmptyStr
+    # Free-form key/value suggestions; defaults to an empty dict.
+    suggested_parameters: dict[str, Any] = Field(default_factory=dict)
+class AnalysisResult(SelfEvalsModel):
+    """Payload returned by the external agent: trace-to-mode assignments, newly
+    proposed modes and hypotheses. Every list defaults to empty."""
+    # Defaults to ANALYSIS_SCHEMA_VERSION.
+    schema_version: str = ANALYSIS_SCHEMA_VERSION
+    assignments: list[Assignment] = Field(default_factory=list)
+    proposed_modes: list[ProposedMode] = Field(default_factory=list)
+    hypotheses: list[Hypothesis] = Field(default_factory=list)

selfevals/analysis/staging.py ADDED Viewed

@@ -0,0 +1,34 @@
+"""AnalysisStagingRecord — selfevals's advisory "this run is worth coding" marker.
+When an experiment opts into error analysis (`error_analysis.enabled`) and an
+iteration's fail rate clears the configured trigger, the loop persists one of
+these. It records *that the trigger fired* — the experiment, the iteration, the
+observed fail rate, and a human-readable reason — so a human or scheduler knows
+an `analyze pull` is worth doing.
+selfevals never invokes an agent or an LLM off the back of this. Staging is a
+signal, not an action: `analyze pull` stays a pure read you can run anytime; the
+marker just tells you *when it pays off*. See docs/spec/error_analysis_design.md §9.
+"""
+from __future__ import annotations
+from typing import ClassVar
+from pydantic import Field
+from selfevals.schemas._base import BaseEntity, NonEmptyStr
+class AnalysisStagingRecord(BaseEntity):
+    """Stored marker recording that an iteration's fail rate cleared the
+    error-analysis trigger for an experiment. Advisory only."""
+    # NOTE(review): presumably the prefix BaseEntity uses when generating ids — confirm.
+    _id_prefix: ClassVar[str] = "stg"
+    experiment_id: NonEmptyStr
+    # Iteration number; must be >= 0.
+    iteration: int = Field(ge=0)
+    # Both rates are constrained to the range [0.0, 1.0].
+    fail_rate: float = Field(ge=0.0, le=1.0)
+    threshold: float = Field(ge=0.0, le=1.0)
+    scope: str
+    """`failed_only` or `all` — what `analyze pull` should bundle for this run."""
+    # Human-readable explanation of why the marker was written; required.
+    reason: NonEmptyStr
+    consumed: bool = False
+    """Set once an `analyze pull` has acted on this staging, so it isn't re-flagged."""

selfevals/api/__init__.py ADDED Viewed

@@ -0,0 +1,24 @@
+"""HTTP bridge between the SQLite-backed storage and the web UI.
+Read-only for MVP plus two writes: create workspace, queue experiment
+spec. FastAPI is an optional extra (`pip install selfevals[web]`);
+importing this package does not import FastAPI eagerly so that the
+default install path stays slim.
+"""
+from __future__ import annotations
+from typing import TYPE_CHECKING
+if TYPE_CHECKING:
+    from fastapi import FastAPI
+def create_app(*, db_path: str | None = None) -> FastAPI:
+    """Build the FastAPI app. Defers the FastAPI import to call time."""
+    from selfevals.api.app import build_app
+    return build_app(db_path=db_path)
+__all__ = ["create_app"]

selfevals/api/__main__.py ADDED Viewed

@@ -0,0 +1,47 @@
+"""`python -m selfevals.api` — run the FastAPI app via uvicorn."""
+from __future__ import annotations
+import argparse
+import os
+import sys
+def main(argv: list[str] | None = None) -> int:
+    parser = argparse.ArgumentParser(prog="selfevals-api")
+    parser.add_argument("--host", default="127.0.0.1")
+    parser.add_argument("--port", type=int, default=8000)
+    parser.add_argument(
+        "--db",
+        default=os.environ.get("SELFEVALS_DB", "./selfevals.sqlite"),
+        help="Path to the SQLite database file.",
+    )
+    parser.add_argument(
+        "--reload",
+        action="store_true",
+        help="Enable uvicorn auto-reload (dev only).",
+    )
+    args = parser.parse_args(argv)
+    os.environ["SELFEVALS_DB"] = args.db
+    try:
+        import uvicorn
+    except ImportError as exc:
+        print(
+            "error: uvicorn is not installed. Install with: pip install selfevals[web]",
+            file=sys.stderr,
+        )
+        raise SystemExit(2) from exc
+    uvicorn.run(
+        "selfevals.api.app:build_app",
+        host=args.host,
+        port=args.port,
+        reload=args.reload,
+        factory=True,
+    )
+    return 0
+if __name__ == "__main__":  # pragma: no cover
+    raise SystemExit(main())