PyPI - selfevals - Versions diffs - 0.2.2__tar.gz - Mend

selfevals 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (197) hide show

selfevals-0.2.2/.gitignore +57 -0
selfevals-0.2.2/CHANGELOG.md +540 -0
selfevals-0.2.2/LICENSE +17 -0
selfevals-0.2.2/PKG-INFO +283 -0
selfevals-0.2.2/README.md +198 -0
selfevals-0.2.2/docs/STATUS.md +126 -0
selfevals-0.2.2/docs/adapters.md +291 -0
selfevals-0.2.2/docs/spec/error_analysis_design.md +372 -0
selfevals-0.2.2/docs/spec/evals_framework.md +358 -0
selfevals-0.2.2/docs/spec/operational_spec_v0.1.md +134 -0
selfevals-0.2.2/docs/spec/raw.md +13 -0
selfevals-0.2.2/docs/spec/sdk_otlp_design.md +268 -0
selfevals-0.2.2/docs/spec/taxonomy.md +110 -0
selfevals-0.2.2/docs/troubleshooting.md +171 -0
selfevals-0.2.2/evals/datasets/pingpong.jsonl +2 -0
selfevals-0.2.2/evals/experiments/example_pingpong.yaml +63 -0
selfevals-0.2.2/examples/README.md +82 -0
selfevals-0.2.2/examples/__init__.py +0 -0
selfevals-0.2.2/examples/hello_llm/__init__.py +13 -0
selfevals-0.2.2/examples/hello_llm/agent.py +429 -0
selfevals-0.2.2/examples/hello_llm/cases.jsonl +3 -0
selfevals-0.2.2/examples/hello_llm/experiment.yaml +74 -0
selfevals-0.2.2/examples/hello_openai/__init__.py +17 -0
selfevals-0.2.2/examples/hello_openai/agent.py +421 -0
selfevals-0.2.2/examples/hello_openai/cases.jsonl +3 -0
selfevals-0.2.2/examples/hello_openai/experiment.yaml +74 -0
selfevals-0.2.2/pyproject.toml +205 -0
selfevals-0.2.2/src/selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
selfevals-0.2.2/src/selfevals/__init__.py +19 -0
selfevals-0.2.2/src/selfevals/_errors.py +44 -0
selfevals-0.2.2/src/selfevals/_internal/__init__.py +0 -0
selfevals-0.2.2/src/selfevals/_internal/hashing.py +23 -0
selfevals-0.2.2/src/selfevals/_internal/ids.py +65 -0
selfevals-0.2.2/src/selfevals/_internal/time.py +17 -0
selfevals-0.2.2/src/selfevals/analysis/__init__.py +23 -0
selfevals-0.2.2/src/selfevals/analysis/bundle.py +162 -0
selfevals-0.2.2/src/selfevals/analysis/hypothesis.py +26 -0
selfevals-0.2.2/src/selfevals/analysis/ingest.py +185 -0
selfevals-0.2.2/src/selfevals/analysis/schemas.py +119 -0
selfevals-0.2.2/src/selfevals/analysis/staging.py +34 -0
selfevals-0.2.2/src/selfevals/api/__init__.py +24 -0
selfevals-0.2.2/src/selfevals/api/__main__.py +47 -0
selfevals-0.2.2/src/selfevals/api/app.py +351 -0
selfevals-0.2.2/src/selfevals/api/broker.py +210 -0
selfevals-0.2.2/src/selfevals/api/broker_bridge.py +29 -0
selfevals-0.2.2/src/selfevals/api/queries.py +447 -0
selfevals-0.2.2/src/selfevals/api/schemas.py +151 -0
selfevals-0.2.2/src/selfevals/api/sse.py +114 -0
selfevals-0.2.2/src/selfevals/cli/__init__.py +15 -0
selfevals-0.2.2/src/selfevals/cli/_friendly.py +180 -0
selfevals-0.2.2/src/selfevals/cli/_help.py +55 -0
selfevals-0.2.2/src/selfevals/cli/analyze_commands.py +169 -0
selfevals-0.2.2/src/selfevals/cli/commands.py +615 -0
selfevals-0.2.2/src/selfevals/cli/main.py +409 -0
selfevals-0.2.2/src/selfevals/decision/__init__.py +34 -0
selfevals-0.2.2/src/selfevals/decision/matrix.py +185 -0
selfevals-0.2.2/src/selfevals/examples/__init__.py +8 -0
selfevals-0.2.2/src/selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
selfevals-0.2.2/src/selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
selfevals-0.2.2/src/selfevals/examples/pingpong.py +21 -0
selfevals-0.2.2/src/selfevals/graders/__init__.py +46 -0
selfevals-0.2.2/src/selfevals/graders/base.py +54 -0
selfevals-0.2.2/src/selfevals/graders/calibration.py +145 -0
selfevals-0.2.2/src/selfevals/graders/deterministic.py +143 -0
selfevals-0.2.2/src/selfevals/graders/llm_judge.py +187 -0
selfevals-0.2.2/src/selfevals/graders/registry.py +66 -0
selfevals-0.2.2/src/selfevals/optimization/__init__.py +47 -0
selfevals-0.2.2/src/selfevals/optimization/aggregator.py +246 -0
selfevals-0.2.2/src/selfevals/optimization/loop.py +432 -0
selfevals-0.2.2/src/selfevals/optimization/proposers.py +202 -0
selfevals-0.2.2/src/selfevals/py.typed +0 -0
selfevals-0.2.2/src/selfevals/repo/__init__.py +28 -0
selfevals-0.2.2/src/selfevals/repo/loader.py +276 -0
selfevals-0.2.2/src/selfevals/reporter/__init__.py +21 -0
selfevals-0.2.2/src/selfevals/reporter/_metrics.py +114 -0
selfevals-0.2.2/src/selfevals/reporter/compare.py +221 -0
selfevals-0.2.2/src/selfevals/reporter/json_report.py +105 -0
selfevals-0.2.2/src/selfevals/reporter/markdown.py +232 -0
selfevals-0.2.2/src/selfevals/runner/__init__.py +42 -0
selfevals-0.2.2/src/selfevals/runner/adapters.py +268 -0
selfevals-0.2.2/src/selfevals/runner/executor.py +234 -0
selfevals-0.2.2/src/selfevals/runner/otlp_receiver.py +343 -0
selfevals-0.2.2/src/selfevals/runner/otlp_to_recorder.py +180 -0
selfevals-0.2.2/src/selfevals/runner/sandbox.py +46 -0
selfevals-0.2.2/src/selfevals/schemas/__init__.py +213 -0
selfevals-0.2.2/src/selfevals/schemas/_base.py +82 -0
selfevals-0.2.2/src/selfevals/schemas/annotation.py +55 -0
selfevals-0.2.2/src/selfevals/schemas/dataset.py +111 -0
selfevals-0.2.2/src/selfevals/schemas/enums.py +324 -0
selfevals-0.2.2/src/selfevals/schemas/eval_case.py +189 -0
selfevals-0.2.2/src/selfevals/schemas/experiment.py +367 -0
selfevals-0.2.2/src/selfevals/schemas/failure_mode.py +76 -0
selfevals-0.2.2/src/selfevals/schemas/fleet.py +111 -0
selfevals-0.2.2/src/selfevals/schemas/grader_card.py +112 -0
selfevals-0.2.2/src/selfevals/schemas/iteration.py +219 -0
selfevals-0.2.2/src/selfevals/schemas/registry.py +125 -0
selfevals-0.2.2/src/selfevals/schemas/tool.py +43 -0
selfevals-0.2.2/src/selfevals/schemas/trace.py +384 -0
selfevals-0.2.2/src/selfevals/schemas/workspace.py +69 -0
selfevals-0.2.2/src/selfevals/sdk/__init__.py +24 -0
selfevals-0.2.2/src/selfevals/sdk/auto_instrument.py +165 -0
selfevals-0.2.2/src/selfevals/sdk/context.py +45 -0
selfevals-0.2.2/src/selfevals/sdk/exporter.py +50 -0
selfevals-0.2.2/src/selfevals/sdk/facade.py +203 -0
selfevals-0.2.2/src/selfevals/skills/__init__.py +61 -0
selfevals-0.2.2/src/selfevals/storage/__init__.py +53 -0
selfevals-0.2.2/src/selfevals/storage/errors.py +66 -0
selfevals-0.2.2/src/selfevals/storage/filesystem.py +137 -0
selfevals-0.2.2/src/selfevals/storage/interface.py +135 -0
selfevals-0.2.2/src/selfevals/storage/migrations/__init__.py +80 -0
selfevals-0.2.2/src/selfevals/storage/migrations/m0001_initial.py +57 -0
selfevals-0.2.2/src/selfevals/storage/seed.py +199 -0
selfevals-0.2.2/src/selfevals/storage/sqlite.py +232 -0
selfevals-0.2.2/src/selfevals/trace/__init__.py +31 -0
selfevals-0.2.2/src/selfevals/trace/otel_importer.py +455 -0
selfevals-0.2.2/src/selfevals/trace/payload_router.py +106 -0
selfevals-0.2.2/src/selfevals/trace/recorder.py +540 -0
selfevals-0.2.2/src/selfevals/version.py +1 -0
selfevals-0.2.2/tests/__init__.py +0 -0
selfevals-0.2.2/tests/analysis/__init__.py +0 -0
selfevals-0.2.2/tests/analysis/test_handshake.py +237 -0
selfevals-0.2.2/tests/api/__init__.py +0 -0
selfevals-0.2.2/tests/api/test_api.py +127 -0
selfevals-0.2.2/tests/api/test_broker.py +58 -0
selfevals-0.2.2/tests/api/test_sse.py +67 -0
selfevals-0.2.2/tests/api/test_threads.py +106 -0
selfevals-0.2.2/tests/cli/__init__.py +0 -0
selfevals-0.2.2/tests/cli/helpers_str_agent.py +14 -0
selfevals-0.2.2/tests/cli/test_analyze_cli.py +123 -0
selfevals-0.2.2/tests/cli/test_cli.py +324 -0
selfevals-0.2.2/tests/cli/test_cli_run.py +156 -0
selfevals-0.2.2/tests/cli/test_compare.py +223 -0
selfevals-0.2.2/tests/cli/test_examples_cli.py +32 -0
selfevals-0.2.2/tests/cli/test_help_texts.py +93 -0
selfevals-0.2.2/tests/cli/test_skills_cli.py +30 -0
selfevals-0.2.2/tests/decision/__init__.py +0 -0
selfevals-0.2.2/tests/decision/test_loop_integration.py +143 -0
selfevals-0.2.2/tests/decision/test_matrix.py +206 -0
selfevals-0.2.2/tests/examples/__init__.py +0 -0
selfevals-0.2.2/tests/examples/test_hello_llm.py +171 -0
selfevals-0.2.2/tests/graders/__init__.py +0 -0
selfevals-0.2.2/tests/graders/test_calibration.py +98 -0
selfevals-0.2.2/tests/graders/test_deterministic.py +209 -0
selfevals-0.2.2/tests/graders/test_llm_judge.py +196 -0
selfevals-0.2.2/tests/integration/__init__.py +0 -0
selfevals-0.2.2/tests/integration/test_full_loop_with_mocked_judge.py +450 -0
selfevals-0.2.2/tests/optimization/__init__.py +0 -0
selfevals-0.2.2/tests/optimization/test_aggregator.py +129 -0
selfevals-0.2.2/tests/optimization/test_loop.py +283 -0
selfevals-0.2.2/tests/optimization/test_loop_error_analysis.py +280 -0
selfevals-0.2.2/tests/optimization/test_proposers.py +181 -0
selfevals-0.2.2/tests/repo/__init__.py +0 -0
selfevals-0.2.2/tests/repo/test_loader.py +246 -0
selfevals-0.2.2/tests/reporter/__init__.py +0 -0
selfevals-0.2.2/tests/reporter/test_markdown.py +384 -0
selfevals-0.2.2/tests/reporter/test_metrics.py +246 -0
selfevals-0.2.2/tests/runner/__init__.py +0 -0
selfevals-0.2.2/tests/runner/test_adapters.py +154 -0
selfevals-0.2.2/tests/runner/test_executor.py +207 -0
selfevals-0.2.2/tests/runner/test_otlp_receiver.py +196 -0
selfevals-0.2.2/tests/runner/test_otlp_to_recorder.py +108 -0
selfevals-0.2.2/tests/runner/test_sandbox.py +30 -0
selfevals-0.2.2/tests/schemas/__init__.py +0 -0
selfevals-0.2.2/tests/schemas/test_annotation.py +77 -0
selfevals-0.2.2/tests/schemas/test_base.py +96 -0
selfevals-0.2.2/tests/schemas/test_cross_entity.py +303 -0
selfevals-0.2.2/tests/schemas/test_dataset.py +108 -0
selfevals-0.2.2/tests/schemas/test_enums.py +59 -0
selfevals-0.2.2/tests/schemas/test_error_analysis_spec.py +46 -0
selfevals-0.2.2/tests/schemas/test_eval_case.py +131 -0
selfevals-0.2.2/tests/schemas/test_experiment.py +203 -0
selfevals-0.2.2/tests/schemas/test_failure_mode.py +86 -0
selfevals-0.2.2/tests/schemas/test_fleet.py +109 -0
selfevals-0.2.2/tests/schemas/test_grader_card.py +82 -0
selfevals-0.2.2/tests/schemas/test_iteration.py +248 -0
selfevals-0.2.2/tests/schemas/test_registry.py +114 -0
selfevals-0.2.2/tests/schemas/test_tool.py +47 -0
selfevals-0.2.2/tests/schemas/test_trace.py +277 -0
selfevals-0.2.2/tests/schemas/test_workspace.py +77 -0
selfevals-0.2.2/tests/sdk/__init__.py +0 -0
selfevals-0.2.2/tests/sdk/test_auto_instrument.py +96 -0
selfevals-0.2.2/tests/sdk/test_facade.py +97 -0
selfevals-0.2.2/tests/skills/__init__.py +0 -0
selfevals-0.2.2/tests/skills/test_skills_locator.py +31 -0
selfevals-0.2.2/tests/storage/__init__.py +0 -0
selfevals-0.2.2/tests/storage/test_filesystem_object_store.py +139 -0
selfevals-0.2.2/tests/storage/test_migrations.py +34 -0
selfevals-0.2.2/tests/storage/test_seed.py +68 -0
selfevals-0.2.2/tests/storage/test_sqlite_storage.py +203 -0
selfevals-0.2.2/tests/test_internal_hashing.py +23 -0
selfevals-0.2.2/tests/test_internal_ids.py +42 -0
selfevals-0.2.2/tests/test_internal_time.py +24 -0
selfevals-0.2.2/tests/test_smoke.py +6 -0
selfevals-0.2.2/tests/trace/__init__.py +0 -0
selfevals-0.2.2/tests/trace/test_otel_importer.py +463 -0
selfevals-0.2.2/tests/trace/test_payload_router.py +81 -0
selfevals-0.2.2/tests/trace/test_recorder.py +129 -0

selfevals-0.2.2/.gitignore ADDED Viewed

@@ -0,0 +1,57 @@
+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+# uv / venv
+.venv/
+venv/
+env/
+# Build artifacts
+build/
+dist/
+*.egg-info/
+*.egg
+.eggs/
+# Test / coverage
+.pytest_cache/
+.coverage
+.coverage.*
+htmlcov/
+coverage.xml
+.tox/
+.nox/
+# Type checkers
+.mypy_cache/
+.ruff_cache/
+.pyre/
+.pytype/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+# OS
+.DS_Store
+Thumbs.db
+# Local data
+*.db
+*.sqlite
+*.sqlite3
+*.sqlite-shm
+*.sqlite-wal
+.bootstrap/
+data/
+# Secrets
+.env
+.env.*
+!.env.example

selfevals-0.2.2/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,540 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is loosely based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
+Versions follow [SemVer](https://semver.org/).
+## [Unreleased]
+## [0.2.2] - 2026-05-27
+### Documentation
+- Onboarding pass after the `bootstrap` -> `selfevals` rename. Fixed the
+  CI mypy target (`src/bootstrap` -> `src/selfevals`) and 13 stale
+  `bootstrap` CLI/prose references in the bundled error-analysis skill.
+- README rewritten for a new user: provider-extras install guidance, a
+  Concepts table, both LLM examples (Anthropic + OpenAI), a full CLI
+  reference, and the global `--db` placement note. Status banners bumped
+  to the current release.
+- New `examples/README.md` (walk-through + how to adapt to your own agent)
+  and an expanded `CONTRIBUTING.md` (test layout, extras some tests need,
+  where to add a grader/adapter/proposer).
+No runtime or API changes — docs and packaging metadata only.
+## [0.2.1] - 2026-05-27
+### Changed
+- **Provider extras now bundle the provider SDK**, not just the
+  OpenInference instrumentor. `pip install selfevals[openai]` (and
+  `[anthropic]`, `[bedrock]`, `[vertex]`, `[langchain]`, `[crewai]`) now
+  pulls the provider's own SDK alongside the tracing integration — so a
+  single install is enough to run and trace a provider-backed agent. This
+  follows the Pydantic AI per-provider-extra pattern; core still depends on
+  no provider SDK (only `pydantic` + `pyyaml`).
+### Added
+- **`examples/hello_openai/`** — an OpenAI twin of `examples/hello_llm/`
+  (Anthropic): same three cases, same graders, same temperature sweep,
+  only the provider call differs. Calls OpenAI Chat Completions
+  (`gpt-4o-mini`) with a deterministic fake fallback when `OPENAI_API_KEY`
+  is unset. The lazy import distinguishes "SDK missing" (prints a
+  `pip install selfevals[openai]` hint) from "no API key" (silent fake).
+## [0.2.0] - 2026-05-26
+First release prepared for PyPI (the distribution, import, and CLI names
+are all `selfevals`). Adds the error-analysis closed loop, thread
+grouping, and trace message-content capture on top of the 0.1.0 runtime.
+### Added
+- **Error analysis + failure-mode taxonomy** — a closed loop, not a dashboard:
+  it grows a per-workspace failure-mode taxonomy and drives the next experiment.
+  selfevals owns the data, contract, persistence, and verification; the
+  intelligence (open/axial coding) lives in an external coding agent. selfevals
+  never calls an LLM. Design: `docs/spec/error_analysis_design.md`.
+  - **Persistence fix** — `IterationMetrics.failure_mode_counts` now persists
+    and survives a round-trip, so "top modes of experiment X" / "trend of mode
+    Y across iterations" are answerable. Closes the v0.1.0 known gap; the
+    markdown report and `compare` start showing real failure-mode data.
+  - **`FailureMode` entity** + per-workspace taxonomy seeded by `init` (9
+    canonical modes). Lifecycle CANDIDATE → OFFICIAL → RETIRED with a **human
+    promotion gate**; `superseded_by` back-pointer on merge.
+  - **Handshake** — `selfevals analyze pull <ws> <exp>` emits an
+    `AnalysisBundle` (failed traces + live taxonomy) as JSON; `analyze push`
+    ingests an `AnalysisResult` from stdin, validating-before-writing and
+    enforcing the assignment XOR (`mode_id` *or* `new_mode_slug`) and
+    classify-don't-rename invariants. Re-proposing an existing slug doesn't
+    duplicate it (discover-once, classify-thereafter).
+  - **`failuremode` CLI** — `list / promote / retire / merge / edit` for
+    taxonomy management and the human gate.
+  - **Closing the loop** — `ProposerInputs.failure_modes_consulted` carries the
+    prior iteration's dominant modes so a hypothesis can target a named mode;
+    `IterationAggregate.fail_rate` is the trigger signal; verification reuses
+    the existing `compare.py` before/after on stable mode ids.
+  - **Trace persistence** — `RunSpec.persist_traces` (`none` / `all` / `failed`,
+    default `failed`) controls which per-repetition traces the loop writes,
+    stamped with their grader results. A plain `selfevals run` now leaves the
+    failed traces in storage so `analyze pull` works without the SDK/OTLP path;
+    `--persist-traces` overrides it on the CLI. Traces also carry their
+    `iteration` so `analyze pull --iteration N` scopes correctly.
+  - **YAML opt-in** — a declarative, governable `error_analysis:` block on an
+    experiment (`enabled`, `taxonomy`, `trigger.fail_rate_above + threshold`,
+    `scope`). Default off. When the trigger fires, selfevals persists an
+    advisory `AnalysisStagingRecord` ("this run is worth coding") — it never
+    invokes an agent. The pingpong example opts in.
+  - **Bundled `error-analysis` skill** — ships inside the package
+    (`selfevals/.agents/skills/`, FastAPI convention) so `pip install selfevals`
+    makes it discoverable. It encodes the *method* (open → axial coding,
+    saturation, the handshake, the human gate), not intelligence. New
+    `selfevals.skills` locator + `selfevals skills list / path` CLI.
+  - 60+ new tests across schema round-trips, the push invariants, the
+    second-round stability property, loop staging + mode carryover, the YAML
+    loader, the skills locator, and the CLI cycle. mypy --strict + ruff clean.
+- **Thread grouping** — traces can now be assembled into the conversation
+  thread they belong to. `RunInfo` gains `thread_id` + `thread_position`; the
+  OTel importer auto-detects the thread from `session.id` (OpenInference) or
+  `gen_ai.conversation.id` (OTel GenAI), without overwriting an explicit
+  caller-set `thread_id`. New read query `load_thread` + `GET
+  /workspaces/{ws}/threads/{thread_id}` return every trace sharing a thread,
+  ordered by `thread_position` (falling back to `started_at`), each turn
+  projected with its grader results so the per-turn grade is visible.
+  `TraceResponse` now surfaces `thread_id` / `thread_position`. This closes the
+  last trace-grouping gap versus LangSmith sessions; the
+  run→experiment→iteration→decision→grade chain already existed. Eight new tests.
+- OTel importer now extracts prompt/completion **message content** into
+  traces. `_build_llm_span` reconstructs ordered message lists from both
+  attribute families — OpenInference native (`llm.input_messages.{i}.message.*`,
+  `llm.output_messages.{i}.message.*`) and the OTel GenAI alias
+  (`gen_ai.prompt.{i}.*`, `gen_ai.completion.{i}.*`). When both are present the
+  native family wins. Each side gets a stable `content_hash` (on
+  `messages_hash` / `output.content_hash`) for dedup and drift detection, and
+  the structured messages are kept inline under `provider_metadata`
+  (`selfevals.messages_in` / `selfevals.messages_out`). Closes the last gap
+  versus LangSmith trace capture: the actual prompt and response text are now
+  in the trace, not just tokens/model/stop_reason. Five new importer tests.
+## [0.1.0] - 2026-05-25
+First version where the README no longer lies. `selfevals run` works
+end-to-end against a real LLM agent, error paths are actionable, and
+the markdown/JSON reports answer the obvious follow-up questions.
+Schema-wise compatible with `0.0.9`.
+### Added — usable v1 surface
+Examples and quickstart:
+- `examples/hello_llm/` — a real Anthropic agent (with deterministic
+  fakes when `ANTHROPIC_API_KEY` is unset) over 3 EvalCases:
+  sentiment classification, structured extraction, open-ended support
+  reply. Two graders combined: `DeterministicGrader` for the rule
+  cases + `LLMJudgeGrader` for the open-ended one. `GridProposer`
+  sweeps `temperature ∈ {0.0, 0.5, 1.0}`.
+- README quickstart points at `evals/experiments/example_pingpong.yaml`
+  with the exact commands. Status banner updated from "no runtime
+  yet" to "runtime functional".
+CLI UX (Day 2):
+- Every subcommand (`init`, `workspace`, `experiment`, `iteration`,
+  `report`, `run`, `compare`, `estimate`) now has a user-facing
+  one-line description and a copy-paste `Example:` epilog. Helper
+  `src/selfevals/cli/_help.py` centralizes the pattern.
+- `tests/cli/test_help_texts.py` enforces the contract.
+- `docs/adapters.md` documents the three adapters with YAML config,
+  per-adapter agent code, contracts, limitations, and a comparison
+  table.
+Errors and hardening (Day 3):
+- `SelfEvalsError` / `SelfEvalsUserError` hierarchy. User-correctable
+  failures exit with code 2 and a clean one-line message; internal
+  errors keep their traceback.
+- `src/selfevals/cli/_friendly.py` is the single translation
+  chokepoint for YAML parse errors, dataset paths (with fuzzy-match
+  suggestions via stdlib `difflib`), missing graders, HTTP adapter
+  transport errors (URL + actionable suffix), and SQLite locked /
+  corrupted cases.
+- `src/selfevals/graders/registry.py` — name→factory registry.
+  `deterministic` is pre-registered; `llm_judge` is registered
+  on-demand by the CLI. YAML can declare top-level `graders:` and
+  per-case `EvalCase.graders` filters which graders run.
+- `tests/integration/test_full_loop_with_mocked_judge.py` — 7 tests
+  covering the happy path plus each of the five friendly-error
+  shapes.
+- `docs/troubleshooting.md` documents the five common errors and
+  fixes.
+Reporter (Day 4):
+- `src/selfevals/reporter/_metrics.py` — pure helpers
+  (`compute_total_cost`, `compute_total_time_seconds`, etc.) that
+  return `None` when data is absent instead of misleading zeros.
+- Markdown report gains a "Cost & Time" section (omitted gracefully
+  when there are no LLM calls) and a "Next steps" block with
+  copy-paste inspection commands.
+- JSON report exposes a stable `cost_time` block (`None` when
+  missing).
+- `src/selfevals/reporter/compare.py` powers `selfevals compare`:
+  proposal diff table, metrics diff table, failure-mode diff, and a
+  "B is better: primary +X; no new failure modes" recommendation.
+### Fixed
+- Console script `selfevals` was pointing at `cli.main:app`, which
+  returned an int but never raised `SystemExit`, so user errors
+  silently exited 0. It now points at `cli.main:main`, which wraps
+  `app` in `SystemExit(...)`.
+- `pyproject.toml` ruff `per-file-ignores` had no entry for
+  `src/selfevals/api/**`, so legitimate FastAPI `Depends(...)`
+  defaults were flagged as B008. Added the ignore.
+- `pyproject.toml` `pytest.ini_options` was missing the `asyncio`
+  marker registration; `--strict-markers` was rejecting async tests.
+- `EvalCase.graders` was unused metadata until now — the
+  `OptimizationLoop` now filters graders per case when the field is
+  populated, preserving the prior "run everything" behavior when it
+  is empty.
+### Known gaps (not blocking v0.1.0)
+- 9 tests under `tests/sdk/` and `tests/runner/test_otlp_receiver.py`
+  require the `telemetry` extra (`uv sync --extra telemetry`) and
+  fail without it. They are excluded from the default surface.
+- 3 tests under `tests/api/` require the `web` extra
+  (`uv sync --extra web`) to install FastAPI.
+- Failure modes do not yet survive persistence to SQLite — the
+  compare and report tooling already handles their presence gracefully
+  for when the schema is extended. *(Resolved in [0.2.0]: error
+  analysis persists `failure_mode_counts`.)*
+- `CliCommandAdapter` and `HttpEndpointAdapter` are not yet
+  auto-wired from YAML; users instantiate them via a Python
+  entrypoint. `docs/adapters.md` documents the workaround.
+## [0.0.9] - 2026-05-16
+### Added — MVP Block A: YAML loader + `selfevals run` end-to-end
+Repo loader (`src/selfevals/repo/`):
+- `load_experiment_spec(path)` parses `evals/experiments/<name>.yaml` →
+  `(workspace_id, Experiment, [EvalCase], AgentEntrypoint)`. YAML keys
+  are 1:1 with the Pydantic field names — no DSL translation; the
+  validators do all the shape checking.
+- Cases can be inline (`dataset.cases_inline:`) or external JSONL
+  (`dataset.cases_path:`). Mutually exclusive; both empty rejected.
+- Agent entrypoint declared as `module.path:callable_name`.
+  `resolve_agent_callable` defers import until the runner needs it
+  (lets `selfevals inspect` validate a spec without booting user code).
+- 14 tests covering inline/external loading, workspace override,
+  missing fields, malformed YAML, invalid payloads, entrypoint
+  resolution.
+CLI `selfevals run <yaml>`:
+- Loads spec → resolves agent callable → wraps as `EmbeddedAdapter`
+  (str returns auto-coerced to `AdapterResponse`) → builds the
+  proposer per `experiment.proposer.strategy` (grid / random /
+  manual) → drives `OptimizationLoop` with `DecisionMatrixEvaluator`
+  + `DeterministicGrader` → emits markdown/JSON report.
+- Flags: `--workspace`, `--max-iterations`, `--reps`, `--format`,
+  `--no-persist`.
+- Persists `Experiment` + `IterationRecord` + `DecisionRecord` to
+  SQLite when storage is enabled; auto-seeds the workspace row.
+- 6 tests covering markdown/JSON output, persistence to SQLite,
+  missing-spec error, validation, str→AdapterResponse coercion.
+Example experiment:
+- `evals/experiments/example_pingpong.yaml` + `evals/datasets/pingpong.jsonl` +
+  `selfevals.examples.pingpong` reference agent. Serves as smoke test
+  and onboarding artifact. `uv run selfevals run evals/experiments/example_pingpong.yaml --no-persist`
+  produces a clean report out of the box.
+Refactor:
+- `DecisionMatrixEvaluator` now inherits from `DecisionEvaluatorProtocol`
+  so the type checker recognizes it as a valid argument to
+  `OptimizationLoop(decision_evaluator=...)`.
+20 new tests (390 total). mypy strict + ruff clean. One new runtime
+dep: `pyyaml>=6,<7`.
+### Added — Design docs for next implementation surfaces
+- `docs/spec/sdk_otlp_design.md`: locked blueprint for the user-side
+  SDK façade (`selfevals.init()`) + embedded OTLP HTTP receiver +
+  OpenInference auto-instrumentation. Sections 1-11 cover the
+  decisions already made (no re-litigation), package layout, exact
+  signatures, span translation table, dependency tree (optional
+  extras), test plan, and acceptance criteria. ~1500-2000 LOC budget,
+  dedicated session.
+- `docs/prompts/web_session_prompt.md`: self-contained prompt for the
+  Claude Code session that builds the web UI + SDK + OTLP receiver.
+  Includes product vibe (Stripe/Airbnb/ChatGPT/Claude/LangSmith/Mercury),
+  page inventory (8 surfaces), design tokens, stack recommendation,
+  backend contract, and "done" criteria.
+## [0.0.8] - 2026-05-16
+### Added — PR 8 + PR 9: Reporter + CLI
+Reporter (`selfevals.reporter`):
+- `render_markdown(result)` produces a PR-comment-style summary:
+  experiment header (name, goal, state, mode, proposer, iterations
+  run, termination reason), target + guardrail spec line,
+  best-iteration callout with parameters, per-iteration table
+  (`#`, primary, Δ vs running best, decision outcome, rationale —
+  with pipe-escaping and 80-char rationale truncation), and a
+  top-N failure-modes section drawn from
+  `IterationAggregate.failure_mode_counts`.
+- `render_json(result)` emits a stable, machine-readable payload
+  (`schema_version=1`) keyed on iteration index, with explicit
+  best-iteration reference. JSON path is what the CLI's `--format
+  json` flag outputs.
+- Pure: no I/O, no global state — callers decide where the strings
+  end up (stdout, a file, a GitHub PR comment).
+CLI (`selfevals` console script, argparse-only, zero new deps):
+- `selfevals init <slug>` — idempotent workspace seed via
+  `seed_workspace`; prints workspace id + member count.
+- `selfevals workspace show <ws_id>` — workspace metadata +
+  experiment count.
+- `selfevals experiment list <ws_id>` / `show <ws_id> <exp_id>` —
+  inspect experiments in storage with target + iteration progress.
+- `selfevals iteration list <ws_id> <exp_id>` — per-iteration
+  primary metric + decision outcome.
+- `selfevals report <ws_id> <exp_id> [--format markdown|json]` —
+  reconstructs an OptimizationResult from stored IterationRecords +
+  DecisionRecords (lossy on per-case GradeResults, lossless on
+  aggregates) and pipes it through the reporter.
+- `selfevals compare <ws_id> <iter_a_id> <iter_b_id>` —
+  side-by-side primary metric diff between two iterations of the
+  same experiment.
+- `selfevals estimate --cases N --space-size M --reps K
+  --cost-per-call X` — dry-run upper-bound on agent calls and
+  total USD cost before paying for a run.
+- All user-facing errors (missing entity, primary-metric mismatch,
+  invalid numeric args) go through `CommandError` → `error: <msg>`
+  on stderr → exit code 2. Unexpected exceptions surface as
+  tracebacks (bugs, not user errors).
+18 new tests (370 total: 9 reporter + 9 CLI). mypy strict + ruff
+clean. Zero new runtime deps — argparse + stdlib.
+## [0.0.7] - 2026-05-16
+### Added — PR 6 + PR 7: OptimizationLoop + Decision matrix
+Proposers:
+- `Proposer` ABC with `ProposerContext` (iteration index + history).
+- `ManualProposer`: walk a caller-supplied list of `Proposal` or
+  parameter dicts; raises `SearchSpaceExhaustedError` when done.
+- `GridProposer`: cartesian product over list-valued entries in
+  `experiment.search_space.model_params`; scalar entries are held
+  constant; empty list → raises ValueError.
+- `RandomProposer`: independent uniform sampling from each parameter
+  spec (list, `{lo, hi}`, `{choices: [...]}`, or scalar constant).
+  Bounded by `max_proposals`; seeded for reproducibility.
+- All proposals are re-validated against the experiment's editable
+  contract before being returned.
+Aggregator:
+- `aggregate_iteration(case_outcomes, primary_metric, reliability_metrics)`
+  computes pass@1 / pass@k / pass^k / consistency_rate /
+  stability_score / recovery_rate from per-case `CaseOutcome`s.
+- Worst-of policy when multiple graders run on the same repetition:
+  ERROR > FAIL > PARTIAL > SKIPPED > PASS.
+- Failure-mode counts aggregated by tag.
+- Guardrail metrics (`cost_usd_per_case`, `latency_ms_per_case_avg`)
+  surfaced when traces report cost/duration.
+OptimizationLoop:
+- Transitions experiment state DRAFT → QUEUED → RUNNING → COMPLETED.
+- For each iteration: ask proposer for a Proposal, run cases through
+  the Executor, score per-rep results with the configured graders,
+  aggregate, hand to a DecisionEvaluator, persist IterationRecord +
+  DecisionRecord (when a WorkspaceScope is provided).
+- Terminates on `search_space_exhausted`, `converged`, or
+  `max_iterations`. Convergence = no improvement above
+  `min_delta` for `patience` consecutive iterations.
+Decision matrix (PR 7):
+- `evaluate_iteration` (pure) + `DecisionMatrixEvaluator` (object).
+  Applies the §10 canonical subset that powers MVP optimization:
+  guardrail check → first-iteration target check → improvement vs
+  baseline → regression handling per `Experiment.decision` policy
+  (reject / investigate / spawn_subexperiment) or guardrail policy
+  (reject / require_tradeoff_review).
+- Missing guardrail metric values are treated as passing — the runner
+  doesn't synthesize every metric in MVP and we don't fail-shut on
+  absent data.
+- End-to-end integration test wires the evaluator into the loop and
+  verifies that improvement / no-improvement / regression each
+  produce the right DecisionRecord.outcome.
+47 new tests (352 total). mypy strict + ruff clean. Zero new deps.
+## [0.0.6] - 2026-05-16
+### Added — PR 5: Graders (deterministic + LLM judge + calibration)
+- `Grader` ABC with `GraderContext` (case + trace + optional response)
+  and `GradeResult` (label / score / reason / confidence / failure_modes
+  / details). `GradeLabel` enum: pass, fail, partial, error, skipped.
+- `DeterministicGrader`: reads rules off `EvalCase.expected`:
+  must_include, must_not_include, required_tools (looks at
+  ToolCallSpans in the trace), forbidden_tools, optional regex,
+  structured_output equality. Configurable case-sensitive mode. Each
+  rule emits a stable failure_mode tag for weighted scoring upstream.
+- `LLMJudgeGrader`: invokes any `AgentAdapter` as a judge against a
+  rubric prompt (`RubricTemplate` with safe substitution). Parses the
+  judge's JSON output into a `JudgeDecision`; unknown labels and bad
+  JSON return `GradeLabel.ERROR` rather than crashing. Honors
+  `GraderCard.blocking` thresholds: when calibration is below them the
+  grader returns SKIPPED ("degraded to advisory") unless `force=True`.
+  Single judge in MVP; the infrastructure is ready for judge panels
+  post-MVP.
+- Calibration helpers (`compute_classification_metrics`): pair
+  predictions with human labels by case_id; compute precision, recall,
+  F1 for the positive class plus macro-F1, accuracy, per-label
+  precision/recall, and confusion matrix. Counts high-risk false
+  negatives separately (the failure mode that wakes someone up).
+  Class-imbalance guard: undefined precision/recall return None.
+25 new tests (305 total). mypy strict + ruff clean. Zero new deps.
+## [0.0.5] - 2026-05-16
+### Added — PR 4: Runner (agent adapters + sandbox + executor)
+- `AgentAdapter` ABC + `AdapterRequest`/`AdapterResponse` dataclasses;
+  the narrow contract between selfevals and the agent under test.
+- `EmbeddedAdapter`: wraps a Python callable. Used for tests and
+  in-repo agents.
+- `CliCommandAdapter`: subprocess + JSON-over-stdio. Configurable
+  command, env, timeout.
+- `HttpEndpointAdapter`: POST JSON via stdlib `urllib` (no
+  third-party HTTP dep). Configurable headers + timeout.
+- All three normalize errors into `AdapterError` with the original
+  cause preserved.
+- `SandboxPolicy`: declarative mock/dry_run rules; `live_sandboxed`
+  and `live_canary` are accepted as enum values but `ensure_runnable()`
+  blocks them in MVP via `SandboxViolationError`.
+- `Executor`: runs an `EvalCase` for N repetitions through a given
+  adapter + sandbox; assembles a `Trace` per repetition via
+  `TraceRecorder`. Records adapter LLM output as an `LLMCallSpan`,
+  each tool use as a `ToolCallSpan` (sandboxed flag per policy),
+  and adapter exceptions as `ErrorSpan` + `final_state=errored`.
+24 new tests (280 total). mypy strict + ruff clean. Zero new deps.
+## [0.0.4] - 2026-05-16
+### Added — PR 3: Trace ingestion (recorder + payload router + OTel importer)
+- `PayloadRouter` — small payloads (≤4 KB by default) stay inline in
+  the Trace JSON; larger ones are written to the `ObjectStoreInterface`
+  and replaced with `oss://` pointers + sha256 hashes. Canonical
+  JSON encoding for dicts/lists guarantees stable hashing across key
+  order.
+- `TraceRecorder` — context manager that captures spans during agent
+  execution. Span context managers: `agent_turn`, `llm_call`,
+  `tool_call`. Convenience emitters: `add_retrieval`,
+  `add_memory_read/write`, `add_decision`, `add_handoff`,
+  `add_human_intervention`, `add_guardrail_check`, `add_error`.
+  Accumulates trace-level metrics (LLM call count, tool call count,
+  token totals, retries). Tool call exceptions automatically mark
+  the span ERROR with type+message. Exiting the context with an
+  uncaught exception marks the trace ERRORED.
+- `import_otel_spans` — adapter from a flat list of OTel-style span
+  dicts (gen_ai.*, openinference.*) to a selfevals Trace. Classifies
+  spans by `openinference.span.kind` / `gen_ai.*` presence,
+  normalizes finish reasons, preserves parent/child links, retains
+  unknown attributes in `provider_metadata` or CustomSpan.payload.
+  When TOOL spans carry call_ids without explicit linkage, the
+  importer synthesizes ToolUseRequest entries on the nearest LLM
+  span so the schema invariant holds; if no LLM span exists the
+  call_id is dropped silently.
+- Public surface: `selfevals.trace` re-exports `PayloadRouter`,
+  `TraceRecorder`, `import_otel_spans`.
+26 new tests (256 total). mypy strict + ruff clean. Zero new deps.
+## [0.0.3] - 2026-05-16
+### Added — PR 2: Storage layer (SQLite + filesystem + workspace scoping)
+- `StorageInterface` / `ObjectStoreInterface` / `WorkspaceScope` ABCs:
+  every read or write is bound to one `workspace_id`; cross-tenant
+  access is impossible by construction.
+- `SQLiteStorage` with single generic `entities` table (entity_type, id,
+  workspace_id, version, timestamps, payload JSON) + `objects` table.
+  Indexes on (workspace_id, entity_type[, created/updated]) and a
+  partial deleted_at index. Optimistic concurrency on `version`.
+  WAL journal mode + foreign keys on.
+- Homemade migration runner (no alembic dep): forward-only,
+  `mNNNN_<slug>.py` modules with `up(conn)`, tracked in
+  `_selfevalss_migrations`. Initial migration creates the tables.
+- `FilesystemObjectStore`: content-addressed blobs at
+  `{root}/{workspace_id}/{prefix2}/{sha256}.bin`; pointer URI
+  `oss://{workspace_id}/sha256:...` encodes its workspace.
+  SHA256 integrity check on read; collision detected if same hash
+  resolves to different bytes.
+- `seed_workspace(storage, slug, name, user_id, ...)` helper:
+  idempotent by (slug, owner), creates the Workspace + one Member
+  per `Role` (viewer, evaluator, experimenter, maintainer, admin,
+  auditor) when `assign_all_roles=True`.
+- Errors: `EntityNotFoundError`, `WorkspaceMismatchError`,
+  `OptimisticConcurrencyError`, `ObjectNotFoundError`,
+  `PointerHashMismatchError`, `IntegrityViolationError`.
+33 new tests (231 total).
+## [0.0.2] - 2026-05-16
+### Added — PR 1: Schemas-first scaffolding (Pydantic v2)
+Closed enums (`Role`, `Level`, `DatasetSource`, `GroundTruthMethod`,
+`DatasetType`, `SandboxMode`, `RuntimeLocation`, `Mode`, `ProposerStrategy`,
+`ExperimentState`, `SpanKind`, `StopReason`, `TraceState`,
+`ToolCallStatus`, `PIIStatus`, `FeatureKind`, `FeatureStatus`,
+`AgentType`, `AgentStatus`, `FleetStatus`, `DatasetStatus`, `ToolStatus`,
+`GraderCardState`, `DecisionOutcome`, `IterationState`, `Modality`).
+Entities:
+- `Workspace`, `Member` — multi-tenant primitives; workspace is
+  self-referential (its own workspace_id == id).
+- `Tool` — first-class entity needed for `editable.tool_code`.
+- `FeatureRegistry`, `RiskRegistry` — declarative taxonomies.
+- `AgentFleet`, `Agent` — agent_type-discriminated payloads.
+- `EvalCase` — taxonomy (level, feature, source, ground_truth,
+  runtime, dataset_type, risk), expected, failure_weights, blocking,
+  holdout, PII contract.
+- `Dataset` — manifest with split_allocation, lazy statistics by
+  manifest_hash, regression-class immutability when frozen.
+- `Experiment` — TargetSpec, EditableContract enforcing mode=agent_loop
+  for tool_code/workflow_graph/skills, SearchSpace, FrozenSnapshot,
+  ProposerSpec (MVP allows only the manual/grid/random strategies),
+  RunSpec, JudgeDefenses (live_canary requires outcome_metrics),
+  ReliabilitySpec (pass@N/pass^N/consistency_rate/...), DecisionPolicy,
+  state machine.
+- `IterationRecord`, `Proposal` (with `validate_against(experiment)` =
+  editable contract enforcement), `DecisionRecord` with automated +
+  human rationale.
+- `GraderCard` with blocking thresholds contract (precision >= 0.90,
+  recall >= 0.95, max high-risk FNs == 0).
+- `Annotation` with free-form labels + optional rubric_version.
+- `Trace` schema (operational §B.2): RunInfo, AgentSnapshotRef,
+  EnvironmentInfo, FinalState, discriminated `Span` union (12 kinds),
+  TokenBreakdown with cache_read/cache_creation/reasoning, CostBreakdown,
+  ReasoningBlock with provider signature, LLMOutput with
+  tool_use_requested, ToolCallSpan.tool_use_id linkage validated
+  trace-wide.
+Internal helpers: ULID + prefixed ULID id generation (stdlib only),
+canonical content_hash (sha256), tz-aware UTC time helpers.
+Tests: 197 unit tests covering every validator and enum; mypy strict
++ ruff (E/W/F/I/B/UP/N/SIM/RUF) clean.
+## [0.0.1] - 2026-05-16
+### Added
+- Initial repo scaffolding: `pyproject.toml`, ruff + mypy strict + pytest config.
+- `docs/spec/` with canonical eval framework spec, operational spec v0.1, taxonomy notes.

selfevals-0.2.2/LICENSE ADDED Viewed

@@ -0,0 +1,17 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+Copyright 2026 Patricio Valdez
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.