selfevals 0.2.2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (197) hide show
  1. selfevals-0.2.2/.gitignore +57 -0
  2. selfevals-0.2.2/CHANGELOG.md +540 -0
  3. selfevals-0.2.2/LICENSE +17 -0
  4. selfevals-0.2.2/PKG-INFO +283 -0
  5. selfevals-0.2.2/README.md +198 -0
  6. selfevals-0.2.2/docs/STATUS.md +126 -0
  7. selfevals-0.2.2/docs/adapters.md +291 -0
  8. selfevals-0.2.2/docs/spec/error_analysis_design.md +372 -0
  9. selfevals-0.2.2/docs/spec/evals_framework.md +358 -0
  10. selfevals-0.2.2/docs/spec/operational_spec_v0.1.md +134 -0
  11. selfevals-0.2.2/docs/spec/raw.md +13 -0
  12. selfevals-0.2.2/docs/spec/sdk_otlp_design.md +268 -0
  13. selfevals-0.2.2/docs/spec/taxonomy.md +110 -0
  14. selfevals-0.2.2/docs/troubleshooting.md +171 -0
  15. selfevals-0.2.2/evals/datasets/pingpong.jsonl +2 -0
  16. selfevals-0.2.2/evals/experiments/example_pingpong.yaml +63 -0
  17. selfevals-0.2.2/examples/README.md +82 -0
  18. selfevals-0.2.2/examples/__init__.py +0 -0
  19. selfevals-0.2.2/examples/hello_llm/__init__.py +13 -0
  20. selfevals-0.2.2/examples/hello_llm/agent.py +429 -0
  21. selfevals-0.2.2/examples/hello_llm/cases.jsonl +3 -0
  22. selfevals-0.2.2/examples/hello_llm/experiment.yaml +74 -0
  23. selfevals-0.2.2/examples/hello_openai/__init__.py +17 -0
  24. selfevals-0.2.2/examples/hello_openai/agent.py +421 -0
  25. selfevals-0.2.2/examples/hello_openai/cases.jsonl +3 -0
  26. selfevals-0.2.2/examples/hello_openai/experiment.yaml +74 -0
  27. selfevals-0.2.2/pyproject.toml +205 -0
  28. selfevals-0.2.2/src/selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
  29. selfevals-0.2.2/src/selfevals/__init__.py +19 -0
  30. selfevals-0.2.2/src/selfevals/_errors.py +44 -0
  31. selfevals-0.2.2/src/selfevals/_internal/__init__.py +0 -0
  32. selfevals-0.2.2/src/selfevals/_internal/hashing.py +23 -0
  33. selfevals-0.2.2/src/selfevals/_internal/ids.py +65 -0
  34. selfevals-0.2.2/src/selfevals/_internal/time.py +17 -0
  35. selfevals-0.2.2/src/selfevals/analysis/__init__.py +23 -0
  36. selfevals-0.2.2/src/selfevals/analysis/bundle.py +162 -0
  37. selfevals-0.2.2/src/selfevals/analysis/hypothesis.py +26 -0
  38. selfevals-0.2.2/src/selfevals/analysis/ingest.py +185 -0
  39. selfevals-0.2.2/src/selfevals/analysis/schemas.py +119 -0
  40. selfevals-0.2.2/src/selfevals/analysis/staging.py +34 -0
  41. selfevals-0.2.2/src/selfevals/api/__init__.py +24 -0
  42. selfevals-0.2.2/src/selfevals/api/__main__.py +47 -0
  43. selfevals-0.2.2/src/selfevals/api/app.py +351 -0
  44. selfevals-0.2.2/src/selfevals/api/broker.py +210 -0
  45. selfevals-0.2.2/src/selfevals/api/broker_bridge.py +29 -0
  46. selfevals-0.2.2/src/selfevals/api/queries.py +447 -0
  47. selfevals-0.2.2/src/selfevals/api/schemas.py +151 -0
  48. selfevals-0.2.2/src/selfevals/api/sse.py +114 -0
  49. selfevals-0.2.2/src/selfevals/cli/__init__.py +15 -0
  50. selfevals-0.2.2/src/selfevals/cli/_friendly.py +180 -0
  51. selfevals-0.2.2/src/selfevals/cli/_help.py +55 -0
  52. selfevals-0.2.2/src/selfevals/cli/analyze_commands.py +169 -0
  53. selfevals-0.2.2/src/selfevals/cli/commands.py +615 -0
  54. selfevals-0.2.2/src/selfevals/cli/main.py +409 -0
  55. selfevals-0.2.2/src/selfevals/decision/__init__.py +34 -0
  56. selfevals-0.2.2/src/selfevals/decision/matrix.py +185 -0
  57. selfevals-0.2.2/src/selfevals/examples/__init__.py +8 -0
  58. selfevals-0.2.2/src/selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
  59. selfevals-0.2.2/src/selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
  60. selfevals-0.2.2/src/selfevals/examples/pingpong.py +21 -0
  61. selfevals-0.2.2/src/selfevals/graders/__init__.py +46 -0
  62. selfevals-0.2.2/src/selfevals/graders/base.py +54 -0
  63. selfevals-0.2.2/src/selfevals/graders/calibration.py +145 -0
  64. selfevals-0.2.2/src/selfevals/graders/deterministic.py +143 -0
  65. selfevals-0.2.2/src/selfevals/graders/llm_judge.py +187 -0
  66. selfevals-0.2.2/src/selfevals/graders/registry.py +66 -0
  67. selfevals-0.2.2/src/selfevals/optimization/__init__.py +47 -0
  68. selfevals-0.2.2/src/selfevals/optimization/aggregator.py +246 -0
  69. selfevals-0.2.2/src/selfevals/optimization/loop.py +432 -0
  70. selfevals-0.2.2/src/selfevals/optimization/proposers.py +202 -0
  71. selfevals-0.2.2/src/selfevals/py.typed +0 -0
  72. selfevals-0.2.2/src/selfevals/repo/__init__.py +28 -0
  73. selfevals-0.2.2/src/selfevals/repo/loader.py +276 -0
  74. selfevals-0.2.2/src/selfevals/reporter/__init__.py +21 -0
  75. selfevals-0.2.2/src/selfevals/reporter/_metrics.py +114 -0
  76. selfevals-0.2.2/src/selfevals/reporter/compare.py +221 -0
  77. selfevals-0.2.2/src/selfevals/reporter/json_report.py +105 -0
  78. selfevals-0.2.2/src/selfevals/reporter/markdown.py +232 -0
  79. selfevals-0.2.2/src/selfevals/runner/__init__.py +42 -0
  80. selfevals-0.2.2/src/selfevals/runner/adapters.py +268 -0
  81. selfevals-0.2.2/src/selfevals/runner/executor.py +234 -0
  82. selfevals-0.2.2/src/selfevals/runner/otlp_receiver.py +343 -0
  83. selfevals-0.2.2/src/selfevals/runner/otlp_to_recorder.py +180 -0
  84. selfevals-0.2.2/src/selfevals/runner/sandbox.py +46 -0
  85. selfevals-0.2.2/src/selfevals/schemas/__init__.py +213 -0
  86. selfevals-0.2.2/src/selfevals/schemas/_base.py +82 -0
  87. selfevals-0.2.2/src/selfevals/schemas/annotation.py +55 -0
  88. selfevals-0.2.2/src/selfevals/schemas/dataset.py +111 -0
  89. selfevals-0.2.2/src/selfevals/schemas/enums.py +324 -0
  90. selfevals-0.2.2/src/selfevals/schemas/eval_case.py +189 -0
  91. selfevals-0.2.2/src/selfevals/schemas/experiment.py +367 -0
  92. selfevals-0.2.2/src/selfevals/schemas/failure_mode.py +76 -0
  93. selfevals-0.2.2/src/selfevals/schemas/fleet.py +111 -0
  94. selfevals-0.2.2/src/selfevals/schemas/grader_card.py +112 -0
  95. selfevals-0.2.2/src/selfevals/schemas/iteration.py +219 -0
  96. selfevals-0.2.2/src/selfevals/schemas/registry.py +125 -0
  97. selfevals-0.2.2/src/selfevals/schemas/tool.py +43 -0
  98. selfevals-0.2.2/src/selfevals/schemas/trace.py +384 -0
  99. selfevals-0.2.2/src/selfevals/schemas/workspace.py +69 -0
  100. selfevals-0.2.2/src/selfevals/sdk/__init__.py +24 -0
  101. selfevals-0.2.2/src/selfevals/sdk/auto_instrument.py +165 -0
  102. selfevals-0.2.2/src/selfevals/sdk/context.py +45 -0
  103. selfevals-0.2.2/src/selfevals/sdk/exporter.py +50 -0
  104. selfevals-0.2.2/src/selfevals/sdk/facade.py +203 -0
  105. selfevals-0.2.2/src/selfevals/skills/__init__.py +61 -0
  106. selfevals-0.2.2/src/selfevals/storage/__init__.py +53 -0
  107. selfevals-0.2.2/src/selfevals/storage/errors.py +66 -0
  108. selfevals-0.2.2/src/selfevals/storage/filesystem.py +137 -0
  109. selfevals-0.2.2/src/selfevals/storage/interface.py +135 -0
  110. selfevals-0.2.2/src/selfevals/storage/migrations/__init__.py +80 -0
  111. selfevals-0.2.2/src/selfevals/storage/migrations/m0001_initial.py +57 -0
  112. selfevals-0.2.2/src/selfevals/storage/seed.py +199 -0
  113. selfevals-0.2.2/src/selfevals/storage/sqlite.py +232 -0
  114. selfevals-0.2.2/src/selfevals/trace/__init__.py +31 -0
  115. selfevals-0.2.2/src/selfevals/trace/otel_importer.py +455 -0
  116. selfevals-0.2.2/src/selfevals/trace/payload_router.py +106 -0
  117. selfevals-0.2.2/src/selfevals/trace/recorder.py +540 -0
  118. selfevals-0.2.2/src/selfevals/version.py +1 -0
  119. selfevals-0.2.2/tests/__init__.py +0 -0
  120. selfevals-0.2.2/tests/analysis/__init__.py +0 -0
  121. selfevals-0.2.2/tests/analysis/test_handshake.py +237 -0
  122. selfevals-0.2.2/tests/api/__init__.py +0 -0
  123. selfevals-0.2.2/tests/api/test_api.py +127 -0
  124. selfevals-0.2.2/tests/api/test_broker.py +58 -0
  125. selfevals-0.2.2/tests/api/test_sse.py +67 -0
  126. selfevals-0.2.2/tests/api/test_threads.py +106 -0
  127. selfevals-0.2.2/tests/cli/__init__.py +0 -0
  128. selfevals-0.2.2/tests/cli/helpers_str_agent.py +14 -0
  129. selfevals-0.2.2/tests/cli/test_analyze_cli.py +123 -0
  130. selfevals-0.2.2/tests/cli/test_cli.py +324 -0
  131. selfevals-0.2.2/tests/cli/test_cli_run.py +156 -0
  132. selfevals-0.2.2/tests/cli/test_compare.py +223 -0
  133. selfevals-0.2.2/tests/cli/test_examples_cli.py +32 -0
  134. selfevals-0.2.2/tests/cli/test_help_texts.py +93 -0
  135. selfevals-0.2.2/tests/cli/test_skills_cli.py +30 -0
  136. selfevals-0.2.2/tests/decision/__init__.py +0 -0
  137. selfevals-0.2.2/tests/decision/test_loop_integration.py +143 -0
  138. selfevals-0.2.2/tests/decision/test_matrix.py +206 -0
  139. selfevals-0.2.2/tests/examples/__init__.py +0 -0
  140. selfevals-0.2.2/tests/examples/test_hello_llm.py +171 -0
  141. selfevals-0.2.2/tests/graders/__init__.py +0 -0
  142. selfevals-0.2.2/tests/graders/test_calibration.py +98 -0
  143. selfevals-0.2.2/tests/graders/test_deterministic.py +209 -0
  144. selfevals-0.2.2/tests/graders/test_llm_judge.py +196 -0
  145. selfevals-0.2.2/tests/integration/__init__.py +0 -0
  146. selfevals-0.2.2/tests/integration/test_full_loop_with_mocked_judge.py +450 -0
  147. selfevals-0.2.2/tests/optimization/__init__.py +0 -0
  148. selfevals-0.2.2/tests/optimization/test_aggregator.py +129 -0
  149. selfevals-0.2.2/tests/optimization/test_loop.py +283 -0
  150. selfevals-0.2.2/tests/optimization/test_loop_error_analysis.py +280 -0
  151. selfevals-0.2.2/tests/optimization/test_proposers.py +181 -0
  152. selfevals-0.2.2/tests/repo/__init__.py +0 -0
  153. selfevals-0.2.2/tests/repo/test_loader.py +246 -0
  154. selfevals-0.2.2/tests/reporter/__init__.py +0 -0
  155. selfevals-0.2.2/tests/reporter/test_markdown.py +384 -0
  156. selfevals-0.2.2/tests/reporter/test_metrics.py +246 -0
  157. selfevals-0.2.2/tests/runner/__init__.py +0 -0
  158. selfevals-0.2.2/tests/runner/test_adapters.py +154 -0
  159. selfevals-0.2.2/tests/runner/test_executor.py +207 -0
  160. selfevals-0.2.2/tests/runner/test_otlp_receiver.py +196 -0
  161. selfevals-0.2.2/tests/runner/test_otlp_to_recorder.py +108 -0
  162. selfevals-0.2.2/tests/runner/test_sandbox.py +30 -0
  163. selfevals-0.2.2/tests/schemas/__init__.py +0 -0
  164. selfevals-0.2.2/tests/schemas/test_annotation.py +77 -0
  165. selfevals-0.2.2/tests/schemas/test_base.py +96 -0
  166. selfevals-0.2.2/tests/schemas/test_cross_entity.py +303 -0
  167. selfevals-0.2.2/tests/schemas/test_dataset.py +108 -0
  168. selfevals-0.2.2/tests/schemas/test_enums.py +59 -0
  169. selfevals-0.2.2/tests/schemas/test_error_analysis_spec.py +46 -0
  170. selfevals-0.2.2/tests/schemas/test_eval_case.py +131 -0
  171. selfevals-0.2.2/tests/schemas/test_experiment.py +203 -0
  172. selfevals-0.2.2/tests/schemas/test_failure_mode.py +86 -0
  173. selfevals-0.2.2/tests/schemas/test_fleet.py +109 -0
  174. selfevals-0.2.2/tests/schemas/test_grader_card.py +82 -0
  175. selfevals-0.2.2/tests/schemas/test_iteration.py +248 -0
  176. selfevals-0.2.2/tests/schemas/test_registry.py +114 -0
  177. selfevals-0.2.2/tests/schemas/test_tool.py +47 -0
  178. selfevals-0.2.2/tests/schemas/test_trace.py +277 -0
  179. selfevals-0.2.2/tests/schemas/test_workspace.py +77 -0
  180. selfevals-0.2.2/tests/sdk/__init__.py +0 -0
  181. selfevals-0.2.2/tests/sdk/test_auto_instrument.py +96 -0
  182. selfevals-0.2.2/tests/sdk/test_facade.py +97 -0
  183. selfevals-0.2.2/tests/skills/__init__.py +0 -0
  184. selfevals-0.2.2/tests/skills/test_skills_locator.py +31 -0
  185. selfevals-0.2.2/tests/storage/__init__.py +0 -0
  186. selfevals-0.2.2/tests/storage/test_filesystem_object_store.py +139 -0
  187. selfevals-0.2.2/tests/storage/test_migrations.py +34 -0
  188. selfevals-0.2.2/tests/storage/test_seed.py +68 -0
  189. selfevals-0.2.2/tests/storage/test_sqlite_storage.py +203 -0
  190. selfevals-0.2.2/tests/test_internal_hashing.py +23 -0
  191. selfevals-0.2.2/tests/test_internal_ids.py +42 -0
  192. selfevals-0.2.2/tests/test_internal_time.py +24 -0
  193. selfevals-0.2.2/tests/test_smoke.py +6 -0
  194. selfevals-0.2.2/tests/trace/__init__.py +0 -0
  195. selfevals-0.2.2/tests/trace/test_otel_importer.py +463 -0
  196. selfevals-0.2.2/tests/trace/test_payload_router.py +81 -0
  197. selfevals-0.2.2/tests/trace/test_recorder.py +129 -0
@@ -0,0 +1,57 @@
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+
8
+ # uv / venv
9
+ .venv/
10
+ venv/
11
+ env/
12
+
13
+ # Build artifacts
14
+ build/
15
+ dist/
16
+ *.egg-info/
17
+ *.egg
18
+ .eggs/
19
+
20
+ # Test / coverage
21
+ .pytest_cache/
22
+ .coverage
23
+ .coverage.*
24
+ htmlcov/
25
+ coverage.xml
26
+ .tox/
27
+ .nox/
28
+
29
+ # Type checkers
30
+ .mypy_cache/
31
+ .ruff_cache/
32
+ .pyre/
33
+ .pytype/
34
+
35
+ # IDE
36
+ .vscode/
37
+ .idea/
38
+ *.swp
39
+ *.swo
40
+
41
+ # OS
42
+ .DS_Store
43
+ Thumbs.db
44
+
45
+ # Local data
46
+ *.db
47
+ *.sqlite
48
+ *.sqlite3
49
+ *.sqlite-shm
50
+ *.sqlite-wal
51
+ .bootstrap/
52
+ data/
53
+
54
+ # Secrets
55
+ .env
56
+ .env.*
57
+ !.env.example
@@ -0,0 +1,540 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project will be documented in this file.
4
+
5
+ The format is loosely based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
6
+ Versions follow [SemVer](https://semver.org/).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.2.2] - 2026-05-27
11
+
12
+ ### Documentation
13
+
14
+ - Onboarding pass after the `bootstrap` -> `selfevals` rename. Fixed the
15
+ CI mypy target (`src/bootstrap` -> `src/selfevals`) and 13 stale
16
+ `bootstrap` CLI/prose references in the bundled error-analysis skill.
17
+ - README rewritten for a new user: provider-extras install guidance, a
18
+ Concepts table, both LLM examples (Anthropic + OpenAI), a full CLI
19
+ reference, and the global `--db` placement note. Status banners bumped
20
+ to the current release.
21
+ - New `examples/README.md` (walk-through + how to adapt to your own agent)
22
+ and an expanded `CONTRIBUTING.md` (test layout, extras some tests need,
23
+ where to add a grader/adapter/proposer).
24
+
25
+ No runtime or API changes — docs and packaging metadata only.
26
+
27
+ ## [0.2.1] - 2026-05-27
28
+
29
+ ### Changed
30
+
31
+ - **Provider extras now bundle the provider SDK**, not just the
32
+ OpenInference instrumentor. `pip install selfevals[openai]` (and
33
+ `[anthropic]`, `[bedrock]`, `[vertex]`, `[langchain]`, `[crewai]`) now
34
+ pulls the provider's own SDK alongside the tracing integration — so a
35
+ single install is enough to run and trace a provider-backed agent. This
36
+ follows the Pydantic AI per-provider-extra pattern; core still depends on
37
+ no provider SDK (only `pydantic` + `pyyaml`).
38
+
39
+ ### Added
40
+
41
+ - **`examples/hello_openai/`** — an OpenAI twin of `examples/hello_llm/`
42
+ (Anthropic): same three cases, same graders, same temperature sweep,
43
+ only the provider call differs. Calls OpenAI Chat Completions
44
+ (`gpt-4o-mini`) with a deterministic fake fallback when `OPENAI_API_KEY`
45
+ is unset. The lazy import distinguishes "SDK missing" (prints a
46
+ `pip install selfevals[openai]` hint) from "no API key" (silent fake).
47
+
48
+ ## [0.2.0] - 2026-05-26
49
+
50
+ First release prepared for PyPI (distribution name `selfevals`; import and
51
+ CLI remain `selfevals`). Adds the error-analysis closed loop, thread
52
+ grouping, and trace message-content capture on top of the 0.1.0 runtime.
53
+
54
+ ### Added
55
+
56
+ - **Error analysis + failure-mode taxonomy** — a closed loop, not a dashboard:
57
+ it grows a per-workspace failure-mode taxonomy and drives the next experiment.
58
+ selfevals owns the data, contract, persistence, and verification; the
59
+ intelligence (open/axial coding) lives in an external coding agent. selfevals
60
+ never calls an LLM. Design: `docs/spec/error_analysis_design.md`.
61
+ - **Persistence fix** — `IterationMetrics.failure_mode_counts` now persists
62
+ and survives a round-trip, so "top modes of experiment X" / "trend of mode
63
+ Y across iterations" are answerable. Closes the v0.1.0 known gap; the
64
+ markdown report and `compare` start showing real failure-mode data.
65
+ - **`FailureMode` entity** + per-workspace taxonomy seeded by `init` (9
66
+ canonical modes). Lifecycle CANDIDATE → OFFICIAL → RETIRED with a **human
67
+ promotion gate**; `superseded_by` back-pointer on merge.
68
+ - **Handshake** — `selfevals analyze pull <ws> <exp>` emits an
69
+ `AnalysisBundle` (failed traces + live taxonomy) as JSON; `analyze push`
70
+ ingests an `AnalysisResult` from stdin, validating-before-writing and
71
+ enforcing the assignment XOR (`mode_id` *or* `new_mode_slug`) and
72
+ classify-don't-rename invariants. Re-proposing an existing slug doesn't
73
+ duplicate it (discover-once, classify-thereafter).
74
+ - **`failuremode` CLI** — `list / promote / retire / merge / edit` for
75
+ taxonomy management and the human gate.
76
+ - **Closing the loop** — `ProposerInputs.failure_modes_consulted` carries the
77
+ prior iteration's dominant modes so a hypothesis can target a named mode;
78
+ `IterationAggregate.fail_rate` is the trigger signal; verification reuses
79
+ the existing `compare.py` before/after on stable mode ids.
80
+ - **Trace persistence** — `RunSpec.persist_traces` (`none` / `all` / `failed`,
81
+ default `failed`) controls which per-repetition traces the loop writes,
82
+ stamped with their grader results. A plain `selfevals run` now leaves the
83
+ failed traces in storage so `analyze pull` works without the SDK/OTLP path;
84
+ `--persist-traces` overrides it on the CLI. Traces also carry their
85
+ `iteration` so `analyze pull --iteration N` scopes correctly.
86
+ - **YAML opt-in** — a declarative, governable `error_analysis:` block on an
87
+ experiment (`enabled`, `taxonomy`, `trigger.fail_rate_above + threshold`,
88
+ `scope`). Default off. When the trigger fires, selfevals persists an
89
+ advisory `AnalysisStagingRecord` ("this run is worth coding") — it never
90
+ invokes an agent. The pingpong example opts in.
91
+ - **Bundled `error-analysis` skill** — ships inside the package
92
+ (`selfevals/.agents/skills/`, FastAPI convention) so `pip install selfevals`
93
+ makes it discoverable. It encodes the *method* (open → axial coding,
94
+ saturation, the handshake, the human gate), not intelligence. New
95
+ `selfevals.skills` locator + `selfevals skills list / path` CLI.
96
+ - 60+ new tests across schema round-trips, the push invariants, the
97
+ second-round stability property, loop staging + mode carryover, the YAML
98
+ loader, the skills locator, and the CLI cycle. mypy --strict + ruff clean.
99
+ - **Thread grouping** — traces can now be assembled into the conversation
100
+ thread they belong to. `RunInfo` gains `thread_id` + `thread_position`; the
101
+ OTel importer auto-detects the thread from `session.id` (OpenInference) or
102
+ `gen_ai.conversation.id` (OTel GenAI), without overwriting an explicit
103
+ caller-set `thread_id`. New read query `load_thread` + `GET
104
+ /workspaces/{ws}/threads/{thread_id}` return every trace sharing a thread,
105
+ ordered by `thread_position` (falling back to `started_at`), each turn
106
+ projected with its grader results so the per-turn grade is visible.
107
+ `TraceResponse` now surfaces `thread_id` / `thread_position`. This closes the
108
+ last trace-grouping gap versus LangSmith sessions; the run→experiment→
109
+ iteration→decision→grade chain already existed. Eight new tests.
110
+ - OTel importer now extracts prompt/completion **message content** into
111
+ traces. `_build_llm_span` reconstructs ordered message lists from both
112
+ attribute families — OpenInference native (`llm.input_messages.{i}.message.*`,
113
+ `llm.output_messages.{i}.message.*`) and the OTel GenAI alias
114
+ (`gen_ai.prompt.{i}.*`, `gen_ai.completion.{i}.*`). When both are present the
115
+ native family wins. Each side gets a stable `content_hash` (on
116
+ `messages_hash` / `output.content_hash`) for dedup and drift detection, and
117
+ the structured messages are kept inline under `provider_metadata`
118
+ (`selfevals.messages_in` / `selfevals.messages_out`). Closes the last gap
119
+ versus LangSmith trace capture: the actual prompt and response text are now
120
+ in the trace, not just tokens/model/stop_reason. Five new importer tests.
121
+
122
+ ## [0.1.0] - 2026-05-25
123
+
124
+ First version where the README no longer lies. `selfevals run` works
125
+ end-to-end against a real LLM agent, error paths are actionable, and
126
+ the markdown/JSON reports answer the obvious follow-up questions.
127
+ Schema-wise compatible with `0.0.9`.
128
+
129
+ ### Added — usable v1 surface
130
+
131
+ Examples and quickstart:
132
+ - `examples/hello_llm/` — a real Anthropic agent (with deterministic
133
+ fakes when `ANTHROPIC_API_KEY` is unset) over 3 EvalCases:
134
+ sentiment classification, structured extraction, open-ended support
135
+ reply. Two graders combined: `DeterministicGrader` for the rule
136
+ cases + `LLMJudgeGrader` for the open-ended one. `GridProposer`
137
+ sweeps `temperature ∈ {0.0, 0.5, 1.0}`.
138
+ - README quickstart points at `evals/experiments/example_pingpong.yaml`
139
+ with the exact commands. Status banner updated from "no runtime
140
+ yet" to "runtime functional".
141
+
142
+ CLI UX (Day 2):
143
+ - Every subcommand (`init`, `workspace`, `experiment`, `iteration`,
144
+ `report`, `run`, `compare`, `estimate`) now has a user-facing
145
+ one-line description and a copy-paste `Example:` epilog. Helper
146
+ `src/selfevals/cli/_help.py` centralizes the pattern.
147
+ - `tests/cli/test_help_texts.py` enforces the contract.
148
+ - `docs/adapters.md` documents the three adapters with YAML config,
149
+ per-adapter agent code, contracts, limitations, and a comparison
150
+ table.
151
+
152
+ Errors and hardening (Day 3):
153
+ - `SelfEvalsError` / `SelfEvalsUserError` hierarchy. User-correctable
154
+ failures exit with code 2 and a clean one-line message; internal
155
+ errors keep their traceback.
156
+ - `src/selfevals/cli/_friendly.py` is the single translation
157
+ chokepoint for YAML parse errors, dataset paths (with fuzzy-match
158
+ suggestions via stdlib `difflib`), missing graders, HTTP adapter
159
+ transport errors (URL + actionable suffix), and SQLite locked /
160
+ corrupted cases.
161
+ - `src/selfevals/graders/registry.py` — name→factory registry.
162
+ `deterministic` is pre-registered; `llm_judge` is registered
163
+ on-demand by the CLI. YAML can declare top-level `graders:` and
164
+ per-case `EvalCase.graders` filters which graders run.
165
+ - `tests/integration/test_full_loop_with_mocked_judge.py` — 7 tests
166
+ covering the happy path plus each of the five friendly-error
167
+ shapes.
168
+ - `docs/troubleshooting.md` documents the five common errors and
169
+ fixes.
170
+
171
+ Reporter (Day 4):
172
+ - `src/selfevals/reporter/_metrics.py` — pure helpers
173
+ (`compute_total_cost`, `compute_total_time_seconds`, etc.) that
174
+ return `None` when data is absent instead of misleading zeros.
175
+ - Markdown report gains a "Cost & Time" section (omitted gracefully
176
+ when there are no LLM calls) and a "Next steps" block with
177
+ copy-paste inspection commands.
178
+ - JSON report exposes a stable `cost_time` block (`None` when
179
+ missing).
180
+ - `src/selfevals/reporter/compare.py` powers `selfevals compare`:
181
+ proposal diff table, metrics diff table, failure-mode diff, and a
182
+ "B is better: primary +X; no new failure modes" recommendation.
183
+
184
+ ### Fixed
185
+
186
+ - Console script `selfevals` was pointing at `cli.main:app`, which
187
+ returns an int but never raises `SystemExit`, so user errors
188
+ silently exited 0. Now points at `cli.main:main`, which wraps `app`
189
+ in `SystemExit(...)`.
190
+ - `pyproject.toml` ruff `per-file-ignores` had no entry for
191
+ `src/selfevals/api/**`, so legitimate FastAPI `Depends(...)`
192
+ defaults were flagged as B008. Added the ignore.
193
+ - `pyproject.toml` `pytest.ini_options` was missing the `asyncio`
194
+ marker registration; `--strict-markers` was rejecting async tests.
195
+ - `EvalCase.graders` was unused metadata until now — the
196
+ `OptimizationLoop` now filters graders per case when the field is
197
+ populated, preserving the prior "run everything" behavior when it
198
+ is empty.
199
+
200
+ ### Known gaps (not blocking v0.1.0)
201
+
202
+ - 9 tests under `tests/sdk/` and `tests/runner/test_otlp_receiver.py`
203
+ require the `telemetry` extra (`uv sync --extra telemetry`) and
204
+ fail without it. They are excluded from the default surface.
205
+ - 3 tests under `tests/api/` require the `web` extra
206
+ (`uv sync --extra web`) to install FastAPI.
207
+ - Failure modes do not yet survive persistence to SQLite — the
208
+ compare and report tooling already handles their presence gracefully
209
+ for when the schema is extended. *(Resolved in [0.2.0]: error
210
+ analysis persists `failure_mode_counts`.)*
211
+ - `CliCommandAdapter` and `HttpEndpointAdapter` are not yet
212
+ auto-wired from YAML; users instantiate them via a Python
213
+ entrypoint. `docs/adapters.md` documents the workaround.
214
+
215
+ ## [0.0.9] - 2026-05-16
216
+
217
+ ### Added — MVP Block A: YAML loader + `selfevals run` end-to-end
218
+
219
+ Repo loader (`src/selfevals/repo/`):
220
+ - `load_experiment_spec(path)` parses `evals/experiments/<name>.yaml` →
221
+ `(workspace_id, Experiment, [EvalCase], AgentEntrypoint)`. YAML keys
222
+ are 1:1 with the Pydantic field names — no DSL translation; the
223
+ validators do all the shape checking.
224
+ - Cases can be inline (`dataset.cases_inline:`) or external JSONL
225
+ (`dataset.cases_path:`). Mutually exclusive; both empty rejected.
226
+ - Agent entrypoint declared as `module.path:callable_name`.
227
+ `resolve_agent_callable` defers import until the runner needs it
228
+ (lets `selfevals inspect` validate a spec without booting user code).
229
+ - 14 tests covering inline/external loading, workspace override,
230
+ missing fields, malformed YAML, invalid payloads, entrypoint
231
+ resolution.
232
+
233
+ CLI `selfevals run <yaml>`:
234
+ - Loads spec → resolves agent callable → wraps as `EmbeddedAdapter`
235
+ (str returns auto-coerced to `AdapterResponse`) → builds the
236
+ proposer per `experiment.proposer.strategy` (grid / random /
237
+ manual) → drives `OptimizationLoop` with `DecisionMatrixEvaluator`
238
+ + `DeterministicGrader` → emits markdown/JSON report.
239
+ - Flags: `--workspace`, `--max-iterations`, `--reps`, `--format`,
240
+ `--no-persist`.
241
+ - Persists `Experiment` + `IterationRecord` + `DecisionRecord` to
242
+ SQLite when storage is enabled; auto-seeds the workspace row.
243
+ - 6 tests covering markdown/JSON output, persistence to SQLite,
244
+ missing-spec error, validation, str→AdapterResponse coercion.
245
+
246
+ Example experiment:
247
+ - `evals/experiments/example_pingpong.yaml` + `evals/datasets/pingpong.jsonl` +
248
+ `selfevals.examples.pingpong` reference agent. Serves as smoke test
249
+ and onboarding artifact. `uv run selfevals run evals/experiments/example_pingpong.yaml --no-persist`
250
+ produces a clean report out of the box.
251
+
252
+ Refactor:
253
+ - `DecisionMatrixEvaluator` now inherits from `DecisionEvaluatorProtocol`
254
+ so the type checker recognizes it as a valid argument to
255
+ `OptimizationLoop(decision_evaluator=...)`.
256
+
257
+ 20 new tests (390 total). mypy strict + ruff clean. One new runtime
258
+ dep: `pyyaml>=6,<7`.
259
+
260
+ ### Added — Design docs for next implementation surfaces
261
+
262
+ - `docs/spec/sdk_otlp_design.md`: locked blueprint for the user-side
263
+ SDK façade (`selfevals.init()`) + embedded OTLP HTTP receiver +
264
+ OpenInference auto-instrumentation. Sections 1-11 cover the
265
+ decisions already made (no re-litigation), package layout, exact
266
+ signatures, span translation table, dependency tree (optional
267
+ extras), test plan, and acceptance criteria. ~1500-2000 LOC budget,
268
+ dedicated session.
269
+ - `docs/prompts/web_session_prompt.md`: self-contained prompt for the
270
+ Claude Code session that builds the web UI + SDK + OTLP receiver.
271
+ Includes product vibe (Stripe/Airbnb/ChatGPT/Claude/LangSmith/Mercury),
272
+ page inventory (8 surfaces), design tokens, stack recommendation,
273
+ backend contract, and "done" criteria.
274
+
275
+ ## [0.0.8] - 2026-05-16
276
+
277
+ ### Added — PR 8 + PR 9: Reporter + CLI
278
+
279
+ Reporter (`selfevals.reporter`):
280
+ - `render_markdown(result)` produces a PR-comment-style summary:
281
+ experiment header (name, goal, state, mode, proposer, iterations
282
+ run, termination reason), target + guardrail spec line,
283
+ best-iteration callout with parameters, per-iteration table
284
+ (`#`, primary, Δ vs running best, decision outcome, rationale —
285
+ with pipe-escaping and 80-char rationale truncation), and a
286
+ top-N failure-modes section drawn from
287
+ `IterationAggregate.failure_mode_counts`.
288
+ - `render_json(result)` emits a stable, machine-readable payload
289
+ (`schema_version=1`) keyed on iteration index, with explicit
290
+ best-iteration reference. JSON path is what the CLI's `--format
291
+ json` flag outputs.
292
+ - Pure: no I/O, no global state — callers decide where the strings
293
+ end up (stdout, a file, a GitHub PR comment).
294
+
295
+ CLI (`selfevals` console script, argparse-only, zero new deps):
296
+ - `selfevals init <slug>` — idempotent workspace seed via
297
+ `seed_workspace`; prints workspace id + member count.
298
+ - `selfevals workspace show <ws_id>` — workspace metadata +
299
+ experiment count.
300
+ - `selfevals experiment list <ws_id>` / `show <ws_id> <exp_id>` —
301
+ inspect experiments in storage with target + iteration progress.
302
+ - `selfevals iteration list <ws_id> <exp_id>` — per-iteration
303
+ primary metric + decision outcome.
304
+ - `selfevals report <ws_id> <exp_id> [--format markdown|json]` —
305
+ reconstructs an OptimizationResult from stored IterationRecords +
306
+ DecisionRecords (lossy on per-case GradeResults, lossless on
307
+ aggregates) and pipes it through the reporter.
308
+ - `selfevals compare <ws_id> <iter_a_id> <iter_b_id>` —
309
+ side-by-side primary metric diff between two iterations of the same
310
+ experiment.
311
+ - `selfevals estimate --cases N --space-size M --reps K
312
+ --cost-per-call X` — dry-run upper-bound on agent calls and
313
+ total USD cost before paying for a run.
314
+ - All user-facing errors (missing entity, primary-metric mismatch,
315
+ invalid numeric args) go through `CommandError` → `error: <msg>`
316
+ on stderr → exit code 2. Unexpected exceptions surface as
317
+ tracebacks (bugs, not user errors).
318
+
319
+ 18 new tests (9 reporter + 9 CLI; 370 total). mypy strict + ruff
320
+ clean. Zero new runtime deps — argparse + stdlib.
321
+
322
+ ## [0.0.7] - 2026-05-16
323
+
324
+ ### Added — PR 6 + PR 7: OptimizationLoop + Decision matrix
325
+
326
+ Proposers:
327
+ - `Proposer` ABC with `ProposerContext` (iteration index + history).
328
+ - `ManualProposer`: walk a caller-supplied list of `Proposal` or
329
+ parameter dicts; raises `SearchSpaceExhaustedError` when done.
330
+ - `GridProposer`: cartesian product over list-valued entries in
331
+ `experiment.search_space.model_params`; scalar entries are held
332
+ constant; empty list → raises ValueError.
333
+ - `RandomProposer`: independent uniform sampling from each parameter
334
+ spec (list, `{lo, hi}`, `{choices: [...]}`, or scalar constant).
335
+ Bounded by `max_proposals`; seeded for reproducibility.
336
+ - All proposals are re-validated against the experiment's editable
337
+ contract before being returned.
338
+
339
+ Aggregator:
340
+ - `aggregate_iteration(case_outcomes, primary_metric, reliability_metrics)`
341
+ computes pass@1 / pass@k / pass^k / consistency_rate /
342
+ stability_score / recovery_rate from per-case `CaseOutcome`s.
343
+ - Worst-of policy when multiple graders run on the same repetition:
344
+ ERROR > FAIL > PARTIAL > SKIPPED > PASS.
345
+ - Failure-mode counts aggregated by tag.
346
+ - Guardrail metrics (`cost_usd_per_case`, `latency_ms_per_case_avg`)
347
+ surfaced when traces report cost/duration.
348
+
349
+ OptimizationLoop:
350
+ - Transitions experiment state DRAFT → QUEUED → RUNNING → COMPLETED.
351
+ - For each iteration: ask proposer for a Proposal, run cases through
352
+ the Executor, score per-rep results with the configured graders,
353
+ aggregate, hand to a DecisionEvaluator, persist IterationRecord +
354
+ DecisionRecord (when a WorkspaceScope is provided).
355
+ - Terminates on `search_space_exhausted`, `converged`, or
356
+ `max_iterations`. Convergence = no improvement above
357
+ `min_delta` for `patience` consecutive iterations.
358
+
359
+ Decision matrix (PR 7):
360
+ - `evaluate_iteration` (pure) + `DecisionMatrixEvaluator` (object).
361
+ Applies the §10 canonical subset that powers MVP optimization:
362
+ guardrail check → first-iteration target check → improvement vs
363
+ baseline → regression handling per `Experiment.decision` policy
364
+ (reject / investigate / spawn_subexperiment) or guardrail policy
365
+ (reject / require_tradeoff_review).
366
+ - Missing guardrail metric values are treated as passing — the runner
367
+ doesn't synthesize every metric in MVP and we don't fail closed on
368
+ absent data.
369
+ - End-to-end integration test wires the evaluator into the loop and
370
+ verifies that improvement / no-improvement / regression each
371
+ produce the right DecisionRecord.outcome.
372
+
373
+ 47 new tests (352 total). mypy strict + ruff clean. Zero new deps.
374
+
375
+ ## [0.0.6] - 2026-05-16
376
+
377
+ ### Added — PR 5: Graders (deterministic + LLM judge + calibration)
378
+
379
+ - `Grader` ABC with `GraderContext` (case + trace + optional response)
380
+ and `GradeResult` (label / score / reason / confidence / failure_modes
381
+ / details). `GradeLabel` enum: pass, fail, partial, error, skipped.
382
+ - `DeterministicGrader`: reads rules off `EvalCase.expected`:
383
+ must_include, must_not_include, required_tools (looks at
384
+ ToolCallSpans in the trace), forbidden_tools, optional regex,
385
+ structured_output equality. Configurable case-sensitive mode. Each
386
+ rule emits a stable failure_mode tag for weighted scoring upstream.
387
+ - `LLMJudgeGrader`: invokes any `AgentAdapter` as a judge against a
388
+ rubric prompt (`RubricTemplate` with safe substitution). Parses the
389
+ judge's JSON output into a `JudgeDecision`; unknown labels and bad
390
+ JSON return `GradeLabel.ERROR` rather than crashing. Honors
391
+ `GraderCard.blocking` thresholds: when calibration falls below them the grader
392
+ returns SKIPPED ("degraded to advisory") unless `force=True`.
393
+ Single-judge in MVP; panel infrastructure-ready for post-MVP.
394
+ - Calibration helpers (`compute_classification_metrics`): pair
395
+ predictions with human labels by case_id; compute precision, recall,
396
+ F1 for the positive class plus macro-F1, accuracy, per-label
397
+ precision/recall, and confusion matrix. Counts high-risk false
398
+ negatives separately (the failure mode that wakes someone up).
399
+ Class-imbalance guard: undefined precision/recall return None.
400
+
401
+ 25 new tests (305 total). mypy strict + ruff clean. Zero new deps.
402
+
403
+ ## [0.0.5] - 2026-05-16
404
+
405
+ ### Added — PR 4: Runner (agent adapters + sandbox + executor)
406
+
407
+ - `AgentAdapter` ABC + `AdapterRequest`/`AdapterResponse` dataclasses;
408
+ the narrow contract between selfevals and the agent under test.
409
+ - `EmbeddedAdapter`: wraps a Python callable. Used for tests and
410
+ in-repo agents.
411
+ - `CliCommandAdapter`: subprocess + JSON-over-stdio. Configurable
412
+ command, env, timeout.
413
+ - `HttpEndpointAdapter`: POST JSON via stdlib `urllib` (no
414
+ third-party HTTP dep). Configurable headers + timeout.
415
+ - All three normalize errors into `AdapterError` with the original
416
+ cause preserved.
417
+ - `SandboxPolicy`: declarative mock/dry_run rules; `live_sandboxed`
418
+ and `live_canary` are accepted as enum values but `ensure_runnable()`
419
+ blocks them in MVP via `SandboxViolationError`.
420
+ - `Executor`: runs an `EvalCase` for N repetitions through a given
421
+ adapter + sandbox; assembles a `Trace` per repetition via
422
+ `TraceRecorder`. Records adapter LLM output as an `LLMCallSpan`,
423
+ each tool use as a `ToolCallSpan` (sandboxed flag per policy),
424
+ and adapter exceptions as `ErrorSpan` + `final_state=errored`.
425
+
426
+ 24 new tests (280 total). mypy strict + ruff clean. Zero new deps.
427
+
428
+ ## [0.0.4] - 2026-05-16
429
+
430
+ ### Added — PR 3: Trace ingestion (recorder + payload router + OTel importer)
431
+
432
+ - `PayloadRouter` — small payloads (≤4 KB by default) stay inline in
433
+ the Trace JSON; larger ones are written to the `ObjectStoreInterface`
434
+ and replaced with `oss://` pointers + sha256 hashes. Canonical
435
+ JSON encoding for dicts/lists guarantees stable hashing across key
436
+ order.
437
+ - `TraceRecorder` — context manager that captures spans during agent
438
+ execution. Span context managers: `agent_turn`, `llm_call`,
439
+ `tool_call`. Convenience emitters: `add_retrieval`,
440
+ `add_memory_read/write`, `add_decision`, `add_handoff`,
441
+ `add_human_intervention`, `add_guardrail_check`, `add_error`.
442
+ Accumulates trace-level metrics (LLM call count, tool call count,
443
+ token totals, retries). Tool call exceptions automatically mark
444
+ the span ERROR with type+message. Exiting the context with an
445
+ uncaught exception marks the trace ERRORED.
446
+ - `import_otel_spans` — adapter from a flat list of OTel-style span
447
+ dicts (gen_ai.*, openinference.*) to a selfevals Trace. Classifies
448
+ spans by `openinference.span.kind` / `gen_ai.*` presence,
449
+ normalizes finish reasons, preserves parent/child links, retains
450
+ unknown attributes in `provider_metadata` or CustomSpan.payload.
451
+ When TOOL spans carry call_ids without explicit linkage, the
452
+ importer synthesizes ToolUseRequest entries on the nearest LLM
453
+ span so the schema invariant holds; if no LLM span exists the
454
+ call_id is dropped silently.
455
+ - Public surface: `selfevals.trace` re-exports `PayloadRouter`,
456
+ `TraceRecorder`, `import_otel_spans`.
457
+
458
+ 26 new tests (256 total). mypy strict + ruff clean. Zero new deps.
459
+
460
+ ## [0.0.3] - 2026-05-16
461
+
462
+ ### Added — PR 2: Storage layer (SQLite + filesystem + workspace scoping)
463
+
464
+ - `StorageInterface` / `ObjectStoreInterface` / `WorkspaceScope` ABCs:
465
+ every read or write is bound to one `workspace_id`; cross-tenant
466
+ access is impossible by construction.
467
+ - `SQLiteStorage` with single generic `entities` table (entity_type, id,
468
+ workspace_id, version, timestamps, payload JSON) + `objects` table.
469
+ Indexes on (workspace_id, entity_type[, created/updated]) and a
470
+ partial deleted_at index. Optimistic concurrency on `version`.
471
+ WAL journal mode + foreign keys on.
472
+ - Homemade migration runner (no alembic dep): forward-only,
473
+ `mNNNN_<slug>.py` modules with `up(conn)`, tracked in
474
+ `_selfevalss_migrations`. Initial migration creates the tables.
475
+ - `FilesystemObjectStore`: content-addressed blobs at
476
+ `{root}/{workspace_id}/{prefix2}/{sha256}.bin`; pointer URI
477
+ `oss://{workspace_id}/sha256:...` encodes its workspace.
478
+ SHA256 integrity check on read; collision detected if same hash
479
+ resolves to different bytes.
480
+ - `seed_workspace(storage, slug, name, user_id, ...)` helper:
481
+ idempotent by (slug, owner), creates the Workspace + one Member
482
+ per `Role` (viewer, evaluator, experimenter, maintainer, admin,
483
+ auditor) when `assign_all_roles=True`.
484
+ - Errors: `EntityNotFoundError`, `WorkspaceMismatchError`,
485
+ `OptimisticConcurrencyError`, `ObjectNotFoundError`,
486
+ `PointerHashMismatchError`, `IntegrityViolationError`.
487
+
488
+ 33 new tests (230 total).
489
+
490
+ ## [0.0.2] - 2026-05-16
491
+
492
+ ### Added — PR 1: Schemas-first scaffolding (Pydantic v2)
493
+
494
+ Closed enums (`Role`, `Level`, `DatasetSource`, `GroundTruthMethod`,
495
+ `DatasetType`, `SandboxMode`, `RuntimeLocation`, `Mode`, `ProposerStrategy`,
496
+ `ExperimentState`, `SpanKind`, `StopReason`, `TraceState`,
497
+ `ToolCallStatus`, `PIIStatus`, `FeatureKind`/`Status`,
498
+ `AgentType`/`Status`, `FleetStatus`, `DatasetStatus`, `ToolStatus`,
499
+ `GraderCardState`, `DecisionOutcome`, `IterationState`, `Modality`).
500
+
501
+ Entities:
502
+ - `Workspace`, `Member` — multi-tenant primitives; workspace is
503
+ self-referential (its own workspace_id == id).
504
+ - `Tool` — first-class entity needed for `editable.tool_code`.
505
+ - `FeatureRegistry`, `RiskRegistry` — declarative taxonomies.
506
+ - `AgentFleet`, `Agent` — agent_type-discriminated payloads.
507
+ - `EvalCase` — taxonomy (level, feature, source, ground_truth,
508
+ runtime, dataset_type, risk), expected, failure_weights, blocking,
509
+ holdout, PII contract.
510
+ - `Dataset` — manifest with split_allocation, lazy statistics by
511
+ manifest_hash, regression-class immutability when frozen.
512
+ - `Experiment` — TargetSpec, EditableContract enforcing mode=agent_loop
513
+ for tool_code/workflow_graph/skills, SearchSpace, FrozenSnapshot,
514
+ ProposerSpec (MVP allows only manual/grid/random), RunSpec, JudgeDefenses
515
+ (live_canary requires outcome_metrics), ReliabilitySpec
516
+ (pass@N/pass^N/consistency_rate/...), DecisionPolicy, state machine.
517
+ - `IterationRecord`, `Proposal` (with `validate_against(experiment)` =
518
+ editable contract enforcement), `DecisionRecord` with automated +
519
+ human rationale.
520
+ - `GraderCard` with blocking thresholds contract (precision >= 0.90,
521
+ recall >= 0.95, max high-risk FNs == 0).
522
+ - `Annotation` with free-form labels + optional rubric_version.
523
+ - `Trace` schema (operational §B.2): RunInfo, AgentSnapshotRef,
524
+ EnvironmentInfo, FinalState, discriminated `Span` union (12 kinds),
525
+ TokenBreakdown with cache_read/cache_creation/reasoning, CostBreakdown,
526
+ ReasoningBlock with provider signature, LLMOutput with
527
+ tool_use_requested, ToolCallSpan.tool_use_id linkage validated
528
+ trace-wide.
529
+
530
+ Internal helpers: ULID + prefixed ULID id generation (stdlib only),
531
+ canonical content_hash (sha256), tz-aware UTC time helpers.
532
+
533
+ Tests: 197 unit tests covering every validator and enum; mypy strict
534
+ + ruff (E/W/F/I/B/UP/N/SIM/RUF) clean.
535
+
536
+ ## [0.0.1] - 2026-05-16
537
+
538
+ ### Added
539
+ - Initial repo scaffolding: `pyproject.toml`, ruff + mypy strict + pytest config.
540
+ - `docs/spec/` with canonical eval framework spec, operational spec v0.1, taxonomy notes.
@@ -0,0 +1,17 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Copyright 2026 Patricio Valdez
6
+
7
+ Licensed under the Apache License, Version 2.0 (the "License");
8
+ you may not use this file except in compliance with the License.
9
+ You may obtain a copy of the License at
10
+
11
+ http://www.apache.org/licenses/LICENSE-2.0
12
+
13
+ Unless required by applicable law or agreed to in writing, software
14
+ distributed under the License is distributed on an "AS IS" BASIS,
15
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16
+ See the License for the specific language governing permissions and
17
+ limitations under the License.