selfevals 0.2.2__tar.gz → 0.3.0__tar.gz

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
Files changed (204)
  1. {selfevals-0.2.2 → selfevals-0.3.0}/.gitignore +3 -0
  2. {selfevals-0.2.2 → selfevals-0.3.0}/CHANGELOG.md +42 -0
  3. {selfevals-0.2.2 → selfevals-0.3.0}/PKG-INFO +6 -4
  4. {selfevals-0.2.2 → selfevals-0.3.0}/README.md +4 -3
  5. selfevals-0.3.0/docs/FRONTEND.md +424 -0
  6. selfevals-0.3.0/docs/ROADMAP.md +178 -0
  7. selfevals-0.3.0/docs/SEALS_100X_GAP_PLAN.md +531 -0
  8. {selfevals-0.2.2 → selfevals-0.3.0}/docs/STATUS.md +55 -25
  9. {selfevals-0.2.2 → selfevals-0.3.0}/pyproject.toml +2 -1
  10. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/commands.py +2 -1
  11. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/base.py +1 -1
  12. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/deterministic.py +1 -1
  13. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/llm_judge.py +2 -2
  14. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/optimization/loop.py +37 -17
  15. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/adapters.py +64 -44
  16. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/executor.py +33 -19
  17. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/__init__.py +8 -0
  18. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/enums.py +9 -0
  19. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/eval_case.py +76 -5
  20. selfevals-0.3.0/src/selfevals/version.py +1 -0
  21. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_cli.py +2 -1
  22. selfevals-0.3.0/tests/conftest.py +41 -0
  23. {selfevals-0.2.2 → selfevals-0.3.0}/tests/decision/test_loop_integration.py +5 -2
  24. {selfevals-0.2.2 → selfevals-0.3.0}/tests/graders/test_deterministic.py +33 -23
  25. {selfevals-0.2.2 → selfevals-0.3.0}/tests/graders/test_llm_judge.py +33 -13
  26. {selfevals-0.2.2 → selfevals-0.3.0}/tests/integration/test_full_loop_with_mocked_judge.py +6 -4
  27. {selfevals-0.2.2 → selfevals-0.3.0}/tests/optimization/test_loop.py +81 -10
  28. {selfevals-0.2.2 → selfevals-0.3.0}/tests/optimization/test_loop_error_analysis.py +24 -15
  29. {selfevals-0.2.2 → selfevals-0.3.0}/tests/reporter/test_markdown.py +2 -1
  30. selfevals-0.3.0/tests/runner/test_adapters.py +225 -0
  31. {selfevals-0.2.2 → selfevals-0.3.0}/tests/runner/test_executor.py +110 -14
  32. selfevals-0.3.0/tests/schemas/test_eval_case.py +310 -0
  33. selfevals-0.2.2/src/selfevals/version.py +0 -1
  34. selfevals-0.2.2/tests/runner/test_adapters.py +0 -154
  35. selfevals-0.2.2/tests/schemas/test_eval_case.py +0 -131
  36. {selfevals-0.2.2 → selfevals-0.3.0}/LICENSE +0 -0
  37. {selfevals-0.2.2 → selfevals-0.3.0}/docs/adapters.md +0 -0
  38. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/error_analysis_design.md +0 -0
  39. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/evals_framework.md +0 -0
  40. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/operational_spec_v0.1.md +0 -0
  41. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/raw.md +0 -0
  42. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/sdk_otlp_design.md +0 -0
  43. {selfevals-0.2.2 → selfevals-0.3.0}/docs/spec/taxonomy.md +0 -0
  44. {selfevals-0.2.2 → selfevals-0.3.0}/docs/troubleshooting.md +0 -0
  45. {selfevals-0.2.2 → selfevals-0.3.0}/evals/datasets/pingpong.jsonl +0 -0
  46. {selfevals-0.2.2 → selfevals-0.3.0}/evals/experiments/example_pingpong.yaml +0 -0
  47. {selfevals-0.2.2 → selfevals-0.3.0}/examples/README.md +0 -0
  48. {selfevals-0.2.2 → selfevals-0.3.0}/examples/__init__.py +0 -0
  49. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_llm/__init__.py +0 -0
  50. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_llm/agent.py +0 -0
  51. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_llm/cases.jsonl +0 -0
  52. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_llm/experiment.yaml +0 -0
  53. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_openai/__init__.py +0 -0
  54. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_openai/agent.py +0 -0
  55. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_openai/cases.jsonl +0 -0
  56. {selfevals-0.2.2 → selfevals-0.3.0}/examples/hello_openai/experiment.yaml +0 -0
  57. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/.agents/skills/error-analysis/SKILL.md +0 -0
  58. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/__init__.py +0 -0
  59. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/_errors.py +0 -0
  60. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/_internal/__init__.py +0 -0
  61. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/_internal/hashing.py +0 -0
  62. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/_internal/ids.py +0 -0
  63. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/_internal/time.py +0 -0
  64. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/__init__.py +0 -0
  65. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/bundle.py +0 -0
  66. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/hypothesis.py +0 -0
  67. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/ingest.py +0 -0
  68. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/schemas.py +0 -0
  69. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/analysis/staging.py +0 -0
  70. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/__init__.py +0 -0
  71. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/__main__.py +0 -0
  72. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/app.py +0 -0
  73. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/broker.py +0 -0
  74. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/broker_bridge.py +0 -0
  75. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/queries.py +0 -0
  76. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/schemas.py +0 -0
  77. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/api/sse.py +0 -0
  78. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/__init__.py +0 -0
  79. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/_friendly.py +0 -0
  80. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/_help.py +0 -0
  81. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/analyze_commands.py +0 -0
  82. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/cli/main.py +0 -0
  83. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/decision/__init__.py +0 -0
  84. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/decision/matrix.py +0 -0
  85. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/examples/__init__.py +0 -0
  86. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/examples/evals/datasets/pingpong.jsonl +0 -0
  87. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/examples/evals/experiments/example_pingpong.yaml +0 -0
  88. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/examples/pingpong.py +0 -0
  89. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/__init__.py +0 -0
  90. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/calibration.py +0 -0
  91. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/graders/registry.py +0 -0
  92. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/optimization/__init__.py +0 -0
  93. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/optimization/aggregator.py +0 -0
  94. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/optimization/proposers.py +0 -0
  95. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/py.typed +0 -0
  96. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/repo/__init__.py +0 -0
  97. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/repo/loader.py +0 -0
  98. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/reporter/__init__.py +0 -0
  99. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/reporter/_metrics.py +0 -0
  100. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/reporter/compare.py +0 -0
  101. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/reporter/json_report.py +0 -0
  102. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/reporter/markdown.py +0 -0
  103. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/__init__.py +0 -0
  104. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/otlp_receiver.py +0 -0
  105. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/otlp_to_recorder.py +0 -0
  106. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/runner/sandbox.py +0 -0
  107. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/_base.py +0 -0
  108. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/annotation.py +0 -0
  109. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/dataset.py +0 -0
  110. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/experiment.py +0 -0
  111. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/failure_mode.py +0 -0
  112. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/fleet.py +0 -0
  113. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/grader_card.py +0 -0
  114. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/iteration.py +0 -0
  115. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/registry.py +0 -0
  116. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/tool.py +0 -0
  117. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/trace.py +0 -0
  118. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/schemas/workspace.py +0 -0
  119. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/sdk/__init__.py +0 -0
  120. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/sdk/auto_instrument.py +0 -0
  121. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/sdk/context.py +0 -0
  122. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/sdk/exporter.py +0 -0
  123. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/sdk/facade.py +0 -0
  124. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/skills/__init__.py +0 -0
  125. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/__init__.py +0 -0
  126. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/errors.py +0 -0
  127. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/filesystem.py +0 -0
  128. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/interface.py +0 -0
  129. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/migrations/__init__.py +0 -0
  130. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/migrations/m0001_initial.py +0 -0
  131. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/seed.py +0 -0
  132. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/storage/sqlite.py +0 -0
  133. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/trace/__init__.py +0 -0
  134. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/trace/otel_importer.py +0 -0
  135. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/trace/payload_router.py +0 -0
  136. {selfevals-0.2.2 → selfevals-0.3.0}/src/selfevals/trace/recorder.py +0 -0
  137. {selfevals-0.2.2 → selfevals-0.3.0}/tests/__init__.py +0 -0
  138. {selfevals-0.2.2 → selfevals-0.3.0}/tests/analysis/__init__.py +0 -0
  139. {selfevals-0.2.2 → selfevals-0.3.0}/tests/analysis/test_handshake.py +0 -0
  140. {selfevals-0.2.2 → selfevals-0.3.0}/tests/api/__init__.py +0 -0
  141. {selfevals-0.2.2 → selfevals-0.3.0}/tests/api/test_api.py +0 -0
  142. {selfevals-0.2.2 → selfevals-0.3.0}/tests/api/test_broker.py +0 -0
  143. {selfevals-0.2.2 → selfevals-0.3.0}/tests/api/test_sse.py +0 -0
  144. {selfevals-0.2.2 → selfevals-0.3.0}/tests/api/test_threads.py +0 -0
  145. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/__init__.py +0 -0
  146. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/helpers_str_agent.py +0 -0
  147. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_analyze_cli.py +0 -0
  148. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_cli_run.py +0 -0
  149. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_compare.py +0 -0
  150. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_examples_cli.py +0 -0
  151. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_help_texts.py +0 -0
  152. {selfevals-0.2.2 → selfevals-0.3.0}/tests/cli/test_skills_cli.py +0 -0
  153. {selfevals-0.2.2 → selfevals-0.3.0}/tests/decision/__init__.py +0 -0
  154. {selfevals-0.2.2 → selfevals-0.3.0}/tests/decision/test_matrix.py +0 -0
  155. {selfevals-0.2.2 → selfevals-0.3.0}/tests/examples/__init__.py +0 -0
  156. {selfevals-0.2.2 → selfevals-0.3.0}/tests/examples/test_hello_llm.py +0 -0
  157. {selfevals-0.2.2 → selfevals-0.3.0}/tests/graders/__init__.py +0 -0
  158. {selfevals-0.2.2 → selfevals-0.3.0}/tests/graders/test_calibration.py +0 -0
  159. {selfevals-0.2.2 → selfevals-0.3.0}/tests/integration/__init__.py +0 -0
  160. {selfevals-0.2.2 → selfevals-0.3.0}/tests/optimization/__init__.py +0 -0
  161. {selfevals-0.2.2 → selfevals-0.3.0}/tests/optimization/test_aggregator.py +0 -0
  162. {selfevals-0.2.2 → selfevals-0.3.0}/tests/optimization/test_proposers.py +0 -0
  163. {selfevals-0.2.2 → selfevals-0.3.0}/tests/repo/__init__.py +0 -0
  164. {selfevals-0.2.2 → selfevals-0.3.0}/tests/repo/test_loader.py +0 -0
  165. {selfevals-0.2.2 → selfevals-0.3.0}/tests/reporter/__init__.py +0 -0
  166. {selfevals-0.2.2 → selfevals-0.3.0}/tests/reporter/test_metrics.py +0 -0
  167. {selfevals-0.2.2 → selfevals-0.3.0}/tests/runner/__init__.py +0 -0
  168. {selfevals-0.2.2 → selfevals-0.3.0}/tests/runner/test_otlp_receiver.py +0 -0
  169. {selfevals-0.2.2 → selfevals-0.3.0}/tests/runner/test_otlp_to_recorder.py +0 -0
  170. {selfevals-0.2.2 → selfevals-0.3.0}/tests/runner/test_sandbox.py +0 -0
  171. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/__init__.py +0 -0
  172. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_annotation.py +0 -0
  173. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_base.py +0 -0
  174. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_cross_entity.py +0 -0
  175. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_dataset.py +0 -0
  176. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_enums.py +0 -0
  177. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_error_analysis_spec.py +0 -0
  178. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_experiment.py +0 -0
  179. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_failure_mode.py +0 -0
  180. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_fleet.py +0 -0
  181. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_grader_card.py +0 -0
  182. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_iteration.py +0 -0
  183. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_registry.py +0 -0
  184. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_tool.py +0 -0
  185. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_trace.py +0 -0
  186. {selfevals-0.2.2 → selfevals-0.3.0}/tests/schemas/test_workspace.py +0 -0
  187. {selfevals-0.2.2 → selfevals-0.3.0}/tests/sdk/__init__.py +0 -0
  188. {selfevals-0.2.2 → selfevals-0.3.0}/tests/sdk/test_auto_instrument.py +0 -0
  189. {selfevals-0.2.2 → selfevals-0.3.0}/tests/sdk/test_facade.py +0 -0
  190. {selfevals-0.2.2 → selfevals-0.3.0}/tests/skills/__init__.py +0 -0
  191. {selfevals-0.2.2 → selfevals-0.3.0}/tests/skills/test_skills_locator.py +0 -0
  192. {selfevals-0.2.2 → selfevals-0.3.0}/tests/storage/__init__.py +0 -0
  193. {selfevals-0.2.2 → selfevals-0.3.0}/tests/storage/test_filesystem_object_store.py +0 -0
  194. {selfevals-0.2.2 → selfevals-0.3.0}/tests/storage/test_migrations.py +0 -0
  195. {selfevals-0.2.2 → selfevals-0.3.0}/tests/storage/test_seed.py +0 -0
  196. {selfevals-0.2.2 → selfevals-0.3.0}/tests/storage/test_sqlite_storage.py +0 -0
  197. {selfevals-0.2.2 → selfevals-0.3.0}/tests/test_internal_hashing.py +0 -0
  198. {selfevals-0.2.2 → selfevals-0.3.0}/tests/test_internal_ids.py +0 -0
  199. {selfevals-0.2.2 → selfevals-0.3.0}/tests/test_internal_time.py +0 -0
  200. {selfevals-0.2.2 → selfevals-0.3.0}/tests/test_smoke.py +0 -0
  201. {selfevals-0.2.2 → selfevals-0.3.0}/tests/trace/__init__.py +0 -0
  202. {selfevals-0.2.2 → selfevals-0.3.0}/tests/trace/test_otel_importer.py +0 -0
  203. {selfevals-0.2.2 → selfevals-0.3.0}/tests/trace/test_payload_router.py +0 -0
  204. {selfevals-0.2.2 → selfevals-0.3.0}/tests/trace/test_recorder.py +0 -0
@@ -51,6 +51,9 @@ Thumbs.db
51
51
  .bootstrap/
52
52
  data/
53
53
 
54
+ # Agent worktrees
55
+ .claude/
56
+
54
57
  # Secrets
55
58
  .env
56
59
  .env.*
@@ -7,10 +7,52 @@ Versions follow [SemVer](https://semver.org/).
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.3.0] - 2026-05-27
11
+
12
+ ### Added
13
+
14
+ - **Validated multi-turn conversation input.** When `EvalCase.input`
15
+ carries a `messages` key it is validated as a typed conversation:
16
+ non-empty message list, roles from a new `MessageRole` enum
17
+ (system/user/assistant/tool), content as a string or a list of
18
+ content blocks, multimodal-aware via the `Modality` enum. New
19
+ `Message`, `ContentBlock`, and `ConversationInput` models, plus
20
+ `EvalCase.conversation()` / `EvalCase.is_conversation()` accessors.
21
+ Inputs without a `messages` key remain opaque payloads, so the
22
+ field stays a plain JSON dict that adapters receive verbatim.
23
+ - **Async-first evaluators.** `AgentAdapter.invoke` and `Grader.grade`
24
+ are now async. The executor runs repetitions concurrently and the
25
+ optimization loop grades concurrently, each bounded by a
26
+ configurable semaphore (`concurrency` / `grade_concurrency`,
27
+ default 8). `EmbeddedAdapter` accepts sync or async callables,
28
+ `CliCommandAdapter` uses an asyncio subprocess, and
29
+ `HttpEndpointAdapter` is native async on httpx. `asyncio.run` is
30
+ confined to the CLI edge.
31
+
32
+ ### Changed
33
+
34
+ - `httpx` is now a runtime dependency (the default HTTP adapter
35
+ transport), not just a dev dependency.
36
+
37
+ ### Documentation
38
+
39
+ - STATUS.md and README banners read v0.3.0; multi-turn input and async
40
+ evaluators moved into "What works"; test counts refreshed (default
41
+ surface 559, full 597); roadmap records both as shipped in 0.3.0.
42
+
10
43
  ## [0.2.2] - 2026-05-27
11
44
 
12
45
  ### Documentation
13
46
 
47
+ - STATUS.md and README banners now read v0.2.2 (they had lagged at
48
+ v0.2.1 despite the 0.2.2 release). Refreshed the STATUS body against
49
+ the current tree: test counts (default surface 481 -> 528, full
50
+ surface 566, extras-gated 9 -> 24), and the forward-looking
51
+ "What v0.2 will probably contain" section became a "Roadmap" that
52
+ separates what shipped in 0.2.x from what remains on the backlog.
53
+
54
+ ### Documentation
55
+
14
56
  - Onboarding pass after the `bootstrap` -> `selfevals` rename. Fixed the
15
57
  CI mypy target (`src/bootstrap` -> `src/selfevals`) and 13 stale
16
58
  `bootstrap` CLI/prose references in the bundled error-analysis skill.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: selfevals
3
- Version: 0.2.2
3
+ Version: 0.3.0
4
4
  Summary: Self-improving evals framework for AI agents.
5
5
  Project-URL: Homepage, https://github.com/patovaldezf/selfevals
6
6
  Project-URL: Repository, https://github.com/patovaldezf/selfevals
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.12
18
18
  Classifier: Programming Language :: Python :: 3.13
19
19
  Classifier: Topic :: Software Development :: Testing
20
20
  Requires-Python: >=3.12
21
+ Requires-Dist: httpx<1,>=0.27
21
22
  Requires-Dist: pydantic<3,>=2.7
22
23
  Requires-Dist: pyyaml<7,>=6
23
24
  Provides-Extra: all
@@ -94,9 +95,10 @@ configuration to keep. CLI-first, multi-tenant from day one, and agnostic
94
95
  to the agent framework underneath — selfevals never calls your provider;
95
96
  your agent does, and selfevals grades the result.
96
97
 
97
- > Status: **v0.2.1 — runtime functional.** The CLI works end-to-end:
98
+ > Status: **v0.3.0 — runtime functional.** The CLI works end-to-end:
98
99
  > load an experiment spec → run cases through an adapter → grade traces →
99
- > persist iterations → render a report. See [`docs/spec/`](docs/spec/) for
100
+ > persist iterations → render a report. Adapters and graders are async,
101
+ > with concurrent repetitions and grading. See [`docs/spec/`](docs/spec/) for
100
102
  > the canonical and operational specs that drive design, and
101
103
  > [`docs/STATUS.md`](docs/STATUS.md) for an honest what-works / what-doesn't
102
104
  > snapshot.
@@ -153,7 +155,7 @@ The five nouns you'll meet everywhere:
153
155
 
154
156
  | Term | What it is |
155
157
  |------|------------|
156
- | **EvalCase** | One test: an input, the expected outcome, and which graders apply. |
158
+ | **EvalCase** | One test: an input (a validated multi-turn `messages` conversation, or any opaque payload), the expected outcome, and which graders apply. |
157
159
  | **Adapter** | The bridge to your agent — embedded callable, CLI subprocess, or HTTP endpoint. selfevals calls *it*, never the provider directly. |
158
160
  | **Grader** | Scores a trace. `DeterministicGrader` (rules: substrings, tools, JSON schema) or `LLMJudgeGrader` (a rubric-driven judge). |
159
161
  | **Proposer** | Picks the next parameter configuration to try — `manual`, `grid`, or `random`. |
@@ -9,9 +9,10 @@ configuration to keep. CLI-first, multi-tenant from day one, and agnostic
9
9
  to the agent framework underneath — selfevals never calls your provider;
10
10
  your agent does, and selfevals grades the result.
11
11
 
12
- > Status: **v0.2.1 — runtime functional.** The CLI works end-to-end:
12
+ > Status: **v0.3.0 — runtime functional.** The CLI works end-to-end:
13
13
  > load an experiment spec → run cases through an adapter → grade traces →
14
- > persist iterations → render a report. See [`docs/spec/`](docs/spec/) for
14
+ > persist iterations → render a report. Adapters and graders are async,
15
+ > with concurrent repetitions and grading. See [`docs/spec/`](docs/spec/) for
15
16
  > the canonical and operational specs that drive design, and
16
17
  > [`docs/STATUS.md`](docs/STATUS.md) for an honest what-works / what-doesn't
17
18
  > snapshot.
@@ -68,7 +69,7 @@ The five nouns you'll meet everywhere:
68
69
 
69
70
  | Term | What it is |
70
71
  |------|------------|
71
- | **EvalCase** | One test: an input, the expected outcome, and which graders apply. |
72
+ | **EvalCase** | One test: an input (a validated multi-turn `messages` conversation, or any opaque payload), the expected outcome, and which graders apply. |
72
73
  | **Adapter** | The bridge to your agent — embedded callable, CLI subprocess, or HTTP endpoint. selfevals calls *it*, never the provider directly. |
73
74
  | **Grader** | Scores a trace. `DeterministicGrader` (rules: substrings, tools, JSON schema) or `LLMJudgeGrader` (a rubric-driven judge). |
74
75
  | **Proposer** | Picks the next parameter configuration to try — `manual`, `grid`, or `random`. |
@@ -0,0 +1,424 @@
1
+ # selfevals — Frontend Spec ("nuestro propio LangSmith")
2
+
3
+ > Spec completa del frontend de selfevals: la web UI para **visibilidad total** de evals —
4
+ > runs, debug detallado, drill-down de qué hace el agente, versionado de datasets/cases,
5
+ > latencia/TTFT, comparación de iteraciones, dashboards de failure modes. La promesa del
6
+ > `evals_framework.md` §11: "desarrollar nuestro propio LangSmith".
7
+ >
8
+ > Este doc cubre: estado actual (qué ya existe), arquitectura, cada vista (existente +
9
+ > faltante), cada endpoint (existente + faltante con contrato), SSE/live, auth/roles, UX por
10
+ > pantalla, y el roadmap FE por fases.
11
+ >
12
+ > **No es greenfield.** Ya hay una SvelteKit app + API FastAPI + SSE de traces en vivo
13
+ > funcionando. Stack decidido en [`docs/web/decisions.md`](web/decisions.md). El backend que
14
+ > el FE espeja evoluciona según [`docs/ROADMAP.md`](ROADMAP.md).
15
+
16
+ Fecha: 2026-05-27.
17
+
18
+ ---
19
+
20
+ ## 0. Stack (ya decidido — no relitigar)
21
+
22
+ | Capa | Elección | Nota |
23
+ |------|----------|------|
24
+ | Framework | **SvelteKit 2.61 + Svelte 5.55** | `+page.server.ts` load encaja con superficies read-mostly. Mismo design language que `pato-os`. |
25
+ | Deploy | Vercel (web) + FastAPI sidecar (Python API) | adapter-node hoy; Vercel adapter objetivo. |
26
+ | API | REST plano + tipos TS espejo | Pydantic v2 es la fuente de verdad. OpenAPI gratis en `/api/openapi.json`; futuro `openapi-typescript` para codegen. |
27
+ | Server-state | `@tanstack/svelte-query` | UI-state en stores nativos. Routing-state en URL params (`$page`). |
28
+ | Tablas | `@tanstack/svelte-table` | columnas/sorting/filtering. |
29
+ | Charts | LayerCake (D3-on-Svelte) | sparklines, anchor-set, barras de clusters. |
30
+ | CSS | Tailwind v4 + shadcn-svelte/bits-ui | primitivos accesibles, skin propio vía design tokens. |
31
+ | Fonts | Inter + JetBrains Mono (self-hosted) | tabular numerals en celdas mono. |
32
+ | Auth | stub `X-SelfEvals-User: local` | sin auth real en MVP; el header viaja para enchufar auth después sin tocar pantallas. |
33
+
34
+ **Nota de doc obsoleto:** `docs/web/decisions.md:72-78` lista "live trace streaming" y "SSE
35
+ para run progress" como *out of scope v0*. **Ya están implementados** (`api/sse.py`,
36
+ `api/broker.py`). Este doc es la fuente actualizada.
37
+
38
+ ---
39
+
40
+ ## 1. Arquitectura actual
41
+
42
+ ```
43
+ ┌─────────────────────┐   REST + SSE    ┌──────────────────────────┐
44
+ │ SvelteKit (web/)    │ ───────────────▶│ FastAPI (api/)           │
45
+ │ +page.server.ts     │                 │ app.py · queries.py      │
46
+ │ load() → fetch      │◀─ EventSource ──│ sse.py · broker.py       │
47
+ └─────────────────────┘                 └────────────┬─────────────┘
48
+                                                      │ WorkspaceScope
49
+
50
+                                         ┌──────────────────────────┐
51
+                                         │ SQLite + filesystem store │
52
+                                         │ (storage/)               │
53
+                                         └────────────┬─────────────┘
54
+                                                      ▲ publish spans
55
+                          OTLP receiver (runner/) ────┘ (broker_bridge)
56
+ ```
57
+
58
+ - **Web y API son servicios desacoplados.** El CLI orquesta los runs (`selfevals run`); la
59
+ web lee resultados terminados y traces en vivo cuando un run está en curso y emite spans
60
+ vía el broker.
61
+ - **Aislamiento por workspace** estructural en el storage (`storage/interface.py` —
62
+ `WorkspaceScope`). Sin auth en la capa de storage; el caller garantiza el `workspace_id`.
63
+ - **API hoy es read-mostly**: ~12 GET + 1 POST (crear workspace). Toda la mutación del
64
+ lifecycle de experimentos pasa por el CLI.
65
+
66
+ ### Cómo se arranca hoy
67
+ `python -m selfevals.api` (uvicorn): `--host` (def 127.0.0.1), `--port` (def 8000),
68
+ `--db` (def `./selfevals.sqlite`), `--reload`. Env `SELFEVALS_DB` como fallback.
69
+ **No existe `selfevals serve`** (ver §6).
70
+
71
+ ---
72
+
73
+ ## 2. Modelo de datos que el FE visualiza
74
+
75
+ La API expone *view models* (denormalizados) en `api/schemas.py`. Las entidades canónicas
76
+ viven en `src/selfevals/schemas/` (fuente de verdad). El FE consume las view shapes.
77
+
78
+ ### Jerarquía
79
+ ```
80
+ Workspace
81
+ └─ Experiment (target, guardrails, editable, search_space, proposer, run, ...)
82
+ └─ IterationRecord (hypothesis, proposed_parameters, metrics, decision)
83
+ └─ DecisionRecord (outcome, rationale automated/human, metrics_snapshot)
84
+ └─ Trace[] (uno por run/rep; multi-turno = uno por turno, mismo thread_id)
85
+ └─ Span[] (discriminated union por kind)
86
+ └─ GraderResult[] (label, score, failure_modes, [breakdown ← #3 futuro])
87
+ ```
88
+
89
+ ### View models (api/schemas.py)
90
+ - **WorkspaceSummary**: id, slug, name, owner_id, created_at, experiment_count, last_run_at.
91
+ - **WorkspaceResponse**: + `recent_health` (fracción de iteraciones recientes en
92
+ keep_candidate).
93
+ - **ExperimentSummary**: id, name, goal, mode, state, primary_metric, primary_target
94
+ {operator,value}, proposer_strategy, max_iterations, iteration_count, timestamps.
95
+ - **ExperimentDetailResponse**: summary + `result` (JSON de `reporter.render_json`, null si
96
+ no ha corrido) + iterations[].
97
+ - **IterationSummary**: id, iteration, state, hypothesis, proposed_parameters,
98
+ primary_metric_name/value, **delta_vs_best**, decision_outcome, decision_rationale,
99
+ cost_usd, duration_seconds, trace_run_ids[], created_at.
100
+ - **TraceResponse**: id, run_id, experiment_id, iteration, thread_id, thread_position,
101
+ final_state, started_at/ended_at, spans[], metrics{}.
102
+ - **SpanSummary**: id, parent_id, kind, name, started_at, duration_ms, **detail{}**
103
+ (campos kind-specific filtrados — p.ej. LLM: provider/model/stop_reason).
104
+ - **ThreadTurn** / **ThreadResponse**: traces con mismo thread_id ensamblados como
105
+ conversación ordenada (por thread_position, luego started_at); cada turn carga
106
+ primary_grade + grader_results.
107
+ - **AnchorPoint**: experiment_id, experiment_name, iteration, primary_metric_name/value,
108
+ decision_outcome, created_at (vista longitudinal de tendencia por workspace).
109
+
110
+ ### Span kinds (lo que el trace viewer debe renderizar)
111
+ `AgentTurn · LLMCall · ToolCall · Retrieval · MemoryRead · MemoryWrite · Decision ·
112
+ Handoff · HumanIntervention · GuardrailCheck · Error · Custom`. Cada uno con payload propio
113
+ (LLMCall: tokens/cost/TTFT/reasoning; ToolCall: tool_use_id/args/result/status/sandboxed;
114
+ Retrieval: query/top_k/retrieved docs/reranker; Decision: chosen/alternatives/rationale).
115
+
116
+ ---
117
+
118
+ ## 3. Endpoints existentes
119
+
120
+ Base `/api/`, sin prefijo de versión. CORS para `localhost:5173`. Header `X-SelfEvals-User`
121
+ (def "local").
122
+
123
+ ### Read-only (GET)
124
+ | Endpoint | Devuelve | Filtros |
125
+ |----------|----------|---------|
126
+ | `/api/health` | HealthResponse | — |
127
+ | `/api/workspaces` | WorkspaceListResponse | — |
128
+ | `/api/workspaces/{ws}` | WorkspaceResponse | — |
129
+ | `/api/workspaces/{ws}/experiments` | list[ExperimentSummary] | limit (1–500, def 100) |
130
+ | `/api/workspaces/{ws}/experiments/{id}` | ExperimentDetailResponse | — |
131
+ | `/api/workspaces/{ws}/experiments/{id}/iterations` | IterationListResponse | — |
132
+ | `/api/workspaces/{ws}/experiments/{id}/decisions` | list[dict] | — |
133
+ | `/api/workspaces/{ws}/iterations/{id}` | dict {iteration, decision} | — |
134
+ | `/api/workspaces/{ws}/traces/{trace_id}` | TraceResponse | acepta run_id como fallback |
135
+ | `/api/workspaces/{ws}/threads/{thread_id}` | ThreadResponse | — |
136
+ | `/api/runs/active` | list[{workspace_id, run_id}] | — |
137
+ | `/api/workspaces/{ws}/anchor-set` | list[AnchorPoint] | — |
138
+
139
+ ### Streaming (SSE)
140
+ | Endpoint | Eventos |
141
+ |----------|---------|
142
+ | `/api/workspaces/{ws}/traces/{run_id}/stream` | `snapshot` (trace completo) · `span` (uno) · `ping` (heartbeat 15s) · `complete` (final_state) |
143
+
144
+ ### Write
145
+ | Endpoint | Método | Body |
146
+ |----------|--------|------|
147
+ | `/api/workspaces` | POST | CreateWorkspaceRequest {slug, name?, description?} → 201 |
148
+
149
+ ---
150
+
151
+ ## 4. Rutas web existentes
152
+
153
+ Routing por archivos de SvelteKit. Cliente tipado en `lib/api/client.ts`; SSE helper en
154
+ `lib/api/sse.ts` (`openTraceStream(ws, runId, handlers)`).
155
+
156
+ | Ruta | Estado | Qué hace |
157
+ |------|--------|----------|
158
+ | `/` | ✅ funcional | Lista de workspaces; error si la API no responde. |
159
+ | `/[workspace]` | ✅ funcional | Detalle: tabla de experimentos con sparkline de tendencia, chips (exp count, recent_health, anchor points), recientes. Secciones skeleton "failure clusters (soon)" + datasets. |
160
+ | `/[workspace]/experiments` | 🟡 scaffolded | Lista completa de experimentos. |
161
+ | `/[workspace]/experiments/[experiment]` | ✅ funcional | 3 tabs: **Iterations** (tabla hypothesis/params/metric/delta/decision/rationale), **Compare** (picker side-by-side con delta), **Decisions** (audit trail). Sidebar al clickear iteración: detalle completo. |
162
+ | `/[workspace]/anchor-set` | 🟡 skeletal | Vista longitudinal de anchor points. |
163
+ | `/[workspace]/traces/[trace]` | ✅ funcional + **live** | Inspector de trace. Sidebar izq: árbol de spans jerárquico. Main: detalle del span seleccionado con facetas kind-specific. **SSE**: actualiza el árbol en vivo, pill "live" mientras el stream está activo. |
164
+ | `/[workspace]/clusters` | ❌ stub | Placeholder; necesita failure-clusters API (§7.3). |
165
+ | `/[workspace]/datasets` | ❌ stub | Placeholder; necesita datasets + cases API (§7.2). |
166
+
167
+ ### Componentes existentes
168
+ `AppShell` (layout) · `DecisionBadge` (outcome → badge de color) · `MetricChip` (label+value,
169
+ formato number/percent/plain) · `SpanNode` (nodo recursivo del árbol) · `Sparkline`
170
+ (LayerCake) · `ActiveRunsPill` (indicador de runs en vivo).
171
+
172
+ ---
173
+
174
+ ## 5. Vistas faltantes (el camino a paridad LangSmith)
175
+
176
+ Cada vista nueva refleja una capacidad del [ROADMAP](ROADMAP.md). Marcadas con la capacidad
177
+ backend de la que dependen.
178
+
179
+ ### 5.1 Funnel drill-down · depende de #3 (breakdown)
180
+ **Dónde:** dentro del trace viewer y del experiment detail.
181
+ **Qué:** un `GraderResult` con `breakdown: BreakdownNode` (árbol recursivo
182
+ key/label/score/weight/children) se renderiza como **funnel**: por nivel, pass-rate +
183
+ dónde se cae el flujo. En el experiment detail, agregado:
184
+ `IterationAggregate.funnel` → tabla/barras por `key` ("routing 0.92 → tool_order 0.71 →
185
+ final 0.65"). En el trace, el breakdown del grader de ese run.
186
+ **Componente nuevo:** `FunnelBreakdown.svelte` (árbol indentado + barras LayerCake).
187
+
188
+ ### 5.2 Trajectory viewer · depende de #4 (TrajectoryGrader)
189
+ **Dónde:** trace viewer, capa sobre el árbol de spans.
190
+ **Qué:** visualiza la **secuencia** de tool calls / decisiones (no solo el árbol jerárquico).
191
+ Resalta failure modes diagnósticos (`wrong_tool_order`, `tool_loop_overrun`,
192
+ `missing_routing_decision`, `redundant_retrieval`) como anotaciones sobre los spans, **sin**
193
+ marcar el run como fail (la trayectoria es diagnóstica, no gate — ver ROADMAP #4). Timeline
194
+ horizontal de spans con duración (waterfall), badges de modo diagnóstico.
195
+ **Componente nuevo:** `TrajectoryTimeline.svelte` (waterfall) + `DiagnosticBadge.svelte`.
196
+
197
+ ### 5.3 Thread viewer (multi-turno) · depende de #2 (executor) — endpoint YA existe
198
+ **Dónde:** ruta nueva `/[workspace]/threads/[thread]`.
199
+ **Qué:** el endpoint `GET /api/workspaces/{ws}/threads/{thread_id}` (ThreadResponse) **ya existe**. Falta la
200
+ **ruta web**. Renderiza la conversación turn-by-turn: cada turn = burbuja (user/assistant)
201
+ con su trace enlazado, su primary_grade y grader_results. Por-turno: link al trace viewer.
202
+ Cuando exista #2 (executor real) + #15 (simulador), distinguir turnos `user_simulator` de
203
+ usuario real (tag en provider_metadata).
204
+ **Componente nuevo:** `ThreadConversation.svelte` (burbujas) + `TurnGradeChip.svelte`.
205
+
206
+ ### 5.4 Judge panel / calibración · depende de #17
207
+ **Dónde:** dentro del trace viewer (cuando el grader es panel) + ruta nueva
208
+ `/[workspace]/judges`.
209
+ **Qué:** para un `JudgePanelGrader`: mostrar el **consenso** (majority/unanimous/weighted),
210
+ el voto de cada juez miembro, y la variance de counterfactuals (paráfrasis). Vista de
211
+ calibración: precision/recall/F1/macro-F1 del juez vs labels humanos (de `calibration.py`),
212
+ y la cola de human spot-check (`Annotation` stubs pendientes de revisar).
213
+ **Componentes nuevos:** `JudgeConsensus.svelte` (votos + consenso) · `CalibrationMatrix.svelte`
214
+ (confusion + métricas) · `SpotCheckQueue.svelte` (cola de anotación).
215
+
216
+ ### 5.5 Failure clusters dashboard · depende de error-analysis API (§7.3)
217
+ **Dónde:** ruta `/[workspace]/clusters` (hoy stub).
218
+ **Qué:** taxonomía de failure modes del workspace (entidad `FailureMode`, lifecycle
219
+ CANDIDATE→OFFICIAL→RETIRED). Por modo: frecuencia, tendencia entre iteraciones, severidad,
220
+ casos enlazados. Acciones (gated): promote/retire/merge modes. El backend ya tiene esto en
221
+ CLI (`selfevals failuremode`, `analyze pull/push`) — falta exponerlo por API.
222
+ **Componentes nuevos:** `FailureModeTable.svelte` · `ModeTrend.svelte` (LayerCake) ·
223
+ `ModeLifecycleControls.svelte`.
224
+
225
+ ### 5.6 Datasets + cases browser · depende de datasets API (§7.2)
226
+ **Dónde:** ruta `/[workspace]/datasets` (hoy stub).
227
+ **Qué:** lista de datasets (smoke/golden/regression/capability/...), su SplitAllocation
228
+ (optimization/holdout/reliability), statistics (by_level/feature/source/pii, holdout_count),
229
+ status (draft/active/frozen). Drill-down a cases: filtrar por taxonomy (level/feature/source/
230
+ dataset_type), ver `input`/`expected`/graders/failure_weights/pii_status. **Versionado de
231
+ datasets/cases** (la promesa §11). Edición de cases es post-MVP.
232
+ **Componentes nuevos:** `DatasetList.svelte` · `CaseTable.svelte` (TanStack Table,
233
+ filtros por taxonomy) · `CaseDetail.svelte` · `SplitAllocationBar.svelte`.
234
+
235
+ ### 5.7 Latencia / costo dashboard · depende de #9, #14
236
+ **Dónde:** experiment detail (tab nuevo "Performance") + trace viewer (por LLM span).
237
+ **Qué:** TTFT, p50/p95/p99 latency, tokens-per-sec, costo por caso/iteración/experimento.
238
+ Series por iteración (¿mejoró accuracy pero empeoró p95?). En el trace: TTFT y tokens/sec
239
+ por LLM call (los campos ya existen en `LLMCallSpan`, se poblarán con #9).
240
+ **Componentes nuevos:** `LatencyPercentiles.svelte` · `CostBreakdownChart.svelte`.
241
+
242
+ ### 5.8 Live run control · depende de serve (§6)
243
+ **Dónde:** shell global + ruta `/[workspace]/runs`.
244
+ **Qué:** hoy `/api/runs/active` lista pares (workspace_id, run_id) y el `ActiveRunsPill` los muestra. Falta:
245
+ **lanzar/pausar/abortar runs desde la web** (hoy solo CLI). Requiere endpoints de mutación de
246
+ lifecycle (§7.1) y `selfevals serve` montando el optimization loop (§6). Vista de progreso del
247
+ run en vivo (iteración actual, casos completados, métrica parcial).
248
+ **Componentes nuevos:** `RunProgress.svelte` · `RunControls.svelte`.
249
+
250
+ ---
251
+
252
+ ## 6. El gap `selfevals serve`
253
+
254
+ **No existe `serve` en el CLI.** Hoy API y web se levantan por separado. Un `selfevals serve`
255
+ debería montar en un solo proceso:
256
+
257
+ 1. **FastAPI app** — ya construida, solo instanciar.
258
+ 2. **Web UI** — build de SvelteKit (adapter-node) servido como estático o vía el adapter.
259
+ 3. **OTLP receiver** (`runner/otlp_receiver.py`) — escuchar spans de evals corriendo.
260
+ 4. **SpanBroker** singleton — para el live tracing (SSE).
261
+ 5. **Optimization loop** (futuro) — para lanzar runs desde la web, no solo CLI.
262
+
263
+ **Contrato propuesto:** `selfevals serve --host --port --db [--web-dist path] [--no-web]`.
264
+ Arranca API + (opcional) web + OTLP receiver + broker en un event loop. El loop de
265
+ optimización se integra cuando existan los endpoints de mutación (§7.1).
266
+
267
+ ---
268
+
269
+ ## 7. Endpoints faltantes (con contrato)
270
+
271
+ ### 7.1 Mutación de lifecycle de experimentos
272
+ Para mover el control del CLI a la web.
273
+ - `POST /api/workspaces/{ws}/experiments` — crear experiment desde spec YAML/JSON.
274
+ Body: el spec del experimento. → ExperimentSummary (201).
275
+ - `POST /api/workspaces/{ws}/experiments/{id}/runs` — lanzar un run (dispara el optimization
276
+ loop). → `{run_id}`. Requiere serve (§6).
277
+ - `PATCH /api/workspaces/{ws}/experiments/{id}` — transición de estado (pause/abort/resume).
278
+ Body: `{state}`. → ExperimentSummary.
279
+ - `POST /api/workspaces/{ws}/iterations/{id}/decision` — registrar/editar rationale humano,
280
+ override de la decisión automática. Body: HumanRationale {decided_by, notes,
281
+ overrides_automated}. → DecisionRecord.
282
+
283
+ ### 7.2 Datasets + cases (para §5.6)
284
+ - `GET /api/workspaces/{ws}/datasets` → list[DatasetSummary] (id, name, dataset_type, status,
285
+ statistics, split_allocation).
286
+ - `GET /api/workspaces/{ws}/datasets/{id}` → DatasetDetail + cases refs.
287
+ - `GET /api/workspaces/{ws}/cases` → list[EvalCaseSummary]. Filtros:
288
+ `level, feature, source, dataset_type, pii_status, holdout, limit, offset`.
289
+ - `GET /api/workspaces/{ws}/cases/{id}` → EvalCaseDetail (input, expected, taxonomy, graders,
290
+ failure_weights, metadata).
291
+
292
+ ### 7.3 Failure modes + error analysis (para §5.5)
293
+ - `GET /api/workspaces/{ws}/failure-modes` → list[FailureMode] (slug, name, lifecycle,
294
+ frequency, severity).
295
+ - `GET /api/workspaces/{ws}/failure-modes/{slug}/trend` → series por iteración.
296
+ - `POST /api/workspaces/{ws}/failure-modes/{slug}/lifecycle` — promote/retire/merge (gated).
297
+ - `GET /api/workspaces/{ws}/analysis/staging` → bundles staged pendientes de coding.
298
+
299
+ ### 7.4 Traces filtrables (hoy solo por id)
300
+ - `GET /api/workspaces/{ws}/traces` — list. Filtros: `experiment_id, iteration,
301
+ final_state, span_kind, has_failure, limit, offset`. → list[TraceSummary].
302
+
303
+ ### 7.5 Analytics / agregados
304
+ - `GET /api/workspaces/{ws}/experiments/{id}/funnel` → agregado funnel por key (para §5.1).
305
+ Depende de #3.
306
+ - `GET /api/workspaces/{ws}/experiments/{id}/performance` → percentiles latencia + costo
307
+ por iteración (para §5.7). Depende de #9/#14.
308
+ - `GET /api/workspaces/{ws}/experiments/{id}/correlation` → correlación param↔metric.
309
+
310
+ ### 7.6 Export
311
+ - `GET /api/workspaces/{ws}/experiments/{id}/export?format=csv|json` — iteraciones/resultados.
312
+ - `GET /api/workspaces/{ws}/traces/{id}/export?format=json` — trace completo con payloads
313
+ resueltos (del object store).
314
+
315
+ ### 7.7 Paginación (deuda transversal)
316
+ Los endpoints de lista no tienen paginación real (limit hardcoded 100; sorting/limit en
317
+ Python en `queries.py`). Añadir `limit`/`offset`/`order_by` consistentes — el `ListFilter`
318
+ del storage (`storage/interface.py`) ya lo soporta, falta exponerlo en la API.
319
+
320
+ ---
321
+
322
+ ## 8. Auth / roles / permisos (post-MVP)
323
+
324
+ Hoy: stub `X-SelfEvals-User: local`, sin auth real, aislamiento solo por workspace en storage.
325
+ El `seed_workspace()` ya crea **member roles** por defecto — la base de datos ya modela
326
+ membresía (entidad workspace member / `Role`).
327
+
328
+ **Diseño futuro (la promesa §10/§11 "usuarios con roles, permisos configurables"):**
329
+ - **AuthN**: enchufar OIDC/sesión donde hoy va el header stub. Cero cambio en pantallas (el
330
+ header ya viaja en cada request del cliente).
331
+ - **AuthZ por workspace**: roles (owner/editor/viewer). Viewer = solo GET. Editor = mutación
332
+ de cases/datasets/decisiones. Owner = + gestión de miembros + lifecycle de failure modes.
333
+ - **Gating de acciones destructivas/escribientes** en el FE: las acciones de §5.5
334
+ (promote/retire mode), §5.6 (editar case), §5.8 (abort run), §7.1 (override decisión)
335
+ requieren rol ≥ editor (owner para el lifecycle de failure modes); deshabilitar/ocultar según el rol del `X-SelfEvals-User`.
336
+ - **Audit trail**: las decisiones ya guardan `human.decided_by/decided_at`. Extender a
337
+ mutaciones de dataset/mode.
338
+
339
+ No tocar pantallas para esto: el contrato es que toda mutación pase por endpoints que ya
340
+ validan rol server-side, y el FE solo refleje permisos (ocultar botones).
341
+
342
+ ---
343
+
344
+ ## 9. UX por pantalla (principios + por-vista)
345
+
346
+ **Principios transversales** (densos, read-mostly, técnicos):
347
+ - Tabular numerals en toda celda numérica (JetBrains Mono). Métricas alineadas a la derecha.
348
+ - Color semántico consistente vía `DecisionBadge`: keep_candidate=verde, reject=rojo,
349
+ investigate=ámbar, tradeoff_review=morado, spawn_subexperiment=azul.
350
+ - Delta siempre con signo y color (▲ verde / ▼ rojo) contra baseline o best.
351
+ - Drill-down progresivo: lista → detalle en sidebar (no navegación destructiva) donde se pueda.
352
+ - Estados vacíos explícitos ("este experimento no ha corrido", no spinner infinito).
353
+ - Live: pill "live" pulsante mientras hay SSE activo; degradar a "finished" al `complete`.
354
+
355
+ **Por vista:**
356
+ - **Workspace list (`/`)**: cards o tabla con name, exp count, recent_health (anillo de
357
+ color), last_run_at relativo. CTA crear workspace.
358
+ - **Workspace detail (`/[ws]`)**: hero con chips (exp count, health, anchor points). Tabla de
359
+ experimentos con sparkline de la métrica primaria. Secciones de navegación a clusters y
360
+ datasets (activar cuando existan §5.5/§5.6).
361
+ - **Experiment detail**: tabs Iterations / Compare / Decisions / **Performance** (nuevo, §5.7)
362
+ / **Funnel** (nuevo, §5.1). Sidebar de iteración con hypothesis + params + métricas +
363
+ decision + record id. Compare: dos pickers, diff de params (prompt diff side-by-side) +
364
+ diff de métricas + recomendación de ganador.
365
+ - **Trace viewer**: árbol de spans (izq) + detalle (centro) + **trajectory timeline** (nuevo,
366
+ §5.2, abajo). Facetas por kind. Botón "ver thread" si el trace tiene thread_id. Pill live +
367
+ spans llegando por SSE. Resolver pointers (system_prompt/messages/args/result) bajo demanda
368
+ desde el object store (endpoint export §7.6 o lazy fetch).
369
+ - **Thread viewer (nuevo, §5.3)**: conversación vertical, burbujas por rol, grade chip por
370
+ turno, link a cada trace. Distinguir `user_simulator`.
371
+ - **Clusters (nuevo, §5.5)**: tabla de modes + trend chart + controles de lifecycle (gated).
372
+ - **Datasets (nuevo, §5.6)**: lista + split allocation bar + case browser filtrable + case
373
+ detail. Toggle de versión.
374
+ - **Judges (nuevo, §5.4)**: consenso del panel + calibration matrix + spot-check queue.
375
+
376
+ ---
377
+
378
+ ## 10. Roadmap FE por fases
379
+
380
+ El FE sigue al backend del [ROADMAP](ROADMAP.md). Una vista no se construye antes que su
381
+ capacidad backend.
382
+
383
+ ### FE-Fase 0 — Pulir lo existente + serve (S–M)
384
+ - `selfevals serve` (§6): un comando levanta API + web + OTLP + broker.
385
+ - Completar rutas scaffolded: `/[ws]/experiments` y `/[ws]/anchor-set` a funcional.
386
+ - Paginación consistente en endpoints de lista (§7.7).
387
+ - Resolver pointers en el trace viewer (mostrar prompts/args/results reales).
388
+
389
+ ### FE-Fase 1 — Multi-turno + funnel (M) — tras backend #2, #3
390
+ - **Thread viewer** (§5.3) — endpoint ya existe, solo falta la ruta.
391
+ - **Funnel drill-down** (§5.1) — tras #3 (breakdown) + endpoint funnel (§7.5).
392
+
393
+ ### FE-Fase 2 — Trayectoria + jueces (M) — tras backend #4, #17
394
+ - **Trajectory timeline** (§5.2) — tras #4.
395
+ - **Judge panel / calibración** (§5.4) — tras #17.
396
+
397
+ ### FE-Fase 3 — Datasets + clusters + performance (M–L) — tras endpoints §7.2/§7.3/§7.5
398
+ - **Datasets + cases browser** (§5.6) + versionado.
399
+ - **Failure clusters dashboard** (§5.5).
400
+ - **Latencia/costo dashboard** (§5.7) — tras #9/#14.
401
+
402
+ ### FE-Fase 4 — Control de runs + auth (L) — tras serve + mutación §7.1
403
+ - **Live run control** (§5.8): lanzar/pausar/abortar desde la web.
404
+ - **Auth real + roles/permisos** (§8): gating de acciones escribientes.
405
+
406
+ ---
407
+
408
+ ## 11. Resumen de gaps (qué falta, priorizado)
409
+
410
+ | Gap FE | Tipo | Depende de | Prioridad |
411
+ |--------|------|-----------|-----------|
412
+ | `selfevals serve` | CLI | — | Alta (desbloquea todo lo embebido) |
413
+ | Resolver pointers en trace viewer | FE | endpoint export §7.6 | Alta (debug real) |
414
+ | Paginación de listas | API | ListFilter (existe) | Alta |
415
+ | Thread viewer | FE | endpoint existe; #2 para datos ricos | Alta (rápido) |
416
+ | Funnel drill-down | FE+API | #3 breakdown | Media |
417
+ | Trajectory timeline | FE | #4 | Media |
418
+ | Judge panel / calibración | FE+API | #17 | Media |
419
+ | Datasets + cases browser | FE+API | endpoints §7.2 | Media |
420
+ | Failure clusters | FE+API | endpoints §7.3 (CLI ya tiene) | Media |
421
+ | Latencia/costo dashboard | FE+API | #9, #14 | Media |
422
+ | Mutación lifecycle (crear/lanzar/abortar) | API | serve | Media-Baja |
423
+ | Auth + roles | API+FE | — | Baja (MVP sin auth) |
424
+ | Export CSV/JSON | API+FE | — | Baja |