selfevals 0.2.2__tar.gz → 0.4.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (242)
  1. {selfevals-0.2.2 → selfevals-0.4.0}/.gitignore +3 -0
  2. {selfevals-0.2.2 → selfevals-0.4.0}/CHANGELOG.md +109 -0
  3. {selfevals-0.2.2 → selfevals-0.4.0}/PKG-INFO +18 -5
  4. {selfevals-0.2.2 → selfevals-0.4.0}/README.md +16 -4
  5. selfevals-0.4.0/docs/FE_FASE_A_PENDIENTES.md +171 -0
  6. selfevals-0.4.0/docs/FRONTEND.md +439 -0
  7. selfevals-0.4.0/docs/FRONTEND_PRODUCT_PLAN.md +491 -0
  8. selfevals-0.4.0/docs/ROADMAP.md +184 -0
  9. {selfevals-0.2.2 → selfevals-0.4.0}/docs/STATUS.md +68 -35
  10. {selfevals-0.2.2 → selfevals-0.4.0}/docs/adapters.md +132 -57
  11. selfevals-0.4.0/docs/api_reference.md +392 -0
  12. selfevals-0.4.0/docs/archive/SEALS_100X_GAP_PLAN.md +536 -0
  13. selfevals-0.4.0/docs/eval_config.md +328 -0
  14. selfevals-0.4.0/docs/json_report_schema.md +202 -0
  15. {selfevals-0.2.2 → selfevals-0.4.0}/pyproject.toml +2 -1
  16. selfevals-0.4.0/src/selfevals/.agents/skills/run-eval-experiment/SKILL.md +237 -0
  17. selfevals-0.4.0/src/selfevals/__init__.py +41 -0
  18. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/app.py +146 -4
  19. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/queries.py +196 -14
  20. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/schemas.py +118 -0
  21. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/sse.py +10 -0
  22. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/commands.py +181 -9
  23. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/main.py +40 -0
  24. selfevals-0.4.0/src/selfevals/graders/__init__.py +82 -0
  25. selfevals-0.4.0/src/selfevals/graders/artifact.py +286 -0
  26. selfevals-0.4.0/src/selfevals/graders/base.py +121 -0
  27. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/graders/deterministic.py +46 -3
  28. selfevals-0.4.0/src/selfevals/graders/guardrail.py +295 -0
  29. selfevals-0.4.0/src/selfevals/graders/judge_panel.py +572 -0
  30. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/graders/llm_judge.py +2 -2
  31. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/graders/registry.py +9 -3
  32. selfevals-0.4.0/src/selfevals/graders/trajectory.py +363 -0
  33. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/optimization/__init__.py +8 -0
  34. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/optimization/aggregator.py +177 -1
  35. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/optimization/loop.py +238 -37
  36. selfevals-0.4.0/src/selfevals/optimization/proposers.py +400 -0
  37. selfevals-0.4.0/src/selfevals/optimization/sampling.py +189 -0
  38. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/repo/__init__.py +10 -1
  39. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/repo/loader.py +146 -6
  40. selfevals-0.4.0/src/selfevals/reporter/compare.py +477 -0
  41. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/reporter/json_report.py +36 -0
  42. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/reporter/markdown.py +38 -1
  43. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/__init__.py +10 -0
  44. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/adapters.py +64 -44
  45. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/executor.py +116 -20
  46. selfevals-0.4.0/src/selfevals/runner/multiturn.py +326 -0
  47. selfevals-0.4.0/src/selfevals/runner/pricing.py +228 -0
  48. selfevals-0.4.0/src/selfevals/runner/simulator.py +319 -0
  49. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/__init__.py +8 -0
  50. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/enums.py +9 -0
  51. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/eval_case.py +83 -5
  52. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/experiment.py +8 -7
  53. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/iteration.py +8 -1
  54. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/trace.py +15 -1
  55. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/sqlite.py +10 -1
  56. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/trace/recorder.py +40 -3
  57. selfevals-0.4.0/src/selfevals/version.py +1 -0
  58. selfevals-0.4.0/tests/api/test_api.py +362 -0
  59. selfevals-0.4.0/tests/api/test_compare_endpoint.py +151 -0
  60. selfevals-0.4.0/tests/api/test_funnel_endpoint.py +140 -0
  61. selfevals-0.4.0/tests/api/test_sse.py +139 -0
  62. selfevals-0.4.0/tests/cli/test_adapter_wiring.py +94 -0
  63. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_cli.py +2 -1
  64. selfevals-0.4.0/tests/cli/test_serve.py +164 -0
  65. selfevals-0.4.0/tests/conftest.py +41 -0
  66. {selfevals-0.2.2 → selfevals-0.4.0}/tests/decision/test_loop_integration.py +5 -2
  67. selfevals-0.4.0/tests/graders/test_artifact.py +293 -0
  68. selfevals-0.4.0/tests/graders/test_breakdown.py +117 -0
  69. {selfevals-0.2.2 → selfevals-0.4.0}/tests/graders/test_deterministic.py +91 -23
  70. selfevals-0.4.0/tests/graders/test_guardrail.py +288 -0
  71. selfevals-0.4.0/tests/graders/test_judge_panel.py +500 -0
  72. {selfevals-0.2.2 → selfevals-0.4.0}/tests/graders/test_llm_judge.py +33 -13
  73. selfevals-0.4.0/tests/graders/test_trajectory.py +458 -0
  74. {selfevals-0.2.2 → selfevals-0.4.0}/tests/integration/test_full_loop_with_mocked_judge.py +6 -4
  75. selfevals-0.4.0/tests/optimization/test_aggregator.py +393 -0
  76. {selfevals-0.2.2 → selfevals-0.4.0}/tests/optimization/test_loop.py +174 -11
  77. {selfevals-0.2.2 → selfevals-0.4.0}/tests/optimization/test_loop_error_analysis.py +76 -16
  78. {selfevals-0.2.2 → selfevals-0.4.0}/tests/optimization/test_proposers.py +134 -0
  79. selfevals-0.4.0/tests/optimization/test_sampling.py +226 -0
  80. {selfevals-0.2.2 → selfevals-0.4.0}/tests/repo/test_loader.py +111 -3
  81. selfevals-0.4.0/tests/reporter/test_compare_structured.py +256 -0
  82. selfevals-0.4.0/tests/reporter/test_funnel.py +229 -0
  83. selfevals-0.4.0/tests/reporter/test_json_report.py +217 -0
  84. {selfevals-0.2.2 → selfevals-0.4.0}/tests/reporter/test_markdown.py +25 -1
  85. selfevals-0.4.0/tests/runner/test_adapters.py +225 -0
  86. {selfevals-0.2.2 → selfevals-0.4.0}/tests/runner/test_executor.py +110 -14
  87. selfevals-0.4.0/tests/runner/test_multiturn.py +198 -0
  88. selfevals-0.4.0/tests/runner/test_pricing.py +115 -0
  89. selfevals-0.4.0/tests/runner/test_simulator.py +397 -0
  90. selfevals-0.4.0/tests/schemas/test_eval_case.py +310 -0
  91. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_experiment.py +16 -3
  92. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_trace.py +20 -0
  93. {selfevals-0.2.2 → selfevals-0.4.0}/tests/storage/test_sqlite_storage.py +53 -0
  94. {selfevals-0.2.2 → selfevals-0.4.0}/tests/trace/test_recorder.py +85 -5
  95. selfevals-0.2.2/src/selfevals/__init__.py +0 -19
  96. selfevals-0.2.2/src/selfevals/graders/__init__.py +0 -46
  97. selfevals-0.2.2/src/selfevals/graders/base.py +0 -54
  98. selfevals-0.2.2/src/selfevals/optimization/proposers.py +0 -202
  99. selfevals-0.2.2/src/selfevals/reporter/compare.py +0 -221
  100. selfevals-0.2.2/src/selfevals/version.py +0 -1
  101. selfevals-0.2.2/tests/api/test_api.py +0 -127
  102. selfevals-0.2.2/tests/api/test_sse.py +0 -67
  103. selfevals-0.2.2/tests/optimization/test_aggregator.py +0 -129
  104. selfevals-0.2.2/tests/runner/test_adapters.py +0 -154
  105. selfevals-0.2.2/tests/schemas/test_eval_case.py +0 -131
  106. {selfevals-0.2.2 → selfevals-0.4.0}/LICENSE +0 -0
  107. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/error_analysis_design.md +0 -0
  108. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/evals_framework.md +0 -0
  109. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/operational_spec_v0.1.md +0 -0
  110. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/raw.md +0 -0
  111. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/sdk_otlp_design.md +0 -0
  112. {selfevals-0.2.2 → selfevals-0.4.0}/docs/spec/taxonomy.md +0 -0
  113. {selfevals-0.2.2 → selfevals-0.4.0}/docs/troubleshooting.md +0 -0
  114. {selfevals-0.2.2 → selfevals-0.4.0}/evals/datasets/pingpong.jsonl +0 -0
  115. {selfevals-0.2.2 → selfevals-0.4.0}/evals/experiments/example_pingpong.yaml +0 -0
  116. {selfevals-0.2.2 → selfevals-0.4.0}/examples/README.md +0 -0
  117. {selfevals-0.2.2 → selfevals-0.4.0}/examples/__init__.py +0 -0
  118. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_llm/__init__.py +0 -0
  119. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_llm/agent.py +0 -0
  120. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_llm/cases.jsonl +0 -0
  121. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_llm/experiment.yaml +0 -0
  122. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_openai/__init__.py +0 -0
  123. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_openai/agent.py +0 -0
  124. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_openai/cases.jsonl +0 -0
  125. {selfevals-0.2.2 → selfevals-0.4.0}/examples/hello_openai/experiment.yaml +0 -0
  126. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/.agents/skills/error-analysis/SKILL.md +0 -0
  127. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/_errors.py +0 -0
  128. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/_internal/__init__.py +0 -0
  129. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/_internal/hashing.py +0 -0
  130. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/_internal/ids.py +0 -0
  131. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/_internal/time.py +0 -0
  132. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/__init__.py +0 -0
  133. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/bundle.py +0 -0
  134. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/hypothesis.py +0 -0
  135. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/ingest.py +0 -0
  136. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/schemas.py +0 -0
  137. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/analysis/staging.py +0 -0
  138. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/__init__.py +0 -0
  139. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/__main__.py +0 -0
  140. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/broker.py +0 -0
  141. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/api/broker_bridge.py +0 -0
  142. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/__init__.py +0 -0
  143. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/_friendly.py +0 -0
  144. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/_help.py +0 -0
  145. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/cli/analyze_commands.py +0 -0
  146. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/decision/__init__.py +0 -0
  147. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/decision/matrix.py +0 -0
  148. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/examples/__init__.py +0 -0
  149. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/examples/evals/datasets/pingpong.jsonl +0 -0
  150. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/examples/evals/experiments/example_pingpong.yaml +0 -0
  151. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/examples/pingpong.py +0 -0
  152. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/graders/calibration.py +0 -0
  153. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/py.typed +0 -0
  154. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/reporter/__init__.py +0 -0
  155. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/reporter/_metrics.py +0 -0
  156. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/otlp_receiver.py +0 -0
  157. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/otlp_to_recorder.py +0 -0
  158. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/runner/sandbox.py +0 -0
  159. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/_base.py +0 -0
  160. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/annotation.py +0 -0
  161. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/dataset.py +0 -0
  162. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/failure_mode.py +0 -0
  163. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/fleet.py +0 -0
  164. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/grader_card.py +0 -0
  165. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/registry.py +0 -0
  166. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/tool.py +0 -0
  167. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/schemas/workspace.py +0 -0
  168. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/sdk/__init__.py +0 -0
  169. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/sdk/auto_instrument.py +0 -0
  170. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/sdk/context.py +0 -0
  171. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/sdk/exporter.py +0 -0
  172. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/sdk/facade.py +0 -0
  173. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/skills/__init__.py +0 -0
  174. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/__init__.py +0 -0
  175. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/errors.py +0 -0
  176. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/filesystem.py +0 -0
  177. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/interface.py +0 -0
  178. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/migrations/__init__.py +0 -0
  179. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/migrations/m0001_initial.py +0 -0
  180. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/storage/seed.py +0 -0
  181. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/trace/__init__.py +0 -0
  182. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/trace/otel_importer.py +0 -0
  183. {selfevals-0.2.2 → selfevals-0.4.0}/src/selfevals/trace/payload_router.py +0 -0
  184. {selfevals-0.2.2 → selfevals-0.4.0}/tests/__init__.py +0 -0
  185. {selfevals-0.2.2 → selfevals-0.4.0}/tests/analysis/__init__.py +0 -0
  186. {selfevals-0.2.2 → selfevals-0.4.0}/tests/analysis/test_handshake.py +0 -0
  187. {selfevals-0.2.2 → selfevals-0.4.0}/tests/api/__init__.py +0 -0
  188. {selfevals-0.2.2 → selfevals-0.4.0}/tests/api/test_broker.py +0 -0
  189. {selfevals-0.2.2 → selfevals-0.4.0}/tests/api/test_threads.py +0 -0
  190. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/__init__.py +0 -0
  191. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/helpers_str_agent.py +0 -0
  192. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_analyze_cli.py +0 -0
  193. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_cli_run.py +0 -0
  194. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_compare.py +0 -0
  195. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_examples_cli.py +0 -0
  196. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_help_texts.py +0 -0
  197. {selfevals-0.2.2 → selfevals-0.4.0}/tests/cli/test_skills_cli.py +0 -0
  198. {selfevals-0.2.2 → selfevals-0.4.0}/tests/decision/__init__.py +0 -0
  199. {selfevals-0.2.2 → selfevals-0.4.0}/tests/decision/test_matrix.py +0 -0
  200. {selfevals-0.2.2 → selfevals-0.4.0}/tests/examples/__init__.py +0 -0
  201. {selfevals-0.2.2 → selfevals-0.4.0}/tests/examples/test_hello_llm.py +0 -0
  202. {selfevals-0.2.2 → selfevals-0.4.0}/tests/graders/__init__.py +0 -0
  203. {selfevals-0.2.2 → selfevals-0.4.0}/tests/graders/test_calibration.py +0 -0
  204. {selfevals-0.2.2 → selfevals-0.4.0}/tests/integration/__init__.py +0 -0
  205. {selfevals-0.2.2 → selfevals-0.4.0}/tests/optimization/__init__.py +0 -0
  206. {selfevals-0.2.2 → selfevals-0.4.0}/tests/repo/__init__.py +0 -0
  207. {selfevals-0.2.2 → selfevals-0.4.0}/tests/reporter/__init__.py +0 -0
  208. {selfevals-0.2.2 → selfevals-0.4.0}/tests/reporter/test_metrics.py +0 -0
  209. {selfevals-0.2.2 → selfevals-0.4.0}/tests/runner/__init__.py +0 -0
  210. {selfevals-0.2.2 → selfevals-0.4.0}/tests/runner/test_otlp_receiver.py +0 -0
  211. {selfevals-0.2.2 → selfevals-0.4.0}/tests/runner/test_otlp_to_recorder.py +0 -0
  212. {selfevals-0.2.2 → selfevals-0.4.0}/tests/runner/test_sandbox.py +0 -0
  213. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/__init__.py +0 -0
  214. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_annotation.py +0 -0
  215. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_base.py +0 -0
  216. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_cross_entity.py +0 -0
  217. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_dataset.py +0 -0
  218. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_enums.py +0 -0
  219. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_error_analysis_spec.py +0 -0
  220. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_failure_mode.py +0 -0
  221. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_fleet.py +0 -0
  222. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_grader_card.py +0 -0
  223. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_iteration.py +0 -0
  224. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_registry.py +0 -0
  225. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_tool.py +0 -0
  226. {selfevals-0.2.2 → selfevals-0.4.0}/tests/schemas/test_workspace.py +0 -0
  227. {selfevals-0.2.2 → selfevals-0.4.0}/tests/sdk/__init__.py +0 -0
  228. {selfevals-0.2.2 → selfevals-0.4.0}/tests/sdk/test_auto_instrument.py +0 -0
  229. {selfevals-0.2.2 → selfevals-0.4.0}/tests/sdk/test_facade.py +0 -0
  230. {selfevals-0.2.2 → selfevals-0.4.0}/tests/skills/__init__.py +0 -0
  231. {selfevals-0.2.2 → selfevals-0.4.0}/tests/skills/test_skills_locator.py +0 -0
  232. {selfevals-0.2.2 → selfevals-0.4.0}/tests/storage/__init__.py +0 -0
  233. {selfevals-0.2.2 → selfevals-0.4.0}/tests/storage/test_filesystem_object_store.py +0 -0
  234. {selfevals-0.2.2 → selfevals-0.4.0}/tests/storage/test_migrations.py +0 -0
  235. {selfevals-0.2.2 → selfevals-0.4.0}/tests/storage/test_seed.py +0 -0
  236. {selfevals-0.2.2 → selfevals-0.4.0}/tests/test_internal_hashing.py +0 -0
  237. {selfevals-0.2.2 → selfevals-0.4.0}/tests/test_internal_ids.py +0 -0
  238. {selfevals-0.2.2 → selfevals-0.4.0}/tests/test_internal_time.py +0 -0
  239. {selfevals-0.2.2 → selfevals-0.4.0}/tests/test_smoke.py +0 -0
  240. {selfevals-0.2.2 → selfevals-0.4.0}/tests/trace/__init__.py +0 -0
  241. {selfevals-0.2.2 → selfevals-0.4.0}/tests/trace/test_otel_importer.py +0 -0
  242. {selfevals-0.2.2 → selfevals-0.4.0}/tests/trace/test_payload_router.py +0 -0
@@ -51,6 +51,9 @@ Thumbs.db
51
51
  .bootstrap/
52
52
  data/
53
53
 
54
+ # Agent worktrees
55
+ .claude/
56
+
54
57
  # Secrets
55
58
  .env
56
59
  .env.*
@@ -7,10 +7,119 @@ Versions follow [SemVer](https://semver.org/).
7
7
 
8
8
  ## [Unreleased]
9
9
 
10
+ ## [0.4.0] - 2026-05-28
11
+
12
+ ### Added
13
+
14
+ - **Recall-based `must_include` grading (`Expected.min_recall`).** A new
15
+ optional `min_recall` float in `[0, 1]` on `EvalCase.expected`. When
16
+ set (and `must_include` is non-empty), the `DeterministicGrader` grades
17
+ `must_include` by *recall* — the fraction of required substrings that
18
+ appear in the response — instead of all-or-nothing: the grade is `pass`
19
+ iff `recall >= min_recall`, and `score` becomes the recall value
20
+ (exposed in `details["recall"]`). Missing substrings still emit their
21
+ `missing_required_substring` failure mode for diagnostics but no longer
22
+ force a FAIL on their own. Hard violations (`must_not_include`,
23
+ `required_tools`/`forbidden_tools`, `regex_match`, `structured_output`)
24
+ always take precedence: any hard violation still forces FAIL even when
25
+ recall clears the threshold. When `min_recall` is `None` (the default),
26
+ `must_include` stays all-or-nothing as before.
27
+ - **Cache hit counts in the JSON report.** Each iteration in
28
+ `selfevals report --format json` (and the live `run --format json`) now
29
+ carries a `"cache": {"hits": N, "llm_calls": M}` object — the number of
30
+ cache-hit LLM spans and the total LLM-call spans across that iteration's
31
+ traces — so cost/throughput consumers can read cache effectiveness
32
+ without raw trace spelunking.
33
+ - **Per-iteration failure rationales in the JSON report.** Each iteration
34
+ now carries a `"failure_reasons"` array: deduplicated grader rationales
35
+ for every non-passing grade, one entry per distinct
36
+ `(grader, label, reason)` with `score` and `failure_modes`. This lets a
37
+ downstream consumer see *why* a grader failed without reading SQLite.
38
+ (Populated on a live `run`; empty when an experiment is reconstructed
39
+ from storage, e.g. via `report` or the HTTP API — see
40
+ [`docs/json_report_schema.md`](docs/json_report_schema.md).)
41
+ - **Thread viewer (web + API).** `GET /api/workspaces/{ws}/threads/{thread}`
42
+ (already shipped, now documented) assembles every `Trace` sharing a
43
+ `thread_id` into an ordered, turn-by-turn conversation (`ThreadResponse`),
44
+ each turn carrying its `primary_grade` and `grader_results`. New web
45
+ route `/[workspace]/threads/[thread]` renders the multi-turn conversation.
46
+ - **Funnel drill-down (web + API).** New endpoint
47
+ `GET /api/workspaces/{ws}/iterations/{id}/funnel` returns the per-iteration
48
+ grader funnel (`FunnelResponse`, recursive `FunnelNodeResponse` nodes read
49
+ straight from `IterationRecord.metrics.funnel`). New "Funnel" tab on the
50
+ experiment-detail view renders it via the recursive `FunnelNode.svelte`
51
+ component. `nodes` is empty when no grader emitted a breakdown.
52
+ - **Server-rendered iteration compare (web + API).** New endpoint
53
+ `GET /api/workspaces/{ws}/experiments/{id}/compare?a={itr}&b={itr}` returns
54
+ a structured `CompareResponse` (proposal diff, metrics diff, failure-mode
55
+ diff, funnel diff, winner recommendation, `holdout_status`) computed by the
56
+ reporter's `compute_compare` — the single source of truth shared with the
57
+ CLI `compare` command. Returns 404 when an iteration is unknown and 400
58
+ when the two iterations belong to different experiments. The web "Compare"
59
+ tab now renders this diff server-side instead of recomputing deltas in the
60
+ browser.
61
+
62
+ ### Documentation
63
+
64
+ - New [`docs/api_reference.md`](docs/api_reference.md): the canonical HTTP
65
+ API reference — every endpoint, grouped by resource, with method, path,
66
+ params, response schema, and error codes.
67
+ - New [`docs/eval_config.md`](docs/eval_config.md): the YAML eval-config
68
+ reference (top-level keys, `EvalCase`/`Expected` fields including
69
+ `min_recall`, graders, agent transports, proposers) with validating
70
+ snippets.
71
+ - New [`docs/json_report_schema.md`](docs/json_report_schema.md): the
72
+ `report --format json` output shape, documenting every root and
73
+ per-iteration key (including the new `cache` and `failure_reasons`).
74
+ - `docs/FRONTEND.md` §3/§5: the funnel, compare, and thread endpoints/views
75
+ are now documented as shipped.
76
+
77
+ ## [0.3.0] - 2026-05-27
78
+
79
+ ### Added
80
+
81
+ - **Validated multi-turn conversation input.** When `EvalCase.input`
82
+ carries a `messages` key it is validated as a typed conversation:
83
+ non-empty message list, roles from a new `MessageRole` enum
84
+ (system/user/assistant/tool), content as a string or a list of
85
+ content blocks, multimodal-aware via the `Modality` enum. New
86
+ `Message`, `ContentBlock`, and `ConversationInput` models, plus
87
+ `EvalCase.conversation()` / `EvalCase.is_conversation()` accessors.
88
+ Inputs without a `messages` key remain opaque payloads, so the
89
+ field stays a plain JSON dict that adapters receive verbatim.
90
+ - **Async-first evaluators.** `AgentAdapter.invoke` and `Grader.grade`
91
+ are now async. The executor runs repetitions concurrently and the
92
+ optimization loop grades concurrently, each bounded by a
93
+ configurable semaphore (`concurrency` / `grade_concurrency`,
94
+ default 8). `EmbeddedAdapter` accepts sync or async callables,
95
+ `CliCommandAdapter` uses an asyncio subprocess, and
96
+ `HttpEndpointAdapter` is native async on httpx. `asyncio.run` is
97
+ confined to the CLI edge.
98
+
99
+ ### Changed
100
+
101
+ - `httpx` is now a runtime dependency (the default HTTP adapter
102
+ transport), not just a dev dependency.
103
+
104
+ ### Documentation
105
+
106
+ - STATUS.md and README banners read v0.3.0; multi-turn input and async
107
+ evaluators moved into "What works"; test counts refreshed (default
108
+ surface 559, full 597); roadmap records both as shipped in 0.3.0.
109
+
10
110
  ## [0.2.2] - 2026-05-27
11
111
 
12
112
  ### Documentation
13
113
 
114
+ - STATUS.md and README banners now read v0.2.2 (they had lagged at
115
+ v0.2.1 despite the 0.2.2 release). Refreshed the STATUS body against
116
+ the current tree: test counts (default surface 481 -> 528, full
117
+ surface 566, extras-gated 9 -> 24), and the forward-looking
118
+ "What v0.2 will probably contain" section became a "Roadmap" that
119
+ separates what shipped in 0.2.x from what remains on the backlog.
120
+
121
+ ### Documentation
122
+
14
123
  - Onboarding pass after the `bootstrap` -> `selfevals` rename. Fixed the
15
124
  CI mypy target (`src/bootstrap` -> `src/selfevals`) and 13 stale
16
125
  `bootstrap` CLI/prose references in the bundled error-analysis skill.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: selfevals
3
- Version: 0.2.2
3
+ Version: 0.4.0
4
4
  Summary: Self-improving evals framework for AI agents.
5
5
  Project-URL: Homepage, https://github.com/patovaldezf/selfevals
6
6
  Project-URL: Repository, https://github.com/patovaldezf/selfevals
@@ -18,6 +18,7 @@ Classifier: Programming Language :: Python :: 3.12
18
18
  Classifier: Programming Language :: Python :: 3.13
19
19
  Classifier: Topic :: Software Development :: Testing
20
20
  Requires-Python: >=3.12
21
+ Requires-Dist: httpx<1,>=0.27
21
22
  Requires-Dist: pydantic<3,>=2.7
22
23
  Requires-Dist: pyyaml<7,>=6
23
24
  Provides-Extra: all
@@ -94,9 +95,10 @@ configuration to keep. CLI-first, multi-tenant from day one, and agnostic
94
95
  to the agent framework underneath — selfevals never calls your provider;
95
96
  your agent does, and selfevals grades the result.
96
97
 
97
- > Status: **v0.2.1 — runtime functional.** The CLI works end-to-end:
98
+ > Status: **v0.3.0 — runtime functional.** The CLI works end-to-end:
98
99
  > load an experiment spec → run cases through an adapter → grade traces →
99
- > persist iterations → render a report. See [`docs/spec/`](docs/spec/) for
100
+ > persist iterations → render a report. Adapters and graders are async,
101
+ > with concurrent repetitions and grading. See [`docs/spec/`](docs/spec/) for
100
102
  > the canonical and operational specs that drive design, and
101
103
  > [`docs/STATUS.md`](docs/STATUS.md) for an honest what-works / what-doesn't
102
104
  > snapshot.
@@ -153,7 +155,7 @@ The five nouns you'll meet everywhere:
153
155
 
154
156
  | Term | What it is |
155
157
  |------|------------|
156
- | **EvalCase** | One test: an input, the expected outcome, and which graders apply. |
158
+ | **EvalCase** | One test: an input (a validated multi-turn `messages` conversation, or any opaque payload), the expected outcome, and which graders apply. |
157
159
  | **Adapter** | The bridge to your agent — embedded callable, CLI subprocess, or HTTP endpoint. selfevals calls *it*, never the provider directly. |
158
160
  | **Grader** | Scores a trace. `DeterministicGrader` (rules: substrings, tools, JSON schema) or `LLMJudgeGrader` (a rubric-driven judge). |
159
161
  | **Proposer** | Picks the next parameter configuration to try — `manual`, `grid`, or `random`. |
@@ -223,7 +225,7 @@ its arguments. The surface:
223
225
  |---------|---------|
224
226
  | `init <slug>` | Create a workspace and seed the default failure-mode taxonomy. |
225
227
  | `run <spec.yaml>` | Run an experiment spec end-to-end. |
226
- | `report <ws> <exp>` | Render a stored experiment as markdown (`--format json` for JSON). |
228
+ | `report <ws> <exp>` | Render a stored experiment as markdown (`--format json` for JSON; the JSON now includes per-iteration `cache` hit counts and deduplicated `failure_reasons`). |
227
229
  | `compare <ws> <itr_a> <itr_b>` | Diff two iterations side by side. |
228
230
  | `estimate` | Dry-run cost estimate for a search space × cases × reps. |
229
231
  | `workspace show <ws>` | Inspect a workspace. |
@@ -247,6 +249,17 @@ candidate modes via `failuremode promote`. The bundled
247
249
  [`error-analysis` skill](src/selfevals/.agents/skills/error-analysis/SKILL.md)
248
250
  (discoverable via `selfevals skills list`) encodes the method.
249
251
 
252
+ ## Documentation
253
+
254
+ | Doc | What it covers |
255
+ |-----|----------------|
256
+ | [`docs/eval_config.md`](docs/eval_config.md) | The YAML experiment spec: top-level keys, `EvalCase`/`Expected` fields (including recall-based `must_include` via `min_recall`), graders, agent transports, and proposers. |
257
+ | [`docs/api_reference.md`](docs/api_reference.md) | The canonical HTTP API reference — every endpoint, response schema, and error codes. |
258
+ | [`docs/json_report_schema.md`](docs/json_report_schema.md) | The `report --format json` output shape, including the per-iteration `cache` and `failure_reasons` keys. |
259
+ | [`docs/adapters.md`](docs/adapters.md) | Adapter contract and per-transport YAML/code snippets. |
260
+ | [`docs/FRONTEND.md`](docs/FRONTEND.md) | The web UI spec (views, endpoints, roadmap). |
261
+ | [`docs/STATUS.md`](docs/STATUS.md) | Honest what-works / what-doesn't snapshot. |
262
+
250
263
  ## Layout
251
264
 
252
265
  ```
@@ -9,9 +9,10 @@ configuration to keep. CLI-first, multi-tenant from day one, and agnostic
9
9
  to the agent framework underneath — selfevals never calls your provider;
10
10
  your agent does, and selfevals grades the result.
11
11
 
12
- > Status: **v0.2.1 — runtime functional.** The CLI works end-to-end:
12
+ > Status: **v0.3.0 — runtime functional.** The CLI works end-to-end:
13
13
  > load an experiment spec → run cases through an adapter → grade traces →
14
- > persist iterations → render a report. See [`docs/spec/`](docs/spec/) for
14
+ > persist iterations → render a report. Adapters and graders are async,
15
+ > with concurrent repetitions and grading. See [`docs/spec/`](docs/spec/) for
15
16
  > the canonical and operational specs that drive design, and
16
17
  > [`docs/STATUS.md`](docs/STATUS.md) for an honest what-works / what-doesn't
17
18
  > snapshot.
@@ -68,7 +69,7 @@ The five nouns you'll meet everywhere:
68
69
 
69
70
  | Term | What it is |
70
71
  |------|------------|
71
- | **EvalCase** | One test: an input, the expected outcome, and which graders apply. |
72
+ | **EvalCase** | One test: an input (a validated multi-turn `messages` conversation, or any opaque payload), the expected outcome, and which graders apply. |
72
73
  | **Adapter** | The bridge to your agent — embedded callable, CLI subprocess, or HTTP endpoint. selfevals calls *it*, never the provider directly. |
73
74
  | **Grader** | Scores a trace. `DeterministicGrader` (rules: substrings, tools, JSON schema) or `LLMJudgeGrader` (a rubric-driven judge). |
74
75
  | **Proposer** | Picks the next parameter configuration to try — `manual`, `grid`, or `random`. |
@@ -138,7 +139,7 @@ its arguments. The surface:
138
139
  |---------|---------|
139
140
  | `init <slug>` | Create a workspace and seed the default failure-mode taxonomy. |
140
141
  | `run <spec.yaml>` | Run an experiment spec end-to-end. |
141
- | `report <ws> <exp>` | Render a stored experiment as markdown (`--format json` for JSON). |
142
+ | `report <ws> <exp>` | Render a stored experiment as markdown (`--format json` for JSON; the JSON now includes per-iteration `cache` hit counts and deduplicated `failure_reasons`). |
142
143
  | `compare <ws> <itr_a> <itr_b>` | Diff two iterations side by side. |
143
144
  | `estimate` | Dry-run cost estimate for a search space × cases × reps. |
144
145
  | `workspace show <ws>` | Inspect a workspace. |
@@ -162,6 +163,17 @@ candidate modes via `failuremode promote`. The bundled
162
163
  [`error-analysis` skill](src/selfevals/.agents/skills/error-analysis/SKILL.md)
163
164
  (discoverable via `selfevals skills list`) encodes the method.
164
165
 
166
+ ## Documentation
167
+
168
+ | Doc | What it covers |
169
+ |-----|----------------|
170
+ | [`docs/eval_config.md`](docs/eval_config.md) | The YAML experiment spec: top-level keys, `EvalCase`/`Expected` fields (including recall-based `must_include` via `min_recall`), graders, agent transports, and proposers. |
171
+ | [`docs/api_reference.md`](docs/api_reference.md) | The canonical HTTP API reference — every endpoint, response schema, and error codes. |
172
+ | [`docs/json_report_schema.md`](docs/json_report_schema.md) | The `report --format json` output shape, including the per-iteration `cache` and `failure_reasons` keys. |
173
+ | [`docs/adapters.md`](docs/adapters.md) | Adapter contract and per-transport YAML/code snippets. |
174
+ | [`docs/FRONTEND.md`](docs/FRONTEND.md) | The web UI spec (views, endpoints, roadmap). |
175
+ | [`docs/STATUS.md`](docs/STATUS.md) | Honest what-works / what-doesn't snapshot. |
176
+
165
177
  ## Layout
166
178
 
167
179
  ```
@@ -0,0 +1,171 @@
1
+ # Fase A — Pendientes y deuda diferida
2
+
3
+ Vivo. Se actualiza cada vez que un PR de Fase A se mergea y descubre
4
+ algo que debe quedar registrado para después.
5
+
6
+ Para el plan completo, ver [FRONTEND_PRODUCT_PLAN.md](FRONTEND_PRODUCT_PLAN.md).
7
+
8
+ ## Estado de la Fase A
9
+
10
+ | # | PR | Estado | Notas |
11
+ |---|---|---|---|
12
+ | A1 | #16 los 3 bugs + plan | ✅ merged | — |
13
+ | A2 | #17 link iter → traza | ✅ merged | — |
14
+ | A3 | #18 resolver pointers | ✅ merged | — |
15
+ | A4 | #19 selfevals serve | ✅ merged | latente BUG-4 hasta este PR |
16
+ | A5 | #20 identidad humana sobre ULID | ✅ merged | — |
17
+ | A6 | #21 span kind visible + densidad | ✅ merged | QA visual destrabada al fix-ear BUG-4 |
18
+ | BUG-4 | #22 proxy `/api` en `selfevals serve` | ✅ merged | hooks.server.ts + `SELFEVALS_API_BASE` |
19
+ | A7 | #23 a11y filas ([button]/[link]) | ✅ merged | — |
20
+ | A8 | paginación + virtual scroll | 🟡 in progress | envelope solo en `/experiments` por ahora |
21
+
22
+ ## Pendientes registrados (orden de descubrimiento)
23
+
24
+ ### De A5 (#20)
25
+
26
+ 1. **URL routing por slug humano.** El URL sigue siendo el ULID
27
+ (`/<ws_id>/experiments/<exp_id>`). Slugs humanos en la ruta
28
+ requieren:
29
+ - `queries.workspace_detail` aceptar slug además de id.
30
+ - Posiblemente `queries.list_experiments` / `experiment_detail`
31
+ resolver experimento por slug dentro de un workspace.
32
+ - Routes SvelteKit con `[workspace]` aceptan ambos sin cambio.
33
+ - Scope: backend resolver + tests. No es de Fase A; ir como propio
34
+ PR cuando duela.
35
+
36
+ 2. ~~**Anchor-set: CopyableId chip dentro del row.**~~ ✅ Resuelto en
37
+ A7. Row reescrito de `<a>` envolvente a `<div>` con `<a>` (link al
38
+ experiment) y `<CopyableId>` lado a lado. focus-within ring para
39
+ feedback de teclado consistente.
40
+
41
+ 3. ~~**QA visual de los chips CopyableId.**~~ ✅ Desbloqueado con
42
+ BUG-4. Render visual confirmado en el trace viewer y experiment
43
+ detail vía `selfevals serve`. Hover, tick `copied`, y stop-propagation
44
+ en celdas clickeables siguen siendo verificación manual (no test
45
+ automatizado — vitest no está montado, ver A6).
46
+
47
+ 4. **Workspace ID en overview no es copiable.** El `/[workspace]/+page.svelte`
48
+ muestra `workspace.slug` como chip y `workspace.name` como h1, pero
49
+ el ULID del workspace no aparece en ningún lado. Si alguien lo
50
+ necesita para curl/CLI, debe ir a otra ruta o leerlo de la URL.
51
+ Decisión consciente — no añadir hasta que duela.
52
+
53
+ ### De A6 (#21)
54
+
55
+ 1. ~~**QA visual del trace tree.** Bloqueado por BUG-4.~~ ✅ Desbloqueado
56
+ con el fix de BUG-4. Dogfood: `curl :web_port/<ws>/traces/<run>`
57
+ muestra glyph ◆ + label "llm" + nombre "adapter_response", todo
58
+ renderizado correctamente.
59
+ 2. **Iconografía de spans.** Los glifos Unicode (`◆ ✦ ⚙ ◇ ▽ △ ◉ ↦
60
+ ☞ ◈ ✕`) son funcionales pero no son SVGs. Si el set crece o el
61
+ peso visual se queda corto, evaluar un set SVG inline (sin
62
+ dependencia externa) — pero NO antes de que un usuario real se
63
+ queje del look actual.
64
+ 3. **`tokens_per_second` en el árbol.** Lo expongo en el API
65
+ (`keep_keys`) pero NO lo renderizo en SpanNode aún: en pingpong da
66
+ `None`. Cuando ROADMAP #9 lo pueble, añadir como fact (junto a TTFT
67
+ y tokens). Mismo para `time_to_first_token_ms` en ejemplos reales.
68
+ 4. **Densidad: facts ocultas en mobile.** `hidden sm:inline-block`
69
+ esconde los facts en viewports angostos. El plan §1.2-7 dice
70
+ "mobile está roto" → decisión consciente desktop-first, no
71
+ regresión. Cuando llegue mobile, refactorizar SpanNode para
72
+ colapsar facts a chevron expandible.
73
+
74
+ ### De A7 (cerrado)
75
+
76
+ Nada de deuda diferida — pendiente A5#2 (anchor-set CopyableId chip)
77
+ quedó resuelto en este PR.
78
+
79
+ ### De A8 (en progreso)
80
+
81
+ 1. **Pagination envelope solo en `/experiments`.** Los demás endpoints
82
+ de lista (`/workspaces`, `/iterations`) siguen devolviendo lista
83
+ plana. Cuando algún usuario tenga >100 de cualquiera de esos, hacer
84
+ el mismo upgrade (la receta ya está). Por ahora, deuda registrada.
85
+
86
+ 2. **FE UI de "Load more" / paginación.** El envelope viaja pero la
87
+ UI no tiene botón "Load more" — solo muestra "X of N" cuando hay
88
+ más páginas. La razón: nadie tiene aún >100 experimentos así que
89
+ no hay un caso de uso real para probar. Cuando duela, añadir el
90
+ botón con `offset` incremental.
91
+
92
+ 3. **Virtual scroll: row height aproximada.** `SpanTreeFlat` asume
93
+ `ROW_HEIGHT_PX = 28` (uniforme). Si un día metemos rows multi-línea
94
+ (p.ej. error con traceback inline), la matemática del window se
95
+ rompe. Solución correcta cuando llegue: `ResizeObserver` por fila
96
+ + altura medida. Por ahora un fact en una sola línea encaja.
97
+
98
+ 4. **No hay "scroll to selected".** Si la traza tiene 1000 spans y el
99
+ span seleccionado está en el medio, abrir la página NO hace scroll
100
+ a esa fila. La selección viene del click humano (no de la URL todavía),
101
+ así que el seleccionado siempre empieza en `null` y el usuario
102
+ recorre. Cuando integremos URL `?span=sp_...`, añadir auto-scroll.
103
+
104
+ ### De A8 (pendiente)
105
+
106
+ _(pendiente)_
107
+
108
+ ## Bugs/deuda fuera de Fase A descubiertos en el camino
109
+
110
+ ### BUG-4 ✅ FIXED — `selfevals serve` no proxya `/api` → web Node ve 404 en todo
111
+
112
+ **Síntoma.** Con `selfevals serve` (modo producción, no `npm run dev`),
113
+ toda ruta del web devuelve 404 o "Backend unreachable / API 404".
114
+ Verificado en QA de A6:
115
+
116
+ ```
117
+ $ uv run selfevals --db /tmp/qa.sqlite serve --port 5188
118
+ $ curl :5189/ # 200 pero renderiza "API 404"
119
+ $ curl :5189/<ws>/traces/<run_id> # 404
120
+ $ curl :5189/api/workspaces # 404 (no hay handler)
121
+ ```
122
+
123
+ **Causa raíz.** `cli/commands.py:cmd_serve` spawna el Node server de
124
+ SvelteKit en `port+1` con `ORIGIN=http://host:port+1`, pero **no
125
+ configura proxy**. El FE usa `fetch('/api/...')` (relativo) en
126
+ `+page.server.ts` — en SSR eso pega al Node server (que no tiene
127
+ ruta `/api`), no a FastAPI en `port`. El propio comentario en
128
+ `commands.py:735-737` reconoce el hueco ("the built server has no
129
+ proxy, so the web side must call the API via absolute URLs") pero
130
+ nunca se implementó el fix.
131
+
132
+ **Impacto.** Toda Fase A es invisible end-to-end por la vía oficial
133
+ de dogfood. Los PRs A1-A6 pasan typecheck + pytest + API roundtrip,
134
+ pero ningún usuario ha visto el resultado renderizado vía
135
+ `selfevals serve`. (Sí funciona vía `npm run dev` + `uvicorn`
136
+ manualmente — el bug es específico de la ruta "un comando" de A4.)
137
+
138
+ **Fixes posibles (preferir 1):**
139
+ 1. `SELFEVALS_API_BASE=http://host:port` env var hacia el subprocess
140
+ Node + cliente API absoluto cuando esté presente (server-side
141
+ load). El cliente del browser sigue relativo.
142
+ 2. Hooks SvelteKit (`hooks.server.ts`) que proxean `/api` → FastAPI
143
+ cuando detectan `SELFEVALS_API_BASE`.
144
+ 3. Servir el build estático desde FastAPI y dropar el Node server
145
+ (pierde SSR — no aceptable).
146
+
147
+ **Test a añadir.** `test_serve_web_can_reach_api_through_node`: lanza
148
+ `selfevals serve`, espera a que ambos puertos respondan, y hace
149
+ `curl :web_port/` esperando ver al menos un workspace en el HTML
150
+ renderizado (no "Backend unreachable"). Esto fija el contrato
151
+ end-to-end y captura cualquier futuro retroceso del proxy.
152
+
153
+ **Prioridad.** Alta. Bloquea QA visual de toda Fase A. Ir como PR
154
+ propio (BUG-4), no enredarlo con A6/A7/A8. Idealmente antes de
155
+ mergear A6.
156
+
157
+ **Fix shipped (PR fix/bug-4-serve-api-proxy).** Opción 2:
158
+ `web/src/hooks.server.ts` con un `handle` que intercepta `/api/*`,
159
+ lee `SELFEVALS_API_BASE` y hace `fetch` al upstream FastAPI
160
+ streameando body (importante para SSE: `/api/.../stream` es
161
+ text/event-stream sin EOF). `cmd_serve` setea
162
+ `SELFEVALS_API_BASE=http://host:api_port` en el subprocess Node.
163
+ Dogfood end-to-end OK:
164
+ `curl :web_port/<ws>/traces/<run_id>` ahora devuelve 200 con todo
165
+ el render de Fase A (A5 nombre humano + A6 glyph + facts).
166
+
167
+ **Pendiente.** Test integration con `node` real está fuera del PR
168
+ (CI Python no tiene Node; CI Web no tiene Python). Unit test del env
169
+ var en `test_serve_spawns_node_with_correct_env` cubre la mitad Python.
170
+ Para el matrix completo, configurar un job de CI con Python + Node
171
+ juntos — registrar como deuda separada cuando duela.