selfevals 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (96) hide show
  1. selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
  2. selfevals/__init__.py +19 -0
  3. selfevals/_errors.py +44 -0
  4. selfevals/_internal/__init__.py +0 -0
  5. selfevals/_internal/hashing.py +23 -0
  6. selfevals/_internal/ids.py +65 -0
  7. selfevals/_internal/time.py +17 -0
  8. selfevals/analysis/__init__.py +23 -0
  9. selfevals/analysis/bundle.py +162 -0
  10. selfevals/analysis/hypothesis.py +26 -0
  11. selfevals/analysis/ingest.py +185 -0
  12. selfevals/analysis/schemas.py +119 -0
  13. selfevals/analysis/staging.py +34 -0
  14. selfevals/api/__init__.py +24 -0
  15. selfevals/api/__main__.py +47 -0
  16. selfevals/api/app.py +351 -0
  17. selfevals/api/broker.py +210 -0
  18. selfevals/api/broker_bridge.py +29 -0
  19. selfevals/api/queries.py +447 -0
  20. selfevals/api/schemas.py +151 -0
  21. selfevals/api/sse.py +114 -0
  22. selfevals/cli/__init__.py +15 -0
  23. selfevals/cli/_friendly.py +180 -0
  24. selfevals/cli/_help.py +55 -0
  25. selfevals/cli/analyze_commands.py +169 -0
  26. selfevals/cli/commands.py +615 -0
  27. selfevals/cli/main.py +409 -0
  28. selfevals/decision/__init__.py +34 -0
  29. selfevals/decision/matrix.py +185 -0
  30. selfevals/examples/__init__.py +8 -0
  31. selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
  32. selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
  33. selfevals/examples/pingpong.py +21 -0
  34. selfevals/graders/__init__.py +46 -0
  35. selfevals/graders/base.py +54 -0
  36. selfevals/graders/calibration.py +145 -0
  37. selfevals/graders/deterministic.py +143 -0
  38. selfevals/graders/llm_judge.py +187 -0
  39. selfevals/graders/registry.py +66 -0
  40. selfevals/optimization/__init__.py +47 -0
  41. selfevals/optimization/aggregator.py +246 -0
  42. selfevals/optimization/loop.py +432 -0
  43. selfevals/optimization/proposers.py +202 -0
  44. selfevals/py.typed +0 -0
  45. selfevals/repo/__init__.py +28 -0
  46. selfevals/repo/loader.py +276 -0
  47. selfevals/reporter/__init__.py +21 -0
  48. selfevals/reporter/_metrics.py +114 -0
  49. selfevals/reporter/compare.py +221 -0
  50. selfevals/reporter/json_report.py +105 -0
  51. selfevals/reporter/markdown.py +232 -0
  52. selfevals/runner/__init__.py +42 -0
  53. selfevals/runner/adapters.py +268 -0
  54. selfevals/runner/executor.py +234 -0
  55. selfevals/runner/otlp_receiver.py +343 -0
  56. selfevals/runner/otlp_to_recorder.py +180 -0
  57. selfevals/runner/sandbox.py +46 -0
  58. selfevals/schemas/__init__.py +213 -0
  59. selfevals/schemas/_base.py +82 -0
  60. selfevals/schemas/annotation.py +55 -0
  61. selfevals/schemas/dataset.py +111 -0
  62. selfevals/schemas/enums.py +324 -0
  63. selfevals/schemas/eval_case.py +189 -0
  64. selfevals/schemas/experiment.py +367 -0
  65. selfevals/schemas/failure_mode.py +76 -0
  66. selfevals/schemas/fleet.py +111 -0
  67. selfevals/schemas/grader_card.py +112 -0
  68. selfevals/schemas/iteration.py +219 -0
  69. selfevals/schemas/registry.py +125 -0
  70. selfevals/schemas/tool.py +43 -0
  71. selfevals/schemas/trace.py +384 -0
  72. selfevals/schemas/workspace.py +69 -0
  73. selfevals/sdk/__init__.py +24 -0
  74. selfevals/sdk/auto_instrument.py +165 -0
  75. selfevals/sdk/context.py +45 -0
  76. selfevals/sdk/exporter.py +50 -0
  77. selfevals/sdk/facade.py +203 -0
  78. selfevals/skills/__init__.py +61 -0
  79. selfevals/storage/__init__.py +53 -0
  80. selfevals/storage/errors.py +66 -0
  81. selfevals/storage/filesystem.py +137 -0
  82. selfevals/storage/interface.py +135 -0
  83. selfevals/storage/migrations/__init__.py +80 -0
  84. selfevals/storage/migrations/m0001_initial.py +57 -0
  85. selfevals/storage/seed.py +199 -0
  86. selfevals/storage/sqlite.py +232 -0
  87. selfevals/trace/__init__.py +31 -0
  88. selfevals/trace/otel_importer.py +455 -0
  89. selfevals/trace/payload_router.py +106 -0
  90. selfevals/trace/recorder.py +540 -0
  91. selfevals/version.py +1 -0
  92. selfevals-0.2.2.dist-info/METADATA +283 -0
  93. selfevals-0.2.2.dist-info/RECORD +96 -0
  94. selfevals-0.2.2.dist-info/WHEEL +4 -0
  95. selfevals-0.2.2.dist-info/entry_points.txt +2 -0
  96. selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
@@ -0,0 +1,447 @@
1
+ """Read queries over the SQLite store, shaped for the web UI.
2
+
3
+ We don't add an ORM. We open a `WorkspaceScope` per request, list
4
+ entities, and project them into the view models in
5
+ `selfevals.api.schemas`. The single non-trivial bit is rebuilding
6
+ the `OptimizationResult` JSON via the existing reconstruction helper
7
+ in `cli.commands` so the web reuses the exact same shape the
8
+ reporter emits.
9
+ """
10
+
11
+ from __future__ import annotations
12
+
13
+ import json
14
+ from collections.abc import Sequence
15
+ from datetime import datetime
16
+ from typing import Any
17
+
18
+ from pydantic import BaseModel
19
+
20
+ from selfevals.api.schemas import (
21
+ ExperimentDetailResponse,
22
+ ExperimentSummary,
23
+ IterationSummary,
24
+ SpanSummary,
25
+ ThreadResponse,
26
+ ThreadTurn,
27
+ TraceResponse,
28
+ WorkspaceResponse,
29
+ WorkspaceSummary,
30
+ )
31
+ from selfevals.cli.commands import (
32
+ _experiment_decisions,
33
+ _experiment_iterations,
34
+ _reconstruct_result,
35
+ )
36
+ from selfevals.reporter import render_json
37
+ from selfevals.schemas.experiment import Experiment
38
+ from selfevals.schemas.iteration import DecisionRecord, IterationRecord
39
+ from selfevals.schemas.trace import Trace
40
+ from selfevals.schemas.workspace import Workspace
41
+ from selfevals.storage.interface import ListFilter
42
+ from selfevals.storage.sqlite import SQLiteStorage
43
+
44
+
45
class AnchorPoint(BaseModel):
    """One (experiment, iteration) data point of the longitudinal metric view."""

    # Experiment the point belongs to.
    experiment_id: str
    experiment_name: str
    # Iteration number of the record the point was built from.
    iteration: int
    # Name and value of that iteration's primary metric.
    primary_metric_name: str
    primary_metric_value: float
    # Stringified decision outcome; "unknown" when no decision exists for the iteration.
    decision_outcome: str
    # ISO-8601 string of the iteration record's `created_at`.
    created_at: str
53
+
54
+
55
def list_workspaces(storage: SQLiteStorage) -> list[WorkspaceSummary]:
    """Summarise every workspace in the store, newest first.

    The typed storage interface is deliberately workspace-scoped and offers
    no cross-workspace listing, so this goes straight to the SQLite
    connection. Each summary carries the workspace's experiment count and
    the latest `updated_at` among its iteration records.
    """
    conn = storage.connection

    def _summarise(raw_payload: str) -> WorkspaceSummary:
        workspace = Workspace.model_validate(json.loads(raw_payload))
        (experiment_total,) = conn.execute(
            "SELECT COUNT(1) FROM entities WHERE entity_type = 'Experiment' AND workspace_id = ?",
            (workspace.id,),
        ).fetchone()
        (latest_run,) = conn.execute(
            "SELECT MAX(updated_at) FROM entities "
            "WHERE entity_type = 'IterationRecord' AND workspace_id = ?",
            (workspace.id,),
        ).fetchone()
        return WorkspaceSummary(
            id=workspace.id,
            slug=workspace.slug,
            name=workspace.name,
            description=workspace.description,
            owner_id=workspace.owner_id,
            created_at=workspace.created_at,
            experiment_count=int(experiment_total or 0),
            last_run_at=latest_run,
        )

    workspace_rows = conn.execute(
        "SELECT payload FROM entities WHERE entity_type = 'Workspace' ORDER BY created_at DESC"
    ).fetchall()
    return [_summarise(row[0]) for row in workspace_rows]
86
+
87
+
88
def workspace_detail(storage: SQLiteStorage, *, workspace_id: str) -> WorkspaceResponse | None:
    """Build the detail view of a single workspace.

    Returns ``None`` when anything raises while the workspace, its
    experiments, or its iteration records are being read.

    ``recent_health`` is the share of the listed iteration records (at most
    20, listed with ``order_by="updated_at"``) whose decision outcome is
    ``keep_candidate``, rounded to three decimals. It is ``None`` when no
    iteration record was listed.
    """
    try:
        with storage.open(workspace_id) as scope:
            workspace = scope.get_entity(Workspace, workspace_id)
            assert isinstance(workspace, Workspace)
            experiment_total = sum(
                1
                for entity in scope.list_entities(Experiment, ListFilter())
                if isinstance(entity, Experiment)
            )
            window = [
                record
                for record in scope.list_entities(
                    IterationRecord,
                    ListFilter(order_by="updated_at", limit=20),
                )
                if isinstance(record, IterationRecord)
            ]
    except Exception:
        return None

    kept = [
        record
        for record in window
        if record.decision is not None and str(record.decision.outcome) == "keep_candidate"
    ]
    health: float | None = round(len(kept) / len(window), 3) if window else None

    return WorkspaceResponse(
        id=workspace.id,
        slug=workspace.slug,
        name=workspace.name,
        description=workspace.description,
        owner_id=workspace.owner_id,
        created_at=workspace.created_at,
        experiment_count=experiment_total,
        recent_health=health,
    )
126
+
127
+
128
def list_experiments(
    storage: SQLiteStorage, *, workspace_id: str, limit: int = 100
) -> list[dict[str, Any]]:
    """List up to `limit` experiments of a workspace as summary dicts.

    Args:
        storage: Open SQLite storage handle.
        workspace_id: Workspace whose experiments are listed.
        limit: Maximum number of experiments to return.

    Returns:
        One summary dict per experiment (see `_experiment_summary_dict`),
        each carrying the number of iteration records that reference it.
    """
    with storage.open(workspace_id) as scope:
        experiments = [
            e
            for e in scope.list_entities(Experiment, ListFilter(order_by="updated_at", limit=limit))
            if isinstance(e, Experiment)
        ]
        if not experiments:
            return []

        # Tally iteration records in a single pass. Listing them again for
        # every experiment made the cost grow with experiments x iterations.
        iteration_counts: dict[str, int] = dict.fromkeys((exp.id for exp in experiments), 0)
        for it in scope.list_entities(IterationRecord, ListFilter()):
            if isinstance(it, IterationRecord) and it.experiment_id in iteration_counts:
                iteration_counts[it.experiment_id] += 1

        return [
            _experiment_summary_dict(exp, iteration_count=iteration_counts[exp.id])
            for exp in experiments
        ]
146
+
147
+
148
def experiment_detail(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> ExperimentDetailResponse | None:
    """Assemble the detail view of one experiment.

    Returns ``None`` when fetching the experiment raises. The `result`
    field is the reporter's JSON rendering of the reconstructed
    optimization result, and is ``None`` when the experiment has no
    iteration records.
    """
    with storage.open(workspace_id) as scope:
        try:
            experiment = scope.get_entity(Experiment, experiment_id)
        except Exception:
            return None
        assert isinstance(experiment, Experiment)
        records = _experiment_iterations(scope, experiment.id)
        decision_map = _experiment_decisions(scope, experiment.id)

        rendered: dict[str, Any] | None = (
            json.loads(render_json(_reconstruct_result(experiment, records, decision_map)))
            if records
            else None
        )

        summary_fields = _experiment_summary_dict(experiment, iteration_count=len(records))
        return ExperimentDetailResponse(
            summary=ExperimentSummary(**summary_fields),
            result=rendered,
            iterations=_iteration_summaries(records, decision_map),
        )
171
+
172
+
173
def experiment_iterations(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> list[IterationSummary]:
    """Return the iteration summaries of one experiment, decisions attached."""
    with storage.open(workspace_id) as scope:
        records = _experiment_iterations(scope, experiment_id)
        decision_map = _experiment_decisions(scope, experiment_id)
        summaries = _iteration_summaries(records, decision_map)
    return summaries
180
+
181
+
182
def experiment_decisions(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> list[dict[str, Any]]:
    """Return the decisions of one experiment as plain dicts.

    Entries are ordered by ascending key of the decision mapping returned
    by `_experiment_decisions`.
    """

    def _as_dict(record: DecisionRecord) -> dict[str, Any]:
        human = record.rationale.human
        return {
            "id": record.id,
            "iteration": record.iteration,
            "outcome": str(record.outcome),
            "automated_rationale": record.rationale.automated,
            "human_rationale": (human.notes if human else None),
            "metrics_snapshot": record.metrics_snapshot,
            "created_at": record.created_at.isoformat(),
        }

    with storage.open(workspace_id) as scope:
        by_iteration = _experiment_decisions(scope, experiment_id)
        return [_as_dict(by_iteration[key]) for key in sorted(by_iteration)]
202
+
203
+
204
def iteration_detail(
    storage: SQLiteStorage, *, workspace_id: str, iteration_id: str
) -> dict[str, Any] | None:
    """Return one iteration record and its decision as JSON-mode dumps.

    Returns ``None`` when fetching the iteration record raises. The
    ``"decision"`` entry is ``None`` when the experiment has no decision
    keyed by this record's iteration number.
    """
    with storage.open(workspace_id) as scope:
        try:
            record = scope.get_entity(IterationRecord, iteration_id)
        except Exception:
            return None
        assert isinstance(record, IterationRecord)
        matching = _experiment_decisions(scope, record.experiment_id).get(record.iteration)
        decision_payload = matching.model_dump(mode="json") if matching else None
        return {
            "iteration": record.model_dump(mode="json"),
            "decision": decision_payload,
        }
219
+
220
+
221
def load_trace(storage: SQLiteStorage, *, workspace_id: str, trace_id: str) -> TraceResponse | None:
    """Resolve a Trace by entity id (`tr_...`) or by run id (`run_...`).

    Iteration records persist run ids while storage keys traces by entity
    id, so both forms arrive here. The entity-id lookup is tried first; if
    it raises or yields nothing, the workspace's Trace payloads are searched
    for a matching `run.run_id`. Returns ``None`` when neither lookup finds
    a trace.
    """
    with storage.open(workspace_id) as scope:
        try:
            found = scope.get_entity(Trace, trace_id)
        except Exception:
            found = None

        if found is None:
            # No index covers json_extract on the generic entities table;
            # a single scan of one workspace's traces is accepted here.
            match = storage.connection.execute(
                "SELECT payload FROM entities "
                "WHERE workspace_id = ? AND entity_type = 'Trace' "
                "AND json_extract(payload, '$.run.run_id') = ? LIMIT 1",
                (workspace_id, trace_id),
            ).fetchone()
            if match is None:
                return None
            found = Trace.model_validate(json.loads(match[0]))

        assert isinstance(found, Trace)
        run = found.run
        environment = found.environment
        return TraceResponse(
            id=found.id,
            run_id=run.run_id,
            experiment_id=run.experiment_id,
            iteration=run.iteration,
            thread_id=run.thread_id,
            thread_position=run.thread_position,
            final_state=str(found.final_state.status),
            started_at=environment.started_at,
            ended_at=environment.ended_at,
            spans=[_span_summary(span) for span in found.spans],
            metrics=found.metrics.model_dump(mode="json"),
        )
257
+
258
+
259
def load_thread(
    storage: SQLiteStorage, *, workspace_id: str, thread_id: str
) -> ThreadResponse | None:
    """Collect every Trace with the given `thread_id` into an ordered thread.

    Turns with an explicit `run.thread_position` come first, ordered by that
    position; the remaining turns follow in `environment.started_at` order.
    Each turn includes its grader results, and `primary_grade` is the label
    of the first grader result when there is one.

    Returns ``None`` when no trace in the workspace carries the thread id.
    """
    matches = storage.connection.execute(
        "SELECT payload FROM entities "
        "WHERE workspace_id = ? AND entity_type = 'Trace' "
        "AND json_extract(payload, '$.run.thread_id') = ?",
        (workspace_id, thread_id),
    ).fetchall()
    if not matches:
        return None

    def _ordering(candidate: Trace) -> tuple[int, int, datetime]:
        # The leading flag puts positioned turns ahead of unpositioned ones;
        # start time breaks ties and orders the unpositioned tail.
        position = candidate.run.thread_position
        if position is None:
            return (1, 0, candidate.environment.started_at)
        return (0, position, candidate.environment.started_at)

    ordered = sorted(
        (Trace.model_validate(json.loads(row[0])) for row in matches),
        key=_ordering,
    )

    def _turn(index: int, item: Trace) -> ThreadTurn:
        explicit = item.run.thread_position
        results = item.grader_results
        return ThreadTurn(
            trace_id=item.id,
            run_id=item.run.run_id,
            position=index if explicit is None else explicit,
            experiment_id=item.run.experiment_id,
            iteration=item.run.iteration,
            final_state=str(item.final_state.status),
            started_at=item.environment.started_at,
            ended_at=item.environment.ended_at,
            primary_grade=results[0].label if results else None,
            grader_results=[grade.model_dump(mode="json") for grade in item.grader_results],
            metrics=item.metrics.model_dump(mode="json"),
        )

    turns = [_turn(index, item) for index, item in enumerate(ordered)]
    return ThreadResponse(thread_id=thread_id, turn_count=len(turns), turns=turns)
310
+
311
+
312
def anchor_set_history(storage: SQLiteStorage, *, workspace_id: str) -> list[AnchorPoint]:
    """Longitudinal view: primary-metric points across a workspace.

    A true anchor set needs repeated reruns of a canonical case set. Until
    that exists, this emits one point for every iteration that has metrics,
    across all experiments of the workspace, so the chart has shape.
    Iterations without a recorded decision get the outcome ``"unknown"``.
    Points are ordered by their `created_at` ISO string.
    """

    def _point(
        experiment: Experiment, record: IterationRecord, decision: DecisionRecord | None
    ) -> AnchorPoint:
        return AnchorPoint(
            experiment_id=experiment.id,
            experiment_name=experiment.name,
            iteration=record.iteration,
            primary_metric_name=record.metrics.primary.name,
            primary_metric_value=record.metrics.primary.value,
            decision_outcome=str(decision.outcome) if decision else "unknown",
            created_at=record.created_at.isoformat(),
        )

    with storage.open(workspace_id) as scope:
        all_experiments = [
            entity
            for entity in scope.list_entities(Experiment, ListFilter())
            if isinstance(entity, Experiment)
        ]
        collected: list[AnchorPoint] = []
        for experiment in all_experiments:
            records = _experiment_iterations(scope, experiment.id)
            decision_map = _experiment_decisions(scope, experiment.id)
            collected.extend(
                _point(experiment, record, decision_map.get(record.iteration))
                for record in records
                if record.metrics is not None
            )
        return sorted(collected, key=lambda point: point.created_at)
345
+
346
+
347
def _experiment_summary_dict(exp: Experiment, *, iteration_count: int) -> dict[str, Any]:
    """Flatten an Experiment into the dict shape of an experiment summary.

    Enum-like fields are stringified, timestamps become ISO-8601 strings,
    and the caller-supplied `iteration_count` is passed through unchanged.
    """
    primary_target = exp.target.primary
    summary: dict[str, Any] = {
        "id": exp.id,
        "name": exp.name,
        "goal": exp.goal,
        "mode": str(exp.mode),
        "state": str(exp.state),
        "primary_metric": primary_target.name,
    }
    summary["primary_target"] = {
        "operator": primary_target.operator,
        "value": primary_target.value,
    }
    summary["proposer_strategy"] = str(exp.proposer.strategy)
    summary["max_iterations"] = exp.run.max_iterations
    summary["created_at"] = exp.created_at.isoformat()
    summary["updated_at"] = exp.updated_at.isoformat()
    summary["iteration_count"] = iteration_count
    return summary
365
+
366
+
367
def _iteration_summaries(
    iterations: Sequence[IterationRecord],
    decisions: dict[int, DecisionRecord],
) -> list[IterationSummary]:
    """Project iteration records into summaries with a running-best delta.

    `delta_vs_best` is the primary metric value minus the largest value seen
    in earlier records of the sequence: ``0.0`` for the first record that has
    metrics, ``None`` for records without metrics. Decisions are matched by
    iteration number.
    """
    running_best: float | None = None
    summaries: list[IterationSummary] = []
    for record in iterations:
        primary = record.metrics.primary if record.metrics else None

        delta: float | None = None
        if primary is not None:
            if running_best is None:
                delta = 0.0
                running_best = primary.value
            else:
                delta = primary.value - running_best
                if primary.value > running_best:
                    running_best = primary.value

        decision = decisions.get(record.iteration)
        has_decision = decision is not None
        summaries.append(
            IterationSummary(
                id=record.id,
                iteration=record.iteration,
                state=str(record.state),
                hypothesis=record.hypothesis,
                proposed_parameters=dict(record.proposed_parameters),
                primary_metric_name=primary.name if primary else None,
                primary_metric_value=primary.value if primary else None,
                delta_vs_best=delta,
                decision_outcome=str(decision.outcome) if has_decision else None,
                decision_rationale=decision.rationale.automated if has_decision else None,
                cost_usd=record.cost_usd,
                duration_seconds=record.duration_seconds,
                trace_run_ids=list(record.execution.trace_run_ids),
                created_at=record.created_at,
            )
        )
    return summaries
400
+
401
+
402
def _span_summary(span: Any) -> SpanSummary:
    """Reduce a span of any kind to the trimmed shape the trace inspector uses.

    Kind, name, parent, and timing are always present. A fixed allow-list of
    kind-specific fields is copied from the span's JSON dump into `detail`,
    so the inspector can render without fetching the full payload.
    """
    allowed = frozenset(
        (
            "provider",
            "model",
            "params",
            "tokens",
            "cost_usd",
            "output",
            "reasoning",
            "tool_name",
            "tool_use_id",
            "status",
            "error",
            "retriever",
            "top_k_requested",
            "top_k_returned",
            "retrieved",
            "decision_type",
            "chosen",
            "alternatives_considered",
            "guardrail",
            "passed",
            "error_type",
            "message",
            "recoverable",
        )
    )
    dumped = span.model_dump(mode="json")
    # Iterating the dump (not the allow-list) keeps the payload's key order.
    selected: dict[str, Any] = {
        field: content for field, content in dumped.items() if field in allowed
    }
    return SpanSummary(
        id=span.id,
        parent_id=span.parent_id,
        kind=str(span.kind),
        name=span.name,
        started_at=span.started_at,
        duration_ms=span.duration_ms,
        detail=selected,
    )
@@ -0,0 +1,151 @@
1
+ """Pydantic response models for the HTTP bridge.
2
+
3
+ These are *view* shapes — denormalized snapshots of the canonical
4
+ entities that the web UI needs. The canonical schemas in
5
+ `selfevals.schemas` stay the source of truth; this module simply
6
+ chooses what to expose and in what shape.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from datetime import datetime
12
+ from typing import Any
13
+
14
+ from pydantic import BaseModel, Field
15
+
16
+
17
class HealthResponse(BaseModel):
    """Health payload: a status string plus a `db_path` string."""

    status: str
    # NOTE(review): presumably the path of the SQLite database in use — confirm against the app.
    db_path: str
20
+
21
+
22
class WorkspaceSummary(BaseModel):
    """One row of the cross-workspace listing."""

    id: str
    slug: str
    name: str
    description: str | None = None
    owner_id: str | None = None
    created_at: datetime
    # Number of Experiment entities stored for the workspace.
    experiment_count: int = 0
    # Latest `updated_at` among the workspace's iteration records; None when it has none.
    last_run_at: datetime | None = None
31
+
32
+
33
class WorkspaceListResponse(BaseModel):
    """Envelope wrapping a list of workspace summaries."""

    workspaces: list[WorkspaceSummary]
35
+
36
+
37
class WorkspaceResponse(BaseModel):
    """Detail view of a single workspace."""

    id: str
    slug: str
    name: str
    description: str | None = None
    owner_id: str | None = None
    created_at: datetime
    # Number of experiments found in the workspace.
    experiment_count: int
    # The value is computed over iteration records (a window of at most 20),
    # not over experiments, so the published description says so.
    recent_health: float | None = Field(
        default=None,
        description="Fraction of recent iterations that landed on keep_candidate.",
    )
49
+
50
+
51
class CreateWorkspaceRequest(BaseModel):
    """Request body for creating a workspace."""

    # Required; validated to be 1-63 characters long.
    slug: str = Field(min_length=1, max_length=63)
    name: str | None = None
    description: str | None = None
55
+
56
+
57
class IterationSummary(BaseModel):
    """Trimmed view of one iteration record, with its decision when one exists."""

    id: str
    iteration: int
    state: str
    hypothesis: str
    proposed_parameters: dict[str, Any] = Field(default_factory=dict)
    # Both primary-metric fields are None when the iteration has no metrics.
    primary_metric_name: str | None = None
    primary_metric_value: float | None = None
    # Primary value minus the best (largest) value of earlier iterations;
    # 0.0 for the first iteration with metrics.
    delta_vs_best: float | None = None
    # Decision fields are None when no decision is recorded for the iteration.
    decision_outcome: str | None = None
    decision_rationale: str | None = None
    cost_usd: float | None = None
    duration_seconds: float | None = None
    trace_run_ids: list[str] = Field(default_factory=list)
    created_at: datetime
72
+
73
+
74
class IterationListResponse(BaseModel):
    """Envelope wrapping a list of iteration summaries."""

    iterations: list[IterationSummary]
76
+
77
+
78
class ExperimentSummary(BaseModel):
    """Flattened view of an experiment for list and detail pages."""

    id: str
    name: str
    goal: str
    mode: str
    state: str
    # Name of the experiment's primary target metric.
    primary_metric: str
    # Holds the primary target's "operator" and "value".
    primary_target: dict[str, Any]
    proposer_strategy: str
    max_iterations: int
    created_at: datetime
    updated_at: datetime
    # Number of iteration records that reference the experiment.
    iteration_count: int = 0
91
+
92
+
93
class ExperimentDetailResponse(BaseModel):
    """Shape returned by GET /workspaces/{ws}/experiments/{id}.

    `result` is the JSON shape from `selfevals.reporter.render_json`
    when there is at least one completed iteration to reconstruct;
    `None` when the experiment has not run yet.
    """

    summary: ExperimentSummary
    # Parsed reporter JSON; stays None when there is nothing to reconstruct.
    result: dict[str, Any] | None = None
    iterations: list[IterationSummary] = Field(default_factory=list)
104
+
105
+
106
class SpanSummary(BaseModel):
    """Trimmed view of one trace span."""

    id: str
    # Required, but may be None.
    parent_id: str | None
    kind: str
    name: str
    started_at: datetime
    duration_ms: int
    # Kind-specific fields copied from the span's JSON dump.
    detail: dict[str, Any] = Field(default_factory=dict)
114
+
115
+
116
class TraceResponse(BaseModel):
    """View of one trace: run identity, final state, timing, spans, and metrics."""

    # Entity id of the trace; `run_id` is the id of the run it recorded.
    id: str
    run_id: str
    experiment_id: str | None
    iteration: int | None
    thread_id: str | None = None
    thread_position: int | None = None
    # Stringified status of the trace's final state.
    final_state: str
    started_at: datetime
    ended_at: datetime | None
    spans: list[SpanSummary]
    # JSON-mode dump of the trace's metrics.
    metrics: dict[str, Any]
128
+
129
+
130
class ThreadTurn(BaseModel):
    """One trace within a thread, projected as a turn for the thread view."""

    trace_id: str
    run_id: str
    # The trace's explicit thread position when set, else its index in the sorted thread.
    position: int
    experiment_id: str | None = None
    iteration: int | None = None
    final_state: str
    started_at: datetime
    ended_at: datetime | None = None
    # Label of the first grader result; None when the trace has no grader results.
    primary_grade: str | None = None
    # JSON-mode dumps of every grader result on the trace.
    grader_results: list[dict[str, Any]] = Field(default_factory=list)
    metrics: dict[str, Any] = Field(default_factory=dict)
144
+
145
+
146
class ThreadResponse(BaseModel):
    """All traces sharing a thread_id, assembled into an ordered conversation."""

    thread_id: str
    # Number of entries in `turns`.
    turn_count: int
    turns: list[ThreadTurn] = Field(default_factory=list)
selfevals/api/sse.py ADDED
@@ -0,0 +1,114 @@
1
+ """SSE stream of trace spans, with a heartbeat and a snapshot prelude.
2
+
3
+ Wire format (one frame is `<lines>\\n\\n`):
4
+
5
+ event: snapshot
6
+ data: {... TraceResponse JSON, or {"run_id": ..., "spans": []} when nothing is persisted yet ...}
7
+
8
+ event: span
9
+ data: {... one SpanSummary ...}
10
+
11
+ event: ping
12
+ data: 1
13
+
14
+ event: complete
15
+ data: {"final_state": "completed"}
16
+
17
+ The client subscribes via `new EventSource(url)`. Heartbeat every 15s
18
+ keeps proxies from idle-closing the connection.
19
+ """
20
+
21
+ from __future__ import annotations
22
+
23
+ import asyncio
24
+ import json
25
+ import logging
26
+ from collections.abc import AsyncIterator, Callable
27
+ from typing import Any
28
+
29
+ from fastapi.responses import StreamingResponse
30
+
31
+ from selfevals.api.broker import SpanBroker, _Closed
32
+ from selfevals.api.queries import load_trace
33
+ from selfevals.storage.sqlite import SQLiteStorage
34
+
35
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Seconds of silence after which a `ping` frame is sent.
_HEARTBEAT_SECONDS = 15.0
# Headers attached to every streaming response built by this module.
_HEADERS = {
    "Cache-Control": "no-cache, no-transform",
    "Connection": "keep-alive",
    "X-Accel-Buffering": "no",
}
43
+
44
+
45
+ def _encode(event: str, data: Any) -> bytes:
46
+ payload = data if isinstance(data, str) else json.dumps(data, default=str)
47
+ return f"event: {event}\ndata: {payload}\n\n".encode()
48
+
49
+
50
async def stream_trace(
    *,
    workspace_id: str,
    run_id: str,
    broker: SpanBroker,
    storage_factory: Callable[[], SQLiteStorage],
) -> StreamingResponse:
    """Build a StreamingResponse that emits snapshot + live spans.

    Args:
        workspace_id: Workspace the run belongs to.
        run_id: Run to stream; also used as the lookup key for the snapshot.
        broker: Source of the live subscription for this run.
        storage_factory: Called once per stream to obtain a storage handle,
            which is closed again as soon as the snapshot has been read.

    Returns:
        A `text/event-stream` response that emits one `snapshot` frame,
        then `span` frames interleaved with `ping` heartbeats, and a final
        `complete` frame when the subscription yields a `_Closed` marker.
    """

    async def gen() -> AsyncIterator[bytes]:
        # 1. Initial snapshot from SQLite (may be None if the run hasn't
        #    persisted yet; that's fine — the client gets an empty
        #    snapshot and waits for live spans).
        storage = storage_factory()
        try:
            snapshot = load_trace(storage, workspace_id=workspace_id, trace_id=run_id)
        finally:
            storage.close()
        if snapshot is not None:
            yield _encode("snapshot", snapshot.model_dump(mode="json"))
        else:
            yield _encode("snapshot", {"run_id": run_id, "spans": []})

        # 2. Live subscription.
        sub = broker.subscribe(workspace_id, run_id)
        agen = sub.__aiter__()

        async def _next() -> dict[str, Any] | _Closed:
            # Wrapped in a coroutine so the pending read can be scheduled as a task.
            return await agen.__anext__()

        heartbeat_task: asyncio.Task[None] | None = None
        next_event_task: asyncio.Task[dict[str, Any] | _Closed] | None = None
        try:
            while True:
                # A slot set to None means that task was consumed on the
                # previous pass and must be re-armed.
                if next_event_task is None:
                    next_event_task = asyncio.create_task(_next())
                if heartbeat_task is None:
                    heartbeat_task = asyncio.create_task(asyncio.sleep(_HEARTBEAT_SECONDS))
                done, _pending = await asyncio.wait(
                    {next_event_task, heartbeat_task},
                    return_when=asyncio.FIRST_COMPLETED,
                )
                # Both tasks may be done on the same pass; the ping goes out first.
                if heartbeat_task in done:
                    heartbeat_task = None
                    yield _encode("ping", "1")
                if next_event_task in done:
                    try:
                        event = next_event_task.result()
                    except StopAsyncIteration:
                        # The subscription ended without a _Closed marker:
                        # stop without emitting a `complete` frame.
                        return
                    next_event_task = None
                    if isinstance(event, _Closed):
                        yield _encode("complete", {"final_state": event.final_state})
                        return
                    yield _encode("span", event)
        finally:
            # Runs on every exit path; cancels whichever task is still outstanding.
            for task in (heartbeat_task, next_event_task):
                if task is not None and not task.done():
                    task.cancel()

    return StreamingResponse(
        gen(),
        media_type="text/event-stream",
        headers=_HEADERS,
    )