selfevals 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
- selfevals/__init__.py +19 -0
- selfevals/_errors.py +44 -0
- selfevals/_internal/__init__.py +0 -0
- selfevals/_internal/hashing.py +23 -0
- selfevals/_internal/ids.py +65 -0
- selfevals/_internal/time.py +17 -0
- selfevals/analysis/__init__.py +23 -0
- selfevals/analysis/bundle.py +162 -0
- selfevals/analysis/hypothesis.py +26 -0
- selfevals/analysis/ingest.py +185 -0
- selfevals/analysis/schemas.py +119 -0
- selfevals/analysis/staging.py +34 -0
- selfevals/api/__init__.py +24 -0
- selfevals/api/__main__.py +47 -0
- selfevals/api/app.py +351 -0
- selfevals/api/broker.py +210 -0
- selfevals/api/broker_bridge.py +29 -0
- selfevals/api/queries.py +447 -0
- selfevals/api/schemas.py +151 -0
- selfevals/api/sse.py +114 -0
- selfevals/cli/__init__.py +15 -0
- selfevals/cli/_friendly.py +180 -0
- selfevals/cli/_help.py +55 -0
- selfevals/cli/analyze_commands.py +169 -0
- selfevals/cli/commands.py +615 -0
- selfevals/cli/main.py +409 -0
- selfevals/decision/__init__.py +34 -0
- selfevals/decision/matrix.py +185 -0
- selfevals/examples/__init__.py +8 -0
- selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
- selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
- selfevals/examples/pingpong.py +21 -0
- selfevals/graders/__init__.py +46 -0
- selfevals/graders/base.py +54 -0
- selfevals/graders/calibration.py +145 -0
- selfevals/graders/deterministic.py +143 -0
- selfevals/graders/llm_judge.py +187 -0
- selfevals/graders/registry.py +66 -0
- selfevals/optimization/__init__.py +47 -0
- selfevals/optimization/aggregator.py +246 -0
- selfevals/optimization/loop.py +432 -0
- selfevals/optimization/proposers.py +202 -0
- selfevals/py.typed +0 -0
- selfevals/repo/__init__.py +28 -0
- selfevals/repo/loader.py +276 -0
- selfevals/reporter/__init__.py +21 -0
- selfevals/reporter/_metrics.py +114 -0
- selfevals/reporter/compare.py +221 -0
- selfevals/reporter/json_report.py +105 -0
- selfevals/reporter/markdown.py +232 -0
- selfevals/runner/__init__.py +42 -0
- selfevals/runner/adapters.py +268 -0
- selfevals/runner/executor.py +234 -0
- selfevals/runner/otlp_receiver.py +343 -0
- selfevals/runner/otlp_to_recorder.py +180 -0
- selfevals/runner/sandbox.py +46 -0
- selfevals/schemas/__init__.py +213 -0
- selfevals/schemas/_base.py +82 -0
- selfevals/schemas/annotation.py +55 -0
- selfevals/schemas/dataset.py +111 -0
- selfevals/schemas/enums.py +324 -0
- selfevals/schemas/eval_case.py +189 -0
- selfevals/schemas/experiment.py +367 -0
- selfevals/schemas/failure_mode.py +76 -0
- selfevals/schemas/fleet.py +111 -0
- selfevals/schemas/grader_card.py +112 -0
- selfevals/schemas/iteration.py +219 -0
- selfevals/schemas/registry.py +125 -0
- selfevals/schemas/tool.py +43 -0
- selfevals/schemas/trace.py +384 -0
- selfevals/schemas/workspace.py +69 -0
- selfevals/sdk/__init__.py +24 -0
- selfevals/sdk/auto_instrument.py +165 -0
- selfevals/sdk/context.py +45 -0
- selfevals/sdk/exporter.py +50 -0
- selfevals/sdk/facade.py +203 -0
- selfevals/skills/__init__.py +61 -0
- selfevals/storage/__init__.py +53 -0
- selfevals/storage/errors.py +66 -0
- selfevals/storage/filesystem.py +137 -0
- selfevals/storage/interface.py +135 -0
- selfevals/storage/migrations/__init__.py +80 -0
- selfevals/storage/migrations/m0001_initial.py +57 -0
- selfevals/storage/seed.py +199 -0
- selfevals/storage/sqlite.py +232 -0
- selfevals/trace/__init__.py +31 -0
- selfevals/trace/otel_importer.py +455 -0
- selfevals/trace/payload_router.py +106 -0
- selfevals/trace/recorder.py +540 -0
- selfevals/version.py +1 -0
- selfevals-0.2.2.dist-info/METADATA +283 -0
- selfevals-0.2.2.dist-info/RECORD +96 -0
- selfevals-0.2.2.dist-info/WHEEL +4 -0
- selfevals-0.2.2.dist-info/entry_points.txt +2 -0
- selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
selfevals/api/queries.py
ADDED
|
@@ -0,0 +1,447 @@
|
|
|
1
|
+
"""Read queries over the SQLite store, shaped for the web UI.
|
|
2
|
+
|
|
3
|
+
We don't add an ORM. We open a `WorkspaceScope` per request, list
|
|
4
|
+
entities, and project them into the view models in
|
|
5
|
+
`selfevals.api.schemas`. The single non-trivial bit is rebuilding
|
|
6
|
+
the `OptimizationResult` JSON via the existing reconstruction helper
|
|
7
|
+
in `cli.commands` so the web reuses the exact same shape the
|
|
8
|
+
reporter emits.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import json
|
|
14
|
+
from collections.abc import Sequence
|
|
15
|
+
from datetime import datetime
|
|
16
|
+
from typing import Any
|
|
17
|
+
|
|
18
|
+
from pydantic import BaseModel
|
|
19
|
+
|
|
20
|
+
from selfevals.api.schemas import (
|
|
21
|
+
ExperimentDetailResponse,
|
|
22
|
+
ExperimentSummary,
|
|
23
|
+
IterationSummary,
|
|
24
|
+
SpanSummary,
|
|
25
|
+
ThreadResponse,
|
|
26
|
+
ThreadTurn,
|
|
27
|
+
TraceResponse,
|
|
28
|
+
WorkspaceResponse,
|
|
29
|
+
WorkspaceSummary,
|
|
30
|
+
)
|
|
31
|
+
from selfevals.cli.commands import (
|
|
32
|
+
_experiment_decisions,
|
|
33
|
+
_experiment_iterations,
|
|
34
|
+
_reconstruct_result,
|
|
35
|
+
)
|
|
36
|
+
from selfevals.reporter import render_json
|
|
37
|
+
from selfevals.schemas.experiment import Experiment
|
|
38
|
+
from selfevals.schemas.iteration import DecisionRecord, IterationRecord
|
|
39
|
+
from selfevals.schemas.trace import Trace
|
|
40
|
+
from selfevals.schemas.workspace import Workspace
|
|
41
|
+
from selfevals.storage.interface import ListFilter
|
|
42
|
+
from selfevals.storage.sqlite import SQLiteStorage
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class AnchorPoint(BaseModel):
    """One data point in the series returned by `anchor_set_history`.

    Each point pairs an experiment with one of its iterations and that
    iteration's primary metric.
    """

    # Id and display name of the experiment the iteration belongs to.
    experiment_id: str
    experiment_name: str
    # Iteration number within the experiment.
    iteration: int
    # Name and value of the iteration's primary metric.
    primary_metric_name: str
    primary_metric_value: float
    # Decision outcome rendered as a string; `anchor_set_history` uses
    # "unknown" when no decision exists for the iteration.
    decision_outcome: str
    # Timestamp string produced with `datetime.isoformat()`.
    created_at: str
|
|
53
|
+
|
|
54
|
+
|
|
55
|
+
def list_workspaces(storage: SQLiteStorage) -> list[WorkspaceSummary]:
    """Summarize every workspace in the store, newest first.

    The typed storage interface is workspace-scoped and cannot enumerate
    workspaces, so this reads the `entities` table directly. For each
    workspace it also looks up the number of experiments and the latest
    `updated_at` among its iteration records.
    """

    def _scalar(sql: str, ws_id: str) -> Any:
        # First column of the single row produced by a per-workspace aggregate.
        return storage.connection.execute(sql, (ws_id,)).fetchone()[0]

    experiment_count_sql = (
        "SELECT COUNT(1) FROM entities "
        "WHERE entity_type = 'Experiment' AND workspace_id = ?"
    )
    last_run_sql = (
        "SELECT MAX(updated_at) FROM entities "
        "WHERE entity_type = 'IterationRecord' AND workspace_id = ?"
    )
    workspace_rows = storage.connection.execute(
        "SELECT payload FROM entities WHERE entity_type = 'Workspace' ORDER BY created_at DESC"
    ).fetchall()

    result: list[WorkspaceSummary] = []
    for row in workspace_rows:
        (raw,) = row
        workspace = Workspace.model_validate(json.loads(raw))
        n_experiments = _scalar(experiment_count_sql, workspace.id)
        # None when the workspace has no iteration records yet.
        latest_iteration_update = _scalar(last_run_sql, workspace.id)
        result.append(
            WorkspaceSummary(
                id=workspace.id,
                slug=workspace.slug,
                name=workspace.name,
                description=workspace.description,
                owner_id=workspace.owner_id,
                created_at=workspace.created_at,
                experiment_count=int(n_experiments or 0),
                last_run_at=latest_iteration_update,
            )
        )
    return result
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
def workspace_detail(storage: SQLiteStorage, *, workspace_id: str) -> WorkspaceResponse | None:
    """Return the detail view for one workspace, or None if it cannot be loaded.

    Args:
        storage: Open SQLite storage.
        workspace_id: Id of the workspace to load; also used to open the scope.

    Returns:
        A `WorkspaceResponse` with the experiment count and `recent_health`,
        or None when the lookup fails or does not yield a `Workspace`.

    `recent_health` is the share of the listed iteration records (at most 20,
    ordered by `updated_at`) whose decision outcome is "keep_candidate",
    rounded to three decimals. It is None when there are no iteration records.
    """
    try:
        with storage.open(workspace_id) as scope:
            ws = scope.get_entity(Workspace, workspace_id)
            # Explicit check instead of `assert`: asserts are stripped under
            # `python -O`, which would let a non-Workspace value escape the
            # try block and fail later with an AttributeError.
            if not isinstance(ws, Workspace):
                return None
            experiments: Sequence[Experiment] = [
                e
                for e in scope.list_entities(Experiment, ListFilter())
                if isinstance(e, Experiment)
            ]
            recent_iterations = [
                it
                for it in scope.list_entities(
                    IterationRecord,
                    ListFilter(order_by="updated_at", limit=20),
                )
                if isinstance(it, IterationRecord)
            ]
    except Exception:
        # Deliberately broad: any storage failure is reported to the caller
        # as "workspace not available" rather than propagated.
        return None

    keep_count = sum(
        1
        for it in recent_iterations
        if it.decision is not None and str(it.decision.outcome) == "keep_candidate"
    )
    recent_health: float | None = None
    if recent_iterations:
        recent_health = round(keep_count / len(recent_iterations), 3)
    return WorkspaceResponse(
        id=ws.id,
        slug=ws.slug,
        name=ws.name,
        description=ws.description,
        owner_id=ws.owner_id,
        created_at=ws.created_at,
        experiment_count=len(experiments),
        recent_health=recent_health,
    )
|
|
126
|
+
|
|
127
|
+
|
|
128
|
+
def list_experiments(
    storage: SQLiteStorage, *, workspace_id: str, limit: int = 100
) -> list[dict[str, Any]]:
    """List experiment summaries for a workspace.

    Args:
        storage: Open SQLite storage.
        workspace_id: Workspace whose experiments are listed.
        limit: Maximum number of experiments requested from storage.

    Returns:
        One summary dict per experiment (see `_experiment_summary_dict`),
        each carrying the number of iteration records that reference it.
    """
    with storage.open(workspace_id) as scope:
        experiments = [
            e
            for e in scope.list_entities(Experiment, ListFilter(order_by="updated_at", limit=limit))
            if isinstance(e, Experiment)
        ]
        if not experiments:
            # Nothing to count; skip the iteration scan entirely.
            return []

        # Tally iterations per experiment in a single pass instead of
        # re-listing every IterationRecord once per experiment.
        iteration_counts: dict[str, int] = {}
        for it in scope.list_entities(IterationRecord, ListFilter()):
            if isinstance(it, IterationRecord):
                iteration_counts[it.experiment_id] = iteration_counts.get(it.experiment_id, 0) + 1

        return [
            _experiment_summary_dict(exp, iteration_count=iteration_counts.get(exp.id, 0))
            for exp in experiments
        ]
|
|
146
|
+
|
|
147
|
+
|
|
148
|
+
def experiment_detail(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> ExperimentDetailResponse | None:
    """Assemble the detail view of one experiment.

    Returns None when fetching the experiment raises. Otherwise the response
    carries the experiment summary, the per-iteration summaries and, when at
    least one iteration exists, the reporter's JSON rendering of the
    reconstructed result.
    """
    with storage.open(workspace_id) as scope:
        try:
            exp = scope.get_entity(Experiment, experiment_id)
        except Exception:
            return None
        assert isinstance(exp, Experiment)
        iterations = _experiment_iterations(scope, exp.id)
        decisions = _experiment_decisions(scope, exp.id)

    # Round-trip through render_json so the payload has the same shape the
    # reporter emits.
    rendered: dict[str, Any] | None = (
        json.loads(render_json(_reconstruct_result(exp, iterations, decisions)))
        if iterations
        else None
    )

    summary_fields = _experiment_summary_dict(exp, iteration_count=len(iterations))
    return ExperimentDetailResponse(
        summary=ExperimentSummary(**summary_fields),
        result=rendered,
        iterations=_iteration_summaries(iterations, decisions),
    )
|
|
171
|
+
|
|
172
|
+
|
|
173
|
+
def experiment_iterations(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> list[IterationSummary]:
    """Return the iteration summaries of one experiment.

    Loads the experiment's iteration records and decisions from the workspace
    scope and projects them with `_iteration_summaries`.
    """
    with storage.open(workspace_id) as scope:
        records = _experiment_iterations(scope, experiment_id)
        decisions_by_iteration = _experiment_decisions(scope, experiment_id)
    return _iteration_summaries(records, decisions_by_iteration)
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def experiment_decisions(
    storage: SQLiteStorage, *, workspace_id: str, experiment_id: str
) -> list[dict[str, Any]]:
    """Return the decisions of one experiment as plain dicts.

    The rows are ordered by ascending key of the mapping returned by
    `_experiment_decisions`.
    """
    with storage.open(workspace_id) as scope:
        by_iteration = _experiment_decisions(scope, experiment_id)

    def _as_row(record: DecisionRecord) -> dict[str, Any]:
        # Flatten one decision; human notes are optional.
        return {
            "id": record.id,
            "iteration": record.iteration,
            "outcome": str(record.outcome),
            "automated_rationale": record.rationale.automated,
            "human_rationale": (record.rationale.human.notes if record.rationale.human else None),
            "metrics_snapshot": record.metrics_snapshot,
            "created_at": record.created_at.isoformat(),
        }

    return [_as_row(by_iteration[number]) for number in sorted(by_iteration)]
|
|
202
|
+
|
|
203
|
+
|
|
204
|
+
def iteration_detail(
    storage: SQLiteStorage, *, workspace_id: str, iteration_id: str
) -> dict[str, Any] | None:
    """Return one iteration record and its decision as JSON-ready dicts.

    Returns None when fetching the iteration raises. The "decision" entry is
    None when the experiment has no decision for this iteration number.
    """
    with storage.open(workspace_id) as scope:
        try:
            record = scope.get_entity(IterationRecord, iteration_id)
        except Exception:
            return None
        assert isinstance(record, IterationRecord)
        by_iteration = _experiment_decisions(scope, record.experiment_id)

    matching = by_iteration.get(record.iteration)
    iteration_payload = record.model_dump(mode="json")
    if matching:
        decision_payload = matching.model_dump(mode="json")
    else:
        decision_payload = None
    return {"iteration": iteration_payload, "decision": decision_payload}
|
|
219
|
+
|
|
220
|
+
|
|
221
|
+
def load_trace(storage: SQLiteStorage, *, workspace_id: str, trace_id: str) -> TraceResponse | None:
    """Fetch a Trace by entity id (`tr_...`) or by run id (`run_...`).

    The entity-id lookup is tried first. If it raises or yields nothing, the
    workspace's Trace rows are searched for a payload whose `run.run_id`
    equals `trace_id`. Returns None when neither lookup finds a trace.
    """
    run_id_query = (
        "SELECT payload FROM entities "
        "WHERE workspace_id = ? AND entity_type = 'Trace' "
        "AND json_extract(payload, '$.run.run_id') = ? LIMIT 1"
    )
    with storage.open(workspace_id) as scope:
        try:
            found = scope.get_entity(Trace, trace_id)
        except Exception:
            found = None
        if found is None:
            # json_extract is not indexed on the generic entities table; the
            # original author considers one scan per workspace acceptable.
            hit = storage.connection.execute(run_id_query, (workspace_id, trace_id)).fetchone()
            if hit is None:
                return None
            found = Trace.model_validate(json.loads(hit[0]))
    assert isinstance(found, Trace)

    run = found.run
    environment = found.environment
    return TraceResponse(
        id=found.id,
        run_id=run.run_id,
        experiment_id=run.experiment_id,
        iteration=run.iteration,
        thread_id=run.thread_id,
        thread_position=run.thread_position,
        final_state=str(found.final_state.status),
        started_at=environment.started_at,
        ended_at=environment.ended_at,
        spans=[_span_summary(span) for span in found.spans],
        metrics=found.metrics.model_dump(mode="json"),
    )
|
|
257
|
+
|
|
258
|
+
|
|
259
|
+
def load_thread(
    storage: SQLiteStorage, *, workspace_id: str, thread_id: str
) -> ThreadResponse | None:
    """Collect the traces of one thread into an ordered list of turns.

    Traces with an explicit `run.thread_position` come first, ordered by that
    position; traces without one follow, ordered by
    `environment.started_at`. A turn without an explicit position reports its
    index in the sorted list instead. Every turn carries its grader results,
    and `primary_grade` is the label of the first one.

    Returns None when no trace in the workspace carries `thread_id`.
    """
    query = (
        "SELECT payload FROM entities "
        "WHERE workspace_id = ? AND entity_type = 'Trace' "
        "AND json_extract(payload, '$.run.thread_id') = ?"
    )
    matches = storage.connection.execute(query, (workspace_id, thread_id)).fetchall()
    if not matches:
        return None

    def _order(trace: Trace) -> tuple[int, int, datetime]:
        # Leading flag: 0 for positioned turns, 1 for unpositioned ones, so
        # the two groups never interleave.
        position = trace.run.thread_position
        started = trace.environment.started_at
        if position is None:
            return (1, 0, started)
        return (0, position, started)

    def _turn(index: int, trace: Trace) -> ThreadTurn:
        # Project one trace into the thread-view shape.
        results = trace.grader_results
        explicit = trace.run.thread_position
        return ThreadTurn(
            trace_id=trace.id,
            run_id=trace.run.run_id,
            position=explicit if explicit is not None else index,
            experiment_id=trace.run.experiment_id,
            iteration=trace.run.iteration,
            final_state=str(trace.final_state.status),
            started_at=trace.environment.started_at,
            ended_at=trace.environment.ended_at,
            primary_grade=results[0].label if results else None,
            grader_results=[result.model_dump(mode="json") for result in results],
            metrics=trace.metrics.model_dump(mode="json"),
        )

    ordered = sorted(
        [Trace.model_validate(json.loads(raw)) for (raw,) in matches],
        key=_order,
    )
    turns = [_turn(index, trace) for index, trace in enumerate(ordered)]
    return ThreadResponse(thread_id=thread_id, turn_count=len(turns), turns=turns)
|
|
310
|
+
|
|
311
|
+
|
|
312
|
+
def anchor_set_history(storage: SQLiteStorage, *, workspace_id: str) -> list[AnchorPoint]:
    """Longitudinal view of primary-metric values across a workspace.

    A true anchor set needs repeated reruns of a canonical case set. Until
    that exists, this emits one point for every iteration that has metrics,
    across all experiments in the workspace, so the chart has shape.

    Points are sorted by their `created_at` ISO string. An iteration without
    a matching decision gets the outcome "unknown".
    """
    collected: list[AnchorPoint] = []
    with storage.open(workspace_id) as scope:
        experiments = [
            candidate
            for candidate in scope.list_entities(Experiment, ListFilter())
            if isinstance(candidate, Experiment)
        ]
        for exp in experiments:
            records = _experiment_iterations(scope, exp.id)
            by_iteration = _experiment_decisions(scope, exp.id)
            for record in records:
                if record.metrics is not None:
                    verdict = by_iteration.get(record.iteration)
                    collected.append(
                        AnchorPoint(
                            experiment_id=exp.id,
                            experiment_name=exp.name,
                            iteration=record.iteration,
                            primary_metric_name=record.metrics.primary.name,
                            primary_metric_value=record.metrics.primary.value,
                            decision_outcome=str(verdict.outcome) if verdict else "unknown",
                            created_at=record.created_at.isoformat(),
                        )
                    )
    return sorted(collected, key=lambda point: point.created_at)
|
|
345
|
+
|
|
346
|
+
|
|
347
|
+
def _experiment_summary_dict(exp: Experiment, *, iteration_count: int) -> dict[str, Any]:
    """Flatten an experiment into the dict shape used for `ExperimentSummary`.

    Enum-like fields are rendered with `str()`, timestamps with
    `isoformat()`. `iteration_count` is supplied by the caller.
    """
    primary = exp.target.primary
    target_view = {"operator": primary.operator, "value": primary.value}
    timestamps = {
        field: getattr(exp, field).isoformat() for field in ("created_at", "updated_at")
    }
    return {
        "id": exp.id,
        "name": exp.name,
        "goal": exp.goal,
        "mode": str(exp.mode),
        "state": str(exp.state),
        "primary_metric": primary.name,
        "primary_target": target_view,
        "proposer_strategy": str(exp.proposer.strategy),
        "max_iterations": exp.run.max_iterations,
        **timestamps,
        "iteration_count": iteration_count,
    }
|
|
365
|
+
|
|
366
|
+
|
|
367
|
+
def _iteration_summaries(
    iterations: Sequence[IterationRecord],
    decisions: dict[int, DecisionRecord],
) -> list[IterationSummary]:
    """Project iteration records into `IterationSummary` rows.

    `delta_vs_best` compares each iteration's primary metric with the running
    maximum of the earlier iterations: 0.0 for the first iteration that has
    metrics, None for iterations without metrics.

    NOTE(review): "best" is always the largest value seen so far, regardless
    of the experiment's target operator; confirm this is intended for
    metrics where lower is better.
    """
    summaries: list[IterationSummary] = []
    running_best: float | None = None
    for record in iterations:
        primary = record.metrics.primary if record.metrics else None

        delta: float | None = None
        if primary is not None:
            if running_best is None:
                delta = 0.0
                running_best = primary.value
            else:
                delta = primary.value - running_best
                if primary.value > running_best:
                    running_best = primary.value

        verdict = decisions.get(record.iteration)
        has_verdict = verdict is not None
        summaries.append(
            IterationSummary(
                id=record.id,
                iteration=record.iteration,
                state=str(record.state),
                hypothesis=record.hypothesis,
                proposed_parameters=dict(record.proposed_parameters),
                primary_metric_name=primary.name if primary else None,
                primary_metric_value=primary.value if primary else None,
                delta_vs_best=delta,
                decision_outcome=str(verdict.outcome) if has_verdict else None,
                decision_rationale=verdict.rationale.automated if has_verdict else None,
                cost_usd=record.cost_usd,
                duration_seconds=record.duration_seconds,
                trace_run_ids=list(record.execution.trace_run_ids),
                created_at=record.created_at,
            )
        )
    return summaries
|
|
400
|
+
|
|
401
|
+
|
|
402
|
+
def _span_summary(span: Any) -> SpanSummary:
    """Reduce any span to the trimmed shape used by the trace inspector.

    Id, parent, kind, name and timing are copied for every span. From the
    span's JSON dump, only the allow-listed kind-specific fields below are
    copied into `detail`, so the inspector can render without the full
    payload.
    """
    detail_fields = frozenset(
        (
            "provider",
            "model",
            "params",
            "tokens",
            "cost_usd",
            "output",
            "reasoning",
            "tool_name",
            "tool_use_id",
            "status",
            "error",
            "retriever",
            "top_k_requested",
            "top_k_returned",
            "retrieved",
            "decision_type",
            "chosen",
            "alternatives_considered",
            "guardrail",
            "passed",
            "error_type",
            "message",
            "recoverable",
        )
    )
    dumped = span.model_dump(mode="json")
    # Keys keep the order in which they appear in the dump.
    selected: dict[str, Any] = {
        field: value for field, value in dumped.items() if field in detail_fields
    }
    return SpanSummary(
        id=span.id,
        parent_id=span.parent_id,
        kind=str(span.kind),
        name=span.name,
        started_at=span.started_at,
        duration_ms=span.duration_ms,
        detail=selected,
    )
|
selfevals/api/schemas.py
ADDED
|
@@ -0,0 +1,151 @@
|
|
|
1
|
+
"""Pydantic response models for the HTTP bridge.
|
|
2
|
+
|
|
3
|
+
These are *view* shapes — denormalized snapshots of the canonical
|
|
4
|
+
entities that the web UI needs. The canonical schemas in
|
|
5
|
+
`selfevals.schemas` stay the source of truth; this module simply
|
|
6
|
+
chooses what to expose and in what shape.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from datetime import datetime
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class HealthResponse(BaseModel):
    """Response body carrying a status string and a database path."""

    # Status indicator; the set of possible values is not defined in this module.
    status: str
    # Path of the database as a string — presumably the SQLite file; confirm in the app.
    db_path: str
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class WorkspaceSummary(BaseModel):
    """Compact view of one workspace for list responses."""

    id: str
    slug: str
    name: str
    description: str | None = None
    owner_id: str | None = None
    created_at: datetime
    # Number of experiments in the workspace; defaults to 0.
    experiment_count: int = 0
    # Timestamp of the most recent run, or None if there has been none.
    last_run_at: datetime | None = None
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
class WorkspaceListResponse(BaseModel):
    """Envelope for a list of workspace summaries."""

    workspaces: list[WorkspaceSummary]
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
class WorkspaceResponse(BaseModel):
    """Detail view of a single workspace."""

    id: str
    slug: str
    name: str
    description: str | None = None
    owner_id: str | None = None
    created_at: datetime
    # Number of experiments in the workspace (required here, unlike the summary).
    experiment_count: int
    # NOTE(review): the description below speaks of "experiments", while
    # `selfevals.api.queries.workspace_detail` computes this value over
    # recent iteration records — confirm which wording is intended.
    recent_health: float | None = Field(
        default=None,
        description="Fraction of recent experiments that landed on keep_candidate.",
    )
|
|
49
|
+
|
|
50
|
+
|
|
51
|
+
class CreateWorkspaceRequest(BaseModel):
    """Request body for creating a workspace."""

    # Required slug, between 1 and 63 characters long.
    slug: str = Field(min_length=1, max_length=63)
    # Optional display name and description.
    name: str | None = None
    description: str | None = None
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class IterationSummary(BaseModel):
    """Flattened view of one iteration together with its decision."""

    id: str
    # Iteration number within its experiment.
    iteration: int
    state: str
    hypothesis: str
    proposed_parameters: dict[str, Any] = Field(default_factory=dict)
    # Primary metric of the iteration; both are None when it has no metrics.
    primary_metric_name: str | None = None
    primary_metric_value: float | None = None
    # Difference between this iteration's primary metric and the best value
    # among earlier iterations, as computed by
    # `selfevals.api.queries._iteration_summaries`.
    delta_vs_best: float | None = None
    # Outcome and automated rationale of the decision, if one exists.
    decision_outcome: str | None = None
    decision_rationale: str | None = None
    cost_usd: float | None = None
    duration_seconds: float | None = None
    # Run ids of the traces produced by this iteration.
    trace_run_ids: list[str] = Field(default_factory=list)
    created_at: datetime
|
|
72
|
+
|
|
73
|
+
|
|
74
|
+
class IterationListResponse(BaseModel):
    """Envelope for a list of iteration summaries."""

    iterations: list[IterationSummary]
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class ExperimentSummary(BaseModel):
    """Flattened view of one experiment for lists and detail headers."""

    id: str
    name: str
    goal: str
    # Enum-like values rendered as strings.
    mode: str
    state: str
    # Name of the primary target metric.
    primary_metric: str
    # Target for the primary metric; `selfevals.api.queries` fills it with
    # the keys "operator" and "value".
    primary_target: dict[str, Any]
    proposer_strategy: str
    max_iterations: int
    created_at: datetime
    updated_at: datetime
    # Number of iteration records belonging to the experiment; defaults to 0.
    iteration_count: int = 0
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
class ExperimentDetailResponse(BaseModel):
    """Shape returned by GET /workspaces/{ws}/experiments/{id}.

    `result` is the JSON shape from `selfevals.reporter.render_json`
    when there is at least one completed iteration to reconstruct;
    `None` when the experiment has not run yet.
    """

    # Header fields of the experiment.
    summary: ExperimentSummary
    # Reporter JSON for the reconstructed result; see the class docstring.
    result: dict[str, Any] | None = None
    # Per-iteration rows; empty when there are no iterations.
    iterations: list[IterationSummary] = Field(default_factory=list)
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
class SpanSummary(BaseModel):
    """Trimmed view of one trace span."""

    id: str
    # Id of the parent span; None is allowed but the field has no default.
    parent_id: str | None
    kind: str
    name: str
    started_at: datetime
    duration_ms: int
    # Kind-specific fields copied from the span's JSON dump.
    detail: dict[str, Any] = Field(default_factory=dict)
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
class TraceResponse(BaseModel):
    """View of one trace: run identity, timing, span summaries and metrics."""

    # Entity id of the trace and the id of the run it records.
    id: str
    run_id: str
    # Experiment and iteration the run belongs to; None is allowed.
    experiment_id: str | None
    iteration: int | None
    # Thread membership and turn position, when the run is part of a thread.
    thread_id: str | None = None
    thread_position: int | None = None
    # Final status of the run, rendered as a string.
    final_state: str
    started_at: datetime
    ended_at: datetime | None
    spans: list[SpanSummary]
    # JSON dump of the trace's metrics.
    metrics: dict[str, Any]
|
|
128
|
+
|
|
129
|
+
|
|
130
|
+
class ThreadTurn(BaseModel):
    """One trace within a thread, projected as a turn for the thread view."""

    trace_id: str
    run_id: str
    # Explicit thread position when the trace has one; otherwise
    # `selfevals.api.queries.load_thread` uses the turn's index in sorted order.
    position: int
    experiment_id: str | None = None
    iteration: int | None = None
    # Final status of the run, rendered as a string.
    final_state: str
    started_at: datetime
    ended_at: datetime | None = None
    # Label of the first grader result, or None when there are none.
    primary_grade: str | None = None
    # JSON dumps of all grader results and of the trace metrics.
    grader_results: list[dict[str, Any]] = Field(default_factory=list)
    metrics: dict[str, Any] = Field(default_factory=dict)
|
|
144
|
+
|
|
145
|
+
|
|
146
|
+
class ThreadResponse(BaseModel):
    """All traces sharing a thread_id, assembled into an ordered conversation."""

    thread_id: str
    # Number of entries in `turns`.
    turn_count: int
    # Turns in conversation order.
    turns: list[ThreadTurn] = Field(default_factory=list)
|
selfevals/api/sse.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""SSE stream of trace spans, with a heartbeat and a snapshot prelude.
|
|
2
|
+
|
|
3
|
+
Wire format (one frame is `<lines>\\n\\n`):
|
|
4
|
+
|
|
5
|
+
event: snapshot
|
|
6
|
+
data: {... full Trace JSON ...}
|
|
7
|
+
|
|
8
|
+
event: span
|
|
9
|
+
data: {... one SpanSummary ...}
|
|
10
|
+
|
|
11
|
+
event: ping
|
|
12
|
+
data: 1
|
|
13
|
+
|
|
14
|
+
event: complete
|
|
15
|
+
data: {"final_state": "completed"}
|
|
16
|
+
|
|
17
|
+
The client subscribes via `new EventSource(url)`. Heartbeat every 15s
|
|
18
|
+
keeps proxies from idle-closing the connection.
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import asyncio
|
|
24
|
+
import json
|
|
25
|
+
import logging
|
|
26
|
+
from collections.abc import AsyncIterator, Callable
|
|
27
|
+
from typing import Any
|
|
28
|
+
|
|
29
|
+
from fastapi.responses import StreamingResponse
|
|
30
|
+
|
|
31
|
+
from selfevals.api.broker import SpanBroker, _Closed
|
|
32
|
+
from selfevals.api.queries import load_trace
|
|
33
|
+
from selfevals.storage.sqlite import SQLiteStorage
|
|
34
|
+
|
|
35
|
+
logger = logging.getLogger(__name__)

# Interval in seconds between `ping` frames on an otherwise idle stream, so
# proxies do not close the connection as idle.
_HEARTBEAT_SECONDS = 15.0
# Response headers sent with every event stream: no caching or transformation
# by intermediaries and a keep-alive connection. `X-Accel-Buffering: no` is
# presumably there to disable response buffering in nginx-style reverse
# proxies — confirm against the deployment.
_HEADERS = {
    "Cache-Control": "no-cache, no-transform",
    "Connection": "keep-alive",
    "X-Accel-Buffering": "no",
}
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def _encode(event: str, data: Any) -> bytes:
    """Serialize one server-sent-events frame.

    Args:
        event: Event name written on the `event:` line.
        data: Payload. A `str` is sent as-is; anything else is JSON-encoded,
            with non-serializable values converted via `str`.

    Returns:
        The UTF-8 encoded frame, terminated by a blank line.

    A payload containing line breaks is emitted as one `data:` line per
    payload line. Embedding a raw line break in a single `data:` line would
    end the frame early or drop the continuation on the client. Single-line
    payloads produce exactly `event: <event>\\ndata: <payload>\\n\\n`.
    """
    payload = data if isinstance(data, str) else json.dumps(data, default=str)
    # The event-stream format treats CRLF, CR and LF as line terminators, so
    # normalize them all before splitting.
    lines = payload.replace("\r\n", "\n").replace("\r", "\n").split("\n")
    body = "".join(f"data: {line}\n" for line in lines)
    return f"event: {event}\n{body}\n".encode()
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
async def stream_trace(
    *,
    workspace_id: str,
    run_id: str,
    broker: SpanBroker,
    storage_factory: Callable[[], SQLiteStorage],
) -> StreamingResponse:
    """Build a StreamingResponse that emits snapshot + live spans.

    Args:
        workspace_id: Workspace the run belongs to.
        run_id: Run to stream; also used as the lookup key for the snapshot.
        broker: Source of live span events for the run.
        storage_factory: Creates a storage handle for the snapshot read; the
            handle is closed as soon as the snapshot has been loaded.

    Returns:
        A `text/event-stream` response that yields one `snapshot` frame, then
        `span` frames as events arrive, `ping` frames on the heartbeat
        interval, and a final `complete` frame when the broker signals
        closure.
    """

    async def gen() -> AsyncIterator[bytes]:
        # 1. Initial snapshot from SQLite (may be None if the run hasn't
        #    persisted yet; that's fine — the client gets an empty
        #    snapshot and waits for live spans).
        # NOTE(review): load_trace is a synchronous call made inside this
        # async generator — confirm the blocking read is acceptable here.
        storage = storage_factory()
        try:
            snapshot = load_trace(storage, workspace_id=workspace_id, trace_id=run_id)
        finally:
            storage.close()
        if snapshot is not None:
            yield _encode("snapshot", snapshot.model_dump(mode="json"))
        else:
            yield _encode("snapshot", {"run_id": run_id, "spans": []})

        # 2. Live subscription.
        # NOTE(review): the subscription starts after the snapshot was read;
        # whether spans published in between are replayed depends on the
        # broker — confirm.
        sub = broker.subscribe(workspace_id, run_id)
        agen = sub.__aiter__()

        async def _next() -> dict[str, Any] | _Closed:
            # Wrapped in a coroutine so the next event can run as a Task.
            return await agen.__anext__()

        heartbeat_task: asyncio.Task[None] | None = None
        next_event_task: asyncio.Task[dict[str, Any] | _Closed] | None = None
        try:
            while True:
                # Re-arm whichever task was consumed on the previous pass; a
                # task that is still pending is reused, not restarted.
                if next_event_task is None:
                    next_event_task = asyncio.create_task(_next())
                if heartbeat_task is None:
                    heartbeat_task = asyncio.create_task(asyncio.sleep(_HEARTBEAT_SECONDS))
                done, _pending = await asyncio.wait(
                    {next_event_task, heartbeat_task},
                    return_when=asyncio.FIRST_COMPLETED,
                )
                # Both tasks may be done at once, so each is checked on its own.
                if heartbeat_task in done:
                    heartbeat_task = None
                    yield _encode("ping", "1")
                if next_event_task in done:
                    try:
                        event = next_event_task.result()
                    except StopAsyncIteration:
                        # Subscription ended without a closing event.
                        return
                    next_event_task = None
                    if isinstance(event, _Closed):
                        yield _encode("complete", {"final_state": event.final_state})
                        return
                    yield _encode("span", event)
        finally:
            # Runs on normal completion and when the generator is closed
            # early; cancels whatever is still pending.
            for task in (heartbeat_task, next_event_task):
                if task is not None and not task.done():
                    task.cancel()

    return StreamingResponse(
        gen(),
        media_type="text/event-stream",
        headers=_HEADERS,
    )
|