selfevals 0.2.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- selfevals/.agents/skills/error-analysis/SKILL.md +149 -0
- selfevals/__init__.py +19 -0
- selfevals/_errors.py +44 -0
- selfevals/_internal/__init__.py +0 -0
- selfevals/_internal/hashing.py +23 -0
- selfevals/_internal/ids.py +65 -0
- selfevals/_internal/time.py +17 -0
- selfevals/analysis/__init__.py +23 -0
- selfevals/analysis/bundle.py +162 -0
- selfevals/analysis/hypothesis.py +26 -0
- selfevals/analysis/ingest.py +185 -0
- selfevals/analysis/schemas.py +119 -0
- selfevals/analysis/staging.py +34 -0
- selfevals/api/__init__.py +24 -0
- selfevals/api/__main__.py +47 -0
- selfevals/api/app.py +351 -0
- selfevals/api/broker.py +210 -0
- selfevals/api/broker_bridge.py +29 -0
- selfevals/api/queries.py +447 -0
- selfevals/api/schemas.py +151 -0
- selfevals/api/sse.py +114 -0
- selfevals/cli/__init__.py +15 -0
- selfevals/cli/_friendly.py +180 -0
- selfevals/cli/_help.py +55 -0
- selfevals/cli/analyze_commands.py +169 -0
- selfevals/cli/commands.py +615 -0
- selfevals/cli/main.py +409 -0
- selfevals/decision/__init__.py +34 -0
- selfevals/decision/matrix.py +185 -0
- selfevals/examples/__init__.py +8 -0
- selfevals/examples/evals/datasets/pingpong.jsonl +2 -0
- selfevals/examples/evals/experiments/example_pingpong.yaml +58 -0
- selfevals/examples/pingpong.py +21 -0
- selfevals/graders/__init__.py +46 -0
- selfevals/graders/base.py +54 -0
- selfevals/graders/calibration.py +145 -0
- selfevals/graders/deterministic.py +143 -0
- selfevals/graders/llm_judge.py +187 -0
- selfevals/graders/registry.py +66 -0
- selfevals/optimization/__init__.py +47 -0
- selfevals/optimization/aggregator.py +246 -0
- selfevals/optimization/loop.py +432 -0
- selfevals/optimization/proposers.py +202 -0
- selfevals/py.typed +0 -0
- selfevals/repo/__init__.py +28 -0
- selfevals/repo/loader.py +276 -0
- selfevals/reporter/__init__.py +21 -0
- selfevals/reporter/_metrics.py +114 -0
- selfevals/reporter/compare.py +221 -0
- selfevals/reporter/json_report.py +105 -0
- selfevals/reporter/markdown.py +232 -0
- selfevals/runner/__init__.py +42 -0
- selfevals/runner/adapters.py +268 -0
- selfevals/runner/executor.py +234 -0
- selfevals/runner/otlp_receiver.py +343 -0
- selfevals/runner/otlp_to_recorder.py +180 -0
- selfevals/runner/sandbox.py +46 -0
- selfevals/schemas/__init__.py +213 -0
- selfevals/schemas/_base.py +82 -0
- selfevals/schemas/annotation.py +55 -0
- selfevals/schemas/dataset.py +111 -0
- selfevals/schemas/enums.py +324 -0
- selfevals/schemas/eval_case.py +189 -0
- selfevals/schemas/experiment.py +367 -0
- selfevals/schemas/failure_mode.py +76 -0
- selfevals/schemas/fleet.py +111 -0
- selfevals/schemas/grader_card.py +112 -0
- selfevals/schemas/iteration.py +219 -0
- selfevals/schemas/registry.py +125 -0
- selfevals/schemas/tool.py +43 -0
- selfevals/schemas/trace.py +384 -0
- selfevals/schemas/workspace.py +69 -0
- selfevals/sdk/__init__.py +24 -0
- selfevals/sdk/auto_instrument.py +165 -0
- selfevals/sdk/context.py +45 -0
- selfevals/sdk/exporter.py +50 -0
- selfevals/sdk/facade.py +203 -0
- selfevals/skills/__init__.py +61 -0
- selfevals/storage/__init__.py +53 -0
- selfevals/storage/errors.py +66 -0
- selfevals/storage/filesystem.py +137 -0
- selfevals/storage/interface.py +135 -0
- selfevals/storage/migrations/__init__.py +80 -0
- selfevals/storage/migrations/m0001_initial.py +57 -0
- selfevals/storage/seed.py +199 -0
- selfevals/storage/sqlite.py +232 -0
- selfevals/trace/__init__.py +31 -0
- selfevals/trace/otel_importer.py +455 -0
- selfevals/trace/payload_router.py +106 -0
- selfevals/trace/recorder.py +540 -0
- selfevals/version.py +1 -0
- selfevals-0.2.2.dist-info/METADATA +283 -0
- selfevals-0.2.2.dist-info/RECORD +96 -0
- selfevals-0.2.2.dist-info/WHEEL +4 -0
- selfevals-0.2.2.dist-info/entry_points.txt +2 -0
- selfevals-0.2.2.dist-info/licenses/LICENSE +17 -0
selfevals/api/app.py
ADDED
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
"""FastAPI app — read-mostly HTTP bridge over the SQLite store.
|
|
2
|
+
|
|
3
|
+
Mounted on `/` (no version prefix; this is a single internal service).
|
|
4
|
+
Endpoints map 1:1 to the pages of the web UI; payload shapes match
|
|
5
|
+
the existing Pydantic models so the web side can validate against the
|
|
6
|
+
same canonical JSON.
|
|
7
|
+
|
|
8
|
+
Auth: stubbed via a single `X-SelfEvals-User` header (default
|
|
9
|
+
`"local"`). Real auth lands later; everything else is forward-compat.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import asyncio
|
|
15
|
+
import os
|
|
16
|
+
from collections.abc import AsyncIterator
|
|
17
|
+
from contextlib import asynccontextmanager
|
|
18
|
+
from pathlib import Path
|
|
19
|
+
from typing import Annotated, Any
|
|
20
|
+
|
|
21
|
+
from fastapi import Depends, FastAPI, Header, HTTPException, Query
|
|
22
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
23
|
+
from fastapi.responses import StreamingResponse
|
|
24
|
+
|
|
25
|
+
from selfevals.api.broker import get_broker
|
|
26
|
+
from selfevals.api.queries import (
|
|
27
|
+
AnchorPoint,
|
|
28
|
+
anchor_set_history,
|
|
29
|
+
experiment_decisions,
|
|
30
|
+
experiment_detail,
|
|
31
|
+
experiment_iterations,
|
|
32
|
+
iteration_detail,
|
|
33
|
+
list_experiments,
|
|
34
|
+
list_workspaces,
|
|
35
|
+
load_thread,
|
|
36
|
+
load_trace,
|
|
37
|
+
workspace_detail,
|
|
38
|
+
)
|
|
39
|
+
from selfevals.api.schemas import (
|
|
40
|
+
CreateWorkspaceRequest,
|
|
41
|
+
ExperimentDetailResponse,
|
|
42
|
+
HealthResponse,
|
|
43
|
+
IterationListResponse,
|
|
44
|
+
ThreadResponse,
|
|
45
|
+
TraceResponse,
|
|
46
|
+
WorkspaceListResponse,
|
|
47
|
+
WorkspaceResponse,
|
|
48
|
+
)
|
|
49
|
+
from selfevals.api.sse import stream_trace
|
|
50
|
+
from selfevals.storage.sqlite import SQLiteStorage
|
|
51
|
+
|
|
52
|
+
DEFAULT_DB_PATH = "./selfevals.sqlite"
|
|
53
|
+
_USER_HEADER = "X-SelfEvals-User"
|
|
54
|
+
|
|
55
|
+
UserHeader = Annotated[
|
|
56
|
+
str | None,
|
|
57
|
+
Header(alias=_USER_HEADER, description="Stubbed user id (auth is post-MVP)."),
|
|
58
|
+
]
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def _resolve_db_path(db_path: str | None) -> str:
    """Pick the SQLite database path to use.

    Precedence: the explicit ``db_path`` argument, then the
    ``SELFEVALS_DB`` environment variable, then ``DEFAULT_DB_PATH``.

    Args:
        db_path: Caller-supplied path; ``None`` or ``""`` means "not given".

    Returns:
        The first non-empty candidate in the precedence order above.
    """
    if db_path:
        return db_path
    # `or` rather than a `.get()` default: an exported-but-empty
    # SELFEVALS_DB is treated as unset instead of yielding "" as the path.
    return os.environ.get("SELFEVALS_DB") or DEFAULT_DB_PATH
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def build_app(*, db_path: str | None = None) -> FastAPI:
    """Construct the FastAPI app, parameterized on the SQLite db path.

    The returned app serves every route under ``/api``, allows CORS from
    the local web origins on port 5173, and binds the process-wide span
    broker to the running event loop at startup.

    Storage handling: each handler opens its own ``SQLiteStorage`` inside
    a ``with closing(...)`` block, so the handle is created, used and
    closed within the handler call itself. A request that is rejected
    before its handler runs (for example an out-of-range ``limit``) never
    opens a handle at all.

    Args:
        db_path: Explicit database path; when falsy, ``_resolve_db_path``
            chooses the fallback.

    Returns:
        The configured ``FastAPI`` application.
    """
    # Function-scope import so this block does not depend on a change to
    # the module's import list.
    from contextlib import closing

    resolved = _resolve_db_path(db_path)
    # Make sure the directory that will hold the database file exists.
    Path(resolved).parent.mkdir(parents=True, exist_ok=True)

    @asynccontextmanager
    async def lifespan(_app: FastAPI) -> AsyncIterator[None]:
        # Capture the running event loop so the OTLP receiver thread
        # (which runs sync) can schedule span publishes onto it.
        get_broker().bind_loop(asyncio.get_running_loop())
        yield

    app = FastAPI(
        title="selfevals",
        description="HTTP bridge for the selfevals evals framework.",
        version="0.0.1",
        docs_url="/api/docs",
        redoc_url=None,
        openapi_url="/api/openapi.json",
        lifespan=lifespan,
    )

    app.add_middleware(
        CORSMiddleware,
        allow_origins=["http://localhost:5173", "http://127.0.0.1:5173"],
        allow_credentials=False,
        allow_methods=["GET", "POST", "OPTIONS"],
        allow_headers=["*"],
    )

    def _storage_factory() -> SQLiteStorage:
        """Open a fresh storage handle on the resolved database path."""
        return SQLiteStorage(resolved)

    # ----------------------------------------------------------- meta

    @app.get("/api/health", response_model=HealthResponse, tags=["meta"])
    def health() -> HealthResponse:
        return HealthResponse(status="ok", db_path=resolved)

    # ----------------------------------------------------- workspaces

    @app.get(
        "/api/workspaces",
        response_model=WorkspaceListResponse,
        tags=["workspaces"],
    )
    def workspaces_index(_user: UserHeader = None) -> WorkspaceListResponse:
        with closing(_storage_factory()) as storage:
            return WorkspaceListResponse(workspaces=list_workspaces(storage))

    @app.get(
        "/api/workspaces/{workspace_id}",
        response_model=WorkspaceResponse,
        tags=["workspaces"],
    )
    def workspaces_show(
        workspace_id: str,
        _user: UserHeader = None,
    ) -> WorkspaceResponse:
        with closing(_storage_factory()) as storage:
            ws = workspace_detail(storage, workspace_id=workspace_id)
            if ws is None:
                raise HTTPException(status_code=404, detail="workspace not found")
            return ws

    @app.post(
        "/api/workspaces",
        response_model=WorkspaceResponse,
        status_code=201,
        tags=["workspaces"],
    )
    def workspaces_create(
        body: CreateWorkspaceRequest,
        user: UserHeader = None,
    ) -> WorkspaceResponse:
        from selfevals.storage.seed import seed_workspace

        with closing(_storage_factory()) as storage:
            seeded = seed_workspace(
                storage,
                slug=body.slug,
                name=body.name or body.slug,
                # The stubbed auth header is optional; fall back to "local".
                user_id=user or "local",
                description=body.description,
            )
            ws = seeded.workspace
            return WorkspaceResponse(
                id=ws.id,
                slug=ws.slug,
                name=ws.name,
                description=ws.description,
                owner_id=ws.owner_id,
                created_at=ws.created_at,
                experiment_count=0,
                recent_health=None,
            )

    # ---------------------------------------------------- experiments

    @app.get(
        "/api/workspaces/{workspace_id}/experiments",
        response_model=list[dict[str, Any]],
        tags=["experiments"],
    )
    def experiments_index(
        workspace_id: str,
        limit: Annotated[int, Query(ge=1, le=500)] = 100,
        _user: UserHeader = None,
    ) -> list[dict[str, Any]]:
        with closing(_storage_factory()) as storage:
            return list_experiments(storage, workspace_id=workspace_id, limit=limit)

    @app.get(
        "/api/workspaces/{workspace_id}/experiments/{experiment_id}",
        response_model=ExperimentDetailResponse,
        tags=["experiments"],
    )
    def experiments_show(
        workspace_id: str,
        experiment_id: str,
        _user: UserHeader = None,
    ) -> ExperimentDetailResponse:
        with closing(_storage_factory()) as storage:
            detail = experiment_detail(
                storage,
                workspace_id=workspace_id,
                experiment_id=experiment_id,
            )
            if detail is None:
                raise HTTPException(
                    status_code=404,
                    detail=f"experiment {experiment_id} not found",
                )
            return detail

    @app.get(
        "/api/workspaces/{workspace_id}/experiments/{experiment_id}/iterations",
        response_model=IterationListResponse,
        tags=["experiments"],
    )
    def experiments_iterations(
        workspace_id: str,
        experiment_id: str,
        _user: UserHeader = None,
    ) -> IterationListResponse:
        with closing(_storage_factory()) as storage:
            return IterationListResponse(
                iterations=experiment_iterations(
                    storage,
                    workspace_id=workspace_id,
                    experiment_id=experiment_id,
                )
            )

    @app.get(
        "/api/workspaces/{workspace_id}/experiments/{experiment_id}/decisions",
        tags=["experiments"],
    )
    def experiments_decisions(
        workspace_id: str,
        experiment_id: str,
        _user: UserHeader = None,
    ) -> list[dict[str, Any]]:
        with closing(_storage_factory()) as storage:
            return experiment_decisions(
                storage,
                workspace_id=workspace_id,
                experiment_id=experiment_id,
            )

    @app.get(
        "/api/workspaces/{workspace_id}/iterations/{iteration_id}",
        tags=["experiments"],
    )
    def iterations_show(
        workspace_id: str,
        iteration_id: str,
        _user: UserHeader = None,
    ) -> dict[str, Any]:
        with closing(_storage_factory()) as storage:
            detail = iteration_detail(
                storage,
                workspace_id=workspace_id,
                iteration_id=iteration_id,
            )
            if detail is None:
                raise HTTPException(status_code=404, detail="iteration not found")
            return detail

    # --------------------------------------------------------- traces

    @app.get(
        "/api/workspaces/{workspace_id}/traces/{trace_id}",
        response_model=TraceResponse,
        tags=["traces"],
    )
    def traces_show(
        workspace_id: str,
        trace_id: str,
        _user: UserHeader = None,
    ) -> TraceResponse:
        with closing(_storage_factory()) as storage:
            trace = load_trace(storage, workspace_id=workspace_id, trace_id=trace_id)
            if trace is None:
                raise HTTPException(status_code=404, detail="trace not found")
            return trace

    @app.get(
        "/api/workspaces/{workspace_id}/threads/{thread_id}",
        response_model=ThreadResponse,
        tags=["traces"],
    )
    def threads_show(
        workspace_id: str,
        thread_id: str,
        _user: UserHeader = None,
    ) -> ThreadResponse:
        with closing(_storage_factory()) as storage:
            thread = load_thread(storage, workspace_id=workspace_id, thread_id=thread_id)
            if thread is None:
                raise HTTPException(status_code=404, detail="thread not found")
            return thread

    @app.get("/api/runs/active", tags=["traces"])
    def runs_active(_user: UserHeader = None) -> list[dict[str, str]]:
        return [{"workspace_id": ws, "run_id": run} for (ws, run) in get_broker().active_runs()]

    @app.get(
        "/api/workspaces/{workspace_id}/traces/{run_id}/stream",
        tags=["traces"],
        response_class=StreamingResponse,
    )
    async def traces_stream(
        workspace_id: str,
        run_id: str,
        _user: UserHeader = None,
    ) -> StreamingResponse:
        # The stream helper receives the factory, not a handle, so it
        # decides when storage is opened.
        return await stream_trace(
            workspace_id=workspace_id,
            run_id=run_id,
            broker=get_broker(),
            storage_factory=_storage_factory,
        )

    # ----------------------------------------------------- anchor set

    @app.get(
        "/api/workspaces/{workspace_id}/anchor-set",
        response_model=list[AnchorPoint],
        tags=["anchor-set"],
    )
    def anchor_set(
        workspace_id: str,
        _user: UserHeader = None,
    ) -> list[AnchorPoint]:
        with closing(_storage_factory()) as storage:
            return anchor_set_history(storage, workspace_id=workspace_id)

    return app
|
selfevals/api/broker.py
ADDED
|
@@ -0,0 +1,210 @@
|
|
|
1
|
+
"""In-process pub/sub for live trace streaming.
|
|
2
|
+
|
|
3
|
+
The OTLP receiver thread calls `publish_threadsafe()` when a span lands.
|
|
4
|
+
FastAPI SSE handlers call `subscribe()` to get an async generator of
|
|
5
|
+
events for a given `(workspace_id, run_id)` pair.
|
|
6
|
+
|
|
7
|
+
Why this lives in `selfevals.api`: the broker is a *transport* concern,
|
|
8
|
+
not a capture concern. The receiver doesn't know what it's for; it
|
|
9
|
+
just calls a callback. Coupling the broker to the API package keeps
|
|
10
|
+
the capture pipeline import-graph clean (the SDK / OTLP receiver
|
|
11
|
+
don't need to know FastAPI exists).
|
|
12
|
+
|
|
13
|
+
Scaling note: this is a single-process in-memory broker. The contract
|
|
14
|
+
(`publish` + `subscribe`) is intentionally narrow so a Redis-backed
|
|
15
|
+
implementation can drop in later without touching callers.
|
|
16
|
+
"""
|
|
17
|
+
|
|
18
|
+
from __future__ import annotations
|
|
19
|
+
|
|
20
|
+
import asyncio
|
|
21
|
+
import logging
|
|
22
|
+
from collections.abc import AsyncIterator
|
|
23
|
+
from contextlib import suppress
|
|
24
|
+
from dataclasses import dataclass, field
|
|
25
|
+
from typing import Any
|
|
26
|
+
|
|
27
|
+
logger = logging.getLogger(__name__)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
# Sentinel objects on the queue — clearer than magic dicts.
|
|
31
|
+
@dataclass(frozen=True)
class _Closed:
    """End-of-stream marker placed on a subscriber queue.

    Immutable, so a single instance can be handed to several queues.
    """

    # State the stream ended in; "completed" unless the producer says otherwise.
    final_state: str = "completed"
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
_QUEUE_MAXSIZE = 256
|
|
37
|
+
"""Per-subscriber queue depth. If we exceed it, the slowest subscriber
|
|
38
|
+
gets disconnected — it's the wrong behaviour to backpressure the
|
|
39
|
+
receiver thread for one stuck browser tab."""
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclass
class _Subscriber:
    """One consumer attached to a single (workspace_id, run_id) stream."""

    # Events waiting for this consumer; a `_Closed` item ends the stream.
    queue: asyncio.Queue[dict[str, Any] | _Closed]
    # The stream this consumer is attached to.
    workspace_id: str
    run_id: str
    # Set once the consumer is finished or has been dropped.
    closed: bool = False
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
@dataclass
class _Channel:
    """Fan-out state shared by every subscriber of one (workspace_id, run_id)."""

    # Consumers currently attached; each instance gets its own list.
    subscribers: list[_Subscriber] = field(default_factory=list)
    # True once the run has been closed.
    closed: bool = False
    # State recorded at close time; None while the channel is still open.
    final_state: str | None = None
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
class SpanBroker:
    """In-proc fan-out from the OTLP receiver to SSE subscribers.

    Producers call the ``*_threadsafe`` methods, which hop onto the bound
    event loop; the ``_*_sync`` callbacks then run on that loop and are
    the only code that touches subscriber queues. Delivery is
    best-effort: when no loop is bound, or the loop is already closed,
    calls are dropped silently.
    """

    def __init__(self) -> None:
        self._channels: dict[tuple[str, str], _Channel] = {}
        self._lock = asyncio.Lock()
        self._loop: asyncio.AbstractEventLoop | None = None

    def bind_loop(self, loop: asyncio.AbstractEventLoop) -> None:
        """Capture the FastAPI event loop so the receiver thread can
        schedule publishes onto it. Call once at app startup."""
        self._loop = loop

    def active_runs(self) -> list[tuple[str, str]]:
        """Snapshot of (workspace_id, run_id) channels that are open.

        Used by `GET /api/runs/active` so the web shell can show a
        live pill for in-flight runs. Includes runs whose channel was
        opened by `mark_run_active` even before any spans arrived."""
        # Copy before filtering: the caller may not be on the event-loop
        # thread (e.g. a synchronous HTTP handler), and the loop may add
        # or remove channels while the result is being built.
        snapshot = list(self._channels.items())
        return [key for key, channel in snapshot if not channel.closed]

    def _call_on_loop(self, callback: Any, *args: Any) -> None:
        """Schedule ``callback(*args)`` on the bound loop, best-effort."""
        loop = self._loop
        if loop is None:
            return
        # Loop may be closed during process shutdown — best-effort.
        with suppress(RuntimeError):
            loop.call_soon_threadsafe(callback, *args)

    @staticmethod
    def _put_evicting_oldest(
        queue: asyncio.Queue[dict[str, Any] | _Closed], event: dict[str, Any] | _Closed
    ) -> None:
        """Enqueue ``event`` without blocking, discarding the oldest
        queued item when the queue is full.

        Used for close sentinels only: they must always be delivered,
        otherwise the consumer waits forever on an empty queue.
        """
        while True:
            try:
                queue.put_nowait(event)
                return
            except asyncio.QueueFull:
                with suppress(asyncio.QueueEmpty):
                    queue.get_nowait()

    def mark_run_active_threadsafe(self, workspace_id: str, run_id: str) -> None:
        """Open the channel for a run before any spans flow. Lets the
        web's "active runs" pill light up the moment a run starts."""
        self._call_on_loop(self._mark_active_sync, workspace_id, run_id)

    def _mark_active_sync(self, workspace_id: str, run_id: str) -> None:
        """Loop-side half of `mark_run_active_threadsafe`; keeps an
        existing channel (open or closed) untouched."""
        key = (workspace_id, run_id)
        self._channels.setdefault(key, _Channel())

    async def subscribe(
        self, workspace_id: str, run_id: str
    ) -> AsyncIterator[dict[str, Any] | _Closed]:
        """Async-iterate events for one run. Caller is responsible for
        cancelling the iteration when the client disconnects.

        The last item yielded is always a `_Closed`; on a channel that
        is already closed it is the only item.

        Note: this does NOT replay history. The SSE handler emits a
        snapshot of the current Trace state *before* calling
        subscribe(), so the subscriber only needs new spans from here.
        """
        key = (workspace_id, run_id)
        sub = _Subscriber(
            queue=asyncio.Queue(maxsize=_QUEUE_MAXSIZE),
            workspace_id=workspace_id,
            run_id=run_id,
        )
        async with self._lock:
            channel = self._channels.setdefault(key, _Channel())
            already_closed = channel.closed
            final_state = channel.final_state
            # Register only on a live channel: the early-return path
            # below has no cleanup step, so a subscriber added to a
            # closed channel would stay in its list indefinitely.
            if not already_closed:
                channel.subscribers.append(sub)
        if already_closed:
            yield _Closed(final_state=final_state or "completed")
            return
        try:
            while True:
                event = await sub.queue.get()
                yield event
                if isinstance(event, _Closed):
                    return
        finally:
            sub.closed = True
            async with self._lock:
                ch = self._channels.get(key)
                if ch is not None:
                    ch.subscribers = [s for s in ch.subscribers if not s.closed]
                    if not ch.subscribers and ch.closed:
                        self._channels.pop(key, None)

    def publish_threadsafe(
        self, workspace_id: str, run_id: str, span_payload: dict[str, Any]
    ) -> None:
        """Called from the OTLP receiver's background thread.

        Hops onto the FastAPI event loop via call_soon_threadsafe.
        If no loop is bound, drops silently — the broker is best-effort,
        not the source of truth (SQLite is)."""
        self._call_on_loop(self._publish_sync, workspace_id, run_id, span_payload)

    def close_run_threadsafe(
        self, workspace_id: str, run_id: str, final_state: str = "completed"
    ) -> None:
        """Thread-safe request to close a run's channel with `final_state`."""
        self._call_on_loop(self._close_sync, workspace_id, run_id, final_state)

    def _publish_sync(self, workspace_id: str, run_id: str, span_payload: dict[str, Any]) -> None:
        """Deliver one span to every live subscriber of the run.

        A subscriber whose queue is full is disconnected: it is marked
        closed, handed a `_Closed(final_state="disconnected")` sentinel,
        and removed from the channel so later spans skip it.
        """
        key = (workspace_id, run_id)
        channel = self._channels.get(key)
        if channel is None or channel.closed:
            # No live subscribers and channel hasn't been opened — drop.
            # A late subscriber will start from the SQLite snapshot.
            return
        dropped_any = False
        for sub in list(channel.subscribers):
            if sub.closed:
                # Already finished or dropped; its own cleanup is pending.
                continue
            try:
                sub.queue.put_nowait(span_payload)
            except asyncio.QueueFull:
                logger.warning(
                    "SpanBroker: dropping slow subscriber ws=%s run=%s",
                    workspace_id,
                    run_id,
                )
                sub.closed = True
                # The queue is full by definition here, so a plain
                # put_nowait of the sentinel could never succeed.
                self._put_evicting_oldest(sub.queue, _Closed(final_state="disconnected"))
                dropped_any = True
        if dropped_any:
            channel.subscribers = [s for s in channel.subscribers if not s.closed]

    def _close_sync(self, workspace_id: str, run_id: str, final_state: str) -> None:
        """Mark the run's channel closed and notify every subscriber."""
        key = (workspace_id, run_id)
        channel = self._channels.get(key)
        if channel is None:
            # Subscribers may attach later; record the closed state so
            # subscribe() can emit _Closed immediately.
            self._channels[key] = _Channel(closed=True, final_state=final_state)
            return
        channel.closed = True
        channel.final_state = final_state
        close_event = _Closed(final_state=final_state)
        for sub in channel.subscribers:
            # Must not be lost on a full queue: no further event will
            # ever arrive to wake the subscriber after this one.
            self._put_evicting_oldest(sub.queue, close_event)
|
|
193
|
+
|
|
194
|
+
|
|
195
|
+
# Module-level singleton, created lazily by get_broker(); the app's lifespan hook binds its event loop.
|
|
196
|
+
_broker: SpanBroker | None = None
|
|
197
|
+
|
|
198
|
+
|
|
199
|
+
def get_broker() -> SpanBroker:
    """Return the shared `SpanBroker`, creating it on first use.

    Every later call returns the same instance until the module-level
    reference is reset.
    """
    global _broker
    instance = _broker
    if instance is None:
        instance = SpanBroker()
        _broker = instance
    return instance
|
|
205
|
+
|
|
206
|
+
|
|
207
|
+
def reset_for_tests() -> None:
    """Forget the process-wide broker.

    Intended for test fixtures: clearing the module-level reference
    keeps broker state from leaking between tests.
    """
    global _broker
    _broker = None
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
"""Adapter that lets the OTLP receiver feed the SSE broker.
|
|
2
|
+
|
|
3
|
+
Lives in `selfevals.api` so the `runner/` package stays unaware that
|
|
4
|
+
SSE / FastAPI exist. Only `selfevals serve` imports this; CLI-only
|
|
5
|
+
runs never load it.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from selfevals.api.broker import SpanBroker
|
|
13
|
+
from selfevals.runner.otlp_receiver import SpanPublisher
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class BrokerPublisher(SpanPublisher):
    """`SpanPublisher` that relays every call to a `SpanBroker`.

    Each method maps one-to-one onto the broker's matching
    ``*_threadsafe`` entry point and adds no logic of its own.
    """

    def __init__(self, broker: SpanBroker) -> None:
        # The broker all calls are forwarded to.
        self._broker = broker

    def mark_active(self, workspace_id: str, run_id: str) -> None:
        """Announce that a run has started."""
        target = self._broker
        target.mark_run_active_threadsafe(workspace_id, run_id)

    def publish(self, workspace_id: str, run_id: str, span_payload: dict[str, Any]) -> None:
        """Forward one span payload for the given run."""
        target = self._broker
        target.publish_threadsafe(workspace_id, run_id, span_payload)

    def close(self, workspace_id: str, run_id: str, final_state: str = "completed") -> None:
        """Signal that the run ended in `final_state`."""
        target = self._broker
        target.close_run_threadsafe(workspace_id, run_id, final_state)
|