evalgrid 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- evalgrid/__init__.py +3 -0
- evalgrid/__main__.py +4 -0
- evalgrid/api/__init__.py +0 -0
- evalgrid/api/app.py +442 -0
- evalgrid/api/database.py +78 -0
- evalgrid/cli/__init__.py +3 -0
- evalgrid/cli/commands/__init__.py +0 -0
- evalgrid/cli/commands/init.py +139 -0
- evalgrid/cli/commands/run.py +262 -0
- evalgrid/cli/commands/scenario.py +153 -0
- evalgrid/cli/commands/server.py +39 -0
- evalgrid/cli/main.py +20 -0
- evalgrid/core/__init__.py +14 -0
- evalgrid/core/loader.py +106 -0
- evalgrid/core/models.py +99 -0
- evalgrid/core/paths.py +27 -0
- evalgrid/core/runner.py +125 -0
- evalgrid/core/scorer.py +216 -0
- evalgrid/static/assets/index-9gTV8edJ.css +1 -0
- evalgrid/static/assets/index-DiafW5oN.js +205 -0
- evalgrid/static/index.html +13 -0
- evalgrid-0.1.0.dist-info/METADATA +263 -0
- evalgrid-0.1.0.dist-info/RECORD +26 -0
- evalgrid-0.1.0.dist-info/WHEEL +4 -0
- evalgrid-0.1.0.dist-info/entry_points.txt +2 -0
- evalgrid-0.1.0.dist-info/licenses/LICENSE +21 -0
evalgrid/__init__.py
ADDED
evalgrid/__main__.py
ADDED
evalgrid/api/__init__.py
ADDED
|
File without changes
|
evalgrid/api/app.py
ADDED
|
@@ -0,0 +1,442 @@
|
|
|
1
|
+
"""FastAPI application — eval runner + results API."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import logging
|
|
7
|
+
import os
|
|
8
|
+
import re
|
|
9
|
+
import uuid
|
|
10
|
+
from contextlib import asynccontextmanager
|
|
11
|
+
from datetime import datetime, timezone
|
|
12
|
+
from pathlib import Path
|
|
13
|
+
from typing import Any
|
|
14
|
+
|
|
15
|
+
import yaml
|
|
16
|
+
from fastapi import BackgroundTasks, Depends, FastAPI, HTTPException, Request
|
|
17
|
+
from fastapi.middleware.cors import CORSMiddleware
|
|
18
|
+
from fastapi.responses import JSONResponse
|
|
19
|
+
from fastapi.staticfiles import StaticFiles
|
|
20
|
+
from pydantic import BaseModel, Field
|
|
21
|
+
from sqlalchemy import select
|
|
22
|
+
from sqlalchemy.ext.asyncio import AsyncSession
|
|
23
|
+
|
|
24
|
+
from evalgrid.core import EvalConfig, ScenarioLoader
|
|
25
|
+
from evalgrid.core.models import PassFail
|
|
26
|
+
from evalgrid.core.runner import ScenarioRunner
|
|
27
|
+
|
|
28
|
+
from .database import RunRecord, ScenarioResultRecord, get_db, init_db
|
|
29
|
+
|
|
30
|
+
logger = logging.getLogger("evalgrid")
|
|
31
|
+
|
|
32
|
+
SCENARIOS_BASE_DIR = Path.home() / ".evalgrid" / "scenarios"
|
|
33
|
+
|
|
34
|
+
_API_KEY_RE = re.compile(
|
|
35
|
+
r"(?:sk-ant-|sk-|pypi-)[A-Za-z0-9\-_]{6,}"
|
|
36
|
+
r"|AIza[A-Za-z0-9\-_]{30,}"
|
|
37
|
+
)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def _sanitize_error(msg: str) -> str:
|
|
41
|
+
return _API_KEY_RE.sub("[REDACTED]", msg)
|
|
42
|
+
|
|
43
|
+
|
|
44
|
+
def _iso(dt: datetime | None) -> str | None:
|
|
45
|
+
"""Return ISO-8601 string with explicit UTC offset so browsers parse it correctly."""
|
|
46
|
+
if dt is None:
|
|
47
|
+
return None
|
|
48
|
+
if dt.tzinfo is None:
|
|
49
|
+
dt = dt.replace(tzinfo=timezone.utc)
|
|
50
|
+
return dt.isoformat()
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
@asynccontextmanager
async def lifespan(app: FastAPI):  # noqa: ARG001
    """Startup hook: initialise the database, then close out runs left unfinished.

    Any ``RunRecord`` without a ``finished_at`` is stamped as finished now, given
    a zero pass rate and a "Server restarted" error payload. The cleanup is
    best-effort: a failure is logged and startup continues.
    """
    await init_db()

    # Imported here rather than at module top, as in the other helpers of this file.
    from .database import SessionLocal

    try:
        async with SessionLocal() as session:
            unfinished = await session.execute(
                select(RunRecord).where(RunRecord.finished_at.is_(None))
            )
            orphaned = unfinished.scalars().all()
            for run in orphaned:
                run.finished_at = datetime.now(timezone.utc)
                # Keep whatever counts were already stored; only fill in missing ones.
                run.total = run.total or 0
                run.passed = run.passed or 0
                run.pass_rate = 0.0
                run.results = [{"error": "Server restarted"}]
                logger.warning("Marked stale run %s as failed (server restart)", run.run_id)
            # Skip the commit entirely when nothing was touched.
            if orphaned:
                await session.commit()
    except Exception as exc:
        logger.warning("Stale run cleanup skipped: %s", exc)

    yield
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
app = FastAPI(title="evalgrid API", version="0.1.0", lifespan=lifespan)


@app.exception_handler(Exception)
async def global_exception_handler(request: Request, exc: Exception) -> JSONResponse:
    """Turn any otherwise-unhandled exception into a sanitized HTTP 500 response.

    The exception text is passed through ``_sanitize_error`` before it is used
    anywhere, so credential-looking tokens are redacted in the log line as well
    as in the response body.

    Args:
        request: The request whose handling raised; only its path is logged.
        exc: The exception that escaped the route handler.

    Returns:
        A 500 ``JSONResponse`` whose ``detail`` carries the sanitized message.
    """
    sanitized = _sanitize_error(str(exc))
    logger.error("Unhandled exception on %s: %s", request.url.path, sanitized)
    return JSONResponse(status_code=500, content={"detail": f"Internal server error: {sanitized}"})
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
# CORS: allow any method and header, but only from ports 5173 and 8000 on
# localhost / 127.0.0.1 (origin order: localhost first, then 127.0.0.1).
app.add_middleware(
    CORSMiddleware,
    allow_origins=[
        f"http://{host}:{port}"
        for host in ("localhost", "127.0.0.1")
        for port in (5173, 8000)
    ],
    allow_methods=["*"],
    allow_headers=["*"],
)
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
# ── Schema ────────────────────────────────────────────────────────────────────
|
|
104
|
+
|
|
105
|
+
# Request body for POST /api/runs.
class RunRequest(BaseModel):
    # Sub-directory, joined onto SCENARIOS_BASE_DIR, to load scenarios from.
    scenarios_dir: str = ""
    # When non-empty, keep only scenarios carrying at least one of these tags.
    tags: list[str] = []
    # When non-empty, keep only scenarios whose id is listed here.
    scenario_ids: list[str] = []
    # Run with the mock scorer and a canned agent output; no provider key needed.
    mock: bool = False
    # Optional override for the judge model of the default EvalConfig.
    judge_model: str | None = None
|
|
111
|
+
|
|
112
|
+
|
|
113
|
+
# Response shape shared by POST /api/runs and GET /api/runs.
class RunSummary(BaseModel):
    run_id: str
    # Timestamps are ISO-8601 strings; finished_at stays None while running.
    started_at: str
    finished_at: str | None = None
    # Aggregate outcome: passed / total scenarios and their ratio.
    pass_rate: float
    passed: int
    total: int
    # "running" or "completed" as set by the route handlers.
    status: str
    scenario_names: list[str] = []
    # Scenarios the loader skipped, with their count.
    skipped_count: int = 0
    skipped_scenarios: list[dict[str, str]] = []
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
# Request body for POST /api/scenarios; length limits are enforced by pydantic.
class ScenarioCreate(BaseModel):
    # Display name; the scenario id (and file name) is derived from it.
    name: str = Field(..., max_length=200)
    description: str = Field(default="", max_length=1000)
    agent_type: str = "generic"
    prompt: str = Field(..., max_length=16000)
    test_input: str = Field(..., max_length=8000)
    # When empty, the route substitutes a single required "complete_task" action.
    expected_actions: list[dict[str, Any]] = []
    tags: list[str] = []
    # Sub-directory, joined onto SCENARIOS_BASE_DIR, to write the file into.
    scenarios_dir: str = ""
|
|
135
|
+
|
|
136
|
+
|
|
137
|
+
def _validate_path(scenarios_dir: str) -> Path:
    """Resolve *scenarios_dir* under SCENARIOS_BASE_DIR, rejecting anything that escapes it.

    Raises:
        HTTPException: 400 when the input contains ".." or resolves to a
            location that is not inside the base directory.
    """
    root = SCENARIOS_BASE_DIR.resolve()
    # Any ".." is refused outright, before resolution; the containment check
    # below then covers whatever else could leave the base directory.
    candidate = None if ".." in scenarios_dir else (root / scenarios_dir).resolve()
    if candidate is None or not candidate.is_relative_to(root):
        raise HTTPException(status_code=400, detail="Path outside allowed scenarios directory")
    return candidate
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
# ── Routes ────────────────────────────────────────────────────────────────────
|
|
150
|
+
|
|
151
|
+
@app.get("/health")
async def health() -> dict[str, str]:
    # Constant payload; touches neither the database nor the filesystem.
    return dict(status="ok")
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
@app.get("/api/config-status")
async def config_status() -> dict[str, Any]:
    # Reports only whether each provider key is present and non-empty in the
    # environment; the key values themselves are never returned.
    has_anthropic, has_openai = (
        bool(os.environ.get(var)) for var in ("ANTHROPIC_API_KEY", "OPENAI_API_KEY")
    )
    return {
        "anthropic_key_set": has_anthropic,
        "openai_key_set": has_openai,
        "any_key_set": has_anthropic or has_openai,
        "mock_available": True,
    }
|
|
166
|
+
|
|
167
|
+
|
|
168
|
+
@app.post("/api/runs", response_model=RunSummary)
async def create_run(
    req: RunRequest,
    background: BackgroundTasks,
    db: AsyncSession = Depends(get_db),
) -> RunSummary:
    # Start a run: validate the request, persist an empty RunRecord, schedule
    # the evaluation as a background task and answer at once with "running".

    # Called for its side effect only: raises 400 on a path outside the base dir.
    _validate_path(req.scenarios_dir)

    # Real (non-mock) runs need at least one provider key in the environment.
    if not (
        req.mock
        or os.environ.get("ANTHROPIC_API_KEY")
        or os.environ.get("OPENAI_API_KEY")
    ):
        raise HTTPException(status_code=400, detail="No provider API key found in environment (set ANTHROPIC_API_KEY or OPENAI_API_KEY)")

    run_id = str(uuid.uuid4())
    record = RunRecord(run_id=run_id, started_at=datetime.now(timezone.utc), results=[])
    db.add(record)
    await db.commit()

    background.add_task(_execute_run, run_id, req)

    summary = RunSummary(
        run_id=run_id,
        started_at=_iso(record.started_at),
        pass_rate=0.0,
        passed=0,
        total=0,
        status="running",
    )
    return summary
|
|
198
|
+
|
|
199
|
+
|
|
200
|
+
@app.get("/api/runs", response_model=list[RunSummary])
async def list_runs(db: AsyncSession = Depends(get_db)) -> list[RunSummary]:
    # Newest-first summaries of at most 100 runs, each with the names of the
    # scenarios recorded for it (fetched with one extra query, not one per run).
    run_query = select(RunRecord).order_by(RunRecord.started_at.desc()).limit(100)
    recent = (await db.execute(run_query)).scalars().all()

    name_query = select(
        ScenarioResultRecord.run_id, ScenarioResultRecord.scenario_name
    ).where(ScenarioResultRecord.run_id.in_([run.run_id for run in recent]))
    names_by_run: dict[str, list[str]] = {}
    for owner_id, name in await db.execute(name_query):
        names_by_run.setdefault(owner_id, []).append(name)

    summaries: list[RunSummary] = []
    for run in recent:
        skipped = run.skipped_scenarios or []
        summaries.append(
            RunSummary(
                run_id=run.run_id,
                started_at=_iso(run.started_at),
                finished_at=_iso(run.finished_at),
                # Null columns are reported as zero.
                pass_rate=run.pass_rate or 0.0,
                passed=run.passed or 0,
                total=run.total or 0,
                status="completed" if run.finished_at else "running",
                scenario_names=names_by_run.get(run.run_id, []),
                skipped_count=len(skipped),
                skipped_scenarios=skipped,
            )
        )
    return summaries
|
|
229
|
+
|
|
230
|
+
|
|
231
|
+
@app.get("/api/runs/{run_id}")
async def get_run(run_id: str, db: AsyncSession = Depends(get_db)) -> dict[str, Any]:
    # Full detail for one run: its aggregate columns plus one entry per stored
    # scenario result. Responds 404 when no run has this id.
    record = (
        await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
    ).scalar_one_or_none()
    if not record:
        raise HTTPException(status_code=404, detail="Run not found")

    stored = (
        await db.execute(
            select(ScenarioResultRecord).where(ScenarioResultRecord.run_id == run_id)
        )
    ).scalars().all()

    # Columns copied verbatim, in this order, into each scenario entry.
    exported = (
        "scenario_id",
        "scenario_name",
        "agent_type",
        "status",
        "overall_score",
        "pass_rate",
        "reasoning",
        "error",
        "steps",
    )
    scenario_entries = []
    for item in stored:
        scenario_entries.append({column: getattr(item, column) for column in exported})

    return {
        "run_id": record.run_id,
        "started_at": _iso(record.started_at),
        "finished_at": _iso(record.finished_at),
        "pass_rate": record.pass_rate,
        "passed": record.passed,
        "total": record.total,
        "status": "completed" if record.finished_at else "running",
        "scenario_results": scenario_entries,
    }
|
|
266
|
+
|
|
267
|
+
|
|
268
|
+
@app.post("/api/scenarios")
async def create_scenario(
    req: ScenarioCreate,
) -> dict[str, Any]:
    # Write a new scenario YAML file under the validated scenarios directory
    # and return a short summary of it.
    #
    # Responses:
    #   400 - the name yields an empty id, or scenarios_dir escapes the base dir
    #   409 - a scenario file with the derived id already exists

    # The id doubles as the file name: lower-case, runs of anything outside
    # [a-z0-9] collapsed to a single "-", and no leading/trailing "-".
    scenario_id = re.sub(r"[^a-z0-9]+", "-", req.name.lower()).strip("-")
    if not scenario_id:
        # A name made only of punctuation or non-ASCII text would otherwise be
        # written to a file literally called ".yml" with an empty id.
        raise HTTPException(
            status_code=400,
            detail="Scenario name must contain at least one ASCII letter or digit",
        )

    # Fall back to a single generic required action when none were supplied.
    actions = req.expected_actions or [
        {"action": "complete_task", "required": True, "description": "Agent completes the task correctly"}
    ]

    scenario_data = {
        "id": scenario_id,
        "name": req.name,
        "description": req.description,
        "agent_type": req.agent_type,
        "prompt": req.prompt,
        "test_input": req.test_input,
        "expected_actions": actions,
        "tags": req.tags,
    }

    scen_dir = _validate_path(req.scenarios_dir)
    scen_dir.mkdir(parents=True, exist_ok=True)

    path = scen_dir / f"{scenario_id}.yml"
    try:
        # "x" creates the file exclusively, so an existing scenario can never be
        # overwritten even when two requests race. UTF-8 is forced because
        # allow_unicode=True emits non-ASCII text verbatim.
        with open(path, "x", encoding="utf-8") as f:
            yaml.dump(scenario_data, f, default_flow_style=False, allow_unicode=True, sort_keys=False)
    except FileExistsError:
        raise HTTPException(
            status_code=409, detail=f"Scenario with id '{scenario_id}' already exists"
        ) from None

    return {
        "id": scenario_id,
        "name": req.name,
        "description": req.description,
        "agent_type": req.agent_type,
        "tags": req.tags,
        "step_count": len(actions),
    }
|
|
306
|
+
|
|
307
|
+
|
|
308
|
+
@app.get("/api/scenarios")
async def list_scenarios(scenarios_dir: str = "") -> list[dict[str, Any]]:
    # Summarise every loadable scenario file below the validated directory.
    # A missing directory gives an empty list; files that fail to load are
    # logged and left out rather than failing the whole request.
    root = _validate_path(scenarios_dir)
    if not root.exists():
        return []

    loader = ScenarioLoader()
    listing: list[dict[str, Any]] = []
    # All *.yml files (sorted) come before all *.yaml files (sorted).
    candidates = [*sorted(root.rglob("*.yml")), *sorted(root.rglob("*.yaml"))]
    for file in candidates:
        # Files whose name starts with "_" are not listed.
        if file.name.startswith("_"):
            continue
        try:
            scenario = loader.load_file(file)
            parts = file.relative_to(root).parts
            entry = {
                "id": scenario.id,
                "name": scenario.name,
                "description": scenario.description,
                "agent_type": scenario.agent_type.value,
                "tags": scenario.tags,
                "step_count": len(scenario.expected_actions),
                # The first sub-directory names the suite; top-level files are "default".
                "suite": parts[0] if len(parts) > 1 else "default",
            }
        except Exception as e:
            logger.warning("Failed to load scenario %s: %s", file, e)
        else:
            listing.append(entry)
    return listing
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
# ── Background task ────────────────────────────────────────────────────────────
|
|
337
|
+
|
|
338
|
+
async def _mark_run_failed(run_id: str, error_message: str) -> None:
    """Close out run *run_id* as failed, storing *error_message* as its only result.

    The run is stamped as finished now with zeroed counters. Nothing happens
    when no run with that id exists.
    """
    from .database import SessionLocal

    async with SessionLocal() as session:
        lookup = await session.execute(select(RunRecord).where(RunRecord.run_id == run_id))
        run = lookup.scalar_one_or_none()
        if run is None:
            return
        run.finished_at = datetime.now(timezone.utc)
        run.total = 0
        run.passed = 0
        run.pass_rate = 0.0
        run.results = [{"error": error_message}]
        await session.commit()
|
|
350
|
+
|
|
351
|
+
|
|
352
|
+
async def _execute_run(run_id: str, req: RunRequest) -> None:
    """Background task: load, filter and run scenarios, persisting results as they finish.

    Each scenario runs in the default executor so the event loop stays
    responsive, and its result row is committed right away. When all scenarios
    are done, the owning ``RunRecord`` receives its totals and ``finished_at``.
    The whole run is capped at 1800 seconds; a timeout or any other failure
    marks the run as failed via ``_mark_run_failed``.

    Args:
        run_id: Identifier of the ``RunRecord`` created by the route handler.
        req: The original request (directory, filters, mock flag, judge model).
    """
    from .database import SessionLocal

    config = EvalConfig()
    if req.judge_model:
        config.judge.model = req.judge_model
    loader = ScenarioLoader()
    scen_dir = (SCENARIOS_BASE_DIR.resolve() / req.scenarios_dir).resolve()

    if not scen_dir.exists():
        await _mark_run_failed(run_id, f"Scenarios directory not found: {scen_dir}")
        return

    async def _run_with_timeout() -> None:
        scenarios = loader.load_dir(scen_dir)

        # Record what the loader skipped before running anything.
        if loader.skipped:
            async with SessionLocal() as db:
                row = await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
                record = row.scalar_one_or_none()
                if record:
                    record.skipped_scenarios = loader.skipped
                    await db.commit()

        # Tag filter keeps scenarios sharing at least one tag; the id filter is
        # then applied on top of it.
        if req.tags:
            scenarios = [s for s in scenarios if any(t in s.tags for t in req.tags)]
        if req.scenario_ids:
            scenarios = [s for s in scenarios if s.id in req.scenario_ids]

        if req.mock:
            from evalgrid.core.scorer import MockScorer
            runner = ScenarioRunner(
                config,
                scorer=MockScorer(),
                agent_fn=lambda s: ("[mock agent output]", 0),
            )
        else:
            runner = ScenarioRunner(config)

        loop = asyncio.get_running_loop()
        results = []
        for scenario in scenarios:
            # run_scenario is synchronous; off-load it to the default executor.
            result = await loop.run_in_executor(None, runner.run_scenario, scenario)
            results.append(result)

            # Persist each result as soon as it is available.
            async with SessionLocal() as db:
                sr = ScenarioResultRecord(
                    run_id=run_id,
                    scenario_id=result.scenario_id,
                    scenario_name=result.scenario_name,
                    agent_type=result.agent_type.value,
                    status=result.status.value,
                    overall_score=result.overall_score,
                    pass_rate=result.pass_rate,
                    reasoning=result.reasoning,
                    error=result.error,
                    steps=[s.model_dump(mode="json") for s in result.steps],
                    started_at=result.started_at,
                    finished_at=result.finished_at,
                )
                db.add(sr)
                await db.commit()

        passed = sum(1 for r in results if r.status == PassFail.PASS)
        total = len(results)

        async with SessionLocal() as db:
            row = await db.execute(select(RunRecord).where(RunRecord.run_id == run_id))
            record = row.scalar_one_or_none()
            if record:
                record.finished_at = datetime.now(timezone.utc)
                record.passed = passed
                record.total = total
                record.pass_rate = passed / total if total else 0.0
                await db.commit()

    try:
        await asyncio.wait_for(_run_with_timeout(), timeout=1800)
    except asyncio.TimeoutError:
        # asyncio.TimeoutError only became an alias of the builtin TimeoutError
        # in Python 3.11; naming it explicitly keeps this branch reachable on
        # older interpreters too.
        logger.error("Run %s timed out after 1800s", run_id)
        await _mark_run_failed(run_id, "Run timed out (30 minute limit exceeded)")
    except Exception as exc:
        # Exceptions without a message would otherwise be stored as "".
        sanitized = _sanitize_error(str(exc)) or type(exc).__name__
        logger.error("Run %s failed: %s", run_id, sanitized)
        await _mark_run_failed(run_id, sanitized)
|
|
435
|
+
|
|
436
|
+
|
|
437
|
+
# Serve React dashboard from the first location that exists:
# evalgrid/static/ (wheel install), otherwise dashboard/dist/ (dev checkout).
# When neither is present the API is served without a UI.
_dashboard = Path(__file__).parent.parent / "static"
_dashboard = (
    _dashboard
    if _dashboard.exists()
    else Path(__file__).parent.parent.parent / "dashboard" / "dist"
)
if _dashboard.exists():
    app.mount("/", StaticFiles(directory=_dashboard, html=True), name="static")
|
evalgrid/api/database.py
ADDED
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
"""Database setup — SQLite (dev) / Postgres (prod)."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
logger = logging.getLogger("evalgrid")
|
|
9
|
+
|
|
10
|
+
from sqlalchemy import JSON, Column, DateTime, Float, ForeignKey, Integer, String, Text, func, text
|
|
11
|
+
from sqlalchemy.ext.asyncio import AsyncSession, async_sessionmaker, create_async_engine
|
|
12
|
+
from sqlalchemy.orm import DeclarativeBase
|
|
13
|
+
|
|
14
|
+
from evalgrid.core.paths import default_database_url
|
|
15
|
+
|
|
16
|
+
DATABASE_URL = default_database_url()
|
|
17
|
+
|
|
18
|
+
engine = create_async_engine(DATABASE_URL, echo=False)
|
|
19
|
+
SessionLocal = async_sessionmaker(engine, class_=AsyncSession, expire_on_commit=False)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class Base(DeclarativeBase):
    """Declarative base class shared by the ORM models in this module."""
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class RunRecord(Base):
    """One evaluation run (table ``runs``)."""

    __tablename__ = "runs"

    # Surrogate key; run_id is the unique external identifier.
    id = Column(Integer, primary_key=True, autoincrement=True)
    run_id = Column(String, unique=True, index=True)

    # finished_at stays NULL until the run has ended.
    started_at = Column(DateTime, default=func.now())
    finished_at = Column(DateTime, nullable=True)

    # Aggregate outcome of the run.
    pass_rate = Column(Float, default=0.0)
    passed = Column(Integer, default=0)
    total = Column(Integer, default=0)

    # JSON payloads: a results list and the scenarios skipped for this run.
    results = Column(JSON, default=list)
    skipped_scenarios = Column(JSON, nullable=True)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
class ScenarioResultRecord(Base):
    """Outcome of one scenario within a run (table ``scenario_results``)."""

    __tablename__ = "scenario_results"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning run, referenced through runs.run_id rather than the integer key.
    run_id = Column(String, ForeignKey("runs.run_id"), index=True)

    # Identity of the scenario that was executed.
    scenario_id = Column(String, index=True)
    scenario_name = Column(String)
    agent_type = Column(String)

    # Scoring outcome.
    status = Column(String)
    overall_score = Column(Float)
    pass_rate = Column(Float)
    reasoning = Column(Text, default="")
    error = Column(Text, nullable=True)
    # JSON list with one entry per scored step.
    steps = Column(JSON, default=list)

    started_at = Column(DateTime)
    finished_at = Column(DateTime, nullable=True)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
async def init_db() -> None:
    """Initialize the database, creating parent directories as needed.

    Steps:
      1. For a file-backed SQLite URL, create the database file's parent directory.
      2. Create all tables declared on ``Base``.
      3. Add the ``runs.skipped_scenarios`` column only when the existing table
         lacks it.

    The column migration runs in its own transaction and stays best-effort, so
    a failure is logged and cannot undo the table creation from step 2.
    """
    # Local import: this name is not among the module-level sqlalchemy imports.
    from sqlalchemy import inspect

    url_str = str(engine.url)
    if url_str.startswith("sqlite"):
        db_path_str = url_str.split(":///", 1)[-1]
        if db_path_str and db_path_str != ":memory:":
            db_path = Path(db_path_str).expanduser()
            db_path.parent.mkdir(parents=True, exist_ok=True)

    def _runs_lacks_skipped_column(sync_conn) -> bool:
        """Report whether ``runs`` has no ``skipped_scenarios`` column yet."""
        columns = inspect(sync_conn).get_columns("runs")
        return all(col["name"] != "skipped_scenarios" for col in columns)

    async with engine.begin() as conn:
        await conn.run_sync(Base.metadata.create_all)
        # Schema inspection needs a synchronous connection, hence run_sync.
        needs_migration = await conn.run_sync(_runs_lacks_skipped_column)

    if not needs_migration:
        return

    try:
        async with engine.begin() as conn:
            await conn.execute(text("ALTER TABLE runs ADD COLUMN skipped_scenarios JSON DEFAULT '[]'"))
    except Exception as e:
        # Another process may have added the column in the meantime.
        if "duplicate column" not in str(e).lower():
            logger.warning("Migration skipped_scenarios failed: %s", e)
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
async def get_db():
    """Yield one ``AsyncSession`` and close it when the consumer is done."""
    session = SessionLocal()
    async with session:
        yield session
|
evalgrid/cli/__init__.py
ADDED
|
File without changes
|
|
@@ -0,0 +1,139 @@
|
|
|
1
|
+
"""evalgrid init — scaffold .evalgrid/ in the current project."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import shutil
|
|
6
|
+
from pathlib import Path
|
|
7
|
+
|
|
8
|
+
import click
|
|
9
|
+
from rich.console import Console
|
|
10
|
+
from rich.panel import Panel
|
|
11
|
+
|
|
12
|
+
console = Console(legacy_windows=False)
|
|
13
|
+
|
|
14
|
+
CONFIG_TEMPLATE = """\
|
|
15
|
+
# evalgrid configuration
|
|
16
|
+
model: claude-sonnet-4-6
|
|
17
|
+
max_concurrent: 5
|
|
18
|
+
timeout_seconds: 120
|
|
19
|
+
retry_on_error: true
|
|
20
|
+
max_retries: 2
|
|
21
|
+
output_dir: .evalgrid/results
|
|
22
|
+
scenarios_dir: .evalgrid/scenarios
|
|
23
|
+
# Database path: ~/.evalgrid/results.db (override via DATABASE_URL env var)
|
|
24
|
+
|
|
25
|
+
# Choose your judge model (used to score agent responses)
|
|
26
|
+
judge:
|
|
27
|
+
# Anthropic (default)
|
|
28
|
+
provider: anthropic
|
|
29
|
+
model: claude-sonnet-4-6
|
|
30
|
+
api_key_env: ANTHROPIC_API_KEY
|
|
31
|
+
|
|
32
|
+
# OpenAI alternative:
|
|
33
|
+
# provider: openai
|
|
34
|
+
# model: gpt-4o
|
|
35
|
+
# api_key_env: OPENAI_API_KEY
|
|
36
|
+
|
|
37
|
+
# Google alternative:
|
|
38
|
+
# provider: google
|
|
39
|
+
# model: gemini/gemini-1.5-pro
|
|
40
|
+
# api_key_env: GEMINI_API_KEY
|
|
41
|
+
|
|
42
|
+
# Local Ollama (no API key needed):
|
|
43
|
+
# provider: ollama
|
|
44
|
+
# model: ollama/llama3.1:70b
|
|
45
|
+
# api_key_env: ""
|
|
46
|
+
|
|
47
|
+
# OpenRouter (200+ models, one key):
|
|
48
|
+
# provider: openrouter
|
|
49
|
+
# model: openrouter/anthropic/claude-sonnet-4
|
|
50
|
+
# api_key_env: OPENROUTER_API_KEY
|
|
51
|
+
"""
|
|
52
|
+
|
|
53
|
+
EXAMPLE_SCENARIO = """\
|
|
54
|
+
id: example-hello-world
|
|
55
|
+
name: Hello World
|
|
56
|
+
description: Verify the agent responds politely to a greeting
|
|
57
|
+
agent_type: generic
|
|
58
|
+
prompt: |
|
|
59
|
+
You are a helpful AI assistant. Respond to user messages in a friendly,
|
|
60
|
+
concise way. Always greet the user back and offer to help.
|
|
61
|
+
test_input: "Hello! Can you help me today?"
|
|
62
|
+
expected_actions:
|
|
63
|
+
- action: greet_user
|
|
64
|
+
required: true
|
|
65
|
+
description: Agent greets the user back
|
|
66
|
+
- action: offer_assistance
|
|
67
|
+
required: true
|
|
68
|
+
description: Agent offers to help
|
|
69
|
+
- action: ask_followup
|
|
70
|
+
required: false
|
|
71
|
+
description: Agent asks what the user needs help with
|
|
72
|
+
success_criteria: >
|
|
73
|
+
The agent should acknowledge the greeting warmly and clearly offer assistance.
|
|
74
|
+
The tone should be friendly and professional.
|
|
75
|
+
tags:
|
|
76
|
+
- greeting
|
|
77
|
+
- basic
|
|
78
|
+
timeout_seconds: 30
|
|
79
|
+
"""
|
|
80
|
+
|
|
81
|
+
GITIGNORE_ADDITION = """
|
|
82
|
+
# evalgrid
|
|
83
|
+
.evalgrid/results/
|
|
84
|
+
.evalgrid/results.db
|
|
85
|
+
"""
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
@click.command()
@click.option("--dir", "target_dir", default=".", help="Project directory to initialize")
# Paired on/off switch: a plain flag that defaults to True could never be
# turned off from the command line.
@click.option("--with-examples/--no-examples", default=True, help="Copy example scenarios")
def init(target_dir: str, with_examples: bool) -> None:
    """Scaffold a .evalgrid/ folder in your project."""
    # NOTE: the docstring above is shown by click as the command's help text,
    # so implementation notes live in comments instead.
    #
    # Creates <target_dir>/.evalgrid/{scenarios,results}, writes config.yml and
    # (unless --no-examples) an example scenario, appends ignore rules to an
    # existing .gitignore, then prints a summary panel.
    root = Path(target_dir).resolve()
    evalgrid_dir = root / ".evalgrid"

    # Re-running overwrites config.yml and the example file, so ask first.
    if evalgrid_dir.exists():
        console.print("[yellow]! .evalgrid/ already exists.[/yellow]")
        if not click.confirm("Reinitialize?", default=False):
            return

    # Create directory structure
    (evalgrid_dir / "scenarios").mkdir(parents=True, exist_ok=True)
    (evalgrid_dir / "results").mkdir(parents=True, exist_ok=True)

    # Write config (explicit UTF-8 so the result does not depend on the locale)
    config_path = evalgrid_dir / "config.yml"
    config_path.write_text(CONFIG_TEMPLATE, encoding="utf-8")

    # Write example scenario
    if with_examples:
        example_path = evalgrid_dir / "scenarios" / "example-hello-world.yml"
        example_path.write_text(EXAMPLE_SCENARIO, encoding="utf-8")

    # Append to .gitignore if it exists; skipped when the rules are already there.
    gitignore = root / ".gitignore"
    if gitignore.exists():
        content = gitignore.read_text(encoding="utf-8")
        if ".evalgrid/results" not in content:
            gitignore.write_text(content + GITIGNORE_ADDITION, encoding="utf-8")

    console.print(
        Panel.fit(
            "[bold green]evalgrid initialized![/bold green]\n\n"
            f"[dim]Config:[/dim] [cyan]{config_path.relative_to(root)}[/cyan]\n"
            f"[dim]Scenarios:[/dim] [cyan]{(evalgrid_dir / 'scenarios').relative_to(root)}[/cyan]\n\n"
            "[bold]Next steps:[/bold]\n\n"
            " 1. Try it now (no API key required):\n"
            " [cyan]evalgrid run --mock[/cyan]\n\n"
            " 2. To run with real AI scoring, set your API key:\n"
            " [cyan]Linux/macOS:[/cyan] export ANTHROPIC_API_KEY=sk-ant-...\n"
            " [cyan]Windows:[/cyan] $env:ANTHROPIC_API_KEY = \"sk-ant-...\"\n\n"
            " 3. Review the example scenario:\n"
            " [cyan].evalgrid/scenarios/example-hello-world.yml[/cyan]\n\n"
            " 4. Open the dashboard:\n"
            " [cyan]evalgrid server[/cyan] -> http://localhost:8000",
            title="evalgrid",
            border_style="green",
        )
    )
|