ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
ragbits/evaluate/api.py
DELETED
|
@@ -1,603 +0,0 @@
|
|
|
1
|
-
"""EvalAPI - FastAPI server for evaluation UI."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import asyncio
|
|
6
|
-
import importlib
|
|
7
|
-
import json
|
|
8
|
-
import logging
|
|
9
|
-
from collections.abc import AsyncGenerator, Callable
|
|
10
|
-
from contextlib import asynccontextmanager
|
|
11
|
-
from pathlib import Path
|
|
12
|
-
from typing import TYPE_CHECKING, Any
|
|
13
|
-
|
|
14
|
-
import uvicorn
|
|
15
|
-
from fastapi import FastAPI, HTTPException, Request, status
|
|
16
|
-
from fastapi.exceptions import RequestValidationError
|
|
17
|
-
from fastapi.middleware.cors import CORSMiddleware
|
|
18
|
-
from fastapi.responses import HTMLResponse, JSONResponse, StreamingResponse
|
|
19
|
-
from fastapi.staticfiles import StaticFiles
|
|
20
|
-
|
|
21
|
-
from ragbits.chat.interface import ChatInterface
|
|
22
|
-
from ragbits.evaluate.agent_simulation.models import Personality, Scenario, SimulationConfig
|
|
23
|
-
from ragbits.evaluate.agent_simulation.results import SimulationResult, SimulationStatus
|
|
24
|
-
from ragbits.evaluate.agent_simulation.scenarios import ScenarioFile, load_personalities, load_scenario_file
|
|
25
|
-
from ragbits.evaluate.api_types import (
|
|
26
|
-
EvalConfigResponse,
|
|
27
|
-
PersonasListResponse,
|
|
28
|
-
PersonaSummary,
|
|
29
|
-
ResultsListResponse,
|
|
30
|
-
RunEvaluationRequest,
|
|
31
|
-
RunStartResponse,
|
|
32
|
-
ScenarioDetail,
|
|
33
|
-
ScenarioFileSummary,
|
|
34
|
-
ScenarioSummary,
|
|
35
|
-
SimulationRunsListResponse,
|
|
36
|
-
TaskDetail,
|
|
37
|
-
)
|
|
38
|
-
from ragbits.evaluate.execution_manager import ExecutionManager, create_progress_callback
|
|
39
|
-
|
|
40
|
-
if TYPE_CHECKING:
|
|
41
|
-
from ragbits.evaluate.stores.base import EvalReportStore
|
|
42
|
-
|
|
43
|
-
logger = logging.getLogger(__name__)
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
class EvalAPI:
|
|
47
|
-
"""FastAPI server for evaluation UI with scenario management and parallel execution."""
|
|
48
|
-
|
|
49
|
-
def __init__(
    self,
    chat_factory: Callable[[], ChatInterface] | str,
    scenarios_dir: str = "./scenarios",
    results_dir: str = "./eval_results",
    cors_origins: list[str] | None = None,
    ui_build_dir: str | None = None,
    response_adapters: list[Any] | None = None,
    simulation_config: SimulationConfig | None = None,
    store: EvalReportStore | None = None,
) -> None:
    """Initialize the EvalAPI.

    Resolves the chat factory and UI directory, creates the scenarios directory
    (and the results directory when no store is supplied), builds the execution
    manager and the FastAPI application, then registers middleware, routes and
    exception handlers.

    Args:
        chat_factory: Factory function that creates ChatInterface instances,
            or a string path in format "module:function".
        scenarios_dir: Directory containing scenario JSON files.
        results_dir: Directory for storing evaluation results (used if store is not provided).
        cors_origins: List of allowed CORS origins.
        ui_build_dir: Path to custom UI build directory.
        response_adapters: List of response adapters for processing chat responses.
            ``None`` is stored as an empty list.
        simulation_config: Default SimulationConfig for running evaluations.
            Can be overridden per-run via API request.
        store: Storage backend for evaluation results. If not provided,
            uses FileEvalReportStore with results_dir.
    """
    self.chat_factory = self._load_chat_factory(chat_factory)
    self.scenarios_dir = Path(scenarios_dir)
    self.results_dir = Path(results_dir)
    self.dist_dir = self._resolve_ui_build_dir(ui_build_dir)
    self.cors_origins = cors_origins or []
    # Mirror the cors_origins handling: the attribute is always a list, never None.
    self.response_adapters = response_adapters if response_adapters is not None else []
    self.simulation_config = simulation_config or SimulationConfig()

    # Ensure directories exist
    self.scenarios_dir.mkdir(parents=True, exist_ok=True)

    # Initialize execution manager with store or default to file-based
    if store is not None:
        self.execution_manager = ExecutionManager(store=store)
    else:
        self.results_dir.mkdir(parents=True, exist_ok=True)
        # NOTE(review): a Path is passed as ``store`` here; presumably ExecutionManager
        # wraps it in a file-based store - confirm against its constructor.
        self.execution_manager = ExecutionManager(store=self.results_dir)

    # Cache for loaded scenarios and scenario files
    self._scenarios_cache: dict[str, Scenario] | None = None
    self._scenario_files_cache: list[ScenarioFile] | None = None
    # Cache for loaded personas
    self._personas_cache: dict[str, Personality] | None = None

    @asynccontextmanager
    async def lifespan(app: FastAPI) -> AsyncGenerator[None, None]:
        # Load scenarios and personas on startup
        self._load_all_scenarios()
        self._load_all_personas()
        yield

    self.app = FastAPI(lifespan=lifespan, title="Ragbits Evaluation API")

    self.configure_app()
    self.setup_routes()
    self.setup_exception_handlers()
|
|
111
|
-
|
|
112
|
-
def configure_app(self) -> None:
    """Attach the CORS middleware and mount the UI ``assets`` directory when it exists."""
    self.app.add_middleware(
        CORSMiddleware,
        allow_origins=self.cors_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )

    # Static assets are optional: skip the mount when the UI build has no assets folder.
    assets_dir = self.dist_dir / "assets"
    if not assets_dir.exists():
        return
    static_app = StaticFiles(directory=str(assets_dir))
    self.app.mount("/assets", static_app, name="assets")
|
|
126
|
-
|
|
127
|
-
def setup_exception_handlers(self) -> None:
    """Register the handler that turns request validation errors into a 422 JSON response."""

    @self.app.exception_handler(RequestValidationError)
    async def validation_exception_handler(request: Request, exc: RequestValidationError) -> JSONResponse:
        logger.error(f"Validation error: {exc}")
        # Echo both the validation details and the offending request body to the client.
        payload = {"detail": exc.errors(), "body": exc.body}
        return JSONResponse(status_code=status.HTTP_422_UNPROCESSABLE_ENTITY, content=payload)
|
|
137
|
-
|
|
138
|
-
def setup_routes(self) -> None:
    """Register every route group on the application.

    The UI group is registered last because its catch-all path would otherwise
    shadow the API routes registered after it.
    """
    route_groups = (
        self._setup_health_routes,
        self._setup_config_routes,
        self._setup_scenario_routes,
        self._setup_execution_routes,
        self._setup_persona_routes,
        self._setup_results_routes,
        self._setup_ui_routes,
    )
    for register_group in route_groups:
        register_group()
|
|
147
|
-
|
|
148
|
-
def _setup_health_routes(self) -> None:
    """Register the liveness (``/api/health``) and readiness (``/api/ready``) endpoints."""

    @self.app.get("/api/health", response_class=JSONResponse)
    async def health() -> JSONResponse:
        """Basic liveness check - returns OK if the server is running."""
        return JSONResponse(content={"status": "ok"})

    @self.app.get("/api/ready", response_class=JSONResponse)
    async def ready() -> JSONResponse:
        """Readiness check - verifies the API is ready to handle requests."""
        checks = {
            "scenarios_loaded": self._scenarios_cache is not None,
            "personas_loaded": self._personas_cache is not None,
            "scenarios_dir_exists": self.scenarios_dir.exists(),
            "results_dir_exists": self.results_dir.exists(),
        }
        # A single failing check makes the whole service report 503.
        if all(checks.values()):
            overall, http_status = "ready", status.HTTP_200_OK
        else:
            overall, http_status = "not_ready", status.HTTP_503_SERVICE_UNAVAILABLE
        return JSONResponse(content={"status": overall, "checks": checks}, status_code=http_status)
|
|
173
|
-
|
|
174
|
-
def _setup_config_routes(self) -> None:
    """Register the endpoint that reports the available scenarios and scenario files."""

    def _summarize(scenario: Scenario) -> ScenarioSummary:
        # Compact view of a scenario: its name, number of tasks and group.
        return ScenarioSummary(name=scenario.name, num_tasks=len(scenario.tasks), group=scenario.group)

    @self.app.get("/api/eval/config", response_class=JSONResponse)
    async def get_config() -> JSONResponse:
        """Get evaluation configuration with available scenarios."""
        scenarios = self._get_scenarios()
        scenario_files = self._get_scenario_files()

        available = [_summarize(item) for item in scenarios.values()]
        per_file = [
            ScenarioFileSummary(
                filename=entry.filename,
                group=entry.group,
                scenarios=[_summarize(item) for item in entry.scenarios],
            )
            for entry in scenario_files
        ]
        response = EvalConfigResponse(
            available_scenarios=available,
            scenario_files=per_file,
            scenarios_dir=str(self.scenarios_dir),
        )
        return JSONResponse(content=response.model_dump())
|
|
199
|
-
|
|
200
|
-
def _setup_scenario_routes(self) -> None:
    """Register endpoints for reading one scenario and for reloading scenarios from disk."""

    @self.app.get("/api/eval/scenarios/{scenario_name}", response_class=JSONResponse)
    async def get_scenario(scenario_name: str) -> JSONResponse:
        """Get full scenario details for viewing/editing."""
        scenario = self._get_scenarios().get(scenario_name)
        if not scenario:
            raise HTTPException(status_code=404, detail=f"Scenario '{scenario_name}' not found")

        task_details = [
            TaskDetail(task=item.task, checkers=item.checkers, checker_mode=item.checker_mode)
            for item in scenario.tasks
        ]
        detail = ScenarioDetail(name=scenario.name, tasks=task_details, group=scenario.group)
        return JSONResponse(content=detail.model_dump())

    @self.app.post("/api/eval/scenarios/reload", response_class=JSONResponse)
    async def reload_scenarios() -> JSONResponse:
        """Reload scenarios from disk."""
        # Drop both caches before reloading so stale entries cannot survive.
        self._scenarios_cache = None
        self._scenario_files_cache = None
        self._load_all_scenarios()
        reloaded = self._get_scenarios()
        return JSONResponse(content={"status": "reloaded", "count": len(reloaded)})
|
|
238
|
-
|
|
239
|
-
def _setup_execution_routes(self) -> None:
    """Register endpoints that start evaluation runs and expose their progress.

    Routes:
        POST /api/eval/run: validate the request and schedule one background task
            per scenario x persona combination.
        GET /api/eval/progress/{run_id}: stream progress as server-sent events.
        GET /api/eval/progress/{run_id}/buffer/{scenario_run_id}: return the events
            buffered so far for one scenario run.
    """
    # The event loop keeps only weak references to tasks, so hold strong references
    # here until each task finishes; otherwise a running task may be garbage-collected.
    background_tasks: set[asyncio.Task[None]] = set()

    @self.app.post("/api/eval/run", response_class=JSONResponse)
    async def run_evaluation(request: RunEvaluationRequest) -> JSONResponse:
        """Start an evaluation run with one or more scenarios.

        If personas are provided, creates a matrix run: each scenario × each persona.
        """
        scenarios = self._get_scenarios()

        # Validate all requested scenarios exist
        missing = [name for name in request.scenario_names if name not in scenarios]
        if missing:
            raise HTTPException(status_code=404, detail=f"Scenarios not found: {missing}")

        # Validate personas exist if provided (checked against the loaded personas,
        # not against the scenario names)
        if request.personas:
            known_personas = self._get_personas()
            missing_personas = [p for p in request.personas if p not in known_personas]
            if missing_personas:
                raise HTTPException(status_code=404, detail=f"Personas not found: {missing_personas}")

        # Generate run ID and create progress queue
        run_id = ExecutionManager.generate_run_id()

        # Determine personas to use (None means single run without persona)
        personas_to_run: list[str | None] = request.personas if request.personas else [None]

        # Build list of scenario runs (scenarios × personas matrix)
        scenario_run_names = []
        for scenario_name in request.scenario_names:
            for persona in personas_to_run:
                # Create unique name for tracking
                run_name = f"{scenario_name}:{persona}" if persona else scenario_name
                scenario_run_names.append(run_name)

        self.execution_manager.create_run(run_id, scenario_run_names)

        # Start execution tasks for each scenario × persona combination
        base_config = request.config.model_dump()
        for scenario_name in request.scenario_names:
            scenario = scenarios[scenario_name]
            for persona in personas_to_run:
                config_with_persona = {**base_config, "persona": persona}
                task = asyncio.create_task(
                    self._run_scenario(run_id, scenario, config_with_persona, persona),
                )
                background_tasks.add(task)
                task.add_done_callback(background_tasks.discard)

        response = RunStartResponse(run_id=run_id, scenarios=request.scenario_names)
        return JSONResponse(content=response.model_dump())

    @self.app.get("/api/eval/progress/{run_id}")
    async def stream_progress(run_id: str) -> StreamingResponse:
        """Stream progress updates for a running evaluation."""
        if not self.execution_manager.is_run_active(run_id):
            raise HTTPException(status_code=404, detail=f"Run '{run_id}' not found or completed")

        return StreamingResponse(
            self._progress_generator(run_id),
            media_type="text/event-stream",
        )

    @self.app.get("/api/eval/progress/{run_id}/buffer/{scenario_run_id}", response_class=JSONResponse)
    async def get_scenario_buffer(run_id: str, scenario_run_id: str) -> JSONResponse:
        """Get buffered events for a scenario run.

        This allows fetching all events that occurred before subscribing to SSE,
        enabling late subscribers to catch up on scenario history.
        """
        if not self.execution_manager.is_run_active(run_id):
            raise HTTPException(status_code=404, detail=f"Run '{run_id}' not found or completed")

        events = self.execution_manager.get_scenario_run_buffer(run_id, scenario_run_id)
        return JSONResponse(content={"events": [e.model_dump(mode="json") for e in events]})
|
|
313
|
-
|
|
314
|
-
def _setup_persona_routes(self) -> None:
    """Register endpoints for listing personas, reading one persona and reloading them."""

    @self.app.get("/api/eval/personas", response_class=JSONResponse)
    async def list_personas() -> JSONResponse:
        """List all available personas."""
        personas = self._get_personas()
        summaries = [PersonaSummary(name=item.name, description=item.description) for item in personas.values()]
        response = PersonasListResponse(personas=summaries, total=len(personas))
        return JSONResponse(content=response.model_dump())

    @self.app.get("/api/eval/personas/{persona_name}", response_class=JSONResponse)
    async def get_persona(persona_name: str) -> JSONResponse:
        """Get full persona details."""
        persona = self._get_persona(persona_name)
        if not persona:
            raise HTTPException(status_code=404, detail=f"Persona '{persona_name}' not found")
        summary = PersonaSummary(name=persona.name, description=persona.description)
        return JSONResponse(content=summary.model_dump())

    @self.app.post("/api/eval/personas/reload", response_class=JSONResponse)
    async def reload_personas() -> JSONResponse:
        """Reload personas from disk."""
        # Clear the cache first so the reload starts from an empty state.
        self._personas_cache = None
        self._load_all_personas()
        reloaded = self._get_personas()
        return JSONResponse(content={"status": "reloaded", "count": len(reloaded)})
|
|
347
|
-
|
|
348
|
-
def _setup_results_routes(self) -> None:
    """Register endpoints for listing and reading runs and for listing, reading and deleting results.

    Every handler delegates to the execution manager and answers 404 when the
    requested run or result is not found.
    """

    @self.app.get("/api/eval/runs", response_class=JSONResponse)
    async def list_runs(limit: int = 50, offset: int = 0) -> JSONResponse:
        """List simulation runs (batch runs grouped by run_id)."""
        runs, total = await self.execution_manager.list_runs(limit=limit, offset=offset)
        listing = SimulationRunsListResponse(runs=runs, total=total)
        return JSONResponse(content=listing.model_dump(mode="json"))

    @self.app.get("/api/eval/runs/{run_id}", response_class=JSONResponse)
    async def get_run(run_id: str) -> JSONResponse:
        """Get full details for a simulation run."""
        run = await self.execution_manager.get_run(run_id)
        if run:
            return JSONResponse(content=run.model_dump(mode="json"))
        raise HTTPException(status_code=404, detail=f"Run '{run_id}' not found")

    @self.app.get("/api/eval/results", response_class=JSONResponse)
    async def list_results(limit: int = 50, offset: int = 0) -> JSONResponse:
        """List completed evaluation results."""
        results, total = await self.execution_manager.list_results(limit=limit, offset=offset)
        listing = ResultsListResponse(results=results, total=total)
        return JSONResponse(content=listing.model_dump(mode="json"))

    @self.app.get("/api/eval/results/{result_id}", response_class=JSONResponse)
    async def get_result(result_id: str) -> JSONResponse:
        """Get full evaluation result with conversation details."""
        result = await self.execution_manager.load_result(result_id)
        if result:
            return JSONResponse(content=result.to_dict())
        raise HTTPException(status_code=404, detail=f"Result '{result_id}' not found")

    @self.app.delete("/api/eval/results/{result_id}", response_class=JSONResponse)
    async def delete_result(result_id: str) -> JSONResponse:
        """Delete an evaluation result."""
        deleted = await self.execution_manager.delete_result(result_id)
        if not deleted:
            raise HTTPException(status_code=404, detail=f"Result '{result_id}' not found")
        return JSONResponse(content={"status": "deleted"})
|
|
387
|
-
|
|
388
|
-
def _setup_ui_routes(self) -> None:
    """Register the catch-all route that serves the evaluation UI entry page.

    The handler ignores the requested path and always returns the same HTML
    document: ``eval.html`` from the UI build directory, or ``index.html`` when
    ``eval.html`` is absent. It answers 404 when neither file exists.
    """

    @self.app.get("/{full_path:path}", response_class=HTMLResponse)
    async def serve_ui(full_path: str = "") -> HTMLResponse:
        """Serve the evaluation UI."""
        index_file = self.dist_dir / "eval.html"
        if not index_file.exists():
            # Fall back to index.html if eval.html doesn't exist
            index_file = self.dist_dir / "index.html"
        if not index_file.exists():
            raise HTTPException(status_code=404, detail="UI not built")
        # Decode explicitly as UTF-8 rather than with the platform default encoding,
        # which is not UTF-8 on every system and would corrupt non-ASCII markup.
        return HTMLResponse(content=index_file.read_text(encoding="utf-8"))
|
|
402
|
-
|
|
403
|
-
async def _progress_generator(self, run_id: str) -> AsyncGenerator[str, None]:
    """Yield each progress update of a run as a server-sent-event ``data:`` frame.

    Args:
        run_id: Identifier of the run whose progress stream is consumed.

    Yields:
        One ``data: <json>`` frame, terminated by a blank line, per update.
    """
    progress_stream = self.execution_manager.stream_progress(run_id)
    async for update in progress_stream:
        serialized = json.dumps(update.model_dump(mode="json"))
        yield f"data: {serialized}\n\n"
|
|
408
|
-
|
|
409
|
-
async def _run_scenario(
    self,
    run_id: str,
    scenario: Scenario,
    request_config: dict[str, Any],
    persona: str | None = None,
) -> None:
    """Run a single scenario and emit progress updates.

    Registers the scenario run with the execution manager, runs the simulation
    against a fresh chat instance, saves the result and emits a ``complete``
    event. Any exception is logged, saved as a FAILED result and reported
    through an ``error`` event instead of being propagated.

    Args:
        run_id: Identifier of the batch run this scenario belongs to.
        scenario: Scenario to simulate.
        request_config: Per-request configuration overrides; ``None`` values and
            the ``persona`` key are ignored when merging into the default config.
        persona: Optional persona name, looked up in the persona cache.
    """
    from datetime import datetime, timezone

    from ragbits.evaluate.agent_simulation.conversation import run_simulation

    # Create unique run name for scenario+persona combination
    run_name = f"{scenario.name}:{persona}" if persona else scenario.name

    # Register the scenario run to get a unique ID and enable event buffering
    scenario_run_id = self.execution_manager.register_scenario_run(run_id, run_name)
    callback = create_progress_callback(
        run_id, scenario_run_id, scenario.name, self.execution_manager, persona=persona
    )

    # Get personality from cache if persona is specified
    personality: Personality | None = self._get_persona(persona) if persona else None

    try:
        # Emit starting status
        await callback(
            "status",
            status=SimulationStatus.RUNNING,
            current_turn=0,
            current_task_index=0,
            current_task=scenario.tasks[0].task if scenario.tasks else None,
        )

        # Create a new ChatInterface instance for this scenario
        chat = self.chat_factory()
        await chat.setup()

        # Merge request config with default simulation config
        # Request values override defaults (excluding 'persona' which is handled separately)
        config = self.simulation_config.model_copy(
            update={k: v for k, v in request_config.items() if v is not None and k != "persona"}
        )

        # Run the simulation with progress callback and personality
        result = await run_simulation(
            scenario=scenario,
            chat=chat,
            config=config,
            personality=personality,
            progress_callback=callback,
        )

        # Save result
        result_id = await self.execution_manager.save_result(run_id, scenario_run_id, scenario.name, result)

        # Emit completion
        await callback(
            "complete",
            result_id=result_id,
            status=result.status,
            success_rate=result.metrics.success_rate if result.metrics else 0.0,
            total_turns=result.metrics.total_turns if result.metrics else 0,
            total_tasks=result.metrics.total_tasks if result.metrics else 0,
            tasks_completed=result.metrics.tasks_completed if result.metrics else 0,
        )

    except Exception as e:
        logger.exception(f"Error running scenario {scenario.name}")
        # Save failed result. The start time is taken from the manager's record of the
        # run when it has one, otherwise the current UTC time is used.
        # NOTE(review): this reads the manager's private ``_active_runs`` mapping.
        failed_result = SimulationResult(
            scenario_name=scenario.name,
            start_time=self.execution_manager._active_runs.get(run_id, {}).get(
                "start_time", datetime.now(timezone.utc)
            ),
            status=SimulationStatus.FAILED,
            error=str(e),
        )
        await self.execution_manager.save_result(run_id, scenario_run_id, scenario.name, failed_result)

        await callback("error", error=str(e))
|
|
488
|
-
|
|
489
|
-
def _load_all_scenarios(self) -> None:
    """Rebuild the scenario caches from every ``*.json`` file in the scenarios directory.

    A file that fails to load is logged as a warning and skipped; the remaining
    files are still processed.
    """
    by_name: dict[str, Scenario] = {}
    loaded_files: list[ScenarioFile] = []
    # Publish the fresh containers first so the caches are never None while loading.
    self._scenarios_cache = by_name
    self._scenario_files_cache = loaded_files

    for json_file in self.scenarios_dir.glob("*.json"):
        try:
            scenario_file = load_scenario_file(str(json_file))
            loaded_files.append(scenario_file)
            for scenario in scenario_file.scenarios:
                by_name[scenario.name] = scenario
            logger.info(f"Loaded {len(scenario_file.scenarios)} scenarios from {json_file}")
        except Exception as e:
            logger.warning(f"Failed to load scenarios from {json_file}: {e}")

    logger.info(f"Total scenarios loaded: {len(self._scenarios_cache)}")
|
|
505
|
-
|
|
506
|
-
def _get_scenarios(self) -> dict[str, Scenario]:
|
|
507
|
-
"""Get cached scenarios, loading if necessary."""
|
|
508
|
-
if self._scenarios_cache is None:
|
|
509
|
-
self._load_all_scenarios()
|
|
510
|
-
return self._scenarios_cache or {}
|
|
511
|
-
|
|
512
|
-
def _get_scenario_files(self) -> list[ScenarioFile]:
|
|
513
|
-
"""Get cached scenario files, loading if necessary."""
|
|
514
|
-
if self._scenario_files_cache is None:
|
|
515
|
-
self._load_all_scenarios()
|
|
516
|
-
return self._scenario_files_cache or []
|
|
517
|
-
|
|
518
|
-
def _load_all_personas(self) -> None:
    """Rebuild the persona cache from the personas file in the scenarios directory.

    ``personas.json`` is preferred; ``personalities.json`` is the fallback name.
    When neither exists the cache is left empty. A load failure is logged as a
    warning rather than raised.
    """
    self._personas_cache = {}

    # Try to find personas file (with fallback to old name)
    candidates = (
        self.scenarios_dir / "personas.json",
        self.scenarios_dir / "personalities.json",
    )
    personas_file = next((path for path in candidates if path.exists()), None)
    if personas_file is None:
        logger.info("No personas file found")
        return

    try:
        personas = load_personalities(str(personas_file))
        for persona in personas:
            self._personas_cache[persona.name] = persona
        logger.info(f"Loaded {len(personas)} personas from {personas_file}")
    except Exception as e:
        logger.warning(f"Failed to load personas from {personas_file}: {e}")
|
|
537
|
-
|
|
538
|
-
def _get_personas(self) -> dict[str, Personality]:
|
|
539
|
-
"""Get cached personas, loading if necessary."""
|
|
540
|
-
if self._personas_cache is None:
|
|
541
|
-
self._load_all_personas()
|
|
542
|
-
return self._personas_cache or {}
|
|
543
|
-
|
|
544
|
-
def _get_persona(self, name: str) -> Personality | None:
|
|
545
|
-
"""Get a specific persona by name."""
|
|
546
|
-
return self._get_personas().get(name)
|
|
547
|
-
|
|
548
|
-
@staticmethod
|
|
549
|
-
def _resolve_ui_build_dir(ui_build_dir: str | None) -> Path:
|
|
550
|
-
"""Resolve the UI build directory path.
|
|
551
|
-
|
|
552
|
-
Priority:
|
|
553
|
-
1. Custom path if provided
|
|
554
|
-
2. Eval-specific ui-build directory if exists
|
|
555
|
-
3. Fallback to ragbits-chat ui-build directory
|
|
556
|
-
|
|
557
|
-
Args:
|
|
558
|
-
ui_build_dir: Optional custom UI build directory path.
|
|
559
|
-
|
|
560
|
-
Returns:
|
|
561
|
-
Path to the UI build directory.
|
|
562
|
-
"""
|
|
563
|
-
if ui_build_dir:
|
|
564
|
-
return Path(ui_build_dir)
|
|
565
|
-
|
|
566
|
-
# Try eval-specific ui-build first
|
|
567
|
-
eval_ui_dir = Path(__file__).parent / "ui-build"
|
|
568
|
-
if eval_ui_dir.exists():
|
|
569
|
-
return eval_ui_dir
|
|
570
|
-
|
|
571
|
-
# Fallback to ragbits-chat ui-build
|
|
572
|
-
import ragbits.chat.api as chat_api
|
|
573
|
-
|
|
574
|
-
return Path(chat_api.__file__).parent / "ui-build"
|
|
575
|
-
|
|
576
|
-
@staticmethod
|
|
577
|
-
def _load_chat_factory(factory: Callable[[], ChatInterface] | str) -> Callable[[], ChatInterface]:
|
|
578
|
-
"""Load chat factory from callable or string path.
|
|
579
|
-
|
|
580
|
-
Args:
|
|
581
|
-
factory: Factory function or string path in format "module:function".
|
|
582
|
-
|
|
583
|
-
Returns:
|
|
584
|
-
Callable that creates ChatInterface instances.
|
|
585
|
-
"""
|
|
586
|
-
if isinstance(factory, str):
|
|
587
|
-
module_path, obj_name = factory.split(":")
|
|
588
|
-
logger.info(f"Loading chat factory from {module_path}:{obj_name}")
|
|
589
|
-
module = importlib.import_module(module_path)
|
|
590
|
-
factory_func = getattr(module, obj_name)
|
|
591
|
-
if not callable(factory_func):
|
|
592
|
-
raise TypeError(f"{obj_name} is not callable")
|
|
593
|
-
return factory_func
|
|
594
|
-
return factory
|
|
595
|
-
|
|
596
|
-
def run(self, host: str = "127.0.0.1", port: int = 8001) -> None:
    """Serve the application with uvicorn.

    Args:
        host: Host to bind to.
        port: Port to bind to.
    """
    bind_options = {"host": host, "port": port}
    uvicorn.run(self.app, **bind_options)
|