ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff compares publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
ragbits/evaluate/execution_manager.py
DELETED
@@ -1,451 +0,0 @@
"""Execution manager for running and tracking evaluation simulations."""

from __future__ import annotations

import asyncio
import logging
import uuid
from collections.abc import AsyncGenerator, Awaitable, Callable
from datetime import datetime, timezone
from pathlib import Path
from typing import TYPE_CHECKING, Any

from ragbits.evaluate.agent_simulation.results import ResponseChunk, SimulationResult, SimulationStatus
from ragbits.evaluate.api_types import (
    CompletionUpdate,
    ErrorUpdate,
    ProgressUpdate,
    ResponseChunkUpdate,
    ResultSummary,
    SimulationRunDetail,
    SimulationRunSummary,
    StatusProgressUpdate,
    TaskCompleteUpdate,
    TurnProgressUpdate,
)

if TYPE_CHECKING:
    from ragbits.evaluate.stores.base import EvalReportStore

logger = logging.getLogger(__name__)

ProgressCallback = Callable[[ProgressUpdate], Awaitable[None]]


class ExecutionManager:
    """Manages parallel simulation execution and result persistence."""

    def __init__(self, store: EvalReportStore | Path | None = None) -> None:
        """Initialize the execution manager.

        Args:
            store: Storage backend for evaluation results. Can be:
                - EvalReportStore instance for custom storage
                - Path for file-based storage (backward compatibility)
                - None to use default file-based storage in ./eval_results
        """
        from ragbits.evaluate.stores import FileEvalReportStore

        if store is None:
            self.store: EvalReportStore = FileEvalReportStore(Path("./eval_results"))
        elif isinstance(store, Path):
            self.store = FileEvalReportStore(store)
        else:
            self.store = store

        # Active runs: run_id -> dict with queue, tasks, start_time
        self._active_runs: dict[str, dict[str, Any]] = {}

    @staticmethod
    def generate_run_id() -> str:
        """Generate a unique run ID."""
        timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
        unique = uuid.uuid4().hex[:8]
        return f"run_{timestamp}_{unique}"

    @staticmethod
    def generate_scenario_run_id(scenario_name: str, persona: str | None = None) -> str:
        """Generate a unique scenario run ID.

        Args:
            scenario_name: Name of the scenario.
            persona: Optional persona name.

        Returns:
            Unique scenario run ID.
        """
        safe_scenario = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
        safe_persona = ""
        if persona:
            safe_persona = "_" + "".join(c if c.isalnum() or c in "-_" else "_" for c in persona)
        unique = uuid.uuid4().hex[:6]
        return f"sr_{safe_scenario}{safe_persona}_{unique}"

    def create_run(self, run_id: str, scenario_names: list[str]) -> asyncio.Queue[ProgressUpdate]:
        """Create a new run and return its progress queue.

        Args:
            run_id: Unique identifier for this run.
            scenario_names: List of scenario names being executed.

        Returns:
            Queue for receiving progress updates.
        """
        queue: asyncio.Queue[ProgressUpdate] = asyncio.Queue()
        self._active_runs[run_id] = {
            "queue": queue,
            "scenarios": scenario_names,
            "start_time": datetime.now(timezone.utc),
            "tasks": {},
            "completed": set(),
            # Scenario run registry: scenario_run_id -> {scenario_name, persona, ...}
            "scenario_runs": {},
            # Buffer all events per scenario_run_id for late subscribers
            "event_buffer": {},
        }
        return queue

    def register_scenario_run(
        self,
        run_id: str,
        scenario_name: str,
        persona: str | None = None,
    ) -> str:
        """Register a scenario run and get its unique ID.

        Args:
            run_id: Run identifier.
            scenario_name: Name of the scenario.
            persona: Optional persona name.

        Returns:
            Unique scenario run ID.
        """
        run = self._active_runs.get(run_id)
        if not run:
            raise ValueError(f"Run '{run_id}' not found")

        scenario_run_id = self.generate_scenario_run_id(scenario_name, persona)
        run["scenario_runs"][scenario_run_id] = {
            "scenario_name": scenario_name,
            "persona": persona,
            "start_time": datetime.now(timezone.utc),
        }
        run["event_buffer"][scenario_run_id] = []
        return scenario_run_id

    def get_scenario_run_buffer(self, run_id: str, scenario_run_id: str) -> list[ProgressUpdate]:
        """Get buffered events for a scenario run.

        Args:
            run_id: Run identifier.
            scenario_run_id: Scenario run identifier.

        Returns:
            List of buffered progress updates.
        """
        run = self._active_runs.get(run_id)
        if not run:
            return []
        return run["event_buffer"].get(scenario_run_id, [])

    def get_progress_queue(self, run_id: str) -> asyncio.Queue[ProgressUpdate] | None:
        """Get the progress queue for a run.

        Args:
            run_id: Run identifier.

        Returns:
            Progress queue if run exists, None otherwise.
        """
        run = self._active_runs.get(run_id)
        return run["queue"] if run else None

    def is_run_active(self, run_id: str) -> bool:
        """Check if a run is still active.

        Args:
            run_id: Run identifier.

        Returns:
            True if run is active.
        """
        return run_id in self._active_runs

    def mark_scenario_complete(self, run_id: str, scenario_name: str) -> bool:
        """Mark a scenario as complete and check if run is finished.

        Args:
            run_id: Run identifier.
            scenario_name: Name of completed scenario.

        Returns:
            True if all scenarios in the run are complete.
        """
        run = self._active_runs.get(run_id)
        if not run:
            return True

        run["completed"].add(scenario_name)
        return len(run["completed"]) >= len(run["scenarios"])

    def cleanup_run(self, run_id: str) -> None:
        """Clean up a completed run.

        Args:
            run_id: Run identifier to clean up.
        """
        if run_id in self._active_runs:
            del self._active_runs[run_id]

    async def emit_progress(self, run_id: str, update: ProgressUpdate) -> None:
        """Emit a progress update to the run's queue and buffer it.

        Args:
            run_id: Run identifier.
            update: Progress update to emit.
        """
        run = self._active_runs.get(run_id)
        if not run:
            return

        # Buffer the event for the scenario run
        scenario_run_id = update.scenario_run_id
        if scenario_run_id in run["event_buffer"]:
            run["event_buffer"][scenario_run_id].append(update)

        # Also emit to the queue for real-time streaming
        queue = run.get("queue")
        if queue:
            await queue.put(update)

    async def stream_progress(self, run_id: str) -> AsyncGenerator[ProgressUpdate, None]:
        """Stream progress updates for a run.

        Args:
            run_id: Run identifier.

        Yields:
            Progress updates as they occur.
        """
        queue = self.get_progress_queue(run_id)
        if not queue:
            return

        while True:
            try:
                # Use timeout to allow checking if run is still active
                update = await asyncio.wait_for(queue.get(), timeout=30.0)
                yield update

                # Check if this is a terminal update
                if isinstance(update, CompletionUpdate | ErrorUpdate) and self.mark_scenario_complete(
                    run_id, update.scenario_name
                ):
                    # All scenarios complete, clean up
                    self.cleanup_run(run_id)
                    return
            except TimeoutError:
                # Check if run is still active
                if not self.is_run_active(run_id):
                    return
                continue
            except asyncio.CancelledError:
                return

    async def save_result(self, run_id: str, scenario_run_id: str, scenario_name: str, result: SimulationResult) -> str:
        """Save a simulation result.

        Args:
            run_id: Run identifier.
            scenario_run_id: Unique scenario run identifier.
            scenario_name: Name of the scenario.
            result: Simulation result to save.

        Returns:
            Result ID for later retrieval.
        """
        # Collect response chunks from the event buffer
        buffered_events = self.get_scenario_run_buffer(run_id, scenario_run_id)
        buffered_chunks = []
        for event in buffered_events:
            if event.type == "response_chunk":
                buffered_chunks.append(
                    ResponseChunk(
                        turn_index=event.turn_index,
                        task_index=event.task_index,
                        chunk_index=0,  # Will be re-indexed by the store
                        chunk_type=event.chunk_type,
                        chunk_data=event.chunk_data,
                    )
                )

        return await self.store.save_result(
            run_id=run_id,
            scenario_run_id=scenario_run_id,
            scenario_name=scenario_name,
            result=result,
            buffered_chunks=buffered_chunks if buffered_chunks else None,
        )

    async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
        """List evaluation results with pagination.

        Args:
            limit: Maximum number of results to return.
            offset: Number of results to skip.

        Returns:
            Tuple of (results list, total count).
        """
        return await self.store.list_results(limit=limit, offset=offset)

    async def load_result(self, result_id: str) -> SimulationResult | None:
        """Load a simulation result.

        Args:
            result_id: Result identifier.

        Returns:
            SimulationResult if found, None otherwise.
        """
        return await self.store.load_result(result_id)

    async def delete_result(self, result_id: str) -> bool:
        """Delete a simulation result.

        Args:
            result_id: Result identifier.

        Returns:
            True if deleted, False if not found.
        """
        return await self.store.delete_result(result_id)

    async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
        """List simulation runs (batch runs grouped by run_id).

        Args:
            limit: Maximum number of runs to return.
            offset: Number of runs to skip.

        Returns:
            Tuple of (runs list, total count).
        """
        return await self.store.list_runs(limit=limit, offset=offset)

    async def get_run(self, run_id: str) -> SimulationRunDetail | None:
        """Get full details for a simulation run.

        Args:
            run_id: Run identifier.

        Returns:
            SimulationRunDetail if found, None otherwise.
        """
        return await self.store.get_run(run_id)


def create_progress_callback(
    run_id: str,
    scenario_run_id: str,
    scenario_name: str,
    execution_manager: ExecutionManager,
    persona: str | None = None,
) -> ProgressCallback:
    """Create a progress callback for use with run_simulation.

    Args:
        run_id: Run identifier.
        scenario_run_id: Unique scenario run identifier (includes persona if any).
        scenario_name: Name of the scenario being run.
        execution_manager: Execution manager instance.
        persona: Optional persona name for this scenario run.

    Returns:
        Async callback function for progress updates.
    """

    async def callback(
        event_type: str,
        **kwargs: Any,
    ) -> None:
        """Progress callback for simulation events."""
        update: ProgressUpdate

        if event_type == "status":
            update = StatusProgressUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                status=kwargs.get("status", SimulationStatus.RUNNING),
                current_turn=kwargs.get("current_turn"),
                current_task_index=kwargs.get("current_task_index"),
                current_task=kwargs.get("current_task"),
            )
        elif event_type == "turn":
            update = TurnProgressUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                turn_index=kwargs.get("turn_index", 0),
                task_index=kwargs.get("task_index", 0),
                user_message=kwargs.get("user_message", ""),
                assistant_message=kwargs.get("assistant_message", ""),
                tool_calls=kwargs.get("tool_calls", []),
                task_completed=kwargs.get("task_completed", False),
                task_completed_reason=kwargs.get("task_completed_reason", ""),
                checkers=kwargs.get("checkers", []),
                checker_mode=kwargs.get("checker_mode", "all"),
            )
        elif event_type == "task_complete":
            update = TaskCompleteUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                task_index=kwargs.get("task_index", 0),
                task_description=kwargs.get("task_description", ""),
                turns_taken=kwargs.get("turns_taken", 0),
                reason=kwargs.get("reason", ""),
            )
        elif event_type == "complete":
            update = CompletionUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                result_id=kwargs.get("result_id", ""),
                status=kwargs.get("status", SimulationStatus.COMPLETED),
                success_rate=kwargs.get("success_rate", 0.0),
                total_turns=kwargs.get("total_turns", 0),
                total_tasks=kwargs.get("total_tasks", 0),
                tasks_completed=kwargs.get("tasks_completed", 0),
            )
        elif event_type == "error":
            update = ErrorUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                error=kwargs.get("error", "Unknown error"),
            )
        elif event_type == "response_chunk":
            update = ResponseChunkUpdate(
                run_id=run_id,
                scenario_run_id=scenario_run_id,
                scenario_name=scenario_name,
                persona=persona,
                turn_index=kwargs.get("turn_index", 0),
                task_index=kwargs.get("task_index", 0),
                chunk_type=kwargs.get("chunk_type", "unknown"),
                chunk_data=kwargs.get("chunk_data", {}),
            )
        else:
            return

        await execution_manager.emit_progress(run_id, update)

    return callback
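For orientation, a minimal sketch of how this removed module was wired together in 0.0.30rc1: the run_simulation driver itself is elided (it changed elsewhere in this diff), and the scenario name and persona used here are placeholders, not values from the package.

    import asyncio

    from ragbits.evaluate.execution_manager import ExecutionManager, create_progress_callback


    async def main() -> None:
        # Defaults to FileEvalReportStore("./eval_results"), as shown above.
        manager = ExecutionManager()
        run_id = ExecutionManager.generate_run_id()
        manager.create_run(run_id, scenario_names=["checkout"])

        # Each scenario run gets its own ID and event buffer.
        scenario_run_id = manager.register_scenario_run(run_id, "checkout", persona="impatient")
        callback = create_progress_callback(run_id, scenario_run_id, "checkout", manager, persona="impatient")

        # The simulation loop would normally invoke the callback; emit one event by hand.
        await callback("status", current_turn=1)

        # Events are buffered per scenario run and also streamed via stream_progress().
        print(len(manager.get_scenario_run_buffer(run_id, scenario_run_id)))  # -> 1


    asyncio.run(main())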
ragbits/evaluate/stores/__init__.py
DELETED
@@ -1,36 +0,0 @@
"""Evaluation report storage backends.

This module provides pluggable storage backends for evaluation reports.
The default is FileEvalReportStore which maintains backward compatibility
with the existing file-based approach.

Example usage:
    from ragbits.evaluate.stores import FileEvalReportStore, KVEvalReportStore

    # File-based storage (default)
    store = FileEvalReportStore(results_dir="./eval_results")

    # KV-based storage using PostgreSQL (recommended for production)
    from ragbits.core.storage.connections import PostgresConnection
    from ragbits.core.storage.kv_store import PostgresKVStore

    conn = PostgresConnection(host="localhost", database="mydb")
    kv = PostgresKVStore(connection=conn, table_name="eval_results")
    store = KVEvalReportStore(kv_store=kv)
"""

from ragbits.evaluate.stores.base import EvalReportStore
from ragbits.evaluate.stores.file import FileEvalReportStore

__all__ = [
    "EvalReportStore",
    "FileEvalReportStore",
]

# KV store is optional (requires ragbits-core with storage extras)
try:
    from ragbits.evaluate.stores.kv import KVEvalReportStore  # noqa: F401

    __all__.append("KVEvalReportStore")
except ImportError:
    pass
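A short consumer-side sketch of the optional-dependency pattern above: mirror the try/except so code degrades to file storage when the KV extras are absent. The Postgres parameters are the placeholders from the module docstring, not real configuration.

    from pathlib import Path

    from ragbits.evaluate.stores import FileEvalReportStore

    try:
        from ragbits.core.storage.connections import PostgresConnection
        from ragbits.core.storage.kv_store import PostgresKVStore
        from ragbits.evaluate.stores import KVEvalReportStore

        conn = PostgresConnection(host="localhost", database="mydb")
        store = KVEvalReportStore(kv_store=PostgresKVStore(connection=conn, table_name="eval_results"))
    except ImportError:
        # ragbits-core storage extras not installed; fall back to files.
        store = FileEvalReportStore(Path("./eval_results"))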
ragbits/evaluate/stores/base.py
DELETED
@@ -1,98 +0,0 @@
"""Base class for evaluation report storage backends."""

from abc import ABC, abstractmethod
from typing import ClassVar

from ragbits.evaluate import stores as stores_module
from ragbits.evaluate.agent_simulation.results import ResponseChunk, SimulationResult
from ragbits.evaluate.api_types import ResultSummary, SimulationRunDetail, SimulationRunSummary


class EvalReportStore(ABC):
    """Abstract base class for evaluation report storage.

    Provides a pluggable interface for storing and retrieving evaluation results.
    Implementations can use file-based storage, SQLite, PostgreSQL, or other backends.
    """

    default_module: ClassVar = stores_module
    configuration_key: ClassVar = "eval_report_store"

    @abstractmethod
    async def save_result(
        self,
        run_id: str,
        scenario_run_id: str,
        scenario_name: str,
        result: SimulationResult,
        buffered_chunks: list[ResponseChunk] | None = None,
    ) -> str:
        """Save a simulation result.

        Args:
            run_id: Run identifier for grouping multiple scenarios.
            scenario_run_id: Unique identifier for this scenario run.
            scenario_name: Name of the scenario.
            result: The simulation result to save.
            buffered_chunks: Optional response chunks from the event buffer.

        Returns:
            Result ID for later retrieval.
        """

    @abstractmethod
    async def load_result(self, result_id: str) -> SimulationResult | None:
        """Load a simulation result by ID.

        Args:
            result_id: Result identifier.

        Returns:
            SimulationResult if found, None otherwise.
        """

    @abstractmethod
    async def delete_result(self, result_id: str) -> bool:
        """Delete a simulation result.

        Args:
            result_id: Result identifier.

        Returns:
            True if deleted, False if not found.
        """

    @abstractmethod
    async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
        """List evaluation results with pagination.

        Args:
            limit: Maximum number of results to return.
            offset: Number of results to skip.

        Returns:
            Tuple of (results list, total count).
        """

    @abstractmethod
    async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
        """List simulation runs (batch runs grouped by run_id).

        Args:
            limit: Maximum number of runs to return.
            offset: Number of runs to skip.

        Returns:
            Tuple of (runs list, total count).
        """

    @abstractmethod
    async def get_run(self, run_id: str) -> SimulationRunDetail | None:
        """Get full details for a simulation run.

        Args:
            run_id: Run identifier.

        Returns:
            SimulationRunDetail if found, None otherwise.
        """