ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,451 +0,0 @@
1
- """Execution manager for running and tracking evaluation simulations."""
2
-
3
- from __future__ import annotations
4
-
5
- import asyncio
6
- import logging
7
- import uuid
8
- from collections.abc import AsyncGenerator, Awaitable, Callable
9
- from datetime import datetime, timezone
10
- from pathlib import Path
11
- from typing import TYPE_CHECKING, Any
12
-
13
- from ragbits.evaluate.agent_simulation.results import ResponseChunk, SimulationResult, SimulationStatus
14
- from ragbits.evaluate.api_types import (
15
- CompletionUpdate,
16
- ErrorUpdate,
17
- ProgressUpdate,
18
- ResponseChunkUpdate,
19
- ResultSummary,
20
- SimulationRunDetail,
21
- SimulationRunSummary,
22
- StatusProgressUpdate,
23
- TaskCompleteUpdate,
24
- TurnProgressUpdate,
25
- )
26
-
27
- if TYPE_CHECKING:
28
- from ragbits.evaluate.stores.base import EvalReportStore
29
-
30
- logger = logging.getLogger(__name__)
31
-
32
- ProgressCallback = Callable[[ProgressUpdate], Awaitable[None]]
33
-
34
-
35
- class ExecutionManager:
36
- """Manages parallel simulation execution and result persistence."""
37
-
38
- def __init__(self, store: EvalReportStore | Path | None = None) -> None:
39
- """Initialize the execution manager.
40
-
41
- Args:
42
- store: Storage backend for evaluation results. Can be:
43
- - EvalReportStore instance for custom storage
44
- - Path for file-based storage (backward compatibility)
45
- - None to use default file-based storage in ./eval_results
46
- """
47
- from ragbits.evaluate.stores import FileEvalReportStore
48
-
49
- if store is None:
50
- self.store: EvalReportStore = FileEvalReportStore(Path("./eval_results"))
51
- elif isinstance(store, Path):
52
- self.store = FileEvalReportStore(store)
53
- else:
54
- self.store = store
55
-
56
- # Active runs: run_id -> dict with queue, tasks, start_time
57
- self._active_runs: dict[str, dict[str, Any]] = {}
58
-
59
- @staticmethod
60
- def generate_run_id() -> str:
61
- """Generate a unique run ID."""
62
- timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
63
- unique = uuid.uuid4().hex[:8]
64
- return f"run_{timestamp}_{unique}"
65
-
66
- @staticmethod
67
- def generate_scenario_run_id(scenario_name: str, persona: str | None = None) -> str:
68
- """Generate a unique scenario run ID.
69
-
70
- Args:
71
- scenario_name: Name of the scenario.
72
- persona: Optional persona name.
73
-
74
- Returns:
75
- Unique scenario run ID.
76
- """
77
- safe_scenario = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
78
- safe_persona = ""
79
- if persona:
80
- safe_persona = "_" + "".join(c if c.isalnum() or c in "-_" else "_" for c in persona)
81
- unique = uuid.uuid4().hex[:6]
82
- return f"sr_{safe_scenario}{safe_persona}_{unique}"
83
-
84
- def create_run(self, run_id: str, scenario_names: list[str]) -> asyncio.Queue[ProgressUpdate]:
85
- """Create a new run and return its progress queue.
86
-
87
- Args:
88
- run_id: Unique identifier for this run.
89
- scenario_names: List of scenario names being executed.
90
-
91
- Returns:
92
- Queue for receiving progress updates.
93
- """
94
- queue: asyncio.Queue[ProgressUpdate] = asyncio.Queue()
95
- self._active_runs[run_id] = {
96
- "queue": queue,
97
- "scenarios": scenario_names,
98
- "start_time": datetime.now(timezone.utc),
99
- "tasks": {},
100
- "completed": set(),
101
- # Scenario run registry: scenario_run_id -> {scenario_name, persona, ...}
102
- "scenario_runs": {},
103
- # Buffer all events per scenario_run_id for late subscribers
104
- "event_buffer": {},
105
- }
106
- return queue
107
-
108
- def register_scenario_run(
109
- self,
110
- run_id: str,
111
- scenario_name: str,
112
- persona: str | None = None,
113
- ) -> str:
114
- """Register a scenario run and get its unique ID.
115
-
116
- Args:
117
- run_id: Run identifier.
118
- scenario_name: Name of the scenario.
119
- persona: Optional persona name.
120
-
121
- Returns:
122
- Unique scenario run ID.
123
- """
124
- run = self._active_runs.get(run_id)
125
- if not run:
126
- raise ValueError(f"Run '{run_id}' not found")
127
-
128
- scenario_run_id = self.generate_scenario_run_id(scenario_name, persona)
129
- run["scenario_runs"][scenario_run_id] = {
130
- "scenario_name": scenario_name,
131
- "persona": persona,
132
- "start_time": datetime.now(timezone.utc),
133
- }
134
- run["event_buffer"][scenario_run_id] = []
135
- return scenario_run_id
136
-
137
- def get_scenario_run_buffer(self, run_id: str, scenario_run_id: str) -> list[ProgressUpdate]:
138
- """Get buffered events for a scenario run.
139
-
140
- Args:
141
- run_id: Run identifier.
142
- scenario_run_id: Scenario run identifier.
143
-
144
- Returns:
145
- List of buffered progress updates.
146
- """
147
- run = self._active_runs.get(run_id)
148
- if not run:
149
- return []
150
- return run["event_buffer"].get(scenario_run_id, [])
151
-
152
- def get_progress_queue(self, run_id: str) -> asyncio.Queue[ProgressUpdate] | None:
153
- """Get the progress queue for a run.
154
-
155
- Args:
156
- run_id: Run identifier.
157
-
158
- Returns:
159
- Progress queue if run exists, None otherwise.
160
- """
161
- run = self._active_runs.get(run_id)
162
- return run["queue"] if run else None
163
-
164
- def is_run_active(self, run_id: str) -> bool:
165
- """Check if a run is still active.
166
-
167
- Args:
168
- run_id: Run identifier.
169
-
170
- Returns:
171
- True if run is active.
172
- """
173
- return run_id in self._active_runs
174
-
175
- def mark_scenario_complete(self, run_id: str, scenario_name: str) -> bool:
176
- """Mark a scenario as complete and check if run is finished.
177
-
178
- Args:
179
- run_id: Run identifier.
180
- scenario_name: Name of completed scenario.
181
-
182
- Returns:
183
- True if all scenarios in the run are complete.
184
- """
185
- run = self._active_runs.get(run_id)
186
- if not run:
187
- return True
188
-
189
- run["completed"].add(scenario_name)
190
- return len(run["completed"]) >= len(run["scenarios"])
191
-
192
- def cleanup_run(self, run_id: str) -> None:
193
- """Clean up a completed run.
194
-
195
- Args:
196
- run_id: Run identifier to clean up.
197
- """
198
- if run_id in self._active_runs:
199
- del self._active_runs[run_id]
200
-
201
- async def emit_progress(self, run_id: str, update: ProgressUpdate) -> None:
202
- """Emit a progress update to the run's queue and buffer it.
203
-
204
- Args:
205
- run_id: Run identifier.
206
- update: Progress update to emit.
207
- """
208
- run = self._active_runs.get(run_id)
209
- if not run:
210
- return
211
-
212
- # Buffer the event for the scenario run
213
- scenario_run_id = update.scenario_run_id
214
- if scenario_run_id in run["event_buffer"]:
215
- run["event_buffer"][scenario_run_id].append(update)
216
-
217
- # Also emit to the queue for real-time streaming
218
- queue = run.get("queue")
219
- if queue:
220
- await queue.put(update)
221
-
222
- async def stream_progress(self, run_id: str) -> AsyncGenerator[ProgressUpdate, None]:
223
- """Stream progress updates for a run.
224
-
225
- Args:
226
- run_id: Run identifier.
227
-
228
- Yields:
229
- Progress updates as they occur.
230
- """
231
- queue = self.get_progress_queue(run_id)
232
- if not queue:
233
- return
234
-
235
- while True:
236
- try:
237
- # Use timeout to allow checking if run is still active
238
- update = await asyncio.wait_for(queue.get(), timeout=30.0)
239
- yield update
240
-
241
- # Check if this is a terminal update
242
- if isinstance(update, CompletionUpdate | ErrorUpdate) and self.mark_scenario_complete(
243
- run_id, update.scenario_name
244
- ):
245
- # All scenarios complete, clean up
246
- self.cleanup_run(run_id)
247
- return
248
- except TimeoutError:
249
- # Check if run is still active
250
- if not self.is_run_active(run_id):
251
- return
252
- continue
253
- except asyncio.CancelledError:
254
- return
255
-
256
- async def save_result(self, run_id: str, scenario_run_id: str, scenario_name: str, result: SimulationResult) -> str:
257
- """Save a simulation result.
258
-
259
- Args:
260
- run_id: Run identifier.
261
- scenario_run_id: Unique scenario run identifier.
262
- scenario_name: Name of the scenario.
263
- result: Simulation result to save.
264
-
265
- Returns:
266
- Result ID for later retrieval.
267
- """
268
- # Collect response chunks from the event buffer
269
- buffered_events = self.get_scenario_run_buffer(run_id, scenario_run_id)
270
- buffered_chunks = []
271
- for event in buffered_events:
272
- if event.type == "response_chunk":
273
- buffered_chunks.append(
274
- ResponseChunk(
275
- turn_index=event.turn_index,
276
- task_index=event.task_index,
277
- chunk_index=0, # Will be re-indexed by the store
278
- chunk_type=event.chunk_type,
279
- chunk_data=event.chunk_data,
280
- )
281
- )
282
-
283
- return await self.store.save_result(
284
- run_id=run_id,
285
- scenario_run_id=scenario_run_id,
286
- scenario_name=scenario_name,
287
- result=result,
288
- buffered_chunks=buffered_chunks if buffered_chunks else None,
289
- )
290
-
291
- async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
292
- """List evaluation results with pagination.
293
-
294
- Args:
295
- limit: Maximum number of results to return.
296
- offset: Number of results to skip.
297
-
298
- Returns:
299
- Tuple of (results list, total count).
300
- """
301
- return await self.store.list_results(limit=limit, offset=offset)
302
-
303
- async def load_result(self, result_id: str) -> SimulationResult | None:
304
- """Load a simulation result.
305
-
306
- Args:
307
- result_id: Result identifier.
308
-
309
- Returns:
310
- SimulationResult if found, None otherwise.
311
- """
312
- return await self.store.load_result(result_id)
313
-
314
- async def delete_result(self, result_id: str) -> bool:
315
- """Delete a simulation result.
316
-
317
- Args:
318
- result_id: Result identifier.
319
-
320
- Returns:
321
- True if deleted, False if not found.
322
- """
323
- return await self.store.delete_result(result_id)
324
-
325
- async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
326
- """List simulation runs (batch runs grouped by run_id).
327
-
328
- Args:
329
- limit: Maximum number of runs to return.
330
- offset: Number of runs to skip.
331
-
332
- Returns:
333
- Tuple of (runs list, total count).
334
- """
335
- return await self.store.list_runs(limit=limit, offset=offset)
336
-
337
- async def get_run(self, run_id: str) -> SimulationRunDetail | None:
338
- """Get full details for a simulation run.
339
-
340
- Args:
341
- run_id: Run identifier.
342
-
343
- Returns:
344
- SimulationRunDetail if found, None otherwise.
345
- """
346
- return await self.store.get_run(run_id)
347
-
348
-
349
- def create_progress_callback(
350
- run_id: str,
351
- scenario_run_id: str,
352
- scenario_name: str,
353
- execution_manager: ExecutionManager,
354
- persona: str | None = None,
355
- ) -> ProgressCallback:
356
- """Create a progress callback for use with run_simulation.
357
-
358
- Args:
359
- run_id: Run identifier.
360
- scenario_run_id: Unique scenario run identifier (includes persona if any).
361
- scenario_name: Name of the scenario being run.
362
- execution_manager: Execution manager instance.
363
- persona: Optional persona name for this scenario run.
364
-
365
- Returns:
366
- Async callback function for progress updates.
367
- """
368
-
369
- async def callback(
370
- event_type: str,
371
- **kwargs: Any,
372
- ) -> None:
373
- """Progress callback for simulation events."""
374
- update: ProgressUpdate
375
-
376
- if event_type == "status":
377
- update = StatusProgressUpdate(
378
- run_id=run_id,
379
- scenario_run_id=scenario_run_id,
380
- scenario_name=scenario_name,
381
- persona=persona,
382
- status=kwargs.get("status", SimulationStatus.RUNNING),
383
- current_turn=kwargs.get("current_turn"),
384
- current_task_index=kwargs.get("current_task_index"),
385
- current_task=kwargs.get("current_task"),
386
- )
387
- elif event_type == "turn":
388
- update = TurnProgressUpdate(
389
- run_id=run_id,
390
- scenario_run_id=scenario_run_id,
391
- scenario_name=scenario_name,
392
- persona=persona,
393
- turn_index=kwargs.get("turn_index", 0),
394
- task_index=kwargs.get("task_index", 0),
395
- user_message=kwargs.get("user_message", ""),
396
- assistant_message=kwargs.get("assistant_message", ""),
397
- tool_calls=kwargs.get("tool_calls", []),
398
- task_completed=kwargs.get("task_completed", False),
399
- task_completed_reason=kwargs.get("task_completed_reason", ""),
400
- checkers=kwargs.get("checkers", []),
401
- checker_mode=kwargs.get("checker_mode", "all"),
402
- )
403
- elif event_type == "task_complete":
404
- update = TaskCompleteUpdate(
405
- run_id=run_id,
406
- scenario_run_id=scenario_run_id,
407
- scenario_name=scenario_name,
408
- persona=persona,
409
- task_index=kwargs.get("task_index", 0),
410
- task_description=kwargs.get("task_description", ""),
411
- turns_taken=kwargs.get("turns_taken", 0),
412
- reason=kwargs.get("reason", ""),
413
- )
414
- elif event_type == "complete":
415
- update = CompletionUpdate(
416
- run_id=run_id,
417
- scenario_run_id=scenario_run_id,
418
- scenario_name=scenario_name,
419
- persona=persona,
420
- result_id=kwargs.get("result_id", ""),
421
- status=kwargs.get("status", SimulationStatus.COMPLETED),
422
- success_rate=kwargs.get("success_rate", 0.0),
423
- total_turns=kwargs.get("total_turns", 0),
424
- total_tasks=kwargs.get("total_tasks", 0),
425
- tasks_completed=kwargs.get("tasks_completed", 0),
426
- )
427
- elif event_type == "error":
428
- update = ErrorUpdate(
429
- run_id=run_id,
430
- scenario_run_id=scenario_run_id,
431
- scenario_name=scenario_name,
432
- persona=persona,
433
- error=kwargs.get("error", "Unknown error"),
434
- )
435
- elif event_type == "response_chunk":
436
- update = ResponseChunkUpdate(
437
- run_id=run_id,
438
- scenario_run_id=scenario_run_id,
439
- scenario_name=scenario_name,
440
- persona=persona,
441
- turn_index=kwargs.get("turn_index", 0),
442
- task_index=kwargs.get("task_index", 0),
443
- chunk_type=kwargs.get("chunk_type", "unknown"),
444
- chunk_data=kwargs.get("chunk_data", {}),
445
- )
446
- else:
447
- return
448
-
449
- await execution_manager.emit_progress(run_id, update)
450
-
451
- return callback
@@ -1,36 +0,0 @@
1
- """Evaluation report storage backends.
2
-
3
- This module provides pluggable storage backends for evaluation reports.
4
- The default is FileEvalReportStore which maintains backward compatibility
5
- with the existing file-based approach.
6
-
7
- Example usage:
8
- from ragbits.evaluate.stores import FileEvalReportStore, KVEvalReportStore
9
-
10
- # File-based storage (default)
11
- store = FileEvalReportStore(results_dir="./eval_results")
12
-
13
- # KV-based storage using PostgreSQL (recommended for production)
14
- from ragbits.core.storage.connections import PostgresConnection
15
- from ragbits.core.storage.kv_store import PostgresKVStore
16
-
17
- conn = PostgresConnection(host="localhost", database="mydb")
18
- kv = PostgresKVStore(connection=conn, table_name="eval_results")
19
- store = KVEvalReportStore(kv_store=kv)
20
- """
21
-
22
- from ragbits.evaluate.stores.base import EvalReportStore
23
- from ragbits.evaluate.stores.file import FileEvalReportStore
24
-
25
- __all__ = [
26
- "EvalReportStore",
27
- "FileEvalReportStore",
28
- ]
29
-
30
- # KV store is optional (requires ragbits-core with storage extras)
31
- try:
32
- from ragbits.evaluate.stores.kv import KVEvalReportStore # noqa: F401
33
-
34
- __all__.append("KVEvalReportStore")
35
- except ImportError:
36
- pass
@@ -1,98 +0,0 @@
1
- """Base class for evaluation report storage backends."""
2
-
3
- from abc import ABC, abstractmethod
4
- from typing import ClassVar
5
-
6
- from ragbits.evaluate import stores as stores_module
7
- from ragbits.evaluate.agent_simulation.results import ResponseChunk, SimulationResult
8
- from ragbits.evaluate.api_types import ResultSummary, SimulationRunDetail, SimulationRunSummary
9
-
10
-
11
- class EvalReportStore(ABC):
12
- """Abstract base class for evaluation report storage.
13
-
14
- Provides a pluggable interface for storing and retrieving evaluation results.
15
- Implementations can use file-based storage, SQLite, PostgreSQL, or other backends.
16
- """
17
-
18
- default_module: ClassVar = stores_module
19
- configuration_key: ClassVar = "eval_report_store"
20
-
21
- @abstractmethod
22
- async def save_result(
23
- self,
24
- run_id: str,
25
- scenario_run_id: str,
26
- scenario_name: str,
27
- result: SimulationResult,
28
- buffered_chunks: list[ResponseChunk] | None = None,
29
- ) -> str:
30
- """Save a simulation result.
31
-
32
- Args:
33
- run_id: Run identifier for grouping multiple scenarios.
34
- scenario_run_id: Unique identifier for this scenario run.
35
- scenario_name: Name of the scenario.
36
- result: The simulation result to save.
37
- buffered_chunks: Optional response chunks from the event buffer.
38
-
39
- Returns:
40
- Result ID for later retrieval.
41
- """
42
-
43
- @abstractmethod
44
- async def load_result(self, result_id: str) -> SimulationResult | None:
45
- """Load a simulation result by ID.
46
-
47
- Args:
48
- result_id: Result identifier.
49
-
50
- Returns:
51
- SimulationResult if found, None otherwise.
52
- """
53
-
54
- @abstractmethod
55
- async def delete_result(self, result_id: str) -> bool:
56
- """Delete a simulation result.
57
-
58
- Args:
59
- result_id: Result identifier.
60
-
61
- Returns:
62
- True if deleted, False if not found.
63
- """
64
-
65
- @abstractmethod
66
- async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
67
- """List evaluation results with pagination.
68
-
69
- Args:
70
- limit: Maximum number of results to return.
71
- offset: Number of results to skip.
72
-
73
- Returns:
74
- Tuple of (results list, total count).
75
- """
76
-
77
- @abstractmethod
78
- async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
79
- """List simulation runs (batch runs grouped by run_id).
80
-
81
- Args:
82
- limit: Maximum number of runs to return.
83
- offset: Number of runs to skip.
84
-
85
- Returns:
86
- Tuple of (runs list, total count).
87
- """
88
-
89
- @abstractmethod
90
- async def get_run(self, run_id: str) -> SimulationRunDetail | None:
91
- """Get full details for a simulation run.
92
-
93
- Args:
94
- run_id: Run identifier.
95
-
96
- Returns:
97
- SimulationRunDetail if found, None otherwise.
98
- """