ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,535 +0,0 @@
1
- """Key-value based evaluation report storage using core KVStore."""
2
-
3
- from __future__ import annotations
4
-
5
- import logging
6
- from datetime import datetime, timezone
7
- from typing import TYPE_CHECKING, Any
8
-
9
- from ragbits.evaluate.agent_simulation.results import (
10
- ConversationMetrics,
11
- ResponseChunk,
12
- SimulationResult,
13
- SimulationStatus,
14
- TaskResult,
15
- TurnResult,
16
- )
17
- from ragbits.evaluate.api_types import (
18
- ResultSummary,
19
- SimulationRunDetail,
20
- SimulationRunSummary,
21
- )
22
- from ragbits.evaluate.stores.base import EvalReportStore
23
-
24
- if TYPE_CHECKING:
25
- from ragbits.core.storage.kv_store.base import KVStore
26
-
27
- logger = logging.getLogger(__name__)
28
-
29
-
30
- class KVEvalReportStore(EvalReportStore):
31
- """Key-value based storage for evaluation reports.
32
-
33
- Uses ragbits.core.storage.kv_store for simple JSON storage.
34
- Stores results, runs, and indexes as JSON documents.
35
-
36
- Example:
37
- ```python
38
- from ragbits.core.storage.connections import PostgresConnection
39
- from ragbits.core.storage.kv_store import PostgresKVStore
40
- from ragbits.evaluate.stores import KVEvalReportStore
41
-
42
- conn = PostgresConnection(host="localhost", database="mydb")
43
- kv = PostgresKVStore(connection=conn, table_name="eval_store")
44
- store = KVEvalReportStore(kv_store=kv)
45
- ```
46
- """
47
-
48
- # Key prefixes for different data types
49
- _RESULT_PREFIX = "result:"
50
- _RUN_PREFIX = "run:"
51
- _INDEX_KEY = "index:results"
52
- _RUNS_INDEX_KEY = "index:runs"
53
-
54
- def __init__(self, kv_store: KVStore[dict[str, Any]]) -> None:
55
- """Initialize the KV store.
56
-
57
- Args:
58
- kv_store: KVStore instance from ragbits.core.storage.
59
- """
60
- self._kv = kv_store
61
-
62
async def save_result(
    self,
    run_id: str,
    scenario_run_id: str,
    scenario_name: str,
    result: SimulationResult,
    buffered_chunks: list[ResponseChunk] | None = None,
) -> str:
    """Persist one simulation result and register it in the indexes.

    The serialized result is written under the result key prefix, a summary
    is added to the results index, and the run record for ``run_id`` is
    created or updated.

    Args:
        run_id: Identifier of the run this result belongs to.
        scenario_run_id: Identifier stored alongside the result.
        scenario_name: Scenario name; a sanitized copy is embedded in the id.
        result: Simulation result to serialize.
        buffered_chunks: Optional extra chunks. Chunks whose ``chunk_type``
            is ``"text"`` are skipped; the rest are appended after
            ``result.response_chunks`` with consecutive chunk indexes.

    Returns:
        The generated result id.
    """
    # Function-scope import: uuid is standard library and only needed here.
    from uuid import uuid4

    now = datetime.now(timezone.utc)
    timestamp = now.strftime("%Y%m%d_%H%M%S")
    safe_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
    # The timestamp has one-second resolution, so two saves of the same
    # scenario within a second would share an id and overwrite each other.
    # A short random suffix keeps ids unique while preserving the
    # "result_<timestamp>_<name>" prefix.
    result_id = f"result_{timestamp}_{safe_name}_{uuid4().hex[:8]}"

    # Append non-text buffered chunks, continuing the existing chunk numbering.
    all_chunks = list(result.response_chunks)
    extra_chunks = (c for c in buffered_chunks or () if c.chunk_type != "text")
    for chunk_index, chunk in enumerate(extra_chunks, start=len(all_chunks)):
        all_chunks.append(
            ResponseChunk(
                turn_index=chunk.turn_index,
                task_index=chunk.task_index,
                chunk_index=chunk_index,
                chunk_type=chunk.chunk_type,
                chunk_data=chunk.chunk_data,
            )
        )

    # Serialize result to dict
    result_data = {
        "result_id": result_id,
        "run_id": run_id,
        "scenario_run_id": scenario_run_id,
        "scenario_name": scenario_name,
        "persona": result.persona,
        "status": result.status.value,
        "start_time": result.start_time.isoformat(),
        "end_time": result.end_time.isoformat() if result.end_time else None,
        "agent_model": result.agent_model,
        "simulated_user_model": result.simulated_user_model,
        "checker_model": result.checker_model,
        "conversation_id": result.conversation_id,
        "final_state": result.final_state,
        "metrics": result.metrics.metrics if result.metrics else None,
        "traces": result.traces,
        "error": result.error,
        "turns": [self._serialize_turn(t) for t in result.turns],
        "tasks": [self._serialize_task(t) for t in result.tasks],
        "response_chunks": [self._serialize_chunk(c) for c in all_chunks],
        "created_at": now.isoformat(),
    }

    # Store the document first, then the index entry, then the run record.
    await self._kv.set(f"{self._RESULT_PREFIX}{result_id}", result_data)
    await self._add_to_index(self._INDEX_KEY, result_id, result_data)
    await self._update_run(run_id, result_id, result_data)

    logger.info("Saved result %s to KV store", result_id)
    return result_id
129
-
130
- def _serialize_turn(self, turn: TurnResult) -> dict[str, Any]: # noqa: PLR6301
131
- """Serialize a turn result."""
132
- token_usage = None
133
- if turn.token_usage:
134
- if hasattr(turn.token_usage, "model_dump"):
135
- token_usage = turn.token_usage.model_dump()
136
- elif isinstance(turn.token_usage, dict):
137
- token_usage = turn.token_usage
138
-
139
- return {
140
- "turn_index": turn.turn_index,
141
- "task_index": turn.task_index,
142
- "user_message": turn.user_message,
143
- "assistant_message": turn.assistant_message,
144
- "tool_calls": turn.tool_calls,
145
- "task_completed": turn.task_completed,
146
- "task_completed_reason": turn.task_completed_reason,
147
- "token_usage": token_usage,
148
- "latency_ms": turn.latency_ms,
149
- "checkers": [c.to_dict() for c in turn.checkers] if turn.checkers else [],
150
- "checker_mode": turn.checker_mode,
151
- }
152
-
153
- def _serialize_task(self, task: TaskResult) -> dict[str, Any]: # noqa: PLR6301
154
- """Serialize a task result."""
155
- return {
156
- "task_index": task.task_index,
157
- "description": task.description,
158
- "completed": task.completed,
159
- "turns_taken": task.turns_taken,
160
- "final_reason": task.final_reason,
161
- "checkers": task.checkers,
162
- "checker_mode": task.checker_mode,
163
- }
164
-
165
- def _serialize_chunk(self, chunk: ResponseChunk) -> dict[str, Any]: # noqa: PLR6301
166
- """Serialize a response chunk."""
167
- return {
168
- "turn_index": chunk.turn_index,
169
- "task_index": chunk.task_index,
170
- "chunk_index": chunk.chunk_index,
171
- "chunk_type": chunk.chunk_type,
172
- "chunk_data": chunk.chunk_data,
173
- }
174
-
175
- async def _add_to_index(self, index_key: str, item_id: str, data: dict[str, Any]) -> None:
176
- """Add item to an index."""
177
- index = await self._kv.get(index_key) or {"items": []}
178
-
179
- # Add summary to index
180
- summary = {
181
- "id": item_id,
182
- "scenario_name": data.get("scenario_name"),
183
- "status": data.get("status"),
184
- "start_time": data.get("start_time"),
185
- "created_at": data.get("created_at"),
186
- "metrics": data.get("metrics"),
187
- }
188
- index["items"].insert(0, summary) # Most recent first
189
-
190
- await self._kv.set(index_key, index)
191
-
192
- async def _update_run(self, run_id: str, result_id: str, result_data: dict[str, Any]) -> None:
193
- """Update or create a run record."""
194
- run_key = f"{self._RUN_PREFIX}{run_id}"
195
- run = await self._kv.get(run_key)
196
-
197
- if run is None:
198
- run = {
199
- "id": run_id,
200
- "timestamp": datetime.now(timezone.utc).isoformat(),
201
- "status": "running",
202
- "results": [],
203
- "total_scenarios": 0,
204
- "completed_scenarios": 0,
205
- "failed_scenarios": 0,
206
- "total_tokens": 0,
207
- "total_cost_usd": 0.0,
208
- }
209
- # Add to runs index
210
- runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
211
- runs_index["items"].insert(0, {"id": run_id, "timestamp": run["timestamp"]})
212
- await self._kv.set(self._RUNS_INDEX_KEY, runs_index)
213
-
214
- # Add result to run
215
- run["results"].append(
216
- {
217
- "result_id": result_id,
218
- "scenario_name": result_data.get("scenario_name"),
219
- "persona": result_data.get("persona"),
220
- "status": result_data.get("status"),
221
- "metrics": result_data.get("metrics"),
222
- }
223
- )
224
-
225
- # Update stats
226
- run["total_scenarios"] = len(run["results"])
227
- run["completed_scenarios"] = sum(1 for r in run["results"] if r["status"] == "completed")
228
- run["failed_scenarios"] = sum(1 for r in run["results"] if r["status"] in ("failed", "timeout"))
229
-
230
- metrics = result_data.get("metrics") or {}
231
- run["total_tokens"] += metrics.get("total_tokens", 0)
232
- run["total_cost_usd"] += metrics.get("total_cost_usd", 0.0)
233
-
234
- # Update run status
235
- if run["completed_scenarios"] == run["total_scenarios"]:
236
- run["status"] = "completed"
237
- elif run["failed_scenarios"] > 0:
238
- run["status"] = "failed"
239
-
240
- await self._kv.set(run_key, run)
241
-
242
async def load_result(self, result_id: str) -> SimulationResult | None:
    """Fetch and deserialize the stored result ``result_id``.

    Returns:
        The deserialized result, or ``None`` when nothing (or a falsy value)
        is stored for that id.
    """
    stored = await self._kv.get(f"{self._RESULT_PREFIX}{result_id}")
    return self._deserialize_result(stored) if stored else None
249
-
250
def _deserialize_result(self, data: dict[str, Any]) -> SimulationResult:  # noqa: PLR6301
    """Rebuild a ``SimulationResult`` from its stored dictionary form.

    Turns, tasks and response chunks are reconstructed from the ``"turns"``,
    ``"tasks"`` and ``"response_chunks"`` lists; optional keys fall back to
    the defaults spelled out below, while index fields, ``scenario_name``,
    ``start_time`` and ``status`` are required.
    """
    from ragbits.evaluate.agent_simulation.results import CheckerResultItem

    def build_turn(raw: dict[str, Any]) -> TurnResult:
        # Checkers are rebuilt first, before any other field is read.
        turn_checkers = [CheckerResultItem.from_dict(item) for item in raw.get("checkers", [])]
        return TurnResult(
            turn_index=raw["turn_index"],
            task_index=raw["task_index"],
            user_message=raw.get("user_message", ""),
            assistant_message=raw.get("assistant_message", ""),
            tool_calls=raw.get("tool_calls", []),
            task_completed=raw.get("task_completed", False),
            task_completed_reason=raw.get("task_completed_reason", ""),
            token_usage=raw.get("token_usage"),
            latency_ms=raw.get("latency_ms"),
            checkers=turn_checkers,
            checker_mode=raw.get("checker_mode", "all"),
        )

    def build_task(raw: dict[str, Any]) -> TaskResult:
        return TaskResult(
            task_index=raw["task_index"],
            description=raw.get("description", ""),
            completed=raw.get("completed", False),
            turns_taken=raw.get("turns_taken", 0),
            final_reason=raw.get("final_reason", ""),
            checkers=raw.get("checkers", []),
            checker_mode=raw.get("checker_mode", "all"),
        )

    def build_chunk(raw: dict[str, Any]) -> ResponseChunk:
        return ResponseChunk(
            turn_index=raw["turn_index"],
            task_index=raw["task_index"],
            chunk_index=raw["chunk_index"],
            chunk_type=raw["chunk_type"],
            chunk_data=raw.get("chunk_data", {}),
        )

    turns = [build_turn(raw) for raw in data.get("turns", [])]
    tasks = [build_task(raw) for raw in data.get("tasks", [])]
    chunks = [build_chunk(raw) for raw in data.get("response_chunks", [])]

    return SimulationResult(
        scenario_name=data["scenario_name"],
        start_time=datetime.fromisoformat(data["start_time"]),
        end_time=datetime.fromisoformat(data["end_time"]) if data.get("end_time") else None,
        status=SimulationStatus(data["status"]),
        agent_model=data.get("agent_model"),
        simulated_user_model=data.get("simulated_user_model"),
        checker_model=data.get("checker_model"),
        persona=data.get("persona"),
        error=data.get("error"),
        conversation_id=data.get("conversation_id"),
        final_state=data.get("final_state", {}),
        turns=turns,
        tasks=tasks,
        metrics=ConversationMetrics(metrics=data["metrics"]) if data.get("metrics") else None,
        response_chunks=chunks,
        traces=data.get("traces", []),
    )
315
-
316
async def delete_result(self, result_id: str) -> bool:
    """Delete a stored result and keep the indexes and its run consistent.

    The result document is removed, its entry is dropped from the results
    index, and it is removed from the owning run. A run left without
    results is deleted and dropped from the runs index; otherwise the run's
    counters, totals and status are recomputed from the remaining entries so
    they no longer include the deleted result.

    Args:
        result_id: Identifier of the result to delete.

    Returns:
        ``True`` if the result existed and was deleted, ``False`` otherwise.
    """
    result_key = f"{self._RESULT_PREFIX}{result_id}"
    data = await self._kv.get(result_key)
    if not data:
        return False

    await self._kv.delete(result_key)

    # Drop the summary entry from the results index.
    index = await self._kv.get(self._INDEX_KEY) or {"items": []}
    index["items"] = [i for i in index["items"] if i["id"] != result_id]
    await self._kv.set(self._INDEX_KEY, index)

    run_id = data.get("run_id")
    if not run_id:
        return True

    run_key = f"{self._RUN_PREFIX}{run_id}"
    run = await self._kv.get(run_key)
    if not run:
        return True

    remaining = [r for r in run["results"] if r["result_id"] != result_id]
    run["results"] = remaining

    if not remaining:
        # Last result gone: remove the run itself and its index entry.
        await self._kv.delete(run_key)
        runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
        runs_index["items"] = [i for i in runs_index["items"] if i["id"] != run_id]
        await self._kv.set(self._RUNS_INDEX_KEY, runs_index)
        return True

    # Recompute the aggregates with the same rules used when results are
    # added, so the stored statistics match the entries that are left.
    run["total_scenarios"] = len(remaining)
    run["completed_scenarios"] = sum(1 for r in remaining if r.get("status") == "completed")
    run["failed_scenarios"] = sum(1 for r in remaining if r.get("status") in ("failed", "timeout"))
    run["total_tokens"] = sum((r.get("metrics") or {}).get("total_tokens", 0) or 0 for r in remaining)
    run["total_cost_usd"] = sum(
        ((r.get("metrics") or {}).get("total_cost_usd", 0.0) or 0.0 for r in remaining),
        0.0,
    )
    if run["completed_scenarios"] == run["total_scenarios"]:
        run["status"] = "completed"
    elif run["failed_scenarios"] > 0:
        run["status"] = "failed"

    await self._kv.set(run_key, run)
    return True
346
-
347
async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
    """Return one page of result summaries plus the total number of results.

    Summaries are built from the results index only; metric fields missing
    from an entry default to zero, and an entry without a start time gets
    the current UTC time as its timestamp.
    """
    index = await self._kv.get(self._INDEX_KEY) or {"items": []}
    entries = index["items"]

    def to_summary(entry: dict[str, Any]) -> ResultSummary:
        entry_metrics = entry.get("metrics") or {}
        return ResultSummary(
            result_id=entry["id"],
            scenario_name=entry.get("scenario_name", ""),
            timestamp=datetime.fromisoformat(entry["start_time"])
            if entry.get("start_time")
            else datetime.now(timezone.utc),
            status=SimulationStatus(entry.get("status", "unknown")),
            tasks_completed=entry_metrics.get("tasks_completed", 0),
            total_tasks=entry_metrics.get("total_tasks", 0),
            success_rate=entry_metrics.get("success_rate", 0.0),
            total_turns=entry_metrics.get("total_turns", 0),
            total_tokens=entry_metrics.get("total_tokens", 0),
            total_cost_usd=entry_metrics.get("total_cost_usd", 0.0),
        )

    page = entries[offset : offset + limit]
    return [to_summary(entry) for entry in page], len(entries)
374
-
375
async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
    """Return one page of run summaries plus the total number of indexed runs.

    Index entries whose run record cannot be loaded are skipped, so the page
    may hold fewer summaries than index entries.
    """
    runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
    entries = runs_index["items"]
    page = entries[offset : offset + limit]

    summaries = []
    for entry in page:
        record = await self._kv.get(f"{self._RUN_PREFIX}{entry['id']}")
        if not record:
            continue
        summaries.append(self._run_to_summary(record))

    return summaries, len(entries)
388
-
389
def _run_to_summary(self, run: dict[str, Any]) -> SimulationRunSummary:  # noqa: PLR6301
    """Convert a stored run record into a ``SimulationRunSummary``.

    Args:
        run: Run record as stored under the run key; ``"id"`` and
            ``"timestamp"`` are required, other keys fall back to defaults.

    Returns:
        Summary with one ``ScenarioRunSummary`` per entry in
        ``run["results"]``. ``overall_success_rate`` is the mean success
        rate over the entries that carry metrics (``0.0`` if none do).
    """
    from ragbits.evaluate.api_types import ScenarioRunSummary

    scenario_runs = []
    success_rates = []
    for r in run.get("results", []):
        metrics = r.get("metrics") or {}
        if metrics:
            # Scenarios with a 0.0 success rate must count towards the
            # average; only entries without metrics are left out, which
            # matches how get_run computes the same figure.
            success_rates.append(metrics.get("success_rate", 0.0))
        scenario_runs.append(
            ScenarioRunSummary(
                id=r["result_id"],
                scenario_name=r.get("scenario_name", ""),
                persona=r.get("persona"),
                status=SimulationStatus(r.get("status", "unknown")),
                start_time=datetime.now(timezone.utc),  # Not stored in summary
                end_time=None,
                total_turns=metrics.get("total_turns", 0),
                total_tasks=metrics.get("total_tasks", 0),
                tasks_completed=metrics.get("tasks_completed", 0),
                success_rate=metrics.get("success_rate", 0.0),
                total_tokens=metrics.get("total_tokens", 0),
                total_cost_usd=metrics.get("total_cost_usd", 0.0),
                error=None,
            )
        )

    overall_success_rate = sum(success_rates) / len(success_rates) if success_rates else 0.0

    return SimulationRunSummary(
        id=run["id"],
        timestamp=datetime.fromisoformat(run["timestamp"]),
        status=SimulationStatus(run.get("status", "unknown")),
        scenario_runs=scenario_runs,
        total_scenarios=run.get("total_scenarios", 0),
        completed_scenarios=run.get("completed_scenarios", 0),
        failed_scenarios=run.get("failed_scenarios", 0),
        total_tokens=run.get("total_tokens", 0),
        total_cost_usd=run.get("total_cost_usd", 0.0),
        overall_success_rate=overall_success_rate,
    )
429
-
430
async def get_run(self, run_id: str) -> SimulationRunDetail | None:
    """Load a run together with the full details of each stored result.

    Results listed in the run whose document can no longer be loaded are
    skipped. The overall success rate is the mean ``success_rate`` over the
    loaded scenarios that have metrics, or ``0.0`` when there are none.

    Returns:
        The run details, or ``None`` when the run record is missing or falsy.
    """
    run = await self._kv.get(f"{self._RUN_PREFIX}{run_id}")
    if not run:
        return None

    details = []
    for entry in run.get("results", []):
        document = await self._kv.get(f"{self._RESULT_PREFIX}{entry['result_id']}")
        if not document:
            continue
        details.append(self._result_to_scenario_detail(document))

    rates = [detail.metrics.get("success_rate", 0.0) for detail in details if detail.metrics]
    if rates:
        average_rate = sum(rates) / len(rates)
    else:
        average_rate = 0.0

    return SimulationRunDetail(
        id=run["id"],
        timestamp=datetime.fromisoformat(run["timestamp"]),
        status=SimulationStatus(run.get("status", "unknown")),
        scenario_runs=details,
        total_scenarios=run.get("total_scenarios", 0),
        completed_scenarios=run.get("completed_scenarios", 0),
        failed_scenarios=run.get("failed_scenarios", 0),
        total_tokens=run.get("total_tokens", 0),
        total_cost_usd=run.get("total_cost_usd", 0.0),
        overall_success_rate=average_rate,
    )
458
-
459
def _result_to_scenario_detail(self, data: dict[str, Any]) -> Any:  # noqa: PLR6301, ANN401
    """Build a ``ScenarioRunDetail`` response object from a stored result.

    Turn token usage, when present and truthy, is reduced to the three
    counters ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``
    (each defaulting to 0). The detail id is ``scenario_run_id`` when that
    key is stored and ``result_id`` otherwise.
    """
    from ragbits.evaluate.api_types import (
        CheckerResultItemResponse,
        ResponseChunkResponse,
        ScenarioRunDetail,
        TaskResultResponse,
        TurnResultResponse,
    )

    usage_keys = ("prompt_tokens", "completion_tokens", "total_tokens")

    turn_responses = []
    for raw_turn in data.get("turns", []):
        usage = raw_turn.get("token_usage")
        if usage:
            usage = {key: usage.get(key, 0) for key in usage_keys}

        turn_responses.append(
            TurnResultResponse(
                turn_index=raw_turn["turn_index"],
                task_index=raw_turn["task_index"],
                user_message=raw_turn.get("user_message", ""),
                assistant_message=raw_turn.get("assistant_message", ""),
                tool_calls=raw_turn.get("tool_calls", []),
                task_completed=raw_turn.get("task_completed", False),
                task_completed_reason=raw_turn.get("task_completed_reason", ""),
                token_usage=usage,
                latency_ms=raw_turn.get("latency_ms"),
                checkers=[
                    CheckerResultItemResponse(
                        type=raw_checker.get("type", "unknown"),
                        completed=raw_checker.get("completed", False),
                        reason=raw_checker.get("reason", ""),
                    )
                    for raw_checker in raw_turn.get("checkers", [])
                ],
                checker_mode=raw_turn.get("checker_mode", "all"),
            )
        )

    task_responses = []
    for raw_task in data.get("tasks", []):
        task_responses.append(
            TaskResultResponse(
                task_index=raw_task["task_index"],
                description=raw_task.get("description", ""),
                completed=raw_task.get("completed", False),
                turns_taken=raw_task.get("turns_taken", 0),
                final_reason=raw_task.get("final_reason", ""),
            )
        )

    chunk_responses = []
    for raw_chunk in data.get("response_chunks", []):
        chunk_responses.append(
            ResponseChunkResponse(
                turn_index=raw_chunk["turn_index"],
                task_index=raw_chunk["task_index"],
                chunk_index=raw_chunk["chunk_index"],
                chunk_type=raw_chunk["chunk_type"],
                chunk_data=raw_chunk.get("chunk_data", {}),
            )
        )

    return ScenarioRunDetail(
        id=data.get("scenario_run_id", data["result_id"]),
        scenario_name=data["scenario_name"],
        persona=data.get("persona"),
        status=SimulationStatus(data["status"]),
        start_time=datetime.fromisoformat(data["start_time"]),
        end_time=datetime.fromisoformat(data["end_time"]) if data.get("end_time") else None,
        turns=turn_responses,
        tasks=task_responses,
        response_chunks=chunk_responses,
        metrics=data.get("metrics"),
        error=data.get("error"),
    )