ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
ragbits/evaluate/stores/kv.py
DELETED
|
@@ -1,535 +0,0 @@
|
|
|
1
|
-
"""Key-value based evaluation report storage using core KVStore."""
|
|
2
|
-
|
|
3
|
-
from __future__ import annotations
|
|
4
|
-
|
|
5
|
-
import logging
|
|
6
|
-
from datetime import datetime, timezone
|
|
7
|
-
from typing import TYPE_CHECKING, Any
|
|
8
|
-
|
|
9
|
-
from ragbits.evaluate.agent_simulation.results import (
|
|
10
|
-
ConversationMetrics,
|
|
11
|
-
ResponseChunk,
|
|
12
|
-
SimulationResult,
|
|
13
|
-
SimulationStatus,
|
|
14
|
-
TaskResult,
|
|
15
|
-
TurnResult,
|
|
16
|
-
)
|
|
17
|
-
from ragbits.evaluate.api_types import (
|
|
18
|
-
ResultSummary,
|
|
19
|
-
SimulationRunDetail,
|
|
20
|
-
SimulationRunSummary,
|
|
21
|
-
)
|
|
22
|
-
from ragbits.evaluate.stores.base import EvalReportStore
|
|
23
|
-
|
|
24
|
-
if TYPE_CHECKING:
|
|
25
|
-
from ragbits.core.storage.kv_store.base import KVStore
|
|
26
|
-
|
|
27
|
-
logger = logging.getLogger(__name__)
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
class KVEvalReportStore(EvalReportStore):
|
|
31
|
-
"""Key-value based storage for evaluation reports.
|
|
32
|
-
|
|
33
|
-
Uses ragbits.core.storage.kv_store for simple JSON storage.
|
|
34
|
-
Stores results, runs, and indexes as JSON documents.
|
|
35
|
-
|
|
36
|
-
Example:
|
|
37
|
-
```python
|
|
38
|
-
from ragbits.core.storage.connections import PostgresConnection
|
|
39
|
-
from ragbits.core.storage.kv_store import PostgresKVStore
|
|
40
|
-
from ragbits.evaluate.stores import KVEvalReportStore
|
|
41
|
-
|
|
42
|
-
conn = PostgresConnection(host="localhost", database="mydb")
|
|
43
|
-
kv = PostgresKVStore(connection=conn, table_name="eval_store")
|
|
44
|
-
store = KVEvalReportStore(kv_store=kv)
|
|
45
|
-
```
|
|
46
|
-
"""
|
|
47
|
-
|
|
48
|
-
# Key prefixes for different data types
|
|
49
|
-
_RESULT_PREFIX = "result:"
|
|
50
|
-
_RUN_PREFIX = "run:"
|
|
51
|
-
_INDEX_KEY = "index:results"
|
|
52
|
-
_RUNS_INDEX_KEY = "index:runs"
|
|
53
|
-
|
|
54
|
-
def __init__(self, kv_store: KVStore[dict[str, Any]]) -> None:
|
|
55
|
-
"""Initialize the KV store.
|
|
56
|
-
|
|
57
|
-
Args:
|
|
58
|
-
kv_store: KVStore instance from ragbits.core.storage.
|
|
59
|
-
"""
|
|
60
|
-
self._kv = kv_store
|
|
61
|
-
|
|
62
|
-
async def save_result(
|
|
63
|
-
self,
|
|
64
|
-
run_id: str,
|
|
65
|
-
scenario_run_id: str,
|
|
66
|
-
scenario_name: str,
|
|
67
|
-
result: SimulationResult,
|
|
68
|
-
buffered_chunks: list[ResponseChunk] | None = None,
|
|
69
|
-
) -> str:
|
|
70
|
-
"""Save a simulation result."""
|
|
71
|
-
# Generate result_id
|
|
72
|
-
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
|
|
73
|
-
safe_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
|
|
74
|
-
result_id = f"result_{timestamp}_{safe_name}"
|
|
75
|
-
|
|
76
|
-
# Add buffered chunks
|
|
77
|
-
all_chunks = list(result.response_chunks)
|
|
78
|
-
if buffered_chunks:
|
|
79
|
-
chunk_index = len(all_chunks)
|
|
80
|
-
for chunk in buffered_chunks:
|
|
81
|
-
if chunk.chunk_type == "text":
|
|
82
|
-
continue
|
|
83
|
-
all_chunks.append(
|
|
84
|
-
ResponseChunk(
|
|
85
|
-
turn_index=chunk.turn_index,
|
|
86
|
-
task_index=chunk.task_index,
|
|
87
|
-
chunk_index=chunk_index,
|
|
88
|
-
chunk_type=chunk.chunk_type,
|
|
89
|
-
chunk_data=chunk.chunk_data,
|
|
90
|
-
)
|
|
91
|
-
)
|
|
92
|
-
chunk_index += 1
|
|
93
|
-
|
|
94
|
-
# Serialize result to dict
|
|
95
|
-
result_data = {
|
|
96
|
-
"result_id": result_id,
|
|
97
|
-
"run_id": run_id,
|
|
98
|
-
"scenario_run_id": scenario_run_id,
|
|
99
|
-
"scenario_name": scenario_name,
|
|
100
|
-
"persona": result.persona,
|
|
101
|
-
"status": result.status.value,
|
|
102
|
-
"start_time": result.start_time.isoformat(),
|
|
103
|
-
"end_time": result.end_time.isoformat() if result.end_time else None,
|
|
104
|
-
"agent_model": result.agent_model,
|
|
105
|
-
"simulated_user_model": result.simulated_user_model,
|
|
106
|
-
"checker_model": result.checker_model,
|
|
107
|
-
"conversation_id": result.conversation_id,
|
|
108
|
-
"final_state": result.final_state,
|
|
109
|
-
"metrics": result.metrics.metrics if result.metrics else None,
|
|
110
|
-
"traces": result.traces,
|
|
111
|
-
"error": result.error,
|
|
112
|
-
"turns": [self._serialize_turn(t) for t in result.turns],
|
|
113
|
-
"tasks": [self._serialize_task(t) for t in result.tasks],
|
|
114
|
-
"response_chunks": [self._serialize_chunk(c) for c in all_chunks],
|
|
115
|
-
"created_at": datetime.now(timezone.utc).isoformat(),
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
# Save result
|
|
119
|
-
await self._kv.set(f"{self._RESULT_PREFIX}{result_id}", result_data)
|
|
120
|
-
|
|
121
|
-
# Update results index
|
|
122
|
-
await self._add_to_index(self._INDEX_KEY, result_id, result_data)
|
|
123
|
-
|
|
124
|
-
# Update or create run
|
|
125
|
-
await self._update_run(run_id, result_id, result_data)
|
|
126
|
-
|
|
127
|
-
logger.info(f"Saved result {result_id} to KV store")
|
|
128
|
-
return result_id
|
|
129
|
-
|
|
130
|
-
def _serialize_turn(self, turn: TurnResult) -> dict[str, Any]: # noqa: PLR6301
|
|
131
|
-
"""Serialize a turn result."""
|
|
132
|
-
token_usage = None
|
|
133
|
-
if turn.token_usage:
|
|
134
|
-
if hasattr(turn.token_usage, "model_dump"):
|
|
135
|
-
token_usage = turn.token_usage.model_dump()
|
|
136
|
-
elif isinstance(turn.token_usage, dict):
|
|
137
|
-
token_usage = turn.token_usage
|
|
138
|
-
|
|
139
|
-
return {
|
|
140
|
-
"turn_index": turn.turn_index,
|
|
141
|
-
"task_index": turn.task_index,
|
|
142
|
-
"user_message": turn.user_message,
|
|
143
|
-
"assistant_message": turn.assistant_message,
|
|
144
|
-
"tool_calls": turn.tool_calls,
|
|
145
|
-
"task_completed": turn.task_completed,
|
|
146
|
-
"task_completed_reason": turn.task_completed_reason,
|
|
147
|
-
"token_usage": token_usage,
|
|
148
|
-
"latency_ms": turn.latency_ms,
|
|
149
|
-
"checkers": [c.to_dict() for c in turn.checkers] if turn.checkers else [],
|
|
150
|
-
"checker_mode": turn.checker_mode,
|
|
151
|
-
}
|
|
152
|
-
|
|
153
|
-
def _serialize_task(self, task: TaskResult) -> dict[str, Any]: # noqa: PLR6301
|
|
154
|
-
"""Serialize a task result."""
|
|
155
|
-
return {
|
|
156
|
-
"task_index": task.task_index,
|
|
157
|
-
"description": task.description,
|
|
158
|
-
"completed": task.completed,
|
|
159
|
-
"turns_taken": task.turns_taken,
|
|
160
|
-
"final_reason": task.final_reason,
|
|
161
|
-
"checkers": task.checkers,
|
|
162
|
-
"checker_mode": task.checker_mode,
|
|
163
|
-
}
|
|
164
|
-
|
|
165
|
-
def _serialize_chunk(self, chunk: ResponseChunk) -> dict[str, Any]: # noqa: PLR6301
|
|
166
|
-
"""Serialize a response chunk."""
|
|
167
|
-
return {
|
|
168
|
-
"turn_index": chunk.turn_index,
|
|
169
|
-
"task_index": chunk.task_index,
|
|
170
|
-
"chunk_index": chunk.chunk_index,
|
|
171
|
-
"chunk_type": chunk.chunk_type,
|
|
172
|
-
"chunk_data": chunk.chunk_data,
|
|
173
|
-
}
|
|
174
|
-
|
|
175
|
-
async def _add_to_index(self, index_key: str, item_id: str, data: dict[str, Any]) -> None:
|
|
176
|
-
"""Add item to an index."""
|
|
177
|
-
index = await self._kv.get(index_key) or {"items": []}
|
|
178
|
-
|
|
179
|
-
# Add summary to index
|
|
180
|
-
summary = {
|
|
181
|
-
"id": item_id,
|
|
182
|
-
"scenario_name": data.get("scenario_name"),
|
|
183
|
-
"status": data.get("status"),
|
|
184
|
-
"start_time": data.get("start_time"),
|
|
185
|
-
"created_at": data.get("created_at"),
|
|
186
|
-
"metrics": data.get("metrics"),
|
|
187
|
-
}
|
|
188
|
-
index["items"].insert(0, summary) # Most recent first
|
|
189
|
-
|
|
190
|
-
await self._kv.set(index_key, index)
|
|
191
|
-
|
|
192
|
-
async def _update_run(self, run_id: str, result_id: str, result_data: dict[str, Any]) -> None:
|
|
193
|
-
"""Update or create a run record."""
|
|
194
|
-
run_key = f"{self._RUN_PREFIX}{run_id}"
|
|
195
|
-
run = await self._kv.get(run_key)
|
|
196
|
-
|
|
197
|
-
if run is None:
|
|
198
|
-
run = {
|
|
199
|
-
"id": run_id,
|
|
200
|
-
"timestamp": datetime.now(timezone.utc).isoformat(),
|
|
201
|
-
"status": "running",
|
|
202
|
-
"results": [],
|
|
203
|
-
"total_scenarios": 0,
|
|
204
|
-
"completed_scenarios": 0,
|
|
205
|
-
"failed_scenarios": 0,
|
|
206
|
-
"total_tokens": 0,
|
|
207
|
-
"total_cost_usd": 0.0,
|
|
208
|
-
}
|
|
209
|
-
# Add to runs index
|
|
210
|
-
runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
|
|
211
|
-
runs_index["items"].insert(0, {"id": run_id, "timestamp": run["timestamp"]})
|
|
212
|
-
await self._kv.set(self._RUNS_INDEX_KEY, runs_index)
|
|
213
|
-
|
|
214
|
-
# Add result to run
|
|
215
|
-
run["results"].append(
|
|
216
|
-
{
|
|
217
|
-
"result_id": result_id,
|
|
218
|
-
"scenario_name": result_data.get("scenario_name"),
|
|
219
|
-
"persona": result_data.get("persona"),
|
|
220
|
-
"status": result_data.get("status"),
|
|
221
|
-
"metrics": result_data.get("metrics"),
|
|
222
|
-
}
|
|
223
|
-
)
|
|
224
|
-
|
|
225
|
-
# Update stats
|
|
226
|
-
run["total_scenarios"] = len(run["results"])
|
|
227
|
-
run["completed_scenarios"] = sum(1 for r in run["results"] if r["status"] == "completed")
|
|
228
|
-
run["failed_scenarios"] = sum(1 for r in run["results"] if r["status"] in ("failed", "timeout"))
|
|
229
|
-
|
|
230
|
-
metrics = result_data.get("metrics") or {}
|
|
231
|
-
run["total_tokens"] += metrics.get("total_tokens", 0)
|
|
232
|
-
run["total_cost_usd"] += metrics.get("total_cost_usd", 0.0)
|
|
233
|
-
|
|
234
|
-
# Update run status
|
|
235
|
-
if run["completed_scenarios"] == run["total_scenarios"]:
|
|
236
|
-
run["status"] = "completed"
|
|
237
|
-
elif run["failed_scenarios"] > 0:
|
|
238
|
-
run["status"] = "failed"
|
|
239
|
-
|
|
240
|
-
await self._kv.set(run_key, run)
|
|
241
|
-
|
|
242
|
-
async def load_result(self, result_id: str) -> SimulationResult | None:
|
|
243
|
-
"""Load a simulation result."""
|
|
244
|
-
data = await self._kv.get(f"{self._RESULT_PREFIX}{result_id}")
|
|
245
|
-
if not data:
|
|
246
|
-
return None
|
|
247
|
-
|
|
248
|
-
return self._deserialize_result(data)
|
|
249
|
-
|
|
250
|
-
def _deserialize_result(self, data: dict[str, Any]) -> SimulationResult: # noqa: PLR6301
|
|
251
|
-
"""Deserialize a result from dict."""
|
|
252
|
-
from ragbits.evaluate.agent_simulation.results import CheckerResultItem
|
|
253
|
-
|
|
254
|
-
turns = []
|
|
255
|
-
for t in data.get("turns", []):
|
|
256
|
-
checkers = [CheckerResultItem.from_dict(c) for c in t.get("checkers", [])]
|
|
257
|
-
turns.append(
|
|
258
|
-
TurnResult(
|
|
259
|
-
turn_index=t["turn_index"],
|
|
260
|
-
task_index=t["task_index"],
|
|
261
|
-
user_message=t.get("user_message", ""),
|
|
262
|
-
assistant_message=t.get("assistant_message", ""),
|
|
263
|
-
tool_calls=t.get("tool_calls", []),
|
|
264
|
-
task_completed=t.get("task_completed", False),
|
|
265
|
-
task_completed_reason=t.get("task_completed_reason", ""),
|
|
266
|
-
token_usage=t.get("token_usage"),
|
|
267
|
-
latency_ms=t.get("latency_ms"),
|
|
268
|
-
checkers=checkers,
|
|
269
|
-
checker_mode=t.get("checker_mode", "all"),
|
|
270
|
-
)
|
|
271
|
-
)
|
|
272
|
-
|
|
273
|
-
tasks = [
|
|
274
|
-
TaskResult(
|
|
275
|
-
task_index=t["task_index"],
|
|
276
|
-
description=t.get("description", ""),
|
|
277
|
-
completed=t.get("completed", False),
|
|
278
|
-
turns_taken=t.get("turns_taken", 0),
|
|
279
|
-
final_reason=t.get("final_reason", ""),
|
|
280
|
-
checkers=t.get("checkers", []),
|
|
281
|
-
checker_mode=t.get("checker_mode", "all"),
|
|
282
|
-
)
|
|
283
|
-
for t in data.get("tasks", [])
|
|
284
|
-
]
|
|
285
|
-
|
|
286
|
-
chunks = [
|
|
287
|
-
ResponseChunk(
|
|
288
|
-
turn_index=c["turn_index"],
|
|
289
|
-
task_index=c["task_index"],
|
|
290
|
-
chunk_index=c["chunk_index"],
|
|
291
|
-
chunk_type=c["chunk_type"],
|
|
292
|
-
chunk_data=c.get("chunk_data", {}),
|
|
293
|
-
)
|
|
294
|
-
for c in data.get("response_chunks", [])
|
|
295
|
-
]
|
|
296
|
-
|
|
297
|
-
return SimulationResult(
|
|
298
|
-
scenario_name=data["scenario_name"],
|
|
299
|
-
start_time=datetime.fromisoformat(data["start_time"]),
|
|
300
|
-
end_time=datetime.fromisoformat(data["end_time"]) if data.get("end_time") else None,
|
|
301
|
-
status=SimulationStatus(data["status"]),
|
|
302
|
-
agent_model=data.get("agent_model"),
|
|
303
|
-
simulated_user_model=data.get("simulated_user_model"),
|
|
304
|
-
checker_model=data.get("checker_model"),
|
|
305
|
-
persona=data.get("persona"),
|
|
306
|
-
error=data.get("error"),
|
|
307
|
-
conversation_id=data.get("conversation_id"),
|
|
308
|
-
final_state=data.get("final_state", {}),
|
|
309
|
-
turns=turns,
|
|
310
|
-
tasks=tasks,
|
|
311
|
-
metrics=ConversationMetrics(metrics=data["metrics"]) if data.get("metrics") else None,
|
|
312
|
-
response_chunks=chunks,
|
|
313
|
-
traces=data.get("traces", []),
|
|
314
|
-
)
|
|
315
|
-
|
|
316
|
-
async def delete_result(self, result_id: str) -> bool:
|
|
317
|
-
"""Delete a simulation result."""
|
|
318
|
-
data = await self._kv.get(f"{self._RESULT_PREFIX}{result_id}")
|
|
319
|
-
if not data:
|
|
320
|
-
return False
|
|
321
|
-
|
|
322
|
-
# Remove from result
|
|
323
|
-
await self._kv.delete(f"{self._RESULT_PREFIX}{result_id}")
|
|
324
|
-
|
|
325
|
-
# Update index
|
|
326
|
-
index = await self._kv.get(self._INDEX_KEY) or {"items": []}
|
|
327
|
-
index["items"] = [i for i in index["items"] if i["id"] != result_id]
|
|
328
|
-
await self._kv.set(self._INDEX_KEY, index)
|
|
329
|
-
|
|
330
|
-
# Update run
|
|
331
|
-
run_id = data.get("run_id")
|
|
332
|
-
if run_id:
|
|
333
|
-
run = await self._kv.get(f"{self._RUN_PREFIX}{run_id}")
|
|
334
|
-
if run:
|
|
335
|
-
run["results"] = [r for r in run["results"] if r["result_id"] != result_id]
|
|
336
|
-
if not run["results"]:
|
|
337
|
-
await self._kv.delete(f"{self._RUN_PREFIX}{run_id}")
|
|
338
|
-
# Remove from runs index
|
|
339
|
-
runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
|
|
340
|
-
runs_index["items"] = [i for i in runs_index["items"] if i["id"] != run_id]
|
|
341
|
-
await self._kv.set(self._RUNS_INDEX_KEY, runs_index)
|
|
342
|
-
else:
|
|
343
|
-
await self._kv.set(f"{self._RUN_PREFIX}{run_id}", run)
|
|
344
|
-
|
|
345
|
-
return True
|
|
346
|
-
|
|
347
|
-
async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
|
|
348
|
-
"""List evaluation results with pagination."""
|
|
349
|
-
index = await self._kv.get(self._INDEX_KEY) or {"items": []}
|
|
350
|
-
items = index["items"]
|
|
351
|
-
total = len(items)
|
|
352
|
-
|
|
353
|
-
summaries = []
|
|
354
|
-
for item in items[offset : offset + limit]:
|
|
355
|
-
metrics = item.get("metrics") or {}
|
|
356
|
-
summaries.append(
|
|
357
|
-
ResultSummary(
|
|
358
|
-
result_id=item["id"],
|
|
359
|
-
scenario_name=item.get("scenario_name", ""),
|
|
360
|
-
timestamp=datetime.fromisoformat(item["start_time"])
|
|
361
|
-
if item.get("start_time")
|
|
362
|
-
else datetime.now(timezone.utc),
|
|
363
|
-
status=SimulationStatus(item.get("status", "unknown")),
|
|
364
|
-
tasks_completed=metrics.get("tasks_completed", 0),
|
|
365
|
-
total_tasks=metrics.get("total_tasks", 0),
|
|
366
|
-
success_rate=metrics.get("success_rate", 0.0),
|
|
367
|
-
total_turns=metrics.get("total_turns", 0),
|
|
368
|
-
total_tokens=metrics.get("total_tokens", 0),
|
|
369
|
-
total_cost_usd=metrics.get("total_cost_usd", 0.0),
|
|
370
|
-
)
|
|
371
|
-
)
|
|
372
|
-
|
|
373
|
-
return summaries, total
|
|
374
|
-
|
|
375
|
-
async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
|
|
376
|
-
"""List simulation runs."""
|
|
377
|
-
runs_index = await self._kv.get(self._RUNS_INDEX_KEY) or {"items": []}
|
|
378
|
-
items = runs_index["items"]
|
|
379
|
-
total = len(items)
|
|
380
|
-
|
|
381
|
-
runs = []
|
|
382
|
-
for item in items[offset : offset + limit]:
|
|
383
|
-
run = await self._kv.get(f"{self._RUN_PREFIX}{item['id']}")
|
|
384
|
-
if run:
|
|
385
|
-
runs.append(self._run_to_summary(run))
|
|
386
|
-
|
|
387
|
-
return runs, total
|
|
388
|
-
|
|
389
|
-
def _run_to_summary(self, run: dict[str, Any]) -> SimulationRunSummary: # noqa: PLR6301
|
|
390
|
-
"""Convert run data to summary."""
|
|
391
|
-
from ragbits.evaluate.api_types import ScenarioRunSummary
|
|
392
|
-
|
|
393
|
-
scenario_runs = []
|
|
394
|
-
for r in run.get("results", []):
|
|
395
|
-
metrics = r.get("metrics") or {}
|
|
396
|
-
scenario_runs.append(
|
|
397
|
-
ScenarioRunSummary(
|
|
398
|
-
id=r["result_id"],
|
|
399
|
-
scenario_name=r.get("scenario_name", ""),
|
|
400
|
-
persona=r.get("persona"),
|
|
401
|
-
status=SimulationStatus(r.get("status", "unknown")),
|
|
402
|
-
start_time=datetime.now(timezone.utc), # Not stored in summary
|
|
403
|
-
end_time=None,
|
|
404
|
-
total_turns=metrics.get("total_turns", 0),
|
|
405
|
-
total_tasks=metrics.get("total_tasks", 0),
|
|
406
|
-
tasks_completed=metrics.get("tasks_completed", 0),
|
|
407
|
-
success_rate=metrics.get("success_rate", 0.0),
|
|
408
|
-
total_tokens=metrics.get("total_tokens", 0),
|
|
409
|
-
total_cost_usd=metrics.get("total_cost_usd", 0.0),
|
|
410
|
-
error=None,
|
|
411
|
-
)
|
|
412
|
-
)
|
|
413
|
-
|
|
414
|
-
success_rates = [sr.success_rate for sr in scenario_runs if sr.success_rate > 0]
|
|
415
|
-
overall_success_rate = sum(success_rates) / len(success_rates) if success_rates else 0.0
|
|
416
|
-
|
|
417
|
-
return SimulationRunSummary(
|
|
418
|
-
id=run["id"],
|
|
419
|
-
timestamp=datetime.fromisoformat(run["timestamp"]),
|
|
420
|
-
status=SimulationStatus(run.get("status", "unknown")),
|
|
421
|
-
scenario_runs=scenario_runs,
|
|
422
|
-
total_scenarios=run.get("total_scenarios", 0),
|
|
423
|
-
completed_scenarios=run.get("completed_scenarios", 0),
|
|
424
|
-
failed_scenarios=run.get("failed_scenarios", 0),
|
|
425
|
-
total_tokens=run.get("total_tokens", 0),
|
|
426
|
-
total_cost_usd=run.get("total_cost_usd", 0.0),
|
|
427
|
-
overall_success_rate=overall_success_rate,
|
|
428
|
-
)
|
|
429
|
-
|
|
430
|
-
async def get_run(self, run_id: str) -> SimulationRunDetail | None:
|
|
431
|
-
"""Get full details for a simulation run."""
|
|
432
|
-
run = await self._kv.get(f"{self._RUN_PREFIX}{run_id}")
|
|
433
|
-
if not run:
|
|
434
|
-
return None
|
|
435
|
-
|
|
436
|
-
scenario_runs = []
|
|
437
|
-
for r in run.get("results", []):
|
|
438
|
-
# Load full result
|
|
439
|
-
result_data = await self._kv.get(f"{self._RESULT_PREFIX}{r['result_id']}")
|
|
440
|
-
if result_data:
|
|
441
|
-
scenario_runs.append(self._result_to_scenario_detail(result_data))
|
|
442
|
-
|
|
443
|
-
success_rates = [sr.metrics.get("success_rate", 0.0) for sr in scenario_runs if sr.metrics]
|
|
444
|
-
overall_success_rate = sum(success_rates) / len(success_rates) if success_rates else 0.0
|
|
445
|
-
|
|
446
|
-
return SimulationRunDetail(
|
|
447
|
-
id=run["id"],
|
|
448
|
-
timestamp=datetime.fromisoformat(run["timestamp"]),
|
|
449
|
-
status=SimulationStatus(run.get("status", "unknown")),
|
|
450
|
-
scenario_runs=scenario_runs,
|
|
451
|
-
total_scenarios=run.get("total_scenarios", 0),
|
|
452
|
-
completed_scenarios=run.get("completed_scenarios", 0),
|
|
453
|
-
failed_scenarios=run.get("failed_scenarios", 0),
|
|
454
|
-
total_tokens=run.get("total_tokens", 0),
|
|
455
|
-
total_cost_usd=run.get("total_cost_usd", 0.0),
|
|
456
|
-
overall_success_rate=overall_success_rate,
|
|
457
|
-
)
|
|
458
|
-
|
|
459
|
-
def _result_to_scenario_detail(self, data: dict[str, Any]) -> Any: # noqa: PLR6301, ANN401
|
|
460
|
-
"""Convert result data to scenario detail."""
|
|
461
|
-
from ragbits.evaluate.api_types import (
|
|
462
|
-
CheckerResultItemResponse,
|
|
463
|
-
ResponseChunkResponse,
|
|
464
|
-
ScenarioRunDetail,
|
|
465
|
-
TaskResultResponse,
|
|
466
|
-
TurnResultResponse,
|
|
467
|
-
)
|
|
468
|
-
|
|
469
|
-
turns = []
|
|
470
|
-
for t in data.get("turns", []):
|
|
471
|
-
token_usage = t.get("token_usage")
|
|
472
|
-
if token_usage:
|
|
473
|
-
token_usage = {
|
|
474
|
-
"prompt_tokens": token_usage.get("prompt_tokens", 0),
|
|
475
|
-
"completion_tokens": token_usage.get("completion_tokens", 0),
|
|
476
|
-
"total_tokens": token_usage.get("total_tokens", 0),
|
|
477
|
-
}
|
|
478
|
-
turns.append(
|
|
479
|
-
TurnResultResponse(
|
|
480
|
-
turn_index=t["turn_index"],
|
|
481
|
-
task_index=t["task_index"],
|
|
482
|
-
user_message=t.get("user_message", ""),
|
|
483
|
-
assistant_message=t.get("assistant_message", ""),
|
|
484
|
-
tool_calls=t.get("tool_calls", []),
|
|
485
|
-
task_completed=t.get("task_completed", False),
|
|
486
|
-
task_completed_reason=t.get("task_completed_reason", ""),
|
|
487
|
-
token_usage=token_usage,
|
|
488
|
-
latency_ms=t.get("latency_ms"),
|
|
489
|
-
checkers=[
|
|
490
|
-
CheckerResultItemResponse(
|
|
491
|
-
type=c.get("type", "unknown"),
|
|
492
|
-
completed=c.get("completed", False),
|
|
493
|
-
reason=c.get("reason", ""),
|
|
494
|
-
)
|
|
495
|
-
for c in t.get("checkers", [])
|
|
496
|
-
],
|
|
497
|
-
checker_mode=t.get("checker_mode", "all"),
|
|
498
|
-
)
|
|
499
|
-
)
|
|
500
|
-
|
|
501
|
-
tasks = [
|
|
502
|
-
TaskResultResponse(
|
|
503
|
-
task_index=t["task_index"],
|
|
504
|
-
description=t.get("description", ""),
|
|
505
|
-
completed=t.get("completed", False),
|
|
506
|
-
turns_taken=t.get("turns_taken", 0),
|
|
507
|
-
final_reason=t.get("final_reason", ""),
|
|
508
|
-
)
|
|
509
|
-
for t in data.get("tasks", [])
|
|
510
|
-
]
|
|
511
|
-
|
|
512
|
-
chunks = [
|
|
513
|
-
ResponseChunkResponse(
|
|
514
|
-
turn_index=c["turn_index"],
|
|
515
|
-
task_index=c["task_index"],
|
|
516
|
-
chunk_index=c["chunk_index"],
|
|
517
|
-
chunk_type=c["chunk_type"],
|
|
518
|
-
chunk_data=c.get("chunk_data", {}),
|
|
519
|
-
)
|
|
520
|
-
for c in data.get("response_chunks", [])
|
|
521
|
-
]
|
|
522
|
-
|
|
523
|
-
return ScenarioRunDetail(
|
|
524
|
-
id=data.get("scenario_run_id", data["result_id"]),
|
|
525
|
-
scenario_name=data["scenario_name"],
|
|
526
|
-
persona=data.get("persona"),
|
|
527
|
-
status=SimulationStatus(data["status"]),
|
|
528
|
-
start_time=datetime.fromisoformat(data["start_time"]),
|
|
529
|
-
end_time=datetime.fromisoformat(data["end_time"]) if data.get("end_time") else None,
|
|
530
|
-
turns=turns,
|
|
531
|
-
tasks=tasks,
|
|
532
|
-
response_chunks=chunks,
|
|
533
|
-
metrics=data.get("metrics"),
|
|
534
|
-
error=data.get("error"),
|
|
535
|
-
)
|
{ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL
RENAMED
|
File without changes
|