ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only.
@@ -1,466 +0,0 @@
- """File-based evaluation report storage using JSON files."""
-
- import json
- import logging
- from datetime import datetime, timezone
- from pathlib import Path
-
- from ragbits.evaluate.agent_simulation.results import ResponseChunk, SimulationResult, SimulationStatus
- from ragbits.evaluate.api_types import (
-     CheckerResultItemResponse,
-     ResponseChunkResponse,
-     ResultSummary,
-     ScenarioRunDetail,
-     ScenarioRunSummary,
-     SimulationRunDetail,
-     SimulationRunSummary,
-     TaskResultResponse,
-     TurnResultResponse,
- )
- from ragbits.evaluate.stores.base import EvalReportStore
-
- logger = logging.getLogger(__name__)
-
-
- class FileEvalReportStore(EvalReportStore):
-     """File-based storage for evaluation reports using JSON files.
-
-     This is the default storage backend that maintains backward compatibility
-     with the existing file-based approach.
-     """
-
-     def __init__(self, results_dir: Path | str) -> None:
-         """Initialize the file-based store.
-
-         Args:
-             results_dir: Directory for storing evaluation results as JSON files.
-         """
-         self.results_dir = Path(results_dir)
-         self.results_dir.mkdir(parents=True, exist_ok=True)
-
-     @staticmethod
-     def _generate_scenario_run_id(scenario_name: str, persona: str | None = None) -> str:
-         """Generate a unique scenario run ID for old results without one."""
-         import uuid
-
-         safe_scenario = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
-         safe_persona = ""
-         if persona:
-             safe_persona = "_" + "".join(c if c.isalnum() or c in "-_" else "_" for c in persona)
-         unique = uuid.uuid4().hex[:6]
-         return f"sr_{safe_scenario}{safe_persona}_{unique}"
-
-     async def save_result(
-         self,
-         run_id: str,
-         scenario_run_id: str,
-         scenario_name: str,
-         result: SimulationResult,
-         buffered_chunks: list[ResponseChunk] | None = None,
-     ) -> str:
-         """Save a simulation result to disk.
-
-         Args:
-             run_id: Run identifier.
-             scenario_run_id: Unique scenario run identifier.
-             scenario_name: Name of the scenario.
-             result: Simulation result to save.
-             buffered_chunks: Optional response chunks from the event buffer.
-
-         Returns:
-             Result ID (filename without extension).
-         """
-         # Add buffered chunks to the result (skip text chunks)
-         if buffered_chunks:
-             chunk_index = len(result.response_chunks)
-             for chunk in buffered_chunks:
-                 if chunk.chunk_type == "text":
-                     continue
-                 result.response_chunks.append(
-                     ResponseChunk(
-                         turn_index=chunk.turn_index,
-                         task_index=chunk.task_index,
-                         chunk_index=chunk_index,
-                         chunk_type=chunk.chunk_type,
-                         chunk_data=chunk.chunk_data,
-                     )
-                 )
-                 chunk_index += 1
-
-         timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
-         safe_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in scenario_name)
-         result_id = f"result_{timestamp}_{safe_name}"
-
-         result_path = self.results_dir / f"{result_id}.json"
-         result_data = result.to_dict()
-         # Include run_id and scenario_run_id for grouping and identification
-         result_data["run_id"] = run_id
-         result_data["scenario_run_id"] = scenario_run_id
-
-         with open(result_path, "w", encoding="utf-8") as f:
-             json.dump(result_data, f, indent=2, default=str)
-
-         logger.info(f"Saved result to {result_path}")
-         return result_id
-
-     async def load_result(self, result_id: str) -> SimulationResult | None:
-         """Load a simulation result from disk.
-
-         Args:
-             result_id: Result identifier (filename without extension).
-
-         Returns:
-             SimulationResult if found, None otherwise.
-         """
-         result_path = self.results_dir / f"{result_id}.json"
-         if not result_path.exists():
-             return None
-
-         try:
-             with open(result_path, encoding="utf-8") as f:
-                 data = json.load(f)
-             return SimulationResult.from_dict(data)
-         except (json.JSONDecodeError, KeyError, ValueError) as e:
-             logger.error(f"Failed to load result {result_id}: {e}")
-             return None
-
-     async def delete_result(self, result_id: str) -> bool:
-         """Delete a simulation result.
-
-         Args:
-             result_id: Result identifier.
-
-         Returns:
-             True if deleted, False if not found.
-         """
-         result_path = self.results_dir / f"{result_id}.json"
-         if result_path.exists():
-             result_path.unlink()
-             return True
-         return False
-
-     async def list_results(self, limit: int = 50, offset: int = 0) -> tuple[list[ResultSummary], int]:
-         """List evaluation results with pagination.
-
-         Args:
-             limit: Maximum number of results to return.
-             offset: Number of results to skip.
-
-         Returns:
-             Tuple of (results list, total count).
-         """
-         result_files = sorted(self.results_dir.glob("result_*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
-
-         total = len(result_files)
-         paginated = result_files[offset : offset + limit]
-
-         summaries = []
-         for path in paginated:
-             try:
-                 with open(path, encoding="utf-8") as f:
-                     data = json.load(f)
-
-                 metrics = data.get("metrics", {})
-                 summaries.append(
-                     ResultSummary(
-                         result_id=path.stem,
-                         scenario_name=data.get("scenario_name", "Unknown"),
-                         timestamp=datetime.fromisoformat(
-                             data.get("start_time", datetime.now(timezone.utc).isoformat())
-                         ),
-                         status=SimulationStatus(data.get("status", "completed")),
-                         tasks_completed=metrics.get("tasks_completed", 0),
-                         total_tasks=metrics.get("total_tasks", 0),
-                         success_rate=metrics.get("success_rate", 0.0),
-                         total_turns=metrics.get("total_turns", 0),
-                         total_tokens=metrics.get("total_tokens", 0),
-                         total_cost_usd=metrics.get("total_cost_usd", 0.0),
-                     )
-                 )
-             except (json.JSONDecodeError, KeyError, ValueError) as e:
-                 logger.warning(f"Failed to parse result file {path}: {e}")
-                 continue
-
-         return summaries, total
-
-     async def list_runs(self, limit: int = 50, offset: int = 0) -> tuple[list[SimulationRunSummary], int]:
-         """List simulation runs (batch runs grouped by run_id).
-
-         Args:
-             limit: Maximum number of runs to return.
-             offset: Number of runs to skip.
-
-         Returns:
-             Tuple of (runs list, total count).
-         """
-         result_files = sorted(self.results_dir.glob("result_*.json"), key=lambda p: p.stat().st_mtime, reverse=True)
-
-         # Group results by run_id
-         runs_map: dict[str, list[dict]] = {}
-         for path in result_files:
-             try:
-                 with open(path, encoding="utf-8") as f:
-                     data = json.load(f)
-                 run_id = data.get("run_id", path.stem)  # Fallback to result_id if no run_id
-                 if run_id not in runs_map:
-                     runs_map[run_id] = []
-                 runs_map[run_id].append(data)
-             except (json.JSONDecodeError, KeyError) as e:
-                 logger.warning(f"Failed to parse result file {path}: {e}")
-                 continue
-
-         # Convert to SimulationRunSummary objects
-         runs = []
-         for run_id, results in runs_map.items():
-             # Sort results by start_time
-             results.sort(key=lambda r: r.get("start_time", ""), reverse=True)
-
-             scenario_runs = []
-             total_tokens = 0
-             total_cost = 0.0
-             completed = 0
-             failed = 0
-
-             for result in results:
-                 metrics = result.get("metrics") or {}
-                 status = SimulationStatus(result.get("status", "completed"))
-
-                 if status == SimulationStatus.COMPLETED:
-                     completed += 1
-                 elif status in (SimulationStatus.FAILED, SimulationStatus.TIMEOUT):
-                     failed += 1
-
-                 tokens = metrics.get("total_tokens", 0)
-                 cost = metrics.get("total_cost_usd", 0.0)
-                 total_tokens += tokens
-                 total_cost += cost
-
-                 # Get scenario_run_id, falling back to generated one for old results
-                 scenario_name = result.get("scenario_name", "Unknown")
-                 scenario_run_id = result.get("scenario_run_id", self._generate_scenario_run_id(scenario_name))
-
-                 scenario_runs.append(
-                     ScenarioRunSummary(
-                         id=scenario_run_id,
-                         scenario_name=scenario_name,
-                         persona=result.get("persona"),
-                         status=status,
-                         start_time=datetime.fromisoformat(
-                             result.get("start_time", datetime.now(timezone.utc).isoformat())
-                         ),
-                         end_time=datetime.fromisoformat(result["end_time"]) if result.get("end_time") else None,
-                         total_turns=metrics.get("total_turns", 0),
-                         total_tasks=metrics.get("total_tasks", 0),
-                         tasks_completed=metrics.get("tasks_completed", 0),
-                         success_rate=metrics.get("success_rate", 0.0),
-                         total_tokens=tokens,
-                         total_cost_usd=cost,
-                         error=result.get("error"),
-                     )
-                 )
-
-             # Determine overall run status
-             total_scenarios = len(scenario_runs)
-             if failed > 0:
-                 overall_status = SimulationStatus.FAILED
-             elif completed == total_scenarios:
-                 overall_status = SimulationStatus.COMPLETED
-             else:
-                 overall_status = SimulationStatus.RUNNING
-
-             # Calculate overall success rate
-             success_rates = [sr.success_rate for sr in scenario_runs if sr.success_rate > 0]
-             overall_success_rate = sum(success_rates) / len(success_rates) if success_rates else 0.0
-
-             # Use the earliest start_time as the run timestamp
-             earliest_time = min(
-                 (datetime.fromisoformat(r.get("start_time", datetime.now(timezone.utc).isoformat())) for r in results),
-                 default=datetime.now(timezone.utc),
-             )
-
-             runs.append(
-                 SimulationRunSummary(
-                     id=run_id,
-                     timestamp=earliest_time,
-                     status=overall_status,
-                     scenario_runs=scenario_runs,
-                     total_scenarios=total_scenarios,
-                     completed_scenarios=completed,
-                     failed_scenarios=failed,
-                     total_tokens=total_tokens,
-                     total_cost_usd=total_cost,
-                     overall_success_rate=overall_success_rate,
-                 )
-             )
-
-         # Sort runs by timestamp (newest first)
-         runs.sort(key=lambda r: r.timestamp, reverse=True)
-
-         total = len(runs)
-         paginated = runs[offset : offset + limit]
-
-         return paginated, total
-
-     async def get_run(self, run_id: str) -> SimulationRunDetail | None:  # noqa: PLR0912, PLR0915
-         """Get full details for a simulation run.
-
-         Args:
-             run_id: Run identifier.
-
-         Returns:
-             SimulationRunDetail if found, None otherwise.
-         """
-         result_files = self.results_dir.glob("result_*.json")
-
-         # Find all results for this run_id
-         results = []
-         for path in result_files:
-             try:
-                 with open(path, encoding="utf-8") as f:
-                     data = json.load(f)
-                 if data.get("run_id") == run_id:
-                     results.append(data)
-             except (json.JSONDecodeError, KeyError) as e:
-                 logger.warning(f"Failed to parse result file {path}: {e}")
-                 continue
-
-         if not results:
-             return None
-
-         # Build full scenario run details
-         scenario_runs = []
-         total_tokens = 0
-         total_cost = 0.0
-         completed = 0
-         failed = 0
-
-         for result in results:
-             metrics = result.get("metrics") or {}
-             status = SimulationStatus(result.get("status", "completed"))
-
-             if status == SimulationStatus.COMPLETED:
-                 completed += 1
-             elif status in (SimulationStatus.FAILED, SimulationStatus.TIMEOUT):
-                 failed += 1
-
-             tokens = metrics.get("total_tokens", 0)
-             cost = metrics.get("total_cost_usd", 0.0)
-             total_tokens += tokens
-             total_cost += cost
-
-             # Parse turns
-             turns = []
-             for turn in result.get("turns", []):
-                 token_usage = turn.get("token_usage")
-                 # Extract only prompt, completion, and total tokens
-                 if hasattr(token_usage, "model_dump"):
-                     token_usage = token_usage.model_dump(include={"prompt_tokens", "completion_tokens", "total_tokens"})
-                 elif isinstance(token_usage, dict):
-                     token_usage = {
-                         "prompt_tokens": token_usage.get("prompt_tokens", 0),
-                         "completion_tokens": token_usage.get("completion_tokens", 0),
-                         "total_tokens": token_usage.get("total_tokens", 0),
-                     }
-                 else:
-                     token_usage = None
-                 checkers = [
-                     CheckerResultItemResponse(
-                         type=c.get("type", "unknown"),
-                         completed=c.get("completed", False),
-                         reason=c.get("reason", ""),
-                     )
-                     for c in turn.get("checkers", [])
-                 ]
-                 turns.append(
-                     TurnResultResponse(
-                         turn_index=turn.get("turn_index", 0),
-                         task_index=turn.get("task_index", 0),
-                         user_message=turn.get("user_message", ""),
-                         assistant_message=turn.get("assistant_message", ""),
-                         tool_calls=turn.get("tool_calls", []),
-                         task_completed=turn.get("task_completed", False),
-                         task_completed_reason=turn.get("task_completed_reason", ""),
-                         token_usage=token_usage,
-                         latency_ms=turn.get("latency_ms"),
-                         checkers=checkers,
-                         checker_mode=turn.get("checker_mode", "all"),
-                     )
-                 )
-
-             # Parse tasks
-             tasks = []
-             for task in result.get("tasks", []):
-                 tasks.append(
-                     TaskResultResponse(
-                         task_index=task.get("task_index", 0),
-                         description=task.get("description", ""),
-                         completed=task.get("completed", False),
-                         turns_taken=task.get("turns_taken", 0),
-                         final_reason=task.get("final_reason", ""),
-                     )
-                 )
-
-             # Parse response chunks
-             response_chunks = []
-             for chunk in result.get("response_chunks", []):
-                 response_chunks.append(
-                     ResponseChunkResponse(
-                         turn_index=chunk.get("turn_index", 0),
-                         task_index=chunk.get("task_index", 0),
-                         chunk_index=chunk.get("chunk_index", 0),
-                         chunk_type=chunk.get("chunk_type", "unknown"),
-                         chunk_data=chunk.get("chunk_data", {}),
-                     )
-                 )
-
-             # Get scenario_run_id, falling back to generated one for old results
-             scenario_name = result.get("scenario_name", "Unknown")
-             scenario_run_id = result.get("scenario_run_id", self._generate_scenario_run_id(scenario_name))
-
-             scenario_runs.append(
-                 ScenarioRunDetail(
-                     id=scenario_run_id,
-                     scenario_name=scenario_name,
-                     persona=result.get("persona"),
-                     status=status,
-                     start_time=datetime.fromisoformat(result.get("start_time", datetime.now(timezone.utc).isoformat())),
-                     end_time=datetime.fromisoformat(result["end_time"]) if result.get("end_time") else None,
-                     turns=turns,
-                     tasks=tasks,
-                     response_chunks=response_chunks,
-                     metrics=metrics if metrics else None,
-                     error=result.get("error"),
-                 )
-             )
-
-         # Determine overall run status
-         total_scenarios = len(scenario_runs)
-         if failed > 0:
-             overall_status = SimulationStatus.FAILED
-         elif completed == total_scenarios:
-             overall_status = SimulationStatus.COMPLETED
-         else:
-             overall_status = SimulationStatus.RUNNING
-
-         # Calculate overall success rate
-         success_rates = [sr.metrics.get("success_rate", 0.0) for sr in scenario_runs if sr.metrics]
-         overall_success_rate = sum(success_rates) / len(success_rates) if success_rates else 0.0
-
-         # Use the earliest start_time as the run timestamp
-         earliest_time = min(
-             (sr.start_time for sr in scenario_runs),
-             default=datetime.now(timezone.utc),
-         )
-
-         return SimulationRunDetail(
-             id=run_id,
-             timestamp=earliest_time,
-             status=overall_status,
-             scenario_runs=scenario_runs,
-             total_scenarios=total_scenarios,
-             completed_scenarios=completed,
-             failed_scenarios=failed,
-             total_tokens=total_tokens,
-             total_cost_usd=total_cost,
-             overall_success_rate=overall_success_rate,
-         )