ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff compares publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
ragbits/evaluate/api_types.py
DELETED
@@ -1,343 +0,0 @@
-"""API types for the evaluation UI endpoints."""
-
-from datetime import datetime
-from typing import Any
-
-from pydantic import BaseModel, Field
-
-from ragbits.evaluate.agent_simulation.results import SimulationStatus
-
-
-class TaskDetail(BaseModel):
-    """Task details for scenario display and editing."""
-
-    task: str
-    checkers: list[dict[str, Any]] = Field(default_factory=list)
-    checker_mode: str = "all"
-
-
-class ScenarioSummary(BaseModel):
-    """Summary of a scenario for listing."""
-
-    name: str
-    num_tasks: int
-    group: str | None = None
-
-
-class ScenarioFileSummary(BaseModel):
-    """Summary of a scenario file with its scenarios."""
-
-    filename: str
-    group: str | None = None
-    scenarios: list[ScenarioSummary]
-
-
-class ScenarioDetail(BaseModel):
-    """Full scenario details for viewing and editing."""
-
-    name: str
-    tasks: list[TaskDetail]
-    group: str | None = None
-
-
-class EvalConfigResponse(BaseModel):
-    """Configuration response with available scenarios."""
-
-    available_scenarios: list[ScenarioSummary]
-    scenario_files: list[ScenarioFileSummary] = Field(default_factory=list)
-    scenarios_dir: str
-
-
-class RunEvaluationConfig(BaseModel):
-    """Configuration for running a simulation via API.
-
-    This is the frontend-facing config model with validation constraints.
-    """
-
-    max_turns_scenario: int = Field(default=15, ge=1, le=100)
-    max_turns_task: int | None = Field(default=4, ge=1, le=50)
-    sim_user_model_name: str | None = None
-    checker_model_name: str | None = None
-    default_model: str = "gpt-4o-mini"
-
-
-class RunEvaluationRequest(BaseModel):
-    """Request to start an evaluation run.
-
-    If personas is provided, creates a matrix run: each scenario × each persona.
-    If personas is empty or None, runs each scenario once without a persona.
-    """
-
-    scenario_names: list[str] = Field(..., min_length=1)
-    personas: list[str] | None = Field(default=None, description="Personas for matrix runs")
-    config: RunEvaluationConfig = Field(default_factory=RunEvaluationConfig)
-
-
-class RunStartResponse(BaseModel):
-    """Response when starting an evaluation run."""
-
-    run_id: str
-    scenarios: list[str]
-
-
-class ProgressUpdate(BaseModel):
-    """Base progress update for SSE streaming."""
-
-    type: str
-    run_id: str
-    scenario_run_id: str  # Unique ID for the scenario run (scenario + persona)
-    scenario_name: str
-    persona: str | None = None  # Persona used for this scenario run
-
-
-class StatusProgressUpdate(ProgressUpdate):
-    """Status change progress update."""
-
-    type: str = "status"
-    status: SimulationStatus
-    current_turn: int | None = None
-    current_task_index: int | None = None
-    current_task: str | None = None
-
-
-class TurnProgressUpdate(ProgressUpdate):
-    """Turn completion progress update."""
-
-    type: str = "turn"
-    turn_index: int
-    task_index: int
-    user_message: str
-    assistant_message: str
-    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
-    task_completed: bool = False
-    task_completed_reason: str = ""
-    checkers: list[dict[str, Any]] = Field(default_factory=list)
-    checker_mode: str = "all"
-
-
-class TaskCompleteUpdate(ProgressUpdate):
-    """Task completion progress update."""
-
-    type: str = "task_complete"
-    task_index: int
-    task_description: str
-    turns_taken: int
-    reason: str
-
-
-class CompletionUpdate(ProgressUpdate):
-    """Scenario completion progress update."""
-
-    type: str = "complete"
-    result_id: str
-    status: SimulationStatus
-    success_rate: float
-    total_turns: int
-    total_tasks: int
-    tasks_completed: int
-
-
-class ErrorUpdate(ProgressUpdate):
-    """Error progress update."""
-
-    type: str = "error"
-    error: str
-
-
-class SourceReference(BaseModel):
-    """A source/reference document from the chat response."""
-
-    title: str
-    content: str
-    url: str | None = None
-
-
-class SourceUpdate(ProgressUpdate):
-    """Source/reference document progress update."""
-
-    type: str = "source"
-    turn_index: int
-    task_index: int
-    source: SourceReference
-
-
-class ResponseChunkUpdate(ProgressUpdate):
-    """Real-time ChatInterface response chunk progress update.
-
-    Streams raw response chunks from the ChatInterface as they arrive,
-    enabling real-time visibility into all response types.
-    """
-
-    type: str = "response_chunk"
-    turn_index: int
-    task_index: int
-    chunk_type: str  # e.g., "text", "reference", "tool_call", "usage", "live_update", etc.
-    chunk_data: dict[str, Any]
-
-
-class ResultSummary(BaseModel):
-    """Summary of an evaluation result for listing."""
-
-    result_id: str
-    scenario_name: str
-    timestamp: datetime
-    status: SimulationStatus
-    tasks_completed: int
-    total_tasks: int
-    success_rate: float
-    total_turns: int
-    total_tokens: int = 0
-    total_cost_usd: float = 0.0
-
-
-class ResultsListResponse(BaseModel):
-    """Response for listing evaluation results."""
-
-    results: list[ResultSummary]
-    total: int
-
-
-class TestPersonaRequest(BaseModel):
-    """Request to test how a persona would ask a task."""
-
-    task: str
-    persona: str | None = None
-    scenario_name: str | None = None
-    task_index: int | None = None
-    model: str | None = None
-
-
-class TestPersonaResponse(BaseModel):
-    """Response with the generated persona message."""
-
-    message: str
-    persona: str | None = None
-    model: str
-
-
-class PersonaSummary(BaseModel):
-    """Summary of a persona for listing."""
-
-    name: str
-    description: str
-
-
-class PersonasListResponse(BaseModel):
-    """Response for listing personas."""
-
-    personas: list[PersonaSummary]
-    total: int
-
-
-class ScenarioRunSummary(BaseModel):
-    """Summary of a single scenario within a batch run."""
-
-    id: str  # Unique ID for this scenario run (scenario + persona + run_id)
-    scenario_name: str
-    persona: str | None = None
-    status: SimulationStatus
-    start_time: datetime
-    end_time: datetime | None = None
-    total_turns: int = 0
-    total_tasks: int = 0
-    tasks_completed: int = 0
-    success_rate: float = 0.0
-    total_tokens: int = 0
-    total_cost_usd: float = 0.0
-    error: str | None = None
-
-
-class SimulationRunSummary(BaseModel):
-    """Summary of a batch simulation run containing multiple scenarios."""
-
-    id: str
-    timestamp: datetime
-    version: str = "current"
-    status: SimulationStatus
-    scenario_runs: list[ScenarioRunSummary]
-    total_scenarios: int
-    completed_scenarios: int = 0
-    failed_scenarios: int = 0
-    total_tokens: int = 0
-    total_cost_usd: float = 0.0
-    overall_success_rate: float = 0.0
-
-
-class SimulationRunsListResponse(BaseModel):
-    """Response for listing simulation runs."""
-
-    runs: list[SimulationRunSummary]
-    total: int
-
-
-class CheckerResultItemResponse(BaseModel):
-    """Result of a single checker evaluation."""
-
-    type: str
-    completed: bool
-    reason: str
-
-
-class TurnResultResponse(BaseModel):
-    """Turn result with conversation data."""
-
-    turn_index: int
-    task_index: int
-    user_message: str
-    assistant_message: str
-    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
-    task_completed: bool = False
-    task_completed_reason: str = ""
-    token_usage: dict[str, int] | None = None
-    latency_ms: float | None = None
-    checkers: list[CheckerResultItemResponse] = Field(default_factory=list)
-    checker_mode: str = "all"
-
-
-class TaskResultResponse(BaseModel):
-    """Task result."""
-
-    task_index: int
-    description: str
-
-
-class ResponseChunkResponse(BaseModel):
-    """Response chunk from ChatInterface stream."""
-
-    turn_index: int
-    task_index: int
-    chunk_index: int
-    chunk_type: str
-    chunk_data: dict[str, Any]
-
-
-class ScenarioRunDetail(BaseModel):
-    """Full scenario run details including turns and tasks."""
-
-    id: str  # Unique ID for this scenario run
-    scenario_name: str
-    persona: str | None = None
-    status: SimulationStatus
-    start_time: datetime
-    end_time: datetime | None = None
-    turns: list[TurnResultResponse] = Field(default_factory=list)
-    tasks: list[TaskResultResponse] = Field(default_factory=list)
-    response_chunks: list[ResponseChunkResponse] = Field(default_factory=list)
-    metrics: dict[str, Any] | None = None
-    error: str | None = None
-
-
-class SimulationRunDetail(BaseModel):
-    """Full simulation run details."""
-
-    id: str
-    timestamp: datetime
-    version: str = "current"
-    status: SimulationStatus
-    scenario_runs: list[ScenarioRunDetail]
-    total_scenarios: int
-    completed_scenarios: int = 0
-    failed_scenarios: int = 0
-    total_tokens: int = 0
-    total_cost_usd: float = 0.0
-    overall_success_rate: float = 0.0