ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,343 +0,0 @@
1
- """API types for the evaluation UI endpoints."""
2
-
3
- from datetime import datetime
4
- from typing import Any
5
-
6
- from pydantic import BaseModel, Field
7
-
8
- from ragbits.evaluate.agent_simulation.results import SimulationStatus
9
-
10
-
11
class TaskDetail(BaseModel):
    """Task details for scenario display and editing."""

    # Required; the only field without a default.
    task: str
    # Free-form checker dicts; default_factory gives each instance a new list.
    checkers: list[dict[str, Any]] = Field(default_factory=list)
    # Falls back to "all" when omitted.
    checker_mode: str = Field(default="all")
17
-
18
-
19
class ScenarioSummary(BaseModel):
    """Summary of a scenario for listing."""

    # Both of these are required.
    name: str
    num_tasks: int
    # Optional; None when no group is supplied.
    group: str | None = Field(default=None)
25
-
26
-
27
class ScenarioFileSummary(BaseModel):
    """Summary of a scenario file with its scenarios."""

    # Required.
    filename: str
    # Optional; None when no group is supplied.
    group: str | None = Field(default=None)
    # Required list of ScenarioSummary entries (no default, so it must be passed).
    scenarios: list[ScenarioSummary]
33
-
34
-
35
class ScenarioDetail(BaseModel):
    """Full scenario details for viewing and editing."""

    # Required.
    name: str
    # Required list of TaskDetail entries.
    tasks: list[TaskDetail]
    # Optional; None when no group is supplied.
    group: str | None = Field(default=None)
41
-
42
-
43
class EvalConfigResponse(BaseModel):
    """Configuration response with available scenarios."""

    # Required flat list of scenario summaries.
    available_scenarios: list[ScenarioSummary]
    # Per-file summaries; a new empty list is created when omitted.
    scenario_files: list[ScenarioFileSummary] = Field(default_factory=list)
    # Required; typed as a plain string.
    scenarios_dir: str
49
-
50
-
51
class RunEvaluationConfig(BaseModel):
    """Configuration for running a simulation via API.

    This is the frontend-facing config model with validation constraints.
    """

    # Validated to the inclusive range 1..100; 15 when omitted.
    max_turns_scenario: int = Field(default=15, ge=1, le=100)
    # Validated to the inclusive range 1..50 when an int is given; the
    # annotation also admits None. 4 when omitted.
    max_turns_task: int | None = Field(default=4, ge=1, le=50)

    # Optional model names; both default to None.
    sim_user_model_name: str | None = Field(default=None)
    checker_model_name: str | None = Field(default=None)
    # presumably used when the two names above are None — TODO confirm at the call site
    default_model: str = Field(default="gpt-4o-mini")
62
-
63
-
64
class RunEvaluationRequest(BaseModel):
    """Request to start an evaluation run.

    If personas is provided, creates a matrix run: each scenario × each persona.
    If personas is empty or None, runs each scenario once without a persona.
    """

    # Required (Ellipsis default) and must contain at least one name.
    scenario_names: list[str] = Field(
        ...,
        min_length=1,
    )
    # Optional; None when omitted.
    personas: list[str] | None = Field(
        default=None,
        description="Personas for matrix runs",
    )
    # A RunEvaluationConfig built from its own defaults when omitted.
    config: RunEvaluationConfig = Field(
        default_factory=RunEvaluationConfig,
    )
74
-
75
-
76
class RunStartResponse(BaseModel):
    """Response when starting an evaluation run."""

    # Both fields are required; neither declares a default.
    run_id: str
    scenarios: list[str]
81
-
82
-
83
class ProgressUpdate(BaseModel):
    """Base progress update for SSE streaming."""

    # No default here; the subclasses in this file each override it with a
    # fixed string default.
    type: str
    run_id: str
    # Unique ID for the scenario run (scenario + persona)
    scenario_run_id: str
    scenario_name: str
    # Persona used for this scenario run
    persona: str | None = Field(default=None)
91
-
92
-
93
class StatusProgressUpdate(ProgressUpdate):
    """Status change progress update."""

    # Overrides the inherited field with a default of "status".
    type: str = Field(default="status")
    # Required.
    status: SimulationStatus

    # Optional position details; each is None when omitted.
    current_turn: int | None = Field(default=None)
    current_task_index: int | None = Field(default=None)
    current_task: str | None = Field(default=None)
101
-
102
-
103
class TurnProgressUpdate(ProgressUpdate):
    """Turn completion progress update."""

    # Overrides the inherited field with a default of "turn".
    type: str = Field(default="turn")

    # Required indices and message texts.
    turn_index: int
    task_index: int
    user_message: str
    assistant_message: str

    # Free-form tool-call dicts; new empty list per instance when omitted.
    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
    # False / empty string when omitted.
    task_completed: bool = Field(default=False)
    task_completed_reason: str = Field(default="")
    # Free-form checker dicts; new empty list per instance when omitted.
    checkers: list[dict[str, Any]] = Field(default_factory=list)
    checker_mode: str = Field(default="all")
116
-
117
-
118
class TaskCompleteUpdate(ProgressUpdate):
    """Task completion progress update."""

    # Overrides the inherited field with a default of "task_complete".
    type: str = Field(default="task_complete")

    # The remaining fields are all required.
    task_index: int
    task_description: str
    turns_taken: int
    reason: str
126
-
127
-
128
class CompletionUpdate(ProgressUpdate):
    """Scenario completion progress update."""

    # Overrides the inherited field with a default of "complete".
    type: str = Field(default="complete")

    # The remaining fields are all required.
    result_id: str
    status: SimulationStatus
    # NOTE(review): no range constraint is declared; confirm whether this is
    # a 0-1 fraction or a percentage.
    success_rate: float
    total_turns: int
    total_tasks: int
    tasks_completed: int
138
-
139
-
140
class ErrorUpdate(ProgressUpdate):
    """Error progress update."""

    # Overrides the inherited field with a default of "error".
    type: str = Field(default="error")
    # Required; typed as a plain string.
    error: str
145
-
146
-
147
class SourceReference(BaseModel):
    """A source/reference document from the chat response."""

    # Both required.
    title: str
    content: str
    # Optional; None when omitted.
    url: str | None = Field(default=None)
153
-
154
-
155
class SourceUpdate(ProgressUpdate):
    """Source/reference document progress update."""

    # Overrides the inherited field with a default of "source".
    type: str = Field(default="source")

    # Required indices.
    turn_index: int
    task_index: int
    # Required nested SourceReference model.
    source: SourceReference
162
-
163
-
164
class ResponseChunkUpdate(ProgressUpdate):
    """Real-time ChatInterface response chunk progress update.

    Streams raw response chunks from the ChatInterface as they arrive,
    enabling real-time visibility into all response types.
    """

    # Overrides the inherited field with a default of "response_chunk".
    type: str = Field(default="response_chunk")

    # Required indices.
    turn_index: int
    task_index: int
    # e.g., "text", "reference", "tool_call", "usage", "live_update", etc.
    chunk_type: str
    # Required free-form payload dict.
    chunk_data: dict[str, Any]
176
-
177
-
178
class ResultSummary(BaseModel):
    """Summary of an evaluation result for listing."""

    # Required identification and status fields.
    result_id: str
    scenario_name: str
    # NOTE(review): no timezone requirement is declared here — confirm
    # whether producers send aware or naive datetimes.
    timestamp: datetime
    status: SimulationStatus

    # Required counters and rate.
    tasks_completed: int
    total_tasks: int
    success_rate: float
    total_turns: int

    # Usage figures; zero when omitted.
    total_tokens: int = Field(default=0)
    total_cost_usd: float = Field(default=0.0)
191
-
192
-
193
class ResultsListResponse(BaseModel):
    """Response for listing evaluation results."""

    # Required list of ResultSummary entries.
    results: list[ResultSummary]
    # Required; the model does not tie this to len(results).
    total: int
198
-
199
-
200
class TestPersonaRequest(BaseModel):
    """Request to test how a persona would ask a task."""

    # The only required field.
    task: str

    # Everything below is optional and is None when omitted.
    persona: str | None = Field(default=None)
    scenario_name: str | None = Field(default=None)
    task_index: int | None = Field(default=None)
    model: str | None = Field(default=None)
208
-
209
-
210
class TestPersonaResponse(BaseModel):
    """Response with the generated persona message."""

    # Required.
    message: str
    # Optional; None when omitted.
    persona: str | None = Field(default=None)
    # Required here, unlike the optional `model` on TestPersonaRequest.
    model: str
216
-
217
-
218
class PersonaSummary(BaseModel):
    """Summary of a persona for listing."""

    # Both fields are required strings.
    name: str
    description: str
223
-
224
-
225
class PersonasListResponse(BaseModel):
    """Response for listing personas."""

    # Required list of PersonaSummary entries.
    personas: list[PersonaSummary]
    # Required; the model does not tie this to len(personas).
    total: int
230
-
231
-
232
class ScenarioRunSummary(BaseModel):
    """Summary of a single scenario within a batch run."""

    # Unique ID for this scenario run (scenario + persona + run_id)
    id: str
    scenario_name: str
    # Optional; None when omitted.
    persona: str | None = Field(default=None)
    status: SimulationStatus

    # start_time is required; end_time is None when omitted.
    start_time: datetime
    end_time: datetime | None = Field(default=None)

    # Counters and rate; all zero when omitted.
    total_turns: int = Field(default=0)
    total_tasks: int = Field(default=0)
    tasks_completed: int = Field(default=0)
    success_rate: float = Field(default=0.0)

    # Usage figures; zero when omitted.
    total_tokens: int = Field(default=0)
    total_cost_usd: float = Field(default=0.0)

    # Optional; None when omitted.
    error: str | None = Field(default=None)
248
-
249
-
250
class SimulationRunSummary(BaseModel):
    """Summary of a batch simulation run containing multiple scenarios."""

    # Required identification.
    id: str
    timestamp: datetime
    # "current" when omitted.
    version: str = Field(default="current")
    status: SimulationStatus

    # Required list of per-scenario summaries and required scenario count.
    scenario_runs: list[ScenarioRunSummary]
    total_scenarios: int

    # Aggregate figures; all zero when omitted.
    completed_scenarios: int = Field(default=0)
    failed_scenarios: int = Field(default=0)
    total_tokens: int = Field(default=0)
    total_cost_usd: float = Field(default=0.0)
    overall_success_rate: float = Field(default=0.0)
264
-
265
-
266
class SimulationRunsListResponse(BaseModel):
    """Response for listing simulation runs."""

    # Required list of SimulationRunSummary entries.
    runs: list[SimulationRunSummary]
    # Required; the model does not tie this to len(runs).
    total: int
271
-
272
-
273
class CheckerResultItemResponse(BaseModel):
    """Result of a single checker evaluation."""

    # All three fields are required; none declares a default.
    type: str
    completed: bool
    reason: str
279
-
280
-
281
class TurnResultResponse(BaseModel):
    """Turn result with conversation data."""

    # Required indices and message texts.
    turn_index: int
    task_index: int
    user_message: str
    assistant_message: str

    # Free-form tool-call dicts; new empty list per instance when omitted.
    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
    # False / empty string when omitted.
    task_completed: bool = Field(default=False)
    task_completed_reason: str = Field(default="")

    # Optional measurements; both None when omitted.
    token_usage: dict[str, int] | None = Field(default=None)
    latency_ms: float | None = Field(default=None)

    # Typed checker results here, unlike the free-form dicts on
    # TurnProgressUpdate.checkers.
    checkers: list[CheckerResultItemResponse] = Field(default_factory=list)
    checker_mode: str = Field(default="all")
295
-
296
-
297
class TaskResultResponse(BaseModel):
    """Task result."""

    # Both fields are required; neither declares a default.
    task_index: int
    description: str
302
-
303
-
304
class ResponseChunkResponse(BaseModel):
    """Response chunk from ChatInterface stream."""

    # All fields are required.
    turn_index: int
    task_index: int
    chunk_index: int
    chunk_type: str
    # Free-form payload dict.
    chunk_data: dict[str, Any]
312
-
313
-
314
class ScenarioRunDetail(BaseModel):
    """Full scenario run details including turns and tasks."""

    # Unique ID for this scenario run
    id: str
    scenario_name: str
    # Optional; None when omitted.
    persona: str | None = Field(default=None)
    status: SimulationStatus

    # start_time is required; end_time is None when omitted.
    start_time: datetime
    end_time: datetime | None = Field(default=None)

    # Nested collections; each gets a new empty list when omitted.
    turns: list[TurnResultResponse] = Field(default_factory=list)
    tasks: list[TaskResultResponse] = Field(default_factory=list)
    response_chunks: list[ResponseChunkResponse] = Field(default_factory=list)

    # Optional free-form metrics dict and error text; None when omitted.
    metrics: dict[str, Any] | None = Field(default=None)
    error: str | None = Field(default=None)
328
-
329
-
330
class SimulationRunDetail(BaseModel):
    """Full simulation run details."""

    # Required identification.
    id: str
    timestamp: datetime
    # "current" when omitted.
    version: str = Field(default="current")
    status: SimulationStatus

    # Required list of full per-scenario details and required scenario count.
    scenario_runs: list[ScenarioRunDetail]
    total_scenarios: int

    # Aggregate figures; all zero when omitted.
    completed_scenarios: int = Field(default=0)
    failed_scenarios: int = Field(default=0)
    total_tokens: int = Field(default=0)
    total_cost_usd: float = Field(default=0.0)
    overall_success_rate: float = Field(default=0.0)