ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,8 +5,6 @@ from datetime import datetime
 from enum import Enum
 from typing import Any
 
-from ragbits.core.llms.base import Usage
-
 
 class SimulationStatus(str, Enum):
     """Status of a simulation run."""
@@ -17,60 +15,6 @@ class SimulationStatus(str, Enum):
     TIMEOUT = "timeout"
 
 
-@dataclass
-class CheckerResultItem:
-    """Result of a single checker evaluation."""
-
-    type: str
-    completed: bool
-    reason: str
-
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary."""
-        return {"type": self.type, "completed": self.completed, "reason": self.reason}
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "CheckerResultItem":
-        """Create from dictionary."""
-        return cls(
-            type=data.get("type", data.get("checker_type", "unknown")),
-            completed=data.get("completed", False),
-            reason=data.get("reason", ""),
-        )
-
-
-@dataclass
-class ResponseChunk:
-    """A response chunk from the ChatInterface stream."""
-
-    turn_index: int
-    task_index: int
-    chunk_index: int
-    chunk_type: str
-    chunk_data: dict[str, Any]
-
-    def to_dict(self) -> dict[str, Any]:
-        """Convert to dictionary."""
-        return {
-            "turn_index": self.turn_index,
-            "task_index": self.task_index,
-            "chunk_index": self.chunk_index,
-            "chunk_type": self.chunk_type,
-            "chunk_data": self.chunk_data,
-        }
-
-    @classmethod
-    def from_dict(cls, data: dict[str, Any]) -> "ResponseChunk":
-        """Create from dictionary."""
-        return cls(
-            turn_index=data.get("turn_index", 0),
-            task_index=data.get("task_index", 0),
-            chunk_index=data.get("chunk_index", 0),
-            chunk_type=data.get("chunk_type", "unknown"),
-            chunk_data=data.get("chunk_data", {}),
-        )
-
-
 @dataclass
 class TurnResult:
     """Result of a single conversation turn."""
@@ -82,10 +26,8 @@ class TurnResult:
     tool_calls: list[dict[str, Any]] = field(default_factory=list)
     task_completed: bool = False
     task_completed_reason: str = ""
-    token_usage: Usage = field(default_factory=Usage)
+    token_usage: dict[str, int] | None = None
     latency_ms: float | None = None
-    checkers: list[CheckerResultItem] = field(default_factory=list)
-    checker_mode: str = "all"
 
 
 @dataclass
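
With this change a turn's token usage is a plain dict (or None) instead of a Usage object, so it is JSON-serializable as-is. A minimal sketch of the new shape follows; the key names are an assumption modelled on the aggregate metric fields, not something this diff pins down.

import json

# Hypothetical per-turn usage payload under the new `dict[str, int] | None` typing.
# Key names are assumed (mirroring prompt_tokens / completion_tokens / total_tokens).
token_usage: dict[str, int] | None = {
    "prompt_tokens": 512,
    "completion_tokens": 128,
    "total_tokens": 640,
}

# No .model_dump() call is needed before dumping anymore.
print(json.dumps(token_usage))
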
@@ -94,51 +36,32 @@ class TaskResult:
 
     task_index: int
     description: str
+    expected_result: str | None
     completed: bool
     turns_taken: int
     final_reason: str
-    checkers: list[dict[str, Any]] = field(default_factory=list)
-    checker_mode: str = "all"
 
 
 @dataclass
 class ConversationMetrics:
-    """Aggregate metrics for the conversation.
-
-    All metrics are stored in a single flat dictionary. Built-in metrics include:
-    - total_turns: Number of conversation turns
-    - total_tasks: Number of tasks in the scenario
-    - tasks_completed: Number of successfully completed tasks
-    - success_rate: Ratio of completed tasks
-    - total_tokens, prompt_tokens, completion_tokens: Token usage
-    - total_cost_usd: Estimated cost
-    - latency_avg_ms, latency_min_ms, latency_max_ms: Response latency
-    - tools_total_calls, tools_unique, tools_counts: Tool usage
-
-    Additional metrics from custom collectors are merged into this dict.
-    """
-
-    metrics: dict[str, Any] = field(default_factory=dict)
-
-    @property
-    def total_turns(self) -> int:
-        """Number of conversation turns."""
-        return self.metrics.get("total_turns", 0)
-
-    @property
-    def total_tasks(self) -> int:
-        """Number of tasks in scenario."""
-        return self.metrics.get("total_tasks", 0)
-
-    @property
-    def tasks_completed(self) -> int:
-        """Number of completed tasks."""
-        return self.metrics.get("tasks_completed", 0)
+    """Aggregate metrics for the conversation."""
+
+    total_turns: int
+    total_tasks: int
+    tasks_completed: int
+    total_tokens: int = 0
+    prompt_tokens: int = 0
+    completion_tokens: int = 0
+    total_cost_usd: float = 0.0
+    deepeval_scores: dict[str, float] = field(default_factory=dict)
+    custom: dict[str, Any] = field(default_factory=dict)
 
     @property
     def success_rate(self) -> float:
         """Calculate task success rate."""
-        return self.metrics.get("success_rate", 0.0)
+        if self.total_tasks == 0:
+            return 0.0
+        return self.tasks_completed / self.total_tasks
 
 
 @dataclass
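
ConversationMetrics now stores explicit counters instead of one flat metrics dict, and success_rate is derived from them. A minimal usage sketch follows; the import path is an assumption (this diff does not show which module defines the class) and the numbers are illustrative.

# Assumed import path -- the defining module is not named in this diff.
from ragbits.evaluate.agent_simulation.results import ConversationMetrics

metrics = ConversationMetrics(
    total_turns=6,
    total_tasks=3,
    tasks_completed=2,
    total_tokens=4200,
    prompt_tokens=3000,
    completion_tokens=1200,
    total_cost_usd=0.012,
    deepeval_scores={"answer_relevancy": 0.91},
)

# success_rate is a computed property (tasks_completed / total_tasks),
# not a value read back from a stored dict.
print(round(metrics.success_rate, 3))  # 0.667
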
@@ -154,23 +77,15 @@ class SimulationResult:
     turns: list[TurnResult] = field(default_factory=list)
     tasks: list[TaskResult] = field(default_factory=list)
     metrics: ConversationMetrics | None = None
-    response_chunks: list[ResponseChunk] = field(default_factory=list)
 
     # Optional metadata
     end_time: datetime | None = None
     agent_model: str | None = None
     simulated_user_model: str | None = None
     checker_model: str | None = None
-    persona: str | None = None
+    personality: str | None = None
     error: str | None = None
 
-    # Conversation context
-    conversation_id: str | None = None
-    final_state: dict[str, Any] = field(default_factory=dict)
-
-    # Traces from the chat interface
-    traces: list[dict[str, Any]] = field(default_factory=list)
-
     def to_dict(self) -> dict[str, Any]:
         """Convert to JSON-serializable dictionary."""
         return {
@@ -181,11 +96,8 @@
             "agent_model": self.agent_model,
             "simulated_user_model": self.simulated_user_model,
             "checker_model": self.checker_model,
-            "persona": self.persona,
+            "personality": self.personality,
             "error": self.error,
-            "conversation_id": self.conversation_id,
-            "final_state": self.final_state,
-            "response_chunks": [c.to_dict() for c in self.response_chunks],
             "turns": [
                 {
                     "turn_index": t.turn_index,
@@ -195,10 +107,8 @@
                     "tool_calls": t.tool_calls,
                     "task_completed": t.task_completed,
                     "task_completed_reason": t.task_completed_reason,
-                    "token_usage": t.token_usage.model_dump() if t.token_usage else None,
+                    "token_usage": t.token_usage,
                     "latency_ms": t.latency_ms,
-                    "checkers": [c.to_dict() for c in t.checkers],
-                    "checker_mode": t.checker_mode,
                 }
                 for t in self.turns
             ],
@@ -206,16 +116,27 @@
                 {
                     "task_index": t.task_index,
                     "description": t.description,
+                    "expected_result": t.expected_result,
                     "completed": t.completed,
                     "turns_taken": t.turns_taken,
                     "final_reason": t.final_reason,
-                    "checkers": t.checkers,
-                    "checker_mode": t.checker_mode,
                 }
                 for t in self.tasks
             ],
-            "metrics": self.metrics.metrics if self.metrics else None,
-            "traces": self.traces,
+            "metrics": {
+                "total_turns": self.metrics.total_turns,
+                "total_tasks": self.metrics.total_tasks,
+                "tasks_completed": self.metrics.tasks_completed,
+                "success_rate": self.metrics.success_rate,
+                "total_tokens": self.metrics.total_tokens,
+                "prompt_tokens": self.metrics.prompt_tokens,
+                "completion_tokens": self.metrics.completion_tokens,
+                "total_cost_usd": self.metrics.total_cost_usd,
+                "deepeval_scores": self.metrics.deepeval_scores,
+                "custom": self.metrics.custom,
+            }
+            if self.metrics
+            else None,
         }
 
     @classmethod
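
The "metrics" entry of to_dict() is now built inline from those explicit fields (including the derived success_rate) and becomes None when no metrics were collected. The sketch below shows the shape of the emitted sub-dictionary; all values are made up.

# Illustrative payload only -- the keys written by the new "metrics" branch above.
metrics_payload = {
    "total_turns": 6,
    "total_tasks": 3,
    "tasks_completed": 2,
    "success_rate": 2 / 3,  # derived property, now serialized explicitly
    "total_tokens": 4200,
    "prompt_tokens": 3000,
    "completion_tokens": 1200,
    "total_cost_usd": 0.012,
    "deepeval_scores": {"answer_relevancy": 0.91},
    "custom": {},
}
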
@@ -232,8 +153,6 @@
                 task_completed_reason=t.get("task_completed_reason", ""),
                 token_usage=t.get("token_usage"),
                 latency_ms=t.get("latency_ms"),
-                checkers=[CheckerResultItem.from_dict(c) for c in t.get("checkers", [])],
-                checker_mode=t.get("checker_mode", "all"),
             )
             for t in data.get("turns", [])
         ]
@@ -242,19 +161,28 @@
             TaskResult(
                 task_index=t["task_index"],
                 description=t["description"],
+                expected_result=t.get("expected_result"),
                 completed=t["completed"],
                 turns_taken=t["turns_taken"],
                 final_reason=t["final_reason"],
-                checkers=t.get("checkers", []),
-                checker_mode=t.get("checker_mode", "all"),
             )
             for t in data.get("tasks", [])
         ]
 
         metrics_data = data.get("metrics")
-        metrics = ConversationMetrics(metrics=metrics_data) if metrics_data else None
-
-        response_chunks = [ResponseChunk.from_dict(c) for c in data.get("response_chunks", [])]
+        metrics = None
+        if metrics_data:
+            metrics = ConversationMetrics(
+                total_turns=metrics_data["total_turns"],
+                total_tasks=metrics_data["total_tasks"],
+                tasks_completed=metrics_data["tasks_completed"],
+                total_tokens=metrics_data.get("total_tokens", 0),
+                prompt_tokens=metrics_data.get("prompt_tokens", 0),
+                completion_tokens=metrics_data.get("completion_tokens", 0),
+                total_cost_usd=metrics_data.get("total_cost_usd", 0.0),
+                deepeval_scores=metrics_data.get("deepeval_scores", {}),
+                custom=metrics_data.get("custom", {}),
+            )
 
         return cls(
             scenario_name=data["scenario_name"],
@@ -264,13 +192,9 @@
             agent_model=data.get("agent_model"),
             simulated_user_model=data.get("simulated_user_model"),
             checker_model=data.get("checker_model"),
-            persona=data.get("persona", data.get("personality")),  # backwards compat
+            personality=data.get("personality"),
             error=data.get("error"),
-            conversation_id=data.get("conversation_id"),
-            final_state=data.get("final_state", {}),
             turns=turns,
             tasks=tasks,
             metrics=metrics,
-            response_chunks=response_chunks,
-            traces=data.get("traces", []),
         )
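
from_dict no longer falls back to the legacy "persona" key, so payloads written by older versions need that key renamed before loading. A hedged migration sketch follows; the helper is hypothetical, and a real payload also carries the required keys (e.g. "scenario_name") that this hunk does not show.

def migrate_payload(data: dict) -> dict:
    """Rename the legacy "persona" key, which from_dict no longer reads."""
    if "personality" not in data and "persona" in data:
        data["personality"] = data.pop("persona")
    return data

# Illustrative fragment of an old payload.
old_fragment = {"persona": "impatient customer", "error": None}
print(migrate_payload(old_fragment))  # {'error': None, 'personality': 'impatient customer'}
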
@@ -1,52 +1,26 @@
 """Scenario loading functionality for agent simulation."""
 
 import json
-from dataclasses import dataclass, field
 from pathlib import Path
 
 from ragbits.evaluate.agent_simulation.models import Personality, Scenario, Task
 
 
-@dataclass
-class ScenarioFile:
-    """Represents a loaded scenario file with its metadata."""
-
-    filename: str
-    group: str | None
-    scenarios: list[Scenario] = field(default_factory=list)
-
-
 def load_scenarios(scenarios_file: str = "scenarios.json") -> list[Scenario]:
     """Load scenarios from a JSON file.
 
-    Expected JSON format (new format with file-level group):
-    {
-        "group": "Group Name",
-        "scenarios": [
-            {
-                "name": "Scenario 1",
-                "tasks": [
-                    {
-                        "task": "task description",
-                        "checkers": [
-                            {"type": "llm", "expected_result": "expected result"},
-                            {"type": "tool_call", "tools": ["tool1", "tool2"]},
-                            {"type": "state", "checks": [{"key": "user.confirmed", "value": true}]}
-                        ],
-                        "checker_mode": "all"
-                    },
-                    ...
-                ]
-            },
-            ...
-        ]
-    }
-
-    Legacy format (array of scenarios) is still supported:
+    Expected JSON format:
     [
         {
             "name": "Scenario 1",
-            "tasks": [...]
+            "tasks": [
+                {
+                    "task": "task description",
+                    "expected_result": "expected result description",
+                    "expected_tools": ["tool1", "tool2"] # optional
+                },
+                ...
+            ]
         },
         ...
     ]
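
A small end-to-end sketch of the flat format described above: write a one-scenario file and load it back. The import path of load_scenarios is an assumption (this diff does not name the module), and the scenario content is illustrative.

import json
from pathlib import Path

# Assumed module path for the loader defined in this file.
from ragbits.evaluate.agent_simulation.scenarios import load_scenarios

Path("scenarios.json").write_text(
    json.dumps(
        [
            {
                "name": "Password reset",
                "tasks": [
                    {
                        "task": "Ask the agent to reset the account password",
                        "expected_result": "The agent confirms a reset email was sent",
                        "expected_tools": ["send_reset_email"],
                    }
                ],
            }
        ]
    ),
    encoding="utf-8",
)

scenarios = load_scenarios("scenarios.json")
print(scenarios[0].name, len(scenarios[0].tasks))
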
@@ -57,35 +31,6 @@ def load_scenarios(scenarios_file: str = "scenarios.json") -> list[Scenario]:
     Returns:
         List of Scenario objects
 
-    Raises:
-        FileNotFoundError: If the scenarios file doesn't exist
-        ValueError: If the file format is invalid
-    """
-    scenario_file = load_scenario_file(scenarios_file)
-    return scenario_file.scenarios
-
-
-def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
-    """Load scenarios from a JSON file with file-level metadata.
-
-    This function supports both the new format with file-level group:
-    {
-        "group": "Group Name",
-        "scenarios": [...]
-    }
-
-    And the legacy format (array of scenarios):
-    [
-        {"name": "Scenario 1", "tasks": [...]},
-        ...
-    ]
-
-    Args:
-        scenarios_file: Path to the JSON file containing scenarios
-
-    Returns:
-        ScenarioFile object containing scenarios and file-level metadata
-
     Raises:
         FileNotFoundError: If the scenarios file doesn't exist
         ValueError: If the file format is invalid
@@ -97,31 +42,16 @@ def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
     with scenarios_path.open("r", encoding="utf-8") as f:
         data = json.load(f)
 
-    # Determine format and extract scenarios data and file-level group
-    file_group: str | None = None
-    scenarios_data: list
-
-    if isinstance(data, dict):
-        # New format: {"group": "...", "scenarios": [...]}
-        file_group = data.get("group")
-        scenarios_data = data.get("scenarios", [])
-        if not isinstance(scenarios_data, list):
-            raise ValueError(f"'scenarios' field must be a JSON array, got {type(scenarios_data).__name__}")
-    elif isinstance(data, list):
-        # Legacy format: [...]
-        scenarios_data = data
-    else:
-        raise ValueError(f"Scenarios file must contain a JSON object or array, got {type(data).__name__}")
+    if not isinstance(data, list):
+        raise ValueError(f"Scenarios file must contain a JSON array, got {type(data).__name__}")
 
     scenarios: list[Scenario] = []
-    for scenario_data in scenarios_data:
+    for scenario_data in data:
         if not isinstance(scenario_data, dict):
             raise ValueError(f"Each scenario must be a JSON object, got {type(scenario_data).__name__}")
 
         name = scenario_data.get("name", "")
         tasks_data = scenario_data.get("tasks", [])
-        # Scenario can have its own group, or inherit from file-level group
-        scenario_group = scenario_data.get("group") or file_group
 
         if not isinstance(tasks_data, list):
             raise ValueError(f"Tasks must be a JSON array, got {type(tasks_data).__name__}")
@@ -132,24 +62,18 @@ def load_scenario_file(scenarios_file: str = "scenarios.json") -> ScenarioFile:
                 raise ValueError(f"Each task must be a JSON object, got {type(task_data).__name__}")
 
             task_desc = task_data.get("task", "")
-            checkers = task_data.get("checkers", [])
-            checker_mode = task_data.get("checker_mode", "all")
-
-            if not isinstance(checkers, list):
-                raise ValueError(f"checkers must be a list, got {type(checkers).__name__}")
-
-            tasks.append(Task(task=task_desc, checkers=checkers, checker_mode=checker_mode))
+            expected_result = task_data.get("expected_result", "")
+            expected_tools = task_data.get("expected_tools")
+            if expected_tools is not None and not isinstance(expected_tools, list):
+                raise ValueError(f"expected_tools must be a list or null, got {type(expected_tools).__name__}")
+            tasks.append(Task(task=task_desc, expected_result=expected_result, expected_tools=expected_tools))
 
-        scenarios.append(Scenario(name=name, tasks=tasks, group=scenario_group))
+        scenarios.append(Scenario(name=name, tasks=tasks))
 
     if not scenarios:
         raise ValueError(f"No scenarios found in {scenarios_path}")
 
-    return ScenarioFile(
-        filename=scenarios_path.name,
-        group=file_group,
-        scenarios=scenarios,
-    )
+    return scenarios
 
 
 def load_personalities(personalities_file: str = "personalities.json") -> list[Personality]:
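
The parsing above maps each task entry onto the Task model imported at the top of this file; it is roughly equivalent to the standalone sketch below (the task content is illustrative).

from ragbits.evaluate.agent_simulation.models import Task

task_data = {
    "task": "Ask for the current order status",
    "expected_result": "The agent reports the order status",
    "expected_tools": ["get_order_status"],  # optional; may be omitted or null
}

task = Task(
    task=task_data.get("task", ""),
    expected_result=task_data.get("expected_result", ""),
    expected_tools=task_data.get("expected_tools"),
)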