ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -40,7 +40,7 @@ class ConversationLogger:
             f.write(f"Tasks: {len(scenario.tasks)}\n")
             for i, task in enumerate(scenario.tasks, 1):
                 f.write(f" Task {i}: {task.task}\n")
-                f.write(f" Checkers: {task.checkers.model_dump_json()}\n")
+                f.write(f" Expected: {task.expected_result}\n")
             f.write(f"Agent model: {agent_model_name or 'default'}\n")
             f.write(f"Simulated user model: {sim_user_model_name or 'default'}\n")
             f.write(f"Goal checker model: {checker_model_name or 'default'}\n")
@@ -9,19 +9,9 @@ from ragbits.evaluate.agent_simulation.metrics.collectors import (
     CompositeMetricCollector,
     MetricCollector,
 )
-from ragbits.evaluate.agent_simulation.metrics.deepeval import (
-    DeepEvalAllMetricsCollector,
-    DeepEvalCompletenessMetricCollector,
-    DeepEvalKnowledgeRetentionMetricCollector,
-    DeepEvalRelevancyMetricCollector,
-)
 
 __all__ = [
     "CompositeMetricCollector",
-    "DeepEvalAllMetricsCollector",
-    "DeepEvalCompletenessMetricCollector",
-    "DeepEvalKnowledgeRetentionMetricCollector",
-    "DeepEvalRelevancyMetricCollector",
     "LatencyMetricCollector",
     "MetricCollector",
     "TokenUsageMetricCollector",
@@ -3,28 +3,24 @@
 from __future__ import annotations
 
 import time
-from functools import reduce
 from typing import TYPE_CHECKING, Any
 
-from ragbits.chat.interface.types import ChatResponseUnion, TextResponse
-from ragbits.core.llms.base import Usage
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult
 
 
-class LatencyMetricCollector(MetricCollector):
+class LatencyMetricCollector:
     """Tracks response latency per turn.
 
     Measures the wall-clock time from turn start to turn end,
     providing average, min, max, and per-turn latency metrics.
 
     Example:
+        >>> collector = LatencyMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[LatencyMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["latency_avg_ms"])
     """
@@ -33,7 +29,6 @@ class LatencyMetricCollector(MetricCollector):
         """Initialize the latency collector."""
         self._turn_start: float | None = None
         self._latencies: list[float] = []
-        self._times_to_first_token: dict[int, float] = {}
 
     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Record the start time for this turn.
@@ -45,20 +40,6 @@ class LatencyMetricCollector(MetricCollector):
         """
         self._turn_start = time.perf_counter()
 
-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Record time to first token on first text response.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (unused).
-            response: Response chunk from chat interface.
-        """
-        if turn_index not in self._times_to_first_token and isinstance(response, TextResponse):
-            self._times_to_first_token[turn_index] = (time.perf_counter() - self._turn_start) * 1000
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Calculate and store the latency for this turn.
 
@@ -80,47 +61,41 @@ class LatencyMetricCollector(MetricCollector):
             Dictionary with latency_avg_ms, latency_max_ms, latency_min_ms,
             and latency_per_turn_ms.
         """
-        ttfts = list(self._times_to_first_token.values())
-
-        rv = {}
-        if self._latencies:
-            rv.update(
-                {
-                    "latency_avg_ms": sum(self._latencies) / len(self._latencies),
-                    "latency_max_ms": max(self._latencies),
-                    "latency_min_ms": min(self._latencies),
-                }
-            )
-        if ttfts:
-            rv.update(
-                {
-                    "time_to_first_token_avg_ms": sum(ttfts) / len(ttfts),
-                    "time_to_first_token_max_ms": max(ttfts),
-                    "time_to_first_token_min_ms": min(ttfts),
-                }
-            )
-
-        return rv
-
-
-class TokenUsageMetricCollector(MetricCollector):
+        if not self._latencies:
+            return {}
+
+        return {
+            "latency_avg_ms": sum(self._latencies) / len(self._latencies),
+            "latency_max_ms": max(self._latencies),
+            "latency_min_ms": min(self._latencies),
+            "latency_per_turn_ms": self._latencies.copy(),
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_start = None
+        self._latencies = []
+
+
+class TokenUsageMetricCollector:
     """Tracks token usage and estimated cost per turn.
 
     Aggregates token counts from each turn to provide total and
     per-turn token usage statistics.
 
     Example:
+        >>> collector = TokenUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[TokenUsageMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tokens_total"])
     """
 
     def __init__(self) -> None:
         """Initialize the token usage collector."""
-        self._usage: dict[int, Usage] = {}
+        self._turn_tokens: list[dict[str, int]] = []
 
     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """No-op for token collector.
@@ -138,7 +113,10 @@ class TokenUsageMetricCollector(MetricCollector):
         Args:
             turn_result: The result of the completed turn.
         """
-        self._usage[turn_result.turn_index] = turn_result.token_usage
+        if turn_result.token_usage:
+            self._turn_tokens.append(turn_result.token_usage.copy())
+        else:
+            self._turn_tokens.append({"total": 0, "prompt": 0, "completion": 0})
 
     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Return aggregated token usage metrics.
@@ -148,33 +126,40 @@ class TokenUsageMetricCollector(MetricCollector):
 
         Returns:
             Dictionary with tokens_total, tokens_prompt, tokens_completion,
-            tokens_avg_per_turn, and estimated_usd.
+            tokens_avg_per_turn, and tokens_per_turn.
         """
-        if not self._usage:
+        if not self._turn_tokens:
             return {}
 
-        total_usage = reduce(lambda a, b: a + b, self._usage.values())
+        total = sum(t.get("total", 0) for t in self._turn_tokens)
+        prompt = sum(t.get("prompt", 0) for t in self._turn_tokens)
+        completion = sum(t.get("completion", 0) for t in self._turn_tokens)
 
         return {
-            "tokens_total": total_usage.total_tokens,
-            "tokens_prompt": total_usage.prompt_tokens,
-            "tokens_completion": total_usage.completion_tokens,
-            "tokens_avg_per_turn": total_usage.total_tokens / len(self._usage),
-            "estimated_usd": total_usage.estimated_cost,
+            "tokens_total": total,
+            "tokens_prompt": prompt,
+            "tokens_completion": completion,
+            "tokens_avg_per_turn": total / len(self._turn_tokens) if self._turn_tokens else 0,
+            "tokens_per_turn": [t.get("total", 0) for t in self._turn_tokens],
         }
 
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_tokens = []
 
-class ToolUsageMetricCollector(MetricCollector):
+
+class ToolUsageMetricCollector:
     """Tracks tool call patterns during the conversation.
 
     Records which tools were called, how often, and on which turns,
     providing insights into agent behavior.
 
     Example:
+        >>> collector = ToolUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...     config=SimulationConfig(metrics=[ToolUsageMetricCollector]),
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tools_unique"])
     """
@@ -229,3 +214,8 @@ class ToolUsageMetricCollector(MetricCollector):
             "tools_per_turn": self._tool_calls.copy(),
             "turns_with_tools": turns_with_tools,
         }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._tool_calls = []
+        self._tool_counts = {}
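
Taken together, these hunks drop the streamed-response/time-to-first-token path and the Usage-based accounting in favor of plain per-turn lists, and every built-in collector gains a reset() hook. A minimal usage sketch following the updated docstring examples; run_simulation, scenario, and chat are assumed to be provided by the surrounding harness:

    # Sketch based on the docstring examples above, not package documentation.
    latency = LatencyMetricCollector()
    tokens = TokenUsageMetricCollector()

    result = await run_simulation(
        scenario=scenario,  # assumed defined elsewhere
        chat=chat,          # assumed defined elsewhere
        metric_collectors=[latency, tokens],
    )
    print(result.metrics.custom["latency_avg_ms"])
    print(result.metrics.custom["tokens_total"])

    # Instances are now passed directly rather than re-created per run, so
    # reset() them before reusing the same collectors for another conversation.
    latency.reset()
    tokens.reset()
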
@@ -2,16 +2,14 @@
 
 from __future__ import annotations
 
-from abc import ABC, abstractmethod
-from typing import TYPE_CHECKING, Any
-
-from ragbits.chat.interface.types import ChatResponseUnion
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable
 
 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult
 
 
-class MetricCollector(ABC):
+@runtime_checkable
+class MetricCollector(Protocol):
     """Protocol for collecting metrics during conversation simulation.
 
     Implement this protocol to create custom metric collectors that can
@@ -33,7 +31,7 @@ class MetricCollector(ABC):
         ...     pass
     """
 
-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:  # noqa: PLR6301
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Called before agent processes a turn.
 
         Args:
@@ -41,30 +39,16 @@ class MetricCollector(ABC):
             task_index: 0-based index of the current task.
             user_message: The user message being sent to the agent.
         """
-        return
-
-    def on_streamed_response(  # noqa: PLR6301
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Called after receiving chunk from chat interface.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        return
+        ...
 
-    def on_turn_end(self, turn_result: TurnResult) -> None:  # noqa: PLR6301
+    def on_turn_end(self, turn_result: TurnResult) -> None:
         """Called after a turn completes.
 
         Args:
             turn_result: The result of the completed turn.
         """
-        return
+        ...
 
-    @abstractmethod
     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Called when the conversation ends, returns computed metrics.
 
@@ -74,6 +58,11 @@ class MetricCollector(ABC):
         Returns:
             Dictionary of metric names to values.
         """
+        ...
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        ...
 
 
 class CompositeMetricCollector:
@@ -123,20 +112,6 @@ class CompositeMetricCollector:
         for collector in self._collectors:
             collector.on_turn_start(turn_index, task_index, user_message)
 
-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Delegate to all child collectors.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        for collector in self._collectors:
-            collector.on_streamed_response(turn_index, task_index, user_message, response)
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Delegate to all child collectors.
 
@@ -160,3 +135,8 @@ class CompositeMetricCollector:
             metrics = collector.on_conversation_end(all_turns)
             combined.update(metrics)
         return combined
+
+    def reset(self) -> None:
+        """Reset all child collectors."""
+        for collector in self._collectors:
+            collector.reset()
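
Since MetricCollector is now a runtime_checkable Protocol rather than an ABC, custom collectors no longer inherit from it: any class providing the four hook methods satisfies the contract structurally, which is why the built-in collectors above dropped their base class. A minimal sketch of a collector under the new contract; TurnCountCollector is a hypothetical example, not part of the package:

    from typing import Any

    from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
    from ragbits.evaluate.agent_simulation.results import TurnResult


    class TurnCountCollector:
        """Hypothetical collector: counts completed turns, no subclassing needed."""

        def __init__(self) -> None:
            self._turns = 0

        def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
            ...  # nothing to do at turn start

        def on_turn_end(self, turn_result: TurnResult) -> None:
            self._turns += 1

        def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
            return {"turn_count": self._turns}

        def reset(self) -> None:
            self._turns = 0


    # runtime_checkable allows a structural isinstance() check against the Protocol.
    assert isinstance(TurnCountCollector(), MetricCollector)
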
@@ -1,217 +1,37 @@
 """Data models for agent simulation scenarios."""
 
-from __future__ import annotations
+from dataclasses import dataclass
 
-from collections.abc import Callable
-from typing import TYPE_CHECKING, Any, Literal
 
-from pydantic import BaseModel, Field, create_model
-
-from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
-if TYPE_CHECKING:
-    from rich.console import Console
-
-    from ragbits.evaluate.agent_simulation.checkers import BaseCheckerConfig
-    from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-
-class Turn(BaseModel):
+@dataclass
+class Turn:
     """A single conversation turn between user and assistant."""
 
     user: str
     assistant: str
 
 
-class Task(BaseModel):
-    """A singular task or goal that simulated user is destined to complete."""
-
-    task: str = Field(
-        ...,
-        description="A natural language description of the objective that simulated user needs to complete.",
-    )
-    checkers: list[dict[str, Any]] = Field(
-        default_factory=list,
-        description="List of checker configurations. Each dict must have 'type' key and checker-specific fields.",
-    )
-    checker_mode: Literal["all", "any"] = Field(
-        default="all",
-        description="How to combine multiple checkers: 'all' (all must pass), 'any' (one must pass)",
-    )
-
-    def get_parsed_checkers(self) -> list[BaseCheckerConfig]:
-        """Parse checker configs into typed checker instances.
+@dataclass
+class Task:
+    """A single task with its expected result."""
 
-        Returns:
-            List of parsed checker config instances.
-        """
-        from ragbits.evaluate.agent_simulation.checkers import parse_checker_config
+    task: str
+    expected_result: str
+    expected_tools: list[str] | None = None
+    """Optional list of tool names that should be used to complete this task."""
 
-        return [parse_checker_config(c) for c in self.checkers]
 
-    def get_checker_summary(self) -> str:
-        """Get a human-readable summary of configured checkers.
-
-        Returns:
-            Summary string describing the checkers.
-        """
-        if not self.checkers:
-            return "no checkers"
-
-        types = [c.get("type", "unknown") for c in self.checkers]
-        return f"{', '.join(types)} ({self.checker_mode})"
-
-
-class Scenario(BaseModel):
+@dataclass
+class Scenario:
     """A scenario containing multiple tasks to be completed sequentially."""
 
-    name: str = Field(..., description="Short name identyfing the scenario")
-    tasks: list[Task] = Field(
-        default_factory=list,
-        description=(
-            "List of tasks that will be executed during the scenario. "
-            "Simulating LLM will use this list to determine next steps. "
-            "It can be both treated as conversation outline or a checklist "
-            "that should be realized by simulated user. "
-            "Expected result will be used to judge if specific exchange of messages "
-            "was aligned with system expectactions. "
-        ),
-    )
-
-    turn_limit: int | None = Field(
-        None,
-        description=(
-            "Limit how many turns can be ran before failing the scenario. "
-            "If set here it will override default settings."
-        ),
-    )
-    turn_limit_per_task: int | None = Field(
-        None,
-        description="Limit number of turns, this time per task. Specific tasks can override their limits.",
-    )
-
-    group: str | None = Field(
-        None,
-        description=(
-            "Scenarios may be coupled together by being in the same group. "
-            "Scenarios in groups are often executed one after another, "
-            "may have some sort of dependencies or inference. "
-            "In final results aggregated group metrics can be found."
-        ),
-    )
+    name: str
+    tasks: list[Task]
 
-    def display(self, console: Console | None = None) -> None:
-        """Display scenario with rich panel."""
-        from ragbits.evaluate.agent_simulation.display import display_scenario
 
-        display_scenario(self, console)
-
-    def live_display(self, console: Console | None = None) -> ScenarioLiveDisplay:
-        """Create a live display for this scenario."""
-        from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-        return ScenarioLiveDisplay(self, console)
-
-    @classmethod
-    def dto(cls) -> type[Scenario]:
-        """Create a DTO class for serialization."""
-        if not hasattr(cls, "_dto_cls"):
-            cls._dto_cls = create_model(
-                "ScenarioDTO",
-                __base__=cls,
-                name=(str, cls.__pydantic_fields__["name"]),
-                tasks=(list[Task], cls.__pydantic_fields__["tasks"]),
-            )
-        return cls._dto_cls
-
-
-class Personality(BaseModel):
+@dataclass
+class Personality:
     """A personality definition for the simulated user."""
 
-    name: str = Field(
-        ...,
-        description="A descriptive name that will help to identify this specific instance of personality.",
-    )
-    description: str = Field(
-        ...,
-        description=(
-            "Detailed description of user behaviour, style of communication, "
-            "internal motives, language, attitute, etc."
-        ),
-    )
-
-
-class SimulationConfig(BaseModel):
-    """Configuration for running agent simulations.
-
-    Groups parameters that are commonly passed between simulation components.
-    Excludes instance-specific objects like ChatInterface, callbacks, and streams.
-    """
-
-    model_config = {"arbitrary_types_allowed": True}
-
-    max_turns_scenario: int = Field(
-        default=15,
-        description="Maximum number of conversation turns for the entire scenario.",
-    )
-    max_turns_task: int | None = Field(
-        default=4,
-        description="Maximum number of conversation turns per task (None for no limit).",
-    )
-    log_file: str | None = Field(
-        default=None,
-        description="Optional path to log file.",
-    )
-    agent_model_name: str | None = Field(
-        default=None,
-        description="Optional override for agent LLM model name.",
-    )
-    sim_user_model_name: str | None = Field(
-        default=None,
-        description="Optional override for simulated user LLM model name.",
-    )
-    checker_model_name: str | None = Field(
-        default=None,
-        description="Optional override for goal checker LLM model name.",
-    )
-    default_model: str = Field(
-        default="gpt-4o-mini",
-        description="Default LLM model name when specific models not provided.",
-    )
-    api_key: str = Field(
-        default="",
-        description="API key for LLM.",
-    )
-    user_message_prefix: str = Field(
-        default="",
-        description="Optional prefix to add to user messages before sending to agent.",
-    )
-    domain_context: DomainContext | None = Field(
-        default=None,
-        description="Optional domain context for goal checking (currency, locale, business rules).",
-    )
-    data_snapshot: DataSnapshot | None = Field(
-        default=None,
-        description="Optional data snapshot to ground simulated user requests to available data.",
-    )
-    metrics: list[type[MetricCollector] | Callable[[], MetricCollector]] | None = Field(
-        default=None,
-        description=(
-            "Optional list of metric collector factories. Each item can be either a class "
-            "(e.g., LatencyMetricCollector) or a callable that returns a collector instance "
-            "(e.g., lambda: CustomCollector(arg=value)). Fresh instances are created for each run."
-        ),
-    )
-
-    def create_metric_collectors(self) -> list[MetricCollector]:
-        """Create fresh metric collector instances for a simulation run.
-
-        Each call creates new instances to ensure concurrent runs don't share state.
-
-        Returns:
-            List of freshly instantiated metric collectors.
-        """
-        if not self.metrics:
-            return []
-        return [factory() for factory in self.metrics]
+    name: str
+    description: str
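
The models module thus shrinks from pydantic models carrying checker configuration, turn limits, display helpers, and the whole SimulationConfig down to four plain dataclasses, with the expected result and expected tools stated inline on each Task. A minimal construction sketch against the new shape; the module path, scenario name, and tool name are illustrative assumptions:

    from ragbits.evaluate.agent_simulation.models import Scenario, Task  # path assumed

    scenario = Scenario(
        name="refund-flow",  # hypothetical name
        tasks=[
            Task(
                task="Ask for a refund for the most recent order",
                expected_result="The agent confirms a refund has been initiated",
                expected_tools=["issue_refund"],  # hypothetical tool name
            ),
        ],
    )
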