ragbits-evaluate 0.0.30rc1__py3-none-any.whl → 1.4.0.dev202602030301__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ragbits/evaluate/agent_simulation/__init__.py +4 -49
- ragbits/evaluate/agent_simulation/conversation.py +278 -663
- ragbits/evaluate/agent_simulation/logger.py +1 -1
- ragbits/evaluate/agent_simulation/metrics/__init__.py +0 -10
- ragbits/evaluate/agent_simulation/metrics/builtin.py +49 -59
- ragbits/evaluate/agent_simulation/metrics/collectors.py +17 -37
- ragbits/evaluate/agent_simulation/models.py +18 -198
- ragbits/evaluate/agent_simulation/results.py +49 -125
- ragbits/evaluate/agent_simulation/scenarios.py +19 -95
- ragbits/evaluate/agent_simulation/simulation.py +166 -72
- ragbits/evaluate/metrics/question_answer.py +25 -8
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/METADATA +2 -6
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/RECORD +14 -25
- ragbits/evaluate/agent_simulation/checkers.py +0 -591
- ragbits/evaluate/agent_simulation/display.py +0 -118
- ragbits/evaluate/agent_simulation/metrics/deepeval.py +0 -295
- ragbits/evaluate/agent_simulation/tracing.py +0 -233
- ragbits/evaluate/api.py +0 -603
- ragbits/evaluate/api_types.py +0 -343
- ragbits/evaluate/execution_manager.py +0 -451
- ragbits/evaluate/stores/__init__.py +0 -36
- ragbits/evaluate/stores/base.py +0 -98
- ragbits/evaluate/stores/file.py +0 -466
- ragbits/evaluate/stores/kv.py +0 -535
- {ragbits_evaluate-0.0.30rc1.dist-info → ragbits_evaluate-1.4.0.dev202602030301.dist-info}/WHEEL +0 -0
@@ -40,7 +40,7 @@ class ConversationLogger:
             f.write(f"Tasks: {len(scenario.tasks)}\n")
             for i, task in enumerate(scenario.tasks, 1):
                 f.write(f"  Task {i}: {task.task}\n")
-                f.write(f"
+                f.write(f"    Expected: {task.expected_result}\n")
             f.write(f"Agent model: {agent_model_name or 'default'}\n")
             f.write(f"Simulated user model: {sim_user_model_name or 'default'}\n")
             f.write(f"Goal checker model: {checker_model_name or 'default'}\n")
@@ -9,19 +9,9 @@ from ragbits.evaluate.agent_simulation.metrics.collectors import (
     CompositeMetricCollector,
     MetricCollector,
 )
-from ragbits.evaluate.agent_simulation.metrics.deepeval import (
-    DeepEvalAllMetricsCollector,
-    DeepEvalCompletenessMetricCollector,
-    DeepEvalKnowledgeRetentionMetricCollector,
-    DeepEvalRelevancyMetricCollector,
-)

 __all__ = [
     "CompositeMetricCollector",
-    "DeepEvalAllMetricsCollector",
-    "DeepEvalCompletenessMetricCollector",
-    "DeepEvalKnowledgeRetentionMetricCollector",
-    "DeepEvalRelevancyMetricCollector",
     "LatencyMetricCollector",
     "MetricCollector",
     "TokenUsageMetricCollector",
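With the deepeval module removed, the metrics package exports only the built-in collectors. A minimal sketch of the imports that remain valid after this change (assuming the package `__init__` keeps re-exporting the names listed in `__all__`; the DeepEval* collectors would now raise ImportError):

    from ragbits.evaluate.agent_simulation.metrics import (
        CompositeMetricCollector,
        LatencyMetricCollector,
        MetricCollector,
        TokenUsageMetricCollector,
    )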
@@ -3,28 +3,24 @@
 from __future__ import annotations

 import time
-from functools import reduce
 from typing import TYPE_CHECKING, Any

-from ragbits.chat.interface.types import ChatResponseUnion, TextResponse
-from ragbits.core.llms.base import Usage
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult


-class LatencyMetricCollector(MetricCollector):
+class LatencyMetricCollector:
     """Tracks response latency per turn.

     Measures the wall-clock time from turn start to turn end,
     providing average, min, max, and per-turn latency metrics.

     Example:
+        >>> collector = LatencyMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["latency_avg_ms"])
     """
@@ -33,7 +29,6 @@ class LatencyMetricCollector(MetricCollector):
         """Initialize the latency collector."""
         self._turn_start: float | None = None
         self._latencies: list[float] = []
-        self._times_to_first_token: dict[int, float] = {}

     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Record the start time for this turn.
@@ -45,20 +40,6 @@ class LatencyMetricCollector(MetricCollector):
         """
         self._turn_start = time.perf_counter()

-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Record time to first token on first text response.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message (unused).
-            response: Response chunk from chat interface.
-        """
-        if turn_index not in self._times_to_first_token and isinstance(response, TextResponse):
-            self._times_to_first_token[turn_index] = (time.perf_counter() - self._turn_start) * 1000
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Calculate and store the latency for this turn.
@@ -80,47 +61,41 @@ class LatencyMetricCollector(MetricCollector):
             Dictionary with latency_avg_ms, latency_max_ms, latency_min_ms,
             and latency_per_turn_ms.
         """
 [17 removed lines (old 83-99) are not rendered in the source diff view]
-        }
-        )
-
-        return rv
-
-
-class TokenUsageMetricCollector(MetricCollector):
+        if not self._latencies:
+            return {}
+
+        return {
+            "latency_avg_ms": sum(self._latencies) / len(self._latencies),
+            "latency_max_ms": max(self._latencies),
+            "latency_min_ms": min(self._latencies),
+            "latency_per_turn_ms": self._latencies.copy(),
+        }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_start = None
+        self._latencies = []
+
+
+class TokenUsageMetricCollector:
     """Tracks token usage and estimated cost per turn.

     Aggregates token counts from each turn to provide total and
     per-turn token usage statistics.

     Example:
+        >>> collector = TokenUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tokens_total"])
     """

     def __init__(self) -> None:
         """Initialize the token usage collector."""
-        self.
+        self._turn_tokens: list[dict[str, int]] = []

     def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """No-op for token collector.
@@ -138,7 +113,10 @@ class TokenUsageMetricCollector(MetricCollector):
         Args:
             turn_result: The result of the completed turn.
         """
-
+        if turn_result.token_usage:
+            self._turn_tokens.append(turn_result.token_usage.copy())
+        else:
+            self._turn_tokens.append({"total": 0, "prompt": 0, "completion": 0})

     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Return aggregated token usage metrics.
@@ -148,33 +126,40 @@ class TokenUsageMetricCollector(MetricCollector):

         Returns:
             Dictionary with tokens_total, tokens_prompt, tokens_completion,
-            tokens_avg_per_turn, and
+            tokens_avg_per_turn, and tokens_per_turn.
         """
-        if not self.
+        if not self._turn_tokens:
             return {}

-
+        total = sum(t.get("total", 0) for t in self._turn_tokens)
+        prompt = sum(t.get("prompt", 0) for t in self._turn_tokens)
+        completion = sum(t.get("completion", 0) for t in self._turn_tokens)

         return {
-            "tokens_total":
-            "tokens_prompt":
-            "tokens_completion":
-            "tokens_avg_per_turn":
-            "
+            "tokens_total": total,
+            "tokens_prompt": prompt,
+            "tokens_completion": completion,
+            "tokens_avg_per_turn": total / len(self._turn_tokens) if self._turn_tokens else 0,
+            "tokens_per_turn": [t.get("total", 0) for t in self._turn_tokens],
         }

+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._turn_tokens = []

-class ToolUsageMetricCollector(MetricCollector):
+
+class ToolUsageMetricCollector:
     """Tracks tool call patterns during the conversation.

     Records which tools were called, how often, and on which turns,
     providing insights into agent behavior.

     Example:
+        >>> collector = ToolUsageMetricCollector()
         >>> result = await run_simulation(
         ...     scenario=scenario,
         ...     chat=chat,
-        ...
+        ...     metric_collectors=[collector],
         ... )
         >>> print(result.metrics.custom["tools_unique"])
     """
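Read together, the rewritten on_conversation_end methods define the keys that land in the result's custom metrics. A hedged sketch of reading them after a run (key names are taken from the hunks above and below; the run_simulation usage follows the docstring examples):

    custom = result.metrics.custom
    print(custom["latency_avg_ms"], custom["latency_per_turn_ms"])
    print(custom["tokens_total"], custom["tokens_avg_per_turn"], custom["tokens_per_turn"])
    print(custom["tools_unique"], custom["tools_per_turn"], custom["turns_with_tools"])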
@@ -229,3 +214,8 @@ class ToolUsageMetricCollector(MetricCollector):
             "tools_per_turn": self._tool_calls.copy(),
             "turns_with_tools": turns_with_tools,
         }
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        self._tool_calls = []
+        self._tool_counts = {}
@@ -2,16 +2,14 @@

 from __future__ import annotations

-from
-from typing import TYPE_CHECKING, Any
-
-from ragbits.chat.interface.types import ChatResponseUnion
+from typing import TYPE_CHECKING, Any, Protocol, runtime_checkable

 if TYPE_CHECKING:
     from ragbits.evaluate.agent_simulation.results import TurnResult


-class MetricCollector(ABC):
+@runtime_checkable
+class MetricCollector(Protocol):
     """Protocol for collecting metrics during conversation simulation.

     Implement this protocol to create custom metric collectors that can
@@ -33,7 +31,7 @@ class MetricCollector(ABC):
     ...     pass
     """

-    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
+    def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
         """Called before agent processes a turn.

         Args:
@@ -41,30 +39,16 @@ class MetricCollector(ABC):
             task_index: 0-based index of the current task.
             user_message: The user message being sent to the agent.
         """
-
-
-    def on_streamed_response(  # noqa: PLR6301
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Called after receiving chunk from chat interface.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        return
+        ...

-    def on_turn_end(self, turn_result: TurnResult) -> None:
+    def on_turn_end(self, turn_result: TurnResult) -> None:
         """Called after a turn completes.

         Args:
             turn_result: The result of the completed turn.
         """
-
+        ...

-    @abstractmethod
     def on_conversation_end(self, all_turns: list[TurnResult]) -> dict[str, Any]:
         """Called when the conversation ends, returns computed metrics.
@@ -74,6 +58,11 @@ class MetricCollector(ABC):
         Returns:
             Dictionary of metric names to values.
         """
+        ...
+
+    def reset(self) -> None:
+        """Reset collector state for a new conversation."""
+        ...


 class CompositeMetricCollector:
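Because MetricCollector is now a runtime-checkable Protocol rather than an ABC, custom collectors no longer inherit from a base class; providing the four methods is enough. A minimal sketch of a conforming collector (the class name and the metric it reports are invented for illustration):

    from typing import Any

    class TurnCountCollector:  # hypothetical example, not part of ragbits
        """Counts completed turns; structurally satisfies MetricCollector."""

        def __init__(self) -> None:
            self._turns = 0

        def on_turn_start(self, turn_index: int, task_index: int, user_message: str) -> None:
            ...

        def on_turn_end(self, turn_result) -> None:
            self._turns += 1

        def on_conversation_end(self, all_turns) -> dict[str, Any]:
            return {"turns_total": self._turns}

        def reset(self) -> None:
            self._turns = 0

    # @runtime_checkable only verifies that the methods exist, but it lets callers do:
    # isinstance(TurnCountCollector(), MetricCollector)  # -> True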
@@ -123,20 +112,6 @@ class CompositeMetricCollector:
         for collector in self._collectors:
             collector.on_turn_start(turn_index, task_index, user_message)

-    def on_streamed_response(
-        self, turn_index: int, task_index: int, user_message: str, response: ChatResponseUnion
-    ) -> None:
-        """Delegate to all child collectors.
-
-        Args:
-            turn_index: 1-based index of the current turn.
-            task_index: 0-based index of the current task.
-            user_message: The user message being sent to the agent.
-            response: Response yielded from chat, usually command or text chunk.
-        """
-        for collector in self._collectors:
-            collector.on_streamed_response(turn_index, task_index, user_message, response)
-
     def on_turn_end(self, turn_result: TurnResult) -> None:
         """Delegate to all child collectors.
@@ -160,3 +135,8 @@ class CompositeMetricCollector:
             metrics = collector.on_conversation_end(all_turns)
             combined.update(metrics)
         return combined
+
+    def reset(self) -> None:
+        """Reset all child collectors."""
+        for collector in self._collectors:
+            collector.reset()
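The new reset() hook is what makes collector instances reusable across conversations: clear the per-conversation state instead of constructing fresh collectors each run. A hedged sketch of that pattern (the helper function is illustrative; run_simulation usage follows the docstring examples above):

    async def run_all(scenarios, chat):  # illustrative helper
        collectors = [LatencyMetricCollector(), TokenUsageMetricCollector()]
        results = []
        for scenario in scenarios:
            result = await run_simulation(scenario=scenario, chat=chat, metric_collectors=collectors)
            results.append(result)
            for collector in collectors:
                collector.reset()  # clear per-conversation state before the next run
        return results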
@@ -1,217 +1,37 @@
 """Data models for agent simulation scenarios."""

-from
+from dataclasses import dataclass

-from collections.abc import Callable
-from typing import TYPE_CHECKING, Any, Literal

-
-
-from ragbits.evaluate.agent_simulation.context import DataSnapshot, DomainContext
-from ragbits.evaluate.agent_simulation.metrics.collectors import MetricCollector
-
-if TYPE_CHECKING:
-    from rich.console import Console
-
-    from ragbits.evaluate.agent_simulation.checkers import BaseCheckerConfig
-    from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-
-class Turn(BaseModel):
+@dataclass
+class Turn:
     """A single conversation turn between user and assistant."""

     user: str
     assistant: str


-
-
-
-    task: str = Field(
-        ...,
-        description="A natural language description of the objective that simulated user needs to complete.",
-    )
-    checkers: list[dict[str, Any]] = Field(
-        default_factory=list,
-        description="List of checker configurations. Each dict must have 'type' key and checker-specific fields.",
-    )
-    checker_mode: Literal["all", "any"] = Field(
-        default="all",
-        description="How to combine multiple checkers: 'all' (all must pass), 'any' (one must pass)",
-    )
-
-    def get_parsed_checkers(self) -> list[BaseCheckerConfig]:
-        """Parse checker configs into typed checker instances.
+@dataclass
+class Task:
+    """A single task with its expected result."""

-
-
-
-
+    task: str
+    expected_result: str
+    expected_tools: list[str] | None = None
+    """Optional list of tool names that should be used to complete this task."""

-        return [parse_checker_config(c) for c in self.checkers]

-
-
-
-        Returns:
-            Summary string describing the checkers.
-        """
-        if not self.checkers:
-            return "no checkers"
-
-        types = [c.get("type", "unknown") for c in self.checkers]
-        return f"{', '.join(types)} ({self.checker_mode})"
-
-
-class Scenario(BaseModel):
+@dataclass
+class Scenario:
     """A scenario containing multiple tasks to be completed sequentially."""

-    name: str
-    tasks: list[Task]
-        default_factory=list,
-        description=(
-            "List of tasks that will be executed during the scenario. "
-            "Simulating LLM will use this list to determine next steps. "
-            "It can be both treated as conversation outline or a checklist "
-            "that should be realized by simulated user. "
-            "Expected result will be used to judge if specific exchange of messages "
-            "was aligned with system expectactions. "
-        ),
-    )
-
-    turn_limit: int | None = Field(
-        None,
-        description=(
-            "Limit how many turns can be ran before failing the scenario. "
-            "If set here it will override default settings."
-        ),
-    )
-    turn_limit_per_task: int | None = Field(
-        None,
-        description="Limit number of turns, this time per task. Specific tasks can override their limits.",
-    )
-
-    group: str | None = Field(
-        None,
-        description=(
-            "Scenarios may be coupled together by being in the same group. "
-            "Scenarios in groups are often executed one after another, "
-            "may have some sort of dependencies or inference. "
-            "In final results aggregated group metrics can be found."
-        ),
-    )
+    name: str
+    tasks: list[Task]

-    def display(self, console: Console | None = None) -> None:
-        """Display scenario with rich panel."""
-        from ragbits.evaluate.agent_simulation.display import display_scenario

-
-
-    def live_display(self, console: Console | None = None) -> ScenarioLiveDisplay:
-        """Create a live display for this scenario."""
-        from ragbits.evaluate.agent_simulation.display import ScenarioLiveDisplay
-
-        return ScenarioLiveDisplay(self, console)
-
-    @classmethod
-    def dto(cls) -> type[Scenario]:
-        """Create a DTO class for serialization."""
-        if not hasattr(cls, "_dto_cls"):
-            cls._dto_cls = create_model(
-                "ScenarioDTO",
-                __base__=cls,
-                name=(str, cls.__pydantic_fields__["name"]),
-                tasks=(list[Task], cls.__pydantic_fields__["tasks"]),
-            )
-        return cls._dto_cls
-
-
-class Personality(BaseModel):
+@dataclass
+class Personality:
     """A personality definition for the simulated user."""

-    name: str
-
-        description="A descriptive name that will help to identify this specific instance of personality.",
-    )
-    description: str = Field(
-        ...,
-        description=(
-            "Detailed description of user behaviour, style of communication, "
-            "internal motives, language, attitute, etc."
-        ),
-    )
-
-
-class SimulationConfig(BaseModel):
-    """Configuration for running agent simulations.
-
-    Groups parameters that are commonly passed between simulation components.
-    Excludes instance-specific objects like ChatInterface, callbacks, and streams.
-    """
-
-    model_config = {"arbitrary_types_allowed": True}
-
-    max_turns_scenario: int = Field(
-        default=15,
-        description="Maximum number of conversation turns for the entire scenario.",
-    )
-    max_turns_task: int | None = Field(
-        default=4,
-        description="Maximum number of conversation turns per task (None for no limit).",
-    )
-    log_file: str | None = Field(
-        default=None,
-        description="Optional path to log file.",
-    )
-    agent_model_name: str | None = Field(
-        default=None,
-        description="Optional override for agent LLM model name.",
-    )
-    sim_user_model_name: str | None = Field(
-        default=None,
-        description="Optional override for simulated user LLM model name.",
-    )
-    checker_model_name: str | None = Field(
-        default=None,
-        description="Optional override for goal checker LLM model name.",
-    )
-    default_model: str = Field(
-        default="gpt-4o-mini",
-        description="Default LLM model name when specific models not provided.",
-    )
-    api_key: str = Field(
-        default="",
-        description="API key for LLM.",
-    )
-    user_message_prefix: str = Field(
-        default="",
-        description="Optional prefix to add to user messages before sending to agent.",
-    )
-    domain_context: DomainContext | None = Field(
-        default=None,
-        description="Optional domain context for goal checking (currency, locale, business rules).",
-    )
-    data_snapshot: DataSnapshot | None = Field(
-        default=None,
-        description="Optional data snapshot to ground simulated user requests to available data.",
-    )
-    metrics: list[type[MetricCollector] | Callable[[], MetricCollector]] | None = Field(
-        default=None,
-        description=(
-            "Optional list of metric collector factories. Each item can be either a class "
-            "(e.g., LatencyMetricCollector) or a callable that returns a collector instance "
-            "(e.g., lambda: CustomCollector(arg=value)). Fresh instances are created for each run."
-        ),
-    )
-
-    def create_metric_collectors(self) -> list[MetricCollector]:
-        """Create fresh metric collector instances for a simulation run.
-
-        Each call creates new instances to ensure concurrent runs don't share state.
-
-        Returns:
-            List of freshly instantiated metric collectors.
-        """
-        if not self.metrics:
-            return []
-        return [factory() for factory in self.metrics]
+    name: str
+    description: str