langwatch-scenario 0.3.0__py3-none-any.whl → 0.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/METADATA +140 -79
- langwatch_scenario-0.4.0.dist-info/RECORD +18 -0
- scenario/__init__.py +223 -9
- scenario/agent_adapter.py +111 -0
- scenario/cache.py +132 -8
- scenario/config.py +154 -10
- scenario/error_messages.py +8 -38
- scenario/judge_agent.py +435 -0
- scenario/pytest_plugin.py +223 -15
- scenario/scenario_executor.py +428 -136
- scenario/scenario_state.py +205 -0
- scenario/script.py +361 -0
- scenario/types.py +193 -20
- scenario/user_simulator_agent.py +249 -0
- scenario/utils.py +252 -2
- langwatch_scenario-0.3.0.dist-info/RECORD +0 -16
- scenario/scenario.py +0 -238
- scenario/scenario_agent_adapter.py +0 -16
- scenario/testing_agent.py +0 -279
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.3.0.dist-info → langwatch_scenario-0.4.0.dist-info}/top_level.txt +0 -0
scenario/testing_agent.py
DELETED
@@ -1,279 +0,0 @@
|
|
1
|
-
"""
|
2
|
-
TestingAgent module: defines the testing agent that interacts with the agent under test.
|
3
|
-
"""
|
4
|
-
|
5
|
-
import json
|
6
|
-
import logging
|
7
|
-
import re
|
8
|
-
from typing import Optional, Type, cast
|
9
|
-
|
10
|
-
from litellm import Choices, completion
|
11
|
-
from litellm.files.main import ModelResponse
|
12
|
-
|
13
|
-
from scenario.cache import scenario_cache
|
14
|
-
from scenario.scenario_agent_adapter import ScenarioAgentAdapter
|
15
|
-
from scenario.utils import reverse_roles
|
16
|
-
|
17
|
-
from .error_messages import testing_agent_not_configured_error_message
|
18
|
-
from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole, ScenarioResult
|
19
|
-
|
20
|
-
|
21
|
-
logger = logging.getLogger("scenario")
|
22
|
-
|
23
|
-
|
24
|
-
class TestingAgent(ScenarioAgentAdapter):
|
25
|
-
"""
|
26
|
-
The Testing Agent that interacts with the agent under test.
|
27
|
-
|
28
|
-
This agent is responsible for:
|
29
|
-
1. Generating messages to send to the agent based on the scenario
|
30
|
-
2. Evaluating the responses from the agent against the success/failure criteria
|
31
|
-
3. Determining when to end the test and return a result
|
32
|
-
"""
|
33
|
-
|
34
|
-
roles = {ScenarioAgentRole.USER, ScenarioAgentRole.JUDGE}
|
35
|
-
|
36
|
-
model: str = ""
|
37
|
-
api_key: Optional[str] = None
|
38
|
-
temperature: float = 0.0
|
39
|
-
max_tokens: Optional[int] = None
|
40
|
-
|
41
|
-
# To prevent pytest from thinking this is actually a test class
|
42
|
-
__test__ = False
|
43
|
-
|
44
|
-
def __init__(self, input: AgentInput):
|
45
|
-
super().__init__(input)
|
46
|
-
|
47
|
-
if not self.model:
|
48
|
-
raise Exception(testing_agent_not_configured_error_message)
|
49
|
-
|
50
|
-
@classmethod
|
51
|
-
def with_config(
|
52
|
-
cls,
|
53
|
-
model: str,
|
54
|
-
api_key: Optional[str] = None,
|
55
|
-
temperature: float = 0.0,
|
56
|
-
max_tokens: Optional[int] = None,
|
57
|
-
) -> Type["TestingAgent"]:
|
58
|
-
class TestingAgentWithConfig(cls):
|
59
|
-
def __init__(self, input: AgentInput):
|
60
|
-
self.model = model
|
61
|
-
self.api_key = api_key
|
62
|
-
self.temperature = temperature
|
63
|
-
self.max_tokens = max_tokens
|
64
|
-
|
65
|
-
super().__init__(input)
|
66
|
-
|
67
|
-
return TestingAgentWithConfig
|
68
|
-
|
69
|
-
@scenario_cache(ignore=["scenario"])
|
70
|
-
async def call(
|
71
|
-
self,
|
72
|
-
input: AgentInput,
|
73
|
-
) -> AgentReturnTypes:
|
74
|
-
"""
|
75
|
-
Generate the next message in the conversation based on history OR
|
76
|
-
return a ScenarioResult if the test should conclude.
|
77
|
-
|
78
|
-
Returns either:
|
79
|
-
- A string message to send to the agent (if conversation should continue)
|
80
|
-
- A ScenarioResult (if the test should conclude)
|
81
|
-
"""
|
82
|
-
|
83
|
-
scenario = input.scenario_state.scenario
|
84
|
-
|
85
|
-
messages = [
|
86
|
-
{
|
87
|
-
"role": "system",
|
88
|
-
"content": f"""
|
89
|
-
<role>
|
90
|
-
You are pretending to be a user, you are testing an AI Agent (shown as the user role) based on a scenario.
|
91
|
-
Approach this naturally, as a human user would, with very short inputs, few words, all lowercase, imperative, not periods, like when they google or talk to chatgpt.
|
92
|
-
</role>
|
93
|
-
|
94
|
-
<goal>
|
95
|
-
Your goal (assistant) is to interact with the Agent Under Test (user) as if you were a human user to see if it can complete the scenario successfully.
|
96
|
-
</goal>
|
97
|
-
|
98
|
-
<scenario>
|
99
|
-
{scenario.description}
|
100
|
-
</scenario>
|
101
|
-
|
102
|
-
<criteria>
|
103
|
-
{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
|
104
|
-
</criteria>
|
105
|
-
|
106
|
-
<execution_flow>
|
107
|
-
1. Generate the first message to start the scenario
|
108
|
-
2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
|
109
|
-
3. If the test should end, use the finish_test tool to determine if all the criteria have been met
|
110
|
-
</execution_flow>
|
111
|
-
|
112
|
-
<rules>
|
113
|
-
1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
|
114
|
-
2. Test should continue until all scenario goals have been met to try going through all the criteria
|
115
|
-
3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
|
116
|
-
4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
|
117
|
-
</rules>
|
118
|
-
""",
|
119
|
-
},
|
120
|
-
{"role": "assistant", "content": "Hello, how can I help you today?"},
|
121
|
-
*input.messages,
|
122
|
-
]
|
123
|
-
|
124
|
-
is_first_message = len(input.messages) == 0
|
125
|
-
is_last_message = (
|
126
|
-
input.scenario_state.current_turn == input.scenario_state.scenario.max_turns
|
127
|
-
)
|
128
|
-
|
129
|
-
if is_last_message:
|
130
|
-
messages.append(
|
131
|
-
{
|
132
|
-
"role": "user",
|
133
|
-
"content": """
|
134
|
-
System:
|
135
|
-
|
136
|
-
<finish_test>
|
137
|
-
This is the last message, conversation has reached the maximum number of turns, give your final verdict,
|
138
|
-
if you don't have enough information to make a verdict, say inconclusive with max turns reached.
|
139
|
-
</finish_test>
|
140
|
-
""",
|
141
|
-
}
|
142
|
-
)
|
143
|
-
|
144
|
-
# User to assistant role reversal
|
145
|
-
# LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
|
146
|
-
# super confused, and Claude 3.7 even starts throwing exceptions.
|
147
|
-
messages = reverse_roles(messages)
|
148
|
-
|
149
|
-
# Define the tool
|
150
|
-
criteria_names = [
|
151
|
-
re.sub(
|
152
|
-
r"[^a-zA-Z0-9]",
|
153
|
-
"_",
|
154
|
-
criterion.replace(" ", "_").replace("'", "").lower(),
|
155
|
-
)[:70]
|
156
|
-
for criterion in scenario.criteria
|
157
|
-
]
|
158
|
-
tools = [
|
159
|
-
{
|
160
|
-
"type": "function",
|
161
|
-
"function": {
|
162
|
-
"name": "finish_test",
|
163
|
-
"description": "Complete the test with a final verdict",
|
164
|
-
"strict": True,
|
165
|
-
"parameters": {
|
166
|
-
"type": "object",
|
167
|
-
"properties": {
|
168
|
-
"criteria": {
|
169
|
-
"type": "object",
|
170
|
-
"properties": {
|
171
|
-
criteria_names[idx]: {
|
172
|
-
"enum": [True, False, "inconclusive"],
|
173
|
-
"description": criterion,
|
174
|
-
}
|
175
|
-
for idx, criterion in enumerate(scenario.criteria)
|
176
|
-
},
|
177
|
-
"required": criteria_names,
|
178
|
-
"additionalProperties": False,
|
179
|
-
"description": "Strict verdict for each criterion",
|
180
|
-
},
|
181
|
-
"reasoning": {
|
182
|
-
"type": "string",
|
183
|
-
"description": "Explanation of what the final verdict should be",
|
184
|
-
},
|
185
|
-
"verdict": {
|
186
|
-
"type": "string",
|
187
|
-
"enum": ["success", "failure", "inconclusive"],
|
188
|
-
"description": "The final verdict of the test",
|
189
|
-
},
|
190
|
-
},
|
191
|
-
"required": ["criteria", "reasoning", "verdict"],
|
192
|
-
"additionalProperties": False,
|
193
|
-
},
|
194
|
-
},
|
195
|
-
}
|
196
|
-
]
|
197
|
-
|
198
|
-
enforce_judgment = input.requested_role == ScenarioAgentRole.JUDGE
|
199
|
-
has_criteria = len(scenario.criteria) > 0
|
200
|
-
|
201
|
-
if enforce_judgment and not has_criteria:
|
202
|
-
return ScenarioResult(
|
203
|
-
success=False,
|
204
|
-
messages=[],
|
205
|
-
reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
|
206
|
-
)
|
207
|
-
|
208
|
-
response = cast(
|
209
|
-
ModelResponse,
|
210
|
-
completion(
|
211
|
-
model=self.model,
|
212
|
-
messages=messages,
|
213
|
-
temperature=self.temperature,
|
214
|
-
max_tokens=self.max_tokens,
|
215
|
-
tools=(
|
216
|
-
tools
|
217
|
-
if (not is_first_message or enforce_judgment) and has_criteria
|
218
|
-
else None
|
219
|
-
),
|
220
|
-
tool_choice=(
|
221
|
-
"required"
|
222
|
-
if (is_last_message or enforce_judgment) and has_criteria
|
223
|
-
else None
|
224
|
-
),
|
225
|
-
),
|
226
|
-
)
|
227
|
-
|
228
|
-
# Extract the content from the response
|
229
|
-
if hasattr(response, "choices") and len(response.choices) > 0:
|
230
|
-
message = cast(Choices, response.choices[0]).message
|
231
|
-
|
232
|
-
# Check if the LLM chose to use the tool
|
233
|
-
if message.tool_calls:
|
234
|
-
tool_call = message.tool_calls[0]
|
235
|
-
if tool_call.function.name == "finish_test":
|
236
|
-
# Parse the tool call arguments
|
237
|
-
try:
|
238
|
-
args = json.loads(tool_call.function.arguments)
|
239
|
-
verdict = args.get("verdict", "inconclusive")
|
240
|
-
reasoning = args.get("reasoning", "No reasoning provided")
|
241
|
-
criteria = args.get("criteria", {})
|
242
|
-
|
243
|
-
passed_criteria = [
|
244
|
-
scenario.criteria[idx]
|
245
|
-
for idx, criterion in enumerate(criteria.values())
|
246
|
-
if criterion == True
|
247
|
-
]
|
248
|
-
failed_criteria = [
|
249
|
-
scenario.criteria[idx]
|
250
|
-
for idx, criterion in enumerate(criteria.values())
|
251
|
-
if criterion == False
|
252
|
-
]
|
253
|
-
|
254
|
-
# Return the appropriate ScenarioResult based on the verdict
|
255
|
-
return ScenarioResult(
|
256
|
-
success=verdict == "success",
|
257
|
-
messages=messages,
|
258
|
-
reasoning=reasoning,
|
259
|
-
passed_criteria=passed_criteria,
|
260
|
-
failed_criteria=failed_criteria,
|
261
|
-
)
|
262
|
-
except json.JSONDecodeError:
|
263
|
-
logger.error("Failed to parse tool call arguments")
|
264
|
-
|
265
|
-
# If no tool call use the message content as next message
|
266
|
-
message_content = message.content
|
267
|
-
if message_content is None:
|
268
|
-
# If invalid tool call, raise an error
|
269
|
-
if message.tool_calls:
|
270
|
-
raise Exception(
|
271
|
-
f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
|
272
|
-
)
|
273
|
-
raise Exception(f"No response from LLM: {response.__repr__()}")
|
274
|
-
|
275
|
-
return {"role": "user", "content": message_content}
|
276
|
-
else:
|
277
|
-
raise Exception(
|
278
|
-
f"Unexpected response format from LLM: {response.__repr__()}"
|
279
|
-
)
|
File without changes
|
File without changes
|
File without changes
|