langwatch-scenario 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA +36 -23
- langwatch_scenario-0.2.0.dist-info/RECORD +15 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/WHEEL +1 -1
- scenario/pytest_plugin.py +7 -7
- scenario/result.py +11 -18
- scenario/scenario.py +13 -10
- scenario/scenario_executor.py +1 -1
- scenario/testing_agent.py +53 -56
- langwatch_scenario-0.1.3.dist-info/RECORD +0 -15
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt +0 -0
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.1.3
+Version: 0.2.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -45,6 +45,11 @@ You define the scenarios, and the testing agent will simulate your users as it f
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
+### See also
+
+- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+- [Scenario Go](https://github.com/langwatch/scenario-go/)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -72,18 +77,23 @@ async def test_vegetarian_recipe_agent():
         # Call your agent here
         return agent.run(message)
 
-    # Define the scenario
+    # Define the simulated scenario
     scenario = Scenario(
-        "
+        name="dinner idea",
+        description="""
+            It's saturday evening, the user is very hungry and tired,
+            but have no money to order out, so they are looking for a recipe.
+
+            The user never mentions they want a vegetarian recipe.
+        """,
         agent=vegetarian_recipe_agent,
-
-
-        "
-        "
-
-
-        "
-        "The agent asks more than two follow-up questions",
+        # List the evaluation criteria for the scenario to be considered successful
+        criteria=[
+            "Agent should not ask more than two follow-up questions",
+            "Agent should generate a recipe",
+            "Recipe should include a list of ingredients",
+            "Recipe should include step-by-step cooking instructions",
+            "Recipe should be vegetarian and not include any sort of meat",
         ],
     )
 
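For readability, here is the 0.2.0 example reassembled from the `+` lines above (`Scenario` and `vegetarian_recipe_agent` come from the surrounding README code that the hunk truncates):

```python
# The new 0.2.0 call shape: a required name, a description, and one unified
# criteria list in place of the old success/failure split.
scenario = Scenario(
    name="dinner idea",
    description="""
        It's saturday evening, the user is very hungry and tired,
        but have no money to order out, so they are looking for a recipe.

        The user never mentions they want a vegetarian recipe.
    """,
    agent=vegetarian_recipe_agent,
    # List the evaluation criteria for the scenario to be considered successful
    criteria=[
        "Agent should not ask more than two follow-up questions",
        "Agent should generate a recipe",
        "Recipe should include a list of ingredients",
        "Recipe should include step-by-step cooking instructions",
        "Recipe should be vegetarian and not include any sort of meat",
    ],
)
```

Negative expectations that used to live in a separate failure list are now phrased as "should not" criteria in the same list.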
@@ -111,9 +121,11 @@ class VegetarianRecipeAgent:
             messages=[
                 {
                     "role": "system",
-                    "content": """
-
-
+                    "content": """
+                        You are a vegetarian recipe agent.
+                        Given the user request, ask AT MOST ONE follow-up question,
+                        then provide a complete recipe. Keep your responses concise and focused.
+                    """,
                 },
                 *self.history,
             ],
@@ -151,19 +163,20 @@ For example, in this Lovable Clone scenario test:
 
 ```python
 scenario = Scenario(
-    "
+    name="dog walking startup landing page",
+    description="""
+        the user wants to create a new landing page for their dog walking startup
+
+        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+    """,
     agent=lovable_agent,
-
-    success_criteria=[
+    criteria=[
         "agent reads the files before go and making changes",
-        "agent modified the index.css file",
-        "agent modified the Index.tsx file",
+        "agent modified the index.css file, not only the Index.tsx file",
         "agent created a comprehensive landing page",
         "agent extended the landing page with a new section",
-
-
-        "agent says it can't read the file",
-        "agent produces incomplete code or is too lazy to finish",
+        "agent should NOT say it can't read the file",
+        "agent should NOT produce incomplete code or be too lazy to finish",
     ],
     max_turns=5,
 )
langwatch_scenario-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
+scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
+scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
+scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
+scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
+scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
+langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/pytest_plugin.py CHANGED
@@ -82,7 +82,7 @@ class ScenarioReporter:
             time = f" in {result.total_time:.2f}s (agent: {result.agent_time:.2f}s)"
 
         print(
-            f"\n{idx}. {scenario.
+            f"\n{idx}. {scenario.name} - {colored(status, status_color, attrs=['bold'])}{time}"
         )
 
         print(
@@ -92,23 +92,23 @@ class ScenarioReporter:
                 )
             )
 
-            if hasattr(result, "
-                criteria_count = len(result.
-                total_criteria = len(scenario.
+            if hasattr(result, "passed_criteria") and result.passed_criteria:
+                criteria_count = len(result.passed_criteria)
+                total_criteria = len(scenario.criteria)
                 criteria_color = (
                     "green" if criteria_count == total_criteria else "yellow"
                 )
                 print(
                     colored(
-                        f"
+                        f"  Passed Criteria: {criteria_count}/{total_criteria}",
                         criteria_color,
                     )
                 )
 
-            if hasattr(result, "
+            if hasattr(result, "failed_criteria") and result.failed_criteria:
                 print(
                     colored(
-                        f"
+                        f"  Failed Criteria: {len(result.failed_criteria)}",
                         "red",
                     )
                 )
scenario/result.py CHANGED
@@ -15,17 +15,15 @@ class ScenarioResult:
         success: Whether the scenario passed
         conversation: The conversation history
         reasoning: Reasoning for the result
-
-
-        triggered_failures: List of failure criteria that were triggered
+        passed_criteria: List of criteria that were met
+        failed_criteria: List of criteria that were not met
     """
 
     success: bool
     conversation: List[Dict[str, str]]
     reasoning: Optional[str] = None
-
-
-    triggered_failures: List[str] = field(default_factory=list)
+    passed_criteria: List[str] = field(default_factory=list)
+    failed_criteria: List[str] = field(default_factory=list)
     total_time: Optional[float] = None
     agent_time: Optional[float] = None
 
@@ -39,7 +37,7 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: Optional[str],
-
+        passed_criteria: List[str],
         total_time: Optional[float] = None,
         agent_time: Optional[float] = None,
     ) -> "ScenarioResult":
@@ -48,9 +46,8 @@ class ScenarioResult:
             success=True,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=[],
+            passed_criteria=passed_criteria,
+            failed_criteria=[],
             total_time=total_time,
             agent_time=agent_time,
         )
@@ -60,9 +57,8 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: str,
-
-
-        triggered_failures: Optional[List[str]] = None,
+        passed_criteria: Optional[List[str]] = None,
+        failed_criteria: Optional[List[str]] = None,
         total_time: Optional[float] = None,
         agent_time: Optional[float] = None,
     ) -> "ScenarioResult":
@@ -71,11 +67,8 @@ class ScenarioResult:
             success=False,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=(
-                triggered_failures if triggered_failures is not None else []
-            ),
+            passed_criteria=passed_criteria if passed_criteria is not None else [],
+            failed_criteria=failed_criteria if failed_criteria is not None else [],
             total_time=total_time,
             agent_time=agent_time,
         )
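A short sketch of the renamed fields in use, assuming the constructors above are importable as `scenario.result.ScenarioResult` (conversation and criteria strings are illustrative):

```python
from scenario.result import ScenarioResult

result = ScenarioResult.failure_result(
    conversation=[{"role": "user", "content": "What should I cook tonight?"}],
    reasoning="A recipe was produced, but it contained chicken.",
    passed_criteria=["Agent should generate a recipe"],
    failed_criteria=["Recipe should be vegetarian and not include any sort of meat"],
)
assert result.success is False
assert len(result.passed_criteria) == 1 and len(result.failed_criteria) == 1
```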
scenario/scenario.py CHANGED
@@ -15,6 +15,7 @@ from .testing_agent import TestingAgent
 
 from openai.types.chat import ChatCompletionMessageParam
 
+
 class AgentResult(TypedDict, total=False):
     message: str
     messages: List[ChatCompletionMessageParam]
@@ -27,34 +28,36 @@ class Scenario(ScenarioConfig):
 
     It includes:
     - A description of the scenario
-    -
-    - Failure criteria to determine if the agent failed
-    - An optional strategy that guides the testing agent
+    - Criteria to determine if the agent behaved correctly
     - Optional additional parameters
     """
 
+    name: str
     description: str
     agent: Union[
         Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
         Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
     ]
-
-    failure_criteria: List[str] = []
-    strategy: Optional[str] = None
+    criteria: List[str]
 
-    def __init__(self, description: str, **kwargs):
+    def __init__(self, name: str, description: str, **kwargs):
         """Validate scenario configuration after initialization."""
 
         default_config = getattr(Scenario, "default_config", None)
         if default_config:
             kwargs = {**default_config.model_dump(), **kwargs}
 
+        if not name:
+            raise ValueError("Scenario name cannot be empty")
+        kwargs["name"] = name
+
         if not description:
             raise ValueError("Scenario description cannot be empty")
         kwargs["description"] = description
 
-
-
+        # TODO: allow not having any criteria, for scripted scenarios
+        if not kwargs.get("criteria"):
+            raise ValueError("Scenario must have at least one criteria")
 
         if kwargs.get("max_turns", 0) < 1:
             raise ValueError("max_turns must be a positive integer")
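A sketch of the stricter constructor, assuming `Scenario` is importable from the package root as in the README examples; both calls should raise before any agent is involved:

```python
import pytest
from scenario import Scenario

# `name` is now required and may not be empty.
with pytest.raises(ValueError, match="name cannot be empty"):
    Scenario(name="", description="user asks for a dinner recipe", criteria=["..."])

# At least one criterion is required (scripted scenarios are an upstream TODO).
with pytest.raises(ValueError, match="at least one criteria"):
    Scenario(name="dinner idea", description="user asks for a dinner recipe", criteria=[])
```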
@@ -65,7 +68,6 @@ class Scenario(ScenarioConfig):
 
         super().__init__(**kwargs)
 
-
     async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
         """
         Run the scenario against the agent under test.
@@ -82,6 +84,7 @@ class Scenario(ScenarioConfig):
         # being used throughout, any user code on the callback can
         # be blocking, preventing them from running scenarios in parallel
         with concurrent.futures.ThreadPoolExecutor() as executor:
+
             def run_in_thread():
                 loop = asyncio.new_event_loop()
                 asyncio.set_event_loop(loop)
scenario/scenario_executor.py CHANGED
@@ -199,6 +199,6 @@ class ScenarioExecutor:
 
     def _scenario_name(self):
         if self.scenario.verbose == 2:
-            return termcolor.colored(f"[Scenario: {self.scenario.
+            return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
         else:
             return ""
scenario/testing_agent.py CHANGED
@@ -4,6 +4,7 @@ TestingAgent module: defines the testing agent that interacts with the agent und
 
 import json
 import logging
+import re
 from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
 from pydantic import BaseModel
 
@@ -74,27 +75,19 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
 {scenario.description}
 </scenario>
 
-<
-{
-</
-
-<success_criteria>
-{json.dumps(scenario.success_criteria, indent=2)}
-</success_criteria>
-
-<failure_criteria>
-{json.dumps(scenario.failure_criteria, indent=2)}
-</failure_criteria>
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
+</criteria>
 
 <execution_flow>
 1. Generate the first message to start the scenario
 2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
-3. If the test should end, use the finish_test tool to determine if
+3. If the test should end, use the finish_test tool to determine if all the criteria have been met
 </execution_flow>
 
 <rules>
-1. Test should end immediately if a
-2. Test should continue until all
+1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
+2. Test should continue until all scenario goals have been met to try going through all the criteria
 3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
 4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
 </rules>
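The prompt now renders criteria as a numbered list instead of JSON dumps of two separate lists; a standalone sketch of that join expression:

```python
criteria = [
    "Agent should generate a recipe",
    "Recipe should be vegetarian and not include any sort of meat",
]
print("\n".join(f"{idx + 1}. {criterion}" for idx, criterion in enumerate(criteria)))
# 1. Agent should generate a recipe
# 2. Recipe should be vegetarian and not include any sort of meat
```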
@@ -141,6 +134,14 @@ if you don't have enough information to make a verdict, say inconclusive with ma
             message.role = "user"
 
         # Define the tool
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in scenario.criteria
+        ]
         tools = [
             {
                 "type": "function",
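Each criterion is turned into a JSON-schema-safe property name for the finish_test tool; a standalone sketch of the sanitization above:

```python
import re

def criterion_to_name(criterion: str) -> str:
    # Same steps as above: spaces to underscores, apostrophes dropped, lowercased,
    # remaining non-alphanumerics replaced with "_", capped at 70 characters.
    return re.sub(
        r"[^a-zA-Z0-9]", "_", criterion.replace(" ", "_").replace("'", "").lower()
    )[:70]

print(criterion_to_name("Recipe should be vegetarian and not include any sort of meat"))
# recipe_should_be_vegetarian_and_not_include_any_sort_of_meat
```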
@@ -151,40 +152,30 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "
-                            "type": "string",
-                            "enum": ["success", "failure", "inconclusive"],
-                            "description": "The final verdict of the test",
-                        },
-                        "reasoning": {
-                            "type": "string",
-                            "description": "Explanation of why this verdict was reached",
-                        },
-                        "details": {
+                        "criteria": {
                             "type": "object",
                             "properties": {
-
-                                "
-                                "
-
-
-                                "unmet_criteria": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of success criteria that have not been met",
-                                },
-                                "triggered_failures": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of failure criteria that have been triggered",
-                                },
+                                criteria_names[idx]: {
+                                    "enum": [True, False, "inconclusive"],
+                                    "description": criterion,
+                                }
+                                for idx, criterion in enumerate(scenario.criteria)
                             },
-                            "required":
+                            "required": criteria_names,
                             "additionalProperties": False,
-                            "description": "
+                            "description": "Strict verdict for each criterion",
+                        },
+                        "reasoning": {
+                            "type": "string",
+                            "description": "Explanation of what the final verdict should be",
+                        },
+                        "verdict": {
+                            "type": "string",
+                            "enum": ["success", "failure", "inconclusive"],
+                            "description": "The final verdict of the test",
                         },
                     },
-                    "required": ["
+                    "required": ["criteria", "reasoning", "verdict"],
                     "additionalProperties": False,
                 },
             },
@@ -216,35 +207,40 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                     args = json.loads(tool_call.function.arguments)
                     verdict = args.get("verdict", "inconclusive")
                     reasoning = args.get("reasoning", "No reasoning provided")
-
-
-
-
-
+                    criteria = args.get("criteria", {})
+
+                    passed_criteria = [
+                        scenario.criteria[idx]
+                        for idx, criterion in enumerate(criteria.values())
+                        if criterion == True
+                    ]
+                    failed_criteria = [
+                        scenario.criteria[idx]
+                        for idx, criterion in enumerate(criteria.values())
+                        if criterion == False
+                    ]
 
                     # Return the appropriate ScenarioResult based on the verdict
                     if verdict == "success":
                         return ScenarioResult.success_result(
                             conversation=conversation,
                             reasoning=reasoning,
-
+                            passed_criteria=passed_criteria,
                         )
                     elif verdict == "failure":
                         return ScenarioResult.failure_result(
                             conversation=conversation,
                             reasoning=reasoning,
-
-
-                            triggered_failures=triggered_failures,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
                         )
                     else:  # inconclusive
                         return ScenarioResult(
                             success=False,
                             conversation=conversation,
                             reasoning=reasoning,
-
-
-                            triggered_failures=triggered_failures,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
                         )
                 except json.JSONDecodeError:
                     logger.error("Failed to parse tool call arguments")
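A standalone sketch of that mapping with an invented payload; note the pairing is positional, so it relies on the model echoing the `criteria` keys in the same order the schema declared them:

```python
scenario_criteria = [
    "Agent should generate a recipe",
    "Recipe should be vegetarian and not include any sort of meat",
]
# Hypothetical finish_test "criteria" argument, keyed by sanitized names.
criteria = {
    "agent_should_generate_a_recipe": True,
    "recipe_should_be_vegetarian_and_not_include_any_sort_of_meat": False,
}

passed = [scenario_criteria[idx] for idx, v in enumerate(criteria.values()) if v is True]
failed = [scenario_criteria[idx] for idx, v in enumerate(criteria.values()) if v is False]
print(passed)  # ['Agent should generate a recipe']
print(failed)  # ['Recipe should be vegetarian and not include any sort of meat']
```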
@@ -254,7 +250,9 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         if message_content is None:
             # If invalid tool call, raise an error
             if message.tool_calls:
-                raise Exception(
+                raise Exception(
+                    f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
+                )
             raise Exception(f"No response from LLM: {response.__repr__()}")
 
         return message_content
@@ -262,4 +260,3 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         raise Exception(
             f"Unexpected response format from LLM: {response.__repr__()}"
         )
-
langwatch_scenario-0.1.3.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
-scenario/pytest_plugin.py,sha256=BuBbyKLa-t9AFVn9EETl7OvGSt__dFO7KnbZynfS1UM,5789
-scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
-scenario/scenario.py,sha256=tYn3Y1sK6_7pg7hFb_5w0TW6nun-za_4F8kqcnrXXU4,4077
-scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
-scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.1.3.dist-info/METADATA,sha256=7OIolGcZ3fkCXFmE6JHkckVCeJb1r3yYSYveJ6iE9zw,8801
-langwatch_scenario-0.1.3.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-langwatch_scenario-0.1.3.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.1.3.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.1.3.dist-info/RECORD,,
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt RENAMED
File without changes
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt RENAMED
File without changes