langwatch-scenario 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA +43 -24
- langwatch_scenario-0.2.0.dist-info/RECORD +15 -0
- {langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/WHEEL +1 -1
- scenario/pytest_plugin.py +26 -10
- scenario/result.py +11 -18
- scenario/scenario.py +15 -10
- scenario/scenario_executor.py +1 -1
- scenario/testing_agent.py +53 -56
- langwatch_scenario-0.1.2.dist-info/RECORD +0 -15
- {langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt +0 -0
{langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.1.2
+Version: 0.2.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -45,6 +45,11 @@ You define the scenarios, and the testing agent will simulate your users as it f
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
+### See also
+
+- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+- [Scenario Go](https://github.com/langwatch/scenario-go/)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -72,18 +77,23 @@ async def test_vegetarian_recipe_agent():
         # Call your agent here
         return agent.run(message)
 
-    # Define the scenario
+    # Define the simulated scenario
     scenario = Scenario(
-        "
+        name="dinner idea",
+        description="""
+            It's saturday evening, the user is very hungry and tired,
+            but have no money to order out, so they are looking for a recipe.
+
+            The user never mentions they want a vegetarian recipe.
+        """,
         agent=vegetarian_recipe_agent,
-
-
-        "
-        "
-
-
-        "
-        "The agent asks more than two follow-up questions",
+        # List the evaluation criteria for the scenario to be considered successful
+        criteria=[
+            "Agent should not ask more than two follow-up questions",
+            "Agent should generate a recipe",
+            "Recipe should include a list of ingredients",
+            "Recipe should include step-by-step cooking instructions",
+            "Recipe should be vegetarian and not include any sort of meat",
         ],
     )
 
@@ -111,9 +121,11 @@ class VegetarianRecipeAgent:
             messages=[
                 {
                     "role": "system",
-                    "content": """
-
-
+                    "content": """
+                        You are a vegetarian recipe agent.
+                        Given the user request, ask AT MOST ONE follow-up question,
+                        then provide a complete recipe. Keep your responses concise and focused.
+                    """,
                 },
                 *self.history,
             ],
@@ -151,19 +163,20 @@ For example, in this Lovable Clone scenario test:
 
 ```python
 scenario = Scenario(
-    "
+    name="dog walking startup landing page",
+    description="""
+        the user wants to create a new landing page for their dog walking startup
+
+        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+    """,
     agent=lovable_agent,
-
-    success_criteria=[
+    criteria=[
         "agent reads the files before go and making changes",
-        "agent modified the index.css file",
-        "agent modified the Index.tsx file",
+        "agent modified the index.css file, not only the Index.tsx file",
         "agent created a comprehensive landing page",
         "agent extended the landing page with a new section",
-
-
-        "agent says it can't read the file",
-        "agent produces incomplete code or is too lazy to finish",
+        "agent should NOT say it can't read the file",
+        "agent should NOT produce incomplete code or be too lazy to finish",
     ],
     max_turns=5,
 )
@@ -175,7 +188,7 @@ You can find a fully working Lovable Clone example in [examples/test_lovable_clo
 
 ## Debug mode
 
-You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running.
+You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
 
 Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.
 
@@ -183,6 +196,12 @@ Debug mode allows you to see the messages in slow motion step by step, and inter
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
 ```
 
+or
+
+```bash
+pytest -s tests/test_vegetarian_recipe_agent.py --debug
+```
+
 ## Cache
 
 Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:
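The hunk above stops at the cache paragraph, before the README's own example. A minimal sketch of the configuration it describes, assuming the same `Scenario.configure` API shown earlier in the README; the `"42"` key is an arbitrary illustrative value:

```python
# Hedged illustration of the cache_key setting described above; any fixed
# string works, and reusing it makes the testing agent replay the same inputs.
Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    cache_key="42",
)
```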
langwatch_scenario-0.2.0.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
+scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
+scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
+scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
+scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
+scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
+langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/pytest_plugin.py
CHANGED
@@ -11,14 +11,16 @@ from scenario.result import ScenarioResult
 
 from .scenario import Scenario
 
+
 class ScenarioReporterResults(TypedDict):
     scenario: Scenario
     result: ScenarioResult
 
+
 # ScenarioReporter class definition moved outside the fixture for global use
 class ScenarioReporter:
     def __init__(self):
-        self.results
+        self.results: list[ScenarioReporterResults] = []
 
     def add_result(self, scenario, result):
         """Add a test result to the reporter."""
@@ -80,28 +82,33 @@ class ScenarioReporter:
             time = f" in {result.total_time:.2f}s (agent: {result.agent_time:.2f}s)"
 
             print(
-                f"\n{idx}. {scenario.
+                f"\n{idx}. {scenario.name} - {colored(status, status_color, attrs=['bold'])}{time}"
             )
 
-            print(
+            print(
+                colored(
+                    f"  Reasoning: {result.reasoning}",
+                    "green" if result.success else "red",
+                )
+            )
 
-            if hasattr(result, "
-                criteria_count = len(result.
-                total_criteria = len(scenario.
+            if hasattr(result, "passed_criteria") and result.passed_criteria:
+                criteria_count = len(result.passed_criteria)
+                total_criteria = len(scenario.criteria)
                 criteria_color = (
                     "green" if criteria_count == total_criteria else "yellow"
                 )
                 print(
                     colored(
-                        f"
+                        f"  Passed Criteria: {criteria_count}/{total_criteria}",
                         criteria_color,
                     )
                 )
 
-            if hasattr(result, "
+            if hasattr(result, "failed_criteria") and result.failed_criteria:
                 print(
                     colored(
-                        f"
+                        f"  Failed Criteria: {len(result.failed_criteria)}",
                         "red",
                     )
                 )
@@ -119,6 +126,10 @@ def pytest_configure(config):
         "markers", "agent_test: mark test as an agent scenario test"
     )
 
+    if config.getoption("--debug"):
+        print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
+        Scenario.configure(verbose=True, debug=True)
+
     # Create a global reporter instance
     config._scenario_reporter = ScenarioReporter()
 
@@ -128,7 +139,12 @@ def pytest_configure(config):
         result = await original_run(self, *args, **kwargs)
 
         # Always report to the global reporter
-
+        # Ensure the reporter exists before adding result
+        if hasattr(config, "_scenario_reporter"):
+            config._scenario_reporter.add_result(self, result)
+        else:
+            # Handle case where reporter might not be initialized (should not happen with current setup)
+            print(colored("Warning: Scenario reporter not found during run.", "yellow"))
 
         return result
 
scenario/result.py
CHANGED
@@ -15,17 +15,15 @@ class ScenarioResult:
         success: Whether the scenario passed
         conversation: The conversation history
         reasoning: Reasoning for the result
-
-
-        triggered_failures: List of failure criteria that were triggered
+        passed_criteria: List of criteria that were met
+        failed_criteria: List of criteria that were not met
     """
 
     success: bool
     conversation: List[Dict[str, str]]
     reasoning: Optional[str] = None
-
-
-    triggered_failures: List[str] = field(default_factory=list)
+    passed_criteria: List[str] = field(default_factory=list)
+    failed_criteria: List[str] = field(default_factory=list)
     total_time: Optional[float] = None
     agent_time: Optional[float] = None
 
@@ -39,7 +37,7 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: Optional[str],
-
+        passed_criteria: List[str],
         total_time: Optional[float] = None,
         agent_time: Optional[float] = None,
     ) -> "ScenarioResult":
@@ -48,9 +46,8 @@ class ScenarioResult:
             success=True,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=[],
+            passed_criteria=passed_criteria,
+            failed_criteria=[],
             total_time=total_time,
             agent_time=agent_time,
         )
@@ -60,9 +57,8 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: str,
-
-
-        triggered_failures: Optional[List[str]] = None,
+        passed_criteria: Optional[List[str]] = None,
+        failed_criteria: Optional[List[str]] = None,
        total_time: Optional[float] = None,
        agent_time: Optional[float] = None,
    ) -> "ScenarioResult":
@@ -71,11 +67,8 @@ class ScenarioResult:
             success=False,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=(
-                triggered_failures if triggered_failures is not None else []
-            ),
+            passed_criteria=passed_criteria if passed_criteria is not None else [],
+            failed_criteria=failed_criteria if failed_criteria is not None else [],
             total_time=total_time,
             agent_time=agent_time,
         )
scenario/scenario.py
CHANGED
@@ -15,6 +15,7 @@ from .testing_agent import TestingAgent
 
 from openai.types.chat import ChatCompletionMessageParam
 
+
 class AgentResult(TypedDict, total=False):
     message: str
     messages: List[ChatCompletionMessageParam]
@@ -27,34 +28,36 @@ class Scenario(ScenarioConfig):
 
     It includes:
     - A description of the scenario
-    -
-    - Failure criteria to determine if the agent failed
-    - An optional strategy that guides the testing agent
+    - Criteria to determine if the agent behaved correctly
     - Optional additional parameters
     """
 
+    name: str
     description: str
     agent: Union[
         Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
         Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
     ]
-
-    failure_criteria: List[str] = []
-    strategy: Optional[str] = None
+    criteria: List[str]
 
-    def __init__(self, description: str, **kwargs):
+    def __init__(self, name: str, description: str, **kwargs):
         """Validate scenario configuration after initialization."""
 
         default_config = getattr(Scenario, "default_config", None)
         if default_config:
             kwargs = {**default_config.model_dump(), **kwargs}
 
+        if not name:
+            raise ValueError("Scenario name cannot be empty")
+        kwargs["name"] = name
+
         if not description:
             raise ValueError("Scenario description cannot be empty")
         kwargs["description"] = description
 
-
-
+        # TODO: allow not having any criteria, for scripted scenarios
+        if not kwargs.get("criteria"):
+            raise ValueError("Scenario must have at least one criteria")
 
         if kwargs.get("max_turns", 0) < 1:
             raise ValueError("max_turns must be a positive integer")
@@ -65,7 +68,6 @@ class Scenario(ScenarioConfig):
 
         super().__init__(**kwargs)
 
-
     async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
         """
         Run the scenario against the agent under test.
@@ -82,6 +84,7 @@ class Scenario(ScenarioConfig):
         # being used throughout, any user code on the callback can
         # be blocking, preventing them from running scenarios in parallel
         with concurrent.futures.ThreadPoolExecutor() as executor:
+
             def run_in_thread():
                 loop = asyncio.new_event_loop()
                 asyncio.set_event_loop(loop)
@@ -105,6 +108,7 @@ class Scenario(ScenarioConfig):
         max_turns: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
+        debug: Optional[bool] = None,
     ) -> None:
         existing_config = getattr(cls, "default_config", ScenarioConfig())
 
@@ -114,5 +118,6 @@ class Scenario(ScenarioConfig):
                 max_turns=max_turns,
                 verbose=verbose,
                 cache_key=cache_key,
+                debug=debug,
             )
         )
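The constructor now requires a non-empty `name`, a `description`, and at least one entry in `criteria`. A minimal sketch of the 0.2.0 construction contract implied by the validation above; the agent callable, criterion text, and `max_turns` value are placeholders, and `max_turns` is passed explicitly only because no `Scenario.configure` defaults are assumed here:

```python
# Hedged sketch of constructing a Scenario under the new validation rules.
scenario = Scenario(
    name="dinner idea",
    description="The user is hungry and wants a quick recipe.",
    agent=lambda message, context: {"message": "How about a lentil curry?"},
    criteria=["Agent should suggest a dish"],
    max_turns=10,
)

# An empty name, empty description, or missing criteria raises ValueError,
# and debug=True can now be set per scenario or via Scenario.configure.
```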
scenario/scenario_executor.py
CHANGED
@@ -199,6 +199,6 @@ class ScenarioExecutor:
 
     def _scenario_name(self):
         if self.scenario.verbose == 2:
-            return termcolor.colored(f"[Scenario: {self.scenario.
+            return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
         else:
             return ""
scenario/testing_agent.py
CHANGED
@@ -4,6 +4,7 @@ TestingAgent module: defines the testing agent that interacts with the agent und
 
 import json
 import logging
+import re
 from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
 from pydantic import BaseModel
 
@@ -74,27 +75,19 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
 {scenario.description}
 </scenario>
 
-<
-{
-</
-
-<success_criteria>
-{json.dumps(scenario.success_criteria, indent=2)}
-</success_criteria>
-
-<failure_criteria>
-{json.dumps(scenario.failure_criteria, indent=2)}
-</failure_criteria>
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
+</criteria>
 
 <execution_flow>
 1. Generate the first message to start the scenario
 2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
-3. If the test should end, use the finish_test tool to determine if
+3. If the test should end, use the finish_test tool to determine if all the criteria have been met
 </execution_flow>
 
 <rules>
-1. Test should end immediately if a
-2. Test should continue until all
+1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
+2. Test should continue until all scenario goals have been met to try going through all the criteria
 3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
 4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
 </rules>
@@ -141,6 +134,14 @@ if you don't have enough information to make a verdict, say inconclusive with ma
             message.role = "user"
 
         # Define the tool
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in scenario.criteria
+        ]
         tools = [
             {
                 "type": "function",
@@ -151,40 +152,30 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "
-                            "type": "string",
-                            "enum": ["success", "failure", "inconclusive"],
-                            "description": "The final verdict of the test",
-                        },
-                        "reasoning": {
-                            "type": "string",
-                            "description": "Explanation of why this verdict was reached",
-                        },
-                        "details": {
+                        "criteria": {
                             "type": "object",
                             "properties": {
-
-                                "
-                                "
-
-
-                                "unmet_criteria": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of success criteria that have not been met",
-                                },
-                                "triggered_failures": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of failure criteria that have been triggered",
-                                },
+                                criteria_names[idx]: {
+                                    "enum": [True, False, "inconclusive"],
+                                    "description": criterion,
+                                }
+                                for idx, criterion in enumerate(scenario.criteria)
                             },
-                            "required":
+                            "required": criteria_names,
                             "additionalProperties": False,
-                            "description": "
+                            "description": "Strict verdict for each criterion",
+                        },
+                        "reasoning": {
+                            "type": "string",
+                            "description": "Explanation of what the final verdict should be",
+                        },
+                        "verdict": {
+                            "type": "string",
+                            "enum": ["success", "failure", "inconclusive"],
+                            "description": "The final verdict of the test",
                         },
                     },
-                    "required": ["
+                    "required": ["criteria", "reasoning", "verdict"],
                     "additionalProperties": False,
                 },
             },
@@ -216,35 +207,40 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 args = json.loads(tool_call.function.arguments)
                 verdict = args.get("verdict", "inconclusive")
                 reasoning = args.get("reasoning", "No reasoning provided")
-
-
-
-
-
+                criteria = args.get("criteria", {})
+
+                passed_criteria = [
+                    scenario.criteria[idx]
+                    for idx, criterion in enumerate(criteria.values())
+                    if criterion == True
+                ]
+                failed_criteria = [
+                    scenario.criteria[idx]
+                    for idx, criterion in enumerate(criteria.values())
+                    if criterion == False
+                ]
 
                 # Return the appropriate ScenarioResult based on the verdict
                 if verdict == "success":
                     return ScenarioResult.success_result(
                         conversation=conversation,
                         reasoning=reasoning,
-
+                        passed_criteria=passed_criteria,
                     )
                 elif verdict == "failure":
                     return ScenarioResult.failure_result(
                         conversation=conversation,
                         reasoning=reasoning,
-
-
-                        triggered_failures=triggered_failures,
+                        passed_criteria=passed_criteria,
+                        failed_criteria=failed_criteria,
                     )
                 else: # inconclusive
                     return ScenarioResult(
                         success=False,
                         conversation=conversation,
                         reasoning=reasoning,
-
-
-                        triggered_failures=triggered_failures,
+                        passed_criteria=passed_criteria,
+                        failed_criteria=failed_criteria,
                     )
             except json.JSONDecodeError:
                 logger.error("Failed to parse tool call arguments")
@@ -254,7 +250,9 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         if message_content is None:
             # If invalid tool call, raise an error
             if message.tool_calls:
-                raise Exception(
+                raise Exception(
+                    f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
+                )
             raise Exception(f"No response from LLM: {response.__repr__()}")
 
         return message_content
@@ -262,4 +260,3 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         raise Exception(
             f"Unexpected response format from LLM: {response.__repr__()}"
         )
-
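The testing agent now asks the LLM for a per-criterion verdict, using sanitized criterion names as JSON-schema property keys. A standalone repro of the sanitization helper added in this diff; the criteria strings are illustrative:

```python
import re

criteria = [
    "Agent should generate a recipe",
    "Recipe shouldn't include meat",
]

# Same transformation as above: drop apostrophes, lowercase, turn every
# non-alphanumeric character into "_", and cap the name at 70 characters.
criteria_names = [
    re.sub(r"[^a-zA-Z0-9]", "_", c.replace(" ", "_").replace("'", "").lower())[:70]
    for c in criteria
]

print(criteria_names)
# ['agent_should_generate_a_recipe', 'recipe_shouldnt_include_meat']
```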
langwatch_scenario-0.1.2.dist-info/RECORD
REMOVED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
-scenario/pytest_plugin.py,sha256=ydtQxaN09qzoo12nNT8BQY_UPPHAt-AH92HWnPEN6bI,5212
-scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
-scenario/scenario.py,sha256=MqsyiNue1KC4mtvTHnJqJ6Fj3u0TTAdAYann8P8WBBQ,4010
-scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
-scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.1.2.dist-info/METADATA,sha256=La0j89kCoJpCriv3R8Sx5aqKgZy_iC-WNF-NqZzptfk,8684
-langwatch_scenario-0.1.2.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-langwatch_scenario-0.1.2.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.1.2.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.1.2.dist-info/RECORD,,
{langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt
File without changes

{langwatch_scenario-0.1.2.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt
File without changes