langwatch-scenario 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/METADATA +61 -12
- langwatch_scenario-0.1.3.dist-info/RECORD +15 -0
- {langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/WHEEL +1 -1
- scenario/error_messages.py +12 -2
- scenario/pytest_plugin.py +19 -3
- scenario/scenario.py +2 -0
- scenario/scenario_executor.py +10 -10
- scenario/testing_agent.py +4 -1
- langwatch_scenario-0.1.1.dist-info/RECORD +0 -15
- {langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/top_level.txt +0 -0
{langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.1.1
+Version: 0.1.3
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -24,14 +24,14 @@ Requires-Dist: pydantic>=2.7.0
 Requires-Dist: joblib>=1.4.2
 Requires-Dist: wrapt>=1.17.2
 Requires-Dist: pytest-asyncio>=0.26.0
-Requires-Dist: rich
+Requires-Dist: rich<15.0.0,>=13.3.3
 Provides-Extra: dev
 Requires-Dist: black; extra == "dev"
 Requires-Dist: isort; extra == "dev"
 Requires-Dist: mypy; extra == "dev"
 Requires-Dist: pytest-cov; extra == "dev"
 
-
+
 
 <div align="center">
 <!-- Discord, PyPI, Docs, etc links -->
@@ -43,6 +43,8 @@ Scenario is a library for testing agents end-to-end as a human would, but withou
 
 You define the scenarios, and the testing agent will simulate your users as it follows them, it will keep chatting and evaluating your agent until it reaches the desired goal or detects an unexpected behavior.
 
+[📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -51,12 +53,12 @@ Install pytest and scenario:
 pip install pytest langwatch-scenario
 ```
 
-Now create your first scenario
+Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:
 
 ```python
 import pytest
 
-from scenario import Scenario, TestingAgent
+from scenario import Scenario, TestingAgent, scenario_cache
 
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
 
@@ -64,37 +66,78 @@ Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
 @pytest.mark.agent_test
 @pytest.mark.asyncio
 async def test_vegetarian_recipe_agent():
+    agent = VegetarianRecipeAgent()
+
     def vegetarian_recipe_agent(message, context):
         # Call your agent here
-
-
-        return {"message": response}
+        return agent.run(message)
 
+    # Define the scenario
     scenario = Scenario(
         "User is looking for a dinner idea",
         agent=vegetarian_recipe_agent,
         success_criteria=[
             "Recipe agent generates a vegetarian recipe",
+            "Recipe includes a list of ingredients",
             "Recipe includes step-by-step cooking instructions",
         ],
         failure_criteria=[
-            "The recipe includes meat",
+            "The recipe is not vegetarian or includes meat",
             "The agent asks more than two follow-up questions",
         ],
     )
 
+    # Run the scenario and get results
     result = await scenario.run()
 
+    # Assert for pytest to know whether the test passed
     assert result.success
+
+
+# Example agent implementation
+import litellm
+
+
+class VegetarianRecipeAgent:
+    def __init__(self):
+        self.history = []
+
+    @scenario_cache()
+    def run(self, message: str):
+        self.history.append({"role": "user", "content": message})
+
+        response = litellm.completion(
+            model="openai/gpt-4o-mini",
+            messages=[
+                {
+                    "role": "system",
+                    "content": """You are a vegetarian recipe agent.
+                    Given the user request, ask AT MOST ONE follow-up question,
+                    then provide a complete recipe. Keep your responses concise and focused.""",
+                },
+                *self.history,
+            ],
+        )
+        message = response.choices[0].message  # type: ignore
+        self.history.append(message)
+
+        return {"messages": [message]}
+
+```
+
+Create a `.env` file and put your OpenAI API key in it:
+
+```bash
+OPENAI_API_KEY=<your-api-key>
 ```
 
-
+Now run it with pytest:
 
 ```bash
 pytest -s tests/test_vegetarian_recipe_agent.py
 ```
 
-
+This is how it will look like:
 
 [](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)
 
@@ -132,7 +175,7 @@ You can find a fully working Lovable Clone example in [examples/test_lovable_clo
 
 ## Debug mode
 
-You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running.
+You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
 
 Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.
 
@@ -140,6 +183,12 @@ Debug mode allows you to see the messages in slow motion step by step, and inter
 Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
 ```
 
+or
+
+```bash
+pytest -s tests/test_vegetarian_recipe_agent.py --debug
+```
+
 ## Cache
 
 Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:
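The Cache paragraph above ends at the hunk boundary before its code sample. A minimal sketch of what pinning a `cache_key` might look like, based on the `Scenario.configure` signature shown in the `scenario/scenario.py` diff below (the key value is illustrative, not from the package):

```python
from scenario import Scenario, TestingAgent

# Pinning a cache_key makes the testing agent reuse the same generated user
# inputs for the same scenario across runs (the value below is illustrative).
Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    cache_key="dinner-idea-scenario-v1",
)
```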
langwatch_scenario-0.1.3.dist-info/RECORD
ADDED
@@ -0,0 +1,15 @@
+scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+scenario/pytest_plugin.py,sha256=BuBbyKLa-t9AFVn9EETl7OvGSt__dFO7KnbZynfS1UM,5789
+scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
+scenario/scenario.py,sha256=tYn3Y1sK6_7pg7hFb_5w0TW6nun-za_4F8kqcnrXXU4,4077
+scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
+scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
+scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+langwatch_scenario-0.1.3.dist-info/METADATA,sha256=7OIolGcZ3fkCXFmE6JHkckVCeJb1r3yYSYveJ6iE9zw,8801
+langwatch_scenario-0.1.3.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+langwatch_scenario-0.1.3.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.1.3.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.1.3.dist-info/RECORD,,
scenario/error_messages.py
CHANGED
@@ -1,3 +1,5 @@
+from textwrap import indent
+from typing import Any
 import termcolor
 
 
@@ -37,9 +39,17 @@ default_config_error_message = f"""
 """
 
 
-message_return_error_message
+def message_return_error_message(got: Any):
+    got_ = got.__repr__()
+    if len(got_) > 100:
+        got_ = got_[:100] + "..."
 
-
+    return f"""
+{termcolor.colored("->", "cyan")} Your agent returned:
+
+{indent(got_, ' ' * 4)}
+
+{termcolor.colored("->", "cyan")} But your agent should return a dict with either a "message" string key or a "messages" key in OpenAI messages format so the testing agent can understand what happened. For example:
 
 def my_agent_under_test(message, context):
     response = call_my_agent(message)
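The reworked `message_return_error_message` now shows the offending return value and reminds the author that the agent under test must return a dict with a `message` string or OpenAI-format `messages`. A minimal sketch of a compliant return shape, reusing the `call_my_agent` placeholder from the error message's own example:

```python
def my_agent_under_test(message, context):
    # call_my_agent is a placeholder for the real agent call.
    response = call_my_agent(message)

    # Either shape passes the validity check in scenario_executor.py:
    return {"message": response}
    # or: return {"messages": [{"role": "assistant", "content": response}]}
```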
scenario/pytest_plugin.py
CHANGED
@@ -11,14 +11,16 @@ from scenario.result import ScenarioResult
 
 from .scenario import Scenario
 
+
 class ScenarioReporterResults(TypedDict):
     scenario: Scenario
     result: ScenarioResult
 
+
 # ScenarioReporter class definition moved outside the fixture for global use
 class ScenarioReporter:
     def __init__(self):
-        self.results
+        self.results: list[ScenarioReporterResults] = []
 
     def add_result(self, scenario, result):
         """Add a test result to the reporter."""
@@ -83,7 +85,12 @@ class ScenarioReporter:
             f"\n{idx}. {scenario.description} - {colored(status, status_color, attrs=['bold'])}{time}"
         )
 
-        print(
+        print(
+            colored(
+                f"   Reasoning: {result.reasoning}",
+                "green" if result.success else "red",
+            )
+        )
 
         if hasattr(result, "met_criteria") and result.met_criteria:
             criteria_count = len(result.met_criteria)
@@ -119,6 +126,10 @@ def pytest_configure(config):
         "markers", "agent_test: mark test as an agent scenario test"
    )
 
+    if config.getoption("--debug"):
+        print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
+        Scenario.configure(verbose=True, debug=True)
+
    # Create a global reporter instance
    config._scenario_reporter = ScenarioReporter()
 
@@ -128,7 +139,12 @@ def pytest_configure(config):
        result = await original_run(self, *args, **kwargs)
 
        # Always report to the global reporter
-
+        # Ensure the reporter exists before adding result
+        if hasattr(config, "_scenario_reporter"):
+            config._scenario_reporter.add_result(self, result)
+        else:
+            # Handle case where reporter might not be initialized (should not happen with current setup)
+            print(colored("Warning: Scenario reporter not found during run.", "yellow"))
 
        return result
 
scenario/scenario.py
CHANGED
@@ -105,6 +105,7 @@ class Scenario(ScenarioConfig):
         max_turns: Optional[int] = None,
         verbose: Optional[Union[bool, int]] = None,
         cache_key: Optional[str] = None,
+        debug: Optional[bool] = None,
     ) -> None:
         existing_config = getattr(cls, "default_config", ScenarioConfig())
 
@@ -114,5 +115,6 @@ class Scenario(ScenarioConfig):
                 max_turns=max_turns,
                 verbose=verbose,
                 cache_key=cache_key,
+                debug=debug,
             )
         )
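The new `debug` parameter on `Scenario.configure` backs the README section above; a minimal sketch of enabling it programmatically rather than via the pytest `--debug` flag:

```python
from scenario import Scenario, TestingAgent

# Enables the step-by-step debug mode described in the README; the 0.1.3
# pytest plugin does roughly the same (plus verbose=True) when --debug is passed.
Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    debug=True,
)
```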
scenario/scenario_executor.py
CHANGED
@@ -52,17 +52,17 @@ class ScenarioExecutor:
         # Run the initial testing agent prompt to get started
         total_start_time = time.time()
         context_scenario.set(self.scenario)
-
+        next_message = self._generate_next_message(
             self.scenario, self.conversation, first_message=True
         )
 
-        if isinstance(
+        if isinstance(next_message, ScenarioResult):
             raise Exception(
                 "Unexpectedly generated a ScenarioResult for the initial message",
-
+                next_message.__repr__(),
             )
         elif self.scenario.verbose:
-            print(self._scenario_name() + termcolor.colored("User:", "green"),
+            print(self._scenario_name() + termcolor.colored("User:", "green"), next_message)
 
         # Execute the conversation
         current_turn = 0
@@ -72,14 +72,14 @@ class ScenarioExecutor:
         # Start the test with the initial message
         while current_turn < max_turns:
             # Record the testing agent's message
-            self.conversation.append({"role": "user", "content":
+            self.conversation.append({"role": "user", "content": next_message})
 
             # Get response from the agent under test
             start_time = time.time()
 
             context_scenario.set(self.scenario)
             with show_spinner(text="Agent:", color="blue", enabled=self.scenario.verbose):
-                agent_response = self.scenario.agent(
+                agent_response = self.scenario.agent(next_message, context)
             if isinstance(agent_response, Awaitable):
                 agent_response = await agent_response
 
@@ -97,10 +97,10 @@ class ScenarioExecutor:
                 )
             )
             if not has_valid_message and not has_valid_messages:
-                raise Exception(message_return_error_message)
+                raise Exception(message_return_error_message(agent_response))
 
             messages: list[ChatCompletionMessageParam] = []
-            if has_valid_messages:
+            if has_valid_messages and len(agent_response["messages"]) > 0:
                 messages = agent_response["messages"]
 
             # Drop the first messages both if they are system or user messages
@@ -110,7 +110,7 @@ class ScenarioExecutor:
                 messages = messages[1:]
 
             if has_valid_message and self.scenario.verbose:
-                print(self._scenario_name()
+                print(self._scenario_name() + termcolor.colored("Agent:", "blue"), agent_response["message"])
 
             if messages and self.scenario.verbose:
                 print_openai_messages(self._scenario_name(), messages)
@@ -159,7 +159,7 @@ class ScenarioExecutor:
                 print(self._scenario_name() + termcolor.colored("User:", "green"), result)
 
             # Otherwise, it's the next message to send to the agent
-
+            next_message = result
 
             # Increment turn counter
             current_turn += 1
scenario/testing_agent.py
CHANGED
@@ -249,9 +249,12 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         except json.JSONDecodeError:
             logger.error("Failed to parse tool call arguments")
 
-        # If no tool call
+        # If no tool call use the message content as next message
         message_content = message.content
         if message_content is None:
+            # If invalid tool call, raise an error
+            if message.tool_calls:
+                raise Exception(f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}")
             raise Exception(f"No response from LLM: {response.__repr__()}")
 
         return message_content
langwatch_scenario-0.1.1.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=8bTwG_iKz7FjGp50FU0anQ1fmI6eJE4NeaoXtiifbBg,2099
-scenario/pytest_plugin.py,sha256=ydtQxaN09qzoo12nNT8BQY_UPPHAt-AH92HWnPEN6bI,5212
-scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
-scenario/scenario.py,sha256=MqsyiNue1KC4mtvTHnJqJ6Fj3u0TTAdAYann8P8WBBQ,4010
-scenario/scenario_executor.py,sha256=bDzoatslbp80dG6DU-i2VUlOa9SMtyw2VIhcF7knwis,7883
-scenario/testing_agent.py,sha256=wMK2GqmN4QDr0kFoxgqcAPsU6gjCx8HBJQv1wmsdSb4,10683
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.1.1.dist-info/METADATA,sha256=SL8rtzUuSwBthrIfjiSLpPNxFt1kX8Vd1TzETBw1oys,7435
-langwatch_scenario-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
-langwatch_scenario-0.1.1.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.1.1.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.1.1.dist-info/RECORD,,
{langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/entry_points.txt
File without changes

{langwatch_scenario-0.1.1.dist-info → langwatch_scenario-0.1.3.dist-info}/top_level.txt
File without changes