langwatch-scenario 0.1.1__py3-none-any.whl → 0.1.3__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langwatch-scenario
- Version: 0.1.1
+ Version: 0.1.3
  Summary: The end-to-end agent testing library
  Author-email: LangWatch Team <support@langwatch.ai>
  License: MIT
@@ -24,14 +24,14 @@ Requires-Dist: pydantic>=2.7.0
  Requires-Dist: joblib>=1.4.2
  Requires-Dist: wrapt>=1.17.2
  Requires-Dist: pytest-asyncio>=0.26.0
- Requires-Dist: rich>=14.0.0
+ Requires-Dist: rich<15.0.0,>=13.3.3
  Provides-Extra: dev
  Requires-Dist: black; extra == "dev"
  Requires-Dist: isort; extra == "dev"
  Requires-Dist: mypy; extra == "dev"
  Requires-Dist: pytest-cov; extra == "dev"

- ![scenario](./assets/scenario-wide.webp)
+ ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)

  <div align="center">
  <!-- Discord, PyPI, Docs, etc links -->
@@ -43,6 +43,8 @@ Scenario is a library for testing agents end-to-end as a human would, but withou

  You define the scenarios, and the testing agent will simulate your users as it follows them, it will keep chatting and evaluating your agent until it reaches the desired goal or detects an unexpected behavior.

+ [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
+
  ## Getting Started

  Install pytest and scenario:
@@ -51,12 +53,12 @@ Install pytest and scenario:
  pip install pytest langwatch-scenario
  ```

- Now create your first scenario:
+ Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:

  ```python
  import pytest

- from scenario import Scenario, TestingAgent
+ from scenario import Scenario, TestingAgent, scenario_cache

  Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))

@@ -64,37 +66,78 @@ Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
  @pytest.mark.agent_test
  @pytest.mark.asyncio
  async def test_vegetarian_recipe_agent():
+     agent = VegetarianRecipeAgent()
+
      def vegetarian_recipe_agent(message, context):
          # Call your agent here
-         response = "<Your agent's response>"
-
-         return {"message": response}
+         return agent.run(message)

+     # Define the scenario
      scenario = Scenario(
          "User is looking for a dinner idea",
          agent=vegetarian_recipe_agent,
          success_criteria=[
              "Recipe agent generates a vegetarian recipe",
+             "Recipe includes a list of ingredients",
              "Recipe includes step-by-step cooking instructions",
          ],
          failure_criteria=[
-             "The recipe includes meat",
+             "The recipe is not vegetarian or includes meat",
              "The agent asks more than two follow-up questions",
          ],
      )

+     # Run the scenario and get results
      result = await scenario.run()

+     # Assert for pytest to know whether the test passed
      assert result.success
+
+
+ # Example agent implementation
+ import litellm
+
+
+ class VegetarianRecipeAgent:
+     def __init__(self):
+         self.history = []
+
+     @scenario_cache()
+     def run(self, message: str):
+         self.history.append({"role": "user", "content": message})
+
+         response = litellm.completion(
+             model="openai/gpt-4o-mini",
+             messages=[
+                 {
+                     "role": "system",
+                     "content": """You are a vegetarian recipe agent.
+                     Given the user request, ask AT MOST ONE follow-up question,
+                     then provide a complete recipe. Keep your responses concise and focused.""",
+                 },
+                 *self.history,
+             ],
+         )
+         message = response.choices[0].message  # type: ignore
+         self.history.append(message)
+
+         return {"messages": [message]}
+
+ ```
+
+ Create a `.env` file and put your OpenAI API key in it:
+
+ ```bash
+ OPENAI_API_KEY=<your-api-key>
  ```

- Save it as `tests/test_vegetarian_recipe_agent.py` and run it with pytest:
+ Now run it with pytest:

  ```bash
  pytest -s tests/test_vegetarian_recipe_agent.py
  ```

- Once you connect to callback to a real agent, this is how it will look like:
+ This is how it will look like:

  [![asciicast](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11.svg)](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)

@@ -132,7 +175,7 @@ You can find a fully working Lovable Clone example in [examples/test_lovable_clo

  ## Debug mode

- You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running.
+ You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.

  Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

@@ -140,6 +183,12 @@ Debug mode allows you to see the messages in slow motion step by step, and inter
  Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
  ```

+ or
+
+ ```bash
+ pytest -s tests/test_vegetarian_recipe_agent.py --debug
+ ```
+
  ## Cache

  Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:
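The cache hunk above is cut off before the README's own code sample. As a rough sketch of the `cache_key` usage the paragraph describes (the key string below is an arbitrary illustration, not taken from the package):

```python
# Illustrative sketch only: pin the testing agent's inputs via cache_key,
# as described in the README paragraph above. The key value is arbitrary.
from scenario import Scenario, TestingAgent

Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    cache_key="my-fixed-cache-key",  # reuse the same key to replay the same simulated user inputs
)
```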
@@ -0,0 +1,15 @@
+ scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+ scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+ scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+ scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+ scenario/pytest_plugin.py,sha256=BuBbyKLa-t9AFVn9EETl7OvGSt__dFO7KnbZynfS1UM,5789
+ scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
+ scenario/scenario.py,sha256=tYn3Y1sK6_7pg7hFb_5w0TW6nun-za_4F8kqcnrXXU4,4077
+ scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
+ scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
+ scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+ langwatch_scenario-0.1.3.dist-info/METADATA,sha256=7OIolGcZ3fkCXFmE6JHkckVCeJb1r3yYSYveJ6iE9zw,8801
+ langwatch_scenario-0.1.3.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
+ langwatch_scenario-0.1.3.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+ langwatch_scenario-0.1.3.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+ langwatch_scenario-0.1.3.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (78.1.0)
+ Generator: setuptools (79.0.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

scenario/error_messages.py CHANGED
@@ -1,3 +1,5 @@
+ from textwrap import indent
+ from typing import Any
  import termcolor


@@ -37,9 +39,17 @@ default_config_error_message = f"""
  """


- message_return_error_message = f"""
+ def message_return_error_message(got: Any):
+     got_ = got.__repr__()
+     if len(got_) > 100:
+         got_ = got_[:100] + "..."

- {termcolor.colored("->", "cyan")} Your agent should return a dict with either a "message" string key or a "messages" key in OpenAI messages format so the testing agent can understand what happened. For example:
+     return f"""
+ {termcolor.colored("->", "cyan")} Your agent returned:
+
+ {indent(got_, ' ' * 4)}
+
+ {termcolor.colored("->", "cyan")} But your agent should return a dict with either a "message" string key or a "messages" key in OpenAI messages format so the testing agent can understand what happened. For example:

      def my_agent_under_test(message, context):
          response = call_my_agent(message)
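For context, these are the return shapes this error message (and the corresponding check in the scenario executor later in this diff) expects from the agent under test — a minimal sketch, where `call_my_agent` is a placeholder for your own agent invocation:

```python
# Minimal sketch of an agent callback that passes the validation referenced above.
# `call_my_agent` is a hypothetical stand-in for your own agent call.
def my_agent_under_test(message, context):
    response = call_my_agent(message)

    # Either a plain string under "message"...
    return {"message": response}
    # ...or OpenAI-format messages under "messages":
    # return {"messages": [{"role": "assistant", "content": response}]}
```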
scenario/pytest_plugin.py CHANGED
@@ -11,14 +11,16 @@ from scenario.result import ScenarioResult

  from .scenario import Scenario

+
  class ScenarioReporterResults(TypedDict):
      scenario: Scenario
      result: ScenarioResult

+
  # ScenarioReporter class definition moved outside the fixture for global use
  class ScenarioReporter:
      def __init__(self):
-         self.results : list[ScenarioReporterResults] = []
+         self.results: list[ScenarioReporterResults] = []

      def add_result(self, scenario, result):
          """Add a test result to the reporter."""
@@ -83,7 +85,12 @@ class ScenarioReporter:
              f"\n{idx}. {scenario.description} - {colored(status, status_color, attrs=['bold'])}{time}"
          )

-         print(colored(f" Reasoning: {result.reasoning}", "green" if result.success else "red"))
+         print(
+             colored(
+                 f" Reasoning: {result.reasoning}",
+                 "green" if result.success else "red",
+             )
+         )

          if hasattr(result, "met_criteria") and result.met_criteria:
              criteria_count = len(result.met_criteria)
@@ -119,6 +126,10 @@ def pytest_configure(config):
          "markers", "agent_test: mark test as an agent scenario test"
      )

+     if config.getoption("--debug"):
+         print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
+         Scenario.configure(verbose=True, debug=True)
+
      # Create a global reporter instance
      config._scenario_reporter = ScenarioReporter()

@@ -128,7 +139,12 @@ def pytest_configure(config):
          result = await original_run(self, *args, **kwargs)

          # Always report to the global reporter
-         config._scenario_reporter.add_result(self, result)
+         # Ensure the reporter exists before adding result
+         if hasattr(config, "_scenario_reporter"):
+             config._scenario_reporter.add_result(self, result)
+         else:
+             # Handle case where reporter might not be initialized (should not happen with current setup)
+             print(colored("Warning: Scenario reporter not found during run.", "yellow"))

          return result

scenario/scenario.py CHANGED
@@ -105,6 +105,7 @@ class Scenario(ScenarioConfig):
          max_turns: Optional[int] = None,
          verbose: Optional[Union[bool, int]] = None,
          cache_key: Optional[str] = None,
+         debug: Optional[bool] = None,
      ) -> None:
          existing_config = getattr(cls, "default_config", ScenarioConfig())

@@ -114,5 +115,6 @@ class Scenario(ScenarioConfig):
                  max_turns=max_turns,
                  verbose=verbose,
                  cache_key=cache_key,
+                 debug=debug,
              )
          )
scenario/scenario_executor.py CHANGED
@@ -52,17 +52,17 @@ class ScenarioExecutor:
          # Run the initial testing agent prompt to get started
          total_start_time = time.time()
          context_scenario.set(self.scenario)
-         initial_message = self._generate_next_message(
+         next_message = self._generate_next_message(
              self.scenario, self.conversation, first_message=True
          )

-         if isinstance(initial_message, ScenarioResult):
+         if isinstance(next_message, ScenarioResult):
              raise Exception(
                  "Unexpectedly generated a ScenarioResult for the initial message",
-                 initial_message.__repr__(),
+                 next_message.__repr__(),
              )
          elif self.scenario.verbose:
-             print(self._scenario_name() + termcolor.colored("User:", "green"), initial_message)
+             print(self._scenario_name() + termcolor.colored("User:", "green"), next_message)

          # Execute the conversation
          current_turn = 0
@@ -72,14 +72,14 @@
          # Start the test with the initial message
          while current_turn < max_turns:
              # Record the testing agent's message
-             self.conversation.append({"role": "user", "content": initial_message})
+             self.conversation.append({"role": "user", "content": next_message})

              # Get response from the agent under test
              start_time = time.time()

              context_scenario.set(self.scenario)
              with show_spinner(text="Agent:", color="blue", enabled=self.scenario.verbose):
-                 agent_response = self.scenario.agent(initial_message, context)
+                 agent_response = self.scenario.agent(next_message, context)
              if isinstance(agent_response, Awaitable):
                  agent_response = await agent_response

@@ -97,10 +97,10 @@
                  )
              )
              if not has_valid_message and not has_valid_messages:
-                 raise Exception(message_return_error_message)
+                 raise Exception(message_return_error_message(agent_response))

              messages: list[ChatCompletionMessageParam] = []
-             if has_valid_messages:
+             if has_valid_messages and len(agent_response["messages"]) > 0:
                  messages = agent_response["messages"]

              # Drop the first messages both if they are system or user messages
@@ -110,7 +110,7 @@
                  messages = messages[1:]

              if has_valid_message and self.scenario.verbose:
-                 print(self._scenario_name(), termcolor.colored("Agent:", "blue"), agent_response["message"])
+                 print(self._scenario_name() + termcolor.colored("Agent:", "blue"), agent_response["message"])

              if messages and self.scenario.verbose:
                  print_openai_messages(self._scenario_name(), messages)
@@ -159,7 +159,7 @@
                  print(self._scenario_name() + termcolor.colored("User:", "green"), result)

              # Otherwise, it's the next message to send to the agent
-             initial_message = result
+             next_message = result

              # Increment turn counter
              current_turn += 1
scenario/testing_agent.py CHANGED
@@ -249,9 +249,12 @@ if you don't have enough information to make a verdict, say inconclusive with ma
          except json.JSONDecodeError:
              logger.error("Failed to parse tool call arguments")

-         # If no tool call or invalid tool call, use the message content as next message
+         # If no tool call use the message content as next message
          message_content = message.content
          if message_content is None:
+             # If invalid tool call, raise an error
+             if message.tool_calls:
+                 raise Exception(f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}")
              raise Exception(f"No response from LLM: {response.__repr__()}")

          return message_content
@@ -1,15 +0,0 @@
- scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
- scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
- scenario/error_messages.py,sha256=8bTwG_iKz7FjGp50FU0anQ1fmI6eJE4NeaoXtiifbBg,2099
- scenario/pytest_plugin.py,sha256=ydtQxaN09qzoo12nNT8BQY_UPPHAt-AH92HWnPEN6bI,5212
- scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
- scenario/scenario.py,sha256=MqsyiNue1KC4mtvTHnJqJ6Fj3u0TTAdAYann8P8WBBQ,4010
- scenario/scenario_executor.py,sha256=bDzoatslbp80dG6DU-i2VUlOa9SMtyw2VIhcF7knwis,7883
- scenario/testing_agent.py,sha256=wMK2GqmN4QDr0kFoxgqcAPsU6gjCx8HBJQv1wmsdSb4,10683
- scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
- langwatch_scenario-0.1.1.dist-info/METADATA,sha256=SL8rtzUuSwBthrIfjiSLpPNxFt1kX8Vd1TzETBw1oys,7435
- langwatch_scenario-0.1.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
- langwatch_scenario-0.1.1.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
- langwatch_scenario-0.1.1.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
- langwatch_scenario-0.1.1.dist-info/RECORD,,