langwatch-scenario 0.2.0__py3-none-any.whl → 0.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
langwatch_scenario-0.2.0.dist-info/METADATA DELETED
@@ -1,254 +0,0 @@
1
- Metadata-Version: 2.4
2
- Name: langwatch-scenario
3
- Version: 0.2.0
4
- Summary: The end-to-end agent testing library
5
- Author-email: LangWatch Team <support@langwatch.ai>
6
- License: MIT
7
- Project-URL: Homepage, https://github.com/langwatch/scenario
8
- Project-URL: Bug Tracker, https://github.com/langwatch/scenario/issues
9
- Classifier: Development Status :: 4 - Beta
10
- Classifier: Intended Audience :: Developers
11
- Classifier: License :: OSI Approved :: MIT License
12
- Classifier: Programming Language :: Python :: 3
13
- Classifier: Programming Language :: Python :: 3.8
14
- Classifier: Programming Language :: Python :: 3.9
15
- Classifier: Programming Language :: Python :: 3.10
16
- Classifier: Programming Language :: Python :: 3.11
17
- Requires-Python: >=3.9
18
- Description-Content-Type: text/markdown
19
- Requires-Dist: pytest>=8.1.1
20
- Requires-Dist: litellm>=1.49.0
21
- Requires-Dist: python-dotenv>=1.0.1
22
- Requires-Dist: termcolor>=2.4.0
23
- Requires-Dist: pydantic>=2.7.0
24
- Requires-Dist: joblib>=1.4.2
25
- Requires-Dist: wrapt>=1.17.2
26
- Requires-Dist: pytest-asyncio>=0.26.0
27
- Requires-Dist: rich<15.0.0,>=13.3.3
28
- Provides-Extra: dev
29
- Requires-Dist: black; extra == "dev"
30
- Requires-Dist: isort; extra == "dev"
31
- Requires-Dist: mypy; extra == "dev"
32
- Requires-Dist: pytest-cov; extra == "dev"
33
-
34
- ![scenario](https://github.com/langwatch/scenario/raw/main/assets/scenario-wide.webp)
35
-
36
- <div align="center">
37
- <!-- Discord, PyPI, Docs, etc links -->
38
- </div>
39
-
40
- # Scenario: Use an Agent to test your Agent
41
-
42
- Scenario is a library for testing agents end-to-end as a human would, but without having to manually do it. The automated testing agent covers every single scenario for you.
43
-
44
- You define the scenarios, and the testing agent will simulate your users as it follows them; it will keep chatting with and evaluating your agent until it reaches the desired goal or detects unexpected behavior.
45
-
46
- [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
47
-
48
- ### See also
49
-
50
- - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
51
- - [Scenario Go](https://github.com/langwatch/scenario-go/)
52
-
53
- ## Getting Started
54
-
55
- Install pytest and scenario:
56
-
57
- ```bash
58
- pip install pytest langwatch-scenario
59
- ```
60
-
61
- Now create your first scenario and save it as `tests/test_vegetarian_recipe_agent.py`:
62
-
63
- ```python
64
- import pytest
65
-
66
- from scenario import Scenario, TestingAgent, scenario_cache
67
-
68
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"))
69
-
70
-
71
- @pytest.mark.agent_test
72
- @pytest.mark.asyncio
73
- async def test_vegetarian_recipe_agent():
74
- agent = VegetarianRecipeAgent()
75
-
76
- def vegetarian_recipe_agent(message, context):
77
- # Call your agent here
78
- return agent.run(message)
79
-
80
- # Define the simulated scenario
81
- scenario = Scenario(
82
- name="dinner idea",
83
- description="""
84
- It's Saturday evening, the user is very hungry and tired,
85
- but has no money to order out, so they are looking for a recipe.
86
-
87
- The user never mentions they want a vegetarian recipe.
88
- """,
89
- agent=vegetarian_recipe_agent,
90
- # List the evaluation criteria for the scenario to be considered successful
91
- criteria=[
92
- "Agent should not ask more than two follow-up questions",
93
- "Agent should generate a recipe",
94
- "Recipe should include a list of ingredients",
95
- "Recipe should include step-by-step cooking instructions",
96
- "Recipe should be vegetarian and not include any sort of meat",
97
- ],
98
- )
99
-
100
- # Run the scenario and get results
101
- result = await scenario.run()
102
-
103
- # Assert for pytest to know whether the test passed
104
- assert result.success
105
-
106
-
107
- # Example agent implementation
108
- import litellm
109
-
110
-
111
- class VegetarianRecipeAgent:
112
- def __init__(self):
113
- self.history = []
114
-
115
- @scenario_cache()
116
- def run(self, message: str):
117
- self.history.append({"role": "user", "content": message})
118
-
119
- response = litellm.completion(
120
- model="openai/gpt-4o-mini",
121
- messages=[
122
- {
123
- "role": "system",
124
- "content": """
125
- You are a vegetarian recipe agent.
126
- Given the user request, ask AT MOST ONE follow-up question,
127
- then provide a complete recipe. Keep your responses concise and focused.
128
- """,
129
- },
130
- *self.history,
131
- ],
132
- )
133
- message = response.choices[0].message # type: ignore
134
- self.history.append(message)
135
-
136
- return {"messages": [message]}
137
-
138
- ```
139
-
140
- Create a `.env` file and put your OpenAI API key in it:
141
-
142
- ```bash
143
- OPENAI_API_KEY=<your-api-key>
144
- ```
145
-
146
- Now run it with pytest:
147
-
148
- ```bash
149
- pytest -s tests/test_vegetarian_recipe_agent.py
150
- ```
151
-
152
- This is how it will look:
153
-
154
- [![asciicast](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11.svg)](https://asciinema.org/a/nvO5GWGzqKTTCd8gtNSezQw11)
155
-
156
- You can find a fully working example in [examples/test_vegetarian_recipe_agent.py](examples/test_vegetarian_recipe_agent.py).
157
-
158
- ## Customize strategy and max_turns
159
-
160
- You can customize how the testing agent should go about testing by defining a `strategy` field. You can also limit the maximum number of turns the scenario will take by setting the `max_turns` field (it defaults to 10).
161
-
162
- For example, in this Lovable Clone scenario test:
163
-
164
- ```python
165
- scenario = Scenario(
166
- name="dog walking startup landing page",
167
- description="""
168
- the user wants to create a new landing page for their dog walking startup
169
-
170
- send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
171
- """,
172
- agent=lovable_agent,
173
- criteria=[
174
- "agent reads the files before go and making changes",
175
- "agent modified the index.css file, not only the Index.tsx file",
176
- "agent created a comprehensive landing page",
177
- "agent extended the landing page with a new section",
178
- "agent should NOT say it can't read the file",
179
- "agent should NOT produce incomplete code or be too lazy to finish",
180
- ],
181
- max_turns=5,
182
- )
183
-
184
- result = await scenario.run()
185
- ```
186
-
187
- You can find a fully working Lovable Clone example in [examples/test_lovable_clone.py](examples/test_lovable_clone.py).
188
-
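- The example above exercises `max_turns`; for `strategy`, here is a minimal sketch. It assumes `strategy` is passed as another top-level `Scenario` field, as the opening paragraph of this section suggests, and the strategy text itself is purely illustrative:
-
- ```python
- scenario = Scenario(
-     name="dog walking startup landing page",
-     description="the user wants to create a new landing page for their dog walking startup",
-     agent=lovable_agent,
-     criteria=["agent created a comprehensive landing page"],
-     # Illustrative only: free-form guidance for how the testing agent should behave
-     strategy="send short messages, one requirement at a time, and push back once if the result looks incomplete",
-     max_turns=5,
- )
- ```
-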
189
- ## Debug mode
190
-
191
- You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.
192
-
193
- Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.
194
-
195
- ```python
196
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
197
- ```
198
-
199
- or
200
-
201
- ```bash
202
- pytest -s tests/test_vegetarian_recipe_agent.py --debug
203
- ```
204
-
205
- ## Cache
206
-
207
- Each time the scenario runs, the testing agent might choose a different input to start. This is good for making sure it covers the variance of real users, but the non-deterministic nature of it can also make tests less repeatable, more costly, and harder to debug. To solve this, you can set the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running; this will make the testing agent give the same input when running the same scenario:
208
-
209
- ```python
210
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), cache_key="42")
211
- ```
212
-
213
- To bust the cache, you can simply pass a different `cache_key`, disable it, or delete the cache files located at `~/.scenario/cache`.
214
-
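- For example, wiping the cache directory mentioned above resets everything at once:
-
- ```bash
- rm -rf ~/.scenario/cache
- ```
-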
215
- To go a step further and fully cache the test end-to-end, you can also wrap the LLM calls or any other non-deterministic functions in your application code with the `@scenario_cache` decorator:
216
-
217
- ```python
218
- class MyAgent:
219
- @scenario_cache(ignore=["self"])
220
- def invoke(self, message, context):
221
- return client.chat.completions.create(
222
- # ...
223
- )
224
- ```
225
-
226
- This will cache any function call you decorate when running the tests, making them repeatable: calls are hashed by the function arguments, the scenario being executed, and the `cache_key` you provided. You can exclude arguments that should not be part of the cache key by naming them in the `ignore` argument.
227
-
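- For instance, a sketch of excluding an additional argument (the `request_id` parameter here is hypothetical, purely to illustrate `ignore`):
-
- ```python
- class MyAgent:
-     # "self" and the hypothetical per-run "request_id" are left out of the cache key,
-     # so only "message" and "context" determine whether a cached result is reused
-     @scenario_cache(ignore=["self", "request_id"])
-     def invoke(self, message, context, request_id=None):
-         return client.chat.completions.create(
-             # ...
-         )
- ```
-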
228
- ## Disable Output
229
-
230
- You can remove the `-s` flag from pytest to hide the output during the test run; it will then only be shown if a test fails. Alternatively, you can set `verbose=False` in the `Scenario.configure` method or in the specific scenario you are running.
231
-
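- For example, a minimal sketch of the configure call (`verbose` is one of the parameters accepted by `Scenario.configure`, as shown in `scenario/scenario.py` below):
-
- ```python
- Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), verbose=False)
- ```
-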
232
- ## Running in parallel
233
-
234
- As the number of your scenarios grows, you might want to run them in parallel to speed up your whole test suite. We suggest using the [pytest-asyncio-concurrent](https://pypi.org/project/pytest-asyncio-concurrent/) plugin to do so.
235
-
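- Install the plugin first (the package name is the PyPI project linked above):
-
- ```bash
- pip install pytest-asyncio-concurrent
- ```
-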
236
- Then replace the `@pytest.mark.asyncio` annotation in your tests with `@pytest.mark.asyncio_concurrent`, adding a group name to mark the group of scenarios that should run in parallel together, e.g.:
237
-
238
- ```python
239
- @pytest.mark.agent_test
240
- @pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
241
- async def test_vegetarian_recipe_agent():
242
- # ...
243
-
244
- @pytest.mark.agent_test
245
- @pytest.mark.asyncio_concurrent(group="vegetarian_recipe_agent")
246
- async def test_user_is_very_hungry():
247
- # ...
248
- ```
249
-
250
- Those two scenarios should now run in parallel.
251
-
252
- ## License
253
-
254
- MIT License
langwatch_scenario-0.2.0.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
1
- scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
2
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
3
- scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
4
- scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
5
- scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
6
- scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
7
- scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
8
- scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
9
- scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
10
- scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
11
- langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
12
- langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
13
- langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
14
- langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
15
- langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/result.py DELETED
@@ -1,74 +0,0 @@
1
- """
2
- Result module: defines the class for scenario test results.
3
- """
4
-
5
- from dataclasses import dataclass, field
6
- from typing import List, Dict, Optional
7
-
8
-
9
- @dataclass
10
- class ScenarioResult:
11
- """
12
- Represents the results of a scenario test run.
13
-
14
- Attributes:
15
- success: Whether the scenario passed
16
- conversation: The conversation history
17
- reasoning: Reasoning for the result
18
- passed_criteria: List of criteria that were met
19
- failed_criteria: List of criteria that were not met
- total_time: Optional total time taken by the scenario run
- agent_time: Optional time spent executing the agent under test
20
- """
21
-
22
- success: bool
23
- conversation: List[Dict[str, str]]
24
- reasoning: Optional[str] = None
25
- passed_criteria: List[str] = field(default_factory=list)
26
- failed_criteria: List[str] = field(default_factory=list)
27
- total_time: Optional[float] = None
28
- agent_time: Optional[float] = None
29
-
30
- def __post_init__(self) -> None:
31
- """Validate the result after initialization."""
32
- if not self.success and not self.reasoning:
33
- raise ValueError("Failed scenarios must have a reasoning")
34
-
35
- @classmethod
36
- def success_result(
37
- cls,
38
- conversation: List[Dict[str, str]],
39
- reasoning: Optional[str],
40
- passed_criteria: List[str],
41
- total_time: Optional[float] = None,
42
- agent_time: Optional[float] = None,
43
- ) -> "ScenarioResult":
44
- """Create a successful result."""
45
- return cls(
46
- success=True,
47
- conversation=conversation,
48
- reasoning=reasoning,
49
- passed_criteria=passed_criteria,
50
- failed_criteria=[],
51
- total_time=total_time,
52
- agent_time=agent_time,
53
- )
54
-
55
- @classmethod
56
- def failure_result(
57
- cls,
58
- conversation: List[Dict[str, str]],
59
- reasoning: str,
60
- passed_criteria: Optional[List[str]] = None,
61
- failed_criteria: Optional[List[str]] = None,
62
- total_time: Optional[float] = None,
63
- agent_time: Optional[float] = None,
64
- ) -> "ScenarioResult":
65
- """Create a failed result."""
66
- return cls(
67
- success=False,
68
- conversation=conversation,
69
- reasoning=reasoning,
70
- passed_criteria=passed_criteria if passed_criteria is not None else [],
71
- failed_criteria=failed_criteria if failed_criteria is not None else [],
72
- total_time=total_time,
73
- agent_time=agent_time,
74
- )
scenario/scenario.py DELETED
@@ -1,123 +0,0 @@
1
- """
2
- Scenario module: defines the core Scenario class for agent testing.
3
- """
4
-
5
- from typing import Awaitable, List, Dict, Any, Optional, Callable, TypedDict, Union
6
- import asyncio
7
- import concurrent.futures
8
- from functools import partial
9
-
10
- from scenario.config import ScenarioConfig
11
- from scenario.scenario_executor import ScenarioExecutor
12
-
13
- from .result import ScenarioResult
14
- from .testing_agent import TestingAgent
15
-
16
- from openai.types.chat import ChatCompletionMessageParam
17
-
18
-
19
- class AgentResult(TypedDict, total=False):
20
- message: str
21
- messages: List[ChatCompletionMessageParam]
22
- extra: Dict[str, Any]
23
-
24
-
25
- class Scenario(ScenarioConfig):
26
- """
27
- A scenario represents a specific testing case for an agent.
28
-
29
- It includes:
30
- - A description of the scenario
31
- - Criteria to determine if the agent behaved correctly
32
- - Optional additional parameters
33
- """
34
-
35
- name: str
36
- description: str
37
- agent: Union[
38
- Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
39
- Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
40
- ]
41
- criteria: List[str]
42
-
43
- def __init__(self, name: str, description: str, **kwargs):
44
- """Validate scenario configuration after initialization."""
45
-
46
- default_config = getattr(Scenario, "default_config", None)
47
- if default_config:
48
- kwargs = {**default_config.model_dump(), **kwargs}
49
-
50
- if not name:
51
- raise ValueError("Scenario name cannot be empty")
52
- kwargs["name"] = name
53
-
54
- if not description:
55
- raise ValueError("Scenario description cannot be empty")
56
- kwargs["description"] = description
57
-
58
- # TODO: allow not having any criteria, for scripted scenarios
59
- if not kwargs.get("criteria"):
60
- raise ValueError("Scenario must have at least one criteria")
61
-
62
- if kwargs.get("max_turns", 0) < 1:
63
- raise ValueError("max_turns must be a positive integer")
64
-
65
- # Ensure agent is callable
66
- if not callable(kwargs.get("agent")):
67
- raise ValueError("Agent must be a callable function")
68
-
69
- super().__init__(**kwargs)
70
-
71
- async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
72
- """
73
- Run the scenario against the agent under test.
74
-
75
- Args:
76
- context: Optional initial context for the agent
77
-
78
- Returns:
79
- ScenarioResult containing the test outcome
80
- """
81
-
82
- # We'll use a thread pool to run the execution logic, we
83
- # require a separate thread because even though asyncio is
84
- # being used throughout, any user code on the callback can
85
- # be blocking, preventing them from running scenarios in parallel
86
- with concurrent.futures.ThreadPoolExecutor() as executor:
87
-
88
- def run_in_thread():
89
- loop = asyncio.new_event_loop()
90
- asyncio.set_event_loop(loop)
91
-
92
- try:
93
- return loop.run_until_complete(ScenarioExecutor(self).run(context))
94
- finally:
95
- loop.close()
96
-
97
- # Run the function in the thread pool and await its result
98
- # This converts the thread's execution into a Future that the current
99
- # event loop can await without blocking
100
- loop = asyncio.get_event_loop()
101
- result = await loop.run_in_executor(executor, run_in_thread)
102
- return result
103
-
104
- @classmethod
105
- def configure(
106
- cls,
107
- testing_agent: Optional[TestingAgent] = None,
108
- max_turns: Optional[int] = None,
109
- verbose: Optional[Union[bool, int]] = None,
110
- cache_key: Optional[str] = None,
111
- debug: Optional[bool] = None,
112
- ) -> None:
113
- existing_config = getattr(cls, "default_config", ScenarioConfig())
114
-
115
- cls.default_config = existing_config.merge(
116
- ScenarioConfig(
117
- testing_agent=testing_agent,
118
- max_turns=max_turns,
119
- verbose=verbose,
120
- cache_key=cache_key,
121
- debug=debug,
122
- )
123
- )