langwatch-scenario 0.1.2__py3-none-any.whl → 0.2.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: langwatch-scenario
- Version: 0.1.2
+ Version: 0.2.0
  Summary: The end-to-end agent testing library
  Author-email: LangWatch Team <support@langwatch.ai>
  License: MIT
@@ -45,6 +45,11 @@ You define the scenarios, and the testing agent will simulate your users as it f

  [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)

+ ### See also
+
+ - [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+ - [Scenario Go](https://github.com/langwatch/scenario-go/)
+
  ## Getting Started

  Install pytest and scenario:
@@ -72,18 +77,23 @@ async def test_vegetarian_recipe_agent():
  # Call your agent here
  return agent.run(message)

- # Define the scenario
+ # Define the simulated scenario
  scenario = Scenario(
- "User is looking for a dinner idea",
+ name="dinner idea",
+ description="""
+ It's saturday evening, the user is very hungry and tired,
+ but have no money to order out, so they are looking for a recipe.
+
+ The user never mentions they want a vegetarian recipe.
+ """,
  agent=vegetarian_recipe_agent,
- success_criteria=[
- "Recipe agent generates a vegetarian recipe",
- "Recipe includes a list of ingredients",
- "Recipe includes step-by-step cooking instructions",
- ],
- failure_criteria=[
- "The recipe is not vegetarian or includes meat",
- "The agent asks more than two follow-up questions",
+ # List the evaluation criteria for the scenario to be considered successful
+ criteria=[
+ "Agent should not ask more than two follow-up questions",
+ "Agent should generate a recipe",
+ "Recipe should include a list of ingredients",
+ "Recipe should include step-by-step cooking instructions",
+ "Recipe should be vegetarian and not include any sort of meat",
  ],
  )

@@ -111,9 +121,11 @@ class VegetarianRecipeAgent:
  messages=[
  {
  "role": "system",
- "content": """You are a vegetarian recipe agent.
- Given the user request, ask AT MOST ONE follow-up question,
- then provide a complete recipe. Keep your responses concise and focused.""",
+ "content": """
+ You are a vegetarian recipe agent.
+ Given the user request, ask AT MOST ONE follow-up question,
+ then provide a complete recipe. Keep your responses concise and focused.
+ """,
  },
  *self.history,
  ],
@@ -151,19 +163,20 @@ For example, in this Lovable Clone scenario test:

  ```python
  scenario = Scenario(
- "user wants to create a new landing page for their dog walking startup",
+ name="dog walking startup landing page",
+ description="""
+ the user wants to create a new landing page for their dog walking startup
+
+ send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+ """,
  agent=lovable_agent,
- strategy="send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict",
- success_criteria=[
+ criteria=[
  "agent reads the files before go and making changes",
- "agent modified the index.css file",
- "agent modified the Index.tsx file",
+ "agent modified the index.css file, not only the Index.tsx file",
  "agent created a comprehensive landing page",
  "agent extended the landing page with a new section",
- ],
- failure_criteria=[
- "agent says it can't read the file",
- "agent produces incomplete code or is too lazy to finish",
+ "agent should NOT say it can't read the file",
+ "agent should NOT produce incomplete code or be too lazy to finish",
  ],
  max_turns=5,
  )
@@ -175,7 +188,7 @@ You can find a fully working Lovable Clone example in [examples/test_lovable_clo

  ## Debug mode

- You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running.
+ You can enable debug mode by setting the `debug` field to `True` in the `Scenario.configure` method or in the specific scenario you are running, or by passing the `--debug` flag to pytest.

  Debug mode allows you to see the messages in slow motion step by step, and intervene with your own inputs to debug your agent from the middle of the conversation.

@@ -183,6 +196,12 @@ Debug mode allows you to see the messages in slow motion step by step, and inter
  Scenario.configure(testing_agent=TestingAgent(model="openai/gpt-4o-mini"), debug=True)
  ```

+ or
+
+ ```bash
+ pytest -s tests/test_vegetarian_recipe_agent.py --debug
+ ```
+
  ## Cache

  Each time the scenario runs, the testing agent might chose a different input to start, this is good to make sure it covers the variance of real users as well, however we understand that the non-deterministic nature of it might make it less repeatable, costly and harder to debug. To solve for it, you can use the `cache_key` field in the `Scenario.configure` method or in the specific scenario you are running, this will make the testing agent give the same input for given the same scenario:
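The README's own cache example falls outside this hunk. As a minimal sketch of what the `cache_key` option described above might look like, based only on the `Scenario.configure` parameters visible in this diff (the import path and the key value are assumptions):

```python
from scenario import Scenario, TestingAgent  # assumed import path

# Pin the testing agent to a fixed cache key so it generates the same
# simulated user inputs for the same scenario across runs.
# The key value here is arbitrary and purely illustrative.
Scenario.configure(
    testing_agent=TestingAgent(model="openai/gpt-4o-mini"),
    cache_key="dinner-idea-v1",
)
```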
@@ -0,0 +1,15 @@
+ scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+ scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+ scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+ scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+ scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
+ scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
+ scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
+ scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
+ scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
+ scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+ langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
+ langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+ langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+ langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+ langwatch_scenario-0.2.0.dist-info/RECORD,,
@@ -1,5 +1,5 @@
  Wheel-Version: 1.0
- Generator: setuptools (79.0.0)
+ Generator: setuptools (80.9.0)
  Root-Is-Purelib: true
  Tag: py3-none-any

scenario/pytest_plugin.py CHANGED
@@ -11,14 +11,16 @@ from scenario.result import ScenarioResult

  from .scenario import Scenario

+
  class ScenarioReporterResults(TypedDict):
  scenario: Scenario
  result: ScenarioResult

+
  # ScenarioReporter class definition moved outside the fixture for global use
  class ScenarioReporter:
  def __init__(self):
- self.results : list[ScenarioReporterResults] = []
+ self.results: list[ScenarioReporterResults] = []

  def add_result(self, scenario, result):
  """Add a test result to the reporter."""
@@ -80,28 +82,33 @@ class ScenarioReporter:
  time = f" in {result.total_time:.2f}s (agent: {result.agent_time:.2f}s)"

  print(
- f"\n{idx}. {scenario.description} - {colored(status, status_color, attrs=['bold'])}{time}"
+ f"\n{idx}. {scenario.name} - {colored(status, status_color, attrs=['bold'])}{time}"
  )

- print(colored(f" Reasoning: {result.reasoning}", "green" if result.success else "red"))
+ print(
+ colored(
+ f" Reasoning: {result.reasoning}",
+ "green" if result.success else "red",
+ )
+ )

- if hasattr(result, "met_criteria") and result.met_criteria:
- criteria_count = len(result.met_criteria)
- total_criteria = len(scenario.success_criteria)
+ if hasattr(result, "passed_criteria") and result.passed_criteria:
+ criteria_count = len(result.passed_criteria)
+ total_criteria = len(scenario.criteria)
  criteria_color = (
  "green" if criteria_count == total_criteria else "yellow"
  )
  print(
  colored(
- f" Success Criteria: {criteria_count}/{total_criteria}",
+ f" Passed Criteria: {criteria_count}/{total_criteria}",
  criteria_color,
  )
  )

- if hasattr(result, "triggered_failures") and result.triggered_failures:
+ if hasattr(result, "failed_criteria") and result.failed_criteria:
  print(
  colored(
- f" Failures Criteria: {len(result.triggered_failures)}",
+ f" Failed Criteria: {len(result.failed_criteria)}",
  "red",
  )
  )
@@ -119,6 +126,10 @@ def pytest_configure(config):
  "markers", "agent_test: mark test as an agent scenario test"
  )

+ if config.getoption("--debug"):
+ print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
+ Scenario.configure(verbose=True, debug=True)
+
  # Create a global reporter instance
  config._scenario_reporter = ScenarioReporter()

@@ -128,7 +139,12 @@ def pytest_configure(config):
  result = await original_run(self, *args, **kwargs)

  # Always report to the global reporter
- config._scenario_reporter.add_result(self, result)
+ # Ensure the reporter exists before adding result
+ if hasattr(config, "_scenario_reporter"):
+ config._scenario_reporter.add_result(self, result)
+ else:
+ # Handle case where reporter might not be initialized (should not happen with current setup)
+ print(colored("Warning: Scenario reporter not found during run.", "yellow"))

  return result

scenario/result.py CHANGED
@@ -15,17 +15,15 @@ class ScenarioResult:
  success: Whether the scenario passed
  conversation: The conversation history
  reasoning: Reasoning for the result
- met_criteria: List of success criteria that were met
- unmet_criteria: List of success criteria that were not met
- triggered_failures: List of failure criteria that were triggered
+ passed_criteria: List of criteria that were met
+ failed_criteria: List of criteria that were not met
  """

  success: bool
  conversation: List[Dict[str, str]]
  reasoning: Optional[str] = None
- met_criteria: List[str] = field(default_factory=list)
- unmet_criteria: List[str] = field(default_factory=list)
- triggered_failures: List[str] = field(default_factory=list)
+ passed_criteria: List[str] = field(default_factory=list)
+ failed_criteria: List[str] = field(default_factory=list)
  total_time: Optional[float] = None
  agent_time: Optional[float] = None

@@ -39,7 +37,7 @@ class ScenarioResult:
  cls,
  conversation: List[Dict[str, str]],
  reasoning: Optional[str],
- met_criteria: List[str],
+ passed_criteria: List[str],
  total_time: Optional[float] = None,
  agent_time: Optional[float] = None,
  ) -> "ScenarioResult":
@@ -48,9 +46,8 @@
  success=True,
  conversation=conversation,
  reasoning=reasoning,
- met_criteria=met_criteria,
- unmet_criteria=[],
- triggered_failures=[],
+ passed_criteria=passed_criteria,
+ failed_criteria=[],
  total_time=total_time,
  agent_time=agent_time,
  )
@@ -60,9 +57,8 @@
  cls,
  conversation: List[Dict[str, str]],
  reasoning: str,
- met_criteria: Optional[List[str]] = None,
- unmet_criteria: Optional[List[str]] = None,
- triggered_failures: Optional[List[str]] = None,
+ passed_criteria: Optional[List[str]] = None,
+ failed_criteria: Optional[List[str]] = None,
  total_time: Optional[float] = None,
  agent_time: Optional[float] = None,
  ) -> "ScenarioResult":
@@ -71,11 +67,8 @@
  success=False,
  conversation=conversation,
  reasoning=reasoning,
- met_criteria=met_criteria if met_criteria is not None else [],
- unmet_criteria=unmet_criteria if unmet_criteria is not None else [],
- triggered_failures=(
- triggered_failures if triggered_failures is not None else []
- ),
+ passed_criteria=passed_criteria if passed_criteria is not None else [],
+ failed_criteria=failed_criteria if failed_criteria is not None else [],
  total_time=total_time,
  agent_time=agent_time,
  )
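To make the field rename concrete, here is a hedged sketch of how a result might be built with the 0.2.0 field names, using only the `failure_result` signature visible in the diff above (the conversation, reasoning, and criteria strings are illustrative):

```python
from scenario.result import ScenarioResult  # import path as used by pytest_plugin.py

# Illustrative only: a failing result under the renamed fields.
result = ScenarioResult.failure_result(
    conversation=[{"role": "user", "content": "I need a dinner idea"}],
    reasoning="A recipe was produced, but it contained chicken stock.",
    passed_criteria=["Agent should generate a recipe"],
    failed_criteria=["Recipe should be vegetarian and not include any sort of meat"],
)

# The old met_criteria / unmet_criteria / triggered_failures fields are gone.
assert not result.success and len(result.failed_criteria) == 1
```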
scenario/scenario.py CHANGED
@@ -15,6 +15,7 @@ from .testing_agent import TestingAgent

  from openai.types.chat import ChatCompletionMessageParam

+
  class AgentResult(TypedDict, total=False):
  message: str
  messages: List[ChatCompletionMessageParam]
@@ -27,34 +28,36 @@ class Scenario(ScenarioConfig):

  It includes:
  - A description of the scenario
- - Success criteria to determine if the agent behaved correctly
- - Failure criteria to determine if the agent failed
- - An optional strategy that guides the testing agent
+ - Criteria to determine if the agent behaved correctly
  - Optional additional parameters
  """

+ name: str
  description: str
  agent: Union[
  Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
  Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
  ]
- success_criteria: List[str]
- failure_criteria: List[str] = []
- strategy: Optional[str] = None
+ criteria: List[str]

- def __init__(self, description: str, **kwargs):
+ def __init__(self, name: str, description: str, **kwargs):
  """Validate scenario configuration after initialization."""

  default_config = getattr(Scenario, "default_config", None)
  if default_config:
  kwargs = {**default_config.model_dump(), **kwargs}

+ if not name:
+ raise ValueError("Scenario name cannot be empty")
+ kwargs["name"] = name
+
  if not description:
  raise ValueError("Scenario description cannot be empty")
  kwargs["description"] = description

- if not kwargs.get("success_criteria"):
- raise ValueError("Scenario must have at least one success criterion")
+ # TODO: allow not having any criteria, for scripted scenarios
+ if not kwargs.get("criteria"):
+ raise ValueError("Scenario must have at least one criteria")

  if kwargs.get("max_turns", 0) < 1:
  raise ValueError("max_turns must be a positive integer")
@@ -65,7 +68,6 @@ class Scenario(ScenarioConfig):

  super().__init__(**kwargs)

-
  async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
  """
  Run the scenario against the agent under test.
@@ -82,6 +84,7 @@ class Scenario(ScenarioConfig):
  # being used throughout, any user code on the callback can
  # be blocking, preventing them from running scenarios in parallel
  with concurrent.futures.ThreadPoolExecutor() as executor:
+
  def run_in_thread():
  loop = asyncio.new_event_loop()
  asyncio.set_event_loop(loop)
@@ -105,6 +108,7 @@ class Scenario(ScenarioConfig):
  max_turns: Optional[int] = None,
  verbose: Optional[Union[bool, int]] = None,
  cache_key: Optional[str] = None,
+ debug: Optional[bool] = None,
  ) -> None:
  existing_config = getattr(cls, "default_config", ScenarioConfig())

@@ -114,5 +118,6 @@ class Scenario(ScenarioConfig):
  max_turns=max_turns,
  verbose=verbose,
  cache_key=cache_key,
+ debug=debug,
  )
  )
scenario/scenario_executor.py CHANGED
@@ -199,6 +199,6 @@ class ScenarioExecutor:

  def _scenario_name(self):
  if self.scenario.verbose == 2:
- return termcolor.colored(f"[Scenario: {self.scenario.description}] ", "yellow")
+ return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
  else:
  return ""
scenario/testing_agent.py CHANGED
@@ -4,6 +4,7 @@ TestingAgent module: defines the testing agent that interacts with the agent und
  import json
  import logging

+ import re
  from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
  from pydantic import BaseModel

@@ -74,27 +75,19 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
  {scenario.description}
  </scenario>

- <strategy>
- {scenario.strategy or "Start with a first message and guide the conversation to play out the scenario."}
- </strategy>
-
- <success_criteria>
- {json.dumps(scenario.success_criteria, indent=2)}
- </success_criteria>
-
- <failure_criteria>
- {json.dumps(scenario.failure_criteria, indent=2)}
- </failure_criteria>
+ <criteria>
+ {"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
+ </criteria>

  <execution_flow>
  1. Generate the first message to start the scenario
  2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
- 3. If the test should end, use the finish_test tool to determine if success or failure criteria have been met
+ 3. If the test should end, use the finish_test tool to determine if all the criteria have been met
  </execution_flow>

  <rules>
- 1. Test should end immediately if a failure criteria is triggered
- 2. Test should continue until all success criteria have been met
+ 1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
+ 2. Test should continue until all scenario goals have been met to try going through all the criteria
  3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
  4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
  </rules>
@@ -141,6 +134,14 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  message.role = "user"

  # Define the tool
+ criteria_names = [
+ re.sub(
+ r"[^a-zA-Z0-9]",
+ "_",
+ criterion.replace(" ", "_").replace("'", "").lower(),
+ )[:70]
+ for criterion in scenario.criteria
+ ]
  tools = [
  {
  "type": "function",
@@ -151,40 +152,30 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  "parameters": {
  "type": "object",
  "properties": {
- "verdict": {
- "type": "string",
- "enum": ["success", "failure", "inconclusive"],
- "description": "The final verdict of the test",
- },
- "reasoning": {
- "type": "string",
- "description": "Explanation of why this verdict was reached",
- },
- "details": {
+ "criteria": {
  "type": "object",
  "properties": {
- "met_criteria": {
- "type": "array",
- "items": {"type": "string"},
- "description": "List of success criteria that have been met",
- },
- "unmet_criteria": {
- "type": "array",
- "items": {"type": "string"},
- "description": "List of success criteria that have not been met",
- },
- "triggered_failures": {
- "type": "array",
- "items": {"type": "string"},
- "description": "List of failure criteria that have been triggered",
- },
+ criteria_names[idx]: {
+ "enum": [True, False, "inconclusive"],
+ "description": criterion,
+ }
+ for idx, criterion in enumerate(scenario.criteria)
  },
- "required": ["met_criteria", "unmet_criteria", "triggered_failures"],
+ "required": criteria_names,
  "additionalProperties": False,
- "description": "Detailed information about criteria evaluation",
+ "description": "Strict verdict for each criterion",
+ },
+ "reasoning": {
+ "type": "string",
+ "description": "Explanation of what the final verdict should be",
+ },
+ "verdict": {
+ "type": "string",
+ "enum": ["success", "failure", "inconclusive"],
+ "description": "The final verdict of the test",
  },
  },
- "required": ["verdict", "reasoning", "details"],
+ "required": ["criteria", "reasoning", "verdict"],
  "additionalProperties": False,
  },
  },
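Under this schema, a `finish_test` call no longer reports lists of met/unmet criteria; it returns one verdict per sanitized criterion name plus an overall verdict. A hypothetical example of the arguments the testing agent might produce for the recipe scenario above (the specific values are invented for illustration; only the key names and allowed values come from the schema in this diff):

```python
# Hypothetical finish_test tool-call arguments under the 0.2.0 schema.
finish_test_args = {
    "criteria": {
        # One entry per criterion, keyed by its sanitized name: True, False, or "inconclusive".
        "agent_should_not_ask_more_than_two_follow_up_questions": True,
        "agent_should_generate_a_recipe": True,
        "recipe_should_be_vegetarian_and_not_include_any_sort_of_meat": False,
    },
    "reasoning": "A recipe was produced, but it contained chicken stock.",
    "verdict": "failure",
}
```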
@@ -216,35 +207,40 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  args = json.loads(tool_call.function.arguments)
  verdict = args.get("verdict", "inconclusive")
  reasoning = args.get("reasoning", "No reasoning provided")
- details = args.get("details", {})
-
- met_criteria = details.get("met_criteria", [])
- unmet_criteria = details.get("unmet_criteria", [])
- triggered_failures = details.get("triggered_failures", [])
+ criteria = args.get("criteria", {})
+
+ passed_criteria = [
+ scenario.criteria[idx]
+ for idx, criterion in enumerate(criteria.values())
+ if criterion == True
+ ]
+ failed_criteria = [
+ scenario.criteria[idx]
+ for idx, criterion in enumerate(criteria.values())
+ if criterion == False
+ ]

  # Return the appropriate ScenarioResult based on the verdict
  if verdict == "success":
  return ScenarioResult.success_result(
  conversation=conversation,
  reasoning=reasoning,
- met_criteria=met_criteria,
+ passed_criteria=passed_criteria,
  )
  elif verdict == "failure":
  return ScenarioResult.failure_result(
  conversation=conversation,
  reasoning=reasoning,
- met_criteria=met_criteria,
- unmet_criteria=unmet_criteria,
- triggered_failures=triggered_failures,
+ passed_criteria=passed_criteria,
+ failed_criteria=failed_criteria,
  )
  else: # inconclusive
  return ScenarioResult(
  success=False,
  conversation=conversation,
  reasoning=reasoning,
- met_criteria=met_criteria,
- unmet_criteria=unmet_criteria,
- triggered_failures=triggered_failures,
+ passed_criteria=passed_criteria,
+ failed_criteria=failed_criteria,
  )
  except json.JSONDecodeError:
  logger.error("Failed to parse tool call arguments")
@@ -254,7 +250,9 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  if message_content is None:
  # If invalid tool call, raise an error
  if message.tool_calls:
- raise Exception(f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}")
+ raise Exception(
+ f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
+ )
  raise Exception(f"No response from LLM: {response.__repr__()}")

  return message_content
@@ -262,4 +260,3 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  raise Exception(
  f"Unexpected response format from LLM: {response.__repr__()}"
  )
-
@@ -1,15 +0,0 @@
- scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
- scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
- scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
- scenario/pytest_plugin.py,sha256=ydtQxaN09qzoo12nNT8BQY_UPPHAt-AH92HWnPEN6bI,5212
- scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
- scenario/scenario.py,sha256=MqsyiNue1KC4mtvTHnJqJ6Fj3u0TTAdAYann8P8WBBQ,4010
- scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
- scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
- scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
- langwatch_scenario-0.1.2.dist-info/METADATA,sha256=La0j89kCoJpCriv3R8Sx5aqKgZy_iC-WNF-NqZzptfk,8684
- langwatch_scenario-0.1.2.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
- langwatch_scenario-0.1.2.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
- langwatch_scenario-0.1.2.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
- langwatch_scenario-0.1.2.dist-info/RECORD,,