langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scenario/testing_agent.py CHANGED
@@ -5,25 +5,23 @@ TestingAgent module: defines the testing agent that interacts with the agent und
  import json
  import logging
  import re
- from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
- from pydantic import BaseModel
+ from typing import Optional, Type, cast

  from litellm import Choices, completion
  from litellm.files.main import ModelResponse

  from scenario.cache import scenario_cache
- from scenario.utils import safe_attr_or_key
+ from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+ from scenario.utils import reverse_roles

- from .result import ScenarioResult
-
- if TYPE_CHECKING:
- from scenario.scenario import Scenario
+ from .error_messages import testing_agent_not_configured_error_message
+ from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole, ScenarioResult


  logger = logging.getLogger("scenario")


- class TestingAgent(BaseModel):
+ class TestingAgent(ScenarioAgentAdapter):
  """
  The Testing Agent that interacts with the agent under test.

@@ -33,7 +31,9 @@ class TestingAgent(BaseModel):
  3. Determining when to end the test and return a result
  """

- model: str
+ roles = {ScenarioAgentRole.USER, ScenarioAgentRole.JUDGE}
+
+ model: str = ""
  api_key: Optional[str] = None
  temperature: float = 0.0
  max_tokens: Optional[int] = None
@@ -41,14 +41,36 @@ class TestingAgent(BaseModel):
  # To prevent pytest from thinking this is actually a test class
  __test__ = False

+ def __init__(self, input: AgentInput):
+ super().__init__(input)
+
+ if not self.model:
+ raise Exception(testing_agent_not_configured_error_message)
+
+ @classmethod
+ def with_config(
+ cls,
+ model: str,
+ api_key: Optional[str] = None,
+ temperature: float = 0.0,
+ max_tokens: Optional[int] = None,
+ ) -> Type["TestingAgent"]:
+ class TestingAgentWithConfig(cls):
+ def __init__(self, input: AgentInput):
+ self.model = model
+ self.api_key = api_key
+ self.temperature = temperature
+ self.max_tokens = max_tokens
+
+ super().__init__(input)
+
+ return TestingAgentWithConfig
+
  @scenario_cache(ignore=["scenario"])
- def generate_next_message(
+ async def call(
  self,
- scenario: "Scenario",
- conversation: List[Dict[str, Any]],
- first_message: bool = False,
- last_message: bool = False,
- ) -> Union[str, ScenarioResult]:
+ input: AgentInput,
+ ) -> AgentReturnTypes:
  """
  Generate the next message in the conversation based on history OR
  return a ScenarioResult if the test should conclude.
@@ -58,6 +80,8 @@ class TestingAgent(BaseModel):
  - A ScenarioResult (if the test should conclude)
  """

+ scenario = input.scenario_state.scenario
+
  messages = [
  {
  "role": "system",
@@ -94,10 +118,15 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
  """,
  },
  {"role": "assistant", "content": "Hello, how can I help you today?"},
- *conversation,
+ *input.messages,
  ]

- if last_message:
+ is_first_message = len(input.messages) == 0
+ is_last_message = (
+ input.scenario_state.current_turn == input.scenario_state.scenario.max_turns
+ )
+
+ if is_last_message:
  messages.append(
  {
  "role": "user",
@@ -115,23 +144,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  # User to assistant role reversal
  # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
  # super confused, and Claude 3.7 even starts throwing exceptions.
- for message in messages:
- # Can't reverse tool calls
- if not safe_attr_or_key(message, "content") or safe_attr_or_key(
- message, "tool_calls"
- ):
- continue
-
- if type(message) == dict:
- if message["role"] == "user":
- message["role"] = "assistant"
- elif message["role"] == "assistant":
- message["role"] = "user"
- else:
- if getattr(message, "role", None) == "user":
- message.role = "assistant"
- elif getattr(message, "role", None) == "assistant":
- message.role = "user"
+ messages = reverse_roles(messages)

  # Define the tool
  criteria_names = [
@@ -182,6 +195,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  }
  ]

+ enforce_judgment = input.requested_role == ScenarioAgentRole.JUDGE
+ has_criteria = len(scenario.criteria) > 0
+
+ if enforce_judgment and not has_criteria:
+ return ScenarioResult(
+ success=False,
+ messages=[],
+ reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+ )
+
  response = cast(
  ModelResponse,
  completion(
@@ -189,8 +212,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  messages=messages,
  temperature=self.temperature,
  max_tokens=self.max_tokens,
- tools=tools if not first_message else None,
- tool_choice="required" if last_message else None,
+ tools=(
+ tools
+ if (not is_first_message or enforce_judgment) and has_criteria
+ else None
+ ),
+ tool_choice=(
+ "required"
+ if (is_last_message or enforce_judgment) and has_criteria
+ else None
+ ),
  ),
  )

@@ -221,27 +252,13 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  ]

  # Return the appropriate ScenarioResult based on the verdict
- if verdict == "success":
- return ScenarioResult.success_result(
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- )
- elif verdict == "failure":
- return ScenarioResult.failure_result(
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=failed_criteria,
- )
- else: # inconclusive
- return ScenarioResult(
- success=False,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=failed_criteria,
- )
+ return ScenarioResult(
+ success=verdict == "success",
+ messages=messages,
+ reasoning=reasoning,
+ passed_criteria=passed_criteria,
+ failed_criteria=failed_criteria,
+ )
  except json.JSONDecodeError:
  logger.error("Failed to parse tool call arguments")

@@ -255,7 +272,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  )
  raise Exception(f"No response from LLM: {response.__repr__()}")

- return message_content
+ return {"role": "user", "content": message_content}
  else:
  raise Exception(
  f"Unexpected response format from LLM: {response.__repr__()}"
scenario/types.py ADDED
@@ -0,0 +1,96 @@
+ from enum import Enum
+ from pydantic import BaseModel, Field, SkipValidation
+ from typing import (
+ TYPE_CHECKING,
+ Annotated,
+ Any,
+ Awaitable,
+ Callable,
+ Coroutine,
+ Dict,
+ List,
+ Optional,
+ Union,
+ )
+
+ from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMessageParam
+
+ # Prevent circular imports + Pydantic breaking
+ if TYPE_CHECKING:
+ from scenario.scenario_executor import ScenarioExecutor
+
+ ScenarioExecutorType = ScenarioExecutor
+ else:
+ ScenarioExecutorType = Any
+
+
+ class ScenarioAgentRole(Enum):
+ USER = "User"
+ AGENT = "Agent"
+ JUDGE = "Judge"
+
+
+ class AgentInput(BaseModel):
+ thread_id: str
+ # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
+ messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+ new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+ context: Dict[str, Any]
+ requested_role: ScenarioAgentRole
+ scenario_state: ScenarioExecutorType = Field(exclude=True)
+
+ def last_new_user_message(self) -> ChatCompletionUserMessageParam:
+ user_messages = [m for m in self.new_messages if m["role"] == "user"]
+ if not user_messages:
+ raise ValueError(
+ "No new user messages found, did you mean to call the assistant twice? Perhaps change your adapter to use the full messages list instead."
+ )
+ return user_messages[-1]
+
+ def last_new_user_message_str(self) -> str:
+ content = self.last_new_user_message()["content"]
+ if type(content) != str:
+ raise ValueError(
+ f"Last user message is not a string: {content.__repr__()}. Please use the full messages list instead."
+ )
+ return content
+
+
+ class ScenarioResult(BaseModel):
+ """
+ Represents the results of a scenario test run.
+
+ Attributes:
+ success: Whether the scenario passed
+ conversation: The conversation history
+ reasoning: Reasoning for the result
+ passed_criteria: List of criteria that were met
+ failed_criteria: List of criteria that were not met
+ """
+
+ success: bool
+ messages: List[ChatCompletionMessageParam]
+ reasoning: Optional[str] = None
+ passed_criteria: List[str] = []
+ failed_criteria: List[str] = []
+ total_time: Optional[float] = None
+ agent_time: Optional[float] = None
+
+ def __repr__(self) -> str:
+ """Provide a concise representation for debugging."""
+ status = "PASSED" if self.success else "FAILED"
+ return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
+
+
+ AgentReturnTypes = Union[
+ str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], ScenarioResult
+ ]
+
+ # TODO: remove the optional ScenarioResult return type from here, use events instead
+ ScriptStep = Union[
+ Callable[["ScenarioExecutor"], None],
+ Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
+ # Async as well
+ Callable[["ScenarioExecutor"], Awaitable[None]],
+ Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
+ ]
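
These types define the contract for agent adapters: an adapter receives an AgentInput and may return any of the AgentReturnTypes. A hedged sketch of what a custom adapter could look like; the class name MyAgentAdapter and the roles declaration are illustrative, only the AgentInput/AgentReturnTypes usage and the call signature are taken from this diff:

from scenario.scenario_agent_adapter import ScenarioAgentAdapter
from scenario.types import AgentInput, AgentReturnTypes, ScenarioAgentRole


class MyAgentAdapter(ScenarioAgentAdapter):
    # Assumption: adapters declare the roles they can play, as TestingAgent does above.
    roles = {ScenarioAgentRole.AGENT}

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Any AgentReturnTypes value is accepted: a plain string, one OpenAI-format
        # message dict, a list of message dicts, or a full ScenarioResult.
        user_text = input.last_new_user_message_str()
        return {"role": "assistant", "content": f"You said: {user_text}"}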
scenario/utils.py CHANGED
@@ -1,6 +1,16 @@
  from contextlib import contextmanager
  import sys
- from typing import Optional, Union
+ from typing import (
+ Any,
+ Iterator,
+ List,
+ Literal,
+ Optional,
+ Union,
+ TypeVar,
+ Awaitable,
+ cast,
+ )
  from pydantic import BaseModel

  import json
@@ -14,12 +24,18 @@ from rich.console import Console
  from rich.text import Text
  from rich.errors import LiveError

+ from scenario.error_messages import message_return_error_message
+ from scenario.types import AgentReturnTypes, ScenarioResult
+
+ T = TypeVar("T")


  class SerializableAndPydanticEncoder(json.JSONEncoder):
  def default(self, o):
  if isinstance(o, BaseModel):
  return o.model_dump(exclude_unset=True)
+ if isinstance(o, Iterator):
+ return list(o)
  return super().default(o)


@@ -46,7 +62,9 @@ def title_case(string):
  return " ".join(word.capitalize() for word in string.split("_"))


- def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessageParam]):
+ def print_openai_messages(
+ scenario_name: str, messages: list[ChatCompletionMessageParam]
+ ):
  for msg in messages:
  role = safe_attr_or_key(msg, "role")
  content = safe_attr_or_key(msg, "content")
@@ -61,9 +79,12 @@ def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessa
  args = safe_attr_or_key(function, "arguments", "{}")
  args = _take_maybe_json_first_lines(args)
  print(
- scenario_name + termcolor.colored(f"ToolCall({name}):", "magenta"),
+ scenario_name
+ + termcolor.colored(f"ToolCall({name}):", "magenta"),
  f"\n\n{indent(args, ' ' * 4)}\n",
  )
+ elif role == "user":
+ print(scenario_name + termcolor.colored("User:", "green"), content)
  elif role == "tool":
  content = _take_maybe_json_first_lines(content or msg.__repr__())
  print(
@@ -91,9 +112,12 @@ def _take_maybe_json_first_lines(string, max_lines=5):

  console = Console()

+
  class TextFirstSpinner(Spinner):
  def __init__(self, name, text: str, color: str, **kwargs):
- super().__init__(name, "", style="bold white", **kwargs) # Initialize with empty text
+ super().__init__(
+ name, "", style="bold white", **kwargs
+ ) # Initialize with empty text
  self.text_before = text
  self.color = color

@@ -105,7 +129,9 @@ class TextFirstSpinner(Spinner):

  @contextmanager
- def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None):
+ def show_spinner(
+ text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None
+ ):
  if not enabled:
  yield
  else:
@@ -119,3 +145,120 @@ def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool,

  # Cursor up one line
  sys.stdout.write("\033[F")
+ # Erase the line
+ sys.stdout.write("\033[2K")
+
+
+ def check_valid_return_type(return_value: Any, class_name: str) -> None:
+ def _is_valid_openai_message(message: Any) -> bool:
+ return (isinstance(message, dict) and "role" in message) or (
+ isinstance(message, BaseModel) and hasattr(message, "role")
+ )
+
+ if (
+ isinstance(return_value, str)
+ or _is_valid_openai_message(return_value)
+ or (
+ isinstance(return_value, list)
+ and all(_is_valid_openai_message(message) for message in return_value)
+ )
+ or isinstance(return_value, ScenarioResult)
+ ):
+ try:
+ json.dumps(return_value, cls=SerializableAndPydanticEncoder)
+ except:
+ raise ValueError(
+ message_return_error_message(got=return_value, class_name=class_name)
+ )
+
+ return
+
+ raise ValueError(
+ message_return_error_message(got=return_value, class_name=class_name)
+ )
+
+
+ def convert_agent_return_types_to_openai_messages(
+ agent_response: AgentReturnTypes, role: Literal["user", "assistant"]
+ ) -> List[ChatCompletionMessageParam]:
+ if isinstance(agent_response, ScenarioResult):
+ raise ValueError(
+ "Unexpectedly tried to convert a ScenarioResult to openai messages",
+ agent_response.__repr__(),
+ )
+
+ def convert_maybe_object_to_openai_message(
+ obj: Any,
+ ) -> ChatCompletionMessageParam:
+ if isinstance(obj, dict):
+ return cast(ChatCompletionMessageParam, obj)
+ elif isinstance(obj, BaseModel):
+ return cast(
+ ChatCompletionMessageParam,
+ obj.model_dump(
+ exclude_unset=True,
+ exclude_none=True,
+ exclude_defaults=True,
+ ),
+ )
+ else:
+ raise ValueError(f"Unexpected agent response type: {type(obj).__name__}")
+
+ def ensure_dict(
+ obj: T,
+ ) -> T:
+ return json.loads(json.dumps(obj, cls=SerializableAndPydanticEncoder))
+
+ if isinstance(agent_response, str):
+ return [
+ (
+ {"role": "user", "content": agent_response}
+ if role == "user"
+ else {"role": "assistant", "content": agent_response}
+ )
+ ]
+ elif isinstance(agent_response, list):
+ return [
+ ensure_dict(convert_maybe_object_to_openai_message(message))
+ for message in agent_response
+ ]
+ else:
+ return [ensure_dict(convert_maybe_object_to_openai_message(agent_response))]
+
+
+ def reverse_roles(
+ messages: list[ChatCompletionMessageParam],
+ ) -> list[ChatCompletionMessageParam]:
+ """
+ Reverses the roles of the messages in the list.
+
+ Args:
+ messages: The list of messages to reverse the roles of.
+ """
+
+ for message in messages.copy():
+ # Can't reverse tool calls
+ if not safe_attr_or_key(message, "content") or safe_attr_or_key(
+ message, "tool_calls"
+ ):
+ continue
+
+ if type(message) == dict:
+ if message["role"] == "user":
+ message["role"] = "assistant"
+ elif message["role"] == "assistant":
+ message["role"] = "user"
+ else:
+ if getattr(message, "role", None) == "user":
+ message.role = "assistant" # type: ignore
+ elif getattr(message, "role", None) == "assistant":
+ message.role = "user" # type: ignore
+
+ return messages
+
+
+ async def await_if_awaitable(value: T) -> T:
+ if isinstance(value, Awaitable):
+ return await value
+ else:
+ return value
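
For reference, a quick sketch of how the two new helpers used by TestingAgent behave; the message contents are made up and the example assumes the package is installed:

from scenario.utils import convert_agent_return_types_to_openai_messages, reverse_roles

# A bare string is normalized into a single OpenAI-format message for the given role.
convert_agent_return_types_to_openai_messages("hi there", role="assistant")
# -> [{'role': 'assistant', 'content': 'hi there'}]

# reverse_roles swaps user/assistant (tool-call messages are skipped); this is what
# TestingAgent.call now uses instead of its old inline loop.
reverse_roles([{"role": "user", "content": "hello"}])
# -> [{'role': 'assistant', 'content': 'hello'}]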
langwatch_scenario-0.2.0.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
- scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
- scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
- scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
- scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
- scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
- scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
- scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
- scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
- langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
- langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
- langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
- langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/result.py DELETED
@@ -1,74 +0,0 @@
- """
- Result module: defines the class for scenario test results.
- """
-
- from dataclasses import dataclass, field
- from typing import List, Dict, Optional
-
-
- @dataclass
- class ScenarioResult:
- """
- Represents the results of a scenario test run.
-
- Attributes:
- success: Whether the scenario passed
- conversation: The conversation history
- reasoning: Reasoning for the result
- passed_criteria: List of criteria that were met
- failed_criteria: List of criteria that were not met
- """
-
- success: bool
- conversation: List[Dict[str, str]]
- reasoning: Optional[str] = None
- passed_criteria: List[str] = field(default_factory=list)
- failed_criteria: List[str] = field(default_factory=list)
- total_time: Optional[float] = None
- agent_time: Optional[float] = None
-
- def __post_init__(self) -> None:
- """Validate the result after initialization."""
- if not self.success and not self.reasoning:
- raise ValueError("Failed scenarios must have a reasoning")
-
- @classmethod
- def success_result(
- cls,
- conversation: List[Dict[str, str]],
- reasoning: Optional[str],
- passed_criteria: List[str],
- total_time: Optional[float] = None,
- agent_time: Optional[float] = None,
- ) -> "ScenarioResult":
- """Create a successful result."""
- return cls(
- success=True,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=[],
- total_time=total_time,
- agent_time=agent_time,
- )
-
- @classmethod
- def failure_result(
- cls,
- conversation: List[Dict[str, str]],
- reasoning: str,
- passed_criteria: Optional[List[str]] = None,
- failed_criteria: Optional[List[str]] = None,
- total_time: Optional[float] = None,
- agent_time: Optional[float] = None,
- ) -> "ScenarioResult":
- """Create a failed result."""
- return cls(
- success=False,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria if passed_criteria is not None else [],
- failed_criteria=failed_criteria if failed_criteria is not None else [],
- total_time=total_time,
- agent_time=agent_time,
- )
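
Migration note: the success_result/failure_result helpers have no direct replacement; callers now construct the pydantic ScenarioResult from scenario/types.py directly, passing OpenAI-format messages instead of the old conversation list of plain dicts, as TestingAgent.call does above. An illustrative sketch, with made-up field values:

from scenario.types import ScenarioResult

result = ScenarioResult(
    success=False,
    messages=[{"role": "assistant", "content": "I could not find that order."}],
    reasoning="Agent never asked for the order number",
    failed_criteria=["Agent asks for the order number"],
)
print(repr(result))
# ScenarioResult(success=False, status=FAILED, reasoning='Agent never asked for the order number')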