langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as published in their public registries.
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/METADATA +60 -12
- langwatch_scenario-0.3.0.dist-info/RECORD +16 -0
- scenario/__init__.py +13 -3
- scenario/config.py +18 -7
- scenario/error_messages.py +81 -23
- scenario/pytest_plugin.py +1 -1
- scenario/scenario.py +135 -20
- scenario/scenario_agent_adapter.py +16 -0
- scenario/scenario_executor.py +405 -143
- scenario/testing_agent.py +75 -58
- scenario/types.py +96 -0
- scenario/utils.py +148 -5
- langwatch_scenario-0.2.0.dist-info/RECORD +0 -15
- scenario/result.py +0 -74
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/WHEEL +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/top_level.txt +0 -0
scenario/testing_agent.py
CHANGED
@@ -5,25 +5,23 @@ TestingAgent module: defines the testing agent that interacts with the agent und
 import json
 import logging
 import re
-from typing import
-from pydantic import BaseModel
+from typing import Optional, Type, cast
 
 from litellm import Choices, completion
 from litellm.files.main import ModelResponse
 
 from scenario.cache import scenario_cache
-from scenario.
+from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+from scenario.utils import reverse_roles
 
-from .
-
-if TYPE_CHECKING:
-    from scenario.scenario import Scenario
+from .error_messages import testing_agent_not_configured_error_message
+from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole, ScenarioResult
 
 
 logger = logging.getLogger("scenario")
 
 
-class TestingAgent(BaseModel):
+class TestingAgent(ScenarioAgentAdapter):
     """
     The Testing Agent that interacts with the agent under test.
 
@@ -33,7 +31,9 @@ class TestingAgent(BaseModel):
     3. Determining when to end the test and return a result
     """
 
-
+    roles = {ScenarioAgentRole.USER, ScenarioAgentRole.JUDGE}
+
+    model: str = ""
     api_key: Optional[str] = None
     temperature: float = 0.0
     max_tokens: Optional[int] = None
@@ -41,14 +41,36 @@ class TestingAgent(BaseModel):
     # To prevent pytest from thinking this is actually a test class
     __test__ = False
 
+    def __init__(self, input: AgentInput):
+        super().__init__(input)
+
+        if not self.model:
+            raise Exception(testing_agent_not_configured_error_message)
+
+    @classmethod
+    def with_config(
+        cls,
+        model: str,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+    ) -> Type["TestingAgent"]:
+        class TestingAgentWithConfig(cls):
+            def __init__(self, input: AgentInput):
+                self.model = model
+                self.api_key = api_key
+                self.temperature = temperature
+                self.max_tokens = max_tokens
+
+                super().__init__(input)
+
+        return TestingAgentWithConfig
+
     @scenario_cache(ignore=["scenario"])
-    def
+    async def call(
         self,
-
-
-        first_message: bool = False,
-        last_message: bool = False,
-    ) -> Union[str, ScenarioResult]:
+        input: AgentInput,
+    ) -> AgentReturnTypes:
         """
         Generate the next message in the conversation based on history OR
         return a ScenarioResult if the test should conclude.
@@ -58,6 +80,8 @@ class TestingAgent(BaseModel):
         - A ScenarioResult (if the test should conclude)
         """
 
+        scenario = input.scenario_state.scenario
+
         messages = [
             {
                 "role": "system",
@@ -94,10 +118,15 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
 """,
             },
             {"role": "assistant", "content": "Hello, how can I help you today?"},
-            *
+            *input.messages,
         ]
 
-
+        is_first_message = len(input.messages) == 0
+        is_last_message = (
+            input.scenario_state.current_turn == input.scenario_state.scenario.max_turns
+        )
+
+        if is_last_message:
             messages.append(
                 {
                     "role": "user",
@@ -115,23 +144,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         # User to assistant role reversal
         # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
         # super confused, and Claude 3.7 even starts throwing exceptions.
-
-            # Can't reverse tool calls
-            if not safe_attr_or_key(message, "content") or safe_attr_or_key(
-                message, "tool_calls"
-            ):
-                continue
-
-            if type(message) == dict:
-                if message["role"] == "user":
-                    message["role"] = "assistant"
-                elif message["role"] == "assistant":
-                    message["role"] = "user"
-            else:
-                if getattr(message, "role", None) == "user":
-                    message.role = "assistant"
-                elif getattr(message, "role", None) == "assistant":
-                    message.role = "user"
+        messages = reverse_roles(messages)
 
         # Define the tool
         criteria_names = [
@@ -182,6 +195,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
             }
         ]
 
+        enforce_judgment = input.requested_role == ScenarioAgentRole.JUDGE
+        has_criteria = len(scenario.criteria) > 0
+
+        if enforce_judgment and not has_criteria:
+            return ScenarioResult(
+                success=False,
+                messages=[],
+                reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+            )
+
         response = cast(
             ModelResponse,
             completion(
@@ -189,8 +212,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 messages=messages,
                 temperature=self.temperature,
                 max_tokens=self.max_tokens,
-                tools=
-
+                tools=(
+                    tools
+                    if (not is_first_message or enforce_judgment) and has_criteria
+                    else None
+                ),
+                tool_choice=(
+                    "required"
+                    if (is_last_message or enforce_judgment) and has_criteria
+                    else None
+                ),
             ),
         )
 
@@ -221,27 +252,13 @@ if you don't have enough information to make a verdict, say inconclusive with ma
             ]
 
             # Return the appropriate ScenarioResult based on the verdict
-
-
-
-
-
-
-
-                return ScenarioResult.failure_result(
-                    conversation=conversation,
-                    reasoning=reasoning,
-                    passed_criteria=passed_criteria,
-                    failed_criteria=failed_criteria,
-                )
-            else: # inconclusive
-                return ScenarioResult(
-                    success=False,
-                    conversation=conversation,
-                    reasoning=reasoning,
-                    passed_criteria=passed_criteria,
-                    failed_criteria=failed_criteria,
-                )
+            return ScenarioResult(
+                success=verdict == "success",
+                messages=messages,
+                reasoning=reasoning,
+                passed_criteria=passed_criteria,
+                failed_criteria=failed_criteria,
+            )
         except json.JSONDecodeError:
             logger.error("Failed to parse tool call arguments")
 
@@ -255,7 +272,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 )
                 raise Exception(f"No response from LLM: {response.__repr__()}")
 
-            return message_content
+            return {"role": "user", "content": message_content}
         else:
             raise Exception(
                 f"Unexpected response format from LLM: {response.__repr__()}"
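For orientation, a minimal sketch of how the reworked TestingAgent would now be configured and invoked; the model name, import path, and the surrounding call site are illustrative assumptions, not part of this diff:

# Sketch only: with_config() bakes the LLM settings into a subclass, which the
# scenario executor can later instantiate with an AgentInput it builds itself.
from scenario.testing_agent import TestingAgent

ConfiguredTestingAgent = TestingAgent.with_config(
    model="openai/gpt-4o-mini",  # assumed model name; any litellm model string
    temperature=0.0,
)

# Later, with an AgentInput `agent_input` provided by the executor:
# agent = ConfiguredTestingAgent(agent_input)
# result = await agent.call(agent_input)  # str, message(s), or ScenarioResult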
scenario/types.py
ADDED
@@ -0,0 +1,96 @@
+from enum import Enum
+from pydantic import BaseModel, Field, SkipValidation
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Awaitable,
+    Callable,
+    Coroutine,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMessageParam
+
+# Prevent circular imports + Pydantic breaking
+if TYPE_CHECKING:
+    from scenario.scenario_executor import ScenarioExecutor
+
+    ScenarioExecutorType = ScenarioExecutor
+else:
+    ScenarioExecutorType = Any
+
+
+class ScenarioAgentRole(Enum):
+    USER = "User"
+    AGENT = "Agent"
+    JUDGE = "Judge"
+
+
+class AgentInput(BaseModel):
+    thread_id: str
+    # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
+    messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+    new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+    context: Dict[str, Any]
+    requested_role: ScenarioAgentRole
+    scenario_state: ScenarioExecutorType = Field(exclude=True)
+
+    def last_new_user_message(self) -> ChatCompletionUserMessageParam:
+        user_messages = [m for m in self.new_messages if m["role"] == "user"]
+        if not user_messages:
+            raise ValueError(
+                "No new user messages found, did you mean to call the assistant twice? Perhaps change your adapter to use the full messages list instead."
+            )
+        return user_messages[-1]
+
+    def last_new_user_message_str(self) -> str:
+        content = self.last_new_user_message()["content"]
+        if type(content) != str:
+            raise ValueError(
+                f"Last user message is not a string: {content.__repr__()}. Please use the full messages list instead."
+            )
+        return content
+
+
+class ScenarioResult(BaseModel):
+    """
+    Represents the results of a scenario test run.
+
+    Attributes:
+        success: Whether the scenario passed
+        conversation: The conversation history
+        reasoning: Reasoning for the result
+        passed_criteria: List of criteria that were met
+        failed_criteria: List of criteria that were not met
+    """
+
+    success: bool
+    messages: List[ChatCompletionMessageParam]
+    reasoning: Optional[str] = None
+    passed_criteria: List[str] = []
+    failed_criteria: List[str] = []
+    total_time: Optional[float] = None
+    agent_time: Optional[float] = None
+
+    def __repr__(self) -> str:
+        """Provide a concise representation for debugging."""
+        status = "PASSED" if self.success else "FAILED"
+        return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
+
+
+AgentReturnTypes = Union[
+    str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], ScenarioResult
+]
+
+# TODO: remove the optional ScenarioResult return type from here, use events instead
+ScriptStep = Union[
+    Callable[["ScenarioExecutor"], None],
+    Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
+    # Async as well
+    Callable[["ScenarioExecutor"], Awaitable[None]],
+    Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
+]
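To illustrate the new types, here is a minimal, hypothetical adapter for an agent under test; the class name and echo logic are invented, only AgentInput, AgentReturnTypes, ScenarioAgentRole, and the ScenarioAgentAdapter base come from this release:

# Sketch only: an adapter that echoes the last user message back.
from scenario.scenario_agent_adapter import ScenarioAgentAdapter
from scenario.types import AgentInput, AgentReturnTypes, ScenarioAgentRole


class EchoAgentAdapter(ScenarioAgentAdapter):
    roles = {ScenarioAgentRole.AGENT}  # assumed role set for an agent under test

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Raises if the newest user message is not plain text
        user_text = input.last_new_user_message_str()
        # AgentReturnTypes also allows a bare str, a list of messages, or a ScenarioResult
        return {"role": "assistant", "content": f"You said: {user_text}"}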
scenario/utils.py
CHANGED
@@ -1,6 +1,16 @@
 from contextlib import contextmanager
 import sys
-from typing import
+from typing import (
+    Any,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Union,
+    TypeVar,
+    Awaitable,
+    cast,
+)
 from pydantic import BaseModel
 
 import json
@@ -14,12 +24,18 @@ from rich.console import Console
 from rich.text import Text
 from rich.errors import LiveError
 
+from scenario.error_messages import message_return_error_message
+from scenario.types import AgentReturnTypes, ScenarioResult
+
+T = TypeVar("T")
 
 
 class SerializableAndPydanticEncoder(json.JSONEncoder):
     def default(self, o):
         if isinstance(o, BaseModel):
             return o.model_dump(exclude_unset=True)
+        if isinstance(o, Iterator):
+            return list(o)
         return super().default(o)
 
 
@@ -46,7 +62,9 @@ def title_case(string):
     return " ".join(word.capitalize() for word in string.split("_"))
 
 
-def print_openai_messages(
+def print_openai_messages(
+    scenario_name: str, messages: list[ChatCompletionMessageParam]
+):
     for msg in messages:
         role = safe_attr_or_key(msg, "role")
         content = safe_attr_or_key(msg, "content")
@@ -61,9 +79,12 @@ def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessa
                     args = safe_attr_or_key(function, "arguments", "{}")
                     args = _take_maybe_json_first_lines(args)
                     print(
-                        scenario_name
+                        scenario_name
+                        + termcolor.colored(f"ToolCall({name}):", "magenta"),
                         f"\n\n{indent(args, ' ' * 4)}\n",
                     )
+        elif role == "user":
+            print(scenario_name + termcolor.colored("User:", "green"), content)
         elif role == "tool":
             content = _take_maybe_json_first_lines(content or msg.__repr__())
             print(
@@ -91,9 +112,12 @@ def _take_maybe_json_first_lines(string, max_lines=5):
 
 console = Console()
 
+
 class TextFirstSpinner(Spinner):
     def __init__(self, name, text: str, color: str, **kwargs):
-        super().__init__(
+        super().__init__(
+            name, "", style="bold white", **kwargs
+        ) # Initialize with empty text
         self.text_before = text
         self.color = color
 
@@ -105,7 +129,9 @@ class TextFirstSpinner(Spinner):
 
 
 @contextmanager
-def show_spinner(
+def show_spinner(
+    text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None
+):
     if not enabled:
         yield
     else:
@@ -119,3 +145,120 @@ def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool,
 
         # Cursor up one line
         sys.stdout.write("\033[F")
+        # Erase the line
+        sys.stdout.write("\033[2K")
+
+
+def check_valid_return_type(return_value: Any, class_name: str) -> None:
+    def _is_valid_openai_message(message: Any) -> bool:
+        return (isinstance(message, dict) and "role" in message) or (
+            isinstance(message, BaseModel) and hasattr(message, "role")
+        )
+
+    if (
+        isinstance(return_value, str)
+        or _is_valid_openai_message(return_value)
+        or (
+            isinstance(return_value, list)
+            and all(_is_valid_openai_message(message) for message in return_value)
+        )
+        or isinstance(return_value, ScenarioResult)
+    ):
+        try:
+            json.dumps(return_value, cls=SerializableAndPydanticEncoder)
+        except:
+            raise ValueError(
+                message_return_error_message(got=return_value, class_name=class_name)
+            )
+
+        return
+
+    raise ValueError(
+        message_return_error_message(got=return_value, class_name=class_name)
+    )
+
+
+def convert_agent_return_types_to_openai_messages(
+    agent_response: AgentReturnTypes, role: Literal["user", "assistant"]
+) -> List[ChatCompletionMessageParam]:
+    if isinstance(agent_response, ScenarioResult):
+        raise ValueError(
+            "Unexpectedly tried to convert a ScenarioResult to openai messages",
+            agent_response.__repr__(),
+        )
+
+    def convert_maybe_object_to_openai_message(
+        obj: Any,
+    ) -> ChatCompletionMessageParam:
+        if isinstance(obj, dict):
+            return cast(ChatCompletionMessageParam, obj)
+        elif isinstance(obj, BaseModel):
+            return cast(
+                ChatCompletionMessageParam,
+                obj.model_dump(
+                    exclude_unset=True,
+                    exclude_none=True,
+                    exclude_defaults=True,
+                ),
+            )
+        else:
+            raise ValueError(f"Unexpected agent response type: {type(obj).__name__}")
+
+    def ensure_dict(
+        obj: T,
+    ) -> T:
+        return json.loads(json.dumps(obj, cls=SerializableAndPydanticEncoder))
+
+    if isinstance(agent_response, str):
+        return [
+            (
+                {"role": "user", "content": agent_response}
+                if role == "user"
+                else {"role": "assistant", "content": agent_response}
+            )
+        ]
+    elif isinstance(agent_response, list):
+        return [
+            ensure_dict(convert_maybe_object_to_openai_message(message))
+            for message in agent_response
+        ]
+    else:
+        return [ensure_dict(convert_maybe_object_to_openai_message(agent_response))]
+
+
+def reverse_roles(
+    messages: list[ChatCompletionMessageParam],
+) -> list[ChatCompletionMessageParam]:
+    """
+    Reverses the roles of the messages in the list.
+
+    Args:
+        messages: The list of messages to reverse the roles of.
+    """
+
+    for message in messages.copy():
+        # Can't reverse tool calls
+        if not safe_attr_or_key(message, "content") or safe_attr_or_key(
+            message, "tool_calls"
+        ):
+            continue
+
+        if type(message) == dict:
+            if message["role"] == "user":
+                message["role"] = "assistant"
+            elif message["role"] == "assistant":
+                message["role"] = "user"
+        else:
+            if getattr(message, "role", None) == "user":
+                message.role = "assistant" # type: ignore
+            elif getattr(message, "role", None) == "assistant":
+                message.role = "user" # type: ignore
+
+    return messages
+
+
+async def await_if_awaitable(value: T) -> T:
+    if isinstance(value, Awaitable):
+        return await value
+    else:
+        return value
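A quick illustration of the two helpers that testing_agent.py now relies on; the sample messages are made up, and the expected results follow directly from the code above:

from scenario.utils import convert_agent_return_types_to_openai_messages, reverse_roles

# A bare string is wrapped into a single message with the requested role:
convert_agent_return_types_to_openai_messages("I need a refund", role="user")
# -> [{"role": "user", "content": "I need a refund"}]

# reverse_roles() swaps user/assistant on each message (tool calls are left alone),
# which is what TestingAgent now calls instead of doing the swap inline:
reverse_roles([{"role": "user", "content": "hi"}, {"role": "assistant", "content": "ok"}])
# -> [{"role": "assistant", "content": "hi"}, {"role": "user", "content": "ok"}]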
langwatch_scenario-0.2.0.dist-info/RECORD
DELETED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
-scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
-scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
-scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
-scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
-scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
-langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
-langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/result.py
DELETED
@@ -1,74 +0,0 @@
-"""
-Result module: defines the class for scenario test results.
-"""
-
-from dataclasses import dataclass, field
-from typing import List, Dict, Optional
-
-
-@dataclass
-class ScenarioResult:
-    """
-    Represents the results of a scenario test run.
-
-    Attributes:
-        success: Whether the scenario passed
-        conversation: The conversation history
-        reasoning: Reasoning for the result
-        passed_criteria: List of criteria that were met
-        failed_criteria: List of criteria that were not met
-    """
-
-    success: bool
-    conversation: List[Dict[str, str]]
-    reasoning: Optional[str] = None
-    passed_criteria: List[str] = field(default_factory=list)
-    failed_criteria: List[str] = field(default_factory=list)
-    total_time: Optional[float] = None
-    agent_time: Optional[float] = None
-
-    def __post_init__(self) -> None:
-        """Validate the result after initialization."""
-        if not self.success and not self.reasoning:
-            raise ValueError("Failed scenarios must have a reasoning")
-
-    @classmethod
-    def success_result(
-        cls,
-        conversation: List[Dict[str, str]],
-        reasoning: Optional[str],
-        passed_criteria: List[str],
-        total_time: Optional[float] = None,
-        agent_time: Optional[float] = None,
-    ) -> "ScenarioResult":
-        """Create a successful result."""
-        return cls(
-            success=True,
-            conversation=conversation,
-            reasoning=reasoning,
-            passed_criteria=passed_criteria,
-            failed_criteria=[],
-            total_time=total_time,
-            agent_time=agent_time,
-        )
-
-    @classmethod
-    def failure_result(
-        cls,
-        conversation: List[Dict[str, str]],
-        reasoning: str,
-        passed_criteria: Optional[List[str]] = None,
-        failed_criteria: Optional[List[str]] = None,
-        total_time: Optional[float] = None,
-        agent_time: Optional[float] = None,
-    ) -> "ScenarioResult":
-        """Create a failed result."""
-        return cls(
-            success=False,
-            conversation=conversation,
-            reasoning=reasoning,
-            passed_criteria=passed_criteria if passed_criteria is not None else [],
-            failed_criteria=failed_criteria if failed_criteria is not None else [],
-            total_time=total_time,
-            agent_time=agent_time,
-        )
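Callers that previously built results through the removed dataclass helpers would now construct the pydantic ScenarioResult from scenario/types.py directly, as testing_agent.py does above; the values below are illustrative only:

# 0.2.0 (removed):
#   from scenario.result import ScenarioResult
#   result = ScenarioResult.failure_result(conversation=[...], reasoning="...")

# 0.3.0:
from scenario.types import ScenarioResult

result = ScenarioResult(
    success=False,
    messages=[{"role": "user", "content": "hi"}],  # replaces the old `conversation` field
    reasoning="Agent never answered the question",
)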
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/WHEEL
File without changes
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/entry_points.txt
File without changes
{langwatch_scenario-0.2.0.dist-info → langwatch_scenario-0.3.0.dist-info}/top_level.txt
File without changes