langwatch-scenario 0.1.3__py3-none-any.whl → 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.3.0.dist-info}/METADATA +95 -34
- langwatch_scenario-0.3.0.dist-info/RECORD +16 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.3.0.dist-info}/WHEEL +1 -1
- scenario/__init__.py +13 -3
- scenario/config.py +18 -7
- scenario/error_messages.py +81 -23
- scenario/pytest_plugin.py +8 -8
- scenario/scenario.py +144 -26
- scenario/scenario_agent_adapter.py +16 -0
- scenario/scenario_executor.py +405 -143
- scenario/testing_agent.py +123 -109
- scenario/types.py +96 -0
- scenario/utils.py +148 -5
- langwatch_scenario-0.1.3.dist-info/RECORD +0 -15
- scenario/result.py +0 -81
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.3.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.3.0.dist-info}/top_level.txt +0 -0
scenario/testing_agent.py
CHANGED
@@ -4,25 +4,24 @@ TestingAgent module: defines the testing agent that interacts with the agent und
 
 import json
 import logging
-…
-from …
+import re
+from typing import Optional, Type, cast
 
 from litellm import Choices, completion
 from litellm.files.main import ModelResponse
 
 from scenario.cache import scenario_cache
-from scenario.…
+from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+from scenario.utils import reverse_roles
 
-from .…
-…
-if TYPE_CHECKING:
-    from scenario.scenario import Scenario
+from .error_messages import testing_agent_not_configured_error_message
+from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole, ScenarioResult
 
 
 logger = logging.getLogger("scenario")
 
 
-class TestingAgent(BaseModel):
+class TestingAgent(ScenarioAgentAdapter):
     """
     The Testing Agent that interacts with the agent under test.
 
@@ -32,7 +31,9 @@ class TestingAgent(BaseModel):
     3. Determining when to end the test and return a result
     """
 
-    …
+    roles = {ScenarioAgentRole.USER, ScenarioAgentRole.JUDGE}
+
+    model: str = ""
     api_key: Optional[str] = None
     temperature: float = 0.0
     max_tokens: Optional[int] = None
@@ -40,14 +41,36 @@ class TestingAgent(BaseModel):
     # To prevent pytest from thinking this is actually a test class
     __test__ = False
 
+    def __init__(self, input: AgentInput):
+        super().__init__(input)
+
+        if not self.model:
+            raise Exception(testing_agent_not_configured_error_message)
+
+    @classmethod
+    def with_config(
+        cls,
+        model: str,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+    ) -> Type["TestingAgent"]:
+        class TestingAgentWithConfig(cls):
+            def __init__(self, input: AgentInput):
+                self.model = model
+                self.api_key = api_key
+                self.temperature = temperature
+                self.max_tokens = max_tokens
+
+                super().__init__(input)
+
+        return TestingAgentWithConfig
+
     @scenario_cache(ignore=["scenario"])
-    def …
+    async def call(
         self,
-        …
-        …
-        first_message: bool = False,
-        last_message: bool = False,
-    ) -> Union[str, ScenarioResult]:
+        input: AgentInput,
+    ) -> AgentReturnTypes:
         """
         Generate the next message in the conversation based on history OR
         return a ScenarioResult if the test should conclude.
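
Worth calling out in the hunk above: `with_config` returns a configured subclass rather than an instance, so a test can hand the class to a scenario and let the executor instantiate it with an `AgentInput` per run. A minimal usage sketch (the model id is illustrative, not taken from the diff):

    from scenario.testing_agent import TestingAgent

    # with_config returns a new class, pre-bound to this configuration
    ConfiguredTestingAgent = TestingAgent.with_config(
        model="openai/gpt-4o-mini",  # assumption: any litellm-compatible model id
        temperature=0.0,
    )

    # The executor later instantiates it, roughly: ConfiguredTestingAgent(input);
    # __init__ raises the not-configured error message if model is left empty
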
@@ -57,6 +80,8 @@ class TestingAgent(BaseModel):
         - A ScenarioResult (if the test should conclude)
         """
 
+        scenario = input.scenario_state.scenario
+
         messages = [
             {
                 "role": "system",
@@ -74,37 +99,34 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
 {scenario.description}
 </scenario>
 
-<…
-{…
-</…
-
-<success_criteria>
-{json.dumps(scenario.success_criteria, indent=2)}
-</success_criteria>
-
-<failure_criteria>
-{json.dumps(scenario.failure_criteria, indent=2)}
-</failure_criteria>
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
+</criteria>
 
 <execution_flow>
 1. Generate the first message to start the scenario
 2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
-3. If the test should end, use the finish_test tool to determine if …
+3. If the test should end, use the finish_test tool to determine if all the criteria have been met
 </execution_flow>
 
 <rules>
-1. Test should end immediately if a …
-2. Test should continue until all …
+1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
+2. Test should continue until all scenario goals have been met to try going through all the criteria
 3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
 4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
 </rules>
 """,
             },
             {"role": "assistant", "content": "Hello, how can I help you today?"},
-            *…
+            *input.messages,
         ]
 
-        …
+        is_first_message = len(input.messages) == 0
+        is_last_message = (
+            input.scenario_state.current_turn == input.scenario_state.scenario.max_turns
+        )
+
+        if is_last_message:
             messages.append(
                 {
                     "role": "user",
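
For a concrete sense of the new `<criteria>` block: the join numbers each criterion on its own line (note that the backslash inside the f-string expression, as written in the diff, requires Python 3.12+). With two made-up criteria:

    criteria = [
        "Agent should propose a vegetarian recipe",
        "Agent should NOT ask more than two follow-up questions",
    ]

    print("\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(criteria)]))
    # 1. Agent should propose a vegetarian recipe
    # 2. Agent should NOT ask more than two follow-up questions
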
@@ -122,25 +144,17 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         # User to assistant role reversal
         # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
         # super confused, and Claude 3.7 even starts throwing exceptions.
-        …
-            # Can't reverse tool calls
-            if not safe_attr_or_key(message, "content") or safe_attr_or_key(
-                message, "tool_calls"
-            ):
-                continue
-
-            if type(message) == dict:
-                if message["role"] == "user":
-                    message["role"] = "assistant"
-                elif message["role"] == "assistant":
-                    message["role"] = "user"
-            else:
-                if getattr(message, "role", None) == "user":
-                    message.role = "assistant"
-                elif getattr(message, "role", None) == "assistant":
-                    message.role = "user"
+        messages = reverse_roles(messages)
 
         # Define the tool
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in scenario.criteria
+        ]
         tools = [
             {
                 "type": "function",
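
The `criteria_names` comprehension turns each human-readable criterion into a JSON-schema-friendly property name: apostrophes dropped, everything lowercased, non-alphanumerics collapsed to underscores, capped at 70 characters. Using the illustrative criterion from earlier:

    import re

    criterion = "Agent should NOT ask more than two follow-up questions"
    name = re.sub(
        r"[^a-zA-Z0-9]",
        "_",
        criterion.replace(" ", "_").replace("'", "").lower(),
    )[:70]
    print(name)  # agent_should_not_ask_more_than_two_follow_up_questions
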
@@ -151,46 +165,46 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                     "parameters": {
                         "type": "object",
                         "properties": {
-                            "verdict": {
-                                "type": "string",
-                                "enum": ["success", "failure", "inconclusive"],
-                                "description": "The final verdict of the test",
-                            },
-                            "reasoning": {
-                                "type": "string",
-                                "description": "Explanation of why this verdict was reached",
-                            },
-                            "details": {
+                            "criteria": {
                                 "type": "object",
                                 "properties": {
-                                    …
-                                    "…
-                                    "…
-                                    …
-                                    …
-                                    "unmet_criteria": {
-                                        "type": "array",
-                                        "items": {"type": "string"},
-                                        "description": "List of success criteria that have not been met",
-                                    },
-                                    "triggered_failures": {
-                                        "type": "array",
-                                        "items": {"type": "string"},
-                                        "description": "List of failure criteria that have been triggered",
-                                    },
+                                    criteria_names[idx]: {
+                                        "enum": [True, False, "inconclusive"],
+                                        "description": criterion,
+                                    }
+                                    for idx, criterion in enumerate(scenario.criteria)
                                 },
-                                "required": …
+                                "required": criteria_names,
                                 "additionalProperties": False,
-                                "description": "…
+                                "description": "Strict verdict for each criterion",
+                            },
+                            "reasoning": {
+                                "type": "string",
+                                "description": "Explanation of what the final verdict should be",
+                            },
+                            "verdict": {
+                                "type": "string",
+                                "enum": ["success", "failure", "inconclusive"],
+                                "description": "The final verdict of the test",
                             },
                         },
-                        "required": ["…
+                        "required": ["criteria", "reasoning", "verdict"],
                         "additionalProperties": False,
                     },
                 },
             }
         ]
 
+        enforce_judgment = input.requested_role == ScenarioAgentRole.JUDGE
+        has_criteria = len(scenario.criteria) > 0
+
+        if enforce_judgment and not has_criteria:
+            return ScenarioResult(
+                success=False,
+                messages=[],
+                reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+            )
+
         response = cast(
             ModelResponse,
             completion(
|
|
198
212
|
messages=messages,
|
199
213
|
temperature=self.temperature,
|
200
214
|
max_tokens=self.max_tokens,
|
201
|
-
tools=
|
202
|
-
|
215
|
+
tools=(
|
216
|
+
tools
|
217
|
+
if (not is_first_message or enforce_judgment) and has_criteria
|
218
|
+
else None
|
219
|
+
),
|
220
|
+
tool_choice=(
|
221
|
+
"required"
|
222
|
+
if (is_last_message or enforce_judgment) and has_criteria
|
223
|
+
else None
|
224
|
+
),
|
203
225
|
),
|
204
226
|
)
|
205
227
|
|
@@ -216,36 +238,27 @@ if you don't have enough information to make a verdict, say inconclusive with ma
|
|
216
238
|
args = json.loads(tool_call.function.arguments)
|
217
239
|
verdict = args.get("verdict", "inconclusive")
|
218
240
|
reasoning = args.get("reasoning", "No reasoning provided")
|
219
|
-
|
220
|
-
|
221
|
-
|
222
|
-
|
223
|
-
|
241
|
+
criteria = args.get("criteria", {})
|
242
|
+
|
243
|
+
passed_criteria = [
|
244
|
+
scenario.criteria[idx]
|
245
|
+
for idx, criterion in enumerate(criteria.values())
|
246
|
+
if criterion == True
|
247
|
+
]
|
248
|
+
failed_criteria = [
|
249
|
+
scenario.criteria[idx]
|
250
|
+
for idx, criterion in enumerate(criteria.values())
|
251
|
+
if criterion == False
|
252
|
+
]
|
224
253
|
|
225
254
|
# Return the appropriate ScenarioResult based on the verdict
|
226
|
-
|
227
|
-
|
228
|
-
|
229
|
-
|
230
|
-
|
231
|
-
|
232
|
-
|
233
|
-
return ScenarioResult.failure_result(
|
234
|
-
conversation=conversation,
|
235
|
-
reasoning=reasoning,
|
236
|
-
met_criteria=met_criteria,
|
237
|
-
unmet_criteria=unmet_criteria,
|
238
|
-
triggered_failures=triggered_failures,
|
239
|
-
)
|
240
|
-
else: # inconclusive
|
241
|
-
return ScenarioResult(
|
242
|
-
success=False,
|
243
|
-
conversation=conversation,
|
244
|
-
reasoning=reasoning,
|
245
|
-
met_criteria=met_criteria,
|
246
|
-
unmet_criteria=unmet_criteria,
|
247
|
-
triggered_failures=triggered_failures,
|
248
|
-
)
|
255
|
+
return ScenarioResult(
|
256
|
+
success=verdict == "success",
|
257
|
+
messages=messages,
|
258
|
+
reasoning=reasoning,
|
259
|
+
passed_criteria=passed_criteria,
|
260
|
+
failed_criteria=failed_criteria,
|
261
|
+
)
|
249
262
|
except json.JSONDecodeError:
|
250
263
|
logger.error("Failed to parse tool call arguments")
|
251
264
|
|
@@ -254,12 +267,13 @@ if you don't have enough information to make a verdict, say inconclusive with ma
|
|
254
267
|
if message_content is None:
|
255
268
|
# If invalid tool call, raise an error
|
256
269
|
if message.tool_calls:
|
257
|
-
raise Exception(
|
270
|
+
raise Exception(
|
271
|
+
f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
|
272
|
+
)
|
258
273
|
raise Exception(f"No response from LLM: {response.__repr__()}")
|
259
274
|
|
260
|
-
return message_content
|
275
|
+
return {"role": "user", "content": message_content}
|
261
276
|
else:
|
262
277
|
raise Exception(
|
263
278
|
f"Unexpected response format from LLM: {response.__repr__()}"
|
264
279
|
)
|
265
|
-
|
scenario/types.py
ADDED
@@ -0,0 +1,96 @@
+from enum import Enum
+from pydantic import BaseModel, Field, SkipValidation
+from typing import (
+    TYPE_CHECKING,
+    Annotated,
+    Any,
+    Awaitable,
+    Callable,
+    Coroutine,
+    Dict,
+    List,
+    Optional,
+    Union,
+)
+
+from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMessageParam
+
+# Prevent circular imports + Pydantic breaking
+if TYPE_CHECKING:
+    from scenario.scenario_executor import ScenarioExecutor
+
+    ScenarioExecutorType = ScenarioExecutor
+else:
+    ScenarioExecutorType = Any
+
+
+class ScenarioAgentRole(Enum):
+    USER = "User"
+    AGENT = "Agent"
+    JUDGE = "Judge"
+
+
+class AgentInput(BaseModel):
+    thread_id: str
+    # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
+    messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+    new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+    context: Dict[str, Any]
+    requested_role: ScenarioAgentRole
+    scenario_state: ScenarioExecutorType = Field(exclude=True)
+
+    def last_new_user_message(self) -> ChatCompletionUserMessageParam:
+        user_messages = [m for m in self.new_messages if m["role"] == "user"]
+        if not user_messages:
+            raise ValueError(
+                "No new user messages found, did you mean to call the assistant twice? Perhaps change your adapter to use the full messages list instead."
+            )
+        return user_messages[-1]
+
+    def last_new_user_message_str(self) -> str:
+        content = self.last_new_user_message()["content"]
+        if type(content) != str:
+            raise ValueError(
+                f"Last user message is not a string: {content.__repr__()}. Please use the full messages list instead."
+            )
+        return content
+
+
|
+
class ScenarioResult(BaseModel):
|
60
|
+
"""
|
61
|
+
Represents the results of a scenario test run.
|
62
|
+
|
63
|
+
Attributes:
|
64
|
+
success: Whether the scenario passed
|
65
|
+
conversation: The conversation history
|
66
|
+
reasoning: Reasoning for the result
|
67
|
+
passed_criteria: List of criteria that were met
|
68
|
+
failed_criteria: List of criteria that were not met
|
69
|
+
"""
|
70
|
+
|
71
|
+
success: bool
|
72
|
+
messages: List[ChatCompletionMessageParam]
|
73
|
+
reasoning: Optional[str] = None
|
74
|
+
passed_criteria: List[str] = []
|
75
|
+
failed_criteria: List[str] = []
|
76
|
+
total_time: Optional[float] = None
|
77
|
+
agent_time: Optional[float] = None
|
78
|
+
|
79
|
+
def __repr__(self) -> str:
|
80
|
+
"""Provide a concise representation for debugging."""
|
81
|
+
status = "PASSED" if self.success else "FAILED"
|
82
|
+
return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
|
83
|
+
|
84
|
+
|
85
|
+
AgentReturnTypes = Union[
|
86
|
+
str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], ScenarioResult
|
87
|
+
]
|
88
|
+
|
89
|
+
# TODO: remove the optional ScenarioResult return type from here, use events instead
|
90
|
+
ScriptStep = Union[
|
91
|
+
Callable[["ScenarioExecutor"], None],
|
92
|
+
Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
|
93
|
+
# Async as well
|
94
|
+
Callable[["ScenarioExecutor"], Awaitable[None]],
|
95
|
+
Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
|
96
|
+
]
|
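
A `ScriptStep` is just a callable over the executor, sync or async, that can optionally short-circuit the run by returning a `ScenarioResult`. A sketch under assumptions: `state.messages` is an assumed executor attribute, since only `current_turn` is visible elsewhere in this diff:

    from typing import Optional

    from scenario.types import ScenarioResult


    def fail_if_too_long(state) -> Optional[ScenarioResult]:
        if state.current_turn > 10:
            return ScenarioResult(
                success=False,
                messages=state.messages,  # assumed attribute, for illustration
                reasoning="Conversation exceeded 10 turns",
            )
        return None  # returning None lets the scenario continue
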
scenario/utils.py
CHANGED
@@ -1,6 +1,16 @@
 from contextlib import contextmanager
 import sys
-from typing import …
+from typing import (
+    Any,
+    Iterator,
+    List,
+    Literal,
+    Optional,
+    Union,
+    TypeVar,
+    Awaitable,
+    cast,
+)
 from pydantic import BaseModel
 
 import json
@@ -14,12 +24,18 @@ from rich.console import Console
 from rich.text import Text
 from rich.errors import LiveError
 
+from scenario.error_messages import message_return_error_message
+from scenario.types import AgentReturnTypes, ScenarioResult
+
+T = TypeVar("T")
 
 
 class SerializableAndPydanticEncoder(json.JSONEncoder):
     def default(self, o):
         if isinstance(o, BaseModel):
             return o.model_dump(exclude_unset=True)
+        if isinstance(o, Iterator):
+            return list(o)
         return super().default(o)
 
 
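
The encoder change means generators and other iterators are now materialized before serialization, which matters when cached or logged agent responses contain streamed content. A quick sketch:

    import json

    from pydantic import BaseModel

    from scenario.utils import SerializableAndPydanticEncoder


    class Point(BaseModel):
        x: int
        y: int


    # BaseModel instances go through model_dump; generators become lists:
    payload = {"p": Point(x=1, y=2), "gen": (n * n for n in range(3))}
    print(json.dumps(payload, cls=SerializableAndPydanticEncoder))
    # {"p": {"x": 1, "y": 2}, "gen": [0, 1, 4]}
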
@@ -46,7 +62,9 @@ def title_case(string):
     return " ".join(word.capitalize() for word in string.split("_"))
 
 
-def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessageParam]):
+def print_openai_messages(
+    scenario_name: str, messages: list[ChatCompletionMessageParam]
+):
     for msg in messages:
         role = safe_attr_or_key(msg, "role")
         content = safe_attr_or_key(msg, "content")
@@ -61,9 +79,12 @@ def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessa
             args = safe_attr_or_key(function, "arguments", "{}")
             args = _take_maybe_json_first_lines(args)
             print(
-                scenario_name + termcolor.colored(f"ToolCall({name}):", "magenta"),
+                scenario_name
+                + termcolor.colored(f"ToolCall({name}):", "magenta"),
                 f"\n\n{indent(args, ' ' * 4)}\n",
             )
+        elif role == "user":
+            print(scenario_name + termcolor.colored("User:", "green"), content)
         elif role == "tool":
             content = _take_maybe_json_first_lines(content or msg.__repr__())
             print(
@@ -91,9 +112,12 @@ def _take_maybe_json_first_lines(string, max_lines=5):
 
 console = Console()
 
+
 class TextFirstSpinner(Spinner):
     def __init__(self, name, text: str, color: str, **kwargs):
-        super().__init__(name, "", style="bold white", **kwargs)  # Initialize with empty text
+        super().__init__(
+            name, "", style="bold white", **kwargs
+        )  # Initialize with empty text
         self.text_before = text
         self.color = color
@@ -105,7 +129,9 @@ class TextFirstSpinner(Spinner):
 
 
 @contextmanager
-def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None):
+def show_spinner(
+    text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None
+):
     if not enabled:
         yield
     else:
@@ -119,3 +145,120 @@ def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool,
 
     # Cursor up one line
     sys.stdout.write("\033[F")
+    # Erase the line
+    sys.stdout.write("\033[2K")
+
+
+def check_valid_return_type(return_value: Any, class_name: str) -> None:
+    def _is_valid_openai_message(message: Any) -> bool:
+        return (isinstance(message, dict) and "role" in message) or (
+            isinstance(message, BaseModel) and hasattr(message, "role")
+        )
+
+    if (
+        isinstance(return_value, str)
+        or _is_valid_openai_message(return_value)
+        or (
+            isinstance(return_value, list)
+            and all(_is_valid_openai_message(message) for message in return_value)
+        )
+        or isinstance(return_value, ScenarioResult)
+    ):
+        try:
+            json.dumps(return_value, cls=SerializableAndPydanticEncoder)
+        except:
+            raise ValueError(
+                message_return_error_message(got=return_value, class_name=class_name)
+            )
+
+        return
+
+    raise ValueError(
+        message_return_error_message(got=return_value, class_name=class_name)
+    )
+
+
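
Presumably this is the guard the executor runs over adapter return values: anything that is not a str, an OpenAI-style message (or a list of them), or a `ScenarioResult`, or that fails to serialize, is rejected with the shared error message. For instance:

    from scenario.utils import check_valid_return_type

    check_valid_return_type("plain text reply", "MyAgentAdapter")  # ok
    check_valid_return_type({"role": "assistant", "content": "hi"}, "MyAgentAdapter")  # ok

    try:
        check_valid_return_type(42, "MyAgentAdapter")
    except ValueError as err:
        print(err)  # message_return_error_message(...) explaining the accepted shapes
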
+
def convert_agent_return_types_to_openai_messages(
|
182
|
+
agent_response: AgentReturnTypes, role: Literal["user", "assistant"]
|
183
|
+
) -> List[ChatCompletionMessageParam]:
|
184
|
+
if isinstance(agent_response, ScenarioResult):
|
185
|
+
raise ValueError(
|
186
|
+
"Unexpectedly tried to convert a ScenarioResult to openai messages",
|
187
|
+
agent_response.__repr__(),
|
188
|
+
)
|
189
|
+
|
190
|
+
def convert_maybe_object_to_openai_message(
|
191
|
+
obj: Any,
|
192
|
+
) -> ChatCompletionMessageParam:
|
193
|
+
if isinstance(obj, dict):
|
194
|
+
return cast(ChatCompletionMessageParam, obj)
|
195
|
+
elif isinstance(obj, BaseModel):
|
196
|
+
return cast(
|
197
|
+
ChatCompletionMessageParam,
|
198
|
+
obj.model_dump(
|
199
|
+
exclude_unset=True,
|
200
|
+
exclude_none=True,
|
201
|
+
exclude_defaults=True,
|
202
|
+
),
|
203
|
+
)
|
204
|
+
else:
|
205
|
+
raise ValueError(f"Unexpected agent response type: {type(obj).__name__}")
|
206
|
+
|
207
|
+
def ensure_dict(
|
208
|
+
obj: T,
|
209
|
+
) -> T:
|
210
|
+
return json.loads(json.dumps(obj, cls=SerializableAndPydanticEncoder))
|
211
|
+
|
212
|
+
if isinstance(agent_response, str):
|
213
|
+
return [
|
214
|
+
(
|
215
|
+
{"role": "user", "content": agent_response}
|
216
|
+
if role == "user"
|
217
|
+
else {"role": "assistant", "content": agent_response}
|
218
|
+
)
|
219
|
+
]
|
220
|
+
elif isinstance(agent_response, list):
|
221
|
+
return [
|
222
|
+
ensure_dict(convert_maybe_object_to_openai_message(message))
|
223
|
+
for message in agent_response
|
224
|
+
]
|
225
|
+
else:
|
226
|
+
return [ensure_dict(convert_maybe_object_to_openai_message(agent_response))]
|
227
|
+
|
228
|
+
|
229
|
+
def reverse_roles(
|
230
|
+
messages: list[ChatCompletionMessageParam],
|
231
|
+
) -> list[ChatCompletionMessageParam]:
|
232
|
+
"""
|
233
|
+
Reverses the roles of the messages in the list.
|
234
|
+
|
235
|
+
Args:
|
236
|
+
messages: The list of messages to reverse the roles of.
|
237
|
+
"""
|
238
|
+
|
239
|
+
for message in messages.copy():
|
240
|
+
# Can't reverse tool calls
|
241
|
+
if not safe_attr_or_key(message, "content") or safe_attr_or_key(
|
242
|
+
message, "tool_calls"
|
243
|
+
):
|
244
|
+
continue
|
245
|
+
|
246
|
+
if type(message) == dict:
|
247
|
+
if message["role"] == "user":
|
248
|
+
message["role"] = "assistant"
|
249
|
+
elif message["role"] == "assistant":
|
250
|
+
message["role"] = "user"
|
251
|
+
else:
|
252
|
+
if getattr(message, "role", None) == "user":
|
253
|
+
message.role = "assistant" # type: ignore
|
254
|
+
elif getattr(message, "role", None) == "assistant":
|
255
|
+
message.role = "user" # type: ignore
|
256
|
+
|
257
|
+
return messages
|
258
|
+
|
259
|
+
|
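
`reverse_roles` is the extracted version of the inline swap that previously lived in `testing_agent.py`; it mutates the list in place, skipping tool calls. For example:

    from scenario.utils import reverse_roles

    history = [
        {"role": "user", "content": "Hi, I need help with my order"},
        {"role": "assistant", "content": "Sure, what's the order number?"},
    ]
    print(reverse_roles(history))
    # [{'role': 'assistant', 'content': 'Hi, I need help with my order'},
    #  {'role': 'user', 'content': "Sure, what's the order number?"}]
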
+async def await_if_awaitable(value: T) -> T:
+    if isinstance(value, Awaitable):
+        return await value
+    else:
+        return value
langwatch_scenario-0.1.3.dist-info/RECORD
REMOVED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
-scenario/pytest_plugin.py,sha256=BuBbyKLa-t9AFVn9EETl7OvGSt__dFO7KnbZynfS1UM,5789
-scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
-scenario/scenario.py,sha256=tYn3Y1sK6_7pg7hFb_5w0TW6nun-za_4F8kqcnrXXU4,4077
-scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
-scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.1.3.dist-info/METADATA,sha256=7OIolGcZ3fkCXFmE6JHkckVCeJb1r3yYSYveJ6iE9zw,8801
-langwatch_scenario-0.1.3.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-langwatch_scenario-0.1.3.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.1.3.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.1.3.dist-info/RECORD,,