langwatch-scenario 0.2.0__py3-none-any.whl → 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
scenario/testing_agent.py CHANGED
@@ -5,25 +5,23 @@ TestingAgent module: defines the testing agent that interacts with the agent und
  import json
  import logging
  import re
- from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
- from pydantic import BaseModel
+ from typing import Optional, Type, cast

  from litellm import Choices, completion
  from litellm.files.main import ModelResponse

  from scenario.cache import scenario_cache
- from scenario.utils import safe_attr_or_key
+ from scenario.scenario_agent_adapter import ScenarioAgentAdapter
+ from scenario.utils import reverse_roles

- from .result import ScenarioResult
-
- if TYPE_CHECKING:
- from scenario.scenario import Scenario
+ from .error_messages import testing_agent_not_configured_error_message
+ from .types import AgentInput, AgentReturnTypes, ScenarioAgentRole, ScenarioResult


  logger = logging.getLogger("scenario")


- class TestingAgent(BaseModel):
+ class TestingAgent(ScenarioAgentAdapter):
  """
  The Testing Agent that interacts with the agent under test.

@@ -33,7 +31,9 @@ class TestingAgent(BaseModel):
  3. Determining when to end the test and return a result
  """

- model: str
+ roles = {ScenarioAgentRole.USER, ScenarioAgentRole.JUDGE}
+
+ model: str = ""
  api_key: Optional[str] = None
  temperature: float = 0.0
  max_tokens: Optional[int] = None
@@ -41,14 +41,36 @@ class TestingAgent(BaseModel):
  # To prevent pytest from thinking this is actually a test class
  __test__ = False

+ def __init__(self, input: AgentInput):
+ super().__init__(input)
+
+ if not self.model:
+ raise Exception(testing_agent_not_configured_error_message)
+
+ @classmethod
+ def with_config(
+ cls,
+ model: str,
+ api_key: Optional[str] = None,
+ temperature: float = 0.0,
+ max_tokens: Optional[int] = None,
+ ) -> Type["TestingAgent"]:
+ class TestingAgentWithConfig(cls):
+ def __init__(self, input: AgentInput):
+ self.model = model
+ self.api_key = api_key
+ self.temperature = temperature
+ self.max_tokens = max_tokens
+
+ super().__init__(input)
+
+ return TestingAgentWithConfig
+
  @scenario_cache(ignore=["scenario"])
- def generate_next_message(
+ async def call(
  self,
- scenario: "Scenario",
- conversation: List[Dict[str, Any]],
- first_message: bool = False,
- last_message: bool = False,
- ) -> Union[str, ScenarioResult]:
+ input: AgentInput,
+ ) -> AgentReturnTypes:
  """
  Generate the next message in the conversation based on history OR
  return a ScenarioResult if the test should conclude.
@@ -58,6 +80,8 @@ class TestingAgent(BaseModel):
  - A ScenarioResult (if the test should conclude)
  """

+ scenario = input.scenario_state.scenario
+
  messages = [
  {
  "role": "system",
@@ -94,10 +118,15 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
  """,
  },
  {"role": "assistant", "content": "Hello, how can I help you today?"},
- *conversation,
+ *input.messages,
  ]

- if last_message:
+ is_first_message = len(input.messages) == 0
+ is_last_message = (
+ input.scenario_state.current_turn == input.scenario_state.scenario.max_turns
+ )
+
+ if is_last_message:
  messages.append(
  {
  "role": "user",
@@ -115,23 +144,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  # User to assistant role reversal
  # LLM models are biased to always be the assistant not the user, so we need to do this reversal otherwise models like GPT 4.5 is
  # super confused, and Claude 3.7 even starts throwing exceptions.
- for message in messages:
- # Can't reverse tool calls
- if not safe_attr_or_key(message, "content") or safe_attr_or_key(
- message, "tool_calls"
- ):
- continue
-
- if type(message) == dict:
- if message["role"] == "user":
- message["role"] = "assistant"
- elif message["role"] == "assistant":
- message["role"] = "user"
- else:
- if getattr(message, "role", None) == "user":
- message.role = "assistant"
- elif getattr(message, "role", None) == "assistant":
- message.role = "user"
+ messages = reverse_roles(messages)

  # Define the tool
  criteria_names = [
@@ -182,6 +195,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  }
  ]

+ enforce_judgment = input.requested_role == ScenarioAgentRole.JUDGE
+ has_criteria = len(scenario.criteria) > 0
+
+ if enforce_judgment and not has_criteria:
+ return ScenarioResult(
+ success=False,
+ messages=[],
+ reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+ )
+
  response = cast(
  ModelResponse,
  completion(
@@ -189,8 +212,16 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  messages=messages,
  temperature=self.temperature,
  max_tokens=self.max_tokens,
- tools=tools if not first_message else None,
- tool_choice="required" if last_message else None,
+ tools=(
+ tools
+ if (not is_first_message or enforce_judgment) and has_criteria
+ else None
+ ),
+ tool_choice=(
+ "required"
+ if (is_last_message or enforce_judgment) and has_criteria
+ else None
+ ),
  ),
  )

@@ -221,27 +252,13 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  ]

  # Return the appropriate ScenarioResult based on the verdict
- if verdict == "success":
- return ScenarioResult.success_result(
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- )
- elif verdict == "failure":
- return ScenarioResult.failure_result(
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=failed_criteria,
- )
- else: # inconclusive
- return ScenarioResult(
- success=False,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=failed_criteria,
- )
+ return ScenarioResult(
+ success=verdict == "success",
+ messages=messages,
+ reasoning=reasoning,
+ passed_criteria=passed_criteria,
+ failed_criteria=failed_criteria,
+ )
  except json.JSONDecodeError:
  logger.error("Failed to parse tool call arguments")

@@ -255,7 +272,7 @@ if you don't have enough information to make a verdict, say inconclusive with ma
  )
  raise Exception(f"No response from LLM: {response.__repr__()}")

- return message_content
+ return {"role": "user", "content": message_content}
  else:
  raise Exception(
  f"Unexpected response format from LLM: {response.__repr__()}"
scenario/types.py ADDED
@@ -0,0 +1,96 @@
+ from enum import Enum
+ from pydantic import BaseModel, Field, SkipValidation
+ from typing import (
+ TYPE_CHECKING,
+ Annotated,
+ Any,
+ Awaitable,
+ Callable,
+ Coroutine,
+ Dict,
+ List,
+ Optional,
+ Union,
+ )
+
+ from openai.types.chat import ChatCompletionMessageParam, ChatCompletionUserMessageParam
+
+ # Prevent circular imports + Pydantic breaking
+ if TYPE_CHECKING:
+ from scenario.scenario_executor import ScenarioExecutor
+
+ ScenarioExecutorType = ScenarioExecutor
+ else:
+ ScenarioExecutorType = Any
+
+
+ class ScenarioAgentRole(Enum):
+ USER = "User"
+ AGENT = "Agent"
+ JUDGE = "Judge"
+
+
+ class AgentInput(BaseModel):
+ thread_id: str
+ # Prevent pydantic from validating/parsing the messages and causing issues: https://github.com/pydantic/pydantic/issues/9541
+ messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+ new_messages: Annotated[List[ChatCompletionMessageParam], SkipValidation]
+ context: Dict[str, Any]
+ requested_role: ScenarioAgentRole
+ scenario_state: ScenarioExecutorType = Field(exclude=True)
+
+ def last_new_user_message(self) -> ChatCompletionUserMessageParam:
+ user_messages = [m for m in self.new_messages if m["role"] == "user"]
+ if not user_messages:
+ raise ValueError(
+ "No new user messages found, did you mean to call the assistant twice? Perhaps change your adapter to use the full messages list instead."
+ )
+ return user_messages[-1]
+
+ def last_new_user_message_str(self) -> str:
+ content = self.last_new_user_message()["content"]
+ if type(content) != str:
+ raise ValueError(
+ f"Last user message is not a string: {content.__repr__()}. Please use the full messages list instead."
+ )
+ return content
+
+
+ class ScenarioResult(BaseModel):
+ """
+ Represents the results of a scenario test run.
+
+ Attributes:
+ success: Whether the scenario passed
+ conversation: The conversation history
+ reasoning: Reasoning for the result
+ passed_criteria: List of criteria that were met
+ failed_criteria: List of criteria that were not met
+ """
+
+ success: bool
+ messages: List[ChatCompletionMessageParam]
+ reasoning: Optional[str] = None
+ passed_criteria: List[str] = []
+ failed_criteria: List[str] = []
+ total_time: Optional[float] = None
+ agent_time: Optional[float] = None
+
+ def __repr__(self) -> str:
+ """Provide a concise representation for debugging."""
+ status = "PASSED" if self.success else "FAILED"
+ return f"ScenarioResult(success={self.success}, status={status}, reasoning='{self.reasoning or 'None'}')"
+
+
+ AgentReturnTypes = Union[
+ str, ChatCompletionMessageParam, List[ChatCompletionMessageParam], ScenarioResult
+ ]
+
+ # TODO: remove the optional ScenarioResult return type from here, use events instead
+ ScriptStep = Union[
+ Callable[["ScenarioExecutor"], None],
+ Callable[["ScenarioExecutor"], Optional[ScenarioResult]],
+ # Async as well
+ Callable[["ScenarioExecutor"], Awaitable[None]],
+ Callable[["ScenarioExecutor"], Awaitable[Optional[ScenarioResult]]],
+ ]
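
These types define the contract for agent adapters: an adapter receives an AgentInput and may return any of the AgentReturnTypes. A hedged sketch of what a custom adapter could look like; the class name MyAgentAdapter and the roles declaration are illustrative, only the AgentInput/AgentReturnTypes usage and the call signature are taken from this diff:

from scenario.scenario_agent_adapter import ScenarioAgentAdapter
from scenario.types import AgentInput, AgentReturnTypes, ScenarioAgentRole


class MyAgentAdapter(ScenarioAgentAdapter):
    # Assumption: adapters declare the roles they can play, as TestingAgent does above.
    roles = {ScenarioAgentRole.AGENT}

    async def call(self, input: AgentInput) -> AgentReturnTypes:
        # Any AgentReturnTypes value is accepted: a plain string, one OpenAI-format
        # message dict, a list of message dicts, or a full ScenarioResult.
        user_text = input.last_new_user_message_str()
        return {"role": "assistant", "content": f"You said: {user_text}"}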
scenario/utils.py CHANGED
@@ -1,6 +1,16 @@
  from contextlib import contextmanager
  import sys
- from typing import Optional, Union
+ from typing import (
+ Any,
+ Iterator,
+ List,
+ Literal,
+ Optional,
+ Union,
+ TypeVar,
+ Awaitable,
+ cast,
+ )
  from pydantic import BaseModel

  import json
@@ -14,12 +24,18 @@ from rich.console import Console
  from rich.text import Text
  from rich.errors import LiveError

+ from scenario.error_messages import message_return_error_message
+ from scenario.types import AgentReturnTypes, ScenarioResult
+
+ T = TypeVar("T")


  class SerializableAndPydanticEncoder(json.JSONEncoder):
  def default(self, o):
  if isinstance(o, BaseModel):
  return o.model_dump(exclude_unset=True)
+ if isinstance(o, Iterator):
+ return list(o)
  return super().default(o)


@@ -46,7 +62,9 @@ def title_case(string):
  return " ".join(word.capitalize() for word in string.split("_"))


- def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessageParam]):
+ def print_openai_messages(
+ scenario_name: str, messages: list[ChatCompletionMessageParam]
+ ):
  for msg in messages:
  role = safe_attr_or_key(msg, "role")
  content = safe_attr_or_key(msg, "content")
@@ -61,9 +79,12 @@ def print_openai_messages(scenario_name: str, messages: list[ChatCompletionMessa
  args = safe_attr_or_key(function, "arguments", "{}")
  args = _take_maybe_json_first_lines(args)
  print(
- scenario_name + termcolor.colored(f"ToolCall({name}):", "magenta"),
+ scenario_name
+ + termcolor.colored(f"ToolCall({name}):", "magenta"),
  f"\n\n{indent(args, ' ' * 4)}\n",
  )
+ elif role == "user":
+ print(scenario_name + termcolor.colored("User:", "green"), content)
  elif role == "tool":
  content = _take_maybe_json_first_lines(content or msg.__repr__())
  print(
@@ -91,9 +112,12 @@ def _take_maybe_json_first_lines(string, max_lines=5):

  console = Console()

+
  class TextFirstSpinner(Spinner):
  def __init__(self, name, text: str, color: str, **kwargs):
- super().__init__(name, "", style="bold white", **kwargs) # Initialize with empty text
+ super().__init__(
+ name, "", style="bold white", **kwargs
+ ) # Initialize with empty text
  self.text_before = text
  self.color = color

@@ -105,7 +129,9 @@ class TextFirstSpinner(Spinner):

  @contextmanager
- def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None):
+ def show_spinner(
+ text: str, color: str = "white", enabled: Optional[Union[bool, int]] = None
+ ):
  if not enabled:
  yield
  else:
@@ -119,3 +145,120 @@ def show_spinner(text: str, color: str = "white", enabled: Optional[Union[bool,

  # Cursor up one line
  sys.stdout.write("\033[F")
+ # Erase the line
+ sys.stdout.write("\033[2K")
+
+
+ def check_valid_return_type(return_value: Any, class_name: str) -> None:
+ def _is_valid_openai_message(message: Any) -> bool:
+ return (isinstance(message, dict) and "role" in message) or (
+ isinstance(message, BaseModel) and hasattr(message, "role")
+ )
+
+ if (
+ isinstance(return_value, str)
+ or _is_valid_openai_message(return_value)
+ or (
+ isinstance(return_value, list)
+ and all(_is_valid_openai_message(message) for message in return_value)
+ )
+ or isinstance(return_value, ScenarioResult)
+ ):
+ try:
+ json.dumps(return_value, cls=SerializableAndPydanticEncoder)
+ except:
+ raise ValueError(
+ message_return_error_message(got=return_value, class_name=class_name)
+ )
+
+ return
+
+ raise ValueError(
+ message_return_error_message(got=return_value, class_name=class_name)
+ )
+
+
+ def convert_agent_return_types_to_openai_messages(
+ agent_response: AgentReturnTypes, role: Literal["user", "assistant"]
+ ) -> List[ChatCompletionMessageParam]:
+ if isinstance(agent_response, ScenarioResult):
+ raise ValueError(
+ "Unexpectedly tried to convert a ScenarioResult to openai messages",
+ agent_response.__repr__(),
+ )
+
+ def convert_maybe_object_to_openai_message(
+ obj: Any,
+ ) -> ChatCompletionMessageParam:
+ if isinstance(obj, dict):
+ return cast(ChatCompletionMessageParam, obj)
+ elif isinstance(obj, BaseModel):
+ return cast(
+ ChatCompletionMessageParam,
+ obj.model_dump(
+ exclude_unset=True,
+ exclude_none=True,
+ exclude_defaults=True,
+ ),
+ )
+ else:
+ raise ValueError(f"Unexpected agent response type: {type(obj).__name__}")
+
+ def ensure_dict(
+ obj: T,
+ ) -> T:
+ return json.loads(json.dumps(obj, cls=SerializableAndPydanticEncoder))
+
+ if isinstance(agent_response, str):
+ return [
+ (
+ {"role": "user", "content": agent_response}
+ if role == "user"
+ else {"role": "assistant", "content": agent_response}
+ )
+ ]
+ elif isinstance(agent_response, list):
+ return [
+ ensure_dict(convert_maybe_object_to_openai_message(message))
+ for message in agent_response
+ ]
+ else:
+ return [ensure_dict(convert_maybe_object_to_openai_message(agent_response))]
+
+
+ def reverse_roles(
+ messages: list[ChatCompletionMessageParam],
+ ) -> list[ChatCompletionMessageParam]:
+ """
+ Reverses the roles of the messages in the list.
+
+ Args:
+ messages: The list of messages to reverse the roles of.
+ """
+
+ for message in messages.copy():
+ # Can't reverse tool calls
+ if not safe_attr_or_key(message, "content") or safe_attr_or_key(
+ message, "tool_calls"
+ ):
+ continue
+
+ if type(message) == dict:
+ if message["role"] == "user":
+ message["role"] = "assistant"
+ elif message["role"] == "assistant":
+ message["role"] = "user"
+ else:
+ if getattr(message, "role", None) == "user":
+ message.role = "assistant" # type: ignore
+ elif getattr(message, "role", None) == "assistant":
+ message.role = "user" # type: ignore
+
+ return messages
+
+
+ async def await_if_awaitable(value: T) -> T:
+ if isinstance(value, Awaitable):
+ return await value
+ else:
+ return value
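
For reference, a quick sketch of how the two new helpers used by TestingAgent behave; the message contents are made up and the example assumes the package is installed:

from scenario.utils import convert_agent_return_types_to_openai_messages, reverse_roles

# A bare string is normalized into a single OpenAI-format message for the given role.
convert_agent_return_types_to_openai_messages("hi there", role="assistant")
# -> [{'role': 'assistant', 'content': 'hi there'}]

# reverse_roles swaps user/assistant (tool-call messages are skipped); this is what
# TestingAgent.call now uses instead of its old inline loop.
reverse_roles([{"role": "user", "content": "hello"}])
# -> [{'role': 'assistant', 'content': 'hello'}]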
langwatch_scenario-0.2.0.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
- scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
- scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
- scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
- scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
- scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
- scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
- scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
- scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
- scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
- scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
- langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
- langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
- langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
- langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
- langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/result.py DELETED
@@ -1,74 +0,0 @@
- """
- Result module: defines the class for scenario test results.
- """
-
- from dataclasses import dataclass, field
- from typing import List, Dict, Optional
-
-
- @dataclass
- class ScenarioResult:
- """
- Represents the results of a scenario test run.
-
- Attributes:
- success: Whether the scenario passed
- conversation: The conversation history
- reasoning: Reasoning for the result
- passed_criteria: List of criteria that were met
- failed_criteria: List of criteria that were not met
- """
-
- success: bool
- conversation: List[Dict[str, str]]
- reasoning: Optional[str] = None
- passed_criteria: List[str] = field(default_factory=list)
- failed_criteria: List[str] = field(default_factory=list)
- total_time: Optional[float] = None
- agent_time: Optional[float] = None
-
- def __post_init__(self) -> None:
- """Validate the result after initialization."""
- if not self.success and not self.reasoning:
- raise ValueError("Failed scenarios must have a reasoning")
-
- @classmethod
- def success_result(
- cls,
- conversation: List[Dict[str, str]],
- reasoning: Optional[str],
- passed_criteria: List[str],
- total_time: Optional[float] = None,
- agent_time: Optional[float] = None,
- ) -> "ScenarioResult":
- """Create a successful result."""
- return cls(
- success=True,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria,
- failed_criteria=[],
- total_time=total_time,
- agent_time=agent_time,
- )
-
- @classmethod
- def failure_result(
- cls,
- conversation: List[Dict[str, str]],
- reasoning: str,
- passed_criteria: Optional[List[str]] = None,
- failed_criteria: Optional[List[str]] = None,
- total_time: Optional[float] = None,
- agent_time: Optional[float] = None,
- ) -> "ScenarioResult":
- """Create a failed result."""
- return cls(
- success=False,
- conversation=conversation,
- reasoning=reasoning,
- passed_criteria=passed_criteria if passed_criteria is not None else [],
- failed_criteria=failed_criteria if failed_criteria is not None else [],
- total_time=total_time,
- agent_time=agent_time,
- )
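
Migration note: the success_result/failure_result helpers have no direct replacement; callers now construct the pydantic ScenarioResult from scenario/types.py directly, passing OpenAI-format messages instead of the old conversation list of plain dicts, as TestingAgent.call does above. An illustrative sketch, with made-up field values:

from scenario.types import ScenarioResult

result = ScenarioResult(
    success=False,
    messages=[{"role": "assistant", "content": "I could not find that order."}],
    reasoning="Agent never asked for the order number",
    failed_criteria=["Agent asks for the order number"],
)
print(repr(result))
# ScenarioResult(success=False, status=FAILED, reasoning='Agent never asked for the order number')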