langwatch-scenario 0.4.0__py3-none-any.whl → 0.7.1__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package, as published to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (238)
  1. {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/METADATA +210 -86
  2. langwatch_scenario-0.7.1.dist-info/RECORD +237 -0
  3. scenario/__init__.py +12 -118
  4. scenario/_events/__init__.py +64 -0
  5. scenario/_events/event_bus.py +185 -0
  6. scenario/_events/event_reporter.py +83 -0
  7. scenario/_events/events.py +162 -0
  8. scenario/_events/messages.py +58 -0
  9. scenario/_events/utils.py +97 -0
  10. scenario/_generated/langwatch_api_client/README.md +139 -0
  11. scenario/_generated/langwatch_api_client/lang_watch_api_client/__init__.py +13 -0
  12. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/__init__.py +1 -0
  13. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/__init__.py +1 -0
  14. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_annotations_id.py +155 -0
  15. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_prompts_by_id.py +218 -0
  16. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/delete_api_scenario_events.py +183 -0
  17. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations.py +136 -0
  18. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations_id.py +155 -0
  19. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_annotations_trace_id.py +160 -0
  20. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_dataset_by_slug_or_id.py +229 -0
  21. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts.py +188 -0
  22. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts_by_id.py +218 -0
  23. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_prompts_by_id_versions.py +218 -0
  24. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/get_api_trace_id.py +155 -0
  25. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/patch_api_annotations_id.py +178 -0
  26. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_annotations_trace_id.py +178 -0
  27. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_dataset_by_slug_entries.py +108 -0
  28. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_prompts.py +187 -0
  29. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_prompts_by_id_versions.py +241 -0
  30. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_scenario_events.py +229 -0
  31. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_trace_id_share.py +155 -0
  32. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/post_api_trace_id_unshare.py +155 -0
  33. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/default/put_api_prompts_by_id.py +241 -0
  34. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/traces/__init__.py +1 -0
  35. scenario/_generated/langwatch_api_client/lang_watch_api_client/api/traces/post_api_trace_search.py +168 -0
  36. scenario/_generated/langwatch_api_client/lang_watch_api_client/client.py +268 -0
  37. scenario/_generated/langwatch_api_client/lang_watch_api_client/errors.py +16 -0
  38. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/__init__.py +455 -0
  39. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/annotation.py +131 -0
  40. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/dataset_post_entries.py +74 -0
  41. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/dataset_post_entries_entries_item.py +44 -0
  42. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_annotations_id_response_200.py +68 -0
  43. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_200.py +59 -0
  44. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_400.py +61 -0
  45. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_400_error.py +8 -0
  46. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_401.py +61 -0
  47. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_401_error.py +8 -0
  48. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_404.py +59 -0
  49. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_prompts_by_id_response_500.py +59 -0
  50. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_200.py +81 -0
  51. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_400.py +59 -0
  52. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_401.py +59 -0
  53. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/delete_api_scenario_events_response_500.py +59 -0
  54. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/error.py +67 -0
  55. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/evaluation.py +164 -0
  56. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/evaluation_timestamps.py +68 -0
  57. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200.py +75 -0
  58. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200_data_item.py +109 -0
  59. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_200_data_item_entry.py +44 -0
  60. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_400.py +78 -0
  61. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_401.py +78 -0
  62. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_404.py +78 -0
  63. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_422.py +67 -0
  64. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_dataset_by_slug_or_id_response_500.py +78 -0
  65. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200.py +172 -0
  66. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_messages_item.py +69 -0
  67. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_messages_item_role.py +10 -0
  68. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0.py +81 -0
  69. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_json_schema.py +77 -0
  70. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_json_schema_schema.py +44 -0
  71. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_200_response_format_type_0_type.py +8 -0
  72. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_400.py +61 -0
  73. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_400_error.py +8 -0
  74. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_401.py +61 -0
  75. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_401_error.py +8 -0
  76. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_404.py +59 -0
  77. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_response_500.py +59 -0
  78. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200.py +155 -0
  79. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data.py +204 -0
  80. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations.py +101 -0
  81. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item.py +79 -0
  82. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item_type.py +18 -0
  83. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_demonstrations_rows_item.py +59 -0
  84. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_inputs_item.py +71 -0
  85. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_inputs_item_type.py +16 -0
  86. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_messages_item.py +71 -0
  87. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_messages_item_role.py +10 -0
  88. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item.py +98 -0
  89. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item_json_schema.py +59 -0
  90. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_outputs_item_type.py +11 -0
  91. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_200_config_data_prompting_technique.py +59 -0
  92. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_400.py +61 -0
  93. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_400_error.py +8 -0
  94. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_401.py +61 -0
  95. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_401_error.py +8 -0
  96. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_404.py +59 -0
  97. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_by_id_versions_response_500.py +59 -0
  98. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item.py +172 -0
  99. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_messages_item.py +69 -0
  100. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_messages_item_role.py +10 -0
  101. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0.py +81 -0
  102. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_json_schema.py +77 -0
  103. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_json_schema_schema.py +44 -0
  104. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_200_item_response_format_type_0_type.py +8 -0
  105. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_400.py +61 -0
  106. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_400_error.py +8 -0
  107. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_401.py +61 -0
  108. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_401_error.py +8 -0
  109. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_prompts_response_500.py +59 -0
  110. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200.py +249 -0
  111. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_error_type_0.py +79 -0
  112. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item.py +152 -0
  113. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item_error.py +79 -0
  114. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_evaluations_item_timestamps.py +68 -0
  115. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_input.py +59 -0
  116. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_metadata.py +68 -0
  117. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_metrics.py +95 -0
  118. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_output.py +59 -0
  119. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item.py +271 -0
  120. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_error_type_0.py +79 -0
  121. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_input.py +90 -0
  122. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_input_value_item.py +69 -0
  123. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_metrics.py +77 -0
  124. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_output.py +89 -0
  125. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_output_value_item.py +68 -0
  126. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_params.py +68 -0
  127. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_spans_item_timestamps.py +95 -0
  128. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/get_api_trace_id_response_200_timestamps.py +77 -0
  129. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/input_.py +68 -0
  130. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/metadata.py +68 -0
  131. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/metrics.py +115 -0
  132. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/output.py +59 -0
  133. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/pagination.py +68 -0
  134. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/patch_api_annotations_id_body.py +77 -0
  135. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/patch_api_annotations_id_response_200.py +68 -0
  136. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_annotations_trace_id_body.py +77 -0
  137. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_body.py +59 -0
  138. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body.py +147 -0
  139. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data.py +207 -0
  140. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations.py +106 -0
  141. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_columns_item.py +79 -0
  142. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_columns_item_type.py +18 -0
  143. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_demonstrations_rows_item.py +59 -0
  144. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_inputs_item.py +71 -0
  145. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_inputs_item_type.py +16 -0
  146. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_messages_item.py +71 -0
  147. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_messages_item_role.py +10 -0
  148. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item.py +98 -0
  149. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item_json_schema.py +59 -0
  150. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_outputs_item_type.py +11 -0
  151. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_body_config_data_prompting_technique.py +59 -0
  152. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200.py +155 -0
  153. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data.py +206 -0
  154. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations.py +101 -0
  155. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item.py +79 -0
  156. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_columns_item_type.py +18 -0
  157. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_demonstrations_rows_item.py +59 -0
  158. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_inputs_item.py +71 -0
  159. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_inputs_item_type.py +16 -0
  160. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_messages_item.py +71 -0
  161. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_messages_item_role.py +10 -0
  162. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item.py +98 -0
  163. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item_json_schema.py +59 -0
  164. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_outputs_item_type.py +11 -0
  165. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_200_config_data_prompting_technique.py +59 -0
  166. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_400.py +61 -0
  167. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_400_error.py +8 -0
  168. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_401.py +61 -0
  169. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_401_error.py +8 -0
  170. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_404.py +59 -0
  171. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_by_id_versions_response_500.py +59 -0
  172. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200.py +172 -0
  173. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_messages_item.py +69 -0
  174. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_messages_item_role.py +10 -0
  175. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0.py +81 -0
  176. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_json_schema.py +77 -0
  177. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_json_schema_schema.py +44 -0
  178. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_200_response_format_type_0_type.py +8 -0
  179. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_400.py +61 -0
  180. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_400_error.py +8 -0
  181. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_401.py +61 -0
  182. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_401_error.py +8 -0
  183. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_prompts_response_500.py +59 -0
  184. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_0.py +127 -0
  185. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_0_metadata.py +68 -0
  186. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1.py +164 -0
  187. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_results_type_0.py +98 -0
  188. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_results_type_0_verdict.py +10 -0
  189. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_1_status.py +13 -0
  190. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2.py +245 -0
  191. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_0.py +88 -0
  192. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_1.py +88 -0
  193. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2.py +120 -0
  194. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2_tool_calls_item.py +87 -0
  195. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_2_tool_calls_item_function.py +67 -0
  196. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_3.py +88 -0
  197. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_body_type_2_messages_item_type_4.py +85 -0
  198. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_201.py +81 -0
  199. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_400.py +59 -0
  200. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_401.py +59 -0
  201. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_scenario_events_response_500.py +59 -0
  202. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_trace_id_share_response_200.py +59 -0
  203. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/post_api_trace_id_unshare_response_200.py +59 -0
  204. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_body.py +59 -0
  205. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_200.py +75 -0
  206. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_400.py +61 -0
  207. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_400_error.py +8 -0
  208. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_401.py +61 -0
  209. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_401_error.py +8 -0
  210. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_404.py +59 -0
  211. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/put_api_prompts_by_id_response_500.py +59 -0
  212. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_request.py +133 -0
  213. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_request_filters.py +51 -0
  214. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/search_response.py +93 -0
  215. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/timestamps.py +77 -0
  216. scenario/_generated/langwatch_api_client/lang_watch_api_client/models/trace.py +225 -0
  217. scenario/_generated/langwatch_api_client/lang_watch_api_client/py.typed +1 -0
  218. scenario/_generated/langwatch_api_client/lang_watch_api_client/types.py +46 -0
  219. scenario/_generated/langwatch_api_client/pyproject.toml +27 -0
  220. scenario/_utils/__init__.py +32 -0
  221. scenario/_utils/ids.py +58 -0
  222. scenario/_utils/message_conversion.py +103 -0
  223. scenario/{utils.py → _utils/utils.py} +21 -110
  224. scenario/agent_adapter.py +8 -4
  225. scenario/cache.py +4 -3
  226. scenario/config.py +7 -5
  227. scenario/judge_agent.py +13 -29
  228. scenario/pytest_plugin.py +6 -51
  229. scenario/scenario_executor.py +372 -215
  230. scenario/scenario_state.py +6 -6
  231. scenario/script.py +9 -9
  232. scenario/types.py +15 -8
  233. scenario/user_simulator_agent.py +4 -11
  234. langwatch_scenario-0.4.0.dist-info/RECORD +0 -18
  235. {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/WHEEL +0 -0
  236. {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/entry_points.txt +0 -0
  237. {langwatch_scenario-0.4.0.dist-info → langwatch_scenario-0.7.1.dist-info}/top_level.txt +0 -0
  238. /scenario/{error_messages.py → _error_messages.py} +0 -0
@@ -11,13 +11,10 @@ import sys
  from typing import (
  Any,
  Iterator,
- List,
- Literal,
  Optional,
  Union,
  TypeVar,
  Awaitable,
- cast,
  )
  from pydantic import BaseModel
  import copy
@@ -33,8 +30,8 @@ from rich.console import Console
  from rich.text import Text
  from rich.errors import LiveError

- from scenario.error_messages import message_return_error_message
- from scenario.types import AgentReturnTypes, ScenarioResult
+ from scenario._error_messages import message_return_error_message
+ from scenario.types import ScenarioResult

  T = TypeVar("T")

@@ -48,7 +45,7 @@ class SerializableAndPydanticEncoder(json.JSONEncoder):
  Used for caching and logging scenarios that contain complex objects.

  Example:
- ```python
+ ```
  data = {
  "model": SomeBaseModel(field="value"),
  "iterator": iter([1, 2, 3])
@@ -56,7 +53,7 @@ class SerializableAndPydanticEncoder(json.JSONEncoder):
  json.dumps(data, cls=SerializableAndPydanticEncoder)
  ```
  """
- def default(self, o):
+ def default(self, o: Any) -> Any:
  if isinstance(o, BaseModel):
  return o.model_dump(exclude_unset=True)
  if isinstance(o, Iterator):
@@ -73,26 +70,26 @@ class SerializableWithStringFallback(SerializableAndPydanticEncoder):
  that logging and caching operations never fail due to serialization issues.

  Example:
- ```python
+ ```
  # This will work even with complex non-serializable objects
  data = {"function": lambda x: x, "complex_object": SomeComplexClass()}
  json.dumps(data, cls=SerializableWithStringFallback)
  # Result: {"function": "<function <lambda> at 0x...>", "complex_object": "..."}
  ```
  """
- def default(self, o):
+ def default(self, o: Any) -> Any:
  try:
  return super().default(o)
  except:
  return str(o)


- def safe_list_at(list, index, default=None):
+ def safe_list_at(list_obj: list, index: int, default: Any = None) -> Any:
  """
  Safely get an item from a list by index with a default fallback.

  Args:
- list: The list to access
+ list_obj: The list to access
  index: The index to retrieve
  default: Value to return if index is out of bounds

@@ -100,7 +97,7 @@ def safe_list_at(list, index, default=None):
  The item at the index, or the default value if index is invalid

  Example:
- ```python
+ ```
  items = ["a", "b", "c"]
  print(safe_list_at(items, 1)) # "b"
  print(safe_list_at(items, 10)) # None
@@ -108,12 +105,12 @@ def safe_list_at(list, index, default=None):
  ```
  """
  try:
- return list[index]
+ return list_obj[index]
  except:
  return default


- def safe_attr_or_key(obj, attr_or_key, default=None):
+ def safe_attr_or_key(obj: Any, attr_or_key: str, default: Any = None) -> Any:
  """
  Safely get an attribute or dictionary key from an object.

@@ -129,7 +126,7 @@ def safe_attr_or_key(obj, attr_or_key, default=None):
  The attribute/key value, or the default if not found

  Example:
- ```python
+ ```
  class MyClass:
  attr = "value"

@@ -141,10 +138,10 @@ def safe_attr_or_key(obj, attr_or_key, default=None):
  print(safe_attr_or_key(obj, "missing")) # None
  ```
  """
- return getattr(obj, attr_or_key, obj.get(attr_or_key))
+ return getattr(obj, attr_or_key, getattr(obj, 'get', lambda x, default=None: default)(attr_or_key, default))


- def title_case(string):
+ def title_case(string: str) -> str:
  """
  Convert snake_case string to Title Case.

@@ -155,7 +152,7 @@
  String converted to Title Case

  Example:
- ```python
+ ```
  print(title_case("user_simulator_agent")) # "User Simulator Agent"
  print(title_case("api_key")) # "Api Key"
  ```
@@ -178,7 +175,7 @@ def print_openai_messages(
  messages: List of OpenAI-compatible messages to print

  Example:
- ```python
+ ```
  messages = [
  {"role": "user", "content": "Hello"},
  {"role": "assistant", "content": "Hi there!"},
@@ -226,7 +223,7 @@ def print_openai_messages(
  )


- def _take_maybe_json_first_lines(string, max_lines=5):
+ def _take_maybe_json_first_lines(string: str, max_lines: int = 5) -> str:
  """
  Truncate string content and format JSON if possible.

@@ -268,14 +265,14 @@ class TextFirstSpinner(Spinner):
  color: Color for the descriptive text
  **kwargs: Additional arguments passed to the base Spinner class
  """
- def __init__(self, name, text: str, color: str, **kwargs):
+ def __init__(self, name: str, text: str, color: str, **kwargs: Any) -> None:
  super().__init__(
  name, "", style="bold white", **kwargs
  ) # Initialize with empty text
  self.text_before = text
  self.color = color

- def render(self, time):
+ def render(self, time: float) -> Text:
  # Get the original spinner frame
  spinner_frame = super().render(time)
  # Create a composite with text first, then spinner
@@ -299,7 +296,7 @@ def show_spinner(
  enabled: Whether to show the spinner (respects verbose settings)

  Example:
- ```python
+ ```
  with show_spinner("Calling agent...", color="blue", enabled=True):
  response = await agent.call(input_data)

@@ -345,7 +342,7 @@ def check_valid_return_type(return_value: Any, class_name: str) -> None:
  ValueError: If the return value is not in a supported format

  Example:
- ```python
+ ```
  # Valid return values
  check_valid_return_type("Hello world", "MyAgent") # OK
  check_valid_return_type({"role": "assistant", "content": "Hi"}, "MyAgent") # OK
@@ -383,92 +380,6 @@ def check_valid_return_type(return_value: Any, class_name: str) -> None:
  )


- def convert_agent_return_types_to_openai_messages(
- agent_response: AgentReturnTypes, role: Literal["user", "assistant"]
- ) -> List[ChatCompletionMessageParam]:
- """
- Convert various agent return types to standardized OpenAI message format.
-
- This function normalizes different return types from agent adapters into
- a consistent list of OpenAI-compatible messages that can be used throughout
- the scenario execution pipeline.
-
- Args:
- agent_response: Response from an agent adapter call
- role: The role to assign to string responses ("user" or "assistant")
-
- Returns:
- List of OpenAI-compatible messages
-
- Raises:
- ValueError: If agent_response is a ScenarioResult (which should be handled separately)
-
- Example:
- ```python
- # String response
- messages = convert_agent_return_types_to_openai_messages("Hello", "assistant")
- # Result: [{"role": "assistant", "content": "Hello"}]
-
- # Dict response
- response = {"role": "assistant", "content": "Hi", "tool_calls": [...]}
- messages = convert_agent_return_types_to_openai_messages(response, "assistant")
- # Result: [{"role": "assistant", "content": "Hi", "tool_calls": [...]}]
-
- # List response
- responses = [
- {"role": "assistant", "content": "Thinking..."},
- {"role": "assistant", "content": "Here's the answer"}
- ]
- messages = convert_agent_return_types_to_openai_messages(responses, "assistant")
- # Result: Same list, validated and normalized
- ```
- """
- if isinstance(agent_response, ScenarioResult):
- raise ValueError(
- "Unexpectedly tried to convert a ScenarioResult to openai messages",
- agent_response.__repr__(),
- )
-
- def convert_maybe_object_to_openai_message(
- obj: Any,
- ) -> ChatCompletionMessageParam:
- if isinstance(obj, dict):
- return cast(ChatCompletionMessageParam, obj)
- elif isinstance(obj, BaseModel):
- return cast(
- ChatCompletionMessageParam,
- obj.model_dump(
- exclude_unset=True,
- exclude_none=True,
- exclude_defaults=True,
- warnings=False,
- ),
- )
- else:
- raise ValueError(f"Unexpected agent response type: {type(obj).__name__}")
-
- def ensure_dict(
- obj: T,
- ) -> T:
- return json.loads(json.dumps(obj, cls=SerializableAndPydanticEncoder))
-
- if isinstance(agent_response, str):
- return [
- (
- {"role": "user", "content": agent_response}
- if role == "user"
- else {"role": "assistant", "content": agent_response}
- )
- ]
- elif isinstance(agent_response, list):
- return [
- ensure_dict(convert_maybe_object_to_openai_message(message))
- for message in agent_response
- ]
- else:
- return [ensure_dict(convert_maybe_object_to_openai_message(agent_response))]
-
-
  def reverse_roles(
  messages: list[ChatCompletionMessageParam],
  ) -> list[ChatCompletionMessageParam]:
scenario/agent_adapter.py CHANGED
@@ -26,9 +26,9 @@ class AgentAdapter(ABC):
  role: The role this agent plays in scenarios (USER, AGENT, or JUDGE)

  Example:
- ```python
+ ```
  import scenario
- from my_agent_library import MyCustomAgent
+ from my_agent import MyCustomAgent

  class MyAgentAdapter(scenario.AgentAdapter):
  def __init__(self):
@@ -66,6 +66,7 @@ class AgentAdapter(ABC):
  - For stateful agents, use input.thread_id to maintain conversation context
  - For stateless agents, use input.messages for the full conversation history
  """
+
  role: ClassVar[AgentRole] = AgentRole.AGENT

  @abstractmethod
@@ -82,13 +83,17 @@

  Returns:
  AgentReturnTypes: The agent's response, which can be:
+
  - str: Simple text response
+
  - ChatCompletionMessageParam: Single OpenAI-format message
+
  - List[ChatCompletionMessageParam]: Multiple messages for complex responses
+
  - ScenarioResult: Direct test result (typically only used by judge agents)

  Example:
- ```python
+ ```
  async def call(self, input: AgentInput) -> AgentReturnTypes:
  # Simple string response
  user_msg = input.last_new_user_message_str()
@@ -98,7 +103,6 @@ class AgentAdapter(ABC):
  return {
  "role": "assistant",
  "content": "Let me help you with that...",
- "tool_calls": [...] # If your agent uses tools
  }

  # Or multiple messages for complex interactions
scenario/cache.py CHANGED
@@ -18,7 +18,7 @@ import json

  import wrapt
  from scenario.types import AgentInput
- from scenario.utils import SerializableWithStringFallback
+ from scenario._utils.utils import SerializableWithStringFallback

  if TYPE_CHECKING:
  from scenario.scenario_executor import ScenarioExecutor
@@ -39,7 +39,7 @@ def get_cache() -> Memory:
  Memory instance configured with the appropriate cache directory

  Example:
- ```python
+ ```
  # Default cache location: ~/.scenario/cache
  cache = get_cache()

@@ -75,7 +75,7 @@ def scenario_cache(ignore=[]):
  Decorator function that can be applied to any function or method

  Example:
- ```python
+ ```
  import scenario

  class MyAgent:
@@ -105,6 +105,7 @@ def scenario_cache(ignore=[]):
  - AgentInput objects are specially handled to exclude thread_id from caching
  - Both sync and async functions are supported
  """
+
  @wrapt.decorator
  def wrapper(wrapped: Callable, instance=None, args=[], kwargs={}):
  scenario: "ScenarioExecutor" = context_scenario.get()
scenario/config.py CHANGED
@@ -9,6 +9,7 @@ and debugging options.
  from typing import Optional, Union, ClassVar
  from pydantic import BaseModel

+
  class ModelConfig(BaseModel):
  """
  Configuration for LLM model settings.
@@ -23,7 +24,7 @@ class ModelConfig(BaseModel):
  max_tokens: Maximum number of tokens to generate in responses

  Example:
- ```python
+ ```
  model_config = ModelConfig(
  model="openai/gpt-4.1-mini",
  api_key="your-api-key",
@@ -32,6 +33,7 @@ class ModelConfig(BaseModel):
  )
  ```
  """
+
  model: str
  api_key: Optional[str] = None
  temperature: float = 0.0
@@ -54,7 +56,7 @@ class ScenarioConfig(BaseModel):
  debug: Whether to enable debug mode with step-by-step interaction

  Example:
- ```python
+ ```
  # Configure globally for all scenarios
  scenario.configure(
  default_model="openai/gpt-4.1-mini",
@@ -106,7 +108,7 @@ class ScenarioConfig(BaseModel):
  debug: Enable debug mode for step-by-step execution with user intervention

  Example:
- ```python
+ ```
  import scenario

  # Set up default configuration
@@ -151,7 +153,7 @@ class ScenarioConfig(BaseModel):
  A new ScenarioConfig instance with merged values

  Example:
- ```python
+ ```
  base_config = ScenarioConfig(max_turns=10, verbose=True)
  override_config = ScenarioConfig(max_turns=20)

@@ -174,7 +176,7 @@ class ScenarioConfig(BaseModel):
  Dictionary of configuration key-value pairs, excluding None values

  Example:
- ```python
+ ```
  config = ScenarioConfig(max_turns=15, verbose=True)
  items = config.items()
  # Result: {"max_turns": 15, "verbose": True}
scenario/judge_agent.py CHANGED
@@ -19,7 +19,7 @@ from scenario.cache import scenario_cache
  from scenario.agent_adapter import AgentAdapter
  from scenario.config import ModelConfig, ScenarioConfig

- from .error_messages import agent_not_configured_error_message
+ from ._error_messages import agent_not_configured_error_message
  from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult


@@ -48,7 +48,7 @@ class JudgeAgent(AgentAdapter):
  system_prompt: Custom system prompt to override default judge behavior

  Example:
- ```python
+ ```
  import scenario

  # Basic judge agent with criteria
@@ -93,6 +93,7 @@ class JudgeAgent(AgentAdapter):
  - Provide detailed reasoning for their decisions
  - Support both positive criteria (things that should happen) and negative criteria (things that shouldn't)
  """
+
  role = AgentRole.JUDGE

  model: str
@@ -133,14 +134,12 @@ class JudgeAgent(AgentAdapter):
  Exception: If no model is configured either in parameters or global config

  Example:
- ```python
+ ```
  # Customer service judge
  cs_judge = JudgeAgent(
  criteria=[
- "Agent is polite and professional",
- "Agent addresses the customer's specific concern",
- "Agent offers appropriate solutions or next steps",
- "Agent does not make promises the company cannot keep"
+ "Agent replies with the refund policy",
+ "Agent offers next steps for the customer",
  ],
  temperature=0.1
  )
@@ -148,9 +147,8 @@ class JudgeAgent(AgentAdapter):
  # Technical accuracy judge
  tech_judge = JudgeAgent(
  criteria=[
- "Code examples compile without errors",
- "Security vulnerabilities are not introduced",
- "Best practices are recommended"
+ "Agent adds a code review pointing out the code compilation errors",
+ "Agent adds a code review about the missing security headers"
  ],
  system_prompt="You are a senior software engineer reviewing code for production use."
  )
@@ -210,24 +208,6 @@ class JudgeAgent(AgentAdapter):
  Exception: If the judge cannot make a valid decision or if there's an
  error in the evaluation process

- Example:
- The judge evaluates conversations like this:
-
- ```
- Conversation so far:
- User: "I need help with authentication"
- Agent: "I can help! What authentication method are you using?"
- User: "JWT tokens"
- Agent: "Here's how to implement JWT securely: [detailed code example]"
-
- Judge evaluation:
- - ✓ Agent provides helpful responses
- - ✓ Agent asks relevant follow-up questions
- - ✓ Security best practices are mentioned
-
- Decision: CONTINUE (all criteria being met so far)
- ```
-
  Note:
  - Returns empty list [] to continue the scenario
  - Returns ScenarioResult to end with success/failure
@@ -238,6 +218,10 @@ class JudgeAgent(AgentAdapter):

  scenario = input.scenario_state

+ criteria_str = "\n".join(
+ [f"{idx + 1}. {criterion}" for idx, criterion in enumerate(self.criteria)]
+ )
+
  messages = [
  {
  "role": "system",
@@ -257,7 +241,7 @@ If you do have enough information, use the finish_test tool to determine if all
  </scenario>

  <criteria>
- {"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(self.criteria)])}
+ {criteria_str}
  </criteria>

  <rules>
scenario/pytest_plugin.py CHANGED
@@ -8,7 +8,7 @@ pytest-based testing workflows.
  """

  import pytest
- from typing import TypedDict, List, Tuple
+ from typing import TypedDict
  import functools
  from termcolor import colored

@@ -16,7 +16,6 @@ from scenario.config import ScenarioConfig
  from scenario.types import ScenarioResult

  from .scenario_executor import ScenarioExecutor
- import scenario


  class ScenarioReporterResults(TypedDict):
@@ -46,23 +45,6 @@ class ScenarioReporter:

  Attributes:
  results: List of all scenario test results collected during the session
-
- Example:
- The reporter is used automatically, but you can access it in tests:
-
- ```python
- def test_my_scenarios(scenario_reporter):
- # Run your scenarios
- result1 = await scenario.run(...)
- result2 = await scenario.run(...)
-
- # Check collected results
- assert len(scenario_reporter.results) == 2
-
- # Get summary statistics
- summary = scenario_reporter.get_summary()
- print(f"Success rate: {summary['success_rate']}%")
- ```
  """

  def __init__(self):
@@ -80,21 +62,6 @@ class ScenarioReporter:
  Args:
  scenario: The ScenarioExecutor instance that ran the test
  result: The ScenarioResult containing test outcome and details
-
- Example:
- ```python
- # This happens automatically when you run scenarios
- result = await scenario.run(
- name="my test",
- description="Test description",
- agents=[
- my_agent,
- scenario.UserSimulatorAgent(),
- scenario.JudgeAgent(criteria=["Agent provides helpful response"])
- ]
- )
- # Result is automatically added to the global reporter
- ```
  """
  self.results.append({"scenario": scenario, "result": result})

@@ -111,18 +78,6 @@ class ScenarioReporter:
  - passed: Number of scenarios that passed
  - failed: Number of scenarios that failed
  - success_rate: Percentage of scenarios that passed (0-100)
-
- Example:
- ```python
- def test_summary_check(scenario_reporter):
- # Run some scenarios...
- await scenario.run(...)
- await scenario.run(...)
-
- summary = scenario_reporter.get_summary()
- assert summary['total'] == 2
- assert summary['success_rate'] >= 80 # Require 80% success rate
- ```
  """
  total = len(self.results)
  passed = sum(1 for r in self.results if r["result"].success)
@@ -242,7 +197,7 @@ class ScenarioReporter:


  # Store the original run method
- original_run = ScenarioExecutor._run
+ original_run = ScenarioExecutor.run


  @pytest.hookimpl(trylast=True)
@@ -304,7 +259,7 @@ def pytest_configure(config):
  return result

  # Apply the patch
- ScenarioExecutor._run = auto_reporting_run
+ ScenarioExecutor.run = auto_reporting_run


  @pytest.hookimpl(trylast=True)
@@ -314,7 +269,7 @@ def pytest_unconfigure(config):

  This hook is called when pytest is shutting down and:
  - Prints the final scenario test report
- - Restores the original ScenarioExecutor._run method
+ - Restores the original ScenarioExecutor.run method
  - Cleans up any remaining resources

  Args:
@@ -329,7 +284,7 @@ def pytest_unconfigure(config):
  config._scenario_reporter.print_report()

  # Restore the original method
- ScenarioExecutor._run = original_run
+ ScenarioExecutor.run = original_run


  @pytest.fixture
@@ -347,7 +302,7 @@ def scenario_reporter(request):

  ScenarioReporter: The global reporter instance collecting all scenario results
  Example:
- ```python
+ ```
  @pytest.mark.agent_test
  def test_with_custom_reporting(scenario_reporter):
  # Run your scenarios