langwatch-scenario 0.3.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their respective public registries; it is provided for informational purposes only and reflects the changes between those versions.
@@ -0,0 +1,414 @@
+"""
+Judge agent module for evaluating scenario conversations.
+
+This module provides the JudgeAgent class, which evaluates ongoing conversations
+between users and agents to determine if success criteria are met. The judge
+makes real-time decisions about whether scenarios should continue or end with
+success/failure verdicts.
+"""
+
+import json
+import logging
+import re
+from typing import List, Optional, cast
+
+from litellm import Choices, completion
+from litellm.files.main import ModelResponse
+
+from scenario.cache import scenario_cache
+from scenario.agent_adapter import AgentAdapter
+from scenario.config import ModelConfig, ScenarioConfig
+
+from ._error_messages import agent_not_configured_error_message
+from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
+
+
+logger = logging.getLogger("scenario")
+
+
+class JudgeAgent(AgentAdapter):
+    """
+    Agent that evaluates conversations against success criteria.
+
+    The JudgeAgent watches conversations in real-time and makes decisions about
+    whether the agent under test is meeting the specified criteria. It can either
+    allow the conversation to continue or end it with a success/failure verdict.
+
+    The judge uses function calling to make structured decisions and provides
+    detailed reasoning for its verdicts. It evaluates each criterion independently
+    and provides comprehensive feedback about what worked and what didn't.
+
+    Attributes:
+        role: Always AgentRole.JUDGE for judge agents
+        model: LLM model identifier to use for evaluation
+        api_key: Optional API key for the model provider
+        temperature: Sampling temperature for evaluation consistency
+        max_tokens: Maximum tokens for judge reasoning
+        criteria: List of success criteria to evaluate against
+        system_prompt: Custom system prompt to override default judge behavior
+
+    Example:
+        ```
+        import scenario
+
+        # Basic judge agent with criteria
+        judge = scenario.JudgeAgent(
+            criteria=[
+                "Agent provides helpful responses",
+                "Agent asks relevant follow-up questions",
+                "Agent does not provide harmful information"
+            ]
+        )
+
+        # Customized judge with specific model and behavior
+        strict_judge = scenario.JudgeAgent(
+            model="openai/gpt-4.1-mini",
+            criteria=[
+                "Code examples are syntactically correct",
+                "Explanations are technically accurate",
+                "Security best practices are mentioned"
+            ],
+            temperature=0.0,  # More deterministic evaluation
+            system_prompt="You are a strict technical reviewer evaluating code quality."
+        )
+
+        # Use in scenario
+        result = await scenario.run(
+            name="coding assistant test",
+            description="User asks for help with Python functions",
+            agents=[
+                coding_agent,
+                scenario.UserSimulatorAgent(),
+                judge
+            ]
+        )
+
+        print(f"Passed criteria: {result.passed_criteria}")
+        print(f"Failed criteria: {result.failed_criteria}")
+        ```
+
+    Note:
+        - Judge agents evaluate conversations continuously, not just at the end
+        - They can end scenarios early if clear success/failure conditions are met
+        - Provide detailed reasoning for their decisions
+        - Support both positive criteria (things that should happen) and negative criteria (things that shouldn't)
+    """
+    role = AgentRole.JUDGE
+
+    model: str
+    api_key: Optional[str]
+    temperature: float
+    max_tokens: Optional[int]
+    criteria: List[str]
+    system_prompt: Optional[str]
+
+    def __init__(
+        self,
+        *,
+        criteria: Optional[List[str]] = None,
+        model: Optional[str] = None,
+        api_key: Optional[str] = None,
+        temperature: float = 0.0,
+        max_tokens: Optional[int] = None,
+        system_prompt: Optional[str] = None,
+    ):
+        """
+        Initialize a judge agent with evaluation criteria.
+
+        Args:
+            criteria: List of success criteria to evaluate the conversation against.
+                Can include both positive requirements ("Agent provides helpful responses")
+                and negative constraints ("Agent should not provide personal information").
+            model: LLM model identifier (e.g., "openai/gpt-4.1-mini").
+                If not provided, uses the default model from global configuration.
+            api_key: API key for the model provider. If not provided,
+                uses the key from global configuration or environment.
+            temperature: Sampling temperature for evaluation (0.0-1.0).
+                Lower values (0.0-0.2) recommended for consistent evaluation.
+            max_tokens: Maximum number of tokens for judge reasoning and explanations.
+            system_prompt: Custom system prompt to override default judge behavior.
+                Use this to create specialized evaluation perspectives.
+
+        Raises:
+            Exception: If no model is configured either in parameters or global config
+
+        Example:
+            ```
+            # Customer service judge
+            cs_judge = JudgeAgent(
+                criteria=[
+                    "Agent replies with the refund policy",
+                    "Agent offers next steps for the customer",
+                ],
+                temperature=0.1
+            )
+
+            # Technical accuracy judge
+            tech_judge = JudgeAgent(
+                criteria=[
+                    "Agent adds a code review pointing out the code compilation errors",
+                    "Agent adds a code review about the missing security headers"
+                ],
+                system_prompt="You are a senior software engineer reviewing code for production use."
+            )
+            ```
+        """
+        # Override the default system prompt for the judge agent
+        self.criteria = criteria or []
+        self.api_key = api_key
+        self.temperature = temperature
+        self.max_tokens = max_tokens
+        self.system_prompt = system_prompt
+
+        if model:
+            self.model = model
+
+        if ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, str
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model
+        elif ScenarioConfig.default_config is not None and isinstance(
+            ScenarioConfig.default_config.default_model, ModelConfig
+        ):
+            self.model = model or ScenarioConfig.default_config.default_model.model
+            self.api_key = (
+                api_key or ScenarioConfig.default_config.default_model.api_key
+            )
+            self.temperature = (
+                temperature or ScenarioConfig.default_config.default_model.temperature
+            )
+            self.max_tokens = (
+                max_tokens or ScenarioConfig.default_config.default_model.max_tokens
+            )
+
+        if not hasattr(self, "model"):
+            raise Exception(agent_not_configured_error_message("TestingAgent"))
+
+    @scenario_cache()
+    async def call(
+        self,
+        input: AgentInput,
+    ) -> AgentReturnTypes:
+        """
+        Evaluate the current conversation state against the configured criteria.
+
+        This method analyzes the conversation history and determines whether the
+        scenario should continue or end with a verdict. It uses function calling
+        to make structured decisions and provides detailed reasoning.
+
+        Args:
+            input: AgentInput containing conversation history and scenario context
+
+        Returns:
+            AgentReturnTypes: Either an empty list (continue scenario) or a
+                ScenarioResult (end scenario with verdict)
+
+        Raises:
+            Exception: If the judge cannot make a valid decision or if there's an
+                error in the evaluation process
+
+        Note:
+            - Returns empty list [] to continue the scenario
+            - Returns ScenarioResult to end with success/failure
+            - Provides detailed reasoning for all decisions
+            - Evaluates each criterion independently
+            - Can end scenarios early if clear violation or success is detected
+        """
+
+        scenario = input.scenario_state
+
+        messages = [
+            {
+                "role": "system",
+                "content": self.system_prompt
+                or f"""
+<role>
+You are an LLM as a judge watching a simulated conversation as it plays out live to determine if the agent under test meets the criteria or not.
+</role>
+
+<goal>
+Your goal is to determine if you already have enough information to make a verdict of the scenario below, or if the conversation should continue for longer.
+If you do have enough information, use the finish_test tool to determine if all the criteria have been met, if not, use the continue_test tool to let the next step play out.
+</goal>
+
+<scenario>
+{scenario.description}
+</scenario>
+
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(self.criteria)])}
+</criteria>
+
+<rules>
+- Be strict, do not let the conversation continue if the agent already broke one of the "do not" or "should not" criteria.
+- DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgment if necessary
+</rules>
+""",
+            },
+            *input.messages,
+        ]
+
+        is_last_message = (
+            input.scenario_state.current_turn == input.scenario_state.config.max_turns
+        )
+
+        if is_last_message:
+            messages.append(
+                {
+                    "role": "user",
+                    "content": """
+System:
+
+<finish_test>
+This is the last message, conversation has reached the maximum number of turns, give your final verdict,
+if you don't have enough information to make a verdict, say inconclusive with max turns reached.
+</finish_test>
+""",
+                }
+            )
+
+        # Define the tools
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in self.criteria
+        ]
+        tools = [
+            {
+                "type": "function",
+                "function": {
+                    "name": "continue_test",
+                    "description": "Continue the test with the next step",
+                    "strict": True,
+                    "parameters": {
+                        "type": "object",
+                        "properties": {},
+                        "required": [],
+                        "additionalProperties": False,
+                    },
+                },
+            },
+            {
+                "type": "function",
+                "function": {
+                    "name": "finish_test",
+                    "description": "Complete the test with a final verdict",
+                    "strict": True,
+                    "parameters": {
+                        "type": "object",
+                        "properties": {
+                            "criteria": {
+                                "type": "object",
+                                "properties": {
+                                    criteria_names[idx]: {
+                                        "enum": [True, False, "inconclusive"],
+                                        "description": criterion,
+                                    }
+                                    for idx, criterion in enumerate(self.criteria)
+                                },
+                                "required": criteria_names,
+                                "additionalProperties": False,
+                                "description": "Strict verdict for each criterion",
+                            },
+                            "reasoning": {
+                                "type": "string",
+                                "description": "Explanation of what the final verdict should be",
+                            },
+                            "verdict": {
+                                "type": "string",
+                                "enum": ["success", "failure", "inconclusive"],
+                                "description": "The final verdict of the test",
+                            },
+                        },
+                        "required": ["criteria", "reasoning", "verdict"],
+                        "additionalProperties": False,
+                    },
+                },
+            },
+        ]
+
+        enforce_judgment = input.judgment_request
+        has_criteria = len(self.criteria) > 0
+
+        if enforce_judgment and not has_criteria:
+            return ScenarioResult(
+                success=False,
+                messages=[],
+                reasoning="TestingAgent was called as a judge, but it has no criteria to judge against",
+            )
+
+        response = cast(
+            ModelResponse,
+            completion(
+                model=self.model,
+                messages=messages,
+                temperature=self.temperature,
+                max_tokens=self.max_tokens,
+                tools=tools,
+                tool_choice=(
+                    {"type": "function", "function": {"name": "finish_test"}}
+                    if (is_last_message or enforce_judgment) and has_criteria
+                    else "required"
+                ),
+            ),
+        )
+
+        # Extract the content from the response
+        if hasattr(response, "choices") and len(response.choices) > 0:
+            message = cast(Choices, response.choices[0]).message
+
+            # Check if the LLM chose to use the tool
+            if message.tool_calls:
+                tool_call = message.tool_calls[0]
+                if tool_call.function.name == "continue_test":
+                    return []
+
+                if tool_call.function.name == "finish_test":
+                    # Parse the tool call arguments
+                    try:
+                        args = json.loads(tool_call.function.arguments)
+                        verdict = args.get("verdict", "inconclusive")
+                        reasoning = args.get("reasoning", "No reasoning provided")
+                        criteria = args.get("criteria", {})
+
+                        passed_criteria = [
+                            self.criteria[idx]
+                            for idx, criterion in enumerate(criteria.values())
+                            if criterion == True
+                        ]
+                        failed_criteria = [
+                            self.criteria[idx]
+                            for idx, criterion in enumerate(criteria.values())
+                            if criterion == False
+                        ]
+
+                        # Return the appropriate ScenarioResult based on the verdict
+                        return ScenarioResult(
+                            success=verdict == "success" and len(failed_criteria) == 0,
+                            messages=messages,
+                            reasoning=reasoning,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
+                        )
+                    except json.JSONDecodeError:
+                        raise Exception(
+                            f"Failed to parse tool call arguments from judge agent: {tool_call.function.arguments}"
+                        )
+
+                else:
+                    raise Exception(
+                        f"Invalid tool call from judge agent: {tool_call.function.name}"
+                    )
+
+            else:
+                raise Exception(
+                    f"Invalid response from judge agent, tool calls not found: {message.__repr__()}"
+                )
+
+        else:
+            raise Exception(
+                f"Unexpected response format from LLM: {response.__repr__()}"
+            )
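The new JudgeAgent exposes its criteria to the LLM as a strict `finish_test` tool schema keyed by sanitized criterion names. The following standalone sketch (illustrative only, not part of the package) reproduces the naming step from the `call` method above, so the schema keys are easier to predict:

```python
# Sketch of JudgeAgent's criterion-name sanitization (mirrors the re.sub logic above).
import re

criteria = [
    "Agent provides helpful responses",
    "Agent does not provide harmful information",
]

criteria_names = [
    re.sub(
        r"[^a-zA-Z0-9]",
        "_",
        criterion.replace(" ", "_").replace("'", "").lower(),
    )[:70]  # truncated to 70 characters, as in the module above
    for criterion in criteria
]

print(criteria_names)
# ['agent_provides_helpful_responses', 'agent_does_not_provide_harmful_information']
```

Each sanitized name becomes a required property of the `finish_test` tool's `criteria` object, with allowed values `True`, `False`, or `"inconclusive"`.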
scenario/pytest_plugin.py CHANGED
@@ -1,5 +1,10 @@
 """
 Pytest plugin for Scenario testing library.
+
+This module provides pytest integration for the Scenario framework, including
+automatic test reporting, debug mode support, and collection of scenario
+results across test runs. It enables seamless integration with existing
+pytest-based testing workflows.
 """
 
 import pytest
@@ -7,27 +12,73 @@ from typing import TypedDict
 import functools
 from termcolor import colored
 
+from scenario.config import ScenarioConfig
 from scenario.types import ScenarioResult
 
-from .scenario import Scenario
+from .scenario_executor import ScenarioExecutor
 
 
 class ScenarioReporterResults(TypedDict):
-    scenario: Scenario
+    """
+    Type definition for scenario test results stored by the reporter.
+
+    Attributes:
+        scenario: The ScenarioExecutor instance that ran the test
+        result: The ScenarioResult containing test outcome and details
+    """
+
+    scenario: ScenarioExecutor
     result: ScenarioResult
 
 
 # ScenarioReporter class definition moved outside the fixture for global use
 class ScenarioReporter:
+    """
+    Collects and reports on scenario test results across a pytest session.
+
+    This class automatically collects results from all scenario tests run during
+    a pytest session and provides comprehensive reporting including success rates,
+    timing information, and detailed failure analysis.
+
+    The reporter is automatically instantiated by the pytest plugin and collects
+    results from all scenario.run() calls without requiring explicit user setup.
+
+    Attributes:
+        results: List of all scenario test results collected during the session
+    """
+
     def __init__(self):
+        """Initialize an empty scenario reporter."""
         self.results: list[ScenarioReporterResults] = []
 
-    def add_result(self, scenario, result):
-        """Add a test result to the reporter."""
+    def add_result(self, scenario: ScenarioExecutor, result: ScenarioResult):
+        """
+        Add a test result to the reporter.
+
+        This method is called automatically by the pytest plugin whenever
+        a scenario.run() call completes. It stores both the scenario
+        configuration and the test result for later reporting.
+
+        Args:
+            scenario: The ScenarioExecutor instance that ran the test
+            result: The ScenarioResult containing test outcome and details
+        """
         self.results.append({"scenario": scenario, "result": result})
 
     def get_summary(self):
-        """Get a summary of all test results."""
+        """
+        Get a summary of all test results.
+
+        Calculates aggregate statistics across all scenario tests that
+        have been run during the current pytest session.
+
+        Returns:
+            Dictionary containing summary statistics:
+            - total: Total number of scenarios run
+            - passed: Number of scenarios that passed
+            - failed: Number of scenarios that failed
+            - success_rate: Percentage of scenarios that passed (0-100)
+        """
         total = len(self.results)
         passed = sum(1 for r in self.results if r["result"].success)
         failed = total - passed
@@ -40,7 +91,36 @@ class ScenarioReporter:
         }
 
     def print_report(self):
-        """Print a detailed report of all test results."""
+        """
+        Print a detailed report of all test results.
+
+        Outputs a comprehensive report to the console showing:
+        - Overall summary statistics
+        - Individual scenario results with success/failure status
+        - Detailed reasoning for each scenario outcome
+        - Timing information when available
+        - Criteria pass/fail breakdown for judge-evaluated scenarios
+
+        The report is automatically printed at the end of pytest sessions,
+        but can also be called manually for intermediate reporting.
+
+        Example output:
+        ```
+        === Scenario Test Report ===
+        Total Scenarios: 5
+        Passed: 4
+        Failed: 1
+        Success Rate: 80%
+
+        1. weather query test - PASSED in 2.34s (agent: 1.12s)
+           Reasoning: Agent successfully provided weather information
+           Passed Criteria: 2/2
+
+        2. complex math problem - FAILED in 5.67s (agent: 3.45s)
+           Reasoning: Agent provided incorrect calculation
+           Failed Criteria: 1
+        ```
+        """
         if not self.results:
             return  # Skip report if no results
 
@@ -94,7 +174,9 @@ class ScenarioReporter:
 
             if hasattr(result, "passed_criteria") and result.passed_criteria:
                 criteria_count = len(result.passed_criteria)
-                total_criteria = len(scenario.criteria)
+                total_criteria = len(result.passed_criteria) + len(
+                    result.failed_criteria
+                )
                 criteria_color = (
                     "green" if criteria_count == total_criteria else "yellow"
                 )
@@ -115,12 +197,40 @@ class ScenarioReporter:
 
 
 # Store the original run method
-original_run = Scenario.run
+original_run = ScenarioExecutor._run
 
 
 @pytest.hookimpl(trylast=True)
 def pytest_configure(config):
-    """Register the agent_test marker and set up automatic reporting."""
+    """
+    Configure pytest integration for Scenario testing.
+
+    This hook is called when pytest starts and sets up:
+    - Registration of the @pytest.mark.agent_test marker
+    - Debug mode configuration from command line arguments
+    - Global scenario reporter for collecting results
+    - Automatic result collection from all scenario.run() calls
+
+    Args:
+        config: pytest configuration object
+
+    Note:
+        This function runs automatically when pytest loads the plugin.
+        Users don't need to call it directly.
+
+    Debug Mode:
+        When --debug is passed to pytest, enables step-by-step scenario
+        execution with user intervention capabilities.
+
+    Example:
+        ```bash
+        # Enable debug mode for all scenarios
+        pytest tests/ --debug -s
+
+        # Run normally
+        pytest tests/
+        ```
+    """
     # Register the marker
     config.addinivalue_line(
         "markers", "agent_test: mark test as an agent scenario test"
@@ -128,7 +238,7 @@ def pytest_configure(config):
 
     if config.getoption("--debug"):
         print(colored("\nScenario debug mode enabled (--debug).", "yellow"))
-        Scenario.configure(verbose=True, debug=True)
+        ScenarioConfig.configure(verbose=True, debug=True)
 
     # Create a global reporter instance
     config._scenario_reporter = ScenarioReporter()
@@ -149,27 +259,80 @@ def pytest_configure(config):
         return result
 
     # Apply the patch
-    Scenario.run = auto_reporting_run
+    ScenarioExecutor._run = auto_reporting_run
 
 
 @pytest.hookimpl(trylast=True)
 def pytest_unconfigure(config):
-    """Clean up and print final report when pytest exits."""
+    """
+    Clean up pytest integration when pytest exits.
+
+    This hook is called when pytest is shutting down and:
+    - Prints the final scenario test report
+    - Restores the original ScenarioExecutor._run method
+    - Cleans up any remaining resources
+
+    Args:
+        config: pytest configuration object
+
+    Note:
+        This function runs automatically when pytest exits.
+        Users don't need to call it directly.
+    """
     # Print the final report
     if hasattr(config, "_scenario_reporter"):
         config._scenario_reporter.print_report()
 
     # Restore the original method
-    Scenario.run = original_run
+    ScenarioExecutor._run = original_run
 
 
 @pytest.fixture
 def scenario_reporter(request):
     """
-    A pytest fixture for accessing the global scenario reporter.
+    Pytest fixture for accessing the global scenario reporter.
 
     This fixture provides access to the same reporter that's used for automatic
     reporting, allowing tests to explicitly interact with the reporter if needed.
+
+    Args:
+        request: pytest request object containing test context
+
+    Yields:
+        ScenarioReporter: The global reporter instance collecting all scenario results
+
+    Example:
+        ```
+        @pytest.mark.agent_test
+        def test_with_custom_reporting(scenario_reporter):
+            # Run your scenarios
+            result1 = await scenario.run(
+                name="test 1",
+                description="First test",
+                agents=[agent, user_sim, judge]
+            )
+
+            result2 = await scenario.run(
+                name="test 2",
+                description="Second test",
+                agents=[agent, user_sim, judge]
+            )
+
+            # Access collected results
+            assert len(scenario_reporter.results) == 2
+
+            # Check success rate
+            summary = scenario_reporter.get_summary()
+            assert summary['success_rate'] >= 90
+
+            # Print intermediate report
+            if summary['failed'] > 0:
+                scenario_reporter.print_report()
+        ```
+
+    Note:
+        The reporter automatically collects results from all scenario.run() calls,
+        so you don't need to manually add results unless you're doing custom reporting.
     """
     # Get the global reporter from pytest config
     reporter = request.config._scenario_reporter
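The auto-reporting hook the plugin installs is only partially visible in the hunks above: the body of `auto_reporting_run` falls outside the diff context, and only its `return result` and the patch assignment appear. A rough conftest-style sketch of the pattern follows; the wrapper internals and the absolute import path are assumptions based on the hunks, not the package's exact code:

```python
# Sketch of the auto-reporting monkeypatch pattern; illustrative only.
import functools

import pytest

from scenario.scenario_executor import ScenarioExecutor  # assumed absolute form of the diff's relative import

# Store the original run method, as the plugin does at module level
original_run = ScenarioExecutor._run


@pytest.hookimpl(trylast=True)
def pytest_configure(config):
    config._scenario_reporter = []  # stand-in for the plugin's ScenarioReporter

    @functools.wraps(original_run)
    async def auto_reporting_run(self, *args, **kwargs):
        # Assumes ScenarioExecutor._run is a coroutine, as the awaited
        # scenario.run(...) examples above suggest.
        result = await original_run(self, *args, **kwargs)
        config._scenario_reporter.append({"scenario": self, "result": result})
        return result

    # Apply the patch for the duration of the session
    ScenarioExecutor._run = auto_reporting_run


@pytest.hookimpl(trylast=True)
def pytest_unconfigure(config):
    # Restore the original method when pytest exits
    ScenarioExecutor._run = original_run
```

Storing `original_run` up front and restoring it in `pytest_unconfigure` keeps the monkeypatch scoped to the pytest session, which matches what the hunks above show for `ScenarioExecutor._run`.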