langwatch-scenario 0.1.3__py3-none-any.whl → 0.2.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA +36 -23
- langwatch_scenario-0.2.0.dist-info/RECORD +15 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/WHEEL +1 -1
- scenario/pytest_plugin.py +7 -7
- scenario/result.py +11 -18
- scenario/scenario.py +13 -10
- scenario/scenario_executor.py +1 -1
- scenario/testing_agent.py +53 -56
- langwatch_scenario-0.1.3.dist-info/RECORD +0 -15
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt +0 -0
- {langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt +0 -0
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: langwatch-scenario
-Version: 0.1.3
+Version: 0.2.0
 Summary: The end-to-end agent testing library
 Author-email: LangWatch Team <support@langwatch.ai>
 License: MIT
@@ -45,6 +45,11 @@ You define the scenarios, and the testing agent will simulate your users as it f
 
 [📺 Video Tutorial](https://www.youtube.com/watch?v=f8NLpkY0Av4)
 
+### See also
+
+- [Scenario TypeScript](https://github.com/langwatch/scenario-ts/)
+- [Scenario Go](https://github.com/langwatch/scenario-go/)
+
 ## Getting Started
 
 Install pytest and scenario:
@@ -72,18 +77,23 @@ async def test_vegetarian_recipe_agent():
         # Call your agent here
         return agent.run(message)
 
-    # Define the scenario
+    # Define the simulated scenario
     scenario = Scenario(
-        "
+        name="dinner idea",
+        description="""
+            It's saturday evening, the user is very hungry and tired,
+            but have no money to order out, so they are looking for a recipe.
+
+            The user never mentions they want a vegetarian recipe.
+        """,
         agent=vegetarian_recipe_agent,
-
-
-        "
-        "
-
-
-        "
-        "The agent asks more than two follow-up questions",
+        # List the evaluation criteria for the scenario to be considered successful
+        criteria=[
+            "Agent should not ask more than two follow-up questions",
+            "Agent should generate a recipe",
+            "Recipe should include a list of ingredients",
+            "Recipe should include step-by-step cooking instructions",
+            "Recipe should be vegetarian and not include any sort of meat",
         ],
     )
 
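For readability, here is the 0.2.0 example reassembled from the `+` lines above (`Scenario` and `vegetarian_recipe_agent` come from the surrounding README code that the hunk truncates):

```python
# The new 0.2.0 call shape: a required name, a description, and one unified
# criteria list in place of the old success/failure split.
scenario = Scenario(
    name="dinner idea",
    description="""
        It's saturday evening, the user is very hungry and tired,
        but have no money to order out, so they are looking for a recipe.

        The user never mentions they want a vegetarian recipe.
    """,
    agent=vegetarian_recipe_agent,
    # List the evaluation criteria for the scenario to be considered successful
    criteria=[
        "Agent should not ask more than two follow-up questions",
        "Agent should generate a recipe",
        "Recipe should include a list of ingredients",
        "Recipe should include step-by-step cooking instructions",
        "Recipe should be vegetarian and not include any sort of meat",
    ],
)
```

Negative expectations that used to live in a separate failure list are now phrased as "should not" criteria in the same list.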
@@ -111,9 +121,11 @@ class VegetarianRecipeAgent:
             messages=[
                 {
                     "role": "system",
-                    "content": """
-
-
+                    "content": """
+                        You are a vegetarian recipe agent.
+                        Given the user request, ask AT MOST ONE follow-up question,
+                        then provide a complete recipe. Keep your responses concise and focused.
+                    """,
                 },
                 *self.history,
             ],
@@ -151,19 +163,20 @@ For example, in this Lovable Clone scenario test:
 
 ```python
 scenario = Scenario(
-    "
+    name="dog walking startup landing page",
+    description="""
+        the user wants to create a new landing page for their dog walking startup
+
+        send the first message to generate the landing page, then a single follow up request to extend it, then give your final verdict
+    """,
     agent=lovable_agent,
-
-    success_criteria=[
+    criteria=[
         "agent reads the files before go and making changes",
-        "agent modified the index.css file",
-        "agent modified the Index.tsx file",
+        "agent modified the index.css file, not only the Index.tsx file",
         "agent created a comprehensive landing page",
         "agent extended the landing page with a new section",
-
-
-        "agent says it can't read the file",
-        "agent produces incomplete code or is too lazy to finish",
+        "agent should NOT say it can't read the file",
+        "agent should NOT produce incomplete code or be too lazy to finish",
     ],
     max_turns=5,
 )
langwatch_scenario-0.2.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
+scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
+scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
+scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
+scenario/pytest_plugin.py,sha256=TzOHi8PN-dtDqaYAZkgT0wgBkhetOpYy--Z0pzi5PXM,5771
+scenario/result.py,sha256=y6mUu6X4H6YJYmwVD4VWHCBi-1BTlUVeYrTZ3HBA0oU,2382
+scenario/scenario.py,sha256=OTadwIHIcUhXxfUNnJXpT7h3GZ_VUL3XSd9k-oVPfMo,4069
+scenario/scenario_executor.py,sha256=phRKj7vZ_QjGUO9w05-DPrAzdacg_7CnTV59lYLCCKk,7912
+scenario/testing_agent.py,sha256=y4B8TMhKryeTiiv62qwslx7Gw_zw54Vk9zPyswEPm0k,10481
+scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
+langwatch_scenario-0.2.0.dist-info/METADATA,sha256=fc1oBg2ms-iVgYc44oSTJk-8sw2yOe_PpWEMStvYEX4,9339
+langwatch_scenario-0.2.0.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+langwatch_scenario-0.2.0.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
+langwatch_scenario-0.2.0.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
+langwatch_scenario-0.2.0.dist-info/RECORD,,
scenario/pytest_plugin.py CHANGED
@@ -82,7 +82,7 @@ class ScenarioReporter:
             time = f" in {result.total_time:.2f}s (agent: {result.agent_time:.2f}s)"
 
         print(
-            f"\n{idx}. {scenario.
+            f"\n{idx}. {scenario.name} - {colored(status, status_color, attrs=['bold'])}{time}"
         )
 
         print(
@@ -92,23 +92,23 @@ class ScenarioReporter:
                 )
             )
 
-            if hasattr(result, "
-                criteria_count = len(result.
-                total_criteria = len(scenario.
+            if hasattr(result, "passed_criteria") and result.passed_criteria:
+                criteria_count = len(result.passed_criteria)
+                total_criteria = len(scenario.criteria)
                 criteria_color = (
                     "green" if criteria_count == total_criteria else "yellow"
                 )
                 print(
                     colored(
-                        f"
+                        f"  Passed Criteria: {criteria_count}/{total_criteria}",
                         criteria_color,
                     )
                 )
 
-            if hasattr(result, "
+            if hasattr(result, "failed_criteria") and result.failed_criteria:
                 print(
                     colored(
-                        f"
+                        f"  Failed Criteria: {len(result.failed_criteria)}",
                         "red",
                     )
                 )
scenario/result.py CHANGED
@@ -15,17 +15,15 @@ class ScenarioResult:
         success: Whether the scenario passed
         conversation: The conversation history
         reasoning: Reasoning for the result
-
-
-        triggered_failures: List of failure criteria that were triggered
+        passed_criteria: List of criteria that were met
+        failed_criteria: List of criteria that were not met
     """
 
     success: bool
     conversation: List[Dict[str, str]]
     reasoning: Optional[str] = None
-
-
-    triggered_failures: List[str] = field(default_factory=list)
+    passed_criteria: List[str] = field(default_factory=list)
+    failed_criteria: List[str] = field(default_factory=list)
     total_time: Optional[float] = None
     agent_time: Optional[float] = None
 
@@ -39,7 +37,7 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: Optional[str],
-
+        passed_criteria: List[str],
         total_time: Optional[float] = None,
         agent_time: Optional[float] = None,
     ) -> "ScenarioResult":
@@ -48,9 +46,8 @@ class ScenarioResult:
             success=True,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=[],
+            passed_criteria=passed_criteria,
+            failed_criteria=[],
             total_time=total_time,
             agent_time=agent_time,
         )
@@ -60,9 +57,8 @@ class ScenarioResult:
         cls,
         conversation: List[Dict[str, str]],
         reasoning: str,
-
-
-        triggered_failures: Optional[List[str]] = None,
+        passed_criteria: Optional[List[str]] = None,
+        failed_criteria: Optional[List[str]] = None,
         total_time: Optional[float] = None,
         agent_time: Optional[float] = None,
     ) -> "ScenarioResult":
@@ -71,11 +67,8 @@ class ScenarioResult:
             success=False,
             conversation=conversation,
             reasoning=reasoning,
-
-
-            triggered_failures=(
-                triggered_failures if triggered_failures is not None else []
-            ),
+            passed_criteria=passed_criteria if passed_criteria is not None else [],
+            failed_criteria=failed_criteria if failed_criteria is not None else [],
             total_time=total_time,
             agent_time=agent_time,
         )
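A short sketch of the renamed fields in use, assuming the constructors above are importable as `scenario.result.ScenarioResult` (conversation and criteria strings are illustrative):

```python
from scenario.result import ScenarioResult

result = ScenarioResult.failure_result(
    conversation=[{"role": "user", "content": "What should I cook tonight?"}],
    reasoning="A recipe was produced, but it contained chicken.",
    passed_criteria=["Agent should generate a recipe"],
    failed_criteria=["Recipe should be vegetarian and not include any sort of meat"],
)
assert result.success is False
assert len(result.passed_criteria) == 1 and len(result.failed_criteria) == 1
```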
scenario/scenario.py CHANGED
@@ -15,6 +15,7 @@ from .testing_agent import TestingAgent
 
 from openai.types.chat import ChatCompletionMessageParam
 
+
 class AgentResult(TypedDict, total=False):
     message: str
     messages: List[ChatCompletionMessageParam]
@@ -27,34 +28,36 @@ class Scenario(ScenarioConfig):
 
     It includes:
     - A description of the scenario
-    -
-    - Failure criteria to determine if the agent failed
-    - An optional strategy that guides the testing agent
+    - Criteria to determine if the agent behaved correctly
     - Optional additional parameters
     """
 
+    name: str
     description: str
     agent: Union[
         Callable[[str, Optional[Dict[str, Any]]], Dict[str, Any]],
         Callable[[str, Optional[Dict[str, Any]]], Awaitable[Dict[str, Any]]],
     ]
-
-    failure_criteria: List[str] = []
-    strategy: Optional[str] = None
+    criteria: List[str]
 
-    def __init__(self, description: str, **kwargs):
+    def __init__(self, name: str, description: str, **kwargs):
         """Validate scenario configuration after initialization."""
 
         default_config = getattr(Scenario, "default_config", None)
         if default_config:
             kwargs = {**default_config.model_dump(), **kwargs}
 
+        if not name:
+            raise ValueError("Scenario name cannot be empty")
+        kwargs["name"] = name
+
         if not description:
             raise ValueError("Scenario description cannot be empty")
         kwargs["description"] = description
 
-
-
+        # TODO: allow not having any criteria, for scripted scenarios
+        if not kwargs.get("criteria"):
+            raise ValueError("Scenario must have at least one criteria")
 
         if kwargs.get("max_turns", 0) < 1:
             raise ValueError("max_turns must be a positive integer")
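A sketch of the stricter constructor, assuming `Scenario` is importable from the package root as in the README examples; both calls should raise before any agent is involved:

```python
import pytest
from scenario import Scenario

# `name` is now required and may not be empty.
with pytest.raises(ValueError, match="name cannot be empty"):
    Scenario(name="", description="user asks for a dinner recipe", criteria=["..."])

# At least one criterion is required (scripted scenarios are an upstream TODO).
with pytest.raises(ValueError, match="at least one criteria"):
    Scenario(name="dinner idea", description="user asks for a dinner recipe", criteria=[])
```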
@@ -65,7 +68,6 @@ class Scenario(ScenarioConfig):
 
         super().__init__(**kwargs)
 
-
     async def run(self, context: Optional[Dict[str, Any]] = None) -> ScenarioResult:
         """
         Run the scenario against the agent under test.
@@ -82,6 +84,7 @@ class Scenario(ScenarioConfig):
         # being used throughout, any user code on the callback can
         # be blocking, preventing them from running scenarios in parallel
         with concurrent.futures.ThreadPoolExecutor() as executor:
+
             def run_in_thread():
                 loop = asyncio.new_event_loop()
                 asyncio.set_event_loop(loop)
scenario/scenario_executor.py CHANGED
@@ -199,6 +199,6 @@ class ScenarioExecutor:
 
     def _scenario_name(self):
         if self.scenario.verbose == 2:
-            return termcolor.colored(f"[Scenario: {self.scenario.
+            return termcolor.colored(f"[Scenario: {self.scenario.name}] ", "yellow")
         else:
             return ""
scenario/testing_agent.py CHANGED
@@ -4,6 +4,7 @@ TestingAgent module: defines the testing agent that interacts with the agent und
 
 import json
 import logging
+import re
 from typing import TYPE_CHECKING, Dict, List, Any, Optional, Union, cast
 from pydantic import BaseModel
 
@@ -74,27 +75,19 @@ Your goal (assistant) is to interact with the Agent Under Test (user) as if you
 {scenario.description}
 </scenario>
 
-<
-{
-</
-
-<success_criteria>
-{json.dumps(scenario.success_criteria, indent=2)}
-</success_criteria>
-
-<failure_criteria>
-{json.dumps(scenario.failure_criteria, indent=2)}
-</failure_criteria>
+<criteria>
+{"\n".join([f"{idx + 1}. {criterion}" for idx, criterion in enumerate(scenario.criteria)])}
+</criteria>
 
 <execution_flow>
 1. Generate the first message to start the scenario
 2. After the Agent Under Test (user) responds, generate the next message to send to the Agent Under Test, keep repeating step 2 until criterias match
-3. If the test should end, use the finish_test tool to determine if
+3. If the test should end, use the finish_test tool to determine if all the criteria have been met
 </execution_flow>
 
 <rules>
-1. Test should end immediately if a
-2. Test should continue until all
+1. Test should end immediately if a criteria mentioning something the agent should NOT do is met
+2. Test should continue until all scenario goals have been met to try going through all the criteria
 3. DO NOT make any judgment calls that are not explicitly listed in the success or failure criteria, withhold judgement if necessary
 4. DO NOT carry over any requests yourself, YOU ARE NOT the assistant today, wait for the user to do it
 </rules>
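The prompt now renders criteria as a numbered list instead of JSON dumps of two separate lists; a standalone sketch of that join expression:

```python
criteria = [
    "Agent should generate a recipe",
    "Recipe should be vegetarian and not include any sort of meat",
]
print("\n".join(f"{idx + 1}. {criterion}" for idx, criterion in enumerate(criteria)))
# 1. Agent should generate a recipe
# 2. Recipe should be vegetarian and not include any sort of meat
```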
@@ -141,6 +134,14 @@ if you don't have enough information to make a verdict, say inconclusive with ma
             message.role = "user"
 
         # Define the tool
+        criteria_names = [
+            re.sub(
+                r"[^a-zA-Z0-9]",
+                "_",
+                criterion.replace(" ", "_").replace("'", "").lower(),
+            )[:70]
+            for criterion in scenario.criteria
+        ]
         tools = [
             {
                 "type": "function",
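Each criterion is turned into a JSON-schema-safe property name for the finish_test tool; a standalone sketch of the sanitization above:

```python
import re

def criterion_to_name(criterion: str) -> str:
    # Same steps as above: spaces to underscores, apostrophes dropped, lowercased,
    # remaining non-alphanumerics replaced with "_", capped at 70 characters.
    return re.sub(
        r"[^a-zA-Z0-9]", "_", criterion.replace(" ", "_").replace("'", "").lower()
    )[:70]

print(criterion_to_name("Recipe should be vegetarian and not include any sort of meat"))
# recipe_should_be_vegetarian_and_not_include_any_sort_of_meat
```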
@@ -151,40 +152,30 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                 "parameters": {
                     "type": "object",
                     "properties": {
-                        "
-                            "type": "string",
-                            "enum": ["success", "failure", "inconclusive"],
-                            "description": "The final verdict of the test",
-                        },
-                        "reasoning": {
-                            "type": "string",
-                            "description": "Explanation of why this verdict was reached",
-                        },
-                        "details": {
+                        "criteria": {
                             "type": "object",
                             "properties": {
-
-                                "
-                                "
-
-
-                                "unmet_criteria": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of success criteria that have not been met",
-                                },
-                                "triggered_failures": {
-                                    "type": "array",
-                                    "items": {"type": "string"},
-                                    "description": "List of failure criteria that have been triggered",
-                                },
+                                criteria_names[idx]: {
+                                    "enum": [True, False, "inconclusive"],
+                                    "description": criterion,
+                                }
+                                for idx, criterion in enumerate(scenario.criteria)
                             },
-                            "required":
+                            "required": criteria_names,
                             "additionalProperties": False,
-                            "description": "
+                            "description": "Strict verdict for each criterion",
+                        },
+                        "reasoning": {
+                            "type": "string",
+                            "description": "Explanation of what the final verdict should be",
+                        },
+                        "verdict": {
+                            "type": "string",
+                            "enum": ["success", "failure", "inconclusive"],
+                            "description": "The final verdict of the test",
                         },
                     },
-                    "required": ["
+                    "required": ["criteria", "reasoning", "verdict"],
                     "additionalProperties": False,
                 },
             },
@@ -216,35 +207,40 @@ if you don't have enough information to make a verdict, say inconclusive with ma
                     args = json.loads(tool_call.function.arguments)
                     verdict = args.get("verdict", "inconclusive")
                     reasoning = args.get("reasoning", "No reasoning provided")
-
-
-
-
-
+                    criteria = args.get("criteria", {})
+
+                    passed_criteria = [
+                        scenario.criteria[idx]
+                        for idx, criterion in enumerate(criteria.values())
+                        if criterion == True
+                    ]
+                    failed_criteria = [
+                        scenario.criteria[idx]
+                        for idx, criterion in enumerate(criteria.values())
+                        if criterion == False
+                    ]
 
                     # Return the appropriate ScenarioResult based on the verdict
                     if verdict == "success":
                         return ScenarioResult.success_result(
                             conversation=conversation,
                             reasoning=reasoning,
-
+                            passed_criteria=passed_criteria,
                         )
                     elif verdict == "failure":
                         return ScenarioResult.failure_result(
                             conversation=conversation,
                             reasoning=reasoning,
-
-
-                            triggered_failures=triggered_failures,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
                         )
                     else:  # inconclusive
                         return ScenarioResult(
                             success=False,
                             conversation=conversation,
                             reasoning=reasoning,
-
-
-                            triggered_failures=triggered_failures,
+                            passed_criteria=passed_criteria,
+                            failed_criteria=failed_criteria,
                         )
                 except json.JSONDecodeError:
                     logger.error("Failed to parse tool call arguments")
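A standalone sketch of that mapping with an invented payload; note the pairing is positional, so it relies on the model echoing the `criteria` keys in the same order the schema declared them:

```python
scenario_criteria = [
    "Agent should generate a recipe",
    "Recipe should be vegetarian and not include any sort of meat",
]
# Hypothetical finish_test "criteria" argument, keyed by sanitized names.
criteria = {
    "agent_should_generate_a_recipe": True,
    "recipe_should_be_vegetarian_and_not_include_any_sort_of_meat": False,
}

passed = [scenario_criteria[idx] for idx, v in enumerate(criteria.values()) if v is True]
failed = [scenario_criteria[idx] for idx, v in enumerate(criteria.values()) if v is False]
print(passed)  # ['Agent should generate a recipe']
print(failed)  # ['Recipe should be vegetarian and not include any sort of meat']
```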
@@ -254,7 +250,9 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         if message_content is None:
             # If invalid tool call, raise an error
             if message.tool_calls:
-                raise Exception(
+                raise Exception(
+                    f"Invalid tool call from testing agent: {message.tool_calls.__repr__()}"
+                )
             raise Exception(f"No response from LLM: {response.__repr__()}")
 
         return message_content
@@ -262,4 +260,3 @@ if you don't have enough information to make a verdict, say inconclusive with ma
         raise Exception(
             f"Unexpected response format from LLM: {response.__repr__()}"
         )
-
langwatch_scenario-0.1.3.dist-info/RECORD DELETED
@@ -1,15 +0,0 @@
-scenario/__init__.py,sha256=LfCjOpbn55jYBBZHyMSZtRAWeCDFn4z4OhAyFnu8aMg,602
-scenario/cache.py,sha256=sYu16SAf-BnVYkWSlEDzpyynJGIQyNYsgMXPgCqEnmk,1719
-scenario/config.py,sha256=5UVBmuQDtni0Yu00bMh5p0xMGsrymYVRftXBGTsi2fI,802
-scenario/error_messages.py,sha256=ZMcAOKJmKaLIinMZ0yBIOgDhPfeJH0uZxIEmolRArtc,2344
-scenario/pytest_plugin.py,sha256=BuBbyKLa-t9AFVn9EETl7OvGSt__dFO7KnbZynfS1UM,5789
-scenario/result.py,sha256=SGF8uYNtkP7cJy4KsshUozZRevmdiyX2TFzr6VreTv8,2717
-scenario/scenario.py,sha256=tYn3Y1sK6_7pg7hFb_5w0TW6nun-za_4F8kqcnrXXU4,4077
-scenario/scenario_executor.py,sha256=c8xV6GoJgO2JoZBWpYPQN5YwwQ3G9iJUtXV9UGSf1q8,7919
-scenario/testing_agent.py,sha256=eS-c_io5cHgzJ88wwRvU_vve-pmB2HsGWN6qwlq0sPg,10865
-scenario/utils.py,sha256=tMESosrxesA1B5zZB3IJ-sNSXDmnpNNib-DHobveVLA,3918
-langwatch_scenario-0.1.3.dist-info/METADATA,sha256=7OIolGcZ3fkCXFmE6JHkckVCeJb1r3yYSYveJ6iE9zw,8801
-langwatch_scenario-0.1.3.dist-info/WHEEL,sha256=pxyMxgL8-pra_rKaQ4drOZAegBVuX-G_4nRHjjgWbmo,91
-langwatch_scenario-0.1.3.dist-info/entry_points.txt,sha256=WlEnJ_gku0i18bIa3DSuGqXRX-QDQLe_s0YmRzK45TI,45
-langwatch_scenario-0.1.3.dist-info/top_level.txt,sha256=45Mn28aedJsetnBMB5xSmrJ-yo701QLH89Zlz4r1clE,9
-langwatch_scenario-0.1.3.dist-info/RECORD,,
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/entry_points.txt RENAMED
File without changes
{langwatch_scenario-0.1.3.dist-info → langwatch_scenario-0.2.0.dist-info}/top_level.txt RENAMED
File without changes