langwatch-scenario 0.4.0__py3-none-any.whl → 0.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,175 @@
+ from rx.subject.subject import Subject
+ from rx import operators as ops
+ from typing import Optional
+ from datetime import datetime, UTC
+ from .events import ScenarioEvent, ScenarioRunFinishedEvent
+ from .event_reporter import EventReporter
+ from typing import Any
+
+ import asyncio
+
+
+ class ScenarioEventBus:
+     """
+     Manages scenario event publishing, subscription, and processing pipeline using RxPY.
+
+     The EventBus provides a centralized event processing system that handles scenario
+     events asynchronously with retry logic and concurrent processing. It automatically
+     manages the event stream lifecycle and ensures all events are processed before
+     completion.
+
+     Events are processed concurrently to improve performance, and failed event
+     processing is automatically retried with exponential backoff.
+
+     Attributes:
+         _events: RxPY Subject for event stream management
+         _event_reporter: EventReporter instance for HTTP posting of events
+         _processing_complete: Async event to signal when all events are processed
+         _processing_task: Background task for event processing
+         _max_retries: Maximum number of retry attempts for failed event processing
+
+     Example:
+         ```python
+         # Create event bus with custom reporter
+         reporter = EventReporter(endpoint="https://api.langwatch.ai")
+         event_bus = ScenarioEventBus(event_reporter=reporter, max_retries=5)
+
+         # Start listening for events
+         await event_bus.listen()
+
+         # Publish events
+         event_bus.publish(scenario_started_event)
+         event_bus.publish(message_snapshot_event)
+         event_bus.publish(scenario_finished_event)  # This completes the stream
+
+         # Wait for all events to be processed
+         await event_bus.drain()
+         ```
+     """
+
+     def __init__(
+         self, event_reporter: Optional[EventReporter] = None, max_retries: int = 3
+     ):
+         """
+         Initialize the event bus with optional event reporter and retry configuration.
+
+         Args:
+             event_reporter: Optional EventReporter for HTTP posting of events.
+                 If not provided, a default EventReporter will be created.
+             max_retries: Maximum number of retry attempts for failed event processing.
+                 Defaults to 3 attempts with exponential backoff.
+         """
+         self._events = Subject()
+         # Use default EventReporter if none provided
+         self._event_reporter: EventReporter = event_reporter or EventReporter()
+         self._processing_complete = asyncio.Event()
+         self._processing_task: Optional[asyncio.Task[Any]] = None
+         self._max_retries = max_retries
+
+     def publish(self, event: ScenarioEvent) -> None:
+         """
+         Publishes an event into the processing pipeline.
+
+         This method adds an event to the RxPY stream for processing. The event
+         timestamp is set to the current time in milliseconds when the event is
+         published. Publishing a ScenarioRunFinishedEvent automatically
+         completes the event stream.
+
+         Args:
+             event: The scenario event to publish. Must be a valid ScenarioEvent type.
+
+         Note:
+             Events are processed asynchronously in the background. Use `drain()`
+             to wait for all events to be processed after publishing.
+         """
+         # Convert to Unix timestamp in milliseconds
+         event.timestamp = int(datetime.now(UTC).timestamp() * 1000)
+         self._events.on_next(event)
+
+         if isinstance(event, ScenarioRunFinishedEvent):
+             self._events.on_completed()
+
+     async def listen(self) -> None:
+         """
+         Begins listening for and processing events.
+
+         This method sets up the RxPY event processing pipeline with concurrent
+         processing and automatic retry logic. It should be called before publishing
+         any events to ensure proper event handling.
+
+         The processing pipeline:
+         1. Receives events from the publish() method
+         2. Processes each event concurrently using asyncio tasks
+         3. Automatically retries failed events with exponential backoff
+         4. Completes when a ScenarioRunFinishedEvent is published
+
+         Note:
+             This method is idempotent - calling it multiple times has no effect
+             if the processing pipeline is already active.
+         """
+         if self._processing_task is not None:
+             return
+
+         async def process_single_event(event: ScenarioEvent, attempt: int = 1) -> bool:
+             """
+             Process a single event with retry logic.
+
+             Args:
+                 event: The event to process
+                 attempt: Current attempt number (1-based)
+
+             Returns:
+                 True if processing succeeded, False if all retries failed
+             """
+             try:
+                 if self._event_reporter:
+                     await self._event_reporter.post_event(event)
+                 return True
+             except Exception as e:
+                 if attempt >= self._max_retries:
+                     print(f"Failed to process event after {attempt} attempts: {e}")
+                     return False
+                 print(
+                     f"Error processing event (attempt {attempt}/{self._max_retries}): {e}"
+                 )
+                 await asyncio.sleep(0.1 * (2 ** (attempt - 1)))
+                 return await process_single_event(event, attempt + 1)
+
+         def process_event(event: ScenarioEvent) -> asyncio.Task[bool]:
+             """Create an asyncio task to process an event concurrently."""
+             loop = asyncio.get_event_loop()
+             return loop.create_task(process_single_event(event))
+
+         # Set up the event processing pipeline with concurrent processing
+         self._events.pipe(ops.flat_map(lambda event: process_event(event))).subscribe(
+             on_next=lambda success: None,
+             on_completed=lambda: self._processing_complete.set(),
+             on_error=lambda e: print(f"Unexpected error in event stream: {e}"),
+         )
+
+     async def drain(self) -> None:
+         """
+         Waits for all events to be processed after the stream is completed.
+
+         This method blocks until all events in the processing pipeline have been
+         handled. It should be called after publishing all events to ensure
+         proper cleanup and that no events are lost.
+
+         Note:
+             This method will wait indefinitely if the event stream has not been
+             completed (i.e., if no ScenarioRunFinishedEvent has been published).
+         """
+         await self._processing_complete.wait()
+
+     def is_completed(self) -> bool:
+         """
+         Returns whether the event bus has completed processing all events.
+
+         This method provides a non-blocking way to check if all events have
+         been processed. It's useful for monitoring the state of the event bus
+         without blocking execution.
+
+         Returns:
+             True if all events have been processed, False otherwise
+         """
+         return self._processing_complete.is_set()
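The retry logic in `listen()` backs off exponentially: `process_single_event` sleeps `0.1 * 2 ** (attempt - 1)` seconds between attempts and gives up after `max_retries` tries. A minimal standalone sketch of the resulting schedule (not part of the package):

```python
# Backoff used by process_single_event above: attempt 1 fails -> sleep 0.1s,
# attempt 2 fails -> sleep 0.2s, ...; there is no sleep after the final attempt.
def backoff_schedule(max_retries: int = 3, base: float = 0.1) -> list[float]:
    """Seconds slept between attempts for a given max_retries."""
    return [base * (2 ** (attempt - 1)) for attempt in range(1, max_retries)]


print(backoff_schedule())   # [0.1, 0.2]
print(backoff_schedule(5))  # [0.1, 0.2, 0.4, 0.8]
```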
@@ -0,0 +1,83 @@
+ import logging
+ import os
+ import httpx
+ from typing import Optional
+ from .events import ScenarioEvent
+
+
+ class EventReporter:
+     """
+     Handles HTTP posting of scenario events to external endpoints.
+
+     Single responsibility: Send events via HTTP to configured endpoints
+     with proper authentication and error handling.
+
+     Args:
+         endpoint (str, optional): The base URL to post events to. Defaults to LANGWATCH_ENDPOINT env var.
+         api_key (str, optional): The API key for authentication. Defaults to LANGWATCH_API_KEY env var.
+
+     Example:
+         # `event` must be a ScenarioEvent model (e.g. ScenarioRunStartedEvent),
+         # not a plain dict; its to_dict() payload looks like:
+         # {
+         #     "type": "SCENARIO_RUN_STARTED",
+         #     "batch_run_id": "batch-1",
+         #     "scenario_id": "scenario-1",
+         #     "scenario_run_id": "run-1",
+         #     "metadata": {"name": "test", "description": "test scenario"}
+         # }
+         # (see the events module for the model definitions)
+
+         reporter = EventReporter(endpoint="https://api.langwatch.ai", api_key="test-api-key")
+         await reporter.post_event(event)
+     """
+
+     def __init__(self, endpoint: Optional[str] = None, api_key: Optional[str] = None):
+         self.endpoint = endpoint or os.getenv("LANGWATCH_ENDPOINT")
+         self.api_key = api_key or os.getenv("LANGWATCH_API_KEY", "")
+         self.logger = logging.getLogger("EventReporter")
+
+     async def post_event(self, event: ScenarioEvent):
+         """
+         Posts an event to the configured endpoint.
+
+         Args:
+             event: The ScenarioEvent to post
+
+         Returns:
+             None - logs success/failure internally
+         """
+         event_type = event.type_
+         self.logger.info(f"[{event_type}] Publishing event ({event.scenario_run_id})")
+
+         if not self.endpoint:
+             self.logger.warning(
+                 "No LANGWATCH_ENDPOINT configured, skipping event posting"
+             )
+             return
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.post(
+                     f"{self.endpoint}/api/scenario-events",
+                     json=event.to_dict(),
+                     headers={
+                         "Content-Type": "application/json",
+                         "X-Auth-Token": self.api_key,
+                     },
+                 )
+                 self.logger.info(f"[{event_type}] POST response status: {response.status_code} ({event.scenario_run_id})")
+
+                 if response.is_success:
+                     data = response.json()
+                     self.logger.info(f"[{event_type}] POST response: {data} ({event.scenario_run_id})")
+                 else:
+                     error_text = response.text
+                     self.logger.error(
+                         f"[{event_type}] Event POST failed: status={response.status_code}, "
+                         f"reason={response.reason_phrase}, error={error_text}, "
+                         f"event={event}"
+                     )
+         except Exception as error:
+             self.logger.error(
+                 f"[{event_type}] Event POST error: {error}, event={event}, endpoint={self.endpoint}")
@@ -0,0 +1,169 @@
+ """
+ Exports scenario event models from the generated LangWatch API client,
+ renaming the auto-generated types to clean, meaningful names.
+
+ This ensures all event types are always in sync with the OpenAPI spec and
+ the backend, and provides a single import location for event models.
+
+ If you need to add custom logic or helpers, you can extend or wrap these models here.
+ """
+
+ from typing import Union, Any, Optional
+ from scenario.generated.langwatch_api_client.lang_watch_api_client.models import (
+     PostApiScenarioEventsBodyType0,
+     PostApiScenarioEventsBodyType0Metadata as ScenarioRunStartedEventMetadata,
+     PostApiScenarioEventsBodyType1,
+     PostApiScenarioEventsBodyType1ResultsType0 as ScenarioRunFinishedEventResults,
+     PostApiScenarioEventsBodyType1ResultsType0Verdict as ScenarioRunFinishedEventVerdict,
+     PostApiScenarioEventsBodyType1Status as ScenarioRunFinishedEventStatus,
+     PostApiScenarioEventsBodyType2,
+     # Message types for the snapshot event
+     PostApiScenarioEventsBodyType2MessagesItemType0,
+     PostApiScenarioEventsBodyType2MessagesItemType1,
+     PostApiScenarioEventsBodyType2MessagesItemType2,
+     PostApiScenarioEventsBodyType2MessagesItemType3,
+     PostApiScenarioEventsBodyType2MessagesItemType4,
+ )
+
+ # Type alias for message types
+ MessageType = Union[
+     PostApiScenarioEventsBodyType2MessagesItemType0,
+     PostApiScenarioEventsBodyType2MessagesItemType1,
+     PostApiScenarioEventsBodyType2MessagesItemType2,
+     PostApiScenarioEventsBodyType2MessagesItemType3,
+     PostApiScenarioEventsBodyType2MessagesItemType4,
+ ]
+
+ class ScenarioRunStartedEvent(PostApiScenarioEventsBodyType0):
+     """
+     Event published when a scenario run begins execution.
+
+     Automatically sets type_ to "SCENARIO_RUN_STARTED" and includes metadata
+     about the scenario (name, description, etc.).
+
+     Args:
+         batch_run_id (str): Unique identifier for the batch of scenario runs
+         scenario_id (str): Unique identifier for the scenario definition
+         scenario_run_id (str): Unique identifier for this specific run
+         metadata (ScenarioRunStartedEventMetadata): Scenario details like name and description
+         timestamp (int): Unix timestamp in milliseconds; the event bus overwrites it at publish time
+         raw_event (Optional[Any], optional): Raw event data
+         scenario_set_id (Optional[str], optional): Set identifier, defaults to "default"
+     """
+     def __init__(
+         self,
+         batch_run_id: str,
+         scenario_id: str,
+         scenario_run_id: str,
+         metadata: ScenarioRunStartedEventMetadata,
+         timestamp: int,
+         raw_event: Optional[Any] = None,
+         scenario_set_id: Optional[str] = "default"
+     ):
+         super().__init__(
+             type_="SCENARIO_RUN_STARTED",
+             batch_run_id=batch_run_id,
+             scenario_id=scenario_id,
+             scenario_run_id=scenario_run_id,
+             metadata=metadata,
+             timestamp=timestamp,
+             raw_event=raw_event,
+             scenario_set_id=scenario_set_id or "default"
+         )
+
+ class ScenarioRunFinishedEvent(PostApiScenarioEventsBodyType1):
+     """
+     Event published when a scenario run completes execution.
+
+     Automatically sets type_ to "SCENARIO_RUN_FINISHED" and includes results
+     with verdict (PASS/FAIL/SUCCESS) and reasoning.
+
+     Args:
+         batch_run_id (str): Unique identifier for the batch of scenario runs
+         scenario_id (str): Unique identifier for the scenario definition
+         scenario_run_id (str): Unique identifier for this specific run
+         status (ScenarioRunFinishedEventStatus): Overall execution status
+         timestamp (int): Unix timestamp in milliseconds; the event bus overwrites it at publish time
+         raw_event (Optional[Any], optional): Raw event data
+         scenario_set_id (Optional[str], optional): Set identifier, defaults to "default"
+         results (Optional[ScenarioRunFinishedEventResults], optional): Verdict and reasoning for the outcome
+     """
+     def __init__(
+         self,
+         batch_run_id: str,
+         scenario_id: str,
+         scenario_run_id: str,
+         status: ScenarioRunFinishedEventStatus,
+         timestamp: int,
+         results: Optional[ScenarioRunFinishedEventResults] = None,
+         raw_event: Optional[Any] = None,
+         scenario_set_id: Optional[str] = "default",
+     ):
+         super().__init__(
+             type_="SCENARIO_RUN_FINISHED",
+             batch_run_id=batch_run_id,
+             scenario_id=scenario_id,
+             scenario_run_id=scenario_run_id,
+             status=status,
+             timestamp=timestamp,
+             raw_event=raw_event,
+             scenario_set_id=scenario_set_id or "default",
+             results=results
+         )
+
+ class ScenarioMessageSnapshotEvent(PostApiScenarioEventsBodyType2):
+     """
+     Event published to capture intermediate state during scenario execution.
+
+     Automatically sets type_ to "SCENARIO_MESSAGE_SNAPSHOT" and allows tracking
+     of messages, context, or other runtime data during scenario processing.
+
+     Args:
+         batch_run_id (str): Unique identifier for the batch of scenario runs
+         scenario_id (str): Unique identifier for the scenario definition
+         scenario_run_id (str): Unique identifier for this specific run
+         messages (list[MessageType]): List of message objects in the conversation
+         timestamp (int): Unix timestamp in milliseconds; the event bus overwrites it at publish time
+         raw_event (Optional[Any], optional): Raw event data
+         scenario_set_id (Optional[str], optional): Set identifier, defaults to "default"
+     """
+     def __init__(
+         self,
+         batch_run_id: str,
+         scenario_id: str,
+         scenario_run_id: str,
+         messages: list[MessageType],
+         timestamp: int,
+         raw_event: Optional[Any] = None,
+         scenario_set_id: Optional[str] = "default"
+     ):
+         super().__init__(
+             type_="SCENARIO_MESSAGE_SNAPSHOT",
+             batch_run_id=batch_run_id,
+             scenario_id=scenario_id,
+             scenario_run_id=scenario_run_id,
+             messages=messages,
+             timestamp=timestamp,
+             raw_event=raw_event,
+             scenario_set_id=scenario_set_id or "default"
+         )
+
+ # Union type for all supported event types
+ ScenarioEvent = Union[
+     ScenarioRunStartedEvent,
+     ScenarioRunFinishedEvent,
+     ScenarioMessageSnapshotEvent
+ ]
+
+
+ __all__ = [
+     "ScenarioEvent",
+     "ScenarioRunStartedEvent",
+     "ScenarioRunStartedEventMetadata",
+     "ScenarioRunFinishedEvent",
+     "ScenarioRunFinishedEventResults",
+     "ScenarioRunFinishedEventVerdict",
+     "ScenarioRunFinishedEventStatus",
+     "ScenarioMessageSnapshotEvent",
+     "MessageType",
+ ]
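Since `ScenarioEvent` is a plain `Union`, consumers can dispatch on the concrete classes instead of comparing `type_` strings. A small hypothetical consumer, assuming the names exported above are in scope:

```python
# Hypothetical consumer of the ScenarioEvent union defined above.
def describe(event: ScenarioEvent) -> str:
    if isinstance(event, ScenarioRunStartedEvent):
        return f"run {event.scenario_run_id} started"
    if isinstance(event, ScenarioMessageSnapshotEvent):
        return f"run {event.scenario_run_id}: {len(event.messages)} message(s) so far"
    if isinstance(event, ScenarioRunFinishedEvent):
        return f"run {event.scenario_run_id} finished with status {event.status}"
    raise TypeError(f"unsupported event: {type(event).__name__}")
```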
@@ -0,0 +1,84 @@
+ from typing import Union, Optional, List
+ from ag_ui.core import (
+     UserMessage as AgUiUserMessage,
+     AssistantMessage as AgUiAssistantMessage,
+     SystemMessage as AgUiSystemMessage,
+     ToolMessage as AgUiToolMessage,
+     ToolCall as AgUiToolCall,
+     FunctionCall as AgUiFunctionCall,
+ )
+
+ class UserMessage(AgUiUserMessage):
+     """
+     An AG-UI user message extended with the to_dict method.
+     Enforces role='user' and requires content.
+     """
+     def __init__(self, id: str, content: str, name: Optional[str] = None):
+         super().__init__(id=id, role="user", content=content, name=name)
+
+     def to_dict(self):
+         """Convert the UserMessage to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ class AssistantMessage(AgUiAssistantMessage):
+     """
+     An AG-UI assistant message extended with the to_dict method.
+     Enforces role='assistant' and allows optional content and tool_calls.
+     """
+     def __init__(self, id: str, content: Optional[str] = None, tool_calls: Optional[List['ToolCall']] = None, name: Optional[str] = None):
+         super().__init__(id=id, role="assistant", content=content, tool_calls=tool_calls, name=name)
+
+     def to_dict(self):
+         """Convert the AssistantMessage to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ class SystemMessage(AgUiSystemMessage):
+     """
+     An AG-UI system message extended with the to_dict method.
+     Enforces role='system' and requires content.
+     """
+     def __init__(self, id: str, content: str, name: Optional[str] = None):
+         super().__init__(id=id, role="system", content=content, name=name)
+
+     def to_dict(self):
+         """Convert the SystemMessage to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ class ToolMessage(AgUiToolMessage):
+     """
+     An AG-UI tool message extended with the to_dict method.
+     Enforces role='tool' and requires content and tool_call_id.
+     """
+     def __init__(self, id: str, content: str, tool_call_id: str):
+         super().__init__(id=id, role="tool", content=content, tool_call_id=tool_call_id)
+
+     def to_dict(self):
+         """Convert the ToolMessage to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ class ToolCall(AgUiToolCall):
+     """
+     An AG-UI tool call extended with the to_dict method.
+     Enforces type='function' and requires id and function.
+     """
+     def __init__(self, id: str, function: 'FunctionCall'):
+         super().__init__(id=id, type="function", function=function)
+
+     def to_dict(self):
+         """Convert the ToolCall to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ class FunctionCall(AgUiFunctionCall):
+     """
+     An AG-UI function call extended with the to_dict method.
+     Requires name and arguments.
+     """
+     def __init__(self, name: str, arguments: str):
+         super().__init__(name=name, arguments=arguments)
+
+     def to_dict(self):
+         """Convert the FunctionCall to a dictionary representation."""
+         return self.model_dump(exclude_none=True)
+
+ # Union type alias covering the message and tool-call wrappers defined above
+ Message = Union[UserMessage, AssistantMessage, SystemMessage, ToolMessage, ToolCall, FunctionCall]
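A short usage sketch for the wrappers above, assuming they are in scope; the IDs, names, and arguments are placeholder values. `to_dict()` drops `None` fields because it calls `model_dump(exclude_none=True)`.

```python
import json

# Placeholder values; the classes are the wrappers defined above.
call = ToolCall(
    id="call-1",
    function=FunctionCall(name="get_weather", arguments=json.dumps({"city": "Berlin"})),
)
user = UserMessage(id="msg-1", content="What's the weather in Berlin?")
reply = AssistantMessage(id="msg-2", content=None, tool_calls=[call])

print(user.to_dict())   # e.g. {"id": "msg-1", "role": "user", "content": "What's the weather in Berlin?"}
print(reply.to_dict())  # content is omitted (None); tool_calls serialize as nested dicts
```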
@@ -0,0 +1,86 @@
+ from openai.types.chat.chat_completion_message_param import ChatCompletionMessageParam
+ from .messages import UserMessage, AssistantMessage, SystemMessage, ToolMessage, ToolCall, FunctionCall
+ from typing import List, Union
+
+ import uuid
+
+ # Define the correct Message type for the return value
+ Message = Union[UserMessage, AssistantMessage, SystemMessage, ToolMessage]
+
+ def convert_messages_to_ag_ui_messages(messages: list[ChatCompletionMessageParam]) -> list[Message]:
+     """
+     Converts OpenAI ChatCompletionMessageParam messages to ag_ui Message format.
+
+     This function transforms messages from OpenAI's format to the ag_ui protocol
+     format for consistent message handling across the scenario framework.
+
+     Args:
+         messages: List of OpenAI ChatCompletionMessageParam messages
+
+     Returns:
+         List of ag_ui Message objects
+
+     Raises:
+         ValueError: If message role is not supported or message format is invalid
+     """
+
+     converted_messages: list[Message] = []
+
+     for i, message in enumerate(messages):
+         # Generate unique ID for each message
+         message_id = message.get("id") or str(uuid.uuid4())
+
+         role = message.get("role")
+         content = message.get("content")
+
+         if role == "user":
+             if not content:
+                 raise ValueError(f"User message at index {i} missing required content")
+             converted_messages.append(UserMessage(
+                 id=message_id,
+                 content=str(content)
+             ))
+         elif role == "assistant":
+             # Handle tool calls if present
+             tool_calls = message.get("tool_calls")
+             ag_ui_tool_calls: List[ToolCall] | None = None
+
+             if tool_calls:
+                 ag_ui_tool_calls = []
+                 for tool_call in tool_calls:
+                     ag_ui_tool_calls.append(ToolCall(
+                         id=tool_call.get("id", str(uuid.uuid4())),
+                         function=FunctionCall(
+                             name=tool_call["function"]["name"],
+                             arguments=tool_call["function"]["arguments"]
+                         )
+                     ))
+
+             converted_messages.append(AssistantMessage(
+                 id=message_id,
+                 content=str(content) if content else None,
+                 tool_calls=ag_ui_tool_calls
+             ))
+         elif role == "system":
+             if not content:
+                 raise ValueError(f"System message at index {i} missing required content")
+             converted_messages.append(SystemMessage(
+                 id=message_id,
+                 content=str(content)
+             ))
+         elif role == "tool":
+             tool_call_id = message.get("tool_call_id")
+             if not tool_call_id:
+                 raise ValueError(f"Tool message at index {i} missing required tool_call_id")
+             if not content:
+                 raise ValueError(f"Tool message at index {i} missing required content")
+
+             converted_messages.append(ToolMessage(
+                 id=message_id,
+                 content=str(content),
+                 tool_call_id=tool_call_id
+             ))
+         else:
+             raise ValueError(f"Unsupported message role '{role}' at index {i}")
+
+     return converted_messages
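A usage sketch for the converter above, assuming it is in scope and fed with plain OpenAI-style dicts (which is what `ChatCompletionMessageParam` values are at runtime); the conversation content is made up.

```python
openai_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "Ping?"},
    {
        "role": "assistant",
        "content": None,
        "tool_calls": [
            {"id": "call-1", "type": "function", "function": {"name": "ping", "arguments": "{}"}}
        ],
    },
    {"role": "tool", "content": "pong", "tool_call_id": "call-1"},
]

ag_ui_messages = convert_messages_to_ag_ui_messages(openai_messages)
for message in ag_ui_messages:
    print(type(message).__name__, message.to_dict())
```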
scenario/judge_agent.py CHANGED
@@ -19,7 +19,7 @@ from scenario.cache import scenario_cache
  from scenario.agent_adapter import AgentAdapter
  from scenario.config import ModelConfig, ScenarioConfig
 
- from .error_messages import agent_not_configured_error_message
+ from ._error_messages import agent_not_configured_error_message
  from .types import AgentInput, AgentReturnTypes, AgentRole, ScenarioResult
 
 
@@ -48,7 +48,7 @@ class JudgeAgent(AgentAdapter):
  system_prompt: Custom system prompt to override default judge behavior
 
  Example:
- ```python
+ ```
  import scenario
 
  # Basic judge agent with criteria
@@ -133,14 +133,12 @@ class JudgeAgent(AgentAdapter):
  Exception: If no model is configured either in parameters or global config
 
  Example:
- ```python
+ ```
  # Customer service judge
  cs_judge = JudgeAgent(
  criteria=[
- "Agent is polite and professional",
- "Agent addresses the customer's specific concern",
- "Agent offers appropriate solutions or next steps",
- "Agent does not make promises the company cannot keep"
+ "Agent replies with the refund policy",
+ "Agent offers next steps for the customer",
  ],
  temperature=0.1
  )
@@ -148,9 +146,8 @@
  # Technical accuracy judge
  tech_judge = JudgeAgent(
  criteria=[
- "Code examples compile without errors",
- "Security vulnerabilities are not introduced",
- "Best practices are recommended"
+ "Agent adds a code review pointing out the code compilation errors",
+ "Agent adds a code review about the missing security headers"
  ],
  system_prompt="You are a senior software engineer reviewing code for production use."
  )
@@ -210,24 +207,6 @@
  Exception: If the judge cannot make a valid decision or if there's an
  error in the evaluation process
 
- Example:
- The judge evaluates conversations like this:
-
- ```
- Conversation so far:
- User: "I need help with authentication"
- Agent: "I can help! What authentication method are you using?"
- User: "JWT tokens"
- Agent: "Here's how to implement JWT securely: [detailed code example]"
-
- Judge evaluation:
- - ✓ Agent provides helpful responses
- - ✓ Agent asks relevant follow-up questions
- - ✓ Security best practices are mentioned
-
- Decision: CONTINUE (all criteria being met so far)
- ```
-
  Note:
  - Returns empty list [] to continue the scenario
  - Returns ScenarioResult to end with success/failure
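Put together, the updated docstring example amounts to the following sketch. The import path comes from the changed file shown above (scenario/judge_agent.py) and the criteria are the illustrative strings from this diff; per the docstring, constructing a JudgeAgent without a model argument only works if a default model is configured globally.

```python
from scenario.judge_agent import JudgeAgent

# Mirrors the updated docstring example in this diff; assumes a default model
# is configured globally (otherwise JudgeAgent raises, per the docstring above).
cs_judge = JudgeAgent(
    criteria=[
        "Agent replies with the refund policy",
        "Agent offers next steps for the customer",
    ],
    temperature=0.1,
)
```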