sandboxy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,824 @@
|
|
|
1
|
+
"""Async Runner - executes MDL modules with support for interactive sessions.
|
|
2
|
+
|
|
3
|
+
This runner is designed for use with WebSocket connections where execution
|
|
4
|
+
can be paused to await user input.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
import asyncio
|
|
8
|
+
import json
|
|
9
|
+
import logging
|
|
10
|
+
import re
|
|
11
|
+
from collections.abc import AsyncGenerator
|
|
12
|
+
from typing import Any
|
|
13
|
+
|
|
14
|
+
from pydantic import BaseModel, Field
|
|
15
|
+
|
|
16
|
+
logger = logging.getLogger(__name__)
|
|
17
|
+
|
|
18
|
+
from sandboxy.agents.base import Agent, AgentAction
|
|
19
|
+
from sandboxy.core.state import (
|
|
20
|
+
EvaluationResult,
|
|
21
|
+
Message,
|
|
22
|
+
ModuleSpec,
|
|
23
|
+
SessionState,
|
|
24
|
+
Step,
|
|
25
|
+
StepAction,
|
|
26
|
+
ToolCall,
|
|
27
|
+
)
|
|
28
|
+
from sandboxy.tools.base import Tool, ToolResult
|
|
29
|
+
from sandboxy.tools.loader import ToolLoader
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class RunEvent(BaseModel):
    """A single event produced while a module is being executed.

    Instances are what ``AsyncRunner.run`` yields to its caller.
    """

    # Event discriminator. Values emitted in this file: "user", "agent",
    # "agent_stop", "tool_call", "tool_result", "branch", "awaiting_input",
    # "completed" and "error".
    type: str
    # Event-specific data; each instance gets its own fresh dict by default.
    payload: dict[str, Any] = Field(default_factory=dict)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
class AsyncRunner:
|
|
40
|
+
"""Executes MDL modules asynchronously with support for interactive sessions.
|
|
41
|
+
|
|
42
|
+
This runner uses an async generator pattern to yield events and receive
|
|
43
|
+
user input for `await_user` steps.
|
|
44
|
+
|
|
45
|
+
Usage:
|
|
46
|
+
runner = AsyncRunner(module, agent)
|
|
47
|
+
async for event in runner.run():
|
|
48
|
+
if event.type == "awaiting_input":
|
|
49
|
+
# Get user input somehow
|
|
50
|
+
user_input = await get_user_input()
|
|
51
|
+
runner.provide_input(user_input)
|
|
52
|
+
else:
|
|
53
|
+
# Process other events
|
|
54
|
+
handle_event(event)
|
|
55
|
+
"""
|
|
56
|
+
|
|
57
|
+
def __init__(self, module: ModuleSpec, agent: Agent) -> None:
    """Initialize async runner.

    Args:
        module: MDL module specification to execute.
        agent: Agent to run within the module.
    """
    # Function-scope import: this is the only place in the class that needs it.
    import copy

    self.module = module
    self.agent = agent
    self.events: list[RunEvent] = []
    self.history: list[Message] = []
    # Deep copy, not ``dict.copy()``: tools receive ``env_state`` and may mutate
    # nested containers in it. A shallow copy would share those nested objects
    # with ``module.environment.initial_state`` and leak one run's changes into
    # the module spec (and into every later runner built from it).
    self.env_state: dict[str, Any] = copy.deepcopy(module.environment.initial_state)
    self.tools: dict[str, Tool] = ToolLoader.from_env_config(module.environment)

    # Session state
    self.state = SessionState.IDLE
    # Set while an await_user step is pending; resolved by provide_input().
    self._user_input_future: asyncio.Future[str] | None = None
    # Index of the next step to execute in the active step list.
    self._step_index = 0
|
|
75
|
+
|
|
76
|
+
@property
def session_state(self) -> SessionState:
    """Read-only view of the runner's current state (the value of ``self.state``)."""
    return self.state
|
|
80
|
+
|
|
81
|
+
def provide_input(self, content: str) -> None:
    """Deliver the user's reply to a pending ``await_user`` step.

    Args:
        content: User's input text.

    Raises:
        RuntimeError: If no input future is pending (none exists, or the
            existing one is already done).
    """
    pending = self._user_input_future
    if pending is not None and not pending.done():
        pending.set_result(content)
        return
    raise RuntimeError("Not currently awaiting user input")
|
|
93
|
+
|
|
94
|
+
def inject_event(
|
|
95
|
+
self, tool_name: str, event_type: str, args: dict[str, Any] | None = None
|
|
96
|
+
) -> dict[str, Any]:
|
|
97
|
+
"""Inject a game event by calling a tool's trigger_event action.
|
|
98
|
+
|
|
99
|
+
This is used for chaos injection - frontend can trigger events like
|
|
100
|
+
"heatwave" or "rush_hour" that modify the game state.
|
|
101
|
+
|
|
102
|
+
Args:
|
|
103
|
+
tool_name: Name of the tool to call (e.g., "stand" for lemonade stand).
|
|
104
|
+
event_type: Type of event to trigger (e.g., "heatwave", "rush_hour").
|
|
105
|
+
args: Optional additional arguments for the event.
|
|
106
|
+
|
|
107
|
+
Returns:
|
|
108
|
+
The tool result data.
|
|
109
|
+
|
|
110
|
+
Raises:
|
|
111
|
+
ValueError: If tool not found or event trigger fails.
|
|
112
|
+
"""
|
|
113
|
+
if tool_name not in self.tools:
|
|
114
|
+
raise ValueError(f"Tool not found: {tool_name}")
|
|
115
|
+
|
|
116
|
+
tool = self.tools[tool_name]
|
|
117
|
+
event_args = {"event": event_type}
|
|
118
|
+
if args:
|
|
119
|
+
event_args.update(args)
|
|
120
|
+
|
|
121
|
+
result = tool.invoke("trigger_event", event_args, self.env_state)
|
|
122
|
+
|
|
123
|
+
if not result.success:
|
|
124
|
+
raise ValueError(f"Event trigger failed: {result.error}")
|
|
125
|
+
|
|
126
|
+
return result.data or {}
|
|
127
|
+
|
|
128
|
+
async def run(self) -> AsyncGenerator[RunEvent, None]:
    """Execute the module, yielding events as they occur.

    Steps run in order starting at ``self._step_index``. A branch step whose
    target exists replaces the active step list and restarts at index 0.
    Events coming from step handlers are appended to ``self.events`` before
    being yielded; the final "completed" / "error" events are only yielded.

    Any ``Exception`` raised while running is logged with its traceback and
    reported as a single "error" event instead of propagating to the caller.

    Yields:
        RunEvent objects for each significant event during execution.
        When type is "awaiting_input", caller should get user input
        and call provide_input() before continuing iteration.
    """
    self.state = SessionState.RUNNING
    steps = self.module.steps

    try:
        while self._step_index < len(steps):
            step = steps[self._step_index]

            if step.action == StepAction.INJECT_USER.value:
                event = self._handle_inject_user(step)
                self.events.append(event)
                yield event

            elif step.action == StepAction.AWAIT_USER.value:
                # Yields "awaiting_input", then the resulting "user" event.
                async for event in self._handle_await_user(step):
                    self.events.append(event)
                    yield event

            elif step.action == StepAction.AWAIT_AGENT.value:
                self.state = SessionState.AWAITING_AGENT
                async for event in self._handle_await_agent(step):
                    self.events.append(event)
                    yield event
                self.state = SessionState.RUNNING

            elif step.action == StepAction.BRANCH.value:
                event, new_steps = self._handle_branch(step)
                if event:
                    self.events.append(event)
                    yield event
                if new_steps is not None:
                    # Jump into the branch: new step list, start from the top.
                    steps = new_steps
                    self._step_index = 0
                    continue

            elif step.action == StepAction.TOOL_CALL.value:
                async for event in self._handle_direct_tool_call(step):
                    self.events.append(event)
                    yield event

            else:
                # Previously skipped without a trace; keep skipping, but say so.
                logger.warning(
                    "Skipping step %r with unsupported action %r", step.id, step.action
                )

            self._step_index += 1

        # Evaluation
        evaluation = self._evaluate()
        self.state = SessionState.COMPLETED

        yield RunEvent(
            type="completed",
            payload={
                "evaluation": evaluation.model_dump(),
                "num_events": len(self.events),
            },
        )

    except Exception as e:
        # The event only carries str(e); keep the traceback in the log.
        logger.exception("Module execution failed")
        self.state = SessionState.ERROR
        yield RunEvent(
            type="error",
            payload={"message": str(e)},
        )
|
|
196
|
+
|
|
197
|
+
def _handle_inject_user(self, step: Step) -> RunEvent:
    """Record a scripted user message in history and describe it as a "user" event."""
    text = step.params.get("content", "")
    self.history.append(Message(role="user", content=text))

    details = {"content": text, "step_id": step.id, "scripted": True}
    return RunEvent(type="user", payload=details)
|
|
207
|
+
|
|
208
|
+
async def _handle_await_user(self, step: Step) -> AsyncGenerator[RunEvent, None]:
    """Handle await_user action - wait for real user input.

    Yields an "awaiting_input" event, waits until ``provide_input()`` resolves
    the pending future (or the optional ``timeout`` step param elapses, in
    which case the ``default`` step param is used), then appends the reply to
    history and yields a non-scripted "user" event.
    """
    prompt = step.params.get("prompt", "")
    timeout = step.params.get("timeout")

    # Create the future BEFORE announcing that we are waiting. This generator
    # is suspended at the yield below while the consumer handles the event, so
    # a consumer that calls provide_input() right away (as in the class usage
    # example) needs the future to exist already.
    self._user_input_future = asyncio.get_running_loop().create_future()
    self.state = SessionState.AWAITING_USER

    try:
        # Yield event to signal we're waiting for input
        yield RunEvent(
            type="awaiting_input",
            payload={"prompt": prompt, "step_id": step.id, "timeout": timeout},
        )

        try:
            if timeout:
                content = await asyncio.wait_for(self._user_input_future, timeout=timeout)
            else:
                content = await self._user_input_future
        except asyncio.TimeoutError:
            # asyncio.TimeoutError is the builtin TimeoutError on 3.11+, and a
            # distinct class (the one wait_for raises) on 3.10.
            content = step.params.get("default", "[timeout - no input]")
    finally:
        # Never leave a stale future behind, even on cancellation or close.
        self._user_input_future = None

    self.state = SessionState.RUNNING

    # Add user message to history
    msg = Message(role="user", content=content)
    self.history.append(msg)

    yield RunEvent(
        type="user",
        payload={"content": content, "step_id": step.id, "scripted": False},
    )
|
|
243
|
+
|
|
244
|
+
async def _handle_await_agent(
    self, step: Step, max_tool_calls: int = 10
) -> AsyncGenerator[RunEvent, None]:
    """Handle await_agent action - get agent response.

    Calls the agent repeatedly until it returns a message or stops. Tool calls
    in between are executed and fed back through history, up to
    ``max_tool_calls`` of them.

    Args:
        step: The await_agent step being executed.
        max_tool_calls: Maximum number of tool calls handled for this step.

    Raises:
        ValueError: If the agent returns an action type other than "message",
            "tool_call" or "stop".
    """
    tool_call_count = 0
    # Per-step flag. It used to live on the instance via hasattr/delattr and
    # was never cleared when the retry ended in a "message", which silently
    # disabled the retry for every later await_agent step.
    retried_after_tool = False

    while tool_call_count < max_tool_calls:
        # Build tool schemas for agent
        tool_schemas = self._get_tool_schemas()

        # Get agent action (this could be made async if agent supports it)
        action: AgentAction = self.agent.step(self.history, tool_schemas)

        if action.type == "message":
            msg = Message(role="assistant", content=action.content or "")
            self.history.append(msg)

            yield RunEvent(
                type="agent",
                payload={"content": msg.content, "step_id": step.id},
            )
            return  # Done with this await_agent step

        if action.type == "tool_call":
            async for event in self._handle_tool_call(action, step):
                yield event
            tool_call_count += 1
            continue  # Let the agent respond to the tool result

        if action.type == "stop":
            # Some models return empty content after tool calls - add a hint
            # and retry once.
            if tool_call_count > 0 and not retried_after_tool:
                retried_after_tool = True
                self.history.append(
                    Message(
                        role="user",
                        content="[System: Please respond to the customer based on the information you just retrieved.]",
                    )
                )
                continue  # Retry the loop

            yield RunEvent(
                type="agent_stop",
                payload={"step_id": step.id},
            )
            return

        # An unrecognized action type used to fall through and re-query the
        # agent forever, since nothing advanced the loop condition.
        raise ValueError(f"Unknown agent action type: {action.type!r}")

    # Tool-call budget exhausted without a final message or stop.
    logger.warning(
        "await_agent step %r ended after %d tool calls without a final agent message",
        step.id,
        tool_call_count,
    )
|
|
299
|
+
|
|
300
|
+
async def _handle_tool_call(
    self, action: AgentAction, step: Step
) -> AsyncGenerator[RunEvent, None]:
    """Execute one agent-requested tool call and mirror it into history.

    Yields a "tool_call" event, appends the assistant message carrying the
    call, runs the tool (or reports it missing), yields a "tool_result"
    event, and finally appends the tool's reply to history.
    """
    tool_name = action.tool_name or ""
    tool_action = action.tool_action or ""
    tool_args = action.tool_args or {}

    # Prefer the id supplied by the model; otherwise synthesize one.
    call_id = action.tool_call_id or f"call_{tool_name}_{tool_action}_{len(self.events)}"

    yield RunEvent(
        type="tool_call",
        payload={
            "tool": tool_name,
            "action": tool_action,
            "args": tool_args,
            "step_id": step.id,
        },
    )

    # Assistant turn that carries the tool call itself.
    requested_call = ToolCall(
        id=call_id,
        name=f"{tool_name}__{tool_action}",
        arguments=json.dumps(tool_args),
    )
    self.history.append(Message(role="assistant", content="", tool_calls=[requested_call]))

    tool_known = tool_name in self.tools
    if tool_known:
        outcome: ToolResult = self.tools[tool_name].invoke(tool_action, tool_args, self.env_state)
        result_view = outcome.model_dump()
    else:
        missing_msg = f"Tool not found: {tool_name}"
        result_view = {"success": False, "error": missing_msg}

    yield RunEvent(
        type="tool_result",
        payload={
            "tool": tool_name,
            "action": tool_action,
            "result": result_view,
        },
    )

    # The reply text is built only after the event was delivered.
    if not tool_known:
        reply_text = missing_msg
    elif outcome.success:
        reply_text = json.dumps(outcome.data)
    else:
        reply_text = outcome.error or ""

    self.history.append(
        Message(
            role="tool",
            content=reply_text,
            tool_name=tool_name,
            tool_call_id=call_id,
        )
    )
|
|
378
|
+
|
|
379
|
+
async def _handle_direct_tool_call(self, step: Step) -> AsyncGenerator[RunEvent, None]:
    """Run a tool call scripted directly in the step (no agent involved).

    Yields a "tool_call" event flagged ``direct`` and then a "tool_result"
    event. History is not touched.
    """
    params = step.params
    tool_name = params.get("tool", "")
    tool_action = params.get("action", "")
    tool_args = params.get("args", {})

    yield RunEvent(
        type="tool_call",
        payload={
            "tool": tool_name,
            "action": tool_action,
            "args": tool_args,
            "step_id": step.id,
            "direct": True,
        },
    )

    if tool_name in self.tools:
        outcome: ToolResult = self.tools[tool_name].invoke(tool_action, tool_args, self.env_state)
        result_view = outcome.model_dump()
    else:
        result_view = {"success": False, "error": f"Tool not found: {tool_name}"}

    yield RunEvent(
        type="tool_result",
        payload={
            "tool": tool_name,
            "action": tool_action,
            "result": result_view,
        },
    )
|
|
417
|
+
|
|
418
|
+
def _handle_branch(self, step: Step) -> tuple[RunEvent | None, list[Step] | None]:
    """Resolve a branch step.

    Returns:
        A "branch" event plus the steps of the named branch, or ``None`` in
        place of the steps when the name is empty or not in
        ``self.module.branches``.
    """
    name = step.params.get("branch_name")
    event = RunEvent(type="branch", payload={"branch": name, "step_id": step.id})

    known = bool(name) and name in self.module.branches
    return event, (self.module.branches[name] if known else None)
|
|
431
|
+
|
|
432
|
+
def _get_tool_schemas(self) -> list[dict[str, Any]]:
|
|
433
|
+
"""Get tool schemas for agent tool calling."""
|
|
434
|
+
schemas = []
|
|
435
|
+
for name, tool in self.tools.items():
|
|
436
|
+
schemas.append(
|
|
437
|
+
{
|
|
438
|
+
"name": name,
|
|
439
|
+
"description": tool.description,
|
|
440
|
+
"actions": tool.get_actions(),
|
|
441
|
+
}
|
|
442
|
+
)
|
|
443
|
+
return schemas
|
|
444
|
+
|
|
445
|
+
def _evaluate(self) -> EvaluationResult:
    """Run every configured check, score the results, and package them."""
    # One entry per check name; a later check with the same name wins.
    outcomes: dict[str, Any] = {
        check.name: self._run_check(check) for check in self.module.evaluation
    }

    return EvaluationResult(
        checks=outcomes,
        score=self._compute_score(outcomes),
        num_events=len(self.events),
        status="ok",
    )
|
|
463
|
+
|
|
464
|
+
def _compute_score(self, checks: dict[str, Any]) -> float:
|
|
465
|
+
"""Compute final score based on scoring config.
|
|
466
|
+
|
|
467
|
+
Supports three modes:
|
|
468
|
+
1. Formula: Use a Python expression with check names as variables
|
|
469
|
+
2. Weighted average: Average checks with optional weights
|
|
470
|
+
3. Default: Simple average of all numeric/boolean results
|
|
471
|
+
"""
|
|
472
|
+
scoring = self.module.scoring
|
|
473
|
+
|
|
474
|
+
# Extract numeric values from checks for use in formulas
|
|
475
|
+
check_values: dict[str, float] = {}
|
|
476
|
+
for name, result in checks.items():
|
|
477
|
+
if isinstance(result, int | float):
|
|
478
|
+
check_values[name] = float(result)
|
|
479
|
+
elif isinstance(result, bool):
|
|
480
|
+
check_values[name] = 1.0 if result else 0.0
|
|
481
|
+
elif isinstance(result, dict):
|
|
482
|
+
if result.get("passed") is True:
|
|
483
|
+
check_values[name] = 1.0
|
|
484
|
+
elif result.get("passed") is False:
|
|
485
|
+
check_values[name] = 0.0
|
|
486
|
+
elif "value" in result and isinstance(result["value"], int | float):
|
|
487
|
+
check_values[name] = float(result["value"])
|
|
488
|
+
|
|
489
|
+
# Mode 1: Custom formula
|
|
490
|
+
if scoring.formula:
|
|
491
|
+
try:
|
|
492
|
+
score = self._eval_score_formula(scoring.formula, check_values)
|
|
493
|
+
except Exception:
|
|
494
|
+
# Fall back to weighted average on formula error
|
|
495
|
+
score = self._weighted_average(check_values, scoring.weights)
|
|
496
|
+
else:
|
|
497
|
+
# Mode 2/3: Weighted average (with optional weights)
|
|
498
|
+
score = self._weighted_average(check_values, scoring.weights)
|
|
499
|
+
|
|
500
|
+
# Normalize if requested
|
|
501
|
+
if scoring.normalize and scoring.max_score != scoring.min_score:
|
|
502
|
+
score = (score - scoring.min_score) / (scoring.max_score - scoring.min_score)
|
|
503
|
+
score = max(0.0, min(1.0, score)) # Clamp to 0-1
|
|
504
|
+
|
|
505
|
+
return score
|
|
506
|
+
|
|
507
|
+
def _eval_score_formula(self, formula: str, check_values: dict[str, float]) -> float:
|
|
508
|
+
"""Evaluate a score formula with check values as variables."""
|
|
509
|
+
safe_builtins = {
|
|
510
|
+
"True": True,
|
|
511
|
+
"False": False,
|
|
512
|
+
"None": None,
|
|
513
|
+
"len": len,
|
|
514
|
+
"min": min,
|
|
515
|
+
"max": max,
|
|
516
|
+
"abs": abs,
|
|
517
|
+
"sum": sum,
|
|
518
|
+
"round": round,
|
|
519
|
+
}
|
|
520
|
+
|
|
521
|
+
# Add env_state to context for formulas that reference it
|
|
522
|
+
context = {"__builtins__": safe_builtins, "env_state": self.env_state}
|
|
523
|
+
context.update(check_values)
|
|
524
|
+
|
|
525
|
+
result = eval(formula, context, {})
|
|
526
|
+
return float(result)
|
|
527
|
+
|
|
528
|
+
def _weighted_average(self, values: dict[str, float], weights: dict[str, float]) -> float:
|
|
529
|
+
"""Compute weighted average of check values."""
|
|
530
|
+
if not values:
|
|
531
|
+
return 0.0
|
|
532
|
+
|
|
533
|
+
total = 0.0
|
|
534
|
+
total_weight = 0.0
|
|
535
|
+
|
|
536
|
+
for name, value in values.items():
|
|
537
|
+
weight = weights.get(name, 1.0)
|
|
538
|
+
total += value * weight
|
|
539
|
+
total_weight += weight
|
|
540
|
+
|
|
541
|
+
return total / total_weight if total_weight > 0 else 0.0
|
|
542
|
+
|
|
543
|
+
def _run_check(self, check: Any) -> dict[str, Any]:
|
|
544
|
+
"""Run a single evaluation check."""
|
|
545
|
+
kind = check.kind
|
|
546
|
+
|
|
547
|
+
try:
|
|
548
|
+
if kind == "contains":
|
|
549
|
+
return self._check_contains(check)
|
|
550
|
+
if kind == "regex":
|
|
551
|
+
return self._check_regex(check)
|
|
552
|
+
if kind == "count":
|
|
553
|
+
return self._check_count(check)
|
|
554
|
+
if kind == "tool_called":
|
|
555
|
+
return self._check_tool_called(check)
|
|
556
|
+
if kind == "equals":
|
|
557
|
+
return self._check_equals(check)
|
|
558
|
+
if kind == "env_state":
|
|
559
|
+
return self._check_env_state(check)
|
|
560
|
+
if kind == "deterministic":
|
|
561
|
+
# Legacy support for raw Python expressions
|
|
562
|
+
return self._check_deterministic(check)
|
|
563
|
+
if kind == "llm":
|
|
564
|
+
return {"status": "skipped", "reason": "LLM eval not implemented"}
|
|
565
|
+
return {"status": "error", "error": f"Unknown check kind: {kind}"}
|
|
566
|
+
except Exception as e:
|
|
567
|
+
return {"status": "error", "error": str(e)}
|
|
568
|
+
|
|
569
|
+
def _get_target_text(self, target: str) -> str:
|
|
570
|
+
"""Get text content for a target."""
|
|
571
|
+
if target == "agent_messages":
|
|
572
|
+
return " ".join(msg.content for msg in self.history if msg.role == "assistant")
|
|
573
|
+
if target == "user_messages":
|
|
574
|
+
return " ".join(msg.content for msg in self.history if msg.role == "user")
|
|
575
|
+
if target == "all_messages":
|
|
576
|
+
return " ".join(msg.content for msg in self.history)
|
|
577
|
+
if target == "last_agent_message":
|
|
578
|
+
for msg in reversed(self.history):
|
|
579
|
+
if msg.role == "assistant":
|
|
580
|
+
return msg.content
|
|
581
|
+
return ""
|
|
582
|
+
if target == "last_user_message":
|
|
583
|
+
for msg in reversed(self.history):
|
|
584
|
+
if msg.role == "user":
|
|
585
|
+
return msg.content
|
|
586
|
+
return ""
|
|
587
|
+
return ""
|
|
588
|
+
|
|
589
|
+
def _get_target_list(self, target: str) -> list[Any]:
|
|
590
|
+
"""Get list of items for a target."""
|
|
591
|
+
if target == "agent_messages":
|
|
592
|
+
return [msg for msg in self.history if msg.role == "assistant"]
|
|
593
|
+
if target == "user_messages":
|
|
594
|
+
return [msg for msg in self.history if msg.role == "user"]
|
|
595
|
+
if target == "all_messages":
|
|
596
|
+
return list(self.history)
|
|
597
|
+
if target == "tool_calls":
|
|
598
|
+
return [event for event in self.events if event.type == "tool_call"]
|
|
599
|
+
return []
|
|
600
|
+
|
|
601
|
+
def _check_contains(self, check: Any) -> dict[str, Any]:
|
|
602
|
+
"""Check if target contains a value."""
|
|
603
|
+
target = check.target or "agent_messages"
|
|
604
|
+
value = check.value or ""
|
|
605
|
+
expected = check.expected
|
|
606
|
+
case_sensitive = check.case_sensitive
|
|
607
|
+
|
|
608
|
+
text = self._get_target_text(target)
|
|
609
|
+
|
|
610
|
+
if not case_sensitive:
|
|
611
|
+
text = text.lower()
|
|
612
|
+
value = value.lower()
|
|
613
|
+
|
|
614
|
+
found = value in text
|
|
615
|
+
passed = found == expected
|
|
616
|
+
|
|
617
|
+
return {
|
|
618
|
+
"passed": passed,
|
|
619
|
+
"found": found,
|
|
620
|
+
"expected": expected,
|
|
621
|
+
"searched_for": check.value,
|
|
622
|
+
"in": target,
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
def _check_regex(self, check: Any) -> dict[str, Any]:
|
|
626
|
+
"""Check if target matches a regex pattern."""
|
|
627
|
+
target = check.target or "agent_messages"
|
|
628
|
+
pattern = check.pattern or ""
|
|
629
|
+
expected = check.expected
|
|
630
|
+
|
|
631
|
+
text = self._get_target_text(target)
|
|
632
|
+
match = bool(re.search(pattern, text, re.IGNORECASE if not check.case_sensitive else 0))
|
|
633
|
+
passed = match == expected
|
|
634
|
+
|
|
635
|
+
return {
|
|
636
|
+
"passed": passed,
|
|
637
|
+
"matched": match,
|
|
638
|
+
"expected": expected,
|
|
639
|
+
"pattern": pattern,
|
|
640
|
+
"in": target,
|
|
641
|
+
}
|
|
642
|
+
|
|
643
|
+
def _check_count(self, check: Any) -> dict[str, Any]:
|
|
644
|
+
"""Check count of items."""
|
|
645
|
+
target = check.target or "agent_messages"
|
|
646
|
+
min_count = check.min
|
|
647
|
+
max_count = check.max
|
|
648
|
+
|
|
649
|
+
items = self._get_target_list(target)
|
|
650
|
+
count = len(items)
|
|
651
|
+
|
|
652
|
+
passed = True
|
|
653
|
+
if min_count is not None and count < min_count:
|
|
654
|
+
passed = False
|
|
655
|
+
if max_count is not None and count > max_count:
|
|
656
|
+
passed = False
|
|
657
|
+
|
|
658
|
+
return {
|
|
659
|
+
"passed": passed,
|
|
660
|
+
"count": count,
|
|
661
|
+
"min": min_count,
|
|
662
|
+
"max": max_count,
|
|
663
|
+
"target": target,
|
|
664
|
+
}
|
|
665
|
+
|
|
666
|
+
def _check_tool_called(self, check: Any) -> dict[str, Any]:
|
|
667
|
+
"""Check if a specific tool was called."""
|
|
668
|
+
tool_name = check.tool
|
|
669
|
+
action_name = check.action
|
|
670
|
+
expected = check.expected
|
|
671
|
+
|
|
672
|
+
tool_calls = [e for e in self.events if e.type == "tool_call"]
|
|
673
|
+
|
|
674
|
+
called = False
|
|
675
|
+
for tc in tool_calls:
|
|
676
|
+
payload = tc.payload
|
|
677
|
+
if payload.get("tool") == tool_name:
|
|
678
|
+
if action_name is None or payload.get("action") == action_name:
|
|
679
|
+
called = True
|
|
680
|
+
break
|
|
681
|
+
|
|
682
|
+
passed = called == expected
|
|
683
|
+
|
|
684
|
+
return {
|
|
685
|
+
"passed": passed,
|
|
686
|
+
"called": called,
|
|
687
|
+
"expected": expected,
|
|
688
|
+
"tool": tool_name,
|
|
689
|
+
"action": action_name,
|
|
690
|
+
}
|
|
691
|
+
|
|
692
|
+
def _check_equals(self, check: Any) -> dict[str, Any]:
|
|
693
|
+
"""Check if a value equals expected."""
|
|
694
|
+
target = check.target or ""
|
|
695
|
+
expected_value = check.value
|
|
696
|
+
|
|
697
|
+
# Handle env.* targets
|
|
698
|
+
if target.startswith("env."):
|
|
699
|
+
key = target[4:]
|
|
700
|
+
actual_value = self.env_state.get(key)
|
|
701
|
+
else:
|
|
702
|
+
actual_value = self._get_target_text(target)
|
|
703
|
+
|
|
704
|
+
passed = actual_value == expected_value
|
|
705
|
+
|
|
706
|
+
return {
|
|
707
|
+
"passed": passed,
|
|
708
|
+
"actual": actual_value,
|
|
709
|
+
"expected": expected_value,
|
|
710
|
+
"target": target,
|
|
711
|
+
}
|
|
712
|
+
|
|
713
|
+
def _get_nested_value(self, obj: Any, path: str) -> Any:
|
|
714
|
+
"""Get a nested value using dot notation (e.g., 'orders.ORD123.refunded')."""
|
|
715
|
+
keys = path.split(".")
|
|
716
|
+
current = obj
|
|
717
|
+
for key in keys:
|
|
718
|
+
if current is None:
|
|
719
|
+
return None
|
|
720
|
+
if isinstance(current, dict):
|
|
721
|
+
current = current.get(key)
|
|
722
|
+
elif hasattr(current, key):
|
|
723
|
+
current = getattr(current, key)
|
|
724
|
+
else:
|
|
725
|
+
return None
|
|
726
|
+
return current
|
|
727
|
+
|
|
728
|
+
def _check_env_state(self, check: Any) -> dict[str, Any]:
|
|
729
|
+
"""Check environment state value. Supports dot notation for nested access."""
|
|
730
|
+
key = check.key or ""
|
|
731
|
+
expected_value = check.value
|
|
732
|
+
|
|
733
|
+
# Support dot notation for nested values (e.g., "orders.ORD123.refunded")
|
|
734
|
+
if "." in key:
|
|
735
|
+
actual_value = self._get_nested_value(self.env_state, key)
|
|
736
|
+
else:
|
|
737
|
+
actual_value = self.env_state.get(key)
|
|
738
|
+
|
|
739
|
+
passed = actual_value == expected_value
|
|
740
|
+
|
|
741
|
+
return {
|
|
742
|
+
"passed": passed,
|
|
743
|
+
"actual": actual_value,
|
|
744
|
+
"expected": expected_value,
|
|
745
|
+
"key": key,
|
|
746
|
+
}
|
|
747
|
+
|
|
748
|
+
def _check_deterministic(self, check: Any) -> dict[str, Any]:
|
|
749
|
+
"""Evaluate a deterministic check with Python expression and optional pass_if condition."""
|
|
750
|
+
expr = check.config.get("expr", "")
|
|
751
|
+
if not expr or expr == "TODO":
|
|
752
|
+
return {"status": "skipped", "reason": "No expression defined"}
|
|
753
|
+
|
|
754
|
+
context = {
|
|
755
|
+
"env_state": self.env_state,
|
|
756
|
+
"history": [msg.model_dump() for msg in self.history],
|
|
757
|
+
"events": [event.model_dump() for event in self.events],
|
|
758
|
+
}
|
|
759
|
+
|
|
760
|
+
try:
|
|
761
|
+
result = self._safe_eval(expr, context)
|
|
762
|
+
|
|
763
|
+
# Check for pass_if condition (e.g., ">=0", "<=5", ">=50")
|
|
764
|
+
pass_if = check.config.get("pass_if")
|
|
765
|
+
if pass_if and isinstance(result, int | float):
|
|
766
|
+
passed = self._evaluate_pass_condition(result, pass_if)
|
|
767
|
+
return {"passed": passed, "value": result, "condition": pass_if}
|
|
768
|
+
if isinstance(result, bool):
|
|
769
|
+
return {"passed": result}
|
|
770
|
+
# For numeric values without pass_if, just return the value (no pass/fail)
|
|
771
|
+
return {"value": result}
|
|
772
|
+
except Exception as e:
|
|
773
|
+
return {"status": "error", "error": str(e)}
|
|
774
|
+
|
|
775
|
+
def _evaluate_pass_condition(self, value: float, condition: str) -> bool:
|
|
776
|
+
"""Evaluate a pass_if condition like '>=0', '<=5', '>50'."""
|
|
777
|
+
# Parse condition: operator + value (e.g., ">=50", "<=0", ">10")
|
|
778
|
+
match = re.match(r"([<>=!]+)\s*(-?[\d.]+)", condition)
|
|
779
|
+
if not match:
|
|
780
|
+
return True # No valid condition, default to pass
|
|
781
|
+
|
|
782
|
+
op, threshold_str = match.groups()
|
|
783
|
+
threshold = float(threshold_str)
|
|
784
|
+
|
|
785
|
+
if op == ">=":
|
|
786
|
+
return value >= threshold
|
|
787
|
+
if op == "<=":
|
|
788
|
+
return value <= threshold
|
|
789
|
+
if op == ">":
|
|
790
|
+
return value > threshold
|
|
791
|
+
if op == "<":
|
|
792
|
+
return value < threshold
|
|
793
|
+
if op == "==" or op == "=":
|
|
794
|
+
return value == threshold
|
|
795
|
+
if op == "!=" or op == "<>":
|
|
796
|
+
return value != threshold
|
|
797
|
+
return True # Unknown operator, default to pass
|
|
798
|
+
|
|
799
|
+
def _safe_eval(self, expr: str, context: dict[str, Any]) -> Any:
|
|
800
|
+
"""Safely evaluate an expression with restricted scope (legacy support)."""
|
|
801
|
+
safe_builtins = {
|
|
802
|
+
"True": True,
|
|
803
|
+
"False": False,
|
|
804
|
+
"None": None,
|
|
805
|
+
"len": len,
|
|
806
|
+
"str": str,
|
|
807
|
+
"int": int,
|
|
808
|
+
"float": float,
|
|
809
|
+
"bool": bool,
|
|
810
|
+
"list": list,
|
|
811
|
+
"dict": dict,
|
|
812
|
+
"sum": sum,
|
|
813
|
+
"min": min,
|
|
814
|
+
"max": max,
|
|
815
|
+
"abs": abs,
|
|
816
|
+
"round": round,
|
|
817
|
+
"any": any,
|
|
818
|
+
"all": all,
|
|
819
|
+
}
|
|
820
|
+
|
|
821
|
+
safe_globals = {"__builtins__": safe_builtins}
|
|
822
|
+
safe_globals.update(context)
|
|
823
|
+
|
|
824
|
+
return eval(expr, safe_globals, {})
|