sandboxy-0.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
sandboxy/core/runner.py
ADDED
@@ -0,0 +1,599 @@
"""Runner - executes MDL modules with agents and tools."""

import json
import logging
import re
from typing import Any

from pydantic import BaseModel, Field

from sandboxy.agents.base import Agent, AgentAction
from sandboxy.core.state import EvaluationResult, Message, ModuleSpec, Step, ToolCall
from sandboxy.tools.base import Tool, ToolResult
from sandboxy.tools.loader import ToolLoader

logger = logging.getLogger(__name__)


class RunEvent(BaseModel):
    """Event recorded during module execution."""

    type: str  # "user", "agent", "tool_call", "tool_result", "branch", "eval"
    payload: dict[str, Any] = Field(default_factory=dict)

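# Illustration only (not in the original source): a typical tool-call event,
# matching the payload shape built in Runner._handle_tool_call below:
#   RunEvent(type="tool_call",
#            payload={"tool": "email", "action": "send",
#                     "args": {"to": "user@example.com"}, "step_id": "step-3"})
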
class RunResult(BaseModel):
    """Result of running a module with an agent."""

    module_id: str
    agent_id: str
    events: list[RunEvent] = Field(default_factory=list)
    evaluation: EvaluationResult = Field(default_factory=EvaluationResult)

    def to_json(self, indent: int | None = None) -> str:
        """Serialize result to JSON string."""
        return self.model_dump_json(indent=indent)

    def pretty(self) -> str:
        """Format result for human-readable display."""
        lines = [
            f"Module: {self.module_id}",
            f"Agent: {self.agent_id}",
            "",
        ]

        for event in self.events:
            if event.type == "user":
                lines.append(f"USER: {event.payload.get('content', '')}")
            elif event.type == "agent":
                lines.append(f"AGENT: {event.payload.get('content', '')}")
            elif event.type == "tool_call":
                tool = event.payload.get("tool", "")
                action = event.payload.get("action", "")
                args = event.payload.get("args", {})
                lines.append(f"TOOL CALL: {tool}.{action}({args})")
            elif event.type == "tool_result":
                result = event.payload.get("result", {})
                success = result.get("success", False)
                data = result.get("data", "")
                status = "OK" if success else "FAIL"
                lines.append(f"TOOL RESULT [{status}]: {data}")
            elif event.type == "branch":
                branch = event.payload.get("branch", "")
                lines.append(f"[BRANCH] → {branch}")

        lines.append("")
        lines.append("EVALUATION:")
        lines.append(f"  Score: {self.evaluation.score}")
        lines.append(f"  Status: {self.evaluation.status}")
        lines.append(f"  Events: {self.evaluation.num_events}")
        if self.evaluation.checks:
            lines.append(f"  Checks: {json.dumps(self.evaluation.checks, indent=2)}")

        return "\n".join(lines)


class Runner:
    """Executes MDL modules with agents and tools."""

    def __init__(self, module: ModuleSpec, agent: Agent) -> None:
        """Initialize runner with module and agent.

        Args:
            module: MDL module specification to execute.
            agent: Agent to run within the module.
        """
        self.module = module
        self.agent = agent
        self.events: list[RunEvent] = []
        self.history: list[Message] = []
        self.env_state: dict[str, Any] = module.environment.initial_state.copy()
        self.tools: dict[str, Tool] = ToolLoader.from_env_config(module.environment)

    def run(self) -> RunResult:
        """Execute the module and return results.

        Returns:
            Result containing events and evaluation.
        """
        step_index = 0
        steps = self.module.steps

        while step_index < len(steps):
            step = steps[step_index]
            next_index = step_index + 1

            if step.action == "inject_user":
                self._handle_inject_user(step)

            elif step.action == "await_agent":
                should_stop = self._handle_await_agent(step)
                if should_stop:
                    break

            elif step.action == "branch":
                new_steps, new_index = self._handle_branch(step)
                if new_steps is not None:
                    steps = new_steps
                    step_index = new_index
                    continue

            step_index = next_index

        evaluation = self._evaluate()
        return RunResult(
            module_id=self.module.id,
            agent_id=self.agent.config.id,
            events=self.events,
            evaluation=evaluation,
        )

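    # Illustration only (not in the original source): run() dispatches on
    # step.action, so module.steps is implied to look roughly like
    #   Step(action="inject_user", params={"content": "Hi, I need a refund"})
    #   Step(action="await_agent", params={})
    #   Step(action="branch", params={"branch_name": "refund_path"})
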
    def _handle_inject_user(self, step: Step) -> None:
        """Handle inject_user action - add user message to history."""
        content = step.params.get("content", "")
        msg = Message(role="user", content=content)
        self.history.append(msg)
        self.events.append(
            RunEvent(
                type="user",
                payload={"content": content, "step_id": step.id},
            )
        )

    def _handle_await_agent(self, step: Step, max_tool_calls: int = 10) -> bool:
        """Handle await_agent action - get agent response.

        The agent may make multiple tool calls before responding with a message.
        We loop until the agent returns a message or stop action.

        Args:
            step: Current step being executed.
            max_tool_calls: Maximum tool calls allowed before forcing stop.

        Returns:
            True if agent wants to stop, False otherwise.
        """
        tool_call_count = 0

        while tool_call_count < max_tool_calls:
            # Build tool schemas for agent
            tool_schemas = self._get_tool_schemas()

            # Get agent action
            action: AgentAction = self.agent.step(self.history, tool_schemas)

            if action.type == "message":
                msg = Message(role="assistant", content=action.content or "")
                self.history.append(msg)
                self.events.append(
                    RunEvent(
                        type="agent",
                        payload={"content": msg.content, "step_id": step.id},
                    )
                )
                return False  # Done with this await_agent step

            if action.type == "tool_call":
                self._handle_tool_call(action, step)
                tool_call_count += 1
                # Continue loop to let agent respond to tool result

            elif action.type == "stop":
                return True

        # Max tool calls reached
        return False

    def _handle_tool_call(self, action: AgentAction, step: Step) -> None:
        """Handle a tool call from the agent."""
        tool_name = action.tool_name or ""
        tool_action = action.tool_action or ""
        tool_args = action.tool_args or {}

        # Generate unique tool call ID
        tool_call_id = f"call_{tool_name}_{tool_action}_{len(self.events)}"
        # Function name uses double underscore separator (matching _build_tools)
        function_name = f"{tool_name}__{tool_action}"

        self.events.append(
            RunEvent(
                type="tool_call",
                payload={
                    "tool": tool_name,
                    "action": tool_action,
                    "args": tool_args,
                    "step_id": step.id,
                },
            )
        )

        # Add assistant message with tool_calls BEFORE the tool result
        # This is required by OpenAI API
        self.history.append(
            Message(
                role="assistant",
                content="",
                tool_calls=[
                    ToolCall(
                        id=tool_call_id,
                        name=function_name,
                        arguments=json.dumps(tool_args),
                    )
                ],
            )
        )

        # Execute tool if available
        if tool_name in self.tools:
            tool = self.tools[tool_name]
            result: ToolResult = tool.invoke(tool_action, tool_args, self.env_state)

            self.events.append(
                RunEvent(
                    type="tool_result",
                    payload={
                        "tool": tool_name,
                        "action": tool_action,
                        "result": result.model_dump(),
                    },
                )
            )

            # Add tool result to history with matching tool_call_id
            self.history.append(
                Message(
                    role="tool",
                    content=json.dumps(result.data) if result.success else result.error or "",
                    tool_name=tool_name,
                    tool_call_id=tool_call_id,
                )
            )
        else:
            # Tool not found - still add tool result message
            error_msg = f"Tool not found: {tool_name}"
            self.events.append(
                RunEvent(
                    type="tool_result",
                    payload={
                        "tool": tool_name,
                        "action": tool_action,
                        "result": {"success": False, "error": error_msg},
                    },
                )
            )
            self.history.append(
                Message(
                    role="tool",
                    content=error_msg,
                    tool_name=tool_name,
                    tool_call_id=tool_call_id,
                )
            )

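    # Illustration only (not in the original source): after one successful call,
    # the tail of self.history forms the OpenAI-style pair built above, e.g.
    #   Message(role="assistant", content="", tool_calls=[ToolCall(id="call_email_send_4",
    #           name="email__send", arguments='{"to": "user@example.com"}')])
    #   Message(role="tool", content='{"sent": true}', tool_name="email",
    #           tool_call_id="call_email_send_4")
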
    def _handle_branch(self, step: Step) -> tuple[list[Step] | None, int]:
        """Handle branch action.

        Returns:
            Tuple of (new_steps, new_index) if branching, (None, 0) otherwise.
        """
        branch_name = step.params.get("branch_name")

        self.events.append(
            RunEvent(
                type="branch",
                payload={"branch": branch_name, "step_id": step.id},
            )
        )

        if branch_name and branch_name in self.module.branches:
            return self.module.branches[branch_name], 0

        return None, 0

    def _get_tool_schemas(self) -> list[dict[str, Any]]:
        """Get tool schemas for agent tool calling."""
        schemas = []
        for name, tool in self.tools.items():
            schemas.append(
                {
                    "name": name,
                    "description": tool.description,
                    "actions": tool.get_actions(),
                }
            )
        return schemas

    def _evaluate(self) -> EvaluationResult:
        """Run evaluation checks and compute score.

        Returns:
            Evaluation result with checks and score.
        """
        checks: dict[str, Any] = {}

        for check in self.module.evaluation:
            result = self._run_check(check)
            checks[check.name] = result

        # Compute score using scoring config
        score = self._compute_score(checks)

        return EvaluationResult(
            checks=checks,
            score=score,
            num_events=len(self.events),
            status="ok",
        )

    def _run_check(self, check: Any) -> dict[str, Any]:
        """Run a single evaluation check."""
        kind = check.kind

        try:
            if kind == "contains":
                return self._check_contains(check)
            if kind == "regex":
                return self._check_regex(check)
            if kind == "count":
                return self._check_count(check)
            if kind == "tool_called":
                return self._check_tool_called(check)
            if kind == "env_state":
                return self._check_env_state(check)
            if kind == "deterministic":
                return self._eval_deterministic(check)
            if kind == "llm":
                return {"status": "skipped", "reason": "LLM eval not implemented"}
            return {"status": "error", "error": f"Unknown check kind: {kind}"}
        except Exception as e:
            return {"status": "error", "error": str(e)}

    def _get_target_text(self, target: str) -> str:
        """Get text content for a target."""
        if target == "agent_messages":
            return " ".join(msg.content for msg in self.history if msg.role == "assistant")
        if target == "user_messages":
            return " ".join(msg.content for msg in self.history if msg.role == "user")
        if target == "all_messages":
            return " ".join(msg.content for msg in self.history)
        if target == "last_agent_message":
            for msg in reversed(self.history):
                if msg.role == "assistant":
                    return msg.content
            return ""
        return ""

    def _get_target_list(self, target: str) -> list[Any]:
        """Get list of items for a target."""
        if target == "agent_messages":
            return [msg for msg in self.history if msg.role == "assistant"]
        if target == "user_messages":
            return [msg for msg in self.history if msg.role == "user"]
        if target == "all_messages":
            return list(self.history)
        if target == "tool_calls":
            return [event for event in self.events if event.type == "tool_call"]
        return []

    def _check_contains(self, check: Any) -> dict[str, Any]:
        """Check if target contains a value."""
        target = check.target or "agent_messages"
        value = check.value or ""
        expected = check.expected
        case_sensitive = check.case_sensitive

        text = self._get_target_text(target)

        if not case_sensitive:
            text = text.lower()
            value = value.lower()

        found = value in text
        passed = found == expected

        return {
            "passed": passed,
            "found": found,
            "expected": expected,
        }

    def _check_regex(self, check: Any) -> dict[str, Any]:
        """Check if target matches a regex pattern."""
        target = check.target or "agent_messages"
        pattern = check.pattern or ""
        expected = check.expected

        text = self._get_target_text(target)
        flags = 0 if check.case_sensitive else re.IGNORECASE
        match = bool(re.search(pattern, text, flags))
        passed = match == expected

        return {
            "passed": passed,
            "matched": match,
            "expected": expected,
        }

    def _check_count(self, check: Any) -> dict[str, Any]:
        """Check count of items."""
        target = check.target or "agent_messages"
        min_count = check.min
        max_count = check.max

        items = self._get_target_list(target)
        count = len(items)

        passed = True
        if min_count is not None and count < min_count:
            passed = False
        if max_count is not None and count > max_count:
            passed = False

        return {
            "passed": passed,
            "count": count,
            "min": min_count,
            "max": max_count,
        }

    def _check_tool_called(self, check: Any) -> dict[str, Any]:
        """Check if a specific tool was called."""
        tool_name = check.tool
        action_name = check.action
        expected = check.expected

        tool_calls = [e for e in self.events if e.type == "tool_call"]

        called = False
        for tc in tool_calls:
            payload = tc.payload
            if payload.get("tool") == tool_name:
                if action_name is None or payload.get("action") == action_name:
                    called = True
                    break

        passed = called == expected

        return {
            "passed": passed,
            "called": called,
            "expected": expected,
        }

    def _check_env_state(self, check: Any) -> dict[str, Any]:
        """Check environment state value."""
        key = check.key or ""
        expected_value = check.value

        actual_value = self.env_state.get(key)
        passed = actual_value == expected_value

        return {
            "passed": passed,
            "actual": actual_value,
            "expected": expected_value,
        }

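    # Illustration only (not in the original source): the attributes read by the
    # checks above imply evaluation entries along these lines:
    #   contains:    target="last_agent_message", value="refund", expected=True, case_sensitive=False
    #   regex:       target="agent_messages", pattern=r"ORD-\d+", expected=True
    #   count:       target="tool_calls", min=1, max=3
    #   tool_called: tool="email", action="send", expected=True
    #   env_state:   key="refund_issued", value=True
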
    def _compute_score(self, checks: dict[str, Any]) -> float:
        """Compute final score based on scoring config."""
        scoring = self.module.scoring

        # Extract numeric values from checks
        check_values: dict[str, float] = {}
        for name, result in checks.items():
            # bool must be tested before int/float: bool is a subclass of int,
            # so the reverse order would make this branch unreachable.
            if isinstance(result, bool):
                check_values[name] = 1.0 if result else 0.0
            elif isinstance(result, int | float):
                check_values[name] = float(result)
            elif isinstance(result, dict):
                if result.get("passed") is True:
                    check_values[name] = 1.0
                elif result.get("passed") is False:
                    check_values[name] = 0.0
                elif "value" in result and isinstance(result["value"], int | float):
                    check_values[name] = float(result["value"])

        # Use formula if specified
        if scoring.formula:
            try:
                score = self._eval_score_formula(scoring.formula, check_values)
            except Exception:
                score = self._weighted_average(check_values, scoring.weights)
        else:
            score = self._weighted_average(check_values, scoring.weights)

        # Normalize if requested
        if scoring.normalize and scoring.max_score != scoring.min_score:
            score = (score - scoring.min_score) / (scoring.max_score - scoring.min_score)
            score = max(0.0, min(1.0, score))

        return score

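    # Worked example (not in the original source): checks such as
    # {"greeting": {"passed": True}, "tool_used": {"passed": False}} reduce to
    # check_values {"greeting": 1.0, "tool_used": 0.0} before scoring.
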
    def _eval_score_formula(self, formula: str, check_values: dict[str, float]) -> float:
        """Evaluate a score formula."""
        safe_builtins = {
            "True": True,
            "False": False,
            "None": None,
            "len": len,
            "min": min,
            "max": max,
            "abs": abs,
            "sum": sum,
            "round": round,
        }
        context = {"__builtins__": safe_builtins, "env_state": self.env_state}
        context.update(check_values)
        return float(eval(formula, context, {}))

    def _weighted_average(self, values: dict[str, float], weights: dict[str, float]) -> float:
        """Compute weighted average of check values."""
        if not values:
            return 0.0
        total = sum(values[n] * weights.get(n, 1.0) for n in values)
        total_weight = sum(weights.get(n, 1.0) for n in values)
        return total / total_weight if total_weight > 0 else 0.0

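    # Worked example (not in the original source): with check_values
    # {"greeting": 1.0, "tool_used": 0.0} and weights {"greeting": 2.0},
    # _weighted_average returns (1.0*2.0 + 0.0*1.0) / (2.0 + 1.0) = 2/3 ≈ 0.667,
    # while a formula like "0.5 * greeting + 0.5 * tool_used" would yield 0.5.
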
    def _eval_deterministic(self, check: Any) -> Any:
        """Evaluate a deterministic check.

        Args:
            check: Evaluation check with expr in config.

        Returns:
            Result of evaluation (bool, number, or error dict).
        """
        expr = check.config.get("expr", "")
        if not expr or expr == "TODO":
            return {"status": "skipped", "reason": "No expression defined"}

        # Build evaluation context
        context = {
            "env_state": self.env_state,
            "history": [msg.model_dump() for msg in self.history],
            "events": [event.model_dump() for event in self.events],
        }

        try:
            # Safe evaluation using restricted builtins
            result = self._safe_eval(expr, context)
            return result
        except Exception as e:
            return {"status": "error", "error": str(e)}

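    # Illustration only (not in the original source): a deterministic check might
    # carry config={"expr": "env_state.get('refund_issued') is True"}, evaluated
    # against the env_state / history / events context built above.
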
    def _safe_eval(self, expr: str, context: dict[str, Any]) -> Any:
        """Safely evaluate an expression with restricted scope.

        Args:
            expr: Expression to evaluate.
            context: Variables available in expression.

        Returns:
            Result of evaluation.
        """
        # Restrict available builtins
        safe_builtins = {
            "True": True,
            "False": False,
            "None": None,
            "len": len,
            "str": str,
            "int": int,
            "float": float,
            "bool": bool,
            "list": list,
            "dict": dict,
            "sum": sum,
            "min": min,
            "max": max,
            "abs": abs,
            "round": round,
            "any": any,
            "all": all,
        }

        # Create restricted globals
        safe_globals = {"__builtins__": safe_builtins}
        safe_globals.update(context)

        return eval(expr, safe_globals, {})
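
For orientation, a minimal driver sketch (not part of the package) using only the API defined in this file. Building the ModuleSpec and Agent is handled elsewhere in the package (e.g. core/mdl_parser.py and agents/loader.py) and is assumed here:

    from sandboxy.core.runner import Runner

    # `module` (a ModuleSpec) and `agent` (an Agent) are assumed to be
    # constructed by the package's loaders; that code is outside this file.
    runner = Runner(module, agent)
    result = runner.run()            # executes steps, tool calls, and checks
    print(result.pretty())           # human-readable transcript plus evaluation
    print(result.to_json(indent=2))  # JSON record of the same run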