sandboxy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
|
@@ -0,0 +1,468 @@
|
|
|
1
|
+
"""Scenario runner - execute scenarios with YAML-defined and MCP tools."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import json
|
|
7
|
+
import logging
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
from typing import Any
|
|
10
|
+
|
|
11
|
+
from pydantic import BaseModel, Field
|
|
12
|
+
|
|
13
|
+
from sandboxy.agents.base import Agent, AgentAction
|
|
14
|
+
from sandboxy.scenarios.loader import ScenarioSpec, StepSpec
|
|
15
|
+
from sandboxy.tools.base import ToolResult
|
|
16
|
+
from sandboxy.tools.loader import YAML_TOOL_DIRS
|
|
17
|
+
from sandboxy.tools.yaml_tools import load_scenario_tools
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
class ScenarioEvent(BaseModel):
    """Event recorded during scenario execution.

    Events form an append-only trace of the run: user injections, agent
    replies, tool invocations, and their results.
    """

    # Discriminator for the event kind; payload keys depend on it.
    type: str  # "user", "agent", "tool_call", "tool_result", "system"
    # Free-form event data, e.g. {"content": ...} for user/agent events or
    # {"tool": ..., "action": ..., "args": ...} for tool_call events.
    payload: dict[str, Any] = Field(default_factory=dict)
|
|
27
|
+
|
|
28
|
+
|
|
29
|
+
class ScenarioResult(BaseModel):
    """Result of running a scenario.

    Aggregates the full event trace, the tool-call log, the final mutable
    environment state, and the goal evaluation outcome.
    """

    scenario_id: str
    agent_id: str
    # Chronological trace of everything that happened during the run.
    events: list[ScenarioEvent] = Field(default_factory=list)
    # One entry per tool invocation, including state before/after snapshots.
    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
    # Environment state at the end of the run.
    final_state: dict[str, Any] = Field(default_factory=dict)
    # Ids of goals whose detection criteria were satisfied.
    goals_achieved: list[str] = Field(default_factory=list)
    score: float = 0.0

    def to_json(self, indent: int | None = None) -> str:
        """Serialize result to JSON string.

        Args:
            indent: Optional indent width passed through to pydantic.

        Returns:
            The JSON representation of this result.
        """
        return self.model_dump_json(indent=indent)

    def pretty(self) -> str:
        """Format result for human-readable display.

        Returns:
            A multi-line transcript-style summary of the run.
        """
        lines = [
            f"Scenario: {self.scenario_id}",
            f"Agent: {self.agent_id}",
            "",
        ]

        for event in self.events:
            if event.type == "user":
                # Fix: previously "..." was appended even when the content
                # was short enough to display in full; truncate only when
                # needed, mirroring the agent branch below.
                content = event.payload.get("content", "") or ""
                if len(content) > 100:
                    content = content[:100] + "..."
                lines.append(f"USER: {content}")
            elif event.type == "agent":
                # Guard against None: agent events may carry content=None
                # (the event payload is not normalized like the history is).
                content = event.payload.get("content", "") or ""
                if len(content) > 200:
                    content = content[:200] + "..."
                lines.append(f"AGENT: {content}")
            elif event.type == "tool_call":
                tool = event.payload.get("tool", "")
                action = event.payload.get("action", "")
                lines.append(f"TOOL: {tool}.{action}()")
            elif event.type == "tool_result":
                success = event.payload.get("success", False)
                status = "OK" if success else "FAIL"
                data = str(event.payload.get("data", ""))[:50]
                lines.append(f" -> [{status}] {data}")

        lines.append("")
        lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
        lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
        lines.append(f"Score: {self.score}")

        return "\n".join(lines)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
class Message(BaseModel):
    """A message in conversation history."""

    role: str  # "system", "user", "assistant", "tool"
    content: str
    # Set on role="tool" messages: name of the tool that produced the result.
    tool_name: str | None = None
    # Correlates a role="tool" result with the assistant tool call that
    # triggered it.
    tool_call_id: str | None = None
    # Tool calls issued by an assistant message, stored as dicts with
    # "id", "name", and JSON-encoded "arguments" keys.
    tool_calls: list[dict[str, Any]] | None = None
|
|
86
|
+
|
|
87
|
+
|
|
88
|
+
class ScenarioRunner:
    """Runs scenarios with YAML-defined and MCP tools."""

    def __init__(
        self,
        scenario: ScenarioSpec,
        agent: Agent,
        tool_dirs: list[Path] | None = None,
    ) -> None:
        """Set up a runner for *scenario* driven by *agent*.

        Args:
            scenario: The scenario specification.
            agent: The agent to run.
            tool_dirs: Optional directories to search for tool libraries;
                falls back to YAML_TOOL_DIRS when empty or omitted.
        """
        self.scenario = scenario
        self.agent = agent
        self.tool_dirs = tool_dirs or YAML_TOOL_DIRS

        # Mock tools declared inline or referenced from shared libraries.
        self.tools: dict[str, Any] = load_scenario_tools(
            {"tools_from": scenario.tools_from, "tools": scenario.tools},
            self.tool_dirs,
        )
        # MCP tools are attached later, during run_async().
        self._mcp_tools: dict[str, Any] = {}
        self._mcp_manager: Any = None

        # Mutable per-run state.
        self.env_state: dict[str, Any] = dict(scenario.initial_state)
        self.history: list[Message] = []
        self.events: list[ScenarioEvent] = []
        self.tool_call_log: list[dict[str, Any]] = []
|
|
122
|
+
|
|
123
|
+
async def _load_mcp_tools(self) -> None:
|
|
124
|
+
"""Load MCP tools from configured servers."""
|
|
125
|
+
if not self.scenario.mcp_servers:
|
|
126
|
+
return
|
|
127
|
+
|
|
128
|
+
from sandboxy.mcp.client import McpManager, McpServerConfig
|
|
129
|
+
|
|
130
|
+
self._mcp_manager = McpManager()
|
|
131
|
+
|
|
132
|
+
configs = [
|
|
133
|
+
McpServerConfig(
|
|
134
|
+
name=server.name,
|
|
135
|
+
# Local (stdio) transport
|
|
136
|
+
command=server.command,
|
|
137
|
+
args=server.args,
|
|
138
|
+
env=server.env,
|
|
139
|
+
# Remote (HTTP) transport
|
|
140
|
+
url=server.url,
|
|
141
|
+
headers=server.headers,
|
|
142
|
+
transport=server.transport, # type: ignore[arg-type]
|
|
143
|
+
)
|
|
144
|
+
for server in self.scenario.mcp_servers
|
|
145
|
+
]
|
|
146
|
+
|
|
147
|
+
self._mcp_tools = await self._mcp_manager.connect_all(configs)
|
|
148
|
+
|
|
149
|
+
# Merge MCP tools with YAML tools (YAML tools take precedence)
|
|
150
|
+
for name, tool in self._mcp_tools.items():
|
|
151
|
+
if name not in self.tools:
|
|
152
|
+
self.tools[name] = tool
|
|
153
|
+
|
|
154
|
+
async def _cleanup_mcp(self) -> None:
|
|
155
|
+
"""Disconnect from MCP servers."""
|
|
156
|
+
if self._mcp_manager:
|
|
157
|
+
await self._mcp_manager.disconnect_all()
|
|
158
|
+
|
|
159
|
+
def run(self, max_turns: int = 20) -> ScenarioResult:
    """Execute the scenario synchronously.

    Blocking wrapper that drives ``run_async`` on a fresh event loop.

    Args:
        max_turns: Maximum number of conversation turns.

    Returns:
        ScenarioResult with events and evaluation.
    """
    coro = self.run_async(max_turns)
    return asyncio.run(coro)
|
|
169
|
+
|
|
170
|
+
async def run_async(self, max_turns: int = 20) -> ScenarioResult:
    """Execute the scenario asynchronously.

    Connects MCP servers, seeds the system prompt, executes each step in
    order, then evaluates goals and scoring. MCP connections are always
    released, even when a step raises.

    Args:
        max_turns: Maximum number of conversation turns.

    Returns:
        ScenarioResult with events and evaluation.
    """
    try:
        # Attach MCP tools if the scenario configures any servers.
        await self._load_mcp_tools()

        # Seed the conversation with the scenario's system prompt.
        prompt = self.scenario.system_prompt
        if prompt:
            self.history.append(Message(role="system", content=prompt))

        # Run the scripted steps in declaration order.
        for step in self.scenario.steps:
            await self._execute_step(step, max_turns)

        achieved = self._evaluate_goals()
        return ScenarioResult(
            scenario_id=self.scenario.id,
            agent_id=self.agent.config.id,
            events=self.events,
            tool_calls=self.tool_call_log,
            final_state=dict(self.env_state),
            goals_achieved=achieved,
            score=self._compute_score(achieved),
        )
    finally:
        # Always drop MCP connections, even on failure.
        await self._cleanup_mcp()
|
|
206
|
+
|
|
207
|
+
async def _execute_step(self, step: StepSpec, max_turns: int) -> None:
|
|
208
|
+
"""Execute a single scenario step."""
|
|
209
|
+
if step.action == "inject_user":
|
|
210
|
+
content = step.params.get("content", "")
|
|
211
|
+
self._add_user_message(content)
|
|
212
|
+
|
|
213
|
+
elif step.action == "await_agent":
|
|
214
|
+
await self._get_agent_response(max_tool_calls=10)
|
|
215
|
+
|
|
216
|
+
elif step.action == "await_user":
|
|
217
|
+
# Interactive mode - skip in batch execution
|
|
218
|
+
logger.debug("Skipping await_user step (batch mode)")
|
|
219
|
+
|
|
220
|
+
def _add_user_message(self, content: str) -> None:
    """Record *content* as a user turn in both history and the event log."""
    message = Message(role="user", content=content)
    self.history.append(message)
    self.events.append(ScenarioEvent(type="user", payload={"content": content}))
|
|
224
|
+
|
|
225
|
+
async def _get_agent_response(self, max_tool_calls: int = 10) -> None:
    """Get agent response, handling tool calls.

    Repeatedly asks the agent for its next action, executing tool calls
    until the agent produces a message, stops, or the tool-call budget is
    exhausted.

    Args:
        max_tool_calls: Maximum number of tool calls to execute before
            giving up on getting a final message.
    """
    from sandboxy.core.state import Message as CoreMessage
    from sandboxy.core.state import ToolCall

    tool_calls_made = 0

    while tool_calls_made < max_tool_calls:
        # Build tool schemas
        tool_schemas = self._get_tool_schemas()

        # Convert history to CoreMessage objects for agent
        history_for_agent: list[CoreMessage] = []
        for m in self.history:
            # Convert tool_calls from dicts to ToolCall objects if present
            tool_calls_obj = None
            if m.tool_calls:
                tool_calls_obj = [
                    ToolCall(
                        id=tc["id"],
                        name=tc["name"],
                        arguments=tc["arguments"],
                    )
                    for tc in m.tool_calls
                ]

            history_for_agent.append(
                CoreMessage(
                    role=m.role,  # type: ignore[arg-type]
                    content=m.content,
                    tool_name=m.tool_name,
                    tool_call_id=m.tool_call_id,
                    tool_calls=tool_calls_obj,
                )
            )

        # Get agent action
        action: AgentAction = self.agent.step(history_for_agent, tool_schemas)

        if action.type == "message":
            # Agent responded with a message; history normalizes None
            # content to "" while the event preserves the raw value.
            self.history.append(Message(role="assistant", content=action.content or ""))
            self.events.append(ScenarioEvent(type="agent", payload={"content": action.content}))
            return

        if action.type == "tool_call":
            # Agent made a tool call
            await self._handle_tool_call(action)
            tool_calls_made += 1

        elif action.type == "stop":
            return

        else:
            # Bug fix: an unrecognized action type previously re-entered the
            # loop without advancing tool_calls_made, so a misbehaving agent
            # could spin this loop forever. Log and bail out instead.
            logger.warning("Unexpected agent action type: %r", action.type)
            return
|
|
277
|
+
|
|
278
|
+
async def _handle_tool_call(self, action: AgentAction) -> None:
    """Handle a tool call from the agent.

    Records the call in the event trace and conversation history, executes
    the tool (async MCP tools via ``invoke_async``, sync YAML mocks via
    ``invoke``), then records the result — or a "tool not found" error —
    in the events, history, and tool_call_log. The tool may mutate
    ``self.env_state``; before/after snapshots are kept in the call log.
    """
    tool_name = action.tool_name or ""
    # Default action name when the agent did not specify one.
    tool_action = action.tool_action or "call"
    tool_args = action.tool_args or {}

    # Generate tool call ID (unique within this run: event count increases
    # monotonically).
    tool_call_id = f"call_{tool_name}_{len(self.events)}"
    function_name = f"{tool_name}__{tool_action}"

    # Log the call
    call_log: dict[str, Any] = {
        "tool": tool_name,
        "action": tool_action,
        "args": tool_args,
        # Snapshot so later state mutations don't leak into the log.
        "state_before": self.env_state.copy(),
    }

    self.events.append(
        ScenarioEvent(
            type="tool_call",
            payload={"tool": tool_name, "action": tool_action, "args": tool_args},
        )
    )

    # Add assistant message with tool call
    self.history.append(
        Message(
            role="assistant",
            content="",
            tool_calls=[
                {
                    "id": tool_call_id,
                    "name": function_name,
                    "arguments": json.dumps(tool_args),
                }
            ],
        )
    )

    # Execute the tool
    if tool_name in self.tools:
        tool = self.tools[tool_name]

        # Check if tool is async (MCP) or sync (YAML mock)
        if hasattr(tool, "invoke_async"):
            # MCP tool - async
            result: ToolResult = await tool.invoke_async(tool_action, tool_args, self.env_state)
        else:
            # YAML mock tool - sync
            result = tool.invoke(tool_action, tool_args, self.env_state)

        call_log["result"] = result.model_dump()
        call_log["state_after"] = self.env_state.copy()

        self.events.append(
            ScenarioEvent(
                type="tool_result",
                payload={
                    "tool": tool_name,
                    "action": tool_action,
                    "success": result.success,
                    "data": result.data,
                    "error": result.error,
                },
            )
        )

        # Add tool result to history; non-string payloads are JSON-encoded
        # so history content stays a plain string.
        result_content = result.data if result.success else (result.error or "")
        if not isinstance(result_content, str):
            result_content = json.dumps(result_content)

        self.history.append(
            Message(
                role="tool",
                content=result_content,
                tool_name=tool_name,
                tool_call_id=tool_call_id,
            )
        )
    else:
        # Tool not found: record the failure so the agent sees the error
        # instead of the run crashing.
        error_msg = f"Tool not found: {tool_name}"
        call_log["error"] = error_msg

        self.events.append(
            ScenarioEvent(
                type="tool_result",
                payload={"tool": tool_name, "success": False, "error": error_msg},
            )
        )

        self.history.append(
            Message(
                role="tool",
                content=error_msg,
                tool_name=tool_name,
                tool_call_id=tool_call_id,
            )
        )

    self.tool_call_log.append(call_log)
|
|
381
|
+
|
|
382
|
+
def _get_tool_schemas(self) -> list[dict[str, Any]]:
|
|
383
|
+
"""Get tool schemas for agent."""
|
|
384
|
+
schemas = []
|
|
385
|
+
for name, tool in self.tools.items():
|
|
386
|
+
schemas.append(
|
|
387
|
+
{
|
|
388
|
+
"name": name,
|
|
389
|
+
"description": tool.description,
|
|
390
|
+
"actions": tool.get_actions(),
|
|
391
|
+
}
|
|
392
|
+
)
|
|
393
|
+
return schemas
|
|
394
|
+
|
|
395
|
+
def _evaluate_goals(self) -> list[str]:
|
|
396
|
+
"""Evaluate which goals were achieved."""
|
|
397
|
+
achieved: list[str] = []
|
|
398
|
+
|
|
399
|
+
for goal in self.scenario.goals:
|
|
400
|
+
detection = goal.detection
|
|
401
|
+
if not detection:
|
|
402
|
+
continue
|
|
403
|
+
|
|
404
|
+
detection_type = detection.get("type", "")
|
|
405
|
+
|
|
406
|
+
if detection_type == "env_state":
|
|
407
|
+
# Check if state key equals value
|
|
408
|
+
key = detection.get("key", "")
|
|
409
|
+
expected = detection.get("value")
|
|
410
|
+
if self.env_state.get(key) == expected:
|
|
411
|
+
achieved.append(goal.id)
|
|
412
|
+
|
|
413
|
+
elif detection_type == "tool_called":
|
|
414
|
+
# Check if a tool was called
|
|
415
|
+
tool = detection.get("tool", "")
|
|
416
|
+
for call in self.tool_call_log:
|
|
417
|
+
if call.get("tool") == tool:
|
|
418
|
+
achieved.append(goal.id)
|
|
419
|
+
break
|
|
420
|
+
|
|
421
|
+
elif detection_type == "any_tool_called":
|
|
422
|
+
# Check if any of the listed tools was called
|
|
423
|
+
tools = detection.get("tools", [])
|
|
424
|
+
for call in self.tool_call_log:
|
|
425
|
+
if call.get("tool") in tools:
|
|
426
|
+
achieved.append(goal.id)
|
|
427
|
+
break
|
|
428
|
+
|
|
429
|
+
elif detection_type == "agent_contains":
|
|
430
|
+
# Check if agent messages contain patterns
|
|
431
|
+
patterns = detection.get("patterns", [])
|
|
432
|
+
agent_text = " ".join(
|
|
433
|
+
e.payload.get("content", "") for e in self.events if e.type == "agent"
|
|
434
|
+
).lower()
|
|
435
|
+
|
|
436
|
+
for pattern in patterns:
|
|
437
|
+
if pattern.lower() in agent_text:
|
|
438
|
+
achieved.append(goal.id)
|
|
439
|
+
break
|
|
440
|
+
|
|
441
|
+
return list(set(achieved)) # Deduplicate
|
|
442
|
+
|
|
443
|
+
def _compute_score(self, goals_achieved: list[str]) -> float:
    """Compute score based on achieved goals.

    Sums the point values of the achieved goals. When the scenario declares
    a scoring formula, the formula result replaces the sum; if the formula
    fails to evaluate, the plain sum is kept as a fallback.
    """
    from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula

    goal_map = {g.id: g for g in self.scenario.goals}
    total = sum(
        (goal_map[goal_id].points for goal_id in goals_achieved if goal_id in goal_map),
        0.0,
    )

    # Apply scoring formula if present
    formula = self.scenario.scoring.get("formula")
    if formula:
        # Each goal id becomes a 0/1 variable; dashes are mapped to
        # underscores so the ids are valid formula identifiers.
        context = {
            g.id.replace("-", "_"): 1.0 if g.id in goals_achieved else 0.0
            for g in self.scenario.goals
        }
        context["goals_achieved"] = float(len(goals_achieved))
        context["total_goals"] = float(len(self.scenario.goals))
        try:
            total = safe_eval_formula(formula, context)
        except EvaluationError as e:
            logger.warning("Failed to evaluate scoring formula '%s': %s", formula, e)

    return total
|