sandboxy-0.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,468 @@
+"""Scenario runner - execute scenarios with YAML-defined and MCP tools."""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+from pathlib import Path
+from typing import Any
+
+from pydantic import BaseModel, Field
+
+from sandboxy.agents.base import Agent, AgentAction
+from sandboxy.scenarios.loader import ScenarioSpec, StepSpec
+from sandboxy.tools.base import ToolResult
+from sandboxy.tools.loader import YAML_TOOL_DIRS
+from sandboxy.tools.yaml_tools import load_scenario_tools
+
+logger = logging.getLogger(__name__)
+
+
+class ScenarioEvent(BaseModel):
+    """Event recorded during scenario execution."""
+
+    type: str  # "user", "agent", "tool_call", "tool_result", "system"
+    payload: dict[str, Any] = Field(default_factory=dict)
+
+
+class ScenarioResult(BaseModel):
+    """Result of running a scenario."""
+
+    scenario_id: str
+    agent_id: str
+    events: list[ScenarioEvent] = Field(default_factory=list)
+    tool_calls: list[dict[str, Any]] = Field(default_factory=list)
+    final_state: dict[str, Any] = Field(default_factory=dict)
+    goals_achieved: list[str] = Field(default_factory=list)
+    score: float = 0.0
+
+    def to_json(self, indent: int | None = None) -> str:
+        """Serialize result to JSON string."""
+        return self.model_dump_json(indent=indent)
+
+    def pretty(self) -> str:
+        """Format result for human-readable display."""
+        lines = [
+            f"Scenario: {self.scenario_id}",
+            f"Agent: {self.agent_id}",
+            "",
+        ]
+
+        for event in self.events:
+            if event.type == "user":
+                lines.append(f"USER: {event.payload.get('content', '')[:100]}...")
+            elif event.type == "agent":
+                content = event.payload.get("content", "")
+                if len(content) > 200:
+                    content = content[:200] + "..."
+                lines.append(f"AGENT: {content}")
+            elif event.type == "tool_call":
+                tool = event.payload.get("tool", "")
+                action = event.payload.get("action", "")
+                lines.append(f"TOOL: {tool}.{action}()")
+            elif event.type == "tool_result":
+                success = event.payload.get("success", False)
+                status = "OK" if success else "FAIL"
+                data = str(event.payload.get("data", ""))[:50]
+                lines.append(f" -> [{status}] {data}")
+
+        lines.append("")
+        lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
+        lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
+        lines.append(f"Score: {self.score}")
+
+        return "\n".join(lines)
+
+
+class Message(BaseModel):
+    """A message in conversation history."""
+
+    role: str  # "system", "user", "assistant", "tool"
+    content: str
+    tool_name: str | None = None
+    tool_call_id: str | None = None
+    tool_calls: list[dict[str, Any]] | None = None
+
+
+class ScenarioRunner:
+    """Runs scenarios with YAML-defined and MCP tools."""
+
+    def __init__(
+        self,
+        scenario: ScenarioSpec,
+        agent: Agent,
+        tool_dirs: list[Path] | None = None,
+    ) -> None:
+        """Initialize the scenario runner.
+
+        Args:
+            scenario: The scenario specification
+            agent: The agent to run
+            tool_dirs: Optional directories to search for tool libraries
+        """
+        self.scenario = scenario
+        self.agent = agent
+        self.tool_dirs = tool_dirs or YAML_TOOL_DIRS
+
+        # Load YAML tools
+        scenario_data = {
+            "tools_from": scenario.tools_from,
+            "tools": scenario.tools,
+        }
+        self.tools: dict[str, Any] = load_scenario_tools(scenario_data, self.tool_dirs)
+        self._mcp_tools: dict[str, Any] = {}  # MCP tools loaded separately
+        self._mcp_manager: Any = None
+
+        # Initialize state
+        self.env_state: dict[str, Any] = scenario.initial_state.copy()
+        self.history: list[Message] = []
+        self.events: list[ScenarioEvent] = []
+        self.tool_call_log: list[dict[str, Any]] = []
+
+    async def _load_mcp_tools(self) -> None:
+        """Load MCP tools from configured servers."""
+        if not self.scenario.mcp_servers:
+            return
+
+        from sandboxy.mcp.client import McpManager, McpServerConfig
+
+        self._mcp_manager = McpManager()
+
+        configs = [
+            McpServerConfig(
+                name=server.name,
+                # Local (stdio) transport
+                command=server.command,
+                args=server.args,
+                env=server.env,
+                # Remote (HTTP) transport
+                url=server.url,
+                headers=server.headers,
+                transport=server.transport,  # type: ignore[arg-type]
+            )
+            for server in self.scenario.mcp_servers
+        ]
+
+        self._mcp_tools = await self._mcp_manager.connect_all(configs)
+
+        # Merge MCP tools with YAML tools (YAML tools take precedence)
+        for name, tool in self._mcp_tools.items():
+            if name not in self.tools:
+                self.tools[name] = tool
+
+    async def _cleanup_mcp(self) -> None:
+        """Disconnect from MCP servers."""
+        if self._mcp_manager:
+            await self._mcp_manager.disconnect_all()
+
+    def run(self, max_turns: int = 20) -> ScenarioResult:
+        """Execute the scenario synchronously.
+
+        Args:
+            max_turns: Maximum number of conversation turns
+
+        Returns:
+            ScenarioResult with events and evaluation
+        """
+        return asyncio.run(self.run_async(max_turns))
+
+    async def run_async(self, max_turns: int = 20) -> ScenarioResult:
+        """Execute the scenario asynchronously.
+
+        Args:
+            max_turns: Maximum number of conversation turns
+
+        Returns:
+            ScenarioResult with events and evaluation
+        """
+        try:
+            # Load MCP tools if configured
+            await self._load_mcp_tools()
+
+            # Add system prompt to history
+            if self.scenario.system_prompt:
+                self.history.append(Message(role="system", content=self.scenario.system_prompt))
+
+            # Execute steps
+            for step in self.scenario.steps:
+                await self._execute_step(step, max_turns)
+
+            # Evaluate goals
+            goals_achieved = self._evaluate_goals()
+            score = self._compute_score(goals_achieved)
+
+            return ScenarioResult(
+                scenario_id=self.scenario.id,
+                agent_id=self.agent.config.id,
+                events=self.events,
+                tool_calls=self.tool_call_log,
+                final_state=self.env_state.copy(),
+                goals_achieved=goals_achieved,
+                score=score,
+            )
+        finally:
+            await self._cleanup_mcp()
+
+    async def _execute_step(self, step: StepSpec, max_turns: int) -> None:
+        """Execute a single scenario step."""
+        if step.action == "inject_user":
+            content = step.params.get("content", "")
+            self._add_user_message(content)
+
+        elif step.action == "await_agent":
+            await self._get_agent_response(max_tool_calls=10)
+
+        elif step.action == "await_user":
+            # Interactive mode - skip in batch execution
+            logger.debug("Skipping await_user step (batch mode)")
+
+    def _add_user_message(self, content: str) -> None:
+        """Add a user message to history."""
+        self.history.append(Message(role="user", content=content))
+        self.events.append(ScenarioEvent(type="user", payload={"content": content}))
+
+    async def _get_agent_response(self, max_tool_calls: int = 10) -> None:
+        """Get agent response, handling tool calls."""
+        from sandboxy.core.state import Message as CoreMessage
+        from sandboxy.core.state import ToolCall
+
+        tool_calls_made = 0
+
+        while tool_calls_made < max_tool_calls:
+            # Build tool schemas
+            tool_schemas = self._get_tool_schemas()
+
+            # Convert history to CoreMessage objects for agent
+            history_for_agent: list[CoreMessage] = []
+            for m in self.history:
+                # Convert tool_calls from dicts to ToolCall objects if present
+                tool_calls_obj = None
+                if m.tool_calls:
+                    tool_calls_obj = [
+                        ToolCall(
+                            id=tc["id"],
+                            name=tc["name"],
+                            arguments=tc["arguments"],
+                        )
+                        for tc in m.tool_calls
+                    ]
+
+                history_for_agent.append(
+                    CoreMessage(
+                        role=m.role,  # type: ignore[arg-type]
+                        content=m.content,
+                        tool_name=m.tool_name,
+                        tool_call_id=m.tool_call_id,
+                        tool_calls=tool_calls_obj,
+                    )
+                )
+
+            # Get agent action
+            action: AgentAction = self.agent.step(history_for_agent, tool_schemas)
+
+            if action.type == "message":
+                # Agent responded with a message
+                self.history.append(Message(role="assistant", content=action.content or ""))
+                self.events.append(ScenarioEvent(type="agent", payload={"content": action.content}))
+                return
+
+            if action.type == "tool_call":
+                # Agent made a tool call
+                await self._handle_tool_call(action)
+                tool_calls_made += 1
+
+            elif action.type == "stop":
+                return
+
+    async def _handle_tool_call(self, action: AgentAction) -> None:
+        """Handle a tool call from the agent."""
+        tool_name = action.tool_name or ""
+        tool_action = action.tool_action or "call"
+        tool_args = action.tool_args or {}
+
+        # Generate tool call ID
+        tool_call_id = f"call_{tool_name}_{len(self.events)}"
+        function_name = f"{tool_name}__{tool_action}"
+
+        # Log the call
+        call_log: dict[str, Any] = {
+            "tool": tool_name,
+            "action": tool_action,
+            "args": tool_args,
+            "state_before": self.env_state.copy(),
+        }
+
+        self.events.append(
+            ScenarioEvent(
+                type="tool_call",
+                payload={"tool": tool_name, "action": tool_action, "args": tool_args},
+            )
+        )
+
+        # Add assistant message with tool call
+        self.history.append(
+            Message(
+                role="assistant",
+                content="",
+                tool_calls=[
+                    {
+                        "id": tool_call_id,
+                        "name": function_name,
+                        "arguments": json.dumps(tool_args),
+                    }
+                ],
+            )
+        )
+
+        # Execute the tool
+        if tool_name in self.tools:
+            tool = self.tools[tool_name]
+
+            # Check if tool is async (MCP) or sync (YAML mock)
+            if hasattr(tool, "invoke_async"):
+                # MCP tool - async
+                result: ToolResult = await tool.invoke_async(tool_action, tool_args, self.env_state)
+            else:
+                # YAML mock tool - sync
+                result = tool.invoke(tool_action, tool_args, self.env_state)
+
+            call_log["result"] = result.model_dump()
+            call_log["state_after"] = self.env_state.copy()
+
+            self.events.append(
+                ScenarioEvent(
+                    type="tool_result",
+                    payload={
+                        "tool": tool_name,
+                        "action": tool_action,
+                        "success": result.success,
+                        "data": result.data,
+                        "error": result.error,
+                    },
+                )
+            )
+
+            # Add tool result to history
+            result_content = result.data if result.success else (result.error or "")
+            if not isinstance(result_content, str):
+                result_content = json.dumps(result_content)
+
+            self.history.append(
+                Message(
+                    role="tool",
+                    content=result_content,
+                    tool_name=tool_name,
+                    tool_call_id=tool_call_id,
+                )
+            )
+        else:
+            # Tool not found
+            error_msg = f"Tool not found: {tool_name}"
+            call_log["error"] = error_msg
+
+            self.events.append(
+                ScenarioEvent(
+                    type="tool_result",
+                    payload={"tool": tool_name, "success": False, "error": error_msg},
+                )
+            )
+
+            self.history.append(
+                Message(
+                    role="tool",
+                    content=error_msg,
+                    tool_name=tool_name,
+                    tool_call_id=tool_call_id,
+                )
+            )
+
+        self.tool_call_log.append(call_log)
+
+    def _get_tool_schemas(self) -> list[dict[str, Any]]:
+        """Get tool schemas for agent."""
+        schemas = []
+        for name, tool in self.tools.items():
+            schemas.append(
+                {
+                    "name": name,
+                    "description": tool.description,
+                    "actions": tool.get_actions(),
+                }
+            )
+        return schemas
+
+    def _evaluate_goals(self) -> list[str]:
+        """Evaluate which goals were achieved."""
+        achieved: list[str] = []
+
+        for goal in self.scenario.goals:
+            detection = goal.detection
+            if not detection:
+                continue
+
+            detection_type = detection.get("type", "")
+
+            if detection_type == "env_state":
+                # Check if state key equals value
+                key = detection.get("key", "")
+                expected = detection.get("value")
+                if self.env_state.get(key) == expected:
+                    achieved.append(goal.id)
+
+            elif detection_type == "tool_called":
+                # Check if a tool was called
+                tool = detection.get("tool", "")
+                for call in self.tool_call_log:
+                    if call.get("tool") == tool:
+                        achieved.append(goal.id)
+                        break
+
+            elif detection_type == "any_tool_called":
+                # Check if any of the listed tools was called
+                tools = detection.get("tools", [])
+                for call in self.tool_call_log:
+                    if call.get("tool") in tools:
+                        achieved.append(goal.id)
+                        break
+
+            elif detection_type == "agent_contains":
+                # Check if agent messages contain patterns
+                patterns = detection.get("patterns", [])
+                agent_text = " ".join(
+                    e.payload.get("content", "") for e in self.events if e.type == "agent"
+                ).lower()
+
+                for pattern in patterns:
+                    if pattern.lower() in agent_text:
+                        achieved.append(goal.id)
+                        break
+
+        return list(set(achieved))  # Deduplicate
+
+    def _compute_score(self, goals_achieved: list[str]) -> float:
+        """Compute score based on achieved goals."""
+        from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
+
+        total = 0.0
+        goal_map = {g.id: g for g in self.scenario.goals}
+
+        for goal_id in goals_achieved:
+            if goal_id in goal_map:
+                total += goal_map[goal_id].points
+
+        # Apply scoring formula if present
+        formula = self.scenario.scoring.get("formula")
+        if formula:
+            context = {
+                g.id.replace("-", "_"): 1.0 if g.id in goals_achieved else 0.0
+                for g in self.scenario.goals
+            }
+            context["goals_achieved"] = float(len(goals_achieved))
+            context["total_goals"] = float(len(self.scenario.goals))
+            try:
+                total = safe_eval_formula(formula, context)
+            except EvaluationError as e:
+                logger.warning("Failed to evaluate scoring formula '%s': %s", formula, e)
+
+        return total
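
For orientation, a minimal usage sketch of the ScenarioRunner defined above follows. It assumes the caller has already built a ScenarioSpec (sandboxy.scenarios.loader) and an Agent (sandboxy.agents.base) by whatever means the package provides; only the ScenarioRunner constructor, run(), pretty(), and to_json() come from the file itself, and the "./tools" directory is a placeholder.

# Hypothetical driver: wire an already-loaded ScenarioSpec and Agent into the runner.
# Nothing here beyond ScenarioRunner's own public surface is a confirmed sandboxy API.
from pathlib import Path

from sandboxy.scenarios.runner import ScenarioRunner


def run_scenario_once(scenario_spec, agent) -> str:
    runner = ScenarioRunner(
        scenario=scenario_spec,        # ScenarioSpec from sandboxy.scenarios.loader
        agent=agent,                   # Agent from sandboxy.agents.base
        tool_dirs=[Path("./tools")],   # optional extra YAML tool directories (assumed path)
    )
    result = runner.run(max_turns=20)  # synchronous wrapper around run_async()
    print(result.pretty())             # human-readable transcript and score
    return result.to_json(indent=2)    # machine-readable record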