sandboxy 0.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (60)
  1. sandboxy/__init__.py +3 -0
  2. sandboxy/agents/__init__.py +21 -0
  3. sandboxy/agents/base.py +66 -0
  4. sandboxy/agents/llm_prompt.py +308 -0
  5. sandboxy/agents/loader.py +222 -0
  6. sandboxy/api/__init__.py +5 -0
  7. sandboxy/api/app.py +76 -0
  8. sandboxy/api/routes/__init__.py +1 -0
  9. sandboxy/api/routes/agents.py +92 -0
  10. sandboxy/api/routes/local.py +1388 -0
  11. sandboxy/api/routes/tools.py +106 -0
  12. sandboxy/cli/__init__.py +1 -0
  13. sandboxy/cli/main.py +1196 -0
  14. sandboxy/cli/type_detector.py +48 -0
  15. sandboxy/config.py +49 -0
  16. sandboxy/core/__init__.py +1 -0
  17. sandboxy/core/async_runner.py +824 -0
  18. sandboxy/core/mdl_parser.py +441 -0
  19. sandboxy/core/runner.py +599 -0
  20. sandboxy/core/safe_eval.py +165 -0
  21. sandboxy/core/state.py +234 -0
  22. sandboxy/datasets/__init__.py +20 -0
  23. sandboxy/datasets/loader.py +193 -0
  24. sandboxy/datasets/runner.py +442 -0
  25. sandboxy/errors.py +166 -0
  26. sandboxy/local/context.py +235 -0
  27. sandboxy/local/results.py +173 -0
  28. sandboxy/logging.py +31 -0
  29. sandboxy/mcp/__init__.py +25 -0
  30. sandboxy/mcp/client.py +360 -0
  31. sandboxy/mcp/wrapper.py +99 -0
  32. sandboxy/providers/__init__.py +34 -0
  33. sandboxy/providers/anthropic_provider.py +271 -0
  34. sandboxy/providers/base.py +123 -0
  35. sandboxy/providers/http_client.py +101 -0
  36. sandboxy/providers/openai_provider.py +282 -0
  37. sandboxy/providers/openrouter.py +958 -0
  38. sandboxy/providers/registry.py +199 -0
  39. sandboxy/scenarios/__init__.py +11 -0
  40. sandboxy/scenarios/comparison.py +491 -0
  41. sandboxy/scenarios/loader.py +262 -0
  42. sandboxy/scenarios/runner.py +468 -0
  43. sandboxy/scenarios/unified.py +1434 -0
  44. sandboxy/session/__init__.py +21 -0
  45. sandboxy/session/manager.py +278 -0
  46. sandboxy/tools/__init__.py +34 -0
  47. sandboxy/tools/base.py +127 -0
  48. sandboxy/tools/loader.py +270 -0
  49. sandboxy/tools/yaml_tools.py +708 -0
  50. sandboxy/ui/__init__.py +27 -0
  51. sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
  52. sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
  53. sandboxy/ui/dist/index.html +14 -0
  54. sandboxy/utils/__init__.py +3 -0
  55. sandboxy/utils/time.py +20 -0
  56. sandboxy-0.0.1.dist-info/METADATA +241 -0
  57. sandboxy-0.0.1.dist-info/RECORD +60 -0
  58. sandboxy-0.0.1.dist-info/WHEEL +4 -0
  59. sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
  60. sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
@@ -0,0 +1,599 @@
1
+ """Runner - executes MDL modules with agents and tools."""
2
+
3
+ import json
4
+ import logging
5
+ import re
6
+ from typing import Any
7
+
8
+ from pydantic import BaseModel, Field
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+ from sandboxy.agents.base import Agent, AgentAction
13
+ from sandboxy.core.state import EvaluationResult, Message, ModuleSpec, Step, ToolCall
14
+ from sandboxy.tools.base import Tool, ToolResult
15
+ from sandboxy.tools.loader import ToolLoader
16
+
17
+
18
class RunEvent(BaseModel):
    """Event recorded during module execution."""

    # Event discriminator used by RunResult.pretty() and evaluation checks.
    type: str  # "user", "agent", "tool_call", "tool_result", "branch", "eval"
    # Free-form event data; keys vary by event type (e.g. "content",
    # "tool", "action", "args", "result", "branch", "step_id").
    payload: dict[str, Any] = Field(default_factory=dict)
23
+
24
+
25
class RunResult(BaseModel):
    """Result of running a module with an agent."""

    module_id: str
    agent_id: str
    events: list[RunEvent] = Field(default_factory=list)
    evaluation: EvaluationResult = Field(default_factory=EvaluationResult)

    def to_json(self, indent: int | None = None) -> str:
        """Serialize result to JSON string."""
        return self.model_dump_json(indent=indent)

    def pretty(self) -> str:
        """Format result for human-readable display."""
        out: list[str] = [
            f"Module: {self.module_id}",
            f"Agent: {self.agent_id}",
            "",
        ]

        # One transcript line per event; unknown event types are skipped.
        for ev in self.events:
            payload = ev.payload
            if ev.type == "user":
                out.append(f"USER: {payload.get('content', '')}")
            elif ev.type == "agent":
                out.append(f"AGENT: {payload.get('content', '')}")
            elif ev.type == "tool_call":
                name = payload.get("tool", "")
                act = payload.get("action", "")
                call_args = payload.get("args", {})
                out.append(f"TOOL CALL: {name}.{act}({call_args})")
            elif ev.type == "tool_result":
                res = payload.get("result", {})
                status = "OK" if res.get("success", False) else "FAIL"
                out.append(f"TOOL RESULT [{status}]: {res.get('data', '')}")
            elif ev.type == "branch":
                out.append(f"[BRANCH] → {payload.get('branch', '')}")

        out.append("")
        out.append("EVALUATION:")
        out.append(f" Score: {self.evaluation.score}")
        out.append(f" Status: {self.evaluation.status}")
        out.append(f" Events: {self.evaluation.num_events}")
        if self.evaluation.checks:
            out.append(f" Checks: {json.dumps(self.evaluation.checks, indent=2)}")

        return "\n".join(out)
74
+
75
+
76
class Runner:
    """Executes MDL modules with agents and tools."""

    def __init__(self, module: ModuleSpec, agent: Agent) -> None:
        """Initialize runner with module and agent.

        Args:
            module: MDL module specification to execute.
            agent: Agent to run within the module.
        """
        self.module = module
        self.agent = agent
        # Chronological record of everything that happens during the run.
        self.events: list[RunEvent] = []
        # Conversation transcript replayed to the agent on every step.
        self.history: list[Message] = []
        # NOTE(review): shallow copy — nested dicts/lists inside
        # initial_state are still shared with the module spec; confirm
        # tools never mutate nested values in place.
        self.env_state: dict[str, Any] = module.environment.initial_state.copy()
        # Tool instances built from the module's environment config.
        self.tools: dict[str, Tool] = ToolLoader.from_env_config(module.environment)
92
+
93
+ def run(self) -> RunResult:
94
+ """Execute the module and return results.
95
+
96
+ Returns:
97
+ Result containing events and evaluation.
98
+ """
99
+ step_index = 0
100
+ steps = self.module.steps
101
+
102
+ while step_index < len(steps):
103
+ step = steps[step_index]
104
+ next_index = step_index + 1
105
+
106
+ if step.action == "inject_user":
107
+ self._handle_inject_user(step)
108
+
109
+ elif step.action == "await_agent":
110
+ should_stop = self._handle_await_agent(step)
111
+ if should_stop:
112
+ break
113
+
114
+ elif step.action == "branch":
115
+ new_steps, new_index = self._handle_branch(step)
116
+ if new_steps is not None:
117
+ steps = new_steps
118
+ step_index = new_index
119
+ continue
120
+
121
+ step_index = next_index
122
+
123
+ evaluation = self._evaluate()
124
+ return RunResult(
125
+ module_id=self.module.id,
126
+ agent_id=self.agent.config.id,
127
+ events=self.events,
128
+ evaluation=evaluation,
129
+ )
130
+
131
+ def _handle_inject_user(self, step: Step) -> None:
132
+ """Handle inject_user action - add user message to history."""
133
+ content = step.params.get("content", "")
134
+ msg = Message(role="user", content=content)
135
+ self.history.append(msg)
136
+ self.events.append(
137
+ RunEvent(
138
+ type="user",
139
+ payload={"content": content, "step_id": step.id},
140
+ )
141
+ )
142
+
143
+ def _handle_await_agent(self, step: Step, max_tool_calls: int = 10) -> bool:
144
+ """Handle await_agent action - get agent response.
145
+
146
+ The agent may make multiple tool calls before responding with a message.
147
+ We loop until the agent returns a message or stop action.
148
+
149
+ Args:
150
+ step: Current step being executed.
151
+ max_tool_calls: Maximum tool calls allowed before forcing stop.
152
+
153
+ Returns:
154
+ True if agent wants to stop, False otherwise.
155
+ """
156
+ tool_call_count = 0
157
+
158
+ while tool_call_count < max_tool_calls:
159
+ # Build tool schemas for agent
160
+ tool_schemas = self._get_tool_schemas()
161
+
162
+ # Get agent action
163
+ action: AgentAction = self.agent.step(self.history, tool_schemas)
164
+
165
+ if action.type == "message":
166
+ msg = Message(role="assistant", content=action.content or "")
167
+ self.history.append(msg)
168
+ self.events.append(
169
+ RunEvent(
170
+ type="agent",
171
+ payload={"content": msg.content, "step_id": step.id},
172
+ )
173
+ )
174
+ return False # Done with this await_agent step
175
+
176
+ if action.type == "tool_call":
177
+ self._handle_tool_call(action, step)
178
+ tool_call_count += 1
179
+ # Continue loop to let agent respond to tool result
180
+
181
+ elif action.type == "stop":
182
+ return True
183
+
184
+ # Max tool calls reached
185
+ return False
186
+
187
    def _handle_tool_call(self, action: AgentAction, step: Step) -> None:
        """Handle a tool call from the agent.

        Records the call as an event, appends the assistant tool-call message
        followed by the matching tool-result message to history (the ordering
        required by the OpenAI chat API), and executes the tool against the
        mutable environment state. A missing tool still produces a tool-result
        message so the transcript stays well-formed.
        """
        tool_name = action.tool_name or ""
        tool_action = action.tool_action or ""
        tool_args = action.tool_args or {}

        # Generate unique tool call ID
        # len(self.events) acts as a monotonically increasing suffix.
        tool_call_id = f"call_{tool_name}_{tool_action}_{len(self.events)}"
        # Function name uses double underscore separator (matching _build_tools)
        function_name = f"{tool_name}__{tool_action}"

        self.events.append(
            RunEvent(
                type="tool_call",
                payload={
                    "tool": tool_name,
                    "action": tool_action,
                    "args": tool_args,
                    "step_id": step.id,
                },
            )
        )

        # Add assistant message with tool_calls BEFORE the tool result
        # This is required by OpenAI API
        self.history.append(
            Message(
                role="assistant",
                content="",
                tool_calls=[
                    ToolCall(
                        id=tool_call_id,
                        name=function_name,
                        arguments=json.dumps(tool_args),
                    )
                ],
            )
        )

        # Execute tool if available
        if tool_name in self.tools:
            tool = self.tools[tool_name]
            # env_state is passed by reference; tools may mutate it.
            result: ToolResult = tool.invoke(tool_action, tool_args, self.env_state)

            self.events.append(
                RunEvent(
                    type="tool_result",
                    payload={
                        "tool": tool_name,
                        "action": tool_action,
                        "result": result.model_dump(),
                    },
                )
            )

            # Add tool result to history with matching tool_call_id
            self.history.append(
                Message(
                    role="tool",
                    # On failure, surface the error text instead of data.
                    content=json.dumps(result.data) if result.success else result.error or "",
                    tool_name=tool_name,
                    tool_call_id=tool_call_id,
                )
            )
        else:
            # Tool not found - still add tool result message
            error_msg = f"Tool not found: {tool_name}"
            self.events.append(
                RunEvent(
                    type="tool_result",
                    payload={
                        "tool": tool_name,
                        "action": tool_action,
                        "result": {"success": False, "error": error_msg},
                    },
                )
            )
            self.history.append(
                Message(
                    role="tool",
                    content=error_msg,
                    tool_name=tool_name,
                    tool_call_id=tool_call_id,
                )
            )
272
+
273
+ def _handle_branch(self, step: Step) -> tuple[list[Step] | None, int]:
274
+ """Handle branch action.
275
+
276
+ Returns:
277
+ Tuple of (new_steps, new_index) if branching, (None, 0) otherwise.
278
+ """
279
+ branch_name = step.params.get("branch_name")
280
+
281
+ self.events.append(
282
+ RunEvent(
283
+ type="branch",
284
+ payload={"branch": branch_name, "step_id": step.id},
285
+ )
286
+ )
287
+
288
+ if branch_name and branch_name in self.module.branches:
289
+ return self.module.branches[branch_name], 0
290
+
291
+ return None, 0
292
+
293
+ def _get_tool_schemas(self) -> list[dict[str, Any]]:
294
+ """Get tool schemas for agent tool calling."""
295
+ schemas = []
296
+ for name, tool in self.tools.items():
297
+ schemas.append(
298
+ {
299
+ "name": name,
300
+ "description": tool.description,
301
+ "actions": tool.get_actions(),
302
+ }
303
+ )
304
+ return schemas
305
+
306
+ def _evaluate(self) -> EvaluationResult:
307
+ """Run evaluation checks and compute score.
308
+
309
+ Returns:
310
+ Evaluation result with checks and score.
311
+ """
312
+ checks: dict[str, Any] = {}
313
+
314
+ for check in self.module.evaluation:
315
+ result = self._run_check(check)
316
+ checks[check.name] = result
317
+
318
+ # Compute score using scoring config
319
+ score = self._compute_score(checks)
320
+
321
+ return EvaluationResult(
322
+ checks=checks,
323
+ score=score,
324
+ num_events=len(self.events),
325
+ status="ok",
326
+ )
327
+
328
+ def _run_check(self, check: Any) -> dict[str, Any]:
329
+ """Run a single evaluation check."""
330
+ kind = check.kind
331
+
332
+ try:
333
+ if kind == "contains":
334
+ return self._check_contains(check)
335
+ if kind == "regex":
336
+ return self._check_regex(check)
337
+ if kind == "count":
338
+ return self._check_count(check)
339
+ if kind == "tool_called":
340
+ return self._check_tool_called(check)
341
+ if kind == "env_state":
342
+ return self._check_env_state(check)
343
+ if kind == "deterministic":
344
+ return self._eval_deterministic(check)
345
+ if kind == "llm":
346
+ return {"status": "skipped", "reason": "LLM eval not implemented"}
347
+ return {"status": "error", "error": f"Unknown check kind: {kind}"}
348
+ except Exception as e:
349
+ return {"status": "error", "error": str(e)}
350
+
351
+ def _get_target_text(self, target: str) -> str:
352
+ """Get text content for a target."""
353
+ if target == "agent_messages":
354
+ return " ".join(msg.content for msg in self.history if msg.role == "assistant")
355
+ if target == "user_messages":
356
+ return " ".join(msg.content for msg in self.history if msg.role == "user")
357
+ if target == "all_messages":
358
+ return " ".join(msg.content for msg in self.history)
359
+ if target == "last_agent_message":
360
+ for msg in reversed(self.history):
361
+ if msg.role == "assistant":
362
+ return msg.content
363
+ return ""
364
+ return ""
365
+
366
+ def _get_target_list(self, target: str) -> list[Any]:
367
+ """Get list of items for a target."""
368
+ if target == "agent_messages":
369
+ return [msg for msg in self.history if msg.role == "assistant"]
370
+ if target == "user_messages":
371
+ return [msg for msg in self.history if msg.role == "user"]
372
+ if target == "all_messages":
373
+ return list(self.history)
374
+ if target == "tool_calls":
375
+ return [event for event in self.events if event.type == "tool_call"]
376
+ return []
377
+
378
+ def _check_contains(self, check: Any) -> dict[str, Any]:
379
+ """Check if target contains a value."""
380
+ target = check.target or "agent_messages"
381
+ value = check.value or ""
382
+ expected = check.expected
383
+ case_sensitive = check.case_sensitive
384
+
385
+ text = self._get_target_text(target)
386
+
387
+ if not case_sensitive:
388
+ text = text.lower()
389
+ value = value.lower()
390
+
391
+ found = value in text
392
+ passed = found == expected
393
+
394
+ return {
395
+ "passed": passed,
396
+ "found": found,
397
+ "expected": expected,
398
+ }
399
+
400
+ def _check_regex(self, check: Any) -> dict[str, Any]:
401
+ """Check if target matches a regex pattern."""
402
+ target = check.target or "agent_messages"
403
+ pattern = check.pattern or ""
404
+ expected = check.expected
405
+
406
+ text = self._get_target_text(target)
407
+ flags = 0 if check.case_sensitive else re.IGNORECASE
408
+ match = bool(re.search(pattern, text, flags))
409
+ passed = match == expected
410
+
411
+ return {
412
+ "passed": passed,
413
+ "matched": match,
414
+ "expected": expected,
415
+ }
416
+
417
+ def _check_count(self, check: Any) -> dict[str, Any]:
418
+ """Check count of items."""
419
+ target = check.target or "agent_messages"
420
+ min_count = check.min
421
+ max_count = check.max
422
+
423
+ items = self._get_target_list(target)
424
+ count = len(items)
425
+
426
+ passed = True
427
+ if min_count is not None and count < min_count:
428
+ passed = False
429
+ if max_count is not None and count > max_count:
430
+ passed = False
431
+
432
+ return {
433
+ "passed": passed,
434
+ "count": count,
435
+ "min": min_count,
436
+ "max": max_count,
437
+ }
438
+
439
+ def _check_tool_called(self, check: Any) -> dict[str, Any]:
440
+ """Check if a specific tool was called."""
441
+ tool_name = check.tool
442
+ action_name = check.action
443
+ expected = check.expected
444
+
445
+ tool_calls = [e for e in self.events if e.type == "tool_call"]
446
+
447
+ called = False
448
+ for tc in tool_calls:
449
+ payload = tc.payload
450
+ if payload.get("tool") == tool_name:
451
+ if action_name is None or payload.get("action") == action_name:
452
+ called = True
453
+ break
454
+
455
+ passed = called == expected
456
+
457
+ return {
458
+ "passed": passed,
459
+ "called": called,
460
+ "expected": expected,
461
+ }
462
+
463
+ def _check_env_state(self, check: Any) -> dict[str, Any]:
464
+ """Check environment state value."""
465
+ key = check.key or ""
466
+ expected_value = check.value
467
+
468
+ actual_value = self.env_state.get(key)
469
+ passed = actual_value == expected_value
470
+
471
+ return {
472
+ "passed": passed,
473
+ "actual": actual_value,
474
+ "expected": expected_value,
475
+ }
476
+
477
+ def _compute_score(self, checks: dict[str, Any]) -> float:
478
+ """Compute final score based on scoring config."""
479
+ scoring = self.module.scoring
480
+
481
+ # Extract numeric values from checks
482
+ check_values: dict[str, float] = {}
483
+ for name, result in checks.items():
484
+ if isinstance(result, int | float):
485
+ check_values[name] = float(result)
486
+ elif isinstance(result, bool):
487
+ check_values[name] = 1.0 if result else 0.0
488
+ elif isinstance(result, dict):
489
+ if result.get("passed") is True:
490
+ check_values[name] = 1.0
491
+ elif result.get("passed") is False:
492
+ check_values[name] = 0.0
493
+ elif "value" in result and isinstance(result["value"], int | float):
494
+ check_values[name] = float(result["value"])
495
+
496
+ # Use formula if specified
497
+ if scoring.formula:
498
+ try:
499
+ score = self._eval_score_formula(scoring.formula, check_values)
500
+ except Exception:
501
+ score = self._weighted_average(check_values, scoring.weights)
502
+ else:
503
+ score = self._weighted_average(check_values, scoring.weights)
504
+
505
+ # Normalize if requested
506
+ if scoring.normalize and scoring.max_score != scoring.min_score:
507
+ score = (score - scoring.min_score) / (scoring.max_score - scoring.min_score)
508
+ score = max(0.0, min(1.0, score))
509
+
510
+ return score
511
+
512
+ def _eval_score_formula(self, formula: str, check_values: dict[str, float]) -> float:
513
+ """Evaluate a score formula."""
514
+ safe_builtins = {
515
+ "True": True,
516
+ "False": False,
517
+ "None": None,
518
+ "len": len,
519
+ "min": min,
520
+ "max": max,
521
+ "abs": abs,
522
+ "sum": sum,
523
+ "round": round,
524
+ }
525
+ context = {"__builtins__": safe_builtins, "env_state": self.env_state}
526
+ context.update(check_values)
527
+ return float(eval(formula, context, {}))
528
+
529
+ def _weighted_average(self, values: dict[str, float], weights: dict[str, float]) -> float:
530
+ """Compute weighted average of check values."""
531
+ if not values:
532
+ return 0.0
533
+ total = sum(values[n] * weights.get(n, 1.0) for n in values)
534
+ total_weight = sum(weights.get(n, 1.0) for n in values)
535
+ return total / total_weight if total_weight > 0 else 0.0
536
+
537
+ def _eval_deterministic(self, check: Any) -> Any:
538
+ """Evaluate a deterministic check.
539
+
540
+ Args:
541
+ check: Evaluation check with expr in config.
542
+
543
+ Returns:
544
+ Result of evaluation (bool, number, or error dict).
545
+ """
546
+ expr = check.config.get("expr", "")
547
+ if not expr or expr == "TODO":
548
+ return {"status": "skipped", "reason": "No expression defined"}
549
+
550
+ # Build evaluation context
551
+ context = {
552
+ "env_state": self.env_state,
553
+ "history": [msg.model_dump() for msg in self.history],
554
+ "events": [event.model_dump() for event in self.events],
555
+ }
556
+
557
+ try:
558
+ # Safe evaluation using restricted builtins
559
+ result = self._safe_eval(expr, context)
560
+ return result
561
+ except Exception as e:
562
+ return {"status": "error", "error": str(e)}
563
+
564
+ def _safe_eval(self, expr: str, context: dict[str, Any]) -> Any:
565
+ """Safely evaluate an expression with restricted scope.
566
+
567
+ Args:
568
+ expr: Expression to evaluate.
569
+ context: Variables available in expression.
570
+
571
+ Returns:
572
+ Result of evaluation.
573
+ """
574
+ # Restrict available builtins
575
+ safe_builtins = {
576
+ "True": True,
577
+ "False": False,
578
+ "None": None,
579
+ "len": len,
580
+ "str": str,
581
+ "int": int,
582
+ "float": float,
583
+ "bool": bool,
584
+ "list": list,
585
+ "dict": dict,
586
+ "sum": sum,
587
+ "min": min,
588
+ "max": max,
589
+ "abs": abs,
590
+ "round": round,
591
+ "any": any,
592
+ "all": all,
593
+ }
594
+
595
+ # Create restricted globals
596
+ safe_globals = {"__builtins__": safe_builtins}
597
+ safe_globals.update(context)
598
+
599
+ return eval(expr, safe_globals, {})