sandboxy 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/__init__.py +3 -0
- sandboxy/agents/__init__.py +21 -0
- sandboxy/agents/base.py +66 -0
- sandboxy/agents/llm_prompt.py +308 -0
- sandboxy/agents/loader.py +222 -0
- sandboxy/api/__init__.py +5 -0
- sandboxy/api/app.py +76 -0
- sandboxy/api/routes/__init__.py +1 -0
- sandboxy/api/routes/agents.py +92 -0
- sandboxy/api/routes/local.py +1388 -0
- sandboxy/api/routes/tools.py +106 -0
- sandboxy/cli/__init__.py +1 -0
- sandboxy/cli/main.py +1196 -0
- sandboxy/cli/type_detector.py +48 -0
- sandboxy/config.py +49 -0
- sandboxy/core/__init__.py +1 -0
- sandboxy/core/async_runner.py +824 -0
- sandboxy/core/mdl_parser.py +441 -0
- sandboxy/core/runner.py +599 -0
- sandboxy/core/safe_eval.py +165 -0
- sandboxy/core/state.py +234 -0
- sandboxy/datasets/__init__.py +20 -0
- sandboxy/datasets/loader.py +193 -0
- sandboxy/datasets/runner.py +442 -0
- sandboxy/errors.py +166 -0
- sandboxy/local/context.py +235 -0
- sandboxy/local/results.py +173 -0
- sandboxy/logging.py +31 -0
- sandboxy/mcp/__init__.py +25 -0
- sandboxy/mcp/client.py +360 -0
- sandboxy/mcp/wrapper.py +99 -0
- sandboxy/providers/__init__.py +34 -0
- sandboxy/providers/anthropic_provider.py +271 -0
- sandboxy/providers/base.py +123 -0
- sandboxy/providers/http_client.py +101 -0
- sandboxy/providers/openai_provider.py +282 -0
- sandboxy/providers/openrouter.py +958 -0
- sandboxy/providers/registry.py +199 -0
- sandboxy/scenarios/__init__.py +11 -0
- sandboxy/scenarios/comparison.py +491 -0
- sandboxy/scenarios/loader.py +262 -0
- sandboxy/scenarios/runner.py +468 -0
- sandboxy/scenarios/unified.py +1434 -0
- sandboxy/session/__init__.py +21 -0
- sandboxy/session/manager.py +278 -0
- sandboxy/tools/__init__.py +34 -0
- sandboxy/tools/base.py +127 -0
- sandboxy/tools/loader.py +270 -0
- sandboxy/tools/yaml_tools.py +708 -0
- sandboxy/ui/__init__.py +27 -0
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +1 -0
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +347 -0
- sandboxy/ui/dist/index.html +14 -0
- sandboxy/utils/__init__.py +3 -0
- sandboxy/utils/time.py +20 -0
- sandboxy-0.0.1.dist-info/METADATA +241 -0
- sandboxy-0.0.1.dist-info/RECORD +60 -0
- sandboxy-0.0.1.dist-info/WHEEL +4 -0
- sandboxy-0.0.1.dist-info/entry_points.txt +3 -0
- sandboxy-0.0.1.dist-info/licenses/LICENSE +201 -0
sandboxy/cli/main.py
ADDED
|
@@ -0,0 +1,1196 @@
|
|
|
1
|
+
"""CLI entrypoint for Sandboxy."""
|
|
2
|
+
|
|
3
|
+
import csv
|
|
4
|
+
import json
|
|
5
|
+
import os
|
|
6
|
+
import sys
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
from typing import Any
|
|
9
|
+
|
|
10
|
+
import click
|
|
11
|
+
from dotenv import load_dotenv
|
|
12
|
+
|
|
13
|
+
# Load .env from the current working directory, walking up through its parents.
# A bare load_dotenv() starts its search at the directory of the calling module
# (the installed package), not at the directory the CLI was launched from, so
# the search root is pinned to the cwd explicitly.
from dotenv import find_dotenv

load_dotenv(find_dotenv(usecwd=True))
# Also check ~/.sandboxy/.env; variables that are already set are not overridden.
load_dotenv(Path.home() / ".sandboxy" / ".env")
|
|
16
|
+
|
|
17
|
+
from sandboxy.agents.loader import AgentLoader
|
|
18
|
+
from sandboxy.core.mdl_parser import MDLParseError, apply_variables, load_module, validate_module
|
|
19
|
+
from sandboxy.core.runner import Runner
|
|
20
|
+
from sandboxy.scenarios.loader import load_scenario
|
|
21
|
+
from sandboxy.scenarios.runner import ScenarioRunner
|
|
22
|
+
from sandboxy.tools.loader import get_yaml_tool_libraries
|
|
23
|
+
|
|
24
|
+
# Directories handed to AgentLoader by the CLI commands in this module.
# The first two entries are relative paths; the last one lives under the
# user's home directory.
DEFAULT_AGENT_DIRS = [
    Path("agents/core"),
    Path("agents/community"),
    Path.home() / ".sandboxy" / "agents",
]
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
# Root command group; every subcommand in this module registers itself on it
# through ``@main.command()``.
@click.group()
@click.version_option(package_name="sandboxy")
def main() -> None:
    """Sandboxy CLI - run and validate agent simulations."""
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _load_variables_from_env() -> dict:
    """Load variables from the SANDBOXY_VARIABLES environment variable.

    The variable is expected to hold a JSON object, for example
    ``SANDBOXY_VARIABLES='{"difficulty": 8}'``.

    Returns:
        The decoded mapping, or an empty dict when the variable is unset,
        empty, not valid JSON, or valid JSON that is not an object.
    """
    raw = os.environ.get("SANDBOXY_VARIABLES", "")
    if not raw:
        return {}
    try:
        parsed = json.loads(raw)
    except json.JSONDecodeError:
        # Best-effort: a malformed value is treated as "no variables".
        return {}
    # Callers assign into the result by key, so a JSON list, number, string or
    # null would make them fail later; treat those as "no variables" too.
    return parsed if isinstance(parsed, dict) else {}
|
|
47
|
+
|
|
48
|
+
|
|
49
|
+
@main.command()
@click.argument("module_path", type=click.Path(exists=True))
@click.option("--agent-id", "-a", help="Agent ID to use", default=None)
@click.option("--output", "-o", help="Output file for replay JSON", default=None)
@click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
@click.option("--var", "-v", multiple=True, help="Variable in name=value format")
def run(
    module_path: str,
    agent_id: str | None,
    output: str | None,
    pretty: bool,
    var: tuple[str, ...],
) -> None:
    """Run a module with a given agent.

    MODULE_PATH is the path to an MDL YAML file.
    """
    # Exits with status 1 when the module or the agent cannot be loaded.
    try:
        module = load_module(Path(module_path))
    except MDLParseError as e:
        click.echo(f"Error loading module: {e}", err=True)
        sys.exit(1)

    # Variables from SANDBOXY_VARIABLES come first; --var entries override them by name.
    variables = _load_variables_from_env()
    for entry in var:
        name, sep, value = entry.partition("=")
        if not sep:
            # An entry without "=" cannot be assigned; say so instead of dropping it silently.
            click.echo(f"Warning: Ignoring variable '{entry}' (expected name=value)", err=True)
            continue
        # Try to parse as JSON for numbers/booleans; fall back to the raw string.
        try:
            variables[name] = json.loads(value)
        except json.JSONDecodeError:
            variables[name] = value

    # Apply variables to module
    module = apply_variables(module, variables)

    loader = AgentLoader(DEFAULT_AGENT_DIRS)

    try:
        agent = loader.load(agent_id) if agent_id else loader.load_default()
    except ValueError as e:
        click.echo(f"Error loading agent: {e}", err=True)
        sys.exit(1)

    # Apply module's agent_config overrides (only the system prompt is honoured here).
    if module.agent_config and "system_prompt" in module.agent_config:
        agent.config.system_prompt = module.agent_config["system_prompt"]

    click.echo(f"Running module: {module.id}")
    click.echo(f"Using agent: {agent.config.id}")
    if variables:
        click.echo(f"Variables: {variables}")
    click.echo("")

    runner = Runner(module=module, agent=agent)
    result = runner.run()

    # --output takes precedence over --pretty; the default is JSON on stdout.
    if output:
        # Write as UTF-8 explicitly so the file does not depend on the platform's default encoding.
        Path(output).write_text(result.to_json(indent=2), encoding="utf-8")
        click.echo(f"Results saved to: {output}")
    elif pretty:
        click.echo(result.pretty())
    else:
        click.echo(result.to_json(indent=2))
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
@main.command()
@click.argument("module_path", type=click.Path(exists=True))
def validate(module_path: str) -> None:
    """Validate an MDL module.

    MODULE_PATH is the path to an MDL YAML file.
    """
    problems = validate_module(Path(module_path))

    # Happy path first: nothing reported means the module is fine.
    if not problems:
        click.echo("Module is valid.")
        return

    # Otherwise list every problem on stderr and exit with status 1.
    click.echo("Module validation failed:", err=True)
    for problem in problems:
        click.echo(f" - {problem}", err=True)
    sys.exit(1)
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
@main.command()
@click.argument("module_path", type=click.Path(exists=True))
@click.option("--agents", required=True, help="Comma-separated agent IDs")
@click.option("--runs-per-agent", type=int, default=1, help="Number of runs per agent")
@click.option("--output", "-o", type=click.Path(), default=None, help="Output CSV file")
@click.option("--var", "-v", multiple=True, help="Variable in name=value format")
@click.option("--seed", type=int, default=None, help="Random seed for reproducibility")
def bench(
    module_path: str,
    agents: str,
    runs_per_agent: int,
    output: str | None,
    var: tuple[str, ...],
    seed: int | None,
) -> None:
    """Benchmark a module against multiple agents.

    MODULE_PATH is the path to an MDL YAML file.

    Examples:
        sandboxy bench modules/lemonade.yml --agents gpt4,claude --runs-per-agent 5
        sandboxy bench modules/lemonade.yml --agents gpt4 -v difficulty=8 -v starting_cash=100
    """
    import random
    from collections import defaultdict

    # Set random seed for reproducibility
    if seed is not None:
        random.seed(seed)

    try:
        module = load_module(Path(module_path))
    except MDLParseError as e:
        click.echo(f"Error loading module: {e}", err=True)
        sys.exit(1)

    # Variables from SANDBOXY_VARIABLES come first; --var entries override them by name.
    variables = _load_variables_from_env()
    for entry in var:
        name, sep, value = entry.partition("=")
        if not sep:
            # An entry without "=" cannot be assigned; say so instead of dropping it silently.
            click.echo(f"Warning: Ignoring variable '{entry}' (expected name=value)", err=True)
            continue
        # Try to parse as JSON for numbers/booleans; fall back to the raw string.
        try:
            variables[name] = json.loads(value)
        except json.JSONDecodeError:
            variables[name] = value

    # Apply variables to module
    if variables:
        module = apply_variables(module, variables)
        click.echo(f"Variables: {variables}")

    loader = AgentLoader(DEFAULT_AGENT_DIRS)
    agent_ids = [a.strip() for a in agents.split(",")]

    results: list[dict[str, str | float | int]] = []

    for agent_id in agent_ids:
        try:
            agent = loader.load(agent_id)
        except ValueError as e:
            # An agent that cannot be loaded is skipped; the others still run.
            click.echo(f"Warning: Skipping agent {agent_id}: {e}", err=True)
            continue

        # Apply module's agent_config overrides (only the system prompt is honoured here).
        if module.agent_config and "system_prompt" in module.agent_config:
            agent.config.system_prompt = module.agent_config["system_prompt"]

        click.echo(f"Benchmarking agent: {agent_id}")

        for run_idx in range(runs_per_agent):
            runner = Runner(module=module, agent=agent)
            result = runner.run()

            row: dict[str, str | float | int] = {
                "agent_id": agent_id,
                "run_idx": run_idx,
                "score": result.evaluation.score,
                "num_events": result.evaluation.num_events,
                "status": result.evaluation.status,
            }

            # Add seed if used for reproducibility tracking
            if seed is not None:
                row["seed"] = seed

            # Add env_state metrics if available
            if "cash_balance" in runner.env_state:
                row["final_cash"] = runner.env_state["cash_balance"]
            if "starting_cash" in module.environment.initial_state and "final_cash" in row:
                initial = module.environment.initial_state["starting_cash"]
                row["profit"] = float(row["final_cash"]) - float(initial)

            # Add all evaluation check results that are plain scalars.
            for check_name, check_result in result.evaluation.checks.items():
                if isinstance(check_result, int | float | bool):
                    row[f"check_{check_name}"] = check_result

            results.append(row)
            click.echo(f" Run {run_idx + 1}: score={result.evaluation.score:.2f}")

    if not results:
        click.echo("No results to report.", err=True)
        sys.exit(1)

    # Output results
    if output:
        # Rows can carry different optional columns (final_cash, profit, check_*),
        # so the header is the union of all keys in first-seen order. Taking it
        # from the first row alone makes DictWriter raise on any later extra key.
        fieldnames = list(dict.fromkeys(key for row in results for key in row))
        # Path.open is used on purpose: the bare name ``open`` in this module is
        # rebound to the ``open`` CLI command, not the builtin.
        with Path(output).open("w", newline="", encoding="utf-8") as f:
            writer = csv.DictWriter(f, fieldnames=fieldnames)
            writer.writeheader()
            writer.writerows(results)
        click.echo(f"\nResults saved to: {output}")
        return

    # Print summary table
    click.echo("\nBenchmark Results:")
    click.echo("-" * 60)

    # Group by agent
    by_agent: dict[str, list[dict[str, str | float | int]]] = defaultdict(list)
    for r in results:
        by_agent[str(r["agent_id"])].append(r)

    for bench_agent, runs in by_agent.items():
        scores = [r["score"] for r in runs if isinstance(r["score"], int | float)]
        avg_score = sum(scores) / len(scores) if scores else 0
        click.echo(f"{bench_agent}:")
        click.echo(f" Runs: {len(runs)}")
        click.echo(f" Avg Score: {avg_score:.3f}")
        if "final_cash" in runs[0]:
            cash_values = [
                float(r["final_cash"])
                for r in runs
                if "final_cash" in r and isinstance(r["final_cash"], int | float)
            ]
            avg_cash = sum(cash_values) / len(cash_values) if cash_values else 0.0
            click.echo(f" Avg Final Cash: {avg_cash:.2f}")
        click.echo("")
|
|
278
|
+
|
|
279
|
+
|
|
280
|
+
# NOTE(review): this function's name rebinds the module-level name ``open``,
# so other code in this module that calls ``open(...)`` gets this command
# object instead of the builtin.
@main.command()
@click.option("--port", "-p", type=int, default=8000, help="Port to run server on")
@click.option("--host", default="127.0.0.1", help="Host to bind to")
@click.option("--no-browser", is_flag=True, help="Don't open browser automatically")
def open(port: int, host: str, no_browser: bool) -> None:
    """Open the local Sandboxy UI.

    Starts the API server and opens the web interface in your browser.
    Loads scenarios, tools, and agents from the current working directory.

    Examples:
        sandboxy open
        sandboxy open --port 3000
        sandboxy open --no-browser
    """
    # Imported lazily so the other commands do not pay for the server stack.
    import threading
    import time
    import webbrowser

    import uvicorn

    from sandboxy.api.app import create_local_app

    root_dir = Path.cwd()
    # The UI build is looked up next to the package; None is passed when it is absent.
    bundled_ui = Path(__file__).parent.parent / "ui" / "dist"
    ui_dir = bundled_ui if bundled_ui.exists() else None
    app = create_local_app(root_dir, ui_dir)

    url = f"http://{host}:{port}"
    click.echo(f"Starting Sandboxy at {url}")
    click.echo(f"Working directory: {root_dir}")
    click.echo("")

    def _launch_browser_later() -> None:
        # Short delay before opening the page, since the server starts after this thread.
        time.sleep(1.5)
        webbrowser.open(url)

    if not no_browser:
        # Daemon thread: it must not keep the process alive once the server stops.
        threading.Thread(target=_launch_browser_later, daemon=True).start()

    # Blocks until the server is shut down.
    uvicorn.run(app, host=host, port=port, log_level="info")
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
@main.command()
def list_agents() -> None:
    """List available agents."""
    loader = AgentLoader(DEFAULT_AGENT_DIRS)
    found_ids = loader.list_ids()

    # Nothing found: show where we looked so the user can fix their layout.
    if not found_ids:
        click.echo("No agents found.")
        click.echo("Agent directories searched:")
        for directory in DEFAULT_AGENT_DIRS:
            click.echo(f" - {directory}")
        return

    click.echo("Available agents:")
    for agent_id in sorted(found_ids):
        config = loader.get_config(agent_id)
        if not config:
            # Ids without a config are left out of the listing.
            continue
        for line in (
            f" {agent_id}",
            f" Name: {config.name}",
            f" Model: {config.model}",
            "",
        ):
            click.echo(line)
|
|
348
|
+
|
|
349
|
+
|
|
350
|
+
@main.command()
@click.argument("module_path", type=click.Path(exists=True))
def info(module_path: str) -> None:
    """Show information about a module.

    MODULE_PATH is the path to an MDL YAML file.
    """
    try:
        module = load_module(Path(module_path))
    except MDLParseError as e:
        click.echo(f"Error loading module: {e}", err=True)
        sys.exit(1)

    say = click.echo
    env = module.environment

    # Header
    say(f"Module: {module.id}")
    say(f"Description: {module.description}")
    say("")

    # Environment and its tools
    say("Environment:")
    say(f" Sandbox Type: {env.sandbox_type}")
    say(f" Tools: {len(env.tools)}")
    for tool in env.tools:
        say(f" - {tool.name} ({tool.type})")
    say("")

    # Top-level steps
    say(f"Steps: {len(module.steps)}")
    for step in module.steps:
        say(f" - {step.id}: {step.action}")
    say("")

    # Named branches, each with its own step count
    say(f"Branches: {len(module.branches)}")
    for branch_name, branch_steps in module.branches.items():
        say(f" - {branch_name}: {len(branch_steps)} steps")
    say("")

    # Evaluation checks
    say(f"Evaluation Checks: {len(module.evaluation)}")
    for check in module.evaluation:
        say(f" - {check.name} ({check.kind})")
|
|
383
|
+
|
|
384
|
+
|
|
385
|
+
@main.command()
@click.argument("scenario_path", type=click.Path(exists=True))
@click.option(
    "--model",
    "-m",
    help="Model to use (e.g., openai/gpt-4o, anthropic/claude-3.5-sonnet)",
    default=None,
)
@click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
@click.option("--output", "-o", help="Output file for results JSON", default=None)
@click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
@click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
@click.option("--var", "-v", multiple=True, help="Variable in name=value format")
def scenario(
    scenario_path: str,
    model: str | None,
    agent_id: str | None,
    output: str | None,
    pretty: bool,
    max_turns: int,
    var: tuple[str, ...],
) -> None:
    """Run a scenario with YAML-defined tools.

    SCENARIO_PATH is the path to a scenario YAML file.

    Scenarios support YAML-defined tools that don't require Python code.
    Tools can be defined inline or loaded from tool libraries.

    Examples:
        sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
        sandboxy scenario scenarios/trolley.yml -m anthropic/claude-3.5-sonnet -p
        sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
    """
    from sandboxy.agents.base import AgentConfig
    from sandboxy.agents.llm_prompt import LlmPromptAgent
    from sandboxy.scenarios.loader import apply_scenario_variables

    # Exits with status 1 when the scenario or the agent cannot be loaded.
    try:
        spec = load_scenario(Path(scenario_path))
    except ValueError as e:
        click.echo(f"Error loading scenario: {e}", err=True)
        sys.exit(1)

    # Parse and apply variables (only --var is consulted by this command).
    variables: dict[str, Any] = {}
    for entry in var:
        name, sep, value = entry.partition("=")
        if not sep:
            # An entry without "=" cannot be assigned; say so instead of dropping it silently.
            click.echo(f"Warning: Ignoring variable '{entry}' (expected name=value)", err=True)
            continue
        # Try to parse as JSON for numbers/booleans; fall back to the raw string.
        try:
            variables[name] = json.loads(value)
        except json.JSONDecodeError:
            variables[name] = value

    if variables:
        spec = apply_scenario_variables(spec, variables)
        click.echo(f"Variables: {variables}")

    # Determine which agent to use: --model wins over --agent-id, which wins
    # over the loader's default agent.
    agent = None

    if model:
        # Create ad-hoc agent from model string
        config = AgentConfig(
            id=model,
            name=model.split("/")[-1] if "/" in model else model,
            kind="llm-prompt",
            model=model,
            system_prompt="",
            tools=[],
            params={"temperature": 0.7, "max_tokens": 4096},
            impl={},
        )
        agent = LlmPromptAgent(config)
    elif agent_id:
        # Load from agent config files
        loader = AgentLoader(DEFAULT_AGENT_DIRS)
        try:
            agent = loader.load(agent_id)
        except ValueError as e:
            click.echo(f"Error loading agent: {e}", err=True)
            sys.exit(1)
    else:
        # Try to load default, but give helpful message if none available
        loader = AgentLoader(DEFAULT_AGENT_DIRS)
        try:
            agent = loader.load_default()
        except ValueError:
            click.echo("No model specified. Use -m to specify a model:", err=True)
            click.echo("", err=True)
            click.echo(" sandboxy scenario <file> -m openai/gpt-4o", err=True)
            click.echo(" sandboxy scenario <file> -m anthropic/claude-3.5-sonnet", err=True)
            click.echo(" sandboxy scenario <file> -m google/gemini-2.0-flash-exp:free", err=True)
            click.echo("", err=True)
            click.echo(
                "Or set OPENROUTER_API_KEY and use any model from openrouter.ai/models", err=True
            )
            sys.exit(1)

    # Apply scenario's system prompt to agent
    if spec.system_prompt:
        agent.config.system_prompt = spec.system_prompt

    click.echo(f"Running scenario: {spec.name}")
    click.echo(f"Using model: {agent.config.model}")
    click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
    click.echo("")

    runner = ScenarioRunner(scenario=spec, agent=agent)
    result = runner.run(max_turns=max_turns)

    # --output takes precedence over --pretty; the default is JSON on stdout.
    if output:
        # Write as UTF-8 explicitly so the file does not depend on the platform's default encoding.
        Path(output).write_text(result.to_json(indent=2), encoding="utf-8")
        click.echo(f"\nResults saved to: {output}")
    elif pretty:
        click.echo(result.pretty())
    else:
        click.echo(result.to_json(indent=2))
|
|
504
|
+
|
|
505
|
+
|
|
506
|
+
@main.command()
def list_tools() -> None:
    """List available YAML tool libraries."""
    libraries = get_yaml_tool_libraries()

    # Pick the full set of lines to print up front, then emit them in order.
    if libraries:
        lines = ["Available YAML tool libraries:"]
        lines.extend(f" - {lib}" for lib in sorted(libraries))
        lines += [
            "",
            "Use in scenarios with:",
            " tools_from:",
            " - <library_name>",
        ]
    else:
        lines = [
            "No YAML tool libraries found.",
            "Tool directories searched:",
            " - tools/",
        ]

    for line in lines:
        click.echo(line)
|
|
525
|
+
|
|
526
|
+
|
|
527
|
+
# Common models for quick reference (Updated January 2026)
# Each entry is a (model_id, price_label, description) tuple. The price label is
# either the literal "Free" (which the --free filter matches on) or a display
# string such as "$0.15/M". The tier comments below refer to input-token pricing.
POPULAR_MODELS = [
    # Free models
    ("google/gemini-2.0-flash-exp:free", "Free", "Gemini 2.0 Flash - fast & free"),
    ("deepseek/deepseek-r1:free", "Free", "DeepSeek R1 - reasoning model"),
    ("meta-llama/llama-3.3-70b-instruct:free", "Free", "Llama 3.3 70B"),
    ("qwen/qwen-2.5-72b-instruct:free", "Free", "Qwen 2.5 72B"),
    # Budget models (< $0.50/M input)
    ("openai/gpt-4o-mini", "$0.15/M", "GPT-4o Mini"),
    ("openai/gpt-4.1-nano", "$0.10/M", "GPT-4.1 Nano"),
    ("openai/gpt-5-mini", "$0.30/M", "GPT-5 Mini - newest budget"),
    ("google/gemini-2.0-flash", "$0.10/M", "Gemini 2.0 Flash"),
    ("google/gemini-3-flash", "$0.30/M", "Gemini 3 Flash - newest"),
    ("x-ai/grok-4-fast", "$0.20/M", "Grok 4 Fast - 2M context"),
    ("deepseek/deepseek-chat", "$0.30/M", "DeepSeek V3"),
    ("anthropic/claude-3-haiku", "$0.25/M", "Claude 3 Haiku"),
    # Mid-tier models ($0.50 - $2.00/M input)
    ("anthropic/claude-haiku-4.5", "$1.00/M", "Claude Haiku 4.5 - newest fast"),
    ("openai/o3-mini", "$1.10/M", "o3 Mini - reasoning"),
    ("google/gemini-2.5-pro", "$1.25/M", "Gemini 2.5 Pro"),
    ("openai/gpt-5.1", "$1.25/M", "GPT-5.1"),
    ("openai/gpt-5.2", "$1.75/M", "GPT-5.2 - newest"),
    ("deepseek/deepseek-r1", "$0.70/M", "DeepSeek R1 - reasoning"),
    # Premium models ($2.00 - $5.00/M input)
    ("google/gemini-3-pro", "$2.00/M", "Gemini 3 Pro - newest"),
    ("openai/gpt-4.1", "$2.00/M", "GPT-4.1 - 1M context"),
    ("anthropic/claude-sonnet-4.5", "$3.00/M", "Claude Sonnet 4.5 - newest"),
    ("anthropic/claude-3.5-sonnet", "$3.00/M", "Claude 3.5 Sonnet"),
    ("x-ai/grok-4", "$3.00/M", "Grok 4 - 2M context"),
    ("openai/o1-mini", "$3.00/M", "o1 Mini - reasoning"),
    ("anthropic/claude-opus-4.5", "$5.00/M", "Claude Opus 4.5 - newest best"),
    # Frontier models (> $5.00/M input)
    ("openai/o1", "$15.00/M", "o1 - advanced reasoning"),
    ("openai/o3", "$20.00/M", "o3 - newest reasoning"),
    ("openai/gpt-5.2-pro", "$21.00/M", "GPT-5.2 Pro - maximum capability"),
    ("openai/o1-pro", "$150.00/M", "o1 Pro - extended thinking"),
]
|
|
564
|
+
|
|
565
|
+
|
|
566
|
+
@main.command()
@click.option("--fetch", "-f", is_flag=True, help="Fetch full list from OpenRouter API")
@click.option("--free", is_flag=True, help="Show only free models")
@click.option("--search", "-s", help="Search for models by name")
def list_models(fetch: bool, free: bool, search: str | None) -> None:
    """List available models from OpenRouter.

    By default shows popular models. Use --fetch to get the full list.

    Examples:
        sandboxy list-models
        sandboxy list-models --free
        sandboxy list-models --fetch --search claude
    """
    # --fetch is only honoured when an API key is configured; otherwise fall
    # back to the curated list and tell the user why.
    if fetch and not os.getenv("OPENROUTER_API_KEY", ""):
        click.echo("OPENROUTER_API_KEY not set. Showing popular models instead.", err=True)
        click.echo("")
        fetch = False

    show = _fetch_and_display_models if fetch else _display_popular_models
    show(free, search)
|
|
592
|
+
|
|
593
|
+
|
|
594
|
+
def _display_popular_models(free_only: bool, search: str | None) -> None:
    """Print the curated POPULAR_MODELS list, optionally filtered.

    Args:
        free_only: When true, only entries whose price label is "Free" are shown.
        search: Optional case-insensitive substring matched against the model
            id and the description; entries matching neither are skipped.
    """
    click.echo("Popular Models (via OpenRouter):")
    click.echo("")

    # Lower-case the search term once instead of on every entry.
    needle = search.lower() if search else None

    for model_id, price, desc in POPULAR_MODELS:
        if free_only and price != "Free":
            continue
        if needle and not (needle in model_id.lower() or needle in desc.lower()):
            continue

        click.echo(f" {model_id}")
        click.echo(f" {desc} [{price}]")
        click.echo("")

    # Fixed usage footer, printed regardless of how many entries matched.
    for line in (
        "Usage:",
        " sandboxy scenario <file> -m openai/gpt-4o-mini",
        "",
        "Set your API key:",
        " export OPENROUTER_API_KEY=sk-or-...",
        "",
        "Browse all models: https://openrouter.ai/models",
        "Use --fetch to query the full list from OpenRouter API",
    ):
        click.echo(line)
|
|
617
|
+
|
|
618
|
+
|
|
619
|
+
def _fetch_and_display_models(free_only: bool, search: str | None) -> None:
    """Fetch and display models from OpenRouter API.

    Requests ``https://openrouter.ai/api/v1/models`` (sending
    ``OPENROUTER_API_KEY`` as a bearer token when it is set), filters the
    result, sorts it cheapest-first and prints at most 50 entries.

    If ``httpx`` cannot be imported an install hint is printed and the
    function returns. If the request or JSON decoding fails, the error is
    reported and the curated list from ``_display_popular_models`` is shown
    instead.

    Args:
        free_only: When true, only models whose prompt price is exactly 0 are
            listed.
        search: Optional case-insensitive substring matched against the model
            id and the model name.
    """
    try:
        import httpx
    except ImportError:
        click.echo("httpx package required. Install with: pip install httpx", err=True)
        return

    api_key = os.getenv("OPENROUTER_API_KEY", "")

    try:
        with httpx.Client() as client:
            resp = client.get(
                "https://openrouter.ai/api/v1/models",
                headers={"Authorization": f"Bearer {api_key}"} if api_key else {},
                timeout=10.0,
            )
            resp.raise_for_status()
            data = resp.json()
    except Exception as e:
        # Top-level CLI boundary: any network/HTTP/JSON failure degrades to
        # the offline list instead of a traceback.
        click.echo(f"Error fetching models: {e}", err=True)
        click.echo("Falling back to popular models list.", err=True)
        click.echo("")
        _display_popular_models(free_only, search)
        return

    # Be tolerant of an unexpected payload shape: anything that is not a
    # {"data": [...]} object is treated as an empty catalogue.
    models = data.get("data") if isinstance(data, dict) else None
    if not isinstance(models, list):
        models = []

    needle = search.lower() if search else None

    # Filter and sort
    filtered = []
    for m in models:
        if not isinstance(m, dict):
            continue

        # ``or`` (rather than a .get default) also covers explicit nulls.
        model_id = str(m.get("id") or "")
        name = str(m.get("name") or model_id)

        pricing = m.get("pricing")
        raw_prompt = pricing.get("prompt") if isinstance(pricing, dict) else None

        # Price per million prompt tokens. A missing/null price counts as 0
        # (free), as before. A non-numeric or negative value is not a usable
        # price (presumably a sentinel for variable pricing - TODO confirm
        # against the API docs), so it is shown as "n/a" and sorted last
        # instead of crashing the command or printing a negative amount.
        try:
            prompt_price = float(raw_prompt or 0) * 1_000_000
        except (TypeError, ValueError):
            prompt_price = None
        if prompt_price is not None and prompt_price < 0:
            prompt_price = None

        is_free = prompt_price == 0

        if free_only and not is_free:
            continue

        if needle and needle not in model_id.lower() and needle not in name.lower():
            continue

        if is_free:
            price_label = "Free"
        elif prompt_price is None:
            price_label = "n/a"
        else:
            price_label = f"${prompt_price:.2f}/M"

        raw_context = m.get("context_length")
        context = raw_context if isinstance(raw_context, int) else 0

        filtered.append(
            {
                "id": model_id,
                "name": name,
                "price": price_label,
                # Numeric key kept alongside the label so sorting does not
                # have to parse the (rounded) display string back to a float.
                "sort_price": float("inf") if prompt_price is None else prompt_price,
                "context": context,
            }
        )

    # Sort by price (free first, then by cost, unknown prices last)
    filtered.sort(key=lambda entry: entry["sort_price"])

    click.echo(f"Models from OpenRouter ({len(filtered)} found):")
    click.echo("")

    for m in filtered[:50]:  # Limit output
        click.echo(f" {m['id']}")
        ctx = f"{m['context'] // 1000}k" if m["context"] >= 1000 else str(m["context"])
        click.echo(f" {m['name']} [{m['price']}] [ctx: {ctx}]")

    if len(filtered) > 50:
        click.echo(f" ... and {len(filtered) - 50} more")

    click.echo("")
    click.echo("Usage: sandboxy scenario <file> -m <model-id>")
|
|
694
|
+
|
|
695
|
+
|
|
696
|
+
# -----------------------------------------------------------------------------
|
|
697
|
+
# Scaffolding Commands
|
|
698
|
+
# -----------------------------------------------------------------------------
|
|
699
|
+
|
|
700
|
+
# YAML skeleton written by `sandboxy new scenario`. It is rendered with
# str.format(id=..., title=..., description=...), so literal braces that must
# survive into the generated file (tool placeholders such as {target}) are
# doubled. Nesting is expressed with two-space indentation; block scalars
# (`|`) need their body indented under the key to be valid YAML.
SCENARIO_TEMPLATE = """# {title}
# {description}

id: {id}
name: "{title}"
description: |
  {description}

category: general
tags:
  - example

# Import tools from libraries (optional)
# tools_from:
#   - my_tool_library

# Define inline tools
tools:
  check_status:
    description: "Check the current status"
    params:
      target:
        type: string
        required: true
        description: "What to check"
    returns: "Status of {{target}}: OK"

  perform_action:
    description: "Perform an action"
    params:
      action:
        type: string
        required: true
      confirm:
        type: boolean
        required: false
        default: false
    error_when: "confirm != true and confirm != True"
    returns_error: "Action requires confirmation. Set confirm=true."
    returns: "Action '{{action}}' completed successfully."
    side_effects:
      - set: "last_action"
        value: "{{action}}"
      - set: "action_confirmed"
        value: true

# Initial state for the scenario
initial_state:
  status: "nominal"
  alert_level: 0

# System prompt for the AI agent
system_prompt: |
  You are an AI assistant in this scenario.

  Use the available tools to:
  1. Assess the situation
  2. Take appropriate action
  3. Explain your reasoning

# Conversation flow
steps:
  - id: initial_prompt
    action: inject_user
    params:
      content: |
        Welcome to the scenario. What would you like to do?

  - id: agent_response
    action: await_agent

  # Add more steps as needed:
  # - id: followup
  #   action: inject_user
  #   params:
  #     content: "What's your next move?"
  #
  # - id: agent_followup
  #   action: await_agent

# Goals for scoring
goals:
  - id: checked_status
    name: "Checked Status"
    description: "Used check_status tool"
    points: 10
    detection:
      type: tool_called
      tool: check_status

  - id: took_action
    name: "Took Action"
    description: "Performed an action with confirmation"
    points: 20
    detection:
      type: env_state
      key: action_confirmed
      value: true

# Scoring configuration
scoring:
  max_score: 30
  # Optional formula: "checked_status + took_action"
"""
|
|
804
|
+
|
|
805
|
+
# YAML skeleton written by `sandboxy new tool`. It is rendered with
# str.format(name=..., title=..., description=...), so literal braces that
# must survive into the generated file (placeholders such as {item}) are
# doubled. Nesting is expressed with two-space indentation; the body of the
# `description: |` block scalar must be indented under its key.
TOOL_LIBRARY_TEMPLATE = """# {title}
# {description}

name: {name}
description: |
  {description}

tools:
  # Example: Simple tool with static return
  get_info:
    description: "Get information about something"
    params:
      item:
        type: string
        required: true
        description: "The item to get info about"
    returns: "Info for {{item}}: This is example data."

  # Example: Tool with state modification
  update_setting:
    description: "Update a setting value"
    params:
      key:
        type: string
        required: true
      value:
        type: string
        required: true
    returns: "Setting '{{key}}' updated to '{{value}}'."
    side_effects:
      - set: "setting_{{key}}"
        value: "{{value}}"

  # Example: Tool with confirmation requirement
  dangerous_action:
    description: "Perform a dangerous action (requires confirmation)"
    params:
      target:
        type: string
        required: true
      confirm:
        type: boolean
        required: true
        description: "Must be true to proceed"
    error_when: "confirm != true and confirm != True"
    returns_error: "This action requires confirmation. Set confirm=true to proceed."
    returns: "Dangerous action performed on {{target}}."
    side_effects:
      - set: "{{target}}_modified"
        value: true

  # Example: Tool with conditional returns
  check_status:
    description: "Check status of a system"
    params:
      system:
        type: string
        required: true
    returns:
      - when: "{{system}}_modified == true"
        value: "System {{system}}: MODIFIED - Changes pending"
      - when: "{{system}}_offline == true"
        value: "System {{system}}: OFFLINE"
      - when: "default"
        value: "System {{system}}: ONLINE - All systems nominal"

  # Example: Tool with enum constraint
  set_mode:
    description: "Set the operating mode"
    params:
      mode:
        type: string
        required: true
        enum: ["normal", "maintenance", "emergency"]
        description: "Operating mode"
    returns: "Mode set to: {{mode}}"
    side_effects:
      - set: "current_mode"
        value: "{{mode}}"
"""
|
|
885
|
+
|
|
886
|
+
|
|
887
|
+
@main.group()
def new() -> None:
    """Create new scenarios and tool libraries."""
    # Container group only: the subcommands attach themselves through
    # @new.command(...), so there is nothing to execute here. The docstring
    # above is the group's help text and already forms a valid body.
|
|
891
|
+
|
|
892
|
+
|
|
893
|
+
@new.command("scenario")
@click.argument("name")
@click.option("--title", "-t", help="Human-readable title", default=None)
@click.option("--description", "-d", help="Brief description", default="A new scenario")
@click.option("--output-dir", "-o", help="Output directory", default="scenarios")
def new_scenario(name: str, title: str | None, description: str, output_dir: str) -> None:
    """Create a new scenario stub.

    NAME is the scenario identifier (e.g., 'my-scenario' or 'trolley_problem').

    Examples:
    sandboxy new scenario my-test
    sandboxy new scenario data-center-fire -t "Data Center Fire" -d "Handle a fire emergency"
    """
    # NOTE: the docstring above is the command's --help text; developer notes
    # live in comments so the CLI output stays unchanged.
    #
    # Renders SCENARIO_TEMPLATE into <output_dir>/<name>.yml and exits with
    # status 1 (message on stderr) if that file already exists.

    # Normalize name: the id uses dashes, the file name uses underscores.
    scenario_id = name.lower().replace(" ", "-").replace("_", "-")
    filename = f"{scenario_id.replace('-', '_')}.yml"

    # Generate title if not provided. Empty fragments (from doubled
    # separators such as "a--b") are skipped so the title has no stray spaces.
    if title is None:
        title = " ".join(word.capitalize() for word in scenario_id.split("-") if word)

    # Create output directory if needed
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    file_path = output_path / filename

    # Generate content
    content = SCENARIO_TEMPLATE.format(
        id=scenario_id,
        title=title,
        description=description,
    )

    # Mode "x" creates the file exclusively, so an existing file can never be
    # overwritten, even if it appears between a check and the write. UTF-8 is
    # forced so non-ASCII titles/descriptions do not depend on the platform's
    # default encoding.
    try:
        with file_path.open("x", encoding="utf-8") as fh:
            fh.write(content)
    except FileExistsError:
        click.echo(f"Error: {file_path} already exists", err=True)
        sys.exit(1)

    click.echo(f"Created scenario: {file_path}")
    click.echo("")
    click.echo("Next steps:")
    click.echo(f" 1. Edit {file_path} to customize your scenario")
    click.echo(f" 2. Run: sandboxy scenario {file_path} -p")
|
|
938
|
+
|
|
939
|
+
|
|
940
|
+
@new.command("tool")
@click.argument("name")
@click.option("--title", "-t", help="Human-readable title", default=None)
@click.option("--description", "-d", help="Brief description", default="A collection of mock tools")
@click.option("--output-dir", "-o", help="Output directory", default="tools")
def new_tool(name: str, title: str | None, description: str, output_dir: str) -> None:
    """Create a new tool library stub.

    NAME is the library name (e.g., 'mock_hospital' or 'space-station').

    Examples:
    sandboxy new tool mock_hospital
    sandboxy new tool space-station -t "Space Station Tools" -d "Tools for space station scenarios"
    """
    # NOTE: the docstring above is the command's --help text; developer notes
    # live in comments so the CLI output stays unchanged.
    #
    # Renders TOOL_LIBRARY_TEMPLATE into <output_dir>/<lib_name>.yml and exits
    # with status 1 (message on stderr) if that file already exists.
    prefix = "mock_"

    # Normalize name - tool libraries use underscores by convention
    lib_name = name.lower().replace("-", "_").replace(" ", "_")

    # Ensure it starts with mock_ for clarity
    if not lib_name.startswith(prefix):
        lib_name = f"{prefix}{lib_name}"

    filename = f"{lib_name}.yml"

    # Generate title if not provided. Only the leading "mock_" is stripped:
    # removing every occurrence would mangle names that contain "mock_"
    # elsewhere (e.g. "mock_hospital_mock_data" must not become "Hospital Data").
    if title is None:
        base_name = lib_name[len(prefix):]
        title = " ".join(word.capitalize() for word in base_name.split("_") if word)
        title = f"Mock {title} Tools"

    # Create output directory if needed
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    file_path = output_path / filename

    # Generate content
    content = TOOL_LIBRARY_TEMPLATE.format(
        name=lib_name,
        title=title,
        description=description,
    )

    # Mode "x" creates the file exclusively, so an existing library can never
    # be overwritten, even if it appears between a check and the write. UTF-8
    # is forced so the output does not depend on the platform default encoding.
    try:
        with file_path.open("x", encoding="utf-8") as fh:
            fh.write(content)
    except FileExistsError:
        click.echo(f"Error: {file_path} already exists", err=True)
        sys.exit(1)

    click.echo(f"Created tool library: {file_path}")
    click.echo("")
    click.echo("Next steps:")
    click.echo(f" 1. Edit {file_path} to add your tools")
    click.echo(" 2. Use in scenarios with:")
    click.echo(" tools_from:")
    click.echo(f" - {lib_name}")
|
|
993
|
+
|
|
994
|
+
|
|
995
|
+
# -----------------------------------------------------------------------------
|
|
996
|
+
# MCP Commands
|
|
997
|
+
# -----------------------------------------------------------------------------
|
|
998
|
+
|
|
999
|
+
|
|
1000
|
+
@main.group()
def mcp() -> None:
    """MCP (Model Context Protocol) tools."""
    # Container group only: the subcommands attach themselves through
    # @mcp.command(...), so there is nothing to execute here. The docstring
    # above is the group's help text and already forms a valid body.
|
|
1004
|
+
|
|
1005
|
+
|
|
1006
|
+
@mcp.command("inspect")
@click.argument("target", required=False)
@click.option("--url", "-u", help="URL of remote MCP server (HTTP transport)")
@click.option("--args", "-a", "cmd_args", multiple=True, help="Arguments for local server command")
@click.option(
    "--header", "-H", "headers", multiple=True, help="HTTP headers (key:value) for remote servers"
)
@click.option(
    "--transport",
    "-t",
    type=click.Choice(["auto", "sse", "streamable_http"]),
    default="auto",
    help="HTTP transport type (default: auto-detect)",
)
def mcp_inspect(
    target: str | None,
    url: str | None,
    cmd_args: tuple[str, ...],
    headers: tuple[str, ...],
    transport: str,
) -> None:
    """Inspect an MCP server and list its available tools.

    For LOCAL servers (stdio), provide the command:
    sandboxy mcp inspect "npx -y @modelcontextprotocol/server-filesystem /tmp"

    For REMOTE servers (HTTP), use --url:
    sandboxy mcp inspect --url "https://example.com/mcp"
    sandboxy mcp inspect --url "https://example.com/sse" --transport sse
    sandboxy mcp inspect --url "https://api.example.com/mcp" -H "Authorization:Bearer token"
    """
    # NOTE: the docstring above is the command's --help text; developer notes
    # live in comments so the CLI output stays unchanged.
    #
    # Flow: validate input -> parse headers -> resolve the local command once
    # -> connect (remote when --url is given, otherwise local) -> print the
    # tools and a ready-to-paste `mcp_servers` snippet. Any failure while
    # connecting or listing exits with status 1.
    import asyncio
    import shlex

    if not target and not url:
        click.echo(
            "Error: Provide either a command (for local servers) or --url (for remote servers)",
            err=True,
        )
        sys.exit(1)

    # Parse headers into dict. Entries that are not "key:value" are reported
    # instead of being dropped silently; the raw text is deliberately not
    # echoed because it may contain a credential.
    headers_dict: dict[str, str] = {}
    for position, h in enumerate(headers, start=1):
        key, sep, value = h.partition(":")
        if not sep or not key.strip():
            click.echo(
                f"Warning: ignoring malformed --header #{position} (expected key:value)",
                err=True,
            )
            continue
        headers_dict[key.strip()] = value.strip()

    # Resolve the local command a single time so the connection step and the
    # usage snippet below always agree. --url takes precedence over TARGET.
    command = ""
    args: list[str] = []
    if not url:
        command = target or ""
        args = list(cmd_args)

        # Parse command if it's a single string with spaces
        if not args and " " in command:
            try:
                parts = shlex.split(command)
            except ValueError as e:
                # e.g. unbalanced quotes - a parsing problem, not a connection one.
                click.echo(f"Error: could not parse command: {e}", err=True)
                sys.exit(1)
            if not parts:
                click.echo("Error: command is empty", err=True)
                sys.exit(1)
            command, args = parts[0], parts[1:]

    async def _inspect_local(command: str, args: list[str]) -> list[dict[str, Any]]:
        # Imported lazily, inside the guarded call path, so an import failure
        # is reported like any other connection error.
        from sandboxy.mcp.client import inspect_mcp_server

        return await inspect_mcp_server(command, args)

    async def _inspect_remote(url: str) -> list[dict[str, Any]]:
        from sandboxy.mcp.client import inspect_mcp_server_http

        return await inspect_mcp_server_http(url, headers_dict if headers_dict else None, transport)  # type: ignore[arg-type]

    async def _inspect() -> None:
        try:
            if url:
                # Remote server
                click.echo(f"Connecting to remote MCP server: {url}")
                if headers_dict:
                    # Only header names are shown while connecting.
                    click.echo(f" Headers: {list(headers_dict.keys())}")
                click.echo(f" Transport: {transport}")
                click.echo("")
                tools = await _inspect_remote(url)
            else:
                # Local server
                click.echo(f"Connecting to local MCP server: {command} {' '.join(args)}")
                click.echo("")
                tools = await _inspect_local(command, args)

            if not tools:
                click.echo("No tools found.")
                return

            click.echo(f"Found {len(tools)} tool(s):")
            click.echo("")

            # Each tool is read as a dict with a "name" and optional
            # "description" / "parameters" entries.
            for tool in tools:
                click.echo(f" {tool['name']}")
                if tool.get("description"):
                    click.echo(f" {tool['description']}")

                params = tool.get("parameters", [])
                if params:
                    click.echo(" Parameters:")
                    for p in params:
                        req = " (required)" if p.get("required") else ""
                        desc = f" - {p.get('description')}" if p.get("description") else ""
                        click.echo(f" - {p['name']}: {p.get('type', 'any')}{req}{desc}")

                click.echo("")

            # Show usage example
            click.echo("Use in scenarios with:")
            click.echo(" mcp_servers:")
            click.echo(" - name: my_server")
            if url:
                click.echo(f" url: {url}")
                if headers_dict:
                    click.echo(" headers:")
                    for k, v in headers_dict.items():
                        click.echo(f" {k}: {v}")
                if transport != "auto":
                    click.echo(f" transport: {transport}")
            else:
                click.echo(f" command: {command}")
                if args:
                    click.echo(f" args: {args}")

        except Exception as e:
            # Top-level CLI boundary: report and exit non-zero rather than
            # dumping a traceback.
            click.echo(f"Error connecting to MCP server: {e}", err=True)
            sys.exit(1)

    asyncio.run(_inspect())
|
|
1139
|
+
|
|
1140
|
+
|
|
1141
|
+
@mcp.command("list-servers")
def mcp_list_servers() -> None:
    """List commonly used MCP servers.

    Shows a curated list of popular MCP servers that can be used with sandboxy.
    """
    # Static catalogue of (package name, short description, launch command).
    catalogue = (
        (
            "@modelcontextprotocol/server-filesystem",
            "File system access",
            "npx -y @modelcontextprotocol/server-filesystem <path>",
        ),
        (
            "@modelcontextprotocol/server-github",
            "GitHub API access",
            "npx -y @modelcontextprotocol/server-github",
        ),
        (
            "@modelcontextprotocol/server-postgres",
            "PostgreSQL database",
            "npx -y @modelcontextprotocol/server-postgres <connection-string>",
        ),
        (
            "@modelcontextprotocol/server-sqlite",
            "SQLite database",
            "npx -y @modelcontextprotocol/server-sqlite <db-path>",
        ),
        (
            "@modelcontextprotocol/server-brave-search",
            "Brave Search API",
            "npx -y @modelcontextprotocol/server-brave-search",
        ),
        (
            "@modelcontextprotocol/server-puppeteer",
            "Browser automation",
            "npx -y @modelcontextprotocol/server-puppeteer",
        ),
    )

    click.echo("Popular MCP Servers:")
    click.echo("")

    # One four-line entry per server: name, description, usage, blank line.
    for package, summary, launch in catalogue:
        for line in (f" {package}", f" {summary}", f" Usage: {launch}", ""):
            click.echo(line)

    closing_lines = (
        "Inspect a server's tools:",
        ' sandboxy mcp inspect "npx -y @modelcontextprotocol/server-filesystem /tmp"',
        "",
        "More servers: https://github.com/modelcontextprotocol/servers",
    )
    for line in closing_lines:
        click.echo(line)
|
|
1193
|
+
|
|
1194
|
+
|
|
1195
|
+
# Run the CLI only when this file is executed directly, not when it is imported.
if __name__ == "__main__":
    main()
|