sandboxy-0.0.2-py3-none-any.whl → sandboxy-0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/api/routes/local.py +182 -19
- sandboxy/cli/main.py +530 -174
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +439 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/METADATA +37 -1
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/RECORD +20 -13
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/licenses/LICENSE +0 -0
sandboxy/cli/main.py
CHANGED
@@ -1,6 +1,5 @@
 """CLI entrypoint for Sandboxy."""
 
-import csv
 import json
 import os
 import sys
@@ -35,6 +34,59 @@ def main() -> None:
     pass
 
 
+@main.command()
+@click.argument("shell", type=click.Choice(["bash", "zsh", "fish"]), default="bash")
+def completion(shell: str) -> None:
+    """Generate shell completion and show setup instructions.
+
+    Writes completion script to ~/.sandboxy-completion.<shell>
+    and shows the line to add to your shell config.
+
+    Examples:
+        sandboxy completion      # Generate bash completion
+        sandboxy completion zsh  # Generate zsh completion
+    """
+    import subprocess
+
+    home = Path.home()
+    ext = shell if shell != "bash" else "bash"
+    completion_file = home / f".sandboxy-completion.{ext}"
+
+    # Generate completion script using Click's built-in mechanism
+    env = os.environ.copy()
+    env["_SANDBOXY_COMPLETE"] = f"{shell}_source"
+
+    result = subprocess.run(  # noqa: S603
+        [sys.executable, "-m", "sandboxy.cli.main"],
+        env=env,
+        capture_output=True,
+        text=True,
+    )
+
+    # Write to file
+    completion_file.write_text(result.stdout)
+    click.echo(f"Generated: {completion_file}")
+    click.echo("")
+    click.echo("Add this line to your shell config:")
+    click.echo("")
+
+    if shell == "bash":
+        click.echo("# Sandboxy completion")
+        click.echo(f'. "{completion_file}"')
+        click.echo("")
+        click.echo("(Add to ~/.bashrc)")
+    elif shell == "zsh":
+        click.echo("# Sandboxy completion")
+        click.echo(f'. "{completion_file}"')
+        click.echo("")
+        click.echo("(Add to ~/.zshrc)")
+    elif shell == "fish":
+        click.echo("# Sandboxy completion")
+        click.echo(f'source "{completion_file}"')
+        click.echo("")
+        click.echo("(Add to ~/.config/fish/config.fish)")
+
+
 def _load_variables_from_env() -> dict:
     """Load variables from SANDBOXY_VARIABLES environment variable."""
     env_vars = os.environ.get("SANDBOXY_VARIABLES", "")
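
The completion command above relies on Click's shell-completion protocol: when the environment variable derived from the program name (here _SANDBOXY_COMPLETE) is set to "<shell>_source", Click prints the completion script instead of running the CLI. A minimal standalone sketch of that mechanism, assuming Click 8+ and the python -m sandboxy.cli.main invocation used in the diff:

    import os
    import subprocess
    import sys
    from pathlib import Path

    # Ask Click to emit the zsh completion script rather than execute the CLI.
    env = os.environ.copy()
    env["_SANDBOXY_COMPLETE"] = "zsh_source"

    proc = subprocess.run(
        [sys.executable, "-m", "sandboxy.cli.main"],
        env=env,
        capture_output=True,
        text=True,
    )

    # Same destination the command writes to; source this file from ~/.zshrc.
    target = Path.home() / ".sandboxy-completion.zsh"
    target.write_text(proc.stdout)
    print(f'. "{target}"')
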
@@ -46,6 +98,322 @@ def _load_variables_from_env() -> dict:
     return {}
 
 
+def _export_to_mlflow(
+    result: Any,
+    spec: Any,
+    scenario_path: Path,
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    agent_name: str = "default",
+) -> None:
+    """Export scenario result to MLflow if enabled.
+
+    Args:
+        result: ScenarioResult from runner
+        spec: ScenarioSpec
+        scenario_path: Path to scenario file
+        mlflow_export: --mlflow-export flag
+        no_mlflow: --no-mlflow flag
+        mlflow_tracking_uri: --mlflow-tracking-uri value
+        mlflow_experiment: --mlflow-experiment value
+        agent_name: Agent configuration name
+    """
+    from sandboxy.mlflow.config import MLflowConfig
+
+    # Get YAML config from spec
+    yaml_config = None
+    if spec.mlflow:
+        yaml_config = {
+            "enabled": spec.mlflow.enabled,
+            "experiment": spec.mlflow.experiment,
+            "tracking_uri": spec.mlflow.tracking_uri,
+            "tags": spec.mlflow.tags,
+        }
+
+    # Resolve config with precedence
+    config = MLflowConfig.resolve(
+        cli_export=mlflow_export,
+        cli_no_mlflow=no_mlflow,
+        cli_tracking_uri=mlflow_tracking_uri,
+        cli_experiment=mlflow_experiment,
+        yaml_config=yaml_config,
+        scenario_name=spec.name,
+    )
+
+    if not config.enabled:
+        return
+
+    # Import and use exporter
+    try:
+        from sandboxy.mlflow.exporter import MLflowExporter
+
+        exporter = MLflowExporter(config)
+
+        # Convert ScenarioResult to RunResult-like for exporter
+        # ScenarioResult has different structure, create adapter
+        run_id = exporter.export(
+            result=_adapt_scenario_result(result),
+            scenario_path=scenario_path,
+            scenario_name=spec.name,
+            scenario_id=spec.id,
+            agent_name=agent_name,
+        )
+
+        if run_id:
+            click.echo(f"\nExported to MLflow: run_id={run_id}")
+
+    except ImportError:
+        click.echo(
+            "\nMLflow not installed. Install with: pip install sandboxy[mlflow]",
+            err=True,
+        )
+    except Exception as e:
+        click.echo(f"\nWarning: MLflow export failed: {e}", err=True)
+
+
+def _adapt_scenario_result(result: Any) -> Any:
+    """Adapt ScenarioResult to RunResult-like interface for MLflowExporter.
+
+    The exporter expects RunResult fields, but ScenarioRunner returns ScenarioResult.
+    This creates an adapter object.
+    """
+    from dataclasses import dataclass, field
+
+    @dataclass
+    class GoalResultAdapter:
+        name: str
+        score: float
+        passed: bool = True
+
+    @dataclass
+    class EvaluationAdapter:
+        goals: list[GoalResultAdapter] = field(default_factory=list)
+        total_score: float = 0.0
+        max_score: float = 0.0
+        percentage: float = 0.0
+
+    @dataclass
+    class RunResultAdapter:
+        model: str = ""
+        error: str | None = None
+        latency_ms: int = 0
+        input_tokens: int = 0
+        output_tokens: int = 0
+        evaluation: EvaluationAdapter | None = None
+
+    # Extract data from ScenarioResult
+    adapter = RunResultAdapter(
+        model=getattr(result, "agent_id", "unknown"),
+        error=None,
+    )
+
+    # Build evaluation from goals
+    goals = []
+    total = 0.0
+    for goal_name in getattr(result, "goals_achieved", []):
+        goals.append(GoalResultAdapter(name=goal_name, score=1.0, passed=True))
+        total += 1.0
+
+    score = getattr(result, "score", 0.0)
+    max_score = max(score, len(goals)) if goals else score
+
+    adapter.evaluation = EvaluationAdapter(
+        goals=goals,
+        total_score=score,
+        max_score=max_score,
+        percentage=(score / max_score * 100) if max_score > 0 else 0.0,
+    )
+
+    return adapter
+
+
+@main.command()
+@click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
+@click.option(
+    "--dir",
+    "-d",
+    "directory",
+    type=click.Path(path_type=Path),
+    default=None,
+    help="Directory to initialize (default: current directory)",
+)
+def init(with_examples: bool, directory: Path | None) -> None:
+    """Initialize a new Sandboxy project.
+
+    Creates the standard folder structure for scenarios, tools, agents, and datasets.
+
+    Examples:
+        sandboxy init
+        sandboxy init --with-examples
+        sandboxy init --dir my-project
+    """
+    root = directory or Path.cwd()
+
+    # Create directory if specified and doesn't exist
+    if directory and not root.exists():
+        root.mkdir(parents=True)
+        click.echo(f"Created directory: {root}")
+
+    # Standard folders
+    folders = ["scenarios", "tools", "agents", "datasets", "runs"]
+    created = []
+
+    for folder in folders:
+        folder_path = root / folder
+        if not folder_path.exists():
+            folder_path.mkdir(parents=True)
+            created.append(folder)
+
+    if created:
+        click.echo(f"Created folders: {', '.join(created)}")
+    else:
+        click.echo("All folders already exist")
+
+    # Create .env.example if it doesn't exist
+    env_example = root / ".env.example"
+    if not env_example.exists():
+        env_example.write_text(
+            """# Sandboxy Environment Variables
+# Copy this to .env and fill in your API keys
+
+# OpenRouter API key (recommended - access to 400+ models)
+OPENROUTER_API_KEY=
+
+# Or use direct provider keys
+OPENAI_API_KEY=
+ANTHROPIC_API_KEY=
+"""
+        )
+        click.echo("Created .env.example")
+
+    # Create .gitignore if it doesn't exist
+    gitignore = root / ".gitignore"
+    if not gitignore.exists():
+        gitignore.write_text(
+            """.env
+runs/
+__pycache__/
+*.pyc
+"""
+        )
+        click.echo("Created .gitignore")
+
+    # Add examples if requested
+    if with_examples:
+        _create_example_files(root)
+
+    click.echo("")
+    click.echo("Project initialized! Next steps:")
+    click.echo(" 1. Copy .env.example to .env and add your API key")
+    click.echo(" 2. Create scenarios in scenarios/")
+    click.echo(" 3. Run: sandboxy open")
+
+
+def _create_example_files(root: Path) -> None:
+    """Create example scenario and tool files."""
+    # Example scenario
+    example_scenario = root / "scenarios" / "hello-world.yml"
+    if not example_scenario.exists():
+        example_scenario.write_text(
+            """name: Hello World
+description: A simple greeting scenario to test your setup
+
+system_prompt: |
+  You are a friendly assistant. Greet the user warmly.
+
+prompt: |
+  Hello! Can you introduce yourself?
+
+evaluation:
+  goals:
+    - id: greeted
+      name: Greeted the user
+      description: The assistant should greet the user
+      outcome: true
+      check: "'hello' in response.lower() or 'hi' in response.lower()"
+"""
+        )
+        click.echo("Created scenarios/hello-world.yml")
+
+    # Example tool
+    example_tool = root / "tools" / "calculator.yml"
+    if not example_tool.exists():
+        example_tool.write_text(
+            """name: calculator
+description: A simple calculator tool
+
+tools:
+  calculator:
+    description: Perform basic math operations
+    actions:
+      add:
+        description: Add two numbers
+        parameters:
+          type: object
+          properties:
+            a:
+              type: number
+              description: First number
+            b:
+              type: number
+              description: Second number
+          required: [a, b]
+        returns:
+          result: "{{a}} + {{b}}"
+
+      multiply:
+        description: Multiply two numbers
+        parameters:
+          type: object
+          properties:
+            a:
+              type: number
+            b:
+              type: number
+          required: [a, b]
+        returns:
+          result: "{{a}} * {{b}}"
+"""
+        )
+        click.echo("Created tools/calculator.yml")
+
+    # Example scenario using the tool
+    tool_scenario = root / "scenarios" / "calculator-test.yml"
+    if not tool_scenario.exists():
+        tool_scenario.write_text(
+            """name: Calculator Test
+description: Test the calculator tool
+
+system_prompt: |
+  You are a helpful assistant with access to a calculator.
+  Use the calculator tool to perform math operations.
+
+tools_from:
+  - calculator
+
+prompt: |
+  What is 42 + 17?
+
+evaluation:
+  goals:
+    - id: used_calculator
+      name: Used calculator
+      description: The agent should use the calculator tool
+      outcome: true
+      check: "any(tc.tool == 'calculator' for tc in tool_calls)"
+
+    - id: correct_answer
+      name: Correct answer
+      description: The response should contain 59
+      outcome: true
+      check: "'59' in response"
+"""
+        )
+        click.echo("Created scenarios/calculator-test.yml")
+
+
 @main.command()
 @click.argument("module_path", type=click.Path(exists=True))
 @click.option("--agent-id", "-a", help="Agent ID to use", default=None)
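
MLflowConfig.resolve(...) in the helper above merges CLI flags, the scenario's mlflow: YAML block, and defaults; its actual implementation lives in the new sandboxy/mlflow/config.py, which this excerpt does not show. A rough sketch of the precedence the call sites imply (--no-mlflow wins over everything, CLI values override YAML, and the experiment falls back to the scenario name), using hypothetical names rather than the real class:

    from dataclasses import dataclass


    @dataclass
    class ResolvedConfig:
        # Hypothetical shape; the real class is MLflowConfig in sandboxy/mlflow/config.py.
        enabled: bool
        experiment: str
        tracking_uri: str | None


    def resolve(
        cli_export: bool,
        cli_no_mlflow: bool,
        cli_tracking_uri: str | None,
        cli_experiment: str | None,
        yaml_config: dict | None,
        scenario_name: str,
    ) -> ResolvedConfig:
        yaml_config = yaml_config or {}
        # --no-mlflow always disables export; otherwise the CLI flag or YAML "enabled" turns it on.
        enabled = (cli_export or bool(yaml_config.get("enabled"))) and not cli_no_mlflow
        # CLI values take precedence over the YAML block; experiment defaults to the scenario name.
        experiment = cli_experiment or yaml_config.get("experiment") or scenario_name
        tracking_uri = cli_tracking_uri or yaml_config.get("tracking_uri")
        return ResolvedConfig(enabled=enabled, experiment=experiment, tracking_uri=tracking_uri)
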
@@ -135,148 +503,6 @@ def validate(module_path: str) -> None:
     click.echo("Module is valid.")
 
 
-@main.command()
-@click.argument("module_path", type=click.Path(exists=True))
-@click.option("--agents", required=True, help="Comma-separated agent IDs")
-@click.option("--runs-per-agent", type=int, default=1, help="Number of runs per agent")
-@click.option("--output", "-o", type=click.Path(), default=None, help="Output CSV file")
-@click.option("--var", "-v", multiple=True, help="Variable in name=value format")
-@click.option("--seed", type=int, default=None, help="Random seed for reproducibility")
-def bench(
-    module_path: str,
-    agents: str,
-    runs_per_agent: int,
-    output: str | None,
-    var: tuple[str, ...],
-    seed: int | None,
-) -> None:
-    """Benchmark a module against multiple agents.
-
-    MODULE_PATH is the path to an MDL YAML file.
-
-    Examples:
-        sandboxy bench modules/lemonade.yml --agents gpt4,claude --runs 5
-        sandboxy bench modules/lemonade.yml --agents gpt4 -v difficulty=8 -v starting_cash=100
-    """
-    import random
-
-    # Set random seed for reproducibility
-    if seed is not None:
-        random.seed(seed)
-
-    try:
-        module = load_module(Path(module_path))
-    except MDLParseError as e:
-        click.echo(f"Error loading module: {e}", err=True)
-        sys.exit(1)
-
-    # Load variables from environment and CLI
-    variables = _load_variables_from_env()
-    for v in var:
-        if "=" in v:
-            name, value = v.split("=", 1)
-            try:
-                variables[name] = json.loads(value)
-            except json.JSONDecodeError:
-                variables[name] = value
-
-    # Apply variables to module
-    if variables:
-        module = apply_variables(module, variables)
-        click.echo(f"Variables: {variables}")
-
-    loader = AgentLoader(DEFAULT_AGENT_DIRS)
-    agent_ids = [a.strip() for a in agents.split(",")]
-
-    results: list[dict[str, str | float | int]] = []
-
-    for agent_id in agent_ids:
-        try:
-            agent = loader.load(agent_id)
-        except ValueError as e:
-            click.echo(f"Warning: Skipping agent {agent_id}: {e}", err=True)
-            continue
-
-        # Apply module's agent_config overrides
-        if module.agent_config:
-            if "system_prompt" in module.agent_config:
-                agent.config.system_prompt = module.agent_config["system_prompt"]
-
-        click.echo(f"Benchmarking agent: {agent_id}")
-
-        for run_idx in range(runs_per_agent):
-            runner = Runner(module=module, agent=agent)
-            result = runner.run()
-
-            row: dict[str, str | float | int] = {
-                "agent_id": agent_id,
-                "run_idx": run_idx,
-                "score": result.evaluation.score,
-                "num_events": result.evaluation.num_events,
-                "status": result.evaluation.status,
-            }
-
-            # Add seed if used for reproducibility tracking
-            if seed is not None:
-                row["seed"] = seed
-
-            # Add env_state metrics if available
-            if "cash_balance" in runner.env_state:
-                row["final_cash"] = runner.env_state["cash_balance"]
-            if "starting_cash" in module.environment.initial_state:
-                initial = module.environment.initial_state["starting_cash"]
-                if "final_cash" in row:
-                    row["profit"] = float(row["final_cash"]) - float(initial)
-
-            # Add all evaluation check results
-            for check_name, check_result in result.evaluation.checks.items():
-                if isinstance(check_result, int | float | bool):
-                    row[f"check_{check_name}"] = check_result
-
-            results.append(row)
-            click.echo(f" Run {run_idx + 1}: score={result.evaluation.score:.2f}")
-
-    if not results:
-        click.echo("No results to report.", err=True)
-        sys.exit(1)
-
-    # Output results
-    if output:
-        fieldnames = list(results[0].keys())
-        with open(output, "w", newline="") as f:
-            writer = csv.DictWriter(f, fieldnames=fieldnames)
-            writer.writeheader()
-            writer.writerows(results)
-        click.echo(f"\nResults saved to: {output}")
-    else:
-        # Print summary table
-        click.echo("\nBenchmark Results:")
-        click.echo("-" * 60)
-
-        # Group by agent
-        from collections import defaultdict
-
-        by_agent: dict[str, list[dict[str, str | float | int]]] = defaultdict(list)
-        for r in results:
-            by_agent[str(r["agent_id"])].append(r)
-
-        for agent_id, runs in by_agent.items():
-            scores = [r["score"] for r in runs if isinstance(r["score"], int | float)]
-            avg_score = sum(scores) / len(scores) if scores else 0
-            click.echo(f"{agent_id}:")
-            click.echo(f" Runs: {len(runs)}")
-            click.echo(f" Avg Score: {avg_score:.3f}")
-            if "final_cash" in runs[0]:
-                cash_values = [
-                    float(r["final_cash"])
-                    for r in runs
-                    if "final_cash" in r and isinstance(r["final_cash"], int | float)
-                ]
-                avg_cash = sum(cash_values) / len(cash_values) if cash_values else 0.0
-                click.echo(f" Avg Final Cash: {avg_cash:.2f}")
-            click.echo("")
-
-
 @main.command()
 @click.option("--port", "-p", type=int, default=8000, help="Port to run server on")
 @click.option("--host", default="127.0.0.1", help="Host to bind to")
@@ -433,22 +659,54 @@ def info(module_path: str) -> None:
 @click.option(
     "--model",
     "-m",
-
-
+    multiple=True,
+    help="Model(s) to use. Can specify multiple: -m gpt-4o -m claude-3.5-sonnet",
 )
 @click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
 @click.option("--output", "-o", help="Output file for results JSON", default=None)
 @click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
 @click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
 @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
+@click.option(
+    "--mlflow-export",
+    is_flag=True,
+    help="Export run results to MLflow tracking server",
+)
+@click.option(
+    "--no-mlflow",
+    is_flag=True,
+    help="Disable MLflow export (overrides YAML config)",
+)
+@click.option(
+    "--mlflow-tracking-uri",
+    type=str,
+    default=None,
+    help="MLflow tracking server URI (overrides MLFLOW_TRACKING_URI env)",
+)
+@click.option(
+    "--mlflow-experiment",
+    type=str,
+    default=None,
+    help="MLflow experiment name (defaults to scenario name)",
+)
+@click.option(
+    "--mlflow-no-tracing",
+    is_flag=True,
+    help="Disable LLM call tracing (only log summary metrics)",
+)
 def scenario(
     scenario_path: str,
-    model: str
+    model: tuple[str, ...],
     agent_id: str | None,
     output: str | None,
     pretty: bool,
     max_turns: int,
     var: tuple[str, ...],
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    mlflow_no_tracing: bool,
 ) -> None:
     """Run a scenario with YAML-defined tools.
 
@@ -459,8 +717,10 @@ def scenario(
 
     Examples:
         sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
-        sandboxy scenario scenarios/trolley.yml -m
+        sandboxy scenario scenarios/trolley.yml -m gpt-4o -m claude-3.5-sonnet  # multiple models
         sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
+        sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export
+        sandboxy scenario scenarios/test.yml -m gpt-4o -m gpt-4o-mini --mlflow-export  # compare models
     """
     from sandboxy.agents.base import AgentConfig
     from sandboxy.agents.llm_prompt import LlmPromptAgent
@@ -472,6 +732,26 @@ def scenario(
         click.echo(f"Error loading scenario: {e}", err=True)
         sys.exit(1)
 
+    # Build MLflow config if export requested
+    mlflow_config = None
+    if mlflow_export and not no_mlflow:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=mlflow_tracking_uri,
+                cli_experiment=mlflow_experiment,
+                cli_tracing=not mlflow_no_tracing,
+                yaml_config=spec.mlflow.model_dump() if spec.mlflow else None,
+                scenario_name=spec.name,
+            )
+            click.echo(f"MLflow enabled → experiment: {mlflow_config.experiment}")
+            if mlflow_config.tracing:
+                click.echo(" Tracing: ON (LLM calls will be captured)")
+        except ImportError:
+            pass  # MLflow not installed
+
     # Parse and apply variables
     variables: dict[str, Any] = {}
     for v in var:
@@ -487,27 +767,17 @@ def scenario(
         spec = apply_scenario_variables(spec, variables)
         click.echo(f"Variables: {variables}")
 
-    #
-
+    # Build list of models to run
+    models_to_run: list[str] = []
 
     if model:
-
-        config = AgentConfig(
-            id=model,
-            name=model.split("/")[-1] if "/" in model else model,
-            kind="llm-prompt",
-            model=model,
-            system_prompt="",
-            tools=[],
-            params={"temperature": 0.7, "max_tokens": 4096},
-            impl={},
-        )
-        agent = LlmPromptAgent(config)
+        models_to_run = list(model)
     elif agent_id:
         # Load from agent config files
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load(agent_id)
+            models_to_run = [agent.config.model]
         except ValueError as e:
             click.echo(f"Error loading agent: {e}", err=True)
             sys.exit(1)
|
|
|
516
786
|
loader = AgentLoader(DEFAULT_AGENT_DIRS)
|
|
517
787
|
try:
|
|
518
788
|
agent = loader.load_default()
|
|
789
|
+
models_to_run = [agent.config.model]
|
|
519
790
|
except ValueError:
|
|
520
791
|
click.echo("No model specified. Use -m to specify a model:", err=True)
|
|
521
792
|
click.echo("", err=True)
|
|
@@ -528,25 +799,110 @@ def scenario(
             )
             sys.exit(1)
 
-    # Apply scenario's system prompt to agent
-    if spec.system_prompt:
-        agent.config.system_prompt = spec.system_prompt
-
     click.echo(f"Running scenario: {spec.name}")
-    click.echo(f"
+    click.echo(f"Models: {', '.join(models_to_run)}")
     click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
+    if len(models_to_run) > 1:
+        click.echo("Running models in parallel...")
     click.echo("")
 
-
-
+    def run_single_model(model_id: str) -> dict[str, Any]:
+        """Run scenario with a single model, with MLflow tracing if enabled."""
+        agent_config = AgentConfig(
+            id=model_id,
+            name=model_id.split("/")[-1] if "/" in model_id else model_id,
+            kind="llm-prompt",
+            model=model_id,
+            system_prompt=spec.system_prompt or "",
+            tools=[],
+            params={"temperature": 0.7, "max_tokens": 4096},
+            impl={},
+        )
+        agent = LlmPromptAgent(agent_config)
+
+        # If MLflow enabled, wrap execution in run context so traces are connected
+        if mlflow_config and mlflow_config.enabled:
+            from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            # Enable tracing before the run starts
+            if mlflow_config.tracing:
+                enable_tracing(
+                    tracking_uri=mlflow_config.tracking_uri,
+                    experiment_name=mlflow_config.experiment,
+                )
+
+            # Start run, execute scenario, then log metrics - all connected
+            with mlflow_run_context(mlflow_config, run_name=model_id) as run_id:
+                runner = ScenarioRunner(scenario=spec, agent=agent)
+                result = runner.run(max_turns=max_turns)
+
+                # Log metrics to the active run (traces are already attached)
+                if run_id:
+                    exporter = MLflowExporter(mlflow_config)
+                    exporter.log_to_active_run(
+                        result=result,
+                        scenario_path=Path(scenario_path),
+                        scenario_name=spec.name,
+                        scenario_id=spec.id,
+                        agent_name=agent.config.name,
+                    )
+
+            return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+        # No MLflow - just run scenario
+        runner = ScenarioRunner(scenario=spec, agent=agent)
+        result = runner.run(max_turns=max_turns)
+        return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+    # Run models in parallel if multiple, otherwise just run single
+    results: list[Any] = []
+    if len(models_to_run) == 1:
+        results = [run_single_model(models_to_run[0])]
+    else:
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        with ThreadPoolExecutor(max_workers=len(models_to_run)) as executor:
+            futures = {executor.submit(run_single_model, m): m for m in models_to_run}
+            for future in as_completed(futures):
+                model_id = futures[future]
+                try:
+                    result_data = future.result()
+                    results.append(result_data)
+                    click.echo(f"✓ Completed: {model_id}")
+                except Exception as e:
+                    click.echo(f"✗ Failed: {model_id} - {e}", err=True)
+    click.echo("")
 
-
-
-
-
-
+    # Output results
+    if len(results) == 1:
+        result = results[0]["result"]
+        if output:
+            Path(output).write_text(result.to_json(indent=2))
+            click.echo(f"\nResults saved to: {output}")
+        elif pretty:
+            click.echo(result.pretty())
+        else:
+            click.echo(result.to_json(indent=2))
     else:
-
+        # Multiple models - show summary
+        # Get max_score from spec (scoring config or sum of goal points)
+        max_score = spec.scoring.get("max_score", 0) if spec.scoring else 0
+        if not max_score and spec.goals:
+            max_score = sum(g.points for g in spec.goals)
+
+        click.echo("=== Results Summary ===")
+        for r in results:
+            model_name = r["model"]
+            res = r["result"]
+            score = getattr(res, "score", 0) or 0
+            pct = (score / max_score * 100) if max_score > 0 else 0
+            click.echo(f" {model_name}: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")
+
+        if output:
+            all_results = [{"model": r["model"], "result": r["result"].to_dict()} for r in results]
+            Path(output).write_text(json.dumps(all_results, indent=2))
+            click.echo(f"\nResults saved to: {output}")
 
 
 @main.command()
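
mlflow_run_context and enable_tracing used in run_single_model come from the new sandboxy/mlflow package, whose source is not included in this excerpt. Based on how they are called, the run-context helper is plausibly a thin wrapper over the standard MLflow client API, roughly along these lines (a sketch under that assumption, not the actual implementation):

    from contextlib import contextmanager
    from typing import Iterator

    import mlflow  # optional dependency: pip install sandboxy[mlflow]


    @contextmanager
    def mlflow_run_context(config, run_name: str) -> Iterator[str]:
        """Open an MLflow run so metrics and traces land in one place (sketch).

        `config` is assumed to carry the resolved tracking_uri and experiment name.
        """
        if config.tracking_uri:
            mlflow.set_tracking_uri(config.tracking_uri)
        mlflow.set_experiment(config.experiment)
        with mlflow.start_run(run_name=run_name) as run:
            # Yield the run id so the caller can log metrics against the active run.
            yield run.info.run_id

Keeping the scenario execution and the metric logging inside one active run is what lets the LLM traces and the summary metrics share a single run_id, which matches the "all connected" comment in the diff.
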