sandboxy-0.0.2-py3-none-any.whl → sandboxy-0.0.4-py3-none-any.whl

This diff shows the changes between two package versions that were publicly released to one of the supported registries. It is provided for informational purposes only and reflects the packages exactly as they appear in their respective public registries.
sandboxy/cli/main.py CHANGED
@@ -1,6 +1,5 @@
  """CLI entrypoint for Sandboxy."""

- import csv
  import json
  import os
  import sys
@@ -35,6 +34,59 @@ def main() -> None:
      pass


+ @main.command()
+ @click.argument("shell", type=click.Choice(["bash", "zsh", "fish"]), default="bash")
+ def completion(shell: str) -> None:
+     """Generate shell completion and show setup instructions.
+
+     Writes completion script to ~/.sandboxy-completion.<shell>
+     and shows the line to add to your shell config.
+
+     Examples:
+         sandboxy completion        # Generate bash completion
+         sandboxy completion zsh    # Generate zsh completion
+     """
+     import subprocess
+
+     home = Path.home()
+     ext = shell if shell != "bash" else "bash"
+     completion_file = home / f".sandboxy-completion.{ext}"
+
+     # Generate completion script using Click's built-in mechanism
+     env = os.environ.copy()
+     env["_SANDBOXY_COMPLETE"] = f"{shell}_source"
+
+     result = subprocess.run(  # noqa: S603
+         [sys.executable, "-m", "sandboxy.cli.main"],
+         env=env,
+         capture_output=True,
+         text=True,
+     )
+
+     # Write to file
+     completion_file.write_text(result.stdout)
+     click.echo(f"Generated: {completion_file}")
+     click.echo("")
+     click.echo("Add this line to your shell config:")
+     click.echo("")
+
+     if shell == "bash":
+         click.echo("# Sandboxy completion")
+         click.echo(f'. "{completion_file}"')
+         click.echo("")
+         click.echo("(Add to ~/.bashrc)")
+     elif shell == "zsh":
+         click.echo("# Sandboxy completion")
+         click.echo(f'. "{completion_file}"')
+         click.echo("")
+         click.echo("(Add to ~/.zshrc)")
+     elif shell == "fish":
+         click.echo("# Sandboxy completion")
+         click.echo(f'source "{completion_file}"')
+         click.echo("")
+         click.echo("(Add to ~/.config/fish/config.fish)")
+
+
  def _load_variables_from_env() -> dict:
      """Load variables from SANDBOXY_VARIABLES environment variable."""
      env_vars = os.environ.get("SANDBOXY_VARIABLES", "")
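
The new `completion` command builds on Click 8's completion hook: when a Click program starts with `_<PROG_NAME>_COMPLETE=<shell>_source` in its environment, it prints the completion script for that shell and exits instead of running normally. A minimal sketch of the same mechanism outside sandboxy (the `mytool` program and `mytool.cli` module are hypothetical):

    import os
    import subprocess
    import sys

    # Click 8.x: _<PROG_NAME>_COMPLETE=<shell>_source makes the CLI print its
    # completion script and exit instead of executing the command.
    env = os.environ.copy()
    env["_MYTOOL_COMPLETE"] = "zsh_source"  # or bash_source / fish_source

    script = subprocess.run(
        [sys.executable, "-m", "mytool.cli"],  # hypothetical entry module
        env=env,
        capture_output=True,
        text=True,
    ).stdout
    print(script)  # the script a user would source from their shell config
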
@@ -46,6 +98,322 @@ def _load_variables_from_env() -> dict:
      return {}


+ def _export_to_mlflow(
+     result: Any,
+     spec: Any,
+     scenario_path: Path,
+     mlflow_export: bool,
+     no_mlflow: bool,
+     mlflow_tracking_uri: str | None,
+     mlflow_experiment: str | None,
+     agent_name: str = "default",
+ ) -> None:
+     """Export scenario result to MLflow if enabled.
+
+     Args:
+         result: ScenarioResult from runner
+         spec: ScenarioSpec
+         scenario_path: Path to scenario file
+         mlflow_export: --mlflow-export flag
+         no_mlflow: --no-mlflow flag
+         mlflow_tracking_uri: --mlflow-tracking-uri value
+         mlflow_experiment: --mlflow-experiment value
+         agent_name: Agent configuration name
+     """
+     from sandboxy.mlflow.config import MLflowConfig
+
+     # Get YAML config from spec
+     yaml_config = None
+     if spec.mlflow:
+         yaml_config = {
+             "enabled": spec.mlflow.enabled,
+             "experiment": spec.mlflow.experiment,
+             "tracking_uri": spec.mlflow.tracking_uri,
+             "tags": spec.mlflow.tags,
+         }
+
+     # Resolve config with precedence
+     config = MLflowConfig.resolve(
+         cli_export=mlflow_export,
+         cli_no_mlflow=no_mlflow,
+         cli_tracking_uri=mlflow_tracking_uri,
+         cli_experiment=mlflow_experiment,
+         yaml_config=yaml_config,
+         scenario_name=spec.name,
+     )
+
+     if not config.enabled:
+         return
+
+     # Import and use exporter
+     try:
+         from sandboxy.mlflow.exporter import MLflowExporter
+
+         exporter = MLflowExporter(config)
+
+         # Convert ScenarioResult to RunResult-like for exporter
+         # ScenarioResult has different structure, create adapter
+         run_id = exporter.export(
+             result=_adapt_scenario_result(result),
+             scenario_path=scenario_path,
+             scenario_name=spec.name,
+             scenario_id=spec.id,
+             agent_name=agent_name,
+         )
+
+         if run_id:
+             click.echo(f"\nExported to MLflow: run_id={run_id}")
+
+     except ImportError:
+         click.echo(
+             "\nMLflow not installed. Install with: pip install sandboxy[mlflow]",
+             err=True,
+         )
+     except Exception as e:
+         click.echo(f"\nWarning: MLflow export failed: {e}", err=True)
+
+
+ def _adapt_scenario_result(result: Any) -> Any:
+     """Adapt ScenarioResult to RunResult-like interface for MLflowExporter.
+
+     The exporter expects RunResult fields, but ScenarioRunner returns ScenarioResult.
+     This creates an adapter object.
+     """
+     from dataclasses import dataclass, field
+
+     @dataclass
+     class GoalResultAdapter:
+         name: str
+         score: float
+         passed: bool = True
+
+     @dataclass
+     class EvaluationAdapter:
+         goals: list[GoalResultAdapter] = field(default_factory=list)
+         total_score: float = 0.0
+         max_score: float = 0.0
+         percentage: float = 0.0
+
+     @dataclass
+     class RunResultAdapter:
+         model: str = ""
+         error: str | None = None
+         latency_ms: int = 0
+         input_tokens: int = 0
+         output_tokens: int = 0
+         evaluation: EvaluationAdapter | None = None
+
+     # Extract data from ScenarioResult
+     adapter = RunResultAdapter(
+         model=getattr(result, "agent_id", "unknown"),
+         error=None,
+     )
+
+     # Build evaluation from goals
+     goals = []
+     total = 0.0
+     for goal_name in getattr(result, "goals_achieved", []):
+         goals.append(GoalResultAdapter(name=goal_name, score=1.0, passed=True))
+         total += 1.0
+
+     score = getattr(result, "score", 0.0)
+     max_score = max(score, len(goals)) if goals else score
+
+     adapter.evaluation = EvaluationAdapter(
+         goals=goals,
+         total_score=score,
+         max_score=max_score,
+         percentage=(score / max_score * 100) if max_score > 0 else 0.0,
+     )
+
+     return adapter
+
+
+ @main.command()
+ @click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
+ @click.option(
+     "--dir",
+     "-d",
+     "directory",
+     type=click.Path(path_type=Path),
+     default=None,
+     help="Directory to initialize (default: current directory)",
+ )
+ def init(with_examples: bool, directory: Path | None) -> None:
+     """Initialize a new Sandboxy project.
+
+     Creates the standard folder structure for scenarios, tools, agents, and datasets.
+
+     Examples:
+         sandboxy init
+         sandboxy init --with-examples
+         sandboxy init --dir my-project
+     """
+     root = directory or Path.cwd()
+
+     # Create directory if specified and doesn't exist
+     if directory and not root.exists():
+         root.mkdir(parents=True)
+         click.echo(f"Created directory: {root}")
+
+     # Standard folders
+     folders = ["scenarios", "tools", "agents", "datasets", "runs"]
+     created = []
+
+     for folder in folders:
+         folder_path = root / folder
+         if not folder_path.exists():
+             folder_path.mkdir(parents=True)
+             created.append(folder)
+
+     if created:
+         click.echo(f"Created folders: {', '.join(created)}")
+     else:
+         click.echo("All folders already exist")
+
+     # Create .env.example if it doesn't exist
+     env_example = root / ".env.example"
+     if not env_example.exists():
+         env_example.write_text(
+             """# Sandboxy Environment Variables
+ # Copy this to .env and fill in your API keys
+
+ # OpenRouter API key (recommended - access to 400+ models)
+ OPENROUTER_API_KEY=
+
+ # Or use direct provider keys
+ OPENAI_API_KEY=
+ ANTHROPIC_API_KEY=
+ """
+         )
+         click.echo("Created .env.example")
+
+     # Create .gitignore if it doesn't exist
+     gitignore = root / ".gitignore"
+     if not gitignore.exists():
+         gitignore.write_text(
+             """.env
+ runs/
+ __pycache__/
+ *.pyc
+ """
+         )
+         click.echo("Created .gitignore")
+
+     # Add examples if requested
+     if with_examples:
+         _create_example_files(root)
+
+     click.echo("")
+     click.echo("Project initialized! Next steps:")
+     click.echo(" 1. Copy .env.example to .env and add your API key")
+     click.echo(" 2. Create scenarios in scenarios/")
+     click.echo(" 3. Run: sandboxy open")
+
+
+ def _create_example_files(root: Path) -> None:
+     """Create example scenario and tool files."""
+     # Example scenario
+     example_scenario = root / "scenarios" / "hello-world.yml"
+     if not example_scenario.exists():
+         example_scenario.write_text(
+             """name: Hello World
+ description: A simple greeting scenario to test your setup
+
+ system_prompt: |
+   You are a friendly assistant. Greet the user warmly.
+
+ prompt: |
+   Hello! Can you introduce yourself?
+
+ evaluation:
+   goals:
+     - id: greeted
+       name: Greeted the user
+       description: The assistant should greet the user
+       outcome: true
+       check: "'hello' in response.lower() or 'hi' in response.lower()"
+ """
+         )
+         click.echo("Created scenarios/hello-world.yml")
+
+     # Example tool
+     example_tool = root / "tools" / "calculator.yml"
+     if not example_tool.exists():
+         example_tool.write_text(
+             """name: calculator
+ description: A simple calculator tool
+
+ tools:
+   calculator:
+     description: Perform basic math operations
+     actions:
+       add:
+         description: Add two numbers
+         parameters:
+           type: object
+           properties:
+             a:
+               type: number
+               description: First number
+             b:
+               type: number
+               description: Second number
+           required: [a, b]
+         returns:
+           result: "{{a}} + {{b}}"
+
+       multiply:
+         description: Multiply two numbers
+         parameters:
+           type: object
+           properties:
+             a:
+               type: number
+             b:
+               type: number
+           required: [a, b]
+         returns:
+           result: "{{a}} * {{b}}"
+ """
+         )
+         click.echo("Created tools/calculator.yml")
+
+     # Example scenario using the tool
+     tool_scenario = root / "scenarios" / "calculator-test.yml"
+     if not tool_scenario.exists():
+         tool_scenario.write_text(
+             """name: Calculator Test
+ description: Test the calculator tool
+
+ system_prompt: |
+   You are a helpful assistant with access to a calculator.
+   Use the calculator tool to perform math operations.
+
+ tools_from:
+   - calculator
+
+ prompt: |
+   What is 42 + 17?
+
+ evaluation:
+   goals:
+     - id: used_calculator
+       name: Used calculator
+       description: The agent should use the calculator tool
+       outcome: true
+       check: "any(tc.tool == 'calculator' for tc in tool_calls)"
+
+     - id: correct_answer
+       name: Correct answer
+       description: The response should contain 59
+       outcome: true
+       check: "'59' in response"
+ """
+         )
+         click.echo("Created scenarios/calculator-test.yml")
+
+
  @main.command()
  @click.argument("module_path", type=click.Path(exists=True))
  @click.option("--agent-id", "-a", help="Agent ID to use", default=None)
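
For orientation, a quick worked example of what `_adapt_scenario_result` produces, assuming a result object with the attribute names the `getattr` calls above look for. `SimpleNamespace` stands in for a real `ScenarioResult`, and importing the private helper is for illustration only:

    from types import SimpleNamespace

    from sandboxy.cli.main import _adapt_scenario_result  # private helper, illustration only

    fake = SimpleNamespace(
        agent_id="openai/gpt-4o",
        goals_achieved=["greeted", "stayed_polite"],  # two achieved goals
        score=1.5,
    )

    adapted = _adapt_scenario_result(fake)
    assert adapted.model == "openai/gpt-4o"
    assert adapted.evaluation.max_score == 2.0  # max(score, len(goals))
    assert round(adapted.evaluation.percentage) == 75  # 1.5 / 2.0 * 100
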
@@ -135,148 +503,6 @@ def validate(module_path: str) -> None:
      click.echo("Module is valid.")


- @main.command()
- @click.argument("module_path", type=click.Path(exists=True))
- @click.option("--agents", required=True, help="Comma-separated agent IDs")
- @click.option("--runs-per-agent", type=int, default=1, help="Number of runs per agent")
- @click.option("--output", "-o", type=click.Path(), default=None, help="Output CSV file")
- @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
- @click.option("--seed", type=int, default=None, help="Random seed for reproducibility")
- def bench(
-     module_path: str,
-     agents: str,
-     runs_per_agent: int,
-     output: str | None,
-     var: tuple[str, ...],
-     seed: int | None,
- ) -> None:
-     """Benchmark a module against multiple agents.
-
-     MODULE_PATH is the path to an MDL YAML file.
-
-     Examples:
-         sandboxy bench modules/lemonade.yml --agents gpt4,claude --runs 5
-         sandboxy bench modules/lemonade.yml --agents gpt4 -v difficulty=8 -v starting_cash=100
-     """
-     import random
-
-     # Set random seed for reproducibility
-     if seed is not None:
-         random.seed(seed)
-
-     try:
-         module = load_module(Path(module_path))
-     except MDLParseError as e:
-         click.echo(f"Error loading module: {e}", err=True)
-         sys.exit(1)
-
-     # Load variables from environment and CLI
-     variables = _load_variables_from_env()
-     for v in var:
-         if "=" in v:
-             name, value = v.split("=", 1)
-             try:
-                 variables[name] = json.loads(value)
-             except json.JSONDecodeError:
-                 variables[name] = value
-
-     # Apply variables to module
-     if variables:
-         module = apply_variables(module, variables)
-         click.echo(f"Variables: {variables}")
-
-     loader = AgentLoader(DEFAULT_AGENT_DIRS)
-     agent_ids = [a.strip() for a in agents.split(",")]
-
-     results: list[dict[str, str | float | int]] = []
-
-     for agent_id in agent_ids:
-         try:
-             agent = loader.load(agent_id)
-         except ValueError as e:
-             click.echo(f"Warning: Skipping agent {agent_id}: {e}", err=True)
-             continue
-
-         # Apply module's agent_config overrides
-         if module.agent_config:
-             if "system_prompt" in module.agent_config:
-                 agent.config.system_prompt = module.agent_config["system_prompt"]
-
-         click.echo(f"Benchmarking agent: {agent_id}")
-
-         for run_idx in range(runs_per_agent):
-             runner = Runner(module=module, agent=agent)
-             result = runner.run()
-
-             row: dict[str, str | float | int] = {
-                 "agent_id": agent_id,
-                 "run_idx": run_idx,
-                 "score": result.evaluation.score,
-                 "num_events": result.evaluation.num_events,
-                 "status": result.evaluation.status,
-             }
-
-             # Add seed if used for reproducibility tracking
-             if seed is not None:
-                 row["seed"] = seed
-
-             # Add env_state metrics if available
-             if "cash_balance" in runner.env_state:
-                 row["final_cash"] = runner.env_state["cash_balance"]
-             if "starting_cash" in module.environment.initial_state:
-                 initial = module.environment.initial_state["starting_cash"]
-                 if "final_cash" in row:
-                     row["profit"] = float(row["final_cash"]) - float(initial)
-
-             # Add all evaluation check results
-             for check_name, check_result in result.evaluation.checks.items():
-                 if isinstance(check_result, int | float | bool):
-                     row[f"check_{check_name}"] = check_result
-
-             results.append(row)
-             click.echo(f" Run {run_idx + 1}: score={result.evaluation.score:.2f}")
-
-     if not results:
-         click.echo("No results to report.", err=True)
-         sys.exit(1)
-
-     # Output results
-     if output:
-         fieldnames = list(results[0].keys())
-         with open(output, "w", newline="") as f:
-             writer = csv.DictWriter(f, fieldnames=fieldnames)
-             writer.writeheader()
-             writer.writerows(results)
-         click.echo(f"\nResults saved to: {output}")
-     else:
-         # Print summary table
-         click.echo("\nBenchmark Results:")
-         click.echo("-" * 60)
-
-         # Group by agent
-         from collections import defaultdict
-
-         by_agent: dict[str, list[dict[str, str | float | int]]] = defaultdict(list)
-         for r in results:
-             by_agent[str(r["agent_id"])].append(r)
-
-         for agent_id, runs in by_agent.items():
-             scores = [r["score"] for r in runs if isinstance(r["score"], int | float)]
-             avg_score = sum(scores) / len(scores) if scores else 0
-             click.echo(f"{agent_id}:")
-             click.echo(f" Runs: {len(runs)}")
-             click.echo(f" Avg Score: {avg_score:.3f}")
-             if "final_cash" in runs[0]:
-                 cash_values = [
-                     float(r["final_cash"])
-                     for r in runs
-                     if "final_cash" in r and isinstance(r["final_cash"], int | float)
-                 ]
-                 avg_cash = sum(cash_values) / len(cash_values) if cash_values else 0.0
-                 click.echo(f" Avg Final Cash: {avg_cash:.2f}")
-             click.echo("")
-
-
  @main.command()
  @click.option("--port", "-p", type=int, default=8000, help="Port to run server on")
  @click.option("--host", default="127.0.0.1", help="Host to bind to")
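
With `bench` gone (along with the `csv` import removed in the first hunk), multi-model comparison moves to the reworked `scenario -m ... -m ...` command below. For anyone who still wants the per-agent averages bench printed, the aggregation was a plain group-and-average; a self-contained sketch, where `rows` is a hypothetical list of per-run records shaped like the ones bench collected:

    from collections import defaultdict

    # Hypothetical per-run rows in the shape bench used to collect.
    rows = [
        {"agent_id": "gpt4", "score": 0.8},
        {"agent_id": "gpt4", "score": 0.6},
        {"agent_id": "claude", "score": 0.9},
    ]

    by_agent: dict[str, list[float]] = defaultdict(list)
    for row in rows:
        by_agent[row["agent_id"]].append(row["score"])

    for agent_id, scores in by_agent.items():
        print(f"{agent_id}: runs={len(scores)} avg={sum(scores) / len(scores):.3f}")
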
@@ -433,22 +659,54 @@ def info(module_path: str) -> None:
  @click.option(
      "--model",
      "-m",
-     help="Model to use (e.g., openai/gpt-4o, anthropic/claude-3.5-sonnet)",
-     default=None,
+     multiple=True,
+     help="Model(s) to use. Can specify multiple: -m gpt-4o -m claude-3.5-sonnet",
  )
  @click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
  @click.option("--output", "-o", help="Output file for results JSON", default=None)
  @click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
  @click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
  @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
+ @click.option(
+     "--mlflow-export",
+     is_flag=True,
+     help="Export run results to MLflow tracking server",
+ )
+ @click.option(
+     "--no-mlflow",
+     is_flag=True,
+     help="Disable MLflow export (overrides YAML config)",
+ )
+ @click.option(
+     "--mlflow-tracking-uri",
+     type=str,
+     default=None,
+     help="MLflow tracking server URI (overrides MLFLOW_TRACKING_URI env)",
+ )
+ @click.option(
+     "--mlflow-experiment",
+     type=str,
+     default=None,
+     help="MLflow experiment name (defaults to scenario name)",
+ )
+ @click.option(
+     "--mlflow-no-tracing",
+     is_flag=True,
+     help="Disable LLM call tracing (only log summary metrics)",
+ )
  def scenario(
      scenario_path: str,
-     model: str | None,
+     model: tuple[str, ...],
      agent_id: str | None,
      output: str | None,
      pretty: bool,
      max_turns: int,
      var: tuple[str, ...],
+     mlflow_export: bool,
+     no_mlflow: bool,
+     mlflow_tracking_uri: str | None,
+     mlflow_experiment: str | None,
+     mlflow_no_tracing: bool,
  ) -> None:
      """Run a scenario with YAML-defined tools.

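
The `-m/--model` change relies on Click's `multiple=True`, which turns the option into a tuple containing every occurrence on the command line (an empty tuple when the flag is absent). A minimal standalone sketch; the `demo` command is hypothetical:

    import click

    @click.command()
    @click.option("--model", "-m", multiple=True)
    def demo(model: tuple[str, ...]) -> None:
        click.echo(f"models: {list(model)}")

    # demo(["-m", "gpt-4o", "-m", "claude-3.5-sonnet"], standalone_mode=False)
    # -> models: ['gpt-4o', 'claude-3.5-sonnet']
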
@@ -459,8 +717,10 @@ def scenario(

      Examples:
          sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
-         sandboxy scenario scenarios/trolley.yml -m anthropic/claude-3.5-sonnet -p
+         sandboxy scenario scenarios/trolley.yml -m gpt-4o -m claude-3.5-sonnet  # multiple models
          sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
+         sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export
+         sandboxy scenario scenarios/test.yml -m gpt-4o -m gpt-4o-mini --mlflow-export  # compare models
      """
      from sandboxy.agents.base import AgentConfig
      from sandboxy.agents.llm_prompt import LlmPromptAgent
@@ -472,6 +732,26 @@ def scenario(
          click.echo(f"Error loading scenario: {e}", err=True)
          sys.exit(1)

+     # Build MLflow config if export requested
+     mlflow_config = None
+     if mlflow_export and not no_mlflow:
+         try:
+             from sandboxy.mlflow import MLflowConfig
+
+             mlflow_config = MLflowConfig.resolve(
+                 cli_export=True,
+                 cli_tracking_uri=mlflow_tracking_uri,
+                 cli_experiment=mlflow_experiment,
+                 cli_tracing=not mlflow_no_tracing,
+                 yaml_config=spec.mlflow.model_dump() if spec.mlflow else None,
+                 scenario_name=spec.name,
+             )
+             click.echo(f"MLflow enabled → experiment: {mlflow_config.experiment}")
+             if mlflow_config.tracing:
+                 click.echo(" Tracing: ON (LLM calls will be captured)")
+         except ImportError:
+             pass  # MLflow not installed
+
      # Parse and apply variables
      variables: dict[str, Any] = {}
      for v in var:
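
The option help texts pin down the intended precedence: CLI flags beat the scenario's YAML `mlflow:` block, which beats environment defaults such as `MLFLOW_TRACKING_URI`. `MLflowConfig.resolve` is sandboxy's own API; the sketch below only illustrates that merge order with plain values, and the `first_set` helper plus the URI are hypothetical:

    import os

    def first_set(cli_value: str | None, yaml_value: str | None, env_key: str) -> str | None:
        """Return the highest-precedence value: CLI, then YAML, then environment."""
        if cli_value is not None:
            return cli_value
        if yaml_value is not None:
            return yaml_value
        return os.environ.get(env_key)

    # No CLI flag given, so the YAML value wins over the environment variable.
    uri = first_set(None, "http://mlflow.internal:5000", "MLFLOW_TRACKING_URI")
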
@@ -487,27 +767,17 @@ def scenario(
          spec = apply_scenario_variables(spec, variables)
          click.echo(f"Variables: {variables}")

-     # Determine which agent to use
-     agent = None
+     # Build list of models to run
+     models_to_run: list[str] = []

      if model:
-         # Create ad-hoc agent from model string
-         config = AgentConfig(
-             id=model,
-             name=model.split("/")[-1] if "/" in model else model,
-             kind="llm-prompt",
-             model=model,
-             system_prompt="",
-             tools=[],
-             params={"temperature": 0.7, "max_tokens": 4096},
-             impl={},
-         )
-         agent = LlmPromptAgent(config)
+         models_to_run = list(model)
      elif agent_id:
          # Load from agent config files
          loader = AgentLoader(DEFAULT_AGENT_DIRS)
          try:
              agent = loader.load(agent_id)
+             models_to_run = [agent.config.model]
          except ValueError as e:
              click.echo(f"Error loading agent: {e}", err=True)
              sys.exit(1)
@@ -516,6 +786,7 @@ def scenario(
          loader = AgentLoader(DEFAULT_AGENT_DIRS)
          try:
              agent = loader.load_default()
+             models_to_run = [agent.config.model]
          except ValueError:
              click.echo("No model specified. Use -m to specify a model:", err=True)
              click.echo("", err=True)
@@ -528,25 +799,110 @@ def scenario(
          )
          sys.exit(1)

-     # Apply scenario's system prompt to agent
-     if spec.system_prompt:
-         agent.config.system_prompt = spec.system_prompt
-
      click.echo(f"Running scenario: {spec.name}")
-     click.echo(f"Using model: {agent.config.model}")
+     click.echo(f"Models: {', '.join(models_to_run)}")
      click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
+     if len(models_to_run) > 1:
+         click.echo("Running models in parallel...")
      click.echo("")

-     runner = ScenarioRunner(scenario=spec, agent=agent)
-     result = runner.run(max_turns=max_turns)
+     def run_single_model(model_id: str) -> dict[str, Any]:
+         """Run scenario with a single model, with MLflow tracing if enabled."""
+         agent_config = AgentConfig(
+             id=model_id,
+             name=model_id.split("/")[-1] if "/" in model_id else model_id,
+             kind="llm-prompt",
+             model=model_id,
+             system_prompt=spec.system_prompt or "",
+             tools=[],
+             params={"temperature": 0.7, "max_tokens": 4096},
+             impl={},
+         )
+         agent = LlmPromptAgent(agent_config)
+
+         # If MLflow enabled, wrap execution in run context so traces are connected
+         if mlflow_config and mlflow_config.enabled:
+             from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+             from sandboxy.mlflow.tracing import enable_tracing
+
+             # Enable tracing before the run starts
+             if mlflow_config.tracing:
+                 enable_tracing(
+                     tracking_uri=mlflow_config.tracking_uri,
+                     experiment_name=mlflow_config.experiment,
+                 )
+
+             # Start run, execute scenario, then log metrics - all connected
+             with mlflow_run_context(mlflow_config, run_name=model_id) as run_id:
+                 runner = ScenarioRunner(scenario=spec, agent=agent)
+                 result = runner.run(max_turns=max_turns)
+
+                 # Log metrics to the active run (traces are already attached)
+                 if run_id:
+                     exporter = MLflowExporter(mlflow_config)
+                     exporter.log_to_active_run(
+                         result=result,
+                         scenario_path=Path(scenario_path),
+                         scenario_name=spec.name,
+                         scenario_id=spec.id,
+                         agent_name=agent.config.name,
+                     )
+
+             return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+         # No MLflow - just run scenario
+         runner = ScenarioRunner(scenario=spec, agent=agent)
+         result = runner.run(max_turns=max_turns)
+         return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+     # Run models in parallel if multiple, otherwise just run single
+     results: list[Any] = []
+     if len(models_to_run) == 1:
+         results = [run_single_model(models_to_run[0])]
+     else:
+         from concurrent.futures import ThreadPoolExecutor, as_completed
+
+         with ThreadPoolExecutor(max_workers=len(models_to_run)) as executor:
+             futures = {executor.submit(run_single_model, m): m for m in models_to_run}
+             for future in as_completed(futures):
+                 model_id = futures[future]
+                 try:
+                     result_data = future.result()
+                     results.append(result_data)
+                     click.echo(f"✓ Completed: {model_id}")
+                 except Exception as e:
+                     click.echo(f"✗ Failed: {model_id} - {e}", err=True)
+         click.echo("")

-     if output:
-         Path(output).write_text(result.to_json(indent=2))
-         click.echo(f"\nResults saved to: {output}")
-     elif pretty:
-         click.echo(result.pretty())
+     # Output results
+     if len(results) == 1:
+         result = results[0]["result"]
+         if output:
+             Path(output).write_text(result.to_json(indent=2))
+             click.echo(f"\nResults saved to: {output}")
+         elif pretty:
+             click.echo(result.pretty())
+         else:
+             click.echo(result.to_json(indent=2))
      else:
-         click.echo(result.to_json(indent=2))
+         # Multiple models - show summary
+         # Get max_score from spec (scoring config or sum of goal points)
+         max_score = spec.scoring.get("max_score", 0) if spec.scoring else 0
+         if not max_score and spec.goals:
+             max_score = sum(g.points for g in spec.goals)
+
+         click.echo("=== Results Summary ===")
+         for r in results:
+             model_name = r["model"]
+             res = r["result"]
+             score = getattr(res, "score", 0) or 0
+             pct = (score / max_score * 100) if max_score > 0 else 0
+             click.echo(f" {model_name}: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")
+
+         if output:
+             all_results = [{"model": r["model"], "result": r["result"].to_dict()} for r in results]
+             Path(output).write_text(json.dumps(all_results, indent=2))
+             click.echo(f"\nResults saved to: {output}")


  @main.command()
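
The fan-out above is the standard `concurrent.futures` pattern: one worker thread per model, results collected in completion order so a slow model does not block reporting on the others. A self-contained sketch with a stand-in `run` function:

    from concurrent.futures import ThreadPoolExecutor, as_completed

    def run(model_id: str) -> dict:
        # Stand-in for run_single_model(): pretend each model scores 1.0.
        return {"model": model_id, "score": 1.0}

    models = ["gpt-4o", "claude-3.5-sonnet"]
    with ThreadPoolExecutor(max_workers=len(models)) as executor:
        futures = {executor.submit(run, m): m for m in models}
        for future in as_completed(futures):
            model_id = futures[future]
            try:
                print("done:", future.result())
            except Exception as exc:  # one failed model should not sink the batch
                print(f"failed: {model_id} - {exc}")
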