sandboxy 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
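
For reference, a minimal sketch of a request that exercises the new MLflow fields on RunScenarioRequest. Only the field names come from the hunks above; the values, the assumption that `scenario_id` and `model` are the remaining required fields, and the import location (the defining module's filename is not shown in this diff) are illustrative.

```python
# Sketch only: field names taken from the diff above, values made up.
# RunScenarioRequest is assumed to be importable from the API module this hunk belongs to.
req = RunScenarioRequest(
    scenario_id="trolley",
    model="openai/gpt-4o",
    max_turns=20,
    mlflow_export=True,
    mlflow_tracking_uri="http://localhost:5000",
    mlflow_experiment="trolley-evals",
    mlflow_tracing=True,
)
```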
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
            max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result
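
To make the per-expected-outcome metrics concrete, here is a small sketch of the computation from the hunk above, run on a hypothetical `by_expected` mapping (its `{expected: {"passed": n, "failed": m}}` shape is inferred from the loop; the outcome names and counts are made up):

```python
# Hypothetical by_expected payload; the pass_rate_<expected> naming mirrors the
# mlflow.log_metric call in the hunk above.
by_expected = {"approve": {"passed": 3, "failed": 1}, "deny": {"passed": 2, "failed": 0}}
for expected, counts in by_expected.items():
    total = counts.get("passed", 0) + counts.get("failed", 0)
    if total > 0:
        rate = counts.get("passed", 0) / total
        print(f"pass_rate_{expected} = {rate:.2f}")
# pass_rate_approve = 0.75
# pass_rate_deny = 1.00
```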
sandboxy/cli/main.py CHANGED
@@ -98,6 +98,137 @@ def _load_variables_from_env() -> dict:
     return {}
 
 
+def _export_to_mlflow(
+    result: Any,
+    spec: Any,
+    scenario_path: Path,
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    agent_name: str = "default",
+) -> None:
+    """Export scenario result to MLflow if enabled.
+
+    Args:
+        result: ScenarioResult from runner
+        spec: ScenarioSpec
+        scenario_path: Path to scenario file
+        mlflow_export: --mlflow-export flag
+        no_mlflow: --no-mlflow flag
+        mlflow_tracking_uri: --mlflow-tracking-uri value
+        mlflow_experiment: --mlflow-experiment value
+        agent_name: Agent configuration name
+    """
+    from sandboxy.mlflow.config import MLflowConfig
+
+    # Get YAML config from spec
+    yaml_config = None
+    if spec.mlflow:
+        yaml_config = {
+            "enabled": spec.mlflow.enabled,
+            "experiment": spec.mlflow.experiment,
+            "tracking_uri": spec.mlflow.tracking_uri,
+            "tags": spec.mlflow.tags,
+        }
+
+    # Resolve config with precedence
+    config = MLflowConfig.resolve(
+        cli_export=mlflow_export,
+        cli_no_mlflow=no_mlflow,
+        cli_tracking_uri=mlflow_tracking_uri,
+        cli_experiment=mlflow_experiment,
+        yaml_config=yaml_config,
+        scenario_name=spec.name,
+    )
+
+    if not config.enabled:
+        return
+
+    # Import and use exporter
+    try:
+        from sandboxy.mlflow.exporter import MLflowExporter
+
+        exporter = MLflowExporter(config)
+
+        # Convert ScenarioResult to RunResult-like for exporter
+        # ScenarioResult has different structure, create adapter
+        run_id = exporter.export(
+            result=_adapt_scenario_result(result),
+            scenario_path=scenario_path,
+            scenario_name=spec.name,
+            scenario_id=spec.id,
+            agent_name=agent_name,
+        )
+
+        if run_id:
+            click.echo(f"\nExported to MLflow: run_id={run_id}")
+
+    except ImportError:
+        click.echo(
+            "\nMLflow not installed. Install with: pip install sandboxy[mlflow]",
+            err=True,
+        )
+    except Exception as e:
+        click.echo(f"\nWarning: MLflow export failed: {e}", err=True)
+
+
+def _adapt_scenario_result(result: Any) -> Any:
+    """Adapt ScenarioResult to RunResult-like interface for MLflowExporter.
+
+    The exporter expects RunResult fields, but ScenarioRunner returns ScenarioResult.
+    This creates an adapter object.
+    """
+    from dataclasses import dataclass, field
+
+    @dataclass
+    class GoalResultAdapter:
+        name: str
+        score: float
+        passed: bool = True
+
+    @dataclass
+    class EvaluationAdapter:
+        goals: list[GoalResultAdapter] = field(default_factory=list)
+        total_score: float = 0.0
+        max_score: float = 0.0
+        percentage: float = 0.0
+
+    @dataclass
+    class RunResultAdapter:
+        model: str = ""
+        error: str | None = None
+        latency_ms: int = 0
+        input_tokens: int = 0
+        output_tokens: int = 0
+        evaluation: EvaluationAdapter | None = None
+
+    # Extract data from ScenarioResult
+    adapter = RunResultAdapter(
+        model=getattr(result, "agent_id", "unknown"),
+        error=None,
+    )
+
+    # Build evaluation from goals
+    goals = []
+    total = 0.0
+    for goal_name in getattr(result, "goals_achieved", []):
+        goals.append(GoalResultAdapter(name=goal_name, score=1.0, passed=True))
+        total += 1.0
+
+    score = getattr(result, "score", 0.0)
+    max_score = max(score, len(goals)) if goals else score
+
+    adapter.evaluation = EvaluationAdapter(
+        goals=goals,
+        total_score=score,
+        max_score=max_score,
+        percentage=(score / max_score * 100) if max_score > 0 else 0.0,
+    )
+
+    return adapter
+
+
 @main.command()
 @click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
 @click.option(
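
A quick sketch of what `_adapt_scenario_result` yields for a hypothetical ScenarioResult-like object. Only the attribute names `agent_id`, `goals_achieved`, and `score` are taken from the adapter's `getattr` calls above; the goal names and score are made up.

```python
from sandboxy.cli.main import _adapt_scenario_result  # module shown in this diff

# Hypothetical stand-in for a ScenarioResult.
class FakeScenarioResult:
    agent_id = "openai/gpt-4o"
    goals_achieved = ["asked_clarifying_question", "refused_harmful_request"]
    score = 2.0

adapted = _adapt_scenario_result(FakeScenarioResult())
print(adapted.model)                  # openai/gpt-4o
print(len(adapted.evaluation.goals))  # 2
print(adapted.evaluation.percentage)  # 100.0 (score 2.0 against max_score 2.0)
```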
@@ -528,22 +659,54 @@ def info(module_path: str) -> None:
 @click.option(
     "--model",
     "-m",
-    help="Model to use (e.g., openai/gpt-4o, anthropic/claude-3.5-sonnet)",
-    default=None,
+    multiple=True,
+    help="Model(s) to use. Can specify multiple: -m gpt-4o -m claude-3.5-sonnet",
 )
 @click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
 @click.option("--output", "-o", help="Output file for results JSON", default=None)
 @click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
 @click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
 @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
+@click.option(
+    "--mlflow-export",
+    is_flag=True,
+    help="Export run results to MLflow tracking server",
+)
+@click.option(
+    "--no-mlflow",
+    is_flag=True,
+    help="Disable MLflow export (overrides YAML config)",
+)
+@click.option(
+    "--mlflow-tracking-uri",
+    type=str,
+    default=None,
+    help="MLflow tracking server URI (overrides MLFLOW_TRACKING_URI env)",
+)
+@click.option(
+    "--mlflow-experiment",
+    type=str,
+    default=None,
+    help="MLflow experiment name (defaults to scenario name)",
+)
+@click.option(
+    "--mlflow-no-tracing",
+    is_flag=True,
+    help="Disable LLM call tracing (only log summary metrics)",
+)
 def scenario(
     scenario_path: str,
-    model: str | None,
+    model: tuple[str, ...],
     agent_id: str | None,
     output: str | None,
     pretty: bool,
     max_turns: int,
     var: tuple[str, ...],
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    mlflow_no_tracing: bool,
 ) -> None:
     """Run a scenario with YAML-defined tools.
 
@@ -554,8 +717,10 @@ def scenario(
 
     Examples:
         sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
-        sandboxy scenario scenarios/trolley.yml -m anthropic/claude-3.5-sonnet -p
+        sandboxy scenario scenarios/trolley.yml -m gpt-4o -m claude-3.5-sonnet  # multiple models
         sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
+        sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export
+        sandboxy scenario scenarios/test.yml -m gpt-4o -m gpt-4o-mini --mlflow-export  # compare models
     """
     from sandboxy.agents.base import AgentConfig
     from sandboxy.agents.llm_prompt import LlmPromptAgent
@@ -567,6 +732,26 @@ def scenario(
         click.echo(f"Error loading scenario: {e}", err=True)
         sys.exit(1)
 
+    # Build MLflow config if export requested
+    mlflow_config = None
+    if mlflow_export and not no_mlflow:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=mlflow_tracking_uri,
+                cli_experiment=mlflow_experiment,
+                cli_tracing=not mlflow_no_tracing,
+                yaml_config=spec.mlflow.model_dump() if spec.mlflow else None,
+                scenario_name=spec.name,
+            )
+            click.echo(f"MLflow enabled → experiment: {mlflow_config.experiment}")
+            if mlflow_config.tracing:
+                click.echo(" Tracing: ON (LLM calls will be captured)")
+        except ImportError:
+            pass  # MLflow not installed
+
     # Parse and apply variables
     variables: dict[str, Any] = {}
     for v in var:
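
For reference, a sketch of the shape that `spec.mlflow.model_dump()` would feed into `MLflowConfig.resolve` as `yaml_config`. The key names are taken from the dict built in `_export_to_mlflow` earlier in this diff; the values are illustrative, and per the `--no-mlflow` help text the CLI flags take precedence over this block.

```python
# Illustrative only: what a scenario's mlflow block might dump to.
yaml_config = {
    "enabled": True,
    "experiment": "trolley-evals",
    "tracking_uri": "http://localhost:5000",
    "tags": {"team": "safety"},
}
```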
@@ -582,27 +767,17 @@ def scenario(
         spec = apply_scenario_variables(spec, variables)
         click.echo(f"Variables: {variables}")
 
-    # Determine which agent to use
-    agent = None
+    # Build list of models to run
+    models_to_run: list[str] = []
 
     if model:
-        # Create ad-hoc agent from model string
-        config = AgentConfig(
-            id=model,
-            name=model.split("/")[-1] if "/" in model else model,
-            kind="llm-prompt",
-            model=model,
-            system_prompt="",
-            tools=[],
-            params={"temperature": 0.7, "max_tokens": 4096},
-            impl={},
-        )
-        agent = LlmPromptAgent(config)
+        models_to_run = list(model)
     elif agent_id:
         # Load from agent config files
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load(agent_id)
+            models_to_run = [agent.config.model]
         except ValueError as e:
             click.echo(f"Error loading agent: {e}", err=True)
             sys.exit(1)
@@ -611,6 +786,7 @@ def scenario(
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load_default()
+            models_to_run = [agent.config.model]
         except ValueError:
             click.echo("No model specified. Use -m to specify a model:", err=True)
             click.echo("", err=True)
@@ -623,25 +799,110 @@ def scenario(
             )
             sys.exit(1)
 
-    # Apply scenario's system prompt to agent
-    if spec.system_prompt:
-        agent.config.system_prompt = spec.system_prompt
-
     click.echo(f"Running scenario: {spec.name}")
-    click.echo(f"Using model: {agent.config.model}")
+    click.echo(f"Models: {', '.join(models_to_run)}")
     click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
+    if len(models_to_run) > 1:
+        click.echo("Running models in parallel...")
     click.echo("")
 
-    runner = ScenarioRunner(scenario=spec, agent=agent)
-    result = runner.run(max_turns=max_turns)
+    def run_single_model(model_id: str) -> dict[str, Any]:
+        """Run scenario with a single model, with MLflow tracing if enabled."""
+        agent_config = AgentConfig(
+            id=model_id,
+            name=model_id.split("/")[-1] if "/" in model_id else model_id,
+            kind="llm-prompt",
+            model=model_id,
+            system_prompt=spec.system_prompt or "",
+            tools=[],
+            params={"temperature": 0.7, "max_tokens": 4096},
+            impl={},
+        )
+        agent = LlmPromptAgent(agent_config)
+
+        # If MLflow enabled, wrap execution in run context so traces are connected
+        if mlflow_config and mlflow_config.enabled:
+            from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            # Enable tracing before the run starts
+            if mlflow_config.tracing:
+                enable_tracing(
+                    tracking_uri=mlflow_config.tracking_uri,
+                    experiment_name=mlflow_config.experiment,
+                )
+
+            # Start run, execute scenario, then log metrics - all connected
+            with mlflow_run_context(mlflow_config, run_name=model_id) as run_id:
+                runner = ScenarioRunner(scenario=spec, agent=agent)
+                result = runner.run(max_turns=max_turns)
+
+                # Log metrics to the active run (traces are already attached)
+                if run_id:
+                    exporter = MLflowExporter(mlflow_config)
+                    exporter.log_to_active_run(
+                        result=result,
+                        scenario_path=Path(scenario_path),
+                        scenario_name=spec.name,
+                        scenario_id=spec.id,
+                        agent_name=agent.config.name,
+                    )
+
+            return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+        # No MLflow - just run scenario
+        runner = ScenarioRunner(scenario=spec, agent=agent)
+        result = runner.run(max_turns=max_turns)
+        return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+    # Run models in parallel if multiple, otherwise just run single
+    results: list[Any] = []
+    if len(models_to_run) == 1:
+        results = [run_single_model(models_to_run[0])]
+    else:
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        with ThreadPoolExecutor(max_workers=len(models_to_run)) as executor:
+            futures = {executor.submit(run_single_model, m): m for m in models_to_run}
+            for future in as_completed(futures):
+                model_id = futures[future]
+                try:
+                    result_data = future.result()
+                    results.append(result_data)
+                    click.echo(f"✓ Completed: {model_id}")
+                except Exception as e:
+                    click.echo(f"✗ Failed: {model_id} - {e}", err=True)
+        click.echo("")
 
-    if output:
-        Path(output).write_text(result.to_json(indent=2))
-        click.echo(f"\nResults saved to: {output}")
-    elif pretty:
-        click.echo(result.pretty())
+    # Output results
+    if len(results) == 1:
+        result = results[0]["result"]
+        if output:
+            Path(output).write_text(result.to_json(indent=2))
+            click.echo(f"\nResults saved to: {output}")
+        elif pretty:
+            click.echo(result.pretty())
+        else:
+            click.echo(result.to_json(indent=2))
     else:
-        click.echo(result.to_json(indent=2))
+        # Multiple models - show summary
+        # Get max_score from spec (scoring config or sum of goal points)
+        max_score = spec.scoring.get("max_score", 0) if spec.scoring else 0
+        if not max_score and spec.goals:
+            max_score = sum(g.points for g in spec.goals)
+
+        click.echo("=== Results Summary ===")
+        for r in results:
+            model_name = r["model"]
+            res = r["result"]
+            score = getattr(res, "score", 0) or 0
+            pct = (score / max_score * 100) if max_score > 0 else 0
+            click.echo(f" {model_name}: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")
+
+        if output:
+            all_results = [{"model": r["model"], "result": r["result"].to_dict()} for r in results]
+            Path(output).write_text(json.dumps(all_results, indent=2))
+            click.echo(f"\nResults saved to: {output}")
 
 
 @main.command()
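
As a worked illustration of the summary line computed in the multi-model branch above (the goal points and the score are hypothetical; the formula mirrors the code):

```python
# Hypothetical numbers for one model in the summary.
goal_points = [1.0, 2.0, 2.0]        # spec.goals -> g.points
max_score = sum(goal_points)         # 5.0 when scoring.max_score is unset
score = 3.0                          # getattr(res, "score", 0) for this model
pct = (score / max_score * 100) if max_score > 0 else 0
print(f" openai/gpt-4o: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")  # 3.0/5.0 (60%)
```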