sandboxy 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/api/routes/local.py +182 -19
- sandboxy/cli/main.py +292 -31
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +439 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/METADATA +37 -1
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/RECORD +20 -13
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.4.dist-info}/licenses/LICENSE +0 -0
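The new sandboxy/mlflow subpackage is treated as an optional extra throughout this release: every call site guards the import and skips export when MLflow is missing, and the CLI points users at `pip install sandboxy[mlflow]`. A minimal sketch of that guard pattern as it recurs in the hunks below (the helper name `build_optional_mlflow_config` is illustrative, not a sandboxy API):

# Sketch of the optional-import guard used at each MLflow call site in this release.
# `build_optional_mlflow_config` is an illustrative name, not part of sandboxy.
def build_optional_mlflow_config(export_requested: bool, experiment: str):
    if not export_requested:
        return None
    try:
        from sandboxy.mlflow import MLflowConfig  # optional extra: pip install sandboxy[mlflow]
    except ImportError:
        return None  # MLflow not installed; the run proceeds without export
    # Direct construction mirrors the dataset route below; CLI/API paths use MLflowConfig.resolve().
    return MLflowConfig(enabled=True, experiment=experiment, tracing=False)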
sandboxy/api/routes/local.py
CHANGED
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True


 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None

@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default


 class CompareModelsResponse(BaseModel):
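The request models above all gain the same opt-in MLflow fields. For illustration, a RunScenarioRequest-shaped payload using them might look like the following (field names come from the hunks; the tracking URI, experiment name, and the `scenario_id`/`variables` values are placeholders, and the endpoint path is not shown in this diff):

# Illustrative request body only; the route path is not part of this diff.
payload = {
    "scenario_id": "trolley",            # referenced later as request.scenario_id
    "model": "openai/gpt-4o",
    "variables": {},
    "max_turns": 20,
    "max_tokens": 1024,
    "temperature": 0.7,
    # New in 0.0.4 (defaults: False / None / None / True):
    "mlflow_export": True,
    "mlflow_tracking_uri": "http://localhost:5000",  # placeholder
    "mlflow_experiment": "trolley-eval",             # placeholder
    "mlflow_tracing": True,
}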
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()

-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )

     # Save result to runs/
     from sandboxy.local.results import save_run_result

     save_run_result(request.scenario_id, result.to_dict())

+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )

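With the response fields above, callers now get token counts and an estimated cost alongside latency. A small, illustrative consumer of those fields (the numbers are made up; `cost_usd` can be `None`, presumably when `calculate_cost()` has no pricing for the model):

# Illustrative: reading the new token/cost fields from a run-scenario response dict.
resp = {"latency_ms": 8421, "input_tokens": 1200, "output_tokens": 850, "cost_usd": 0.0113, "error": None}
if resp["error"] is None:
    total_tokens = resp["input_tokens"] + resp["output_tokens"]
    cost_str = "n/a" if resp["cost_usd"] is None else f"${resp['cost_usd']:.4f}"
    print(f"{total_tokens} tokens in {resp['latency_ms']} ms, cost={cost_str}")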
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse

     spec = load_unified_scenario(scenario_path)

+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@
         max_turns=request.max_turns,
     )

+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result

@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None


 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)

-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
            temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()

     # Save result
     from sandboxy.local.results import save_run_result
sandboxy/cli/main.py
CHANGED
@@ -98,6 +98,137 @@ def _load_variables_from_env() -> dict:
     return {}


+def _export_to_mlflow(
+    result: Any,
+    spec: Any,
+    scenario_path: Path,
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    agent_name: str = "default",
+) -> None:
+    """Export scenario result to MLflow if enabled.
+
+    Args:
+        result: ScenarioResult from runner
+        spec: ScenarioSpec
+        scenario_path: Path to scenario file
+        mlflow_export: --mlflow-export flag
+        no_mlflow: --no-mlflow flag
+        mlflow_tracking_uri: --mlflow-tracking-uri value
+        mlflow_experiment: --mlflow-experiment value
+        agent_name: Agent configuration name
+    """
+    from sandboxy.mlflow.config import MLflowConfig
+
+    # Get YAML config from spec
+    yaml_config = None
+    if spec.mlflow:
+        yaml_config = {
+            "enabled": spec.mlflow.enabled,
+            "experiment": spec.mlflow.experiment,
+            "tracking_uri": spec.mlflow.tracking_uri,
+            "tags": spec.mlflow.tags,
+        }
+
+    # Resolve config with precedence
+    config = MLflowConfig.resolve(
+        cli_export=mlflow_export,
+        cli_no_mlflow=no_mlflow,
+        cli_tracking_uri=mlflow_tracking_uri,
+        cli_experiment=mlflow_experiment,
+        yaml_config=yaml_config,
+        scenario_name=spec.name,
+    )
+
+    if not config.enabled:
+        return
+
+    # Import and use exporter
+    try:
+        from sandboxy.mlflow.exporter import MLflowExporter
+
+        exporter = MLflowExporter(config)
+
+        # Convert ScenarioResult to RunResult-like for exporter
+        # ScenarioResult has different structure, create adapter
+        run_id = exporter.export(
+            result=_adapt_scenario_result(result),
+            scenario_path=scenario_path,
+            scenario_name=spec.name,
+            scenario_id=spec.id,
+            agent_name=agent_name,
+        )
+
+        if run_id:
+            click.echo(f"\nExported to MLflow: run_id={run_id}")
+
+    except ImportError:
+        click.echo(
+            "\nMLflow not installed. Install with: pip install sandboxy[mlflow]",
+            err=True,
+        )
+    except Exception as e:
+        click.echo(f"\nWarning: MLflow export failed: {e}", err=True)
+
+
+def _adapt_scenario_result(result: Any) -> Any:
+    """Adapt ScenarioResult to RunResult-like interface for MLflowExporter.
+
+    The exporter expects RunResult fields, but ScenarioRunner returns ScenarioResult.
+    This creates an adapter object.
+    """
+    from dataclasses import dataclass, field
+
+    @dataclass
+    class GoalResultAdapter:
+        name: str
+        score: float
+        passed: bool = True
+
+    @dataclass
+    class EvaluationAdapter:
+        goals: list[GoalResultAdapter] = field(default_factory=list)
+        total_score: float = 0.0
+        max_score: float = 0.0
+        percentage: float = 0.0
+
+    @dataclass
+    class RunResultAdapter:
+        model: str = ""
+        error: str | None = None
+        latency_ms: int = 0
+        input_tokens: int = 0
+        output_tokens: int = 0
+        evaluation: EvaluationAdapter | None = None
+
+    # Extract data from ScenarioResult
+    adapter = RunResultAdapter(
+        model=getattr(result, "agent_id", "unknown"),
+        error=None,
+    )
+
+    # Build evaluation from goals
+    goals = []
+    total = 0.0
+    for goal_name in getattr(result, "goals_achieved", []):
+        goals.append(GoalResultAdapter(name=goal_name, score=1.0, passed=True))
+        total += 1.0
+
+    score = getattr(result, "score", 0.0)
+    max_score = max(score, len(goals)) if goals else score
+
+    adapter.evaluation = EvaluationAdapter(
+        goals=goals,
+        total_score=score,
+        max_score=max_score,
+        percentage=(score / max_score * 100) if max_score > 0 else 0.0,
+    )
+
+    return adapter
+
+
 @main.command()
 @click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
 @click.option(
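To make the adapter's behaviour concrete, here is a worked example driven by the logic shown above (the `SimpleNamespace` stand-in for `ScenarioResult` is an assumption for illustration; `_adapt_scenario_result` is the private helper added in this hunk):

# Worked example of the adapter above, using a stand-in for ScenarioResult.
from types import SimpleNamespace

from sandboxy.cli.main import _adapt_scenario_result  # private helper added in this release

fake = SimpleNamespace(agent_id="openai/gpt-4o",
                       goals_achieved=["save_five", "explain_choice"],
                       score=1.5)
adapted = _adapt_scenario_result(fake)
# Two achieved goals -> total_score=1.5, max_score=max(1.5, 2)=2.0, percentage=75.0
print(adapted.model, adapted.evaluation.total_score,
      adapted.evaluation.max_score, adapted.evaluation.percentage)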
@@ -528,22 +659,54 @@ def info(module_path: str) -> None:
 @click.option(
     "--model",
     "-m",
-
-
+    multiple=True,
+    help="Model(s) to use. Can specify multiple: -m gpt-4o -m claude-3.5-sonnet",
 )
 @click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
 @click.option("--output", "-o", help="Output file for results JSON", default=None)
 @click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
 @click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
 @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
+@click.option(
+    "--mlflow-export",
+    is_flag=True,
+    help="Export run results to MLflow tracking server",
+)
+@click.option(
+    "--no-mlflow",
+    is_flag=True,
+    help="Disable MLflow export (overrides YAML config)",
+)
+@click.option(
+    "--mlflow-tracking-uri",
+    type=str,
+    default=None,
+    help="MLflow tracking server URI (overrides MLFLOW_TRACKING_URI env)",
+)
+@click.option(
+    "--mlflow-experiment",
+    type=str,
+    default=None,
+    help="MLflow experiment name (defaults to scenario name)",
+)
+@click.option(
+    "--mlflow-no-tracing",
+    is_flag=True,
+    help="Disable LLM call tracing (only log summary metrics)",
+)
 def scenario(
     scenario_path: str,
-    model: str,
+    model: tuple[str, ...],
     agent_id: str | None,
     output: str | None,
     pretty: bool,
     max_turns: int,
     var: tuple[str, ...],
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    mlflow_no_tracing: bool,
 ) -> None:
     """Run a scenario with YAML-defined tools.

@@ -554,8 +717,10 @@ def scenario(

     Examples:
         sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
-        sandboxy scenario scenarios/trolley.yml -m
+        sandboxy scenario scenarios/trolley.yml -m gpt-4o -m claude-3.5-sonnet  # multiple models
         sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
+        sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export
+        sandboxy scenario scenarios/test.yml -m gpt-4o -m gpt-4o-mini --mlflow-export  # compare models
     """
     from sandboxy.agents.base import AgentConfig
     from sandboxy.agents.llm_prompt import LlmPromptAgent
@@ -567,6 +732,26 @@ def scenario(
         click.echo(f"Error loading scenario: {e}", err=True)
         sys.exit(1)

+    # Build MLflow config if export requested
+    mlflow_config = None
+    if mlflow_export and not no_mlflow:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=mlflow_tracking_uri,
+                cli_experiment=mlflow_experiment,
+                cli_tracing=not mlflow_no_tracing,
+                yaml_config=spec.mlflow.model_dump() if spec.mlflow else None,
+                scenario_name=spec.name,
+            )
+            click.echo(f"MLflow enabled → experiment: {mlflow_config.experiment}")
+            if mlflow_config.tracing:
+                click.echo(" Tracing: ON (LLM calls will be captured)")
+        except ImportError:
+            pass  # MLflow not installed
+
     # Parse and apply variables
     variables: dict[str, Any] = {}
     for v in var:
@@ -582,27 +767,17 @@ def scenario(
         spec = apply_scenario_variables(spec, variables)
         click.echo(f"Variables: {variables}")

-    #
-
+    # Build list of models to run
+    models_to_run: list[str] = []

     if model:
-
-        config = AgentConfig(
-            id=model,
-            name=model.split("/")[-1] if "/" in model else model,
-            kind="llm-prompt",
-            model=model,
-            system_prompt="",
-            tools=[],
-            params={"temperature": 0.7, "max_tokens": 4096},
-            impl={},
-        )
-        agent = LlmPromptAgent(config)
+        models_to_run = list(model)
     elif agent_id:
         # Load from agent config files
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load(agent_id)
+            models_to_run = [agent.config.model]
         except ValueError as e:
             click.echo(f"Error loading agent: {e}", err=True)
             sys.exit(1)
@@ -611,6 +786,7 @@ def scenario(
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load_default()
+            models_to_run = [agent.config.model]
         except ValueError:
             click.echo("No model specified. Use -m to specify a model:", err=True)
             click.echo("", err=True)
@@ -623,25 +799,110 @@ def scenario(
             )
             sys.exit(1)

-    # Apply scenario's system prompt to agent
-    if spec.system_prompt:
-        agent.config.system_prompt = spec.system_prompt
-
     click.echo(f"Running scenario: {spec.name}")
-    click.echo(f"
+    click.echo(f"Models: {', '.join(models_to_run)}")
     click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
+    if len(models_to_run) > 1:
+        click.echo("Running models in parallel...")
     click.echo("")

-    runner = ScenarioRunner(scenario=spec, agent=agent)
-    result = runner.run(max_turns=max_turns)
+    def run_single_model(model_id: str) -> dict[str, Any]:
+        """Run scenario with a single model, with MLflow tracing if enabled."""
+        agent_config = AgentConfig(
+            id=model_id,
+            name=model_id.split("/")[-1] if "/" in model_id else model_id,
+            kind="llm-prompt",
+            model=model_id,
+            system_prompt=spec.system_prompt or "",
+            tools=[],
+            params={"temperature": 0.7, "max_tokens": 4096},
+            impl={},
+        )
+        agent = LlmPromptAgent(agent_config)
+
+        # If MLflow enabled, wrap execution in run context so traces are connected
+        if mlflow_config and mlflow_config.enabled:
+            from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            # Enable tracing before the run starts
+            if mlflow_config.tracing:
+                enable_tracing(
+                    tracking_uri=mlflow_config.tracking_uri,
+                    experiment_name=mlflow_config.experiment,
+                )
+
+            # Start run, execute scenario, then log metrics - all connected
+            with mlflow_run_context(mlflow_config, run_name=model_id) as run_id:
+                runner = ScenarioRunner(scenario=spec, agent=agent)
+                result = runner.run(max_turns=max_turns)
+
+                # Log metrics to the active run (traces are already attached)
+                if run_id:
+                    exporter = MLflowExporter(mlflow_config)
+                    exporter.log_to_active_run(
+                        result=result,
+                        scenario_path=Path(scenario_path),
+                        scenario_name=spec.name,
+                        scenario_id=spec.id,
+                        agent_name=agent.config.name,
+                    )
+
+                return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+        # No MLflow - just run scenario
+        runner = ScenarioRunner(scenario=spec, agent=agent)
+        result = runner.run(max_turns=max_turns)
+        return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+    # Run models in parallel if multiple, otherwise just run single
+    results: list[Any] = []
+    if len(models_to_run) == 1:
+        results = [run_single_model(models_to_run[0])]
+    else:
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        with ThreadPoolExecutor(max_workers=len(models_to_run)) as executor:
+            futures = {executor.submit(run_single_model, m): m for m in models_to_run}
+            for future in as_completed(futures):
+                model_id = futures[future]
+                try:
+                    result_data = future.result()
+                    results.append(result_data)
+                    click.echo(f"✓ Completed: {model_id}")
+                except Exception as e:
+                    click.echo(f"✗ Failed: {model_id} - {e}", err=True)
+        click.echo("")

-    if output:
-        Path(output).write_text(result.to_json(indent=2))
-        click.echo(f"\nResults saved to: {output}")
-    elif pretty:
-        click.echo(result.pretty())
+    # Output results
+    if len(results) == 1:
+        result = results[0]["result"]
+        if output:
+            Path(output).write_text(result.to_json(indent=2))
+            click.echo(f"\nResults saved to: {output}")
+        elif pretty:
+            click.echo(result.pretty())
+        else:
+            click.echo(result.to_json(indent=2))
     else:
-        click.echo(result.to_json(indent=2))
+        # Multiple models - show summary
+        # Get max_score from spec (scoring config or sum of goal points)
+        max_score = spec.scoring.get("max_score", 0) if spec.scoring else 0
+        if not max_score and spec.goals:
+            max_score = sum(g.points for g in spec.goals)
+
+        click.echo("=== Results Summary ===")
+        for r in results:
+            model_name = r["model"]
+            res = r["result"]
+            score = getattr(res, "score", 0) or 0
+            pct = (score / max_score * 100) if max_score > 0 else 0
+            click.echo(f"  {model_name}: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")
+
+        if output:
+            all_results = [{"model": r["model"], "result": r["result"].to_dict()} for r in results]
+            Path(output).write_text(json.dumps(all_results, indent=2))
+            click.echo(f"\nResults saved to: {output}")


 @main.command()
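When more than one -m is given, --output now writes a list of {"model": ..., "result": ...} entries (one per model) instead of a single result object. A small reader for that file (the filename is a placeholder; "score" is read the same defensive way the summary loop above reads it, and the remaining keys depend on ScenarioResult.to_dict()):

# Illustrative reader for the multi-model JSON written by -o/--output.
import json
from pathlib import Path

entries = json.loads(Path("results.json").read_text())  # [{"model": ..., "result": {...}}, ...]
for entry in entries:
    result = entry["result"]
    print(f'{entry["model"]}: score={result.get("score", 0)}')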