sandboxy-0.0.2-py3-none-any.whl → sandboxy-0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
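For reference, a request that opts into the new MLflow fields might look like the sketch below. The route, host, and port are placeholders (they are not visible in this diff); the field names come from RunScenarioRequest above and from the handler's use of request.scenario_id, request.model, and request.variables.

```python
import httpx

# Placeholder URL and values -- the actual route is not shown in this diff.
payload = {
    "scenario_id": "my-scenario",
    "model": "gpt-4o-mini",
    "variables": {},
    "max_turns": 20,
    "mlflow_export": True,                          # opt in to MLflow export
    "mlflow_tracking_uri": "http://localhost:5000",
    "mlflow_experiment": "my-experiment",
    "mlflow_tracing": True,                         # also capture per-LLM-call traces
}
resp = httpx.post("http://localhost:8000/run", json=payload)
print(resp.json()["cost_usd"], resp.json()["input_tokens"])
```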
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
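The run handler leans on sandboxy.mlflow.mlflow_run_context to tie traces and exported metrics to a single MLflow run. Its implementation is not included in this diff; a minimal sketch of such a context manager, assuming it simply wraps mlflow.start_run() and yields the run id (or None when disabled), could look like this:

```python
from contextlib import contextmanager

import mlflow


@contextmanager
def mlflow_run_context(config, run_name: str | None = None):
    """Sketch only -- not sandboxy's actual implementation."""
    if not config.enabled:
        yield None
        return
    if config.tracking_uri:
        mlflow.set_tracking_uri(config.tracking_uri)
    if config.experiment:
        mlflow.set_experiment(config.experiment)
    with mlflow.start_run(run_name=run_name) as run:
        yield run.info.run_id
```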
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
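calculate_cost() is not part of this diff. A hypothetical version, with placeholder per-million-token rates, shows how input_tokens and output_tokens become cost_usd and why cost_usd can remain None for models without a known price:

```python
# Hypothetical helper; the rates below are placeholders, not sandboxy's pricing table.
PRICES_PER_1M_TOKENS: dict[str, tuple[float, float]] = {
    "gpt-4o-mini": (0.15, 0.60),  # (input USD, output USD) per 1M tokens
}


def calculate_cost(model: str, input_tokens: int, output_tokens: int) -> float | None:
    rates = PRICES_PER_1M_TOKENS.get(model)
    if rates is None:
        return None  # unknown model -> cost_usd stays null in the response
    input_rate, output_rate = rates
    return (input_tokens * input_rate + output_tokens * output_rate) / 1_000_000
```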
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
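The "must be done BEFORE any LLM calls" comment matters because MLflow's tracing autologging instruments the client library, so only calls made after it is enabled are captured. enable_tracing() itself is not shown in this diff; a rough sketch, assuming MLflow >= 2.14 and an OpenAI-compatible client underneath, might be:

```python
import mlflow


def enable_tracing(tracking_uri: str | None, experiment_name: str | None) -> None:
    """Sketch only -- sandboxy's actual helper is not shown in this diff."""
    if tracking_uri:
        mlflow.set_tracking_uri(tracking_uri)
    if experiment_name:
        mlflow.set_experiment(experiment_name)
    # Autologging patches the client, so it only sees calls made after this
    # point -- hence it runs before run_comparison() issues any requests.
    mlflow.openai.autolog()
```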
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
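A dataset run opts in the same way. The payload below is a sketch: the route is a placeholder, and scenario_id is assumed (only dataset_id, model, parallel, the generation limits, and the mlflow_* fields are visible in this diff).

```python
import httpx

payload = {
    "scenario_id": "my-scenario",   # assumed field name; not visible in this diff
    "dataset_id": "my-dataset",
    "model": "gpt-4o-mini",
    "parallel": 4,
    "mlflow_enabled": True,
    "mlflow_experiment": None,      # falls back to f"{spec.name}-dataset"
}
resp = httpx.post("http://localhost:8000/run-dataset", json=payload)  # placeholder route
```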
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result
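Because the aggregate metrics are logged under stable names (pass_rate, avg_score, ...), they can be pulled back out of MLflow to compare models across dataset runs. A small example, assuming the default f"{spec.name}-dataset" experiment name:

```python
import mlflow

# Replace with the experiment name actually used for the dataset runs.
runs = mlflow.search_runs(experiment_names=["my-scenario-dataset"])
print(runs[["params.model", "metrics.pass_rate", "metrics.avg_score"]])
```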