sandboxy 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as published to a supported registry. It is provided for informational purposes only and reflects the packages exactly as they appear in the public registry.
@@ -16,11 +16,37 @@ MAX_RETRIES = 3
  RETRY_DELAY_BASE = 1.0  # seconds
 
 
+ def _is_local_provider_model(model_id: str) -> bool:
+     """Check if a model ID refers to a local provider.
+
+     Args:
+         model_id: Model identifier
+
+     Returns:
+         True if the model is from a configured local provider
+     """
+     if "/" not in model_id:
+         return False
+
+     provider_name = model_id.split("/")[0]
+
+     # Check if this provider name matches a configured local provider
+     try:
+         from sandboxy.providers.config import load_providers_config
+
+         config = load_providers_config()
+         return any(p.name == provider_name and p.enabled for p in config.providers)
+     except Exception:
+         return False
+
+
  class LlmPromptAgent(BaseAgent):
      """Agent that uses an LLM via OpenAI-compatible API.
 
-     Supports both direct OpenAI and OpenRouter (for 400+ models).
-     Uses OpenRouter when model contains "/" (e.g., "openai/gpt-4o").
+     Supports:
+     - Local providers (Ollama, LM Studio, vLLM) when model matches configured provider
+     - OpenRouter (for 400+ cloud models)
+     - Direct OpenAI when model has no prefix
      """
 
      def __init__(self, config: AgentConfig) -> None:
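How the helper interacts with the routing flags set in __init__ (next hunk) can be sketched as follows; this assumes a providers config whose only enabled entry is named "ollama", and the model IDs are illustrative:

    # Assumes load_providers_config() returns a single enabled provider named "ollama".
    _is_local_provider_model("ollama/llama3")   # True  -> handled by the local provider
    _is_local_provider_model("openai/gpt-4o")   # False -> has "/" but no matching provider, so OpenRouter
    _is_local_provider_model("gpt-4o-mini")     # False -> no prefix, direct OpenAI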
@@ -31,7 +57,12 @@ class LlmPromptAgent(BaseAgent):
          """
          super().__init__(config)
          self._client: Any = None
-         self._is_openrouter = "/" in (config.model or "")
+         self._local_provider: Any = None
+
+         # Check for local provider first
+         self._is_local = _is_local_provider_model(config.model or "")
+         self._is_openrouter = not self._is_local and "/" in (config.model or "")
+
          # Token usage tracking
          self._total_input_tokens = 0
          self._total_output_tokens = 0
@@ -39,6 +70,9 @@ class LlmPromptAgent(BaseAgent):
      @property
      def api_key(self) -> str:
          """Get the appropriate API key based on model type."""
+         if self._is_local:
+             # Local providers may not need an API key, or it's in the provider config
+             return ""
          if self._is_openrouter:
              return os.getenv("OPENROUTER_API_KEY", "")
          return os.getenv("OPENAI_API_KEY", "")
@@ -49,15 +83,46 @@ class LlmPromptAgent(BaseAgent):
          if self._client is None:
              from openai import OpenAI
 
-             if self._is_openrouter:
-                 logger.debug("Initializing OpenRouter client for model: %s", self.config.model)
-                 self._client = OpenAI(
-                     api_key=self.api_key,
-                     base_url="https://openrouter.ai/api/v1",
-                 )
-             else:
-                 logger.debug("Initializing OpenAI client for model: %s", self.config.model)
-                 self._client = OpenAI(api_key=self.api_key)
+             if self._is_local:
+                 # Get local provider and create client pointing to it
+                 provider_name = (self.config.model or "").split("/")[0]
+                 from sandboxy.providers.config import load_providers_config
+
+                 config = load_providers_config()
+                 provider_config = config.get_provider(provider_name)
+
+                 if provider_config:
+                     logger.debug(
+                         "Initializing local client for %s at %s",
+                         provider_name,
+                         provider_config.base_url,
+                     )
+                     headers = {}
+                     if provider_config.api_key:
+                         headers["Authorization"] = f"Bearer {provider_config.api_key}"
+
+                     self._client = OpenAI(
+                         api_key=provider_config.api_key or "not-needed",
+                         base_url=provider_config.base_url,
+                         default_headers=headers if headers else None,
+                     )
+                 else:
+                     logger.warning(
+                         "Local provider %s not found, falling back to OpenRouter", provider_name
+                     )
+                     self._is_local = False
+                     self._is_openrouter = True
+
+             if self._client is None:  # Not set by local provider path
+                 if self._is_openrouter:
+                     logger.debug("Initializing OpenRouter client for model: %s", self.config.model)
+                     self._client = OpenAI(
+                         api_key=self.api_key,
+                         base_url="https://openrouter.ai/api/v1",
+                     )
+                 else:
+                     logger.debug("Initializing OpenAI client for model: %s", self.config.model)
+                     self._client = OpenAI(api_key=self.api_key)
          return self._client
 
      def step(
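The local branch above amounts to pointing the OpenAI SDK at the provider's OpenAI-compatible endpoint. A minimal standalone sketch, assuming an Ollama server on its default port (the base URL, model name, and placeholder key are illustrative; in the code above they come from the provider config):

    from openai import OpenAI

    # Local servers typically ignore the API key, so a placeholder is fine.
    client = OpenAI(api_key="not-needed", base_url="http://localhost:11434/v1")
    resp = client.chat.completions.create(
        model="llama3",  # no provider prefix here; it is stripped before the request (see below)
        messages=[{"role": "user", "content": "ping"}],
    )
    print(resp.choices[0].message.content)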
@@ -66,7 +131,8 @@
          available_tools: list[dict[str, Any]] | None = None,
      ) -> AgentAction:
          """Process conversation and return next action using LLM."""
-         if not self.api_key:
+         # Local providers don't require an API key
+         if not self._is_local and not self.api_key:
              return self._stub_response(history)
 
          messages = self._build_messages(history)
@@ -188,8 +254,13 @@ class LlmPromptAgent(BaseAgent):
          messages: list[dict[str, Any]],
          tools: list[dict[str, Any]] | None,
      ) -> Any:
-         """Make API call to OpenAI/OpenRouter."""
+         """Make API call to OpenAI/OpenRouter/Local provider."""
          model = self.config.model or "gpt-4o-mini"
+
+         # For local providers, strip the provider prefix (e.g., "ollama/llama3" -> "llama3")
+         if self._is_local and "/" in model:
+             model = model.split("/", 1)[1]
+
          kwargs: dict[str, Any] = {
              "model": model,
              "messages": messages,
sandboxy/api/app.py CHANGED
@@ -58,12 +58,13 @@ def create_local_app(
      )
 
      # Local routes only
-     from sandboxy.api.routes import agents, tools
+     from sandboxy.api.routes import agents, providers, tools
      from sandboxy.api.routes import local as local_routes
 
      app.include_router(local_routes.router, prefix="/api/v1", tags=["local"])
      app.include_router(agents.router, prefix="/api/v1", tags=["agents"])
      app.include_router(tools.router, prefix="/api/v1", tags=["tools"])
+     app.include_router(providers.router, prefix="/api/v1", tags=["providers"])
 
      @app.get("/health")
      async def health_check():
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
      max_turns: int = 20
      max_tokens: int = 1024
      temperature: float = 0.7
+     mlflow_export: bool = False
+     mlflow_tracking_uri: str | None = None
+     mlflow_experiment: str | None = None
+     mlflow_tracing: bool = True
 
 
  class RunScenarioResponse(BaseModel):
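A request body exercising the new MLflow fields might look like this (the field names come from RunScenarioRequest above; the scenario ID, model, and tracking URI are illustrative):

    payload = {
        "scenario_id": "demo-scenario",
        "model": "ollama/llama3",
        "max_turns": 20,
        "mlflow_export": True,
        "mlflow_tracking_uri": "http://localhost:5000",
        "mlflow_experiment": "demo-scenario",
        "mlflow_tracing": True,
    }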
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
      final_state: dict[str, Any]
      evaluation: dict[str, Any] | None
      latency_ms: int
+     input_tokens: int = 0
+     output_tokens: int = 0
+     cost_usd: float | None = None
      error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
      runs_per_model: int = 1
      variables: dict[str, Any] = Field(default_factory=dict)
      max_turns: int = 20
+     mlflow_export: bool = False
+     mlflow_tracking_uri: str | None = None
+     mlflow_experiment: str | None = None
+     mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
  class CompareModelsResponse(BaseModel):
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
      spec = load_unified_scenario(scenario_path)
      runner = UnifiedRunner()
 
-     result = await runner.run(
-         scenario=spec,
-         model=request.model,
-         variables=request.variables,
-         max_turns=request.max_turns,
-         max_tokens=request.max_tokens,
-         temperature=request.temperature,
-     )
+     # Setup MLflow if requested
+     mlflow_config = None
+     if request.mlflow_export:
+         try:
+             from sandboxy.mlflow import MLflowConfig
+
+             mlflow_config = MLflowConfig.resolve(
+                 cli_export=True,
+                 cli_tracking_uri=request.mlflow_tracking_uri,
+                 cli_experiment=request.mlflow_experiment,
+                 cli_tracing=request.mlflow_tracing,
+                 yaml_config=None,
+                 scenario_name=spec.name,
+             )
+         except ImportError:
+             pass  # MLflow not installed
+
+     # Run with MLflow context if enabled (connects traces to run)
+     if mlflow_config and mlflow_config.enabled:
+         from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+         from sandboxy.mlflow.tracing import enable_tracing
+
+         if mlflow_config.tracing:
+             enable_tracing(
+                 tracking_uri=mlflow_config.tracking_uri,
+                 experiment_name=mlflow_config.experiment,
+             )
+
+         with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+             result = await runner.run(
+                 scenario=spec,
+                 model=request.model,
+                 variables=request.variables,
+                 max_turns=request.max_turns,
+                 max_tokens=request.max_tokens,
+                 temperature=request.temperature,
+             )
+
+             if run_id:
+                 exporter = MLflowExporter(mlflow_config)
+                 exporter.log_to_active_run(
+                     result=result,
+                     scenario_path=scenario_path,
+                     scenario_name=spec.name,
+                     scenario_id=spec.id,
+                     agent_name=request.model,
+                 )
+     else:
+         result = await runner.run(
+             scenario=spec,
+             model=request.model,
+             variables=request.variables,
+             max_turns=request.max_turns,
+             max_tokens=request.max_tokens,
+             temperature=request.temperature,
+         )
 
      # Save result to runs/
      from sandboxy.local.results import save_run_result
 
      save_run_result(request.scenario_id, result.to_dict())
 
+     # Calculate cost
+     input_tokens = result.input_tokens or 0
+     output_tokens = result.output_tokens or 0
+     cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
      return RunScenarioResponse(
          id=result.id,
          scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
          final_state=result.final_state,
          evaluation=result.evaluation.to_dict() if result.evaluation else None,
          latency_ms=result.latency_ms,
+         input_tokens=input_tokens,
+         output_tokens=output_tokens,
+         cost_usd=cost_usd,
          error=result.error,
      )
 
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
 
      spec = load_unified_scenario(scenario_path)
 
+     # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+     if request.mlflow_export and request.mlflow_tracing:
+         try:
+             from sandboxy.mlflow.tracing import enable_tracing
+
+             experiment = request.mlflow_experiment or spec.name
+             enable_tracing(
+                 tracking_uri=request.mlflow_tracking_uri,
+                 experiment_name=experiment,
+             )
+         except ImportError:
+             pass  # MLflow not installed
+
      comparison = await run_comparison(
          scenario=spec,
          models=request.models,
@@ -538,6 +618,33 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
          max_turns=request.max_turns,
      )
 
+     # MLflow export (if enabled)
+     if request.mlflow_export:
+         try:
+             from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+             for result in comparison.results:
+                 config = MLflowConfig.resolve(
+                     cli_export=True,
+                     cli_tracking_uri=request.mlflow_tracking_uri,
+                     cli_experiment=request.mlflow_experiment,
+                     cli_tracing=request.mlflow_tracing,
+                     yaml_config=None,
+                     scenario_name=spec.name,
+                 )
+                 exporter = MLflowExporter(config)
+                 exporter.export(
+                     result=result.to_dict(),
+                     scenario_path=scenario_path,
+                     scenario_name=spec.name,
+                     scenario_id=spec.id,
+                     agent_name=result.model,
+                 )
+         except ImportError:
+             logger.warning("MLflow not installed, skipping export")
+         except Exception as e:
+             logger.warning(f"Failed to export to MLflow: {e}")
+
      # Save comparison result
      from sandboxy.local.results import save_run_result
 
@@ -587,10 +694,40 @@ def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> floa
 
  @router.get("/local/models")
  async def list_available_models() -> list[dict[str, Any]]:
-     """List available models from OpenRouter."""
+     """List available models from OpenRouter and local providers."""
+     from sandboxy.providers.config import get_enabled_providers
+     from sandboxy.providers.local import LocalProvider
      from sandboxy.providers.openrouter import OPENROUTER_MODELS
 
      models = []
+
+     # Add models from local providers first
+     for provider_config in get_enabled_providers():
+         try:
+             provider = LocalProvider(provider_config)
+             local_models = await provider.refresh_models()
+             await provider.close()
+
+             for model in local_models:
+                 # Model ID includes provider prefix for routing
+                 full_model_id = f"{provider_config.name}/{model.id}"
+                 models.append(
+                     {
+                         "id": full_model_id,
+                         "name": model.name,
+                         "price": "Local",
+                         "pricing": {"input": 0, "output": 0},
+                         "provider": provider_config.name,
+                         "context_length": model.context_length,
+                         "supports_vision": model.supports_vision,
+                         "is_local": True,
+                         "provider_name": provider_config.name,
+                     }
+                 )
+         except Exception as e:
+             logger.warning(f"Failed to fetch models from {provider_config.name}: {e}")
+
+     # Add OpenRouter models
      for model_id, info in OPENROUTER_MODELS.items():
          # Format price string
          if info.input_cost_per_million == 0 and info.output_cost_per_million == 0:
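With an Ollama provider enabled, an entry contributed by the local-provider loop above would look roughly like this (the keys match the code; the model name and context length are illustrative). The next hunk adds the matching "is_local": False to the OpenRouter entries:

    {
        "id": "ollama/llama3",
        "name": "llama3",
        "price": "Local",
        "pricing": {"input": 0, "output": 0},
        "provider": "ollama",
        "context_length": 8192,
        "supports_vision": False,
        "is_local": True,
        "provider_name": "ollama",
    }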
@@ -610,6 +747,7 @@ async def list_available_models() -> list[dict[str, Any]]:
                  "provider": info.provider,
                  "context_length": info.context_length,
                  "supports_vision": info.supports_vision,
+                 "is_local": False,
              }
          )
 
@@ -905,6 +1043,8 @@ class RunDatasetRequest(BaseModel):
      max_tokens: int = 1024
      temperature: float = 0.7
      parallel: int = 1
+     mlflow_enabled: bool = False
+     mlflow_experiment: str | None = None
 
 
  class RunDatasetResponse(BaseModel):
@@ -1335,25 +1475,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
      spec = load_unified_scenario(scenario_path)
      dataset = load_dataset(dataset_path)
 
-     if request.parallel > 1:
-         result = await run_dataset_parallel(
+     # Setup MLflow if enabled
+     mlflow_config = None
+     if request.mlflow_enabled:
+         try:
+             from sandboxy.mlflow import MLflowConfig
+
+             mlflow_config = MLflowConfig(
+                 enabled=True,
+                 experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                 tracing=False,  # Tracing not needed for dataset aggregates
+             )
+         except ImportError:
+             pass  # MLflow not installed
+
+     async def run_dataset_benchmark():
+         if request.parallel > 1:
+             return await run_dataset_parallel(
+                 scenario=spec,
+                 model=request.model,
+                 dataset=dataset,
+                 max_turns=request.max_turns,
+                 max_tokens=request.max_tokens,
+                 temperature=request.temperature,
+                 max_concurrent=request.parallel,
+             )
+         return await run_dataset(
              scenario=spec,
              model=request.model,
              dataset=dataset,
              max_turns=request.max_turns,
              max_tokens=request.max_tokens,
              temperature=request.temperature,
-             max_concurrent=request.parallel,
          )
+
+     # Run with MLflow context if enabled
+     if mlflow_config and mlflow_config.enabled:
+         from sandboxy.mlflow import mlflow_run_context
+
+         run_name = f"{request.model}-{request.dataset_id}"
+         with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+             result = await run_dataset_benchmark()
+
+             # Log aggregate metrics to MLflow
+             if run_id:
+                 try:
+                     import mlflow
+
+                     mlflow.log_params(
+                         {
+                             "scenario_id": result.scenario_id,
+                             "dataset_id": result.dataset_id,
+                             "model": result.model,
+                             "total_cases": result.total_cases,
+                         }
+                     )
+                     mlflow.log_metrics(
+                         {
+                             "passed_cases": result.passed_cases,
+                             "failed_cases": result.failed_cases,
+                             "pass_rate": result.pass_rate,
+                             "avg_score": result.avg_score,
+                             "avg_percentage": result.avg_percentage,
+                             "total_time_ms": result.total_time_ms,
+                         }
+                     )
+                     # Log per-expected-outcome metrics
+                     for expected, counts in result.by_expected.items():
+                         total = counts.get("passed", 0) + counts.get("failed", 0)
+                         if total > 0:
+                             rate = counts.get("passed", 0) / total
+                             mlflow.log_metric(f"pass_rate_{expected}", rate)
+                 except Exception as e:
+                     logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
      else:
-         result = await run_dataset(
-             scenario=spec,
-             model=request.model,
-             dataset=dataset,
-             max_turns=request.max_turns,
-             max_tokens=request.max_tokens,
-             temperature=request.temperature,
-         )
+         result = await run_dataset_benchmark()
 
      # Save result
      from sandboxy.local.results import save_run_result
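As a worked example of the per-expected-outcome metrics logged above (the outcome label and counts are illustrative):

    by_expected = {"refund_approved": {"passed": 8, "failed": 2}}
    # total = 8 + 2 = 10, rate = 8 / 10 = 0.8
    # -> mlflow.log_metric("pass_rate_refund_approved", 0.8)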