sandboxy 0.0.3-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/agents/llm_prompt.py +85 -14
- sandboxy/api/app.py +2 -1
- sandboxy/api/routes/local.py +216 -20
- sandboxy/api/routes/providers.py +369 -0
- sandboxy/cli/main.py +663 -31
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +445 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/providers/__init__.py +37 -3
- sandboxy/providers/config.py +243 -0
- sandboxy/providers/local.py +498 -0
- sandboxy/providers/registry.py +107 -13
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/scenarios/unified.py +27 -3
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CLxxjJuD.js +367 -0
- sandboxy/ui/dist/assets/index-DBB7ehs6.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/METADATA +103 -27
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/RECORD +28 -18
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/licenses/LICENSE +0 -0
sandboxy/agents/llm_prompt.py
CHANGED
```diff
@@ -16,11 +16,37 @@ MAX_RETRIES = 3
 RETRY_DELAY_BASE = 1.0  # seconds
 
 
+def _is_local_provider_model(model_id: str) -> bool:
+    """Check if a model ID refers to a local provider.
+
+    Args:
+        model_id: Model identifier
+
+    Returns:
+        True if the model is from a configured local provider
+    """
+    if "/" not in model_id:
+        return False
+
+    provider_name = model_id.split("/")[0]
+
+    # Check if this provider name matches a configured local provider
+    try:
+        from sandboxy.providers.config import load_providers_config
+
+        config = load_providers_config()
+        return any(p.name == provider_name and p.enabled for p in config.providers)
+    except Exception:
+        return False
+
+
 class LlmPromptAgent(BaseAgent):
     """Agent that uses an LLM via OpenAI-compatible API.
 
-    Supports
-
+    Supports:
+    - Local providers (Ollama, LM Studio, vLLM) when model matches configured provider
+    - OpenRouter (for 400+ cloud models)
+    - Direct OpenAI when model has no prefix
     """
 
     def __init__(self, config: AgentConfig) -> None:
```
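Routing is keyed entirely off the `provider/` prefix of the model id. A minimal sketch of how the check behaves, assuming sandboxy 0.0.5 is installed and a local provider named `ollama` is configured and enabled; the provider names and model ids below are illustrative, not values taken from this diff:

```python
from sandboxy.agents.llm_prompt import _is_local_provider_model

# Assumes a providers config with an enabled entry named "ollama".
print(_is_local_provider_model("ollama/llama3"))   # True  -> routed to the local provider
print(_is_local_provider_model("openai/gpt-4o"))   # False -> treated as an OpenRouter model id
print(_is_local_provider_model("gpt-4o-mini"))     # False -> no prefix, direct OpenAI
```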
```diff
@@ -31,7 +57,12 @@ class LlmPromptAgent(BaseAgent):
         """
         super().__init__(config)
         self._client: Any = None
-        self.
+        self._local_provider: Any = None
+
+        # Check for local provider first
+        self._is_local = _is_local_provider_model(config.model or "")
+        self._is_openrouter = not self._is_local and "/" in (config.model or "")
+
         # Token usage tracking
         self._total_input_tokens = 0
         self._total_output_tokens = 0
@@ -39,6 +70,9 @@ class LlmPromptAgent(BaseAgent):
     @property
     def api_key(self) -> str:
         """Get the appropriate API key based on model type."""
+        if self._is_local:
+            # Local providers may not need an API key, or it's in the provider config
+            return ""
         if self._is_openrouter:
             return os.getenv("OPENROUTER_API_KEY", "")
         return os.getenv("OPENAI_API_KEY", "")
@@ -49,15 +83,46 @@
         if self._client is None:
             from openai import OpenAI
 
-            if self.
-
-            self.
-
-
-            )
-
-
-
+            if self._is_local:
+                # Get local provider and create client pointing to it
+                provider_name = (self.config.model or "").split("/")[0]
+                from sandboxy.providers.config import load_providers_config
+
+                config = load_providers_config()
+                provider_config = config.get_provider(provider_name)
+
+                if provider_config:
+                    logger.debug(
+                        "Initializing local client for %s at %s",
+                        provider_name,
+                        provider_config.base_url,
+                    )
+                    headers = {}
+                    if provider_config.api_key:
+                        headers["Authorization"] = f"Bearer {provider_config.api_key}"
+
+                    self._client = OpenAI(
+                        api_key=provider_config.api_key or "not-needed",
+                        base_url=provider_config.base_url,
+                        default_headers=headers if headers else None,
+                    )
+                else:
+                    logger.warning(
+                        "Local provider %s not found, falling back to OpenRouter", provider_name
+                    )
+                    self._is_local = False
+                    self._is_openrouter = True
+
+            if self._client is None:  # Not set by local provider path
+                if self._is_openrouter:
+                    logger.debug("Initializing OpenRouter client for model: %s", self.config.model)
+                    self._client = OpenAI(
+                        api_key=self.api_key,
+                        base_url="https://openrouter.ai/api/v1",
+                    )
+                else:
+                    logger.debug("Initializing OpenAI client for model: %s", self.config.model)
+                    self._client = OpenAI(api_key=self.api_key)
         return self._client
 
     def step(
```
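The local branch reuses the stock `openai` client and only swaps the base URL, which is the standard way to talk to any OpenAI-compatible server. A minimal standalone sketch of that pattern, assuming an Ollama server on its default port; the URL and model name are assumptions, not values read from sandboxy's provider config:

```python
from openai import OpenAI

# Ollama, LM Studio and vLLM all expose an OpenAI-compatible /v1 API, so the
# regular client works once base_url is overridden. "not-needed" mirrors the
# placeholder key the diff uses when a provider has no api_key configured.
client = OpenAI(api_key="not-needed", base_url="http://localhost:11434/v1")

resp = client.chat.completions.create(
    model="llama3",  # provider prefix already stripped, as the agent now does before calling the API
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(resp.choices[0].message.content)
```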
```diff
@@ -66,7 +131,8 @@ class LlmPromptAgent(BaseAgent):
         available_tools: list[dict[str, Any]] | None = None,
     ) -> AgentAction:
         """Process conversation and return next action using LLM."""
-
+        # Local providers don't require an API key
+        if not self._is_local and not self.api_key:
             return self._stub_response(history)
 
         messages = self._build_messages(history)
@@ -188,8 +254,13 @@
         messages: list[dict[str, Any]],
         tools: list[dict[str, Any]] | None,
     ) -> Any:
-        """Make API call to OpenAI/OpenRouter."""
+        """Make API call to OpenAI/OpenRouter/Local provider."""
         model = self.config.model or "gpt-4o-mini"
+
+        # For local providers, strip the provider prefix (e.g., "ollama/llama3" -> "llama3")
+        if self._is_local and "/" in model:
+            model = model.split("/", 1)[1]
+
         kwargs: dict[str, Any] = {
             "model": model,
             "messages": messages,
```
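The prefix strip uses `split("/", 1)` rather than `split("/")`, so only the leading provider segment is removed and model ids that themselves contain slashes stay intact. Illustrative ids only (both hypothetical):

```python
print("ollama/llama3".split("/", 1)[1])                 # llama3
print("lmstudio/TheBloke/Mistral-7B".split("/", 1)[1])  # TheBloke/Mistral-7B
```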
sandboxy/api/app.py
CHANGED
```diff
@@ -58,12 +58,13 @@ def create_local_app(
     )
 
     # Local routes only
-    from sandboxy.api.routes import agents, tools
+    from sandboxy.api.routes import agents, providers, tools
     from sandboxy.api.routes import local as local_routes
 
     app.include_router(local_routes.router, prefix="/api/v1", tags=["local"])
     app.include_router(agents.router, prefix="/api/v1", tags=["agents"])
     app.include_router(tools.router, prefix="/api/v1", tags=["tools"])
+    app.include_router(providers.router, prefix="/api/v1", tags=["providers"])
 
     @app.get("/health")
     async def health_check():
```
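The new `sandboxy.api.routes.providers` module itself is not shown in this diff (the file list records it as adding 369 lines). For orientation only, the sketch below shows the shape of a module that `include_router` can mount this way; the route path and handler are hypothetical, not the package's actual endpoints:

```python
from fastapi import APIRouter

router = APIRouter()

@router.get("/providers")  # would be served at /api/v1/providers via include_router(..., prefix="/api/v1")
async def list_providers() -> list[dict]:
    # Placeholder body; the real module presumably reads the local providers config.
    return []
```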
sandboxy/api/routes/local.py
CHANGED
```diff
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
```
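Together these fields let a caller opt into MLflow export per request. A sketch of a `RunScenarioRequest` body using the new fields; the scenario id, model, tracking URI, and experiment name are placeholders, and the endpoint path that accepts this body is defined elsewhere in local.py and is not visible in this diff:

```python
payload = {
    "scenario_id": "my-scenario",                     # hypothetical
    "model": "ollama/llama3",                         # hypothetical; "provider/" prefix triggers local routing
    "variables": {},
    "max_turns": 20,
    "max_tokens": 1024,
    "temperature": 0.7,
    "mlflow_export": True,
    "mlflow_tracking_uri": "http://localhost:5000",   # assumption: a local MLflow server
    "mlflow_experiment": "sandboxy-runs",             # hypothetical experiment name
    "mlflow_tracing": True,
}
```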
```diff
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-
-
-
-
-
-
-
-
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
```
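In plain MLflow terms, the `mlflow_run_context` wrapper boils down to picking a tracking server and an experiment and opening a run that later logging calls attach to. A rough standalone equivalent with placeholder values; sandboxy's own helper may do more, for example tagging and artifact handling:

```python
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")  # assumption: local MLflow server
mlflow.set_experiment("my-scenario")              # hypothetical experiment name

with mlflow.start_run(run_name="ollama/llama3") as run:
    run_id = run.info.run_id
    # ... run the scenario here, then log what the exporter would log ...
    mlflow.log_metric("latency_ms", 1234)         # placeholder value
```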
```diff
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
```
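The body of `calculate_cost` is not part of this diff, but given the per-million-token pricing fields carried on `OPENROUTER_MODELS` entries (seen further down), the arithmetic is presumably of the shape below; the prices and token counts are made up:

```python
input_cost_per_million = 0.15    # USD per 1M input tokens (illustrative)
output_cost_per_million = 0.60   # USD per 1M output tokens (illustrative)
input_tokens, output_tokens = 12_000, 3_500

cost_usd = (input_tokens / 1_000_000) * input_cost_per_million \
    + (output_tokens / 1_000_000) * output_cost_per_million
print(round(cost_usd, 6))  # 0.0039
```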
```diff
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
```
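sandboxy's `enable_tracing` helper lives in `sandboxy.mlflow.tracing` and is not shown here. Its MLflow-side effect is presumably along the lines of the sketch below, which uses MLflow's public tracing autolog for the OpenAI client; this is a guess at the mechanism, and the URI and experiment values are placeholders:

```python
import mlflow
import mlflow.openai

mlflow.set_tracking_uri("http://localhost:5000")  # assumption: local MLflow server
mlflow.set_experiment("compare-models")           # hypothetical experiment name
mlflow.openai.autolog()                           # requires a recent MLflow with tracing support
```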
```diff
@@ -538,6 +618,33 @@
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
@@ -587,10 +694,40 @@ def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> float:
 
 @router.get("/local/models")
 async def list_available_models() -> list[dict[str, Any]]:
-    """List available models from OpenRouter."""
+    """List available models from OpenRouter and local providers."""
+    from sandboxy.providers.config import get_enabled_providers
+    from sandboxy.providers.local import LocalProvider
     from sandboxy.providers.openrouter import OPENROUTER_MODELS
 
     models = []
+
+    # Add models from local providers first
+    for provider_config in get_enabled_providers():
+        try:
+            provider = LocalProvider(provider_config)
+            local_models = await provider.refresh_models()
+            await provider.close()
+
+            for model in local_models:
+                # Model ID includes provider prefix for routing
+                full_model_id = f"{provider_config.name}/{model.id}"
+                models.append(
+                    {
+                        "id": full_model_id,
+                        "name": model.name,
+                        "price": "Local",
+                        "pricing": {"input": 0, "output": 0},
+                        "provider": provider_config.name,
+                        "context_length": model.context_length,
+                        "supports_vision": model.supports_vision,
+                        "is_local": True,
+                        "provider_name": provider_config.name,
+                    }
+                )
+        except Exception as e:
+            logger.warning(f"Failed to fetch models from {provider_config.name}: {e}")
+
+    # Add OpenRouter models
     for model_id, info in OPENROUTER_MODELS.items():
         # Format price string
         if info.input_cost_per_million == 0 and info.output_cost_per_million == 0:
```
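With the router mounted under `/api/v1` (see app.py above), the merged list is served at `GET /api/v1/local/models`, with local models flagged by the new `is_local` field. A quick client-side check, assuming the local sandboxy API is running on `localhost:8000` (host and port are assumptions):

```python
import httpx

models = httpx.get("http://localhost:8000/api/v1/local/models").json()
local = [m["id"] for m in models if m.get("is_local")]
cloud = [m["id"] for m in models if not m.get("is_local")]
print(f"{len(local)} local models, {len(cloud)} OpenRouter models")
```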
```diff
@@ -610,6 +747,7 @@ async def list_available_models() -> list[dict[str, Any]]:
                 "provider": info.provider,
                 "context_length": info.context_length,
                 "supports_vision": info.supports_vision,
+                "is_local": False,
             }
         )
 
@@ -905,6 +1043,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1475,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-
-
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result
```