sandboxy 0.0.3-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry, and is provided for informational purposes only.
- sandboxy/agents/llm_prompt.py +85 -14
- sandboxy/api/app.py +2 -1
- sandboxy/api/routes/local.py +216 -20
- sandboxy/api/routes/providers.py +369 -0
- sandboxy/cli/main.py +663 -31
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +445 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/providers/__init__.py +37 -3
- sandboxy/providers/config.py +243 -0
- sandboxy/providers/local.py +498 -0
- sandboxy/providers/registry.py +107 -13
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/scenarios/unified.py +27 -3
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CLxxjJuD.js +367 -0
- sandboxy/ui/dist/assets/index-DBB7ehs6.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/METADATA +103 -27
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/RECORD +28 -18
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/licenses/LICENSE +0 -0
sandboxy/scenarios/loader.py
CHANGED

@@ -28,6 +28,25 @@ class StepSpec(BaseModel):
     params: dict[str, Any] = Field(default_factory=dict)
 
 
+class MLflowYamlConfig(BaseModel):
+    """MLflow configuration from scenario YAML.
+
+    Example:
+        mlflow:
+          enabled: true
+          experiment: "agent-evals"
+          tracking_uri: "http://localhost:5000"
+          tags:
+            team: "support-agents"
+            environment: "staging"
+    """
+
+    enabled: bool = False
+    experiment: str | None = None
+    tracking_uri: str | None = None
+    tags: dict[str, str] = Field(default_factory=dict)
+
+
 class McpServerSpec(BaseModel):
     """Specification for an MCP server connection.
 
@@ -74,9 +93,12 @@ class ScenarioSpec(BaseModel):
 
     # Evaluation
    goals: list[GoalSpec] = Field(default_factory=list)
-    evaluation: list[dict[str, Any]] = Field(default_factory=list)
+    evaluation: list[dict[str, Any]] | dict[str, Any] = Field(default_factory=list)
     scoring: dict[str, Any] = Field(default_factory=dict)
 
+    # MLflow integration (optional)
+    mlflow: MLflowYamlConfig | None = None
+
 
 def load_scenario(path: Path) -> ScenarioSpec:
     """Load a scenario from a YAML file.
@@ -148,7 +170,14 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
 
     # Parse goals
     goals: list[GoalSpec] = []
-    for g in raw.get("goals", []):
+    goals_raw = raw.get("goals", [])
+
+    # Also check for goals nested inside evaluation dict
+    evaluation_raw = raw.get("evaluation", [])
+    if isinstance(evaluation_raw, dict) and "goals" in evaluation_raw:
+        goals_raw = evaluation_raw.get("goals", [])
+
+    for g in goals_raw:
         goals.append(
             GoalSpec(
                 id=g.get("id", f"goal_{len(goals)}"),
@@ -159,6 +188,17 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
             )
         )
 
+    # Parse MLflow config if present
+    mlflow_config: MLflowYamlConfig | None = None
+    if "mlflow" in raw and isinstance(raw["mlflow"], dict):
+        mlflow_raw = raw["mlflow"]
+        mlflow_config = MLflowYamlConfig(
+            enabled=mlflow_raw.get("enabled", False),
+            experiment=mlflow_raw.get("experiment"),
+            tracking_uri=mlflow_raw.get("tracking_uri"),
+            tags=mlflow_raw.get("tags", {}),
+        )
+
     return ScenarioSpec(
         id=raw.get("id", "unnamed"),
         name=raw.get("name", raw.get("id", "Unnamed Scenario")),
@@ -174,6 +214,7 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
         goals=goals,
         evaluation=raw.get("evaluation", []),
         scoring=raw.get("scoring", {}),
+        mlflow=mlflow_config,
     )
 
 
@@ -259,4 +300,5 @@ def apply_scenario_variables(spec: ScenarioSpec, variables: dict[str, Any]) -> ScenarioSpec:
         goals=spec.goals,
         evaluation=spec.evaluation,
         scoring=spec.scoring,
+        mlflow=spec.mlflow,  # Preserve MLflow config
     )
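
Taken together, these hunks let a scenario nest its goals under the evaluation mapping and attach an optional mlflow block, both of which parse_scenario now picks up. A minimal sketch of a scenario exercising the new paths (assumes PyYAML is available; goal keys beyond "id" are illustrative, since the rest of the GoalSpec parsing sits outside these hunks):

# Sketch: parsing a scenario that uses nested goals and the mlflow block.
import yaml

from sandboxy.scenarios.loader import parse_scenario

raw = yaml.safe_load(
    """
id: refund-flow
evaluation:
  goals:                      # picked up via evaluation["goals"]
    - id: refund_issued
mlflow:
  enabled: true
  experiment: "agent-evals"
  tracking_uri: "http://localhost:5000"
  tags:
    team: "support-agents"
"""
)

spec = parse_scenario(raw)
print([g.id for g in spec.goals])              # ['refund_issued']
print(spec.mlflow and spec.mlflow.experiment)  # 'agent-evals'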
sandboxy/scenarios/runner.py
CHANGED

@@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
+import time
 from pathlib import Path
 from typing import Any
 
@@ -26,6 +27,16 @@ class ScenarioEvent(BaseModel):
     payload: dict[str, Any] = Field(default_factory=dict)
 
 
+class GoalResult(BaseModel):
+    """Result of evaluating a single goal."""
+
+    id: str
+    name: str
+    achieved: bool
+    points: int
+    reason: str = ""
+
+
 class ScenarioResult(BaseModel):
     """Result of running a scenario."""
 
@@ -35,7 +46,12 @@ class ScenarioResult(BaseModel):
     tool_calls: list[dict[str, Any]] = Field(default_factory=list)
     final_state: dict[str, Any] = Field(default_factory=dict)
     goals_achieved: list[str] = Field(default_factory=list)
+    goal_results: list[GoalResult] = Field(default_factory=list)
     score: float = 0.0
+    max_score: float = 0.0
+    latency_ms: int = 0
+    input_tokens: int = 0
+    output_tokens: int = 0
 
     def to_json(self, indent: int | None = None) -> str:
         """Serialize result to JSON string."""
@@ -70,7 +86,11 @@ class ScenarioResult(BaseModel):
         lines.append("")
         lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
         lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
-        lines.append(f"Score: {self.score}")
+        lines.append(f"Score: {self.score}/{self.max_score}")
+        if self.latency_ms:
+            lines.append(f"Latency: {self.latency_ms}ms")
+        if self.input_tokens or self.output_tokens:
+            lines.append(f"Tokens: {self.input_tokens} in / {self.output_tokens} out")
 
         return "\n".join(lines)
 
@@ -176,6 +196,8 @@ class ScenarioRunner:
         Returns:
             ScenarioResult with events and evaluation
         """
+        start_time = time.perf_counter()
+
         try:
             # Load MCP tools if configured
             await self._load_mcp_tools()
@@ -188,9 +210,21 @@ class ScenarioRunner:
             for step in self.scenario.steps:
                 await self._execute_step(step, max_turns)
 
-            # Evaluate goals
+            # Evaluate goals and build detailed results
             goals_achieved = self._evaluate_goals()
+            goal_results = self._build_goal_results(goals_achieved)
             score = self._compute_score(goals_achieved)
+            max_score = sum(g.points for g in self.scenario.goals)
+
+            # Get token usage from agent if available
+            input_tokens = 0
+            output_tokens = 0
+            if hasattr(self.agent, "get_usage"):
+                usage = self.agent.get_usage()
+                input_tokens = usage.get("input_tokens", 0)
+                output_tokens = usage.get("output_tokens", 0)
+
+            latency_ms = int((time.perf_counter() - start_time) * 1000)
 
             return ScenarioResult(
                 scenario_id=self.scenario.id,
@@ -199,7 +233,12 @@ class ScenarioRunner:
                 tool_calls=self.tool_call_log,
                 final_state=self.env_state.copy(),
                 goals_achieved=goals_achieved,
+                goal_results=goal_results,
                 score=score,
+                max_score=max_score,
+                latency_ms=latency_ms,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
             )
         finally:
             await self._cleanup_mcp()
@@ -440,6 +479,22 @@ class ScenarioRunner:
 
         return list(set(achieved))  # Deduplicate
 
+    def _build_goal_results(self, goals_achieved: list[str]) -> list[GoalResult]:
+        """Build detailed goal results for MLflow logging."""
+        results = []
+        for goal in self.scenario.goals:
+            achieved = goal.id in goals_achieved
+            results.append(
+                GoalResult(
+                    id=goal.id,
+                    name=goal.name or goal.id,
+                    achieved=achieved,
+                    points=goal.points if achieved else 0,
+                    reason="Goal achieved" if achieved else "Goal not achieved",
+                )
+            )
+        return results
+
     def _compute_score(self, goals_achieved: list[str]) -> float:
         """Compute score based on achieved goals."""
         from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
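
Token counts flow in through a duck-typed get_usage() on the agent, so agents without it simply report zeros. A self-contained sketch of the enriched result model (values are invented; fields of ScenarioResult not shown in these hunks are assumed to have defaults):

# Sketch: the richer ScenarioResult populated by hand.
from sandboxy.scenarios.runner import GoalResult, ScenarioResult

result = ScenarioResult(
    scenario_id="refund-flow",
    goals_achieved=["refund_issued"],
    goal_results=[
        GoalResult(id="refund_issued", name="Refund issued",
                   achieved=True, points=10, reason="Goal achieved"),
        GoalResult(id="no_escalation", name="No escalation",
                   achieved=False, points=0, reason="Goal not achieved"),
    ],
    score=10.0,
    max_score=15.0,
    latency_ms=2310,
    input_tokens=1450,
    output_tokens=380,
)

# pretty() now reports "Score: 10.0/15.0", "Latency: 2310ms",
# and "Tokens: 1450 in / 380 out" alongside the existing lines.
print(result.pretty())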
sandboxy/scenarios/unified.py
CHANGED

@@ -292,6 +292,9 @@ class RunResult:
     cost_usd: float | None = None
     error: str | None = None
     created_at: datetime = field(default_factory=datetime.now)
+    # Provider info for distinguishing local vs cloud
+    is_local: bool = False
+    provider_name: str | None = None
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary."""
@@ -324,6 +327,8 @@ class RunResult:
             "cost_usd": self.cost_usd,
             "error": self.error,
             "created_at": self.created_at.isoformat(),
+            "is_local": self.is_local,
+            "provider_name": self.provider_name,
         }
 
     def to_json(self, indent: int | None = 2) -> str:
@@ -332,9 +337,15 @@ class RunResult:
 
     def pretty(self) -> str:
         """Format for human-readable display."""
+        model_display = self.model
+        if self.is_local:
+            model_display += " (local)"
+        elif self.provider_name:
+            model_display += f" ({self.provider_name})"
+
         lines = [
             f"Scenario: {self.scenario_id}",
-            f"Model: {self.model}",
+            f"Model: {model_display}",
             f"Latency: {self.latency_ms}ms",
         ]
 
@@ -516,6 +527,10 @@ class UnifiedRunner:
             temperature=temperature,
         )
 
+        # Detect if this is a local provider
+        is_local = hasattr(provider, "config")  # LocalProvider has config attribute
+        provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
         return RunResult(
             id="",  # Set by caller
             scenario_id=scenario.id,
@@ -528,7 +543,9 @@ class UnifiedRunner:
             ],
             input_tokens=response.input_tokens,
             output_tokens=response.output_tokens,
-            cost_usd=response.cost_usd,
+            cost_usd=response.cost_usd if not is_local else 0.0,  # Local models have no cost
+            is_local=is_local,
+            provider_name=provider_name,
        )
 
     async def _run_multi_turn(
@@ -773,6 +790,11 @@ class UnifiedRunner:
             # Calculate cost from token counts
             cost_usd = self._calculate_cost(model, input_tokens, output_tokens)
 
+            # Detect if this is a local provider
+            provider = self.registry.get_provider_for_model(model)
+            is_local = hasattr(provider, "config")
+            provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
             return RunResult(
                 id="",
                 scenario_id=scenario.id,
@@ -783,7 +805,9 @@ class UnifiedRunner:
                 final_state=env_state,
                 input_tokens=input_tokens,
                 output_tokens=output_tokens,
-                cost_usd=cost_usd,
+                cost_usd=cost_usd if not is_local else 0.0,
+                is_local=is_local,
+                provider_name=provider_name,
             )
 
         finally:
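
Local-provider detection is duck-typed: any provider exposing a config attribute is treated as local (and billed at 0.0), and provider_name is read when present. A standalone sketch of that check with stand-in classes (not sandboxy's real providers):

# Sketch: the hasattr-based detection, mirrored outside sandboxy.
class LocalProvider:
    def __init__(self) -> None:
        self.config = {"base_url": "http://localhost:11434"}  # hypothetical
        self.provider_name = "ollama"


class CloudProvider:
    provider_name = "openai"


def describe(provider: object, model: str) -> str:
    is_local = hasattr(provider, "config")  # same check as in UnifiedRunner
    name = getattr(provider, "provider_name", None)
    if is_local:
        return f"{model} (local)"
    return f"{model} ({name})" if name else model


print(describe(LocalProvider(), "llama3.1:8b"))  # llama3.1:8b (local)
print(describe(CloudProvider(), "gpt-4o-mini"))  # gpt-4o-mini (openai)

The check is purely structural, so it depends on cloud providers never growing a config attribute; the inline "LocalProvider has config attribute" comment is load-bearing.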
sandboxy/tools/yaml_tools.py
CHANGED

@@ -288,6 +288,17 @@ class YamlMockTool:
         self.config = config.config
         self.spec = spec
         self._call_log: list[dict[str, Any]] = []
+        self._overrides: dict[str, Any] = {}
+
+    def set_overrides(self, overrides: dict[str, Any]) -> None:
+        """Set response overrides for dataset benchmarking.
+
+        Args:
+            overrides: Dict mapping "tool.action" or "tool" to override response data.
+                When a matching action is called, returns the override data
+                instead of the normal mock response.
+        """
+        self._overrides = overrides
 
     @property
     def call_log(self) -> list[dict[str, Any]]:
@@ -347,6 +358,13 @@ class YamlMockTool:
         for effect in action_spec.side_effects:
             effect.apply(env_state, validated_args)
 
+        # Check for override (dataset benchmarking)
+        override_key = f"{self.name}.{action}"
+        if override_key in self._overrides:
+            return ToolResult(success=True, data=self._overrides[override_key])
+        if self.name in self._overrides:
+            return ToolResult(success=True, data=self._overrides[self.name])
+
         # Compute return value
         result_value = self._compute_return(action_spec, validated_args, env_state)
 
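
The override lookup prefers an exact "tool.action" key and falls back to a whole-tool key, and it runs after side effects are applied, so pinned responses still mutate env_state. A standalone sketch of the resolution order (resolve_override is an illustrative helper, not part of sandboxy):

# Sketch: override resolution order, exact key before whole-tool key.
from typing import Any

def resolve_override(overrides: dict[str, Any], tool: str, action: str) -> Any | None:
    key = f"{tool}.{action}"
    if key in overrides:
        return overrides[key]
    if tool in overrides:
        return overrides[tool]
    return None

overrides = {"crm.lookup_customer": {"tier": "gold"}, "weather": {"temp_c": 21}}
print(resolve_override(overrides, "crm", "lookup_customer"))  # {'tier': 'gold'}
print(resolve_override(overrides, "weather", "forecast"))     # {'temp_c': 21}
print(resolve_override(overrides, "crm", "delete_customer"))  # None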