sandboxy-0.0.2-py3-none-any.whl → sandboxy-0.0.4-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/api/routes/local.py +182 -19
- sandboxy/cli/main.py +530 -174
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +439 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
- sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/METADATA +37 -1
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/RECORD +20 -13
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.2.dist-info → sandboxy-0.0.4.dist-info}/licenses/LICENSE +0 -0
sandboxy/mlflow/tracing.py
ADDED
@@ -0,0 +1,126 @@
+"""MLflow tracing support for Sandboxy.
+
+Enables automatic tracing of LLM calls using MLflow's autolog feature.
+When enabled, all OpenAI SDK calls are automatically captured as spans
+within the MLflow run, providing detailed visibility into:
+- Each LLM call (prompt, response, latency, tokens)
+- Tool/function calls made by the LLM
+- The full execution flow
+"""
+
+from __future__ import annotations
+
+import logging
+from collections.abc import Generator
+from contextlib import contextmanager
+from typing import TYPE_CHECKING
+
+if TYPE_CHECKING:
+    pass
+
+logger = logging.getLogger(__name__)
+
+_tracing_enabled = False
+
+
+def enable_tracing(
+    tracking_uri: str | None = None,
+    experiment_name: str | None = None,
+) -> bool:
+    """Enable MLflow tracing for OpenAI calls.
+
+    This should be called once before any LLM calls are made.
+    It enables MLflow's autolog feature which automatically
+    captures all OpenAI SDK calls as traces.
+
+    Args:
+        tracking_uri: MLflow tracking server URI (uses env var if not set)
+        experiment_name: Experiment to log traces to
+
+    Returns:
+        True if tracing was enabled successfully, False otherwise
+    """
+    global _tracing_enabled
+
+    if _tracing_enabled:
+        return True
+
+    try:
+        import os
+
+        import mlflow
+
+        # Set tracking URI before enabling autolog
+        uri = tracking_uri or os.environ.get("MLFLOW_TRACKING_URI")
+        if uri:
+            mlflow.set_tracking_uri(uri)
+
+        # Set experiment before enabling autolog
+        if experiment_name:
+            mlflow.set_experiment(experiment_name)
+
+        # Enable OpenAI autologging - this captures all OpenAI calls as traces
+        mlflow.openai.autolog()
+
+        _tracing_enabled = True
+        logger.debug("MLflow tracing enabled for OpenAI")
+        return True
+
+    except ImportError as e:
+        logger.warning(f"MLflow or OpenAI not installed, tracing disabled: {e}")
+        return False
+    except Exception as e:
+        logger.warning(f"Failed to enable MLflow tracing: {e}")
+        return False
+
+
+def disable_tracing() -> None:
+    """Disable MLflow tracing."""
+    global _tracing_enabled
+
+    if not _tracing_enabled:
+        return
+
+    try:
+        import mlflow
+
+        mlflow.openai.autolog(disable=True)
+        _tracing_enabled = False
+        logger.debug("MLflow tracing disabled")
+
+    except Exception as e:
+        logger.warning(f"Failed to disable MLflow tracing: {e}")
+
+
+@contextmanager
+def trace_span(name: str, span_type: str = "CHAIN") -> Generator[None, None, None]:
+    """Create a manual trace span for non-LLM operations.
+
+    Use this to wrap tool calls, scenario steps, or other operations
+    you want to appear in the trace.
+
+    Args:
+        name: Name of the span (e.g., "tool_call:get_account_activity")
+        span_type: Type of span (CHAIN, TOOL, RETRIEVER, etc.)
+
+    Example:
+        with trace_span("tool_call:search", span_type="TOOL"):
+            result = execute_tool(...)
+    """
+    try:
+        import mlflow
+
+        with mlflow.start_span(name=name, span_type=span_type):
+            yield
+
+    except ImportError:
+        # MLflow not installed, just run without tracing
+        yield
+    except Exception as e:
+        logger.debug(f"Tracing span failed: {e}")
+        yield
+
+
+def is_tracing_enabled() -> bool:
+    """Check if tracing is currently enabled."""
+    return _tracing_enabled
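For orientation, a minimal usage sketch of the new module (not part of the diff). It uses only the functions defined above; the tracking URI and experiment name are reused from the MLflow example elsewhere in this diff, and the work inside the span is a hypothetical stand-in.

from sandboxy.mlflow.tracing import enable_tracing, is_tracing_enabled, trace_span

# Enable OpenAI autologging once, before any LLM calls are made.
if enable_tracing(tracking_uri="http://localhost:5000", experiment_name="agent-evals"):
    print("tracing enabled:", is_tracing_enabled())

# Wrap a non-LLM operation so it appears as its own span in the trace.
with trace_span("tool_call:search", span_type="TOOL"):
    results = {"hits": []}  # hypothetical tool work happens here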
sandboxy/scenarios/loader.py
CHANGED
@@ -28,6 +28,25 @@ class StepSpec(BaseModel):
     params: dict[str, Any] = Field(default_factory=dict)
 
 
+class MLflowYamlConfig(BaseModel):
+    """MLflow configuration from scenario YAML.
+
+    Example:
+        mlflow:
+          enabled: true
+          experiment: "agent-evals"
+          tracking_uri: "http://localhost:5000"
+          tags:
+            team: "support-agents"
+            environment: "staging"
+    """
+
+    enabled: bool = False
+    experiment: str | None = None
+    tracking_uri: str | None = None
+    tags: dict[str, str] = Field(default_factory=dict)
+
+
 class McpServerSpec(BaseModel):
     """Specification for an MCP server connection.
 
@@ -74,9 +93,12 @@ class ScenarioSpec(BaseModel):
 
     # Evaluation
     goals: list[GoalSpec] = Field(default_factory=list)
-    evaluation: list[dict[str, Any]] = Field(default_factory=list)
+    evaluation: list[dict[str, Any]] | dict[str, Any] = Field(default_factory=list)
     scoring: dict[str, Any] = Field(default_factory=dict)
 
+    # MLflow integration (optional)
+    mlflow: MLflowYamlConfig | None = None
+
 
 def load_scenario(path: Path) -> ScenarioSpec:
     """Load a scenario from a YAML file.
@@ -148,7 +170,14 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
 
     # Parse goals
     goals: list[GoalSpec] = []
-    for g in raw.get("goals", []):
+    goals_raw = raw.get("goals", [])
+
+    # Also check for goals nested inside evaluation dict
+    evaluation_raw = raw.get("evaluation", [])
+    if isinstance(evaluation_raw, dict) and "goals" in evaluation_raw:
+        goals_raw = evaluation_raw.get("goals", [])
+
+    for g in goals_raw:
         goals.append(
             GoalSpec(
                 id=g.get("id", f"goal_{len(goals)}"),
@@ -159,6 +188,17 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
             )
         )
 
+    # Parse MLflow config if present
+    mlflow_config: MLflowYamlConfig | None = None
+    if "mlflow" in raw and isinstance(raw["mlflow"], dict):
+        mlflow_raw = raw["mlflow"]
+        mlflow_config = MLflowYamlConfig(
+            enabled=mlflow_raw.get("enabled", False),
+            experiment=mlflow_raw.get("experiment"),
+            tracking_uri=mlflow_raw.get("tracking_uri"),
+            tags=mlflow_raw.get("tags", {}),
+        )
+
     return ScenarioSpec(
         id=raw.get("id", "unnamed"),
         name=raw.get("name", raw.get("id", "Unnamed Scenario")),
@@ -174,6 +214,7 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
         goals=goals,
         evaluation=raw.get("evaluation", []),
         scoring=raw.get("scoring", {}),
+        mlflow=mlflow_config,
     )
 
 
@@ -259,4 +300,5 @@ def apply_scenario_variables(spec: ScenarioSpec, variables: dict[str, Any]) -> ScenarioSpec:
         goals=spec.goals,
         evaluation=spec.evaluation,
         scoring=spec.scoring,
+        mlflow=spec.mlflow,  # Preserve MLflow config
     )
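As a quick illustration of the new schema (a sketch, not part of the diff): the mlflow block from the MLflowYamlConfig docstring above maps onto the model like this, with the same field and tag values.

from sandboxy.scenarios.loader import MLflowYamlConfig

# Equivalent of the YAML example in the MLflowYamlConfig docstring above.
cfg = MLflowYamlConfig(
    enabled=True,
    experiment="agent-evals",
    tracking_uri="http://localhost:5000",
    tags={"team": "support-agents", "environment": "staging"},
)
assert cfg.enabled and cfg.tags["team"] == "support-agents"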
sandboxy/scenarios/runner.py
CHANGED
@@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
+import time
 from pathlib import Path
 from typing import Any
 
@@ -26,6 +27,16 @@ class ScenarioEvent(BaseModel):
     payload: dict[str, Any] = Field(default_factory=dict)
 
 
+class GoalResult(BaseModel):
+    """Result of evaluating a single goal."""
+
+    id: str
+    name: str
+    achieved: bool
+    points: int
+    reason: str = ""
+
+
 class ScenarioResult(BaseModel):
     """Result of running a scenario."""
 
@@ -35,7 +46,12 @@ class ScenarioResult(BaseModel):
     tool_calls: list[dict[str, Any]] = Field(default_factory=list)
     final_state: dict[str, Any] = Field(default_factory=dict)
     goals_achieved: list[str] = Field(default_factory=list)
+    goal_results: list[GoalResult] = Field(default_factory=list)
     score: float = 0.0
+    max_score: float = 0.0
+    latency_ms: int = 0
+    input_tokens: int = 0
+    output_tokens: int = 0
 
     def to_json(self, indent: int | None = None) -> str:
         """Serialize result to JSON string."""
@@ -70,7 +86,11 @@ class ScenarioResult(BaseModel):
         lines.append("")
         lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
         lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
-        lines.append(f"Score: {self.score}")
+        lines.append(f"Score: {self.score}/{self.max_score}")
+        if self.latency_ms:
+            lines.append(f"Latency: {self.latency_ms}ms")
+        if self.input_tokens or self.output_tokens:
+            lines.append(f"Tokens: {self.input_tokens} in / {self.output_tokens} out")
 
         return "\n".join(lines)
 
@@ -176,6 +196,8 @@ class ScenarioRunner:
         Returns:
             ScenarioResult with events and evaluation
         """
+        start_time = time.perf_counter()
+
         try:
             # Load MCP tools if configured
             await self._load_mcp_tools()
@@ -188,9 +210,21 @@ class ScenarioRunner:
             for step in self.scenario.steps:
                 await self._execute_step(step, max_turns)
 
-            # Evaluate goals
+            # Evaluate goals and build detailed results
             goals_achieved = self._evaluate_goals()
+            goal_results = self._build_goal_results(goals_achieved)
             score = self._compute_score(goals_achieved)
+            max_score = sum(g.points for g in self.scenario.goals)
+
+            # Get token usage from agent if available
+            input_tokens = 0
+            output_tokens = 0
+            if hasattr(self.agent, "get_usage"):
+                usage = self.agent.get_usage()
+                input_tokens = usage.get("input_tokens", 0)
+                output_tokens = usage.get("output_tokens", 0)
+
+            latency_ms = int((time.perf_counter() - start_time) * 1000)
 
             return ScenarioResult(
                 scenario_id=self.scenario.id,
@@ -199,7 +233,12 @@
                 tool_calls=self.tool_call_log,
                 final_state=self.env_state.copy(),
                 goals_achieved=goals_achieved,
+                goal_results=goal_results,
                 score=score,
+                max_score=max_score,
+                latency_ms=latency_ms,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
             )
         finally:
             await self._cleanup_mcp()
@@ -440,6 +479,22 @@
 
         return list(set(achieved))  # Deduplicate
 
+    def _build_goal_results(self, goals_achieved: list[str]) -> list[GoalResult]:
+        """Build detailed goal results for MLflow logging."""
+        results = []
+        for goal in self.scenario.goals:
+            achieved = goal.id in goals_achieved
+            results.append(
+                GoalResult(
+                    id=goal.id,
+                    name=goal.name or goal.id,
+                    achieved=achieved,
+                    points=goal.points if achieved else 0,
+                    reason="Goal achieved" if achieved else "Goal not achieved",
+                )
+            )
+        return results
+
     def _compute_score(self, goals_achieved: list[str]) -> float:
         """Compute score based on achieved goals."""
         from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
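To make the new result fields concrete, a small sketch (goal ids, names, and point values are invented for illustration) that builds GoalResult records the same way _build_goal_results does above and computes max_score as the sum of the scenario's goal points.

from sandboxy.scenarios.runner import GoalResult

goals_achieved = ["verify_identity"]
scenario_goals = [("verify_identity", "Verify identity", 5), ("issue_refund", "Issue refund", 10)]

goal_results = [
    GoalResult(
        id=gid,
        name=name,
        achieved=gid in goals_achieved,
        points=points if gid in goals_achieved else 0,
        reason="Goal achieved" if gid in goals_achieved else "Goal not achieved",
    )
    for gid, name, points in scenario_goals
]
max_score = sum(points for _, _, points in scenario_goals)  # 15 in this example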
sandboxy/tools/yaml_tools.py
CHANGED
@@ -288,6 +288,17 @@ class YamlMockTool:
         self.config = config.config
         self.spec = spec
         self._call_log: list[dict[str, Any]] = []
+        self._overrides: dict[str, Any] = {}
+
+    def set_overrides(self, overrides: dict[str, Any]) -> None:
+        """Set response overrides for dataset benchmarking.
+
+        Args:
+            overrides: Dict mapping "tool.action" or "tool" to override response data.
+                When a matching action is called, returns the override data
+                instead of the normal mock response.
+        """
+        self._overrides = overrides
 
     @property
     def call_log(self) -> list[dict[str, Any]]:
@@ -347,6 +358,13 @@ class YamlMockTool:
         for effect in action_spec.side_effects:
             effect.apply(env_state, validated_args)
 
+        # Check for override (dataset benchmarking)
+        override_key = f"{self.name}.{action}"
+        if override_key in self._overrides:
+            return ToolResult(success=True, data=self._overrides[override_key])
+        if self.name in self._overrides:
+            return ToolResult(success=True, data=self._overrides[self.name])
+
         # Compute return value
         result_value = self._compute_return(action_spec, validated_args, env_state)
 
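The override lookup added above follows a simple precedence rule: an exact "tool.action" key wins over a bare tool-name key, and anything else falls through to the normal mock response. A standalone sketch of that rule (tool and action names are invented for illustration):

overrides = {
    "crm.get_account_activity": {"events": []},  # exact "tool.action" match
    "crm": {"status": "ok"},                     # whole-tool fallback
}

def resolve_override(tool_name: str, action: str) -> dict | None:
    """Mirror of the precedence implemented in YamlMockTool above."""
    key = f"{tool_name}.{action}"
    if key in overrides:
        return overrides[key]
    if tool_name in overrides:
        return overrides[tool_name]
    return None  # fall through to the normal mock response

assert resolve_override("crm", "get_account_activity") == {"events": []}
assert resolve_override("crm", "close_ticket") == {"status": "ok"}
assert resolve_override("search", "query") is None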