sandboxy 0.0.3__py3-none-any.whl → 0.0.4__py3-none-any.whl

This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between these versions as they appear in their public registry.
@@ -0,0 +1,126 @@
+ """MLflow tracing support for Sandboxy.
+
+ Enables automatic tracing of LLM calls using MLflow's autolog feature.
+ When enabled, all OpenAI SDK calls are automatically captured as spans
+ within the MLflow run, providing detailed visibility into:
+ - Each LLM call (prompt, response, latency, tokens)
+ - Tool/function calls made by the LLM
+ - The full execution flow
+ """
+
+ from __future__ import annotations
+
+ import logging
+ from collections.abc import Generator
+ from contextlib import contextmanager
+ from typing import TYPE_CHECKING
+
+ if TYPE_CHECKING:
+     pass
+
+ logger = logging.getLogger(__name__)
+
+ _tracing_enabled = False
+
+
+ def enable_tracing(
+     tracking_uri: str | None = None,
+     experiment_name: str | None = None,
+ ) -> bool:
+     """Enable MLflow tracing for OpenAI calls.
+
+     This should be called once before any LLM calls are made.
+     It enables MLflow's autolog feature which automatically
+     captures all OpenAI SDK calls as traces.
+
+     Args:
+         tracking_uri: MLflow tracking server URI (uses env var if not set)
+         experiment_name: Experiment to log traces to
+
+     Returns:
+         True if tracing was enabled successfully, False otherwise
+     """
+     global _tracing_enabled
+
+     if _tracing_enabled:
+         return True
+
+     try:
+         import os
+
+         import mlflow
+
+         # Set tracking URI before enabling autolog
+         uri = tracking_uri or os.environ.get("MLFLOW_TRACKING_URI")
+         if uri:
+             mlflow.set_tracking_uri(uri)
+
+         # Set experiment before enabling autolog
+         if experiment_name:
+             mlflow.set_experiment(experiment_name)
+
+         # Enable OpenAI autologging - this captures all OpenAI calls as traces
+         mlflow.openai.autolog()
+
+         _tracing_enabled = True
+         logger.debug("MLflow tracing enabled for OpenAI")
+         return True
+
+     except ImportError as e:
+         logger.warning(f"MLflow or OpenAI not installed, tracing disabled: {e}")
+         return False
+     except Exception as e:
+         logger.warning(f"Failed to enable MLflow tracing: {e}")
+         return False
+
+
+ def disable_tracing() -> None:
+     """Disable MLflow tracing."""
+     global _tracing_enabled
+
+     if not _tracing_enabled:
+         return
+
+     try:
+         import mlflow
+
+         mlflow.openai.autolog(disable=True)
+         _tracing_enabled = False
+         logger.debug("MLflow tracing disabled")
+
+     except Exception as e:
+         logger.warning(f"Failed to disable MLflow tracing: {e}")
+
+
+ @contextmanager
+ def trace_span(name: str, span_type: str = "CHAIN") -> Generator[None, None, None]:
+     """Create a manual trace span for non-LLM operations.
+
+     Use this to wrap tool calls, scenario steps, or other operations
+     you want to appear in the trace.
+
+     Args:
+         name: Name of the span (e.g., "tool_call:get_account_activity")
+         span_type: Type of span (CHAIN, TOOL, RETRIEVER, etc.)
+
+     Example:
+         with trace_span("tool_call:search", span_type="TOOL"):
+             result = execute_tool(...)
+     """
+     try:
+         import mlflow
+
+         with mlflow.start_span(name=name, span_type=span_type):
+             yield
+
+     except ImportError:
+         # MLflow not installed, just run without tracing
+         yield
+     except Exception as e:
+         logger.debug(f"Tracing span failed: {e}")
+         yield
+
+
+ def is_tracing_enabled() -> bool:
+     """Check if tracing is currently enabled."""
+     return _tracing_enabled
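
The new module's public surface is enable_tracing, disable_tracing, trace_span, and is_tracing_enabled. A minimal usage sketch follows; the import path and the run_search stub are assumptions, since the diff does not show the new file's name or any caller.

    # Sketch only: the module path and run_search() are assumed, not taken from the diff.
    from sandboxy.mlflow_tracing import enable_tracing, is_tracing_enabled, trace_span


    def run_search(query: str) -> list[str]:
        # Stand-in for real tool execution.
        return []


    # Call once, before any LLM calls; traces go to the given server and experiment.
    if enable_tracing(tracking_uri="http://localhost:5000", experiment_name="agent-evals"):
        assert is_tracing_enabled()

    # Wrap a non-LLM operation so it appears as a TOOL span next to the autologged OpenAI calls.
    with trace_span("tool_call:search", span_type="TOOL"):
        results = run_search("refund policy")

Because every helper degrades to a logged warning or plain execution when mlflow is not importable, this sketch also runs, untraced, without MLflow installed.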
@@ -28,6 +28,25 @@ class StepSpec(BaseModel):
      params: dict[str, Any] = Field(default_factory=dict)
 
 
+ class MLflowYamlConfig(BaseModel):
+     """MLflow configuration from scenario YAML.
+
+     Example:
+         mlflow:
+           enabled: true
+           experiment: "agent-evals"
+           tracking_uri: "http://localhost:5000"
+           tags:
+             team: "support-agents"
+             environment: "staging"
+     """
+
+     enabled: bool = False
+     experiment: str | None = None
+     tracking_uri: str | None = None
+     tags: dict[str, str] = Field(default_factory=dict)
+
+
  class McpServerSpec(BaseModel):
      """Specification for an MCP server connection.
 
@@ -74,9 +93,12 @@ class ScenarioSpec(BaseModel):
 
      # Evaluation
      goals: list[GoalSpec] = Field(default_factory=list)
-     evaluation: list[dict[str, Any]] = Field(default_factory=list)
+     evaluation: list[dict[str, Any]] | dict[str, Any] = Field(default_factory=list)
      scoring: dict[str, Any] = Field(default_factory=dict)
 
+     # MLflow integration (optional)
+     mlflow: MLflowYamlConfig | None = None
+
 
  def load_scenario(path: Path) -> ScenarioSpec:
      """Load a scenario from a YAML file.
@@ -148,7 +170,14 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
 
      # Parse goals
      goals: list[GoalSpec] = []
-     for g in raw.get("goals", []):
+     goals_raw = raw.get("goals", [])
+
+     # Also check for goals nested inside evaluation dict
+     evaluation_raw = raw.get("evaluation", [])
+     if isinstance(evaluation_raw, dict) and "goals" in evaluation_raw:
+         goals_raw = evaluation_raw.get("goals", [])
+
+     for g in goals_raw:
          goals.append(
              GoalSpec(
                  id=g.get("id", f"goal_{len(goals)}"),
@@ -159,6 +188,17 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
              )
          )
 
+     # Parse MLflow config if present
+     mlflow_config: MLflowYamlConfig | None = None
+     if "mlflow" in raw and isinstance(raw["mlflow"], dict):
+         mlflow_raw = raw["mlflow"]
+         mlflow_config = MLflowYamlConfig(
+             enabled=mlflow_raw.get("enabled", False),
+             experiment=mlflow_raw.get("experiment"),
+             tracking_uri=mlflow_raw.get("tracking_uri"),
+             tags=mlflow_raw.get("tags", {}),
+         )
+
      return ScenarioSpec(
          id=raw.get("id", "unnamed"),
          name=raw.get("name", raw.get("id", "Unnamed Scenario")),
@@ -174,6 +214,7 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
          goals=goals,
          evaluation=raw.get("evaluation", []),
          scoring=raw.get("scoring", {}),
+         mlflow=mlflow_config,
      )
 
 
@@ -259,4 +300,5 @@ def apply_scenario_variables(spec: ScenarioSpec, variables: dict[str, Any]) -> S
          goals=spec.goals,
          evaluation=spec.evaluation,
          scoring=spec.scoring,
+         mlflow=spec.mlflow,  # Preserve MLflow config
      )
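
Taken together, the spec changes accept evaluation as either a list or a dict, lift goals nested under an evaluation dict, and carry an optional mlflow block through parse_scenario and apply_scenario_variables. A sketch of the new parse paths, assuming the same import location as above and abbreviating the scenario (a real file would also define its agent, tools, and steps):

    # Assumed import path; goal keys beyond "id" follow GoalSpec usage elsewhere in the diff.
    from sandboxy.scenario import parse_scenario

    raw = {
        "id": "refund-flow",  # illustrative scenario, heavily abbreviated
        "evaluation": {       # dict form is now accepted; goals may be nested here
            "goals": [{"id": "refund_issued", "name": "Refund issued", "points": 5}],
        },
        "mlflow": {
            "enabled": True,
            "experiment": "agent-evals",
            "tracking_uri": "http://localhost:5000",
            "tags": {"team": "support-agents"},
        },
    }

    spec = parse_scenario(raw)
    assert spec.goals and spec.goals[0].id == "refund_issued"
    assert spec.mlflow is not None and spec.mlflow.enabled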
@@ -5,6 +5,7 @@ from __future__ import annotations
  import asyncio
  import json
  import logging
+ import time
  from pathlib import Path
  from typing import Any
 
@@ -26,6 +27,16 @@ class ScenarioEvent(BaseModel):
      payload: dict[str, Any] = Field(default_factory=dict)
 
 
+ class GoalResult(BaseModel):
+     """Result of evaluating a single goal."""
+
+     id: str
+     name: str
+     achieved: bool
+     points: int
+     reason: str = ""
+
+
  class ScenarioResult(BaseModel):
      """Result of running a scenario."""
 
@@ -35,7 +46,12 @@ class ScenarioResult(BaseModel):
      tool_calls: list[dict[str, Any]] = Field(default_factory=list)
      final_state: dict[str, Any] = Field(default_factory=dict)
      goals_achieved: list[str] = Field(default_factory=list)
+     goal_results: list[GoalResult] = Field(default_factory=list)
      score: float = 0.0
+     max_score: float = 0.0
+     latency_ms: int = 0
+     input_tokens: int = 0
+     output_tokens: int = 0
 
      def to_json(self, indent: int | None = None) -> str:
          """Serialize result to JSON string."""
@@ -70,7 +86,11 @@ class ScenarioResult(BaseModel):
          lines.append("")
          lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
          lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
-         lines.append(f"Score: {self.score}")
+         lines.append(f"Score: {self.score}/{self.max_score}")
+         if self.latency_ms:
+             lines.append(f"Latency: {self.latency_ms}ms")
+         if self.input_tokens or self.output_tokens:
+             lines.append(f"Tokens: {self.input_tokens} in / {self.output_tokens} out")
 
          return "\n".join(lines)
 
@@ -176,6 +196,8 @@ class ScenarioRunner:
          Returns:
              ScenarioResult with events and evaluation
          """
+         start_time = time.perf_counter()
+
          try:
              # Load MCP tools if configured
              await self._load_mcp_tools()
@@ -188,9 +210,21 @@ class ScenarioRunner:
              for step in self.scenario.steps:
                  await self._execute_step(step, max_turns)
 
-             # Evaluate goals
+             # Evaluate goals and build detailed results
              goals_achieved = self._evaluate_goals()
+             goal_results = self._build_goal_results(goals_achieved)
              score = self._compute_score(goals_achieved)
+             max_score = sum(g.points for g in self.scenario.goals)
+
+             # Get token usage from agent if available
+             input_tokens = 0
+             output_tokens = 0
+             if hasattr(self.agent, "get_usage"):
+                 usage = self.agent.get_usage()
+                 input_tokens = usage.get("input_tokens", 0)
+                 output_tokens = usage.get("output_tokens", 0)
+
+             latency_ms = int((time.perf_counter() - start_time) * 1000)
 
              return ScenarioResult(
                  scenario_id=self.scenario.id,
@@ -199,7 +233,12 @@ class ScenarioRunner:
                  tool_calls=self.tool_call_log,
                  final_state=self.env_state.copy(),
                  goals_achieved=goals_achieved,
+                 goal_results=goal_results,
                  score=score,
+                 max_score=max_score,
+                 latency_ms=latency_ms,
+                 input_tokens=input_tokens,
+                 output_tokens=output_tokens,
              )
          finally:
              await self._cleanup_mcp()
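
The runner only duck-types the agent for usage reporting: if the agent exposes get_usage(), the returned input_tokens and output_tokens are copied onto the ScenarioResult. A minimal sketch of that contract; the class name and accumulation hook are illustrative, only the get_usage() keys come from the diff.

    class UsageTrackingAgent:
        """Illustrative agent exposing the usage contract the runner probes for."""

        def __init__(self) -> None:
            self._input_tokens = 0
            self._output_tokens = 0

        def record_usage(self, input_tokens: int, output_tokens: int) -> None:
            # Hypothetical hook called after each LLM response.
            self._input_tokens += input_tokens
            self._output_tokens += output_tokens

        def get_usage(self) -> dict[str, int]:
            # Keys must match what ScenarioRunner.run() reads via usage.get(...).
            return {
                "input_tokens": self._input_tokens,
                "output_tokens": self._output_tokens,
            }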
@@ -440,6 +479,22 @@ class ScenarioRunner:
 
          return list(set(achieved)) # Deduplicate
 
+     def _build_goal_results(self, goals_achieved: list[str]) -> list[GoalResult]:
+         """Build detailed goal results for MLflow logging."""
+         results = []
+         for goal in self.scenario.goals:
+             achieved = goal.id in goals_achieved
+             results.append(
+                 GoalResult(
+                     id=goal.id,
+                     name=goal.name or goal.id,
+                     achieved=achieved,
+                     points=goal.points if achieved else 0,
+                     reason="Goal achieved" if achieved else "Goal not achieved",
+                 )
+             )
+         return results
+
      def _compute_score(self, goals_achieved: list[str]) -> float:
          """Compute score based on achieved goals."""
          from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
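
The docstring says the per-goal results exist for MLflow logging, but the logging side is not part of this diff. The consumer below is therefore only an illustration of how goal_results and the new score fields might be pushed to an active MLflow run; the metric names and the import path are assumptions.

    import mlflow

    from sandboxy.runner import ScenarioResult  # assumed path


    def log_scenario_metrics(result: ScenarioResult) -> None:
        # One metric per goal, plus the aggregate fields added in this release.
        for goal in result.goal_results:
            mlflow.log_metric(f"goal.{goal.id}.achieved", 1.0 if goal.achieved else 0.0)
            mlflow.log_metric(f"goal.{goal.id}.points", float(goal.points))
        mlflow.log_metric("score", result.score)
        mlflow.log_metric("max_score", result.max_score)
        mlflow.log_metric("latency_ms", float(result.latency_ms))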
@@ -288,6 +288,17 @@ class YamlMockTool:
          self.config = config.config
          self.spec = spec
          self._call_log: list[dict[str, Any]] = []
+         self._overrides: dict[str, Any] = {}
+
+     def set_overrides(self, overrides: dict[str, Any]) -> None:
+         """Set response overrides for dataset benchmarking.
+
+         Args:
+             overrides: Dict mapping "tool.action" or "tool" to override response data.
+                 When a matching action is called, returns the override data
+                 instead of the normal mock response.
+         """
+         self._overrides = overrides
 
      @property
      def call_log(self) -> list[dict[str, Any]]:
@@ -347,6 +358,13 @@ class YamlMockTool:
          for effect in action_spec.side_effects:
              effect.apply(env_state, validated_args)
 
+         # Check for override (dataset benchmarking)
+         override_key = f"{self.name}.{action}"
+         if override_key in self._overrides:
+             return ToolResult(success=True, data=self._overrides[override_key])
+         if self.name in self._overrides:
+             return ToolResult(success=True, data=self._overrides[self.name])
+
          # Compute return value
          result_value = self._compute_return(action_spec, validated_args, env_state)
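
Finally, a sketch of the override hook for dataset benchmarking. How the mock tool is constructed and invoked is not shown in the diff, so a hypothetical factory stands in; only set_overrides() and the two key forms come from the diff.

    tool = make_yaml_mock_tool("crm")  # hypothetical setup helper, not part of the package

    tool.set_overrides({
        # An exact "tool.action" key overrides just that action...
        "crm.get_account_activity": {"events": [{"type": "refund", "amount": 42.0}]},
        # ...while a bare tool name overrides every action of that tool.
        "crm": {"status": "ok"},
    })

Note that the override check sits after the side-effects loop, so state mutations declared for the mocked action still apply before the overridden data is returned.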