sandboxy 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,25 @@ class StepSpec(BaseModel):
     params: dict[str, Any] = Field(default_factory=dict)
 
 
+class MLflowYamlConfig(BaseModel):
+    """MLflow configuration from scenario YAML.
+
+    Example:
+        mlflow:
+          enabled: true
+          experiment: "agent-evals"
+          tracking_uri: "http://localhost:5000"
+          tags:
+            team: "support-agents"
+            environment: "staging"
+    """
+
+    enabled: bool = False
+    experiment: str | None = None
+    tracking_uri: str | None = None
+    tags: dict[str, str] = Field(default_factory=dict)
+
+
 class McpServerSpec(BaseModel):
     """Specification for an MCP server connection.
 
@@ -74,9 +93,12 @@ class ScenarioSpec(BaseModel):
 
     # Evaluation
     goals: list[GoalSpec] = Field(default_factory=list)
-    evaluation: list[dict[str, Any]] = Field(default_factory=list)
+    evaluation: list[dict[str, Any]] | dict[str, Any] = Field(default_factory=list)
     scoring: dict[str, Any] = Field(default_factory=dict)
 
+    # MLflow integration (optional)
+    mlflow: MLflowYamlConfig | None = None
+
 
 
 def load_scenario(path: Path) -> ScenarioSpec:
     """Load a scenario from a YAML file.
@@ -148,7 +170,14 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
 
     # Parse goals
     goals: list[GoalSpec] = []
-    for g in raw.get("goals", []):
+    goals_raw = raw.get("goals", [])
+
+    # Also check for goals nested inside evaluation dict
+    evaluation_raw = raw.get("evaluation", [])
+    if isinstance(evaluation_raw, dict) and "goals" in evaluation_raw:
+        goals_raw = evaluation_raw.get("goals", [])
+
+    for g in goals_raw:
         goals.append(
             GoalSpec(
                 id=g.get("id", f"goal_{len(goals)}"),
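
A small sketch of the two scenario shapes this change accepts, assuming the remaining fields fall back to their defaults as the raw.get(...) calls suggest (ids and point values are illustrative):

    flat = {"id": "demo", "goals": [{"id": "resolve_ticket", "points": 10}]}
    nested = {"id": "demo", "evaluation": {"goals": [{"id": "resolve_ticket", "points": 10}]}}

    # Both forms now produce the same goal list.
    assert [g.id for g in parse_scenario(flat).goals] == ["resolve_ticket"]
    assert [g.id for g in parse_scenario(nested).goals] == ["resolve_ticket"]
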
@@ -159,6 +188,17 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
             )
         )
 
+    # Parse MLflow config if present
+    mlflow_config: MLflowYamlConfig | None = None
+    if "mlflow" in raw and isinstance(raw["mlflow"], dict):
+        mlflow_raw = raw["mlflow"]
+        mlflow_config = MLflowYamlConfig(
+            enabled=mlflow_raw.get("enabled", False),
+            experiment=mlflow_raw.get("experiment"),
+            tracking_uri=mlflow_raw.get("tracking_uri"),
+            tags=mlflow_raw.get("tags", {}),
+        )
+
     return ScenarioSpec(
         id=raw.get("id", "unnamed"),
         name=raw.get("name", raw.get("id", "Unnamed Scenario")),
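
A minimal sketch of exercising the new config model on its own, assuming MLflowYamlConfig is importable from sandboxy's scenario module (the import path is not shown in this diff); the dict below stands in for what yaml.safe_load would return for the mlflow block in the docstring above, while the loader itself pulls the same fields out explicitly with .get():

    raw = {
        "mlflow": {
            "enabled": True,
            "experiment": "agent-evals",
            "tracking_uri": "http://localhost:5000",
            "tags": {"team": "support-agents"},
        }
    }
    cfg = MLflowYamlConfig(**raw["mlflow"])
    print(cfg.enabled, cfg.experiment, cfg.tags)  # True agent-evals {'team': 'support-agents'}
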
@@ -174,6 +214,7 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
         goals=goals,
         evaluation=raw.get("evaluation", []),
         scoring=raw.get("scoring", {}),
+        mlflow=mlflow_config,
     )
 
 
@@ -259,4 +300,5 @@ def apply_scenario_variables(spec: ScenarioSpec, variables: dict[str, Any]) -> S
         goals=spec.goals,
         evaluation=spec.evaluation,
         scoring=spec.scoring,
+        mlflow=spec.mlflow,  # Preserve MLflow config
     )
@@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
+import time
 from pathlib import Path
 from typing import Any
 
@@ -26,6 +27,16 @@ class ScenarioEvent(BaseModel):
     payload: dict[str, Any] = Field(default_factory=dict)
 
 
+class GoalResult(BaseModel):
+    """Result of evaluating a single goal."""
+
+    id: str
+    name: str
+    achieved: bool
+    points: int
+    reason: str = ""
+
+
 class ScenarioResult(BaseModel):
     """Result of running a scenario."""
 
@@ -35,7 +46,12 @@ class ScenarioResult(BaseModel):
     tool_calls: list[dict[str, Any]] = Field(default_factory=list)
     final_state: dict[str, Any] = Field(default_factory=dict)
     goals_achieved: list[str] = Field(default_factory=list)
+    goal_results: list[GoalResult] = Field(default_factory=list)
     score: float = 0.0
+    max_score: float = 0.0
+    latency_ms: int = 0
+    input_tokens: int = 0
+    output_tokens: int = 0
 
     def to_json(self, indent: int | None = None) -> str:
         """Serialize result to JSON string."""
@@ -70,7 +86,11 @@ class ScenarioResult(BaseModel):
         lines.append("")
         lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
         lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
-        lines.append(f"Score: {self.score}")
+        lines.append(f"Score: {self.score}/{self.max_score}")
+        if self.latency_ms:
+            lines.append(f"Latency: {self.latency_ms}ms")
+        if self.input_tokens or self.output_tokens:
+            lines.append(f"Tokens: {self.input_tokens} in / {self.output_tokens} out")
 
         return "\n".join(lines)
 
@@ -176,6 +196,8 @@ class ScenarioRunner:
         Returns:
             ScenarioResult with events and evaluation
         """
+        start_time = time.perf_counter()
+
         try:
             # Load MCP tools if configured
             await self._load_mcp_tools()
@@ -188,9 +210,21 @@ class ScenarioRunner:
             for step in self.scenario.steps:
                 await self._execute_step(step, max_turns)
 
-            # Evaluate goals
+            # Evaluate goals and build detailed results
             goals_achieved = self._evaluate_goals()
+            goal_results = self._build_goal_results(goals_achieved)
             score = self._compute_score(goals_achieved)
+            max_score = sum(g.points for g in self.scenario.goals)
+
+            # Get token usage from agent if available
+            input_tokens = 0
+            output_tokens = 0
+            if hasattr(self.agent, "get_usage"):
+                usage = self.agent.get_usage()
+                input_tokens = usage.get("input_tokens", 0)
+                output_tokens = usage.get("output_tokens", 0)
+
+            latency_ms = int((time.perf_counter() - start_time) * 1000)
 
             return ScenarioResult(
                 scenario_id=self.scenario.id,
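
The token counts come from a duck-typed hook: the runner only probes the agent for a get_usage() method and reads two keys from whatever it returns. A minimal agent satisfying that contract could look like the sketch below (class and attribute names are illustrative):

    class CountingAgent:
        """Toy agent that tracks token usage so the runner above can report it."""

        def __init__(self) -> None:
            self.prompt_tokens = 0
            self.completion_tokens = 0

        def get_usage(self) -> dict[str, int]:
            # Only "input_tokens" and "output_tokens" are read by the runner.
            return {
                "input_tokens": self.prompt_tokens,
                "output_tokens": self.completion_tokens,
            }
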
@@ -199,7 +233,12 @@ class ScenarioRunner:
                 tool_calls=self.tool_call_log,
                 final_state=self.env_state.copy(),
                 goals_achieved=goals_achieved,
+                goal_results=goal_results,
                 score=score,
+                max_score=max_score,
+                latency_ms=latency_ms,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
             )
         finally:
             await self._cleanup_mcp()
@@ -440,6 +479,22 @@ class ScenarioRunner:
 
         return list(set(achieved))  # Deduplicate
 
+    def _build_goal_results(self, goals_achieved: list[str]) -> list[GoalResult]:
+        """Build detailed goal results for MLflow logging."""
+        results = []
+        for goal in self.scenario.goals:
+            achieved = goal.id in goals_achieved
+            results.append(
+                GoalResult(
+                    id=goal.id,
+                    name=goal.name or goal.id,
+                    achieved=achieved,
+                    points=goal.points if achieved else 0,
+                    reason="Goal achieved" if achieved else "Goal not achieved",
+                )
+            )
+        return results
+
     def _compute_score(self, goals_achieved: list[str]) -> float:
         """Compute score based on achieved goals."""
         from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
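
As an illustration, a scenario with two 5-point goals where only the first is achieved would produce a list along these lines (goal ids are invented for the example):

    [
        GoalResult(id="greet_user", name="greet_user", achieved=True, points=5,
                   reason="Goal achieved"),
        GoalResult(id="resolve_issue", name="resolve_issue", achieved=False, points=0,
                   reason="Goal not achieved"),
    ]
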
@@ -292,6 +292,9 @@ class RunResult:
     cost_usd: float | None = None
     error: str | None = None
     created_at: datetime = field(default_factory=datetime.now)
+    # Provider info for distinguishing local vs cloud
+    is_local: bool = False
+    provider_name: str | None = None
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary."""
@@ -324,6 +327,8 @@ class RunResult:
             "cost_usd": self.cost_usd,
             "error": self.error,
             "created_at": self.created_at.isoformat(),
+            "is_local": self.is_local,
+            "provider_name": self.provider_name,
         }
 
     def to_json(self, indent: int | None = 2) -> str:
@@ -332,9 +337,15 @@ class RunResult:
 
     def pretty(self) -> str:
         """Format for human-readable display."""
+        model_display = self.model
+        if self.is_local:
+            model_display += " (local)"
+        elif self.provider_name:
+            model_display += f" ({self.provider_name})"
+
         lines = [
             f"Scenario: {self.scenario_id}",
-            f"Model: {self.model}",
+            f"Model: {model_display}",
             f"Latency: {self.latency_ms}ms",
         ]
 
@@ -516,6 +527,10 @@ class UnifiedRunner:
             temperature=temperature,
         )
 
+        # Detect if this is a local provider
+        is_local = hasattr(provider, "config")  # LocalProvider has config attribute
+        provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
         return RunResult(
             id="",  # Set by caller
             scenario_id=scenario.id,
@@ -528,7 +543,9 @@
             ],
             input_tokens=response.input_tokens,
             output_tokens=response.output_tokens,
-            cost_usd=response.cost_usd,
+            cost_usd=response.cost_usd if not is_local else 0.0,  # Local models have no cost
+            is_local=is_local,
+            provider_name=provider_name,
         )
 
     async def _run_multi_turn(
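
Downstream code can branch on the new provenance fields; a rough sketch, assuming result is a RunResult produced by either runner path:

    if result.is_local:
        print(f"{result.model}: ran locally, cost reported as 0.0")
    elif result.provider_name:
        print(f"{result.model}: served by {result.provider_name}, cost {result.cost_usd}")
    else:
        print(f"{result.model}: provider unknown")
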
@@ -773,6 +790,11 @@ class UnifiedRunner:
             # Calculate cost from token counts
             cost_usd = self._calculate_cost(model, input_tokens, output_tokens)
 
+            # Detect if this is a local provider
+            provider = self.registry.get_provider_for_model(model)
+            is_local = hasattr(provider, "config")
+            provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
             return RunResult(
                 id="",
                 scenario_id=scenario.id,
@@ -783,7 +805,9 @@
                 final_state=env_state,
                 input_tokens=input_tokens,
                 output_tokens=output_tokens,
-                cost_usd=cost_usd,
+                cost_usd=cost_usd if not is_local else 0.0,
+                is_local=is_local,
+                provider_name=provider_name,
             )
 
         finally:
@@ -288,6 +288,17 @@ class YamlMockTool:
         self.config = config.config
         self.spec = spec
         self._call_log: list[dict[str, Any]] = []
+        self._overrides: dict[str, Any] = {}
+
+    def set_overrides(self, overrides: dict[str, Any]) -> None:
+        """Set response overrides for dataset benchmarking.
+
+        Args:
+            overrides: Dict mapping "tool.action" or "tool" to override response data.
+                When a matching action is called, returns the override data
+                instead of the normal mock response.
+        """
+        self._overrides = overrides
 
     @property
     def call_log(self) -> list[dict[str, Any]]:
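
A short usage sketch of the override hook (tool and action names are illustrative): a "tool.action" key overrides one specific action, a bare tool name overrides every action on that tool, and as the execute-path change below shows, the exact key is checked first.

    tool.set_overrides({
        "crm.lookup_customer": {"id": "cust_42", "plan": "pro"},  # one specific action
        "billing": {"status": "ok"},                              # any action on the billing tool
    })
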
@@ -347,6 +358,13 @@ class YamlMockTool:
         for effect in action_spec.side_effects:
             effect.apply(env_state, validated_args)
 
+        # Check for override (dataset benchmarking)
+        override_key = f"{self.name}.{action}"
+        if override_key in self._overrides:
+            return ToolResult(success=True, data=self._overrides[override_key])
+        if self.name in self._overrides:
+            return ToolResult(success=True, data=self._overrides[self.name])
+
         # Compute return value
         result_value = self._compute_return(action_spec, validated_args, env_state)
 