sandboxy 0.0.3__py3-none-any.whl → 0.0.5__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -28,6 +28,25 @@ class StepSpec(BaseModel):
     params: dict[str, Any] = Field(default_factory=dict)
 
 
+class MLflowYamlConfig(BaseModel):
+    """MLflow configuration from scenario YAML.
+
+    Example:
+        mlflow:
+          enabled: true
+          experiment: "agent-evals"
+          tracking_uri: "http://localhost:5000"
+          tags:
+            team: "support-agents"
+            environment: "staging"
+    """
+
+    enabled: bool = False
+    experiment: str | None = None
+    tracking_uri: str | None = None
+    tags: dict[str, str] = Field(default_factory=dict)
+
+
 class McpServerSpec(BaseModel):
     """Specification for an MCP server connection.
 
@@ -74,9 +93,12 @@ class ScenarioSpec(BaseModel):
 
     # Evaluation
     goals: list[GoalSpec] = Field(default_factory=list)
-    evaluation: list[dict[str, Any]] = Field(default_factory=list)
+    evaluation: list[dict[str, Any]] | dict[str, Any] = Field(default_factory=list)
     scoring: dict[str, Any] = Field(default_factory=dict)
 
+    # MLflow integration (optional)
+    mlflow: MLflowYamlConfig | None = None
+
 
 
 def load_scenario(path: Path) -> ScenarioSpec:
     """Load a scenario from a YAML file.
@@ -148,7 +170,14 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
 
     # Parse goals
     goals: list[GoalSpec] = []
-    for g in raw.get("goals", []):
+    goals_raw = raw.get("goals", [])
+
+    # Also check for goals nested inside evaluation dict
+    evaluation_raw = raw.get("evaluation", [])
+    if isinstance(evaluation_raw, dict) and "goals" in evaluation_raw:
+        goals_raw = evaluation_raw.get("goals", [])
+
+    for g in goals_raw:
         goals.append(
             GoalSpec(
                 id=g.get("id", f"goal_{len(goals)}"),
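
A small sketch of the two scenario shapes this change accepts, assuming the remaining fields fall back to their defaults as the raw.get(...) calls suggest (ids and point values are illustrative):

    flat = {"id": "demo", "goals": [{"id": "resolve_ticket", "points": 10}]}
    nested = {"id": "demo", "evaluation": {"goals": [{"id": "resolve_ticket", "points": 10}]}}

    # Both forms now produce the same goal list.
    assert [g.id for g in parse_scenario(flat).goals] == ["resolve_ticket"]
    assert [g.id for g in parse_scenario(nested).goals] == ["resolve_ticket"]
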
@@ -159,6 +188,17 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
             )
         )
 
+    # Parse MLflow config if present
+    mlflow_config: MLflowYamlConfig | None = None
+    if "mlflow" in raw and isinstance(raw["mlflow"], dict):
+        mlflow_raw = raw["mlflow"]
+        mlflow_config = MLflowYamlConfig(
+            enabled=mlflow_raw.get("enabled", False),
+            experiment=mlflow_raw.get("experiment"),
+            tracking_uri=mlflow_raw.get("tracking_uri"),
+            tags=mlflow_raw.get("tags", {}),
+        )
+
     return ScenarioSpec(
         id=raw.get("id", "unnamed"),
         name=raw.get("name", raw.get("id", "Unnamed Scenario")),
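
A minimal sketch of exercising the new config model on its own, assuming MLflowYamlConfig is importable from sandboxy's scenario module (the import path is not shown in this diff); the dict below stands in for what yaml.safe_load would return for the mlflow block in the docstring above, while the loader itself pulls the same fields out explicitly with .get():

    raw = {
        "mlflow": {
            "enabled": True,
            "experiment": "agent-evals",
            "tracking_uri": "http://localhost:5000",
            "tags": {"team": "support-agents"},
        }
    }
    cfg = MLflowYamlConfig(**raw["mlflow"])
    print(cfg.enabled, cfg.experiment, cfg.tags)  # True agent-evals {'team': 'support-agents'}
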
@@ -174,6 +214,7 @@ def parse_scenario(raw: dict[str, Any]) -> ScenarioSpec:
         goals=goals,
         evaluation=raw.get("evaluation", []),
         scoring=raw.get("scoring", {}),
+        mlflow=mlflow_config,
     )
 
 
@@ -259,4 +300,5 @@ def apply_scenario_variables(spec: ScenarioSpec, variables: dict[str, Any]) -> S
         goals=spec.goals,
         evaluation=spec.evaluation,
         scoring=spec.scoring,
+        mlflow=spec.mlflow,  # Preserve MLflow config
     )
@@ -5,6 +5,7 @@ from __future__ import annotations
 import asyncio
 import json
 import logging
+import time
 from pathlib import Path
 from typing import Any
 
@@ -26,6 +27,16 @@ class ScenarioEvent(BaseModel):
     payload: dict[str, Any] = Field(default_factory=dict)
 
 
+class GoalResult(BaseModel):
+    """Result of evaluating a single goal."""
+
+    id: str
+    name: str
+    achieved: bool
+    points: int
+    reason: str = ""
+
+
 class ScenarioResult(BaseModel):
     """Result of running a scenario."""
 
@@ -35,7 +46,12 @@ class ScenarioResult(BaseModel):
     tool_calls: list[dict[str, Any]] = Field(default_factory=list)
     final_state: dict[str, Any] = Field(default_factory=dict)
     goals_achieved: list[str] = Field(default_factory=list)
+    goal_results: list[GoalResult] = Field(default_factory=list)
     score: float = 0.0
+    max_score: float = 0.0
+    latency_ms: int = 0
+    input_tokens: int = 0
+    output_tokens: int = 0
 
     def to_json(self, indent: int | None = None) -> str:
         """Serialize result to JSON string."""
@@ -70,7 +86,11 @@ class ScenarioResult(BaseModel):
         lines.append("")
         lines.append(f"Tool Calls Made: {len(self.tool_calls)}")
         lines.append(f"Goals Achieved: {len(self.goals_achieved)}")
-        lines.append(f"Score: {self.score}")
+        lines.append(f"Score: {self.score}/{self.max_score}")
+        if self.latency_ms:
+            lines.append(f"Latency: {self.latency_ms}ms")
+        if self.input_tokens or self.output_tokens:
+            lines.append(f"Tokens: {self.input_tokens} in / {self.output_tokens} out")
 
         return "\n".join(lines)
 
@@ -176,6 +196,8 @@ class ScenarioRunner:
         Returns:
             ScenarioResult with events and evaluation
         """
+        start_time = time.perf_counter()
+
         try:
             # Load MCP tools if configured
             await self._load_mcp_tools()
@@ -188,9 +210,21 @@ class ScenarioRunner:
             for step in self.scenario.steps:
                 await self._execute_step(step, max_turns)
 
-            # Evaluate goals
+            # Evaluate goals and build detailed results
             goals_achieved = self._evaluate_goals()
+            goal_results = self._build_goal_results(goals_achieved)
             score = self._compute_score(goals_achieved)
+            max_score = sum(g.points for g in self.scenario.goals)
+
+            # Get token usage from agent if available
+            input_tokens = 0
+            output_tokens = 0
+            if hasattr(self.agent, "get_usage"):
+                usage = self.agent.get_usage()
+                input_tokens = usage.get("input_tokens", 0)
+                output_tokens = usage.get("output_tokens", 0)
+
+            latency_ms = int((time.perf_counter() - start_time) * 1000)
 
             return ScenarioResult(
                 scenario_id=self.scenario.id,
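
The token counts come from a duck-typed hook: the runner only probes the agent for a get_usage() method and reads two keys from whatever it returns. A minimal agent satisfying that contract could look like the sketch below (class and attribute names are illustrative):

    class CountingAgent:
        """Toy agent that tracks token usage so the runner above can report it."""

        def __init__(self) -> None:
            self.prompt_tokens = 0
            self.completion_tokens = 0

        def get_usage(self) -> dict[str, int]:
            # Only "input_tokens" and "output_tokens" are read by the runner.
            return {
                "input_tokens": self.prompt_tokens,
                "output_tokens": self.completion_tokens,
            }
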
@@ -199,7 +233,12 @@ class ScenarioRunner:
                 tool_calls=self.tool_call_log,
                 final_state=self.env_state.copy(),
                 goals_achieved=goals_achieved,
+                goal_results=goal_results,
                 score=score,
+                max_score=max_score,
+                latency_ms=latency_ms,
+                input_tokens=input_tokens,
+                output_tokens=output_tokens,
             )
         finally:
             await self._cleanup_mcp()
@@ -440,6 +479,22 @@ class ScenarioRunner:
 
         return list(set(achieved))  # Deduplicate
 
+    def _build_goal_results(self, goals_achieved: list[str]) -> list[GoalResult]:
+        """Build detailed goal results for MLflow logging."""
+        results = []
+        for goal in self.scenario.goals:
+            achieved = goal.id in goals_achieved
+            results.append(
+                GoalResult(
+                    id=goal.id,
+                    name=goal.name or goal.id,
+                    achieved=achieved,
+                    points=goal.points if achieved else 0,
+                    reason="Goal achieved" if achieved else "Goal not achieved",
+                )
+            )
+        return results
+
     def _compute_score(self, goals_achieved: list[str]) -> float:
         """Compute score based on achieved goals."""
         from sandboxy.core.safe_eval import EvaluationError, safe_eval_formula
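
As an illustration, a scenario with two 5-point goals where only the first is achieved would produce a list along these lines (goal ids are invented for the example):

    [
        GoalResult(id="greet_user", name="greet_user", achieved=True, points=5,
                   reason="Goal achieved"),
        GoalResult(id="resolve_issue", name="resolve_issue", achieved=False, points=0,
                   reason="Goal not achieved"),
    ]
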
@@ -292,6 +292,9 @@ class RunResult:
     cost_usd: float | None = None
     error: str | None = None
     created_at: datetime = field(default_factory=datetime.now)
+    # Provider info for distinguishing local vs cloud
+    is_local: bool = False
+    provider_name: str | None = None
 
     def to_dict(self) -> dict[str, Any]:
         """Convert to dictionary."""
@@ -324,6 +327,8 @@ class RunResult:
             "cost_usd": self.cost_usd,
             "error": self.error,
             "created_at": self.created_at.isoformat(),
+            "is_local": self.is_local,
+            "provider_name": self.provider_name,
         }
 
     def to_json(self, indent: int | None = 2) -> str:
@@ -332,9 +337,15 @@ class RunResult:
 
     def pretty(self) -> str:
         """Format for human-readable display."""
+        model_display = self.model
+        if self.is_local:
+            model_display += " (local)"
+        elif self.provider_name:
+            model_display += f" ({self.provider_name})"
+
         lines = [
             f"Scenario: {self.scenario_id}",
-            f"Model: {self.model}",
+            f"Model: {model_display}",
             f"Latency: {self.latency_ms}ms",
         ]
 
@@ -516,6 +527,10 @@ class UnifiedRunner:
             temperature=temperature,
         )
 
+        # Detect if this is a local provider
+        is_local = hasattr(provider, "config")  # LocalProvider has config attribute
+        provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
         return RunResult(
             id="",  # Set by caller
             scenario_id=scenario.id,
@@ -528,7 +543,9 @@
             ],
             input_tokens=response.input_tokens,
             output_tokens=response.output_tokens,
-            cost_usd=response.cost_usd,
+            cost_usd=response.cost_usd if not is_local else 0.0,  # Local models have no cost
+            is_local=is_local,
+            provider_name=provider_name,
         )
 
     async def _run_multi_turn(
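
Downstream code can branch on the new provenance fields; a rough sketch, assuming result is a RunResult produced by either runner path:

    if result.is_local:
        print(f"{result.model}: ran locally, cost reported as 0.0")
    elif result.provider_name:
        print(f"{result.model}: served by {result.provider_name}, cost {result.cost_usd}")
    else:
        print(f"{result.model}: provider unknown")
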
@@ -773,6 +790,11 @@ class UnifiedRunner:
             # Calculate cost from token counts
             cost_usd = self._calculate_cost(model, input_tokens, output_tokens)
 
+            # Detect if this is a local provider
+            provider = self.registry.get_provider_for_model(model)
+            is_local = hasattr(provider, "config")
+            provider_name = provider.provider_name if hasattr(provider, "provider_name") else None
+
             return RunResult(
                 id="",
                 scenario_id=scenario.id,
@@ -783,7 +805,9 @@
                 final_state=env_state,
                 input_tokens=input_tokens,
                 output_tokens=output_tokens,
-                cost_usd=cost_usd,
+                cost_usd=cost_usd if not is_local else 0.0,
+                is_local=is_local,
+                provider_name=provider_name,
             )
 
         finally:
@@ -288,6 +288,17 @@ class YamlMockTool:
         self.config = config.config
         self.spec = spec
         self._call_log: list[dict[str, Any]] = []
+        self._overrides: dict[str, Any] = {}
+
+    def set_overrides(self, overrides: dict[str, Any]) -> None:
+        """Set response overrides for dataset benchmarking.
+
+        Args:
+            overrides: Dict mapping "tool.action" or "tool" to override response data.
+                When a matching action is called, returns the override data
+                instead of the normal mock response.
+        """
+        self._overrides = overrides
 
     @property
     def call_log(self) -> list[dict[str, Any]]:
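
A short usage sketch of the override hook (tool and action names are illustrative): a "tool.action" key overrides one specific action, a bare tool name overrides every action on that tool, and as the execute-path change below shows, the exact key is checked first.

    tool.set_overrides({
        "crm.lookup_customer": {"id": "cust_42", "plan": "pro"},  # one specific action
        "billing": {"status": "ok"},                              # any action on the billing tool
    })
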
@@ -347,6 +358,13 @@ class YamlMockTool:
         for effect in action_spec.side_effects:
             effect.apply(env_state, validated_args)
 
+        # Check for override (dataset benchmarking)
+        override_key = f"{self.name}.{action}"
+        if override_key in self._overrides:
+            return ToolResult(success=True, data=self._overrides[override_key])
+        if self.name in self._overrides:
+            return ToolResult(success=True, data=self._overrides[self.name])
+
         # Compute return value
         result_value = self._compute_return(action_spec, validated_args, env_state)
 