sandboxy 0.0.3-py3-none-any.whl → 0.0.5-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sandboxy/agents/llm_prompt.py +85 -14
- sandboxy/api/app.py +2 -1
- sandboxy/api/routes/local.py +216 -20
- sandboxy/api/routes/providers.py +369 -0
- sandboxy/cli/main.py +663 -31
- sandboxy/mlflow/__init__.py +38 -0
- sandboxy/mlflow/artifacts.py +184 -0
- sandboxy/mlflow/config.py +90 -0
- sandboxy/mlflow/exporter.py +445 -0
- sandboxy/mlflow/metrics.py +115 -0
- sandboxy/mlflow/tags.py +140 -0
- sandboxy/mlflow/tracing.py +126 -0
- sandboxy/providers/__init__.py +37 -3
- sandboxy/providers/config.py +243 -0
- sandboxy/providers/local.py +498 -0
- sandboxy/providers/registry.py +107 -13
- sandboxy/scenarios/loader.py +44 -2
- sandboxy/scenarios/runner.py +57 -2
- sandboxy/scenarios/unified.py +27 -3
- sandboxy/tools/yaml_tools.py +18 -0
- sandboxy/ui/dist/assets/index-CLxxjJuD.js +367 -0
- sandboxy/ui/dist/assets/index-DBB7ehs6.css +1 -0
- sandboxy/ui/dist/index.html +2 -2
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/METADATA +103 -27
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/RECORD +28 -18
- sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
- sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/WHEEL +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/entry_points.txt +0 -0
- {sandboxy-0.0.3.dist-info → sandboxy-0.0.5.dist-info}/licenses/LICENSE +0 -0
sandboxy/agents/llm_prompt.py
CHANGED
```diff
@@ -16,11 +16,37 @@ MAX_RETRIES = 3
 RETRY_DELAY_BASE = 1.0  # seconds
 
 
+def _is_local_provider_model(model_id: str) -> bool:
+    """Check if a model ID refers to a local provider.
+
+    Args:
+        model_id: Model identifier
+
+    Returns:
+        True if the model is from a configured local provider
+    """
+    if "/" not in model_id:
+        return False
+
+    provider_name = model_id.split("/")[0]
+
+    # Check if this provider name matches a configured local provider
+    try:
+        from sandboxy.providers.config import load_providers_config
+
+        config = load_providers_config()
+        return any(p.name == provider_name and p.enabled for p in config.providers)
+    except Exception:
+        return False
+
+
 class LlmPromptAgent(BaseAgent):
     """Agent that uses an LLM via OpenAI-compatible API.
 
-    Supports
-
+    Supports:
+    - Local providers (Ollama, LM Studio, vLLM) when model matches configured provider
+    - OpenRouter (for 400+ cloud models)
+    - Direct OpenAI when model has no prefix
     """
 
     def __init__(self, config: AgentConfig) -> None:
```
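Routing is keyed entirely off the `provider/` prefix of the model id. A minimal sketch of how the check behaves, assuming sandboxy 0.0.5 is installed and a local provider named `ollama` is configured and enabled; the provider names and model ids below are illustrative, not values taken from this diff:

```python
from sandboxy.agents.llm_prompt import _is_local_provider_model

# Assumes a providers config with an enabled entry named "ollama".
print(_is_local_provider_model("ollama/llama3"))   # True  -> routed to the local provider
print(_is_local_provider_model("openai/gpt-4o"))   # False -> treated as an OpenRouter model id
print(_is_local_provider_model("gpt-4o-mini"))     # False -> no prefix, direct OpenAI
```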
```diff
@@ -31,7 +57,12 @@ class LlmPromptAgent(BaseAgent):
         """
         super().__init__(config)
         self._client: Any = None
-        self.
+        self._local_provider: Any = None
+
+        # Check for local provider first
+        self._is_local = _is_local_provider_model(config.model or "")
+        self._is_openrouter = not self._is_local and "/" in (config.model or "")
+
         # Token usage tracking
         self._total_input_tokens = 0
         self._total_output_tokens = 0
@@ -39,6 +70,9 @@ class LlmPromptAgent(BaseAgent):
     @property
     def api_key(self) -> str:
         """Get the appropriate API key based on model type."""
+        if self._is_local:
+            # Local providers may not need an API key, or it's in the provider config
+            return ""
         if self._is_openrouter:
             return os.getenv("OPENROUTER_API_KEY", "")
         return os.getenv("OPENAI_API_KEY", "")
@@ -49,15 +83,46 @@
         if self._client is None:
             from openai import OpenAI
 
-            if self.
-
-            self.
-
-
-            )
-
-
-
+            if self._is_local:
+                # Get local provider and create client pointing to it
+                provider_name = (self.config.model or "").split("/")[0]
+                from sandboxy.providers.config import load_providers_config
+
+                config = load_providers_config()
+                provider_config = config.get_provider(provider_name)
+
+                if provider_config:
+                    logger.debug(
+                        "Initializing local client for %s at %s",
+                        provider_name,
+                        provider_config.base_url,
+                    )
+                    headers = {}
+                    if provider_config.api_key:
+                        headers["Authorization"] = f"Bearer {provider_config.api_key}"
+
+                    self._client = OpenAI(
+                        api_key=provider_config.api_key or "not-needed",
+                        base_url=provider_config.base_url,
+                        default_headers=headers if headers else None,
+                    )
+                else:
+                    logger.warning(
+                        "Local provider %s not found, falling back to OpenRouter", provider_name
+                    )
+                    self._is_local = False
+                    self._is_openrouter = True
+
+            if self._client is None:  # Not set by local provider path
+                if self._is_openrouter:
+                    logger.debug("Initializing OpenRouter client for model: %s", self.config.model)
+                    self._client = OpenAI(
+                        api_key=self.api_key,
+                        base_url="https://openrouter.ai/api/v1",
+                    )
+                else:
+                    logger.debug("Initializing OpenAI client for model: %s", self.config.model)
+                    self._client = OpenAI(api_key=self.api_key)
         return self._client
 
     def step(
```
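The local branch reuses the stock `openai` client and only swaps the base URL, which is the standard way to talk to any OpenAI-compatible server. A minimal standalone sketch of that pattern, assuming an Ollama server on its default port; the URL and model name are assumptions, not values read from sandboxy's provider config:

```python
from openai import OpenAI

# Ollama, LM Studio and vLLM all expose an OpenAI-compatible /v1 API, so the
# regular client works once base_url is overridden. "not-needed" mirrors the
# placeholder key the diff uses when a provider has no api_key configured.
client = OpenAI(api_key="not-needed", base_url="http://localhost:11434/v1")

resp = client.chat.completions.create(
    model="llama3",  # provider prefix already stripped, as the agent now does before calling the API
    messages=[{"role": "user", "content": "Say hello in one word."}],
)
print(resp.choices[0].message.content)
```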
```diff
@@ -66,7 +131,8 @@ class LlmPromptAgent(BaseAgent):
         available_tools: list[dict[str, Any]] | None = None,
     ) -> AgentAction:
         """Process conversation and return next action using LLM."""
-
+        # Local providers don't require an API key
+        if not self._is_local and not self.api_key:
             return self._stub_response(history)
 
         messages = self._build_messages(history)
@@ -188,8 +254,13 @@
         messages: list[dict[str, Any]],
         tools: list[dict[str, Any]] | None,
     ) -> Any:
-        """Make API call to OpenAI/OpenRouter."""
+        """Make API call to OpenAI/OpenRouter/Local provider."""
         model = self.config.model or "gpt-4o-mini"
+
+        # For local providers, strip the provider prefix (e.g., "ollama/llama3" -> "llama3")
+        if self._is_local and "/" in model:
+            model = model.split("/", 1)[1]
+
         kwargs: dict[str, Any] = {
             "model": model,
             "messages": messages,
```
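The prefix strip uses `split("/", 1)` rather than `split("/")`, so only the leading provider segment is removed and model ids that themselves contain slashes stay intact. Illustrative ids only (both hypothetical):

```python
print("ollama/llama3".split("/", 1)[1])                 # llama3
print("lmstudio/TheBloke/Mistral-7B".split("/", 1)[1])  # TheBloke/Mistral-7B
```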
sandboxy/api/app.py
CHANGED
```diff
@@ -58,12 +58,13 @@ def create_local_app(
     )
 
     # Local routes only
-    from sandboxy.api.routes import agents, tools
+    from sandboxy.api.routes import agents, providers, tools
     from sandboxy.api.routes import local as local_routes
 
     app.include_router(local_routes.router, prefix="/api/v1", tags=["local"])
     app.include_router(agents.router, prefix="/api/v1", tags=["agents"])
     app.include_router(tools.router, prefix="/api/v1", tags=["tools"])
+    app.include_router(providers.router, prefix="/api/v1", tags=["providers"])
 
     @app.get("/health")
     async def health_check():
```
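The new `sandboxy.api.routes.providers` module itself is not shown in this diff (the file list records it as adding 369 lines). For orientation only, the sketch below shows the shape of a module that `include_router` can mount this way; the route path and handler are hypothetical, not the package's actual endpoints:

```python
from fastapi import APIRouter

router = APIRouter()

@router.get("/providers")  # would be served at /api/v1/providers via include_router(..., prefix="/api/v1")
async def list_providers() -> list[dict]:
    # Placeholder body; the real module presumably reads the local providers config.
    return []
```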
sandboxy/api/routes/local.py
CHANGED
```diff
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
```
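Together these fields let a caller opt into MLflow export per request. A sketch of a `RunScenarioRequest` body using the new fields; the scenario id, model, tracking URI, and experiment name are placeholders, and the endpoint path that accepts this body is defined elsewhere in local.py and is not visible in this diff:

```python
payload = {
    "scenario_id": "my-scenario",                     # hypothetical
    "model": "ollama/llama3",                         # hypothetical; "provider/" prefix triggers local routing
    "variables": {},
    "max_turns": 20,
    "max_tokens": 1024,
    "temperature": 0.7,
    "mlflow_export": True,
    "mlflow_tracking_uri": "http://localhost:5000",   # assumption: a local MLflow server
    "mlflow_experiment": "sandboxy-runs",             # hypothetical experiment name
    "mlflow_tracing": True,
}
```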
```diff
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-
-
-
-
-
-
-
-
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
```
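In plain MLflow terms, the `mlflow_run_context` wrapper boils down to picking a tracking server and an experiment and opening a run that later logging calls attach to. A rough standalone equivalent with placeholder values; sandboxy's own helper may do more, for example tagging and artifact handling:

```python
import mlflow

mlflow.set_tracking_uri("http://localhost:5000")  # assumption: local MLflow server
mlflow.set_experiment("my-scenario")              # hypothetical experiment name

with mlflow.start_run(run_name="ollama/llama3") as run:
    run_id = run.info.run_id
    # ... run the scenario here, then log what the exporter would log ...
    mlflow.log_metric("latency_ms", 1234)         # placeholder value
```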
```diff
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
```
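The body of `calculate_cost` is not part of this diff, but given the per-million-token pricing fields carried on `OPENROUTER_MODELS` entries (seen further down), the arithmetic is presumably of the shape below; the prices and token counts are made up:

```python
input_cost_per_million = 0.15    # USD per 1M input tokens (illustrative)
output_cost_per_million = 0.60   # USD per 1M output tokens (illustrative)
input_tokens, output_tokens = 12_000, 3_500

cost_usd = (input_tokens / 1_000_000) * input_cost_per_million \
    + (output_tokens / 1_000_000) * output_cost_per_million
print(round(cost_usd, 6))  # 0.0039
```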
```diff
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
```
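sandboxy's `enable_tracing` helper lives in `sandboxy.mlflow.tracing` and is not shown here. Its MLflow-side effect is presumably along the lines of the sketch below, which uses MLflow's public tracing autolog for the OpenAI client; this is a guess at the mechanism, and the URI and experiment values are placeholders:

```python
import mlflow
import mlflow.openai

mlflow.set_tracking_uri("http://localhost:5000")  # assumption: local MLflow server
mlflow.set_experiment("compare-models")           # hypothetical experiment name
mlflow.openai.autolog()                           # requires a recent MLflow with tracing support
```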
```diff
@@ -538,6 +618,33 @@
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
@@ -587,10 +694,40 @@ def calculate_cost(model_id: str, input_tokens: int, output_tokens: int) -> float:
 
 @router.get("/local/models")
 async def list_available_models() -> list[dict[str, Any]]:
-    """List available models from OpenRouter."""
+    """List available models from OpenRouter and local providers."""
+    from sandboxy.providers.config import get_enabled_providers
+    from sandboxy.providers.local import LocalProvider
     from sandboxy.providers.openrouter import OPENROUTER_MODELS
 
     models = []
+
+    # Add models from local providers first
+    for provider_config in get_enabled_providers():
+        try:
+            provider = LocalProvider(provider_config)
+            local_models = await provider.refresh_models()
+            await provider.close()
+
+            for model in local_models:
+                # Model ID includes provider prefix for routing
+                full_model_id = f"{provider_config.name}/{model.id}"
+                models.append(
+                    {
+                        "id": full_model_id,
+                        "name": model.name,
+                        "price": "Local",
+                        "pricing": {"input": 0, "output": 0},
+                        "provider": provider_config.name,
+                        "context_length": model.context_length,
+                        "supports_vision": model.supports_vision,
+                        "is_local": True,
+                        "provider_name": provider_config.name,
+                    }
+                )
+        except Exception as e:
+            logger.warning(f"Failed to fetch models from {provider_config.name}: {e}")
+
+    # Add OpenRouter models
     for model_id, info in OPENROUTER_MODELS.items():
         # Format price string
         if info.input_cost_per_million == 0 and info.output_cost_per_million == 0:
```
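With the router mounted under `/api/v1` (see app.py above), the merged list is served at `GET /api/v1/local/models`, with local models flagged by the new `is_local` field. A quick client-side check, assuming the local sandboxy API is running on `localhost:8000` (host and port are assumptions):

```python
import httpx

models = httpx.get("http://localhost:8000/api/v1/local/models").json()
local = [m["id"] for m in models if m.get("is_local")]
cloud = [m["id"] for m in models if not m.get("is_local")]
print(f"{len(local)} local models, {len(cloud)} OpenRouter models")
```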
```diff
@@ -610,6 +747,7 @@ async def list_available_models() -> list[dict[str, Any]]:
                 "provider": info.provider,
                 "context_length": info.context_length,
                 "supports_vision": info.supports_vision,
+                "is_local": False,
             }
         )
 
@@ -905,6 +1043,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1475,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-
-
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result
```