sandboxy 0.0.2.tar.gz → 0.0.4.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (129)
  1. {sandboxy-0.0.2 → sandboxy-0.0.4}/PKG-INFO +37 -1
  2. {sandboxy-0.0.2 → sandboxy-0.0.4}/README.md +34 -0
  3. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioRun.ts +21 -4
  4. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/lib/api.ts +10 -0
  5. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/RunPage.tsx +110 -4
  6. {sandboxy-0.0.2 → sandboxy-0.0.4}/pyproject.toml +6 -1
  7. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/local.py +182 -19
  8. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/main.py +530 -174
  9. sandboxy-0.0.4/sandboxy/mlflow/__init__.py +38 -0
  10. sandboxy-0.0.4/sandboxy/mlflow/artifacts.py +184 -0
  11. sandboxy-0.0.4/sandboxy/mlflow/config.py +90 -0
  12. sandboxy-0.0.4/sandboxy/mlflow/exporter.py +439 -0
  13. sandboxy-0.0.4/sandboxy/mlflow/metrics.py +115 -0
  14. sandboxy-0.0.4/sandboxy/mlflow/tags.py +140 -0
  15. sandboxy-0.0.4/sandboxy/mlflow/tracing.py +126 -0
  16. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/loader.py +44 -2
  17. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/runner.py +57 -2
  18. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/yaml_tools.py +18 -0
  19. sandboxy-0.0.4/sandboxy/ui/dist/assets/index-CU06wBqc.js +362 -0
  20. sandboxy-0.0.4/sandboxy/ui/dist/assets/index-Cgg2wY2m.css +1 -0
  21. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/ui/dist/index.html +2 -2
  22. sandboxy-0.0.4/tests/integration/test_mlflow_integration.py +245 -0
  23. sandboxy-0.0.4/tests/unit/mlflow/__init__.py +1 -0
  24. sandboxy-0.0.4/tests/unit/mlflow/test_artifacts.py +206 -0
  25. sandboxy-0.0.4/tests/unit/mlflow/test_config.py +127 -0
  26. sandboxy-0.0.4/tests/unit/mlflow/test_metrics.py +131 -0
  27. sandboxy-0.0.4/tests/unit/mlflow/test_tags.py +209 -0
  28. sandboxy-0.0.2/sandboxy/ui/dist/assets/index-CgAkYWrJ.css +0 -1
  29. sandboxy-0.0.2/sandboxy/ui/dist/assets/index-D4zoGFcr.js +0 -347
  30. {sandboxy-0.0.2 → sandboxy-0.0.4}/.env.example +0 -0
  31. {sandboxy-0.0.2 → sandboxy-0.0.4}/.github/workflows/ci.yml +0 -0
  32. {sandboxy-0.0.2 → sandboxy-0.0.4}/.github/workflows/publish.yml +0 -0
  33. {sandboxy-0.0.2 → sandboxy-0.0.4}/.gitignore +0 -0
  34. {sandboxy-0.0.2 → sandboxy-0.0.4}/CONTRIBUTING.md +0 -0
  35. {sandboxy-0.0.2 → sandboxy-0.0.4}/LICENSE +0 -0
  36. {sandboxy-0.0.2 → sandboxy-0.0.4}/Makefile +0 -0
  37. {sandboxy-0.0.2 → sandboxy-0.0.4}/docs/yaml-tools.md +0 -0
  38. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/index.html +0 -0
  39. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/package-lock.json +0 -0
  40. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/package.json +0 -0
  41. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/postcss.config.js +0 -0
  42. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/App.tsx +0 -0
  43. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/Layout.tsx +0 -0
  44. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/ModelSelector.tsx +0 -0
  45. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/components/ResultDisplay.tsx +0 -0
  46. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioBuilder.ts +0 -0
  47. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useToolBuilder.ts +0 -0
  48. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/index.css +0 -0
  49. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/main.tsx +0 -0
  50. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/BuilderPage.tsx +0 -0
  51. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/DashboardPage.tsx +0 -0
  52. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/DatasetPage.tsx +0 -0
  53. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/ResultsPage.tsx +0 -0
  54. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/ToolBuilderPage.tsx +0 -0
  55. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tailwind.config.js +0 -0
  56. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tsconfig.json +0 -0
  57. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/tsconfig.node.json +0 -0
  58. {sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/vite.config.ts +0 -0
  59. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/__init__.py +0 -0
  60. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/__init__.py +0 -0
  61. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/base.py +0 -0
  62. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/llm_prompt.py +0 -0
  63. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/agents/loader.py +0 -0
  64. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/__init__.py +0 -0
  65. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/app.py +0 -0
  66. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/__init__.py +0 -0
  67. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/agents.py +0 -0
  68. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/tools.py +0 -0
  69. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/__init__.py +0 -0
  70. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/cli/type_detector.py +0 -0
  71. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/config.py +0 -0
  72. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/__init__.py +0 -0
  73. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/async_runner.py +0 -0
  74. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/mdl_parser.py +0 -0
  75. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/runner.py +0 -0
  76. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/safe_eval.py +0 -0
  77. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/core/state.py +0 -0
  78. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/__init__.py +0 -0
  79. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/loader.py +0 -0
  80. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/datasets/runner.py +0 -0
  81. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/errors.py +0 -0
  82. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/local/context.py +0 -0
  83. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/local/results.py +0 -0
  84. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/logging.py +0 -0
  85. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/__init__.py +0 -0
  86. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/client.py +0 -0
  87. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/mcp/wrapper.py +0 -0
  88. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/__init__.py +0 -0
  89. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/anthropic_provider.py +0 -0
  90. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/base.py +0 -0
  91. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/http_client.py +0 -0
  92. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/openai_provider.py +0 -0
  93. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/openrouter.py +0 -0
  94. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/providers/registry.py +0 -0
  95. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/__init__.py +0 -0
  96. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/comparison.py +0 -0
  97. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/scenarios/unified.py +0 -0
  98. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/session/__init__.py +0 -0
  99. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/session/manager.py +0 -0
  100. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/__init__.py +0 -0
  101. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/base.py +0 -0
  102. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/tools/loader.py +0 -0
  103. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/ui/__init__.py +0 -0
  104. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/utils/__init__.py +0 -0
  105. {sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/utils/time.py +0 -0
  106. {sandboxy-0.0.2 → sandboxy-0.0.4}/scenarios/customer_service.yml +0 -0
  107. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/__init__.py +0 -0
  108. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/conftest.py +0 -0
  109. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/factories.py +0 -0
  110. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/integration/__init__.py +0 -0
  111. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/integration/api/__init__.py +0 -0
  112. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/mocks/__init__.py +0 -0
  113. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/mocks/providers.py +0 -0
  114. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/__init__.py +0 -0
  115. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/__init__.py +0 -0
  116. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_base.py +0 -0
  117. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_llm_prompt.py +0 -0
  118. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/agents/test_loader.py +0 -0
  119. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/__init__.py +0 -0
  120. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_async_runner.py +0 -0
  121. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_mdl_parser.py +0 -0
  122. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_runner.py +0 -0
  123. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_safe_eval.py +0 -0
  124. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/core/test_state.py +0 -0
  125. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/providers/test_openrouter.py +0 -0
  126. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/__init__.py +0 -0
  127. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/test_base.py +0 -0
  128. {sandboxy-0.0.2 → sandboxy-0.0.4}/tests/unit/tools/test_loader.py +0 -0
  129. {sandboxy-0.0.2 → sandboxy-0.0.4}/uv.lock +0 -0

{sandboxy-0.0.2 → sandboxy-0.0.4}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: sandboxy
-Version: 0.0.2
+Version: 0.0.4
 Summary: Open-source agent simulation and benchmarking platform
 Project-URL: Homepage, https://github.com/sandboxy-ai/sandboxy
 Project-URL: Repository, https://github.com/sandboxy-ai/sandboxy
@@ -39,6 +39,8 @@ Requires-Dist: pytest-xdist>=3.5.0; extra == 'dev'
 Requires-Dist: pytest>=8.0; extra == 'dev'
 Requires-Dist: respx>=0.21.0; extra == 'dev'
 Requires-Dist: ruff>=0.1; extra == 'dev'
+Provides-Extra: mlflow
+Requires-Dist: mlflow>=3.0; extra == 'mlflow'
 Description-Content-Type: text/markdown

 # Sandboxy
@@ -204,6 +206,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```

+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration

 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -213,6 +248,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |

 ## Project Structure

{sandboxy-0.0.2 → sandboxy-0.0.4}/README.md

@@ -161,6 +161,39 @@ sandboxy list-models --search claude
 sandboxy list-models --free
 ```

+## MLflow Integration
+
+Export scenario run results to MLflow for experiment tracking and model comparison.
+
+```bash
+# Install with MLflow support
+pip install sandboxy[mlflow]
+
+# Export run to MLflow
+sandboxy scenario scenarios/test.yml -m openai/gpt-4o --mlflow-export
+
+# Custom experiment name
+sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export --mlflow-experiment "my-evals"
+```
+
+Or enable in scenario YAML:
+
+```yaml
+id: my-scenario
+name: "My Test"
+
+mlflow:
+  enabled: true
+  experiment: "agent-evals"
+  tags:
+    team: "support"
+
+system_prompt: |
+  ...
+```
+
+See `MLFLOW_TRACKING_URI` env variable to configure the MLflow server.
+
 ## Configuration

 Environment variables (in `~/.sandboxy/.env` or project `.env`):
@@ -170,6 +203,7 @@ Environment variables (in `~/.sandboxy/.env` or project `.env`):
 | `OPENROUTER_API_KEY` | OpenRouter API key (400+ models) |
 | `OPENAI_API_KEY` | Direct OpenAI access |
 | `ANTHROPIC_API_KEY` | Direct Anthropic access |
+| `MLFLOW_TRACKING_URI` | MLflow tracking server URI |

 ## Project Structure

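The tracking-server resolution implied by the new README section is standard MLflow client behaviour: an explicitly supplied URI wins, otherwise the client falls back to `MLFLOW_TRACKING_URI`. For orientation, a minimal sketch using only the public `mlflow` API; the URI, experiment name, and logged values are placeholders (sandboxy's exporter fills these from the run result):

```python
import os
import mlflow

# An explicit URI takes precedence; otherwise MLflow reads MLFLOW_TRACKING_URI.
mlflow.set_tracking_uri(os.environ.get("MLFLOW_TRACKING_URI", "http://127.0.0.1:5000"))

# sandboxy defaults the experiment to the scenario name when none is given.
mlflow.set_experiment("agent-evals")

with mlflow.start_run(run_name="openai/gpt-4o"):
    mlflow.log_param("scenario_id", "my-scenario")
    mlflow.log_metric("pass_rate", 1.0)
```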
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/hooks/useScenarioRun.ts

@@ -7,13 +7,20 @@ import { api, RunScenarioResponse, CompareModelsResponse } from '../lib/api'

 export type RunState = 'idle' | 'running' | 'completed' | 'error'

+export interface MlflowOptions {
+  enabled: boolean
+  trackingUri?: string
+  experiment?: string
+  tracing?: boolean
+}
+
 export interface UseScenarioRunResult {
   state: RunState
   result: RunScenarioResponse | null
   comparison: CompareModelsResponse | null
   error: string | null
-  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>) => Promise<void>
-  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>) => Promise<void>
+  runScenario: (scenarioId: string, model: string, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
+  compareModels: (scenarioId: string, models: string[], runsPerModel?: number, variables?: Record<string, unknown>, mlflow?: MlflowOptions) => Promise<void>
   reset: () => void
 }

@@ -33,7 +40,8 @@ export function useScenarioRun(): UseScenarioRunResult {
  const runScenario = useCallback(async (
    scenarioId: string,
    model: string,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
  ) => {
    reset()
    setState('running')
@@ -43,6 +51,10 @@ export function useScenarioRun(): UseScenarioRunResult {
      scenario_id: scenarioId,
      model,
      variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
    })

    if (response.error) {
@@ -62,7 +74,8 @@ export function useScenarioRun(): UseScenarioRunResult {
    scenarioId: string,
    models: string[],
    runsPerModel: number = 1,
-    variables?: Record<string, unknown>
+    variables?: Record<string, unknown>,
+    mlflow?: MlflowOptions
  ) => {
    reset()
    setState('running')
@@ -73,6 +86,10 @@ export function useScenarioRun(): UseScenarioRunResult {
      models,
      runs_per_model: runsPerModel,
      variables,
+      mlflow_export: mlflow?.enabled,
+      mlflow_tracking_uri: mlflow?.trackingUri,
+      mlflow_experiment: mlflow?.experiment,
+      mlflow_tracing: mlflow?.tracing,
    })

    setState('completed')
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/lib/api.ts

@@ -53,6 +53,10 @@ export interface RunScenarioRequest {
   max_turns?: number
   max_tokens?: number
   temperature?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }

 export interface HistoryMessage {
@@ -112,6 +116,10 @@ export interface CompareModelsRequest {
   runs_per_model?: number
   variables?: Record<string, unknown>
   max_turns?: number
+  mlflow_export?: boolean
+  mlflow_tracking_uri?: string
+  mlflow_experiment?: string
+  mlflow_tracing?: boolean
 }

 export interface ModelStats {
@@ -205,6 +213,8 @@ export interface RunDatasetRequest {
   max_tokens?: number
   temperature?: number
   parallel?: number
+  mlflow_enabled?: boolean
+  mlflow_experiment?: string
 }

 export interface CaseResultInfo {
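These optional fields map one-to-one onto the FastAPI request models shown further down in `sandboxy/api/routes/local.py`. A sketch of a single-run payload with MLflow export enabled; the field names come from the diff, while the values and the endpoint it would be posted to are illustrative only:

```python
payload = {
    "scenario_id": "my-scenario",
    "model": "openai/gpt-4o",
    "variables": {},
    # New in 0.0.4 — all optional; mirrors MlflowOptions in the local UI:
    "mlflow_export": True,
    "mlflow_tracking_uri": "http://127.0.0.1:5000",
    "mlflow_experiment": "agent-evals",
    "mlflow_tracing": True,
}
```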
{sandboxy-0.0.2 → sandboxy-0.0.4}/local-ui/src/pages/RunPage.tsx

@@ -1,8 +1,8 @@
 import { useState, useEffect } from 'react'
 import { useParams, useSearchParams, Link } from 'react-router-dom'
-import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X } from 'lucide-react'
+import { ArrowLeft, Play, Loader2, XCircle, Edit, Settings, Database, Check, X, ChevronDown, ChevronRight, ExternalLink } from 'lucide-react'
 import { api, ScenarioDetail, ModelInfo, VariableInfo, DatasetInfo, RunDatasetResponse, LocalFileInfo } from '../lib/api'
-import { useScenarioRun } from '../hooks/useScenarioRun'
+import { useScenarioRun, MlflowOptions } from '../hooks/useScenarioRun'
 import { SingleRunResult, ComparisonResult } from '../components/ResultDisplay'
 import { ModelSelector, MultiModelSelector } from '../components/ModelSelector'

@@ -33,6 +33,13 @@ export default function RunPage() {
  const [datasetRunning, setDatasetRunning] = useState(false)
  const [parallel, setParallel] = useState(parallelFromUrl)

+  // MLflow state
+  const [mlflowEnabled, setMlflowEnabled] = useState(false)
+  const [mlflowExpanded, setMlflowExpanded] = useState(false)
+  const [mlflowTrackingUri, setMlflowTrackingUri] = useState('')
+  const [mlflowExperiment, setMlflowExperiment] = useState('')
+  const [mlflowTracing, setMlflowTracing] = useState(true)
+
  const { state, result, comparison, error: runError, runScenario, compareModels } = useScenarioRun()

  useEffect(() => {
@@ -93,6 +100,13 @@ export default function RunPage() {
    const sid = selectedScenarioId || scenarioId
    if (!sid) return

+    const mlflowOptions: MlflowOptions | undefined = mlflowEnabled ? {
+      enabled: true,
+      trackingUri: mlflowTrackingUri || undefined,
+      experiment: mlflowExperiment || undefined,
+      tracing: mlflowTracing,
+    } : undefined
+
    if (runMode === 'dataset') {
      if (!selectedDataset || !selectedModel) return
      setDatasetRunning(true)
@@ -103,6 +117,8 @@ export default function RunPage() {
          dataset_id: selectedDataset,
          model: selectedModel,
          parallel,
+          mlflow_enabled: mlflowEnabled,
+          mlflow_experiment: mlflowExperiment || undefined,
        })
        setDatasetResult(result)
      } catch (err) {
@@ -112,10 +128,10 @@ export default function RunPage() {
      }
    } else if (runMode === 'single') {
      if (!selectedModel) return
-      await runScenario(sid, selectedModel, variables)
+      await runScenario(sid, selectedModel, variables, mlflowOptions)
    } else {
      if (selectedModels.length === 0) return
-      await compareModels(sid, selectedModels, runsPerModel, variables)
+      await compareModels(sid, selectedModels, runsPerModel, variables, mlflowOptions)
    }
  }

@@ -360,6 +376,96 @@ export default function RunPage() {
        </div>
      )}

+      {/* MLflow Section */}
+      <div className="mb-6 p-4 panel-subtle">
+        <button
+          onClick={() => setMlflowExpanded(!mlflowExpanded)}
+          className="flex items-center gap-2 w-full text-left"
+        >
+          {mlflowExpanded ? <ChevronDown size={18} /> : <ChevronRight size={18} />}
+          <span className="font-medium text-slate-100">MLflow Tracking</span>
+          {mlflowEnabled && (
+            <span className="ml-2 px-2 py-0.5 text-xs bg-green-500/20 text-green-400 rounded">
+              Enabled
+            </span>
+          )}
+        </button>
+
+        {mlflowExpanded && (
+          <div className="mt-4 space-y-4">
+            {/* Enable Toggle */}
+            <label className="flex items-center gap-3 cursor-pointer">
+              <input
+                type="checkbox"
+                checked={mlflowEnabled}
+                onChange={(e) => setMlflowEnabled(e.target.checked)}
+                disabled={state === 'running' || datasetRunning}
+                className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
+              />
+              <span className="text-slate-200">Enable MLflow tracking</span>
+            </label>
+
+            {mlflowEnabled && (
+              <>
+                {/* Tracking URI - only for non-dataset runs (dataset uses env var) */}
+                {runMode !== 'dataset' && (
+                  <div>
+                    <label className="block text-sm font-medium text-slate-400 mb-1">
+                      Tracking URI
+                    </label>
+                    <input
+                      type="text"
+                      value={mlflowTrackingUri}
+                      onChange={(e) => setMlflowTrackingUri(e.target.value)}
+                      disabled={state === 'running'}
+                      placeholder="http://127.0.0.1:5000 (uses MLFLOW_TRACKING_URI if empty)"
+                      className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
+                    />
+                  </div>
+                )}
+
+                {/* Experiment Name */}
+                <div>
+                  <label className="block text-sm font-medium text-slate-400 mb-1">
+                    Experiment Name
+                  </label>
+                  <input
+                    type="text"
+                    value={mlflowExperiment}
+                    onChange={(e) => setMlflowExperiment(e.target.value)}
+                    disabled={state === 'running' || datasetRunning}
+                    placeholder={runMode === 'dataset' ? `${scenario?.name || 'scenario'}-dataset` : (scenario?.name || 'Defaults to scenario name')}
+                    className="w-full panel-subtle px-3 py-2 text-slate-100 text-sm focus:outline-none focus:ring-2 focus:ring-orange-400"
+                  />
+                </div>
+
+                {/* Tracing Toggle - only for non-dataset runs */}
+                {runMode !== 'dataset' && (
+                  <label className="flex items-center gap-3 cursor-pointer">
+                    <input
+                      type="checkbox"
+                      checked={mlflowTracing}
+                      onChange={(e) => setMlflowTracing(e.target.checked)}
+                      disabled={state === 'running'}
+                      className="w-4 h-4 rounded border-slate-600 text-orange-400 focus:ring-orange-400"
+                    />
+                    <div>
+                      <span className="text-slate-200">Enable LLM Tracing</span>
+                      <p className="text-xs text-slate-500">Capture detailed traces of each LLM call</p>
+                    </div>
+                  </label>
+                )}
+
+                <p className="text-xs text-slate-500 flex items-center gap-1">
+                  <ExternalLink size={12} />
+                  View results at your MLflow server after the run completes
+                </p>
+              </>
+            )}
+          </div>
+        )}
+      </div>
+
      <button
        onClick={handleRun}
        disabled={
4
4
 
5
5
  [project]
6
6
  name = "sandboxy"
7
- version = "0.0.2"
7
+ version = "0.0.4"
8
8
  description = "Open-source agent simulation and benchmarking platform"
9
9
  readme = "README.md"
10
10
  license = "Apache-2.0"
@@ -41,6 +41,9 @@ dependencies = [
41
41
  ]
42
42
 
43
43
  [project.optional-dependencies]
44
+ mlflow = [
45
+ "mlflow>=3.0",
46
+ ]
44
47
  dev = [
45
48
  "pytest>=8.0",
46
49
  "pytest-cov>=4.0",
@@ -120,6 +123,8 @@ ignore = [
120
123
  # S307: eval usage with safety measures (safe_builtins, simpleeval)
121
124
  "sandboxy/tools/yaml_tools.py" = ["S307"]
122
125
  "sandboxy/core/safe_eval.py" = ["S307"]
126
+ # S603, S607, S110: git subprocess call is safe (hardcoded command, known input)
127
+ "sandboxy/mlflow/tags.py" = ["S603", "S607", "S110"]
123
128
 
124
129
  [tool.ruff.lint.pydocstyle]
125
130
  convention = "google"
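Because `mlflow` ships only as an optional extra, everything downstream has to degrade gracefully when it is missing; the route changes below do this with lazy imports guarded by `ImportError`. A condensed sketch of that pattern, reusing the `MLflowConfig.resolve` call exactly as it appears in the diff (the wrapper name `build_mlflow_config` is illustrative, not part of the package):

```python
def build_mlflow_config(export: bool, experiment: str | None, scenario_name: str):
    """Return an MLflow config when export is requested and mlflow is installed, else None."""
    if not export:
        return None
    try:
        # Lazy import: `pip install sandboxy` without the [mlflow] extra still works.
        from sandboxy.mlflow import MLflowConfig
    except ImportError:
        return None  # Extra not installed; export is silently skipped.
    return MLflowConfig.resolve(
        cli_export=True,
        cli_tracking_uri=None,
        cli_experiment=experiment,
        cli_tracing=True,
        yaml_config=None,
        scenario_name=scenario_name,
    )
```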
{sandboxy-0.0.2 → sandboxy-0.0.4}/sandboxy/api/routes/local.py

@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True


 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None


@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default


 class CompareModelsResponse(BaseModel):
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()

-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )

     # Save result to runs/
     from sandboxy.local.results import save_run_result

     save_run_result(request.scenario_id, result.to_dict())

+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )

@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:

     spec = load_unified_scenario(scenario_path)

+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse:
         max_turns=request.max_turns,
     )

+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result

@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None


 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)

-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
             max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()

     # Save result
     from sandboxy.local.results import save_run_result
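
For dataset runs the handler logs only aggregates — parameters plus pass/score metrics — under one MLflow run rather than tracing each case. A standalone sketch of equivalent logging with the plain `mlflow` API, substituting `mlflow.start_run` for sandboxy's `mlflow_run_context` helper; the `result` object is assumed to expose the attributes the handler reads:

```python
import mlflow

def log_dataset_summary(result, experiment: str) -> None:
    """Mirror the aggregate metrics run_with_dataset() logs to MLflow."""
    mlflow.set_experiment(experiment)
    with mlflow.start_run(run_name=f"{result.model}-{result.dataset_id}"):
        mlflow.log_params({
            "scenario_id": result.scenario_id,
            "dataset_id": result.dataset_id,
            "model": result.model,
            "total_cases": result.total_cases,
        })
        mlflow.log_metrics({
            "passed_cases": result.passed_cases,
            "failed_cases": result.failed_cases,
            "pass_rate": result.pass_rate,
            "avg_score": result.avg_score,
            "total_time_ms": result.total_time_ms,
        })
        # One pass-rate metric per expected outcome, as in the handler.
        for expected, counts in result.by_expected.items():
            total = counts.get("passed", 0) + counts.get("failed", 0)
            if total:
                mlflow.log_metric(f"pass_rate_{expected}", counts.get("passed", 0) / total)
```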