sandboxy 0.0.3-py3-none-any.whl → 0.0.4-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -379,6 +379,10 @@ class RunScenarioRequest(BaseModel):
     max_turns: int = 20
     max_tokens: int = 1024
     temperature: float = 0.7
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True
 
 
 class RunScenarioResponse(BaseModel):
@@ -393,6 +397,9 @@ class RunScenarioResponse(BaseModel):
     final_state: dict[str, Any]
     evaluation: dict[str, Any] | None
     latency_ms: int
+    input_tokens: int = 0
+    output_tokens: int = 0
+    cost_usd: float | None = None
     error: str | None
 
 
@@ -404,6 +411,10 @@ class CompareModelsRequest(BaseModel):
     runs_per_model: int = 1
     variables: dict[str, Any] = Field(default_factory=dict)
     max_turns: int = 20
+    mlflow_export: bool = False
+    mlflow_tracking_uri: str | None = None
+    mlflow_experiment: str | None = None
+    mlflow_tracing: bool = True  # Enable LLM call tracing by default
 
 
 class CompareModelsResponse(BaseModel):
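
For reference, a minimal sketch of a request that exercises the new MLflow fields on RunScenarioRequest. Only the field names come from the hunks above; the values, the assumption that `scenario_id` and `model` are the remaining required fields, and the import location (the defining module's filename is not shown in this diff) are illustrative.

```python
# Sketch only: field names taken from the diff above, values made up.
# RunScenarioRequest is assumed to be importable from the API module this hunk belongs to.
req = RunScenarioRequest(
    scenario_id="trolley",
    model="openai/gpt-4o",
    max_turns=20,
    mlflow_export=True,
    mlflow_tracking_uri="http://localhost:5000",
    mlflow_experiment="trolley-evals",
    mlflow_tracing=True,
)
```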
@@ -454,20 +465,73 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
     spec = load_unified_scenario(scenario_path)
     runner = UnifiedRunner()
 
-    result = await runner.run(
-        scenario=spec,
-        model=request.model,
-        variables=request.variables,
-        max_turns=request.max_turns,
-        max_tokens=request.max_tokens,
-        temperature=request.temperature,
-    )
+    # Setup MLflow if requested
+    mlflow_config = None
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=request.mlflow_tracking_uri,
+                cli_experiment=request.mlflow_experiment,
+                cli_tracing=request.mlflow_tracing,
+                yaml_config=None,
+                scenario_name=spec.name,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    # Run with MLflow context if enabled (connects traces to run)
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+        from sandboxy.mlflow.tracing import enable_tracing
+
+        if mlflow_config.tracing:
+            enable_tracing(
+                tracking_uri=mlflow_config.tracking_uri,
+                experiment_name=mlflow_config.experiment,
+            )
+
+        with mlflow_run_context(mlflow_config, run_name=request.model) as run_id:
+            result = await runner.run(
+                scenario=spec,
+                model=request.model,
+                variables=request.variables,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+            )
+
+            if run_id:
+                exporter = MLflowExporter(mlflow_config)
+                exporter.log_to_active_run(
+                    result=result,
+                    scenario_path=scenario_path,
+                    scenario_name=spec.name,
+                    scenario_id=spec.id,
+                    agent_name=request.model,
+                )
+    else:
+        result = await runner.run(
+            scenario=spec,
+            model=request.model,
+            variables=request.variables,
+            max_turns=request.max_turns,
+            max_tokens=request.max_tokens,
+            temperature=request.temperature,
+        )
 
     # Save result to runs/
     from sandboxy.local.results import save_run_result
 
     save_run_result(request.scenario_id, result.to_dict())
 
+    # Calculate cost
+    input_tokens = result.input_tokens or 0
+    output_tokens = result.output_tokens or 0
+    cost_usd = calculate_cost(result.model, input_tokens, output_tokens)
+
     return RunScenarioResponse(
         id=result.id,
         scenario_id=result.scenario_id,
@@ -481,6 +545,9 @@ async def run_scenario(request: RunScenarioRequest) -> RunScenarioResponse:
         final_state=result.final_state,
         evaluation=result.evaluation.to_dict() if result.evaluation else None,
         latency_ms=result.latency_ms,
+        input_tokens=input_tokens,
+        output_tokens=output_tokens,
+        cost_usd=cost_usd,
         error=result.error,
     )
 
@@ -530,6 +597,19 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
 
     spec = load_unified_scenario(scenario_path)
 
+    # Enable MLflow tracing if requested (must be done BEFORE any LLM calls)
+    if request.mlflow_export and request.mlflow_tracing:
+        try:
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            experiment = request.mlflow_experiment or spec.name
+            enable_tracing(
+                tracking_uri=request.mlflow_tracking_uri,
+                experiment_name=experiment,
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
     comparison = await run_comparison(
         scenario=spec,
         models=request.models,
@@ -538,6 +618,31 @@ async def compare_models(request: CompareModelsRequest) -> CompareModelsResponse
         max_turns=request.max_turns,
     )
 
+    # MLflow export (if enabled)
+    if request.mlflow_export:
+        try:
+            from sandboxy.mlflow import MLflowConfig, MLflowExporter
+
+            for result in comparison.results:
+                config = MLflowConfig.resolve(
+                    cli_export=True,
+                    cli_tracking_uri=request.mlflow_tracking_uri,
+                    cli_experiment=request.mlflow_experiment,
+                    cli_tracing=request.mlflow_tracing,
+                    yaml_config=None,
+                    scenario_name=spec.name,
+                )
+                exporter = MLflowExporter(config)
+                exporter.export(
+                    result=result.to_dict(),
+                    scenario_path=scenario_path,
+                    agent_name=result.model,
+                )
+        except ImportError:
+            logger.warning("MLflow not installed, skipping export")
+        except Exception as e:
+            logger.warning(f"Failed to export to MLflow: {e}")
+
     # Save comparison result
     from sandboxy.local.results import save_run_result
 
@@ -905,6 +1010,8 @@ class RunDatasetRequest(BaseModel):
     max_tokens: int = 1024
     temperature: float = 0.7
     parallel: int = 1
+    mlflow_enabled: bool = False
+    mlflow_experiment: str | None = None
 
 
 class RunDatasetResponse(BaseModel):
@@ -1335,25 +1442,81 @@ async def run_with_dataset(request: RunDatasetRequest) -> RunDatasetResponse:
     spec = load_unified_scenario(scenario_path)
     dataset = load_dataset(dataset_path)
 
-    if request.parallel > 1:
-        result = await run_dataset_parallel(
+    # Setup MLflow if enabled
+    mlflow_config = None
+    if request.mlflow_enabled:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig(
+                enabled=True,
+                experiment=request.mlflow_experiment or f"{spec.name}-dataset",
+                tracing=False,  # Tracing not needed for dataset aggregates
+            )
+        except ImportError:
+            pass  # MLflow not installed
+
+    async def run_dataset_benchmark():
+        if request.parallel > 1:
+            return await run_dataset_parallel(
+                scenario=spec,
+                model=request.model,
+                dataset=dataset,
+                max_turns=request.max_turns,
+                max_tokens=request.max_tokens,
+                temperature=request.temperature,
+                max_concurrent=request.parallel,
+            )
+        return await run_dataset(
             scenario=spec,
             model=request.model,
             dataset=dataset,
             max_turns=request.max_turns,
            max_tokens=request.max_tokens,
             temperature=request.temperature,
-            max_concurrent=request.parallel,
         )
+
+    # Run with MLflow context if enabled
+    if mlflow_config and mlflow_config.enabled:
+        from sandboxy.mlflow import mlflow_run_context
+
+        run_name = f"{request.model}-{request.dataset_id}"
+        with mlflow_run_context(mlflow_config, run_name=run_name) as run_id:
+            result = await run_dataset_benchmark()
+
+            # Log aggregate metrics to MLflow
+            if run_id:
+                try:
+                    import mlflow
+
+                    mlflow.log_params(
+                        {
+                            "scenario_id": result.scenario_id,
+                            "dataset_id": result.dataset_id,
+                            "model": result.model,
+                            "total_cases": result.total_cases,
+                        }
+                    )
+                    mlflow.log_metrics(
+                        {
+                            "passed_cases": result.passed_cases,
+                            "failed_cases": result.failed_cases,
+                            "pass_rate": result.pass_rate,
+                            "avg_score": result.avg_score,
+                            "avg_percentage": result.avg_percentage,
+                            "total_time_ms": result.total_time_ms,
+                        }
+                    )
+                    # Log per-expected-outcome metrics
+                    for expected, counts in result.by_expected.items():
+                        total = counts.get("passed", 0) + counts.get("failed", 0)
+                        if total > 0:
+                            rate = counts.get("passed", 0) / total
+                            mlflow.log_metric(f"pass_rate_{expected}", rate)
+                except Exception as e:
+                    logger.warning(f"Failed to log dataset metrics to MLflow: {e}")
     else:
-        result = await run_dataset(
-            scenario=spec,
-            model=request.model,
-            dataset=dataset,
-            max_turns=request.max_turns,
-            max_tokens=request.max_tokens,
-            temperature=request.temperature,
-        )
+        result = await run_dataset_benchmark()
 
     # Save result
     from sandboxy.local.results import save_run_result
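
To make the per-expected-outcome metrics concrete, here is a small sketch of the computation from the hunk above, run on a hypothetical `by_expected` mapping (its `{expected: {"passed": n, "failed": m}}` shape is inferred from the loop; the outcome names and counts are made up):

```python
# Hypothetical by_expected payload; the pass_rate_<expected> naming mirrors the
# mlflow.log_metric call in the hunk above.
by_expected = {"approve": {"passed": 3, "failed": 1}, "deny": {"passed": 2, "failed": 0}}
for expected, counts in by_expected.items():
    total = counts.get("passed", 0) + counts.get("failed", 0)
    if total > 0:
        rate = counts.get("passed", 0) / total
        print(f"pass_rate_{expected} = {rate:.2f}")
# pass_rate_approve = 0.75
# pass_rate_deny = 1.00
```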
sandboxy/cli/main.py CHANGED
@@ -98,6 +98,137 @@ def _load_variables_from_env() -> dict:
     return {}
 
 
+def _export_to_mlflow(
+    result: Any,
+    spec: Any,
+    scenario_path: Path,
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    agent_name: str = "default",
+) -> None:
+    """Export scenario result to MLflow if enabled.
+
+    Args:
+        result: ScenarioResult from runner
+        spec: ScenarioSpec
+        scenario_path: Path to scenario file
+        mlflow_export: --mlflow-export flag
+        no_mlflow: --no-mlflow flag
+        mlflow_tracking_uri: --mlflow-tracking-uri value
+        mlflow_experiment: --mlflow-experiment value
+        agent_name: Agent configuration name
+    """
+    from sandboxy.mlflow.config import MLflowConfig
+
+    # Get YAML config from spec
+    yaml_config = None
+    if spec.mlflow:
+        yaml_config = {
+            "enabled": spec.mlflow.enabled,
+            "experiment": spec.mlflow.experiment,
+            "tracking_uri": spec.mlflow.tracking_uri,
+            "tags": spec.mlflow.tags,
+        }
+
+    # Resolve config with precedence
+    config = MLflowConfig.resolve(
+        cli_export=mlflow_export,
+        cli_no_mlflow=no_mlflow,
+        cli_tracking_uri=mlflow_tracking_uri,
+        cli_experiment=mlflow_experiment,
+        yaml_config=yaml_config,
+        scenario_name=spec.name,
+    )
+
+    if not config.enabled:
+        return
+
+    # Import and use exporter
+    try:
+        from sandboxy.mlflow.exporter import MLflowExporter
+
+        exporter = MLflowExporter(config)
+
+        # Convert ScenarioResult to RunResult-like for exporter
+        # ScenarioResult has different structure, create adapter
+        run_id = exporter.export(
+            result=_adapt_scenario_result(result),
+            scenario_path=scenario_path,
+            scenario_name=spec.name,
+            scenario_id=spec.id,
+            agent_name=agent_name,
+        )
+
+        if run_id:
+            click.echo(f"\nExported to MLflow: run_id={run_id}")
+
+    except ImportError:
+        click.echo(
+            "\nMLflow not installed. Install with: pip install sandboxy[mlflow]",
+            err=True,
+        )
+    except Exception as e:
+        click.echo(f"\nWarning: MLflow export failed: {e}", err=True)
+
+
+def _adapt_scenario_result(result: Any) -> Any:
+    """Adapt ScenarioResult to RunResult-like interface for MLflowExporter.
+
+    The exporter expects RunResult fields, but ScenarioRunner returns ScenarioResult.
+    This creates an adapter object.
+    """
+    from dataclasses import dataclass, field
+
+    @dataclass
+    class GoalResultAdapter:
+        name: str
+        score: float
+        passed: bool = True
+
+    @dataclass
+    class EvaluationAdapter:
+        goals: list[GoalResultAdapter] = field(default_factory=list)
+        total_score: float = 0.0
+        max_score: float = 0.0
+        percentage: float = 0.0
+
+    @dataclass
+    class RunResultAdapter:
+        model: str = ""
+        error: str | None = None
+        latency_ms: int = 0
+        input_tokens: int = 0
+        output_tokens: int = 0
+        evaluation: EvaluationAdapter | None = None
+
+    # Extract data from ScenarioResult
+    adapter = RunResultAdapter(
+        model=getattr(result, "agent_id", "unknown"),
+        error=None,
+    )
+
+    # Build evaluation from goals
+    goals = []
+    total = 0.0
+    for goal_name in getattr(result, "goals_achieved", []):
+        goals.append(GoalResultAdapter(name=goal_name, score=1.0, passed=True))
+        total += 1.0
+
+    score = getattr(result, "score", 0.0)
+    max_score = max(score, len(goals)) if goals else score
+
+    adapter.evaluation = EvaluationAdapter(
+        goals=goals,
+        total_score=score,
+        max_score=max_score,
+        percentage=(score / max_score * 100) if max_score > 0 else 0.0,
+    )
+
+    return adapter
+
+
 @main.command()
 @click.option("--with-examples", is_flag=True, help="Include example scenarios and tools")
 @click.option(
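
A quick sketch of what `_adapt_scenario_result` yields for a hypothetical ScenarioResult-like object. Only the attribute names `agent_id`, `goals_achieved`, and `score` are taken from the adapter's `getattr` calls above; the goal names and score are made up.

```python
from sandboxy.cli.main import _adapt_scenario_result  # module shown in this diff

# Hypothetical stand-in for a ScenarioResult.
class FakeScenarioResult:
    agent_id = "openai/gpt-4o"
    goals_achieved = ["asked_clarifying_question", "refused_harmful_request"]
    score = 2.0

adapted = _adapt_scenario_result(FakeScenarioResult())
print(adapted.model)                  # openai/gpt-4o
print(len(adapted.evaluation.goals))  # 2
print(adapted.evaluation.percentage)  # 100.0 (score 2.0 against max_score 2.0)
```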
@@ -528,22 +659,54 @@ def info(module_path: str) -> None:
 @click.option(
     "--model",
     "-m",
-    help="Model to use (e.g., openai/gpt-4o, anthropic/claude-3.5-sonnet)",
-    default=None,
+    multiple=True,
+    help="Model(s) to use. Can specify multiple: -m gpt-4o -m claude-3.5-sonnet",
 )
 @click.option("--agent-id", "-a", help="Agent ID from config files", default=None)
 @click.option("--output", "-o", help="Output file for results JSON", default=None)
 @click.option("--pretty", "-p", is_flag=True, help="Pretty print output")
 @click.option("--max-turns", type=int, default=20, help="Maximum conversation turns")
 @click.option("--var", "-v", multiple=True, help="Variable in name=value format")
+@click.option(
+    "--mlflow-export",
+    is_flag=True,
+    help="Export run results to MLflow tracking server",
+)
+@click.option(
+    "--no-mlflow",
+    is_flag=True,
+    help="Disable MLflow export (overrides YAML config)",
+)
+@click.option(
+    "--mlflow-tracking-uri",
+    type=str,
+    default=None,
+    help="MLflow tracking server URI (overrides MLFLOW_TRACKING_URI env)",
+)
+@click.option(
+    "--mlflow-experiment",
+    type=str,
+    default=None,
+    help="MLflow experiment name (defaults to scenario name)",
+)
+@click.option(
+    "--mlflow-no-tracing",
+    is_flag=True,
+    help="Disable LLM call tracing (only log summary metrics)",
+)
 def scenario(
     scenario_path: str,
-    model: str | None,
+    model: tuple[str, ...],
     agent_id: str | None,
     output: str | None,
     pretty: bool,
     max_turns: int,
     var: tuple[str, ...],
+    mlflow_export: bool,
+    no_mlflow: bool,
+    mlflow_tracking_uri: str | None,
+    mlflow_experiment: str | None,
+    mlflow_no_tracing: bool,
 ) -> None:
     """Run a scenario with YAML-defined tools.
 
@@ -554,8 +717,10 @@ def scenario(
 
     Examples:
         sandboxy scenario scenarios/trolley.yml -m openai/gpt-4o
-        sandboxy scenario scenarios/trolley.yml -m anthropic/claude-3.5-sonnet -p
+        sandboxy scenario scenarios/trolley.yml -m gpt-4o -m claude-3.5-sonnet  # multiple models
         sandboxy scenario scenarios/surgeon.yml -v patient="John Smith" -v condition="critical"
+        sandboxy scenario scenarios/test.yml -m gpt-4o --mlflow-export
+        sandboxy scenario scenarios/test.yml -m gpt-4o -m gpt-4o-mini --mlflow-export  # compare models
     """
     from sandboxy.agents.base import AgentConfig
     from sandboxy.agents.llm_prompt import LlmPromptAgent
@@ -567,6 +732,26 @@ def scenario(
         click.echo(f"Error loading scenario: {e}", err=True)
         sys.exit(1)
 
+    # Build MLflow config if export requested
+    mlflow_config = None
+    if mlflow_export and not no_mlflow:
+        try:
+            from sandboxy.mlflow import MLflowConfig
+
+            mlflow_config = MLflowConfig.resolve(
+                cli_export=True,
+                cli_tracking_uri=mlflow_tracking_uri,
+                cli_experiment=mlflow_experiment,
+                cli_tracing=not mlflow_no_tracing,
+                yaml_config=spec.mlflow.model_dump() if spec.mlflow else None,
+                scenario_name=spec.name,
+            )
+            click.echo(f"MLflow enabled → experiment: {mlflow_config.experiment}")
+            if mlflow_config.tracing:
+                click.echo(" Tracing: ON (LLM calls will be captured)")
+        except ImportError:
+            pass  # MLflow not installed
+
     # Parse and apply variables
     variables: dict[str, Any] = {}
     for v in var:
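
For reference, a sketch of the shape that `spec.mlflow.model_dump()` would feed into `MLflowConfig.resolve` as `yaml_config`. The key names are taken from the dict built in `_export_to_mlflow` earlier in this diff; the values are illustrative, and per the `--no-mlflow` help text the CLI flags take precedence over this block.

```python
# Illustrative only: what a scenario's mlflow block might dump to.
yaml_config = {
    "enabled": True,
    "experiment": "trolley-evals",
    "tracking_uri": "http://localhost:5000",
    "tags": {"team": "safety"},
}
```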
@@ -582,27 +767,17 @@ def scenario(
         spec = apply_scenario_variables(spec, variables)
         click.echo(f"Variables: {variables}")
 
-    # Determine which agent to use
-    agent = None
+    # Build list of models to run
+    models_to_run: list[str] = []
 
     if model:
-        # Create ad-hoc agent from model string
-        config = AgentConfig(
-            id=model,
-            name=model.split("/")[-1] if "/" in model else model,
-            kind="llm-prompt",
-            model=model,
-            system_prompt="",
-            tools=[],
-            params={"temperature": 0.7, "max_tokens": 4096},
-            impl={},
-        )
-        agent = LlmPromptAgent(config)
+        models_to_run = list(model)
     elif agent_id:
         # Load from agent config files
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load(agent_id)
+            models_to_run = [agent.config.model]
         except ValueError as e:
             click.echo(f"Error loading agent: {e}", err=True)
             sys.exit(1)
@@ -611,6 +786,7 @@ def scenario(
         loader = AgentLoader(DEFAULT_AGENT_DIRS)
         try:
             agent = loader.load_default()
+            models_to_run = [agent.config.model]
         except ValueError:
             click.echo("No model specified. Use -m to specify a model:", err=True)
             click.echo("", err=True)
@@ -623,25 +799,110 @@ def scenario(
             )
             sys.exit(1)
 
-    # Apply scenario's system prompt to agent
-    if spec.system_prompt:
-        agent.config.system_prompt = spec.system_prompt
-
     click.echo(f"Running scenario: {spec.name}")
-    click.echo(f"Using model: {agent.config.model}")
+    click.echo(f"Models: {', '.join(models_to_run)}")
     click.echo(f"Tools loaded: {len(spec.tools) + len(spec.tools_from)} source(s)")
+    if len(models_to_run) > 1:
+        click.echo("Running models in parallel...")
     click.echo("")
 
-    runner = ScenarioRunner(scenario=spec, agent=agent)
-    result = runner.run(max_turns=max_turns)
+    def run_single_model(model_id: str) -> dict[str, Any]:
+        """Run scenario with a single model, with MLflow tracing if enabled."""
+        agent_config = AgentConfig(
+            id=model_id,
+            name=model_id.split("/")[-1] if "/" in model_id else model_id,
+            kind="llm-prompt",
+            model=model_id,
+            system_prompt=spec.system_prompt or "",
+            tools=[],
+            params={"temperature": 0.7, "max_tokens": 4096},
+            impl={},
+        )
+        agent = LlmPromptAgent(agent_config)
+
+        # If MLflow enabled, wrap execution in run context so traces are connected
+        if mlflow_config and mlflow_config.enabled:
+            from sandboxy.mlflow import MLflowExporter, mlflow_run_context
+            from sandboxy.mlflow.tracing import enable_tracing
+
+            # Enable tracing before the run starts
+            if mlflow_config.tracing:
+                enable_tracing(
+                    tracking_uri=mlflow_config.tracking_uri,
+                    experiment_name=mlflow_config.experiment,
+                )
+
+            # Start run, execute scenario, then log metrics - all connected
+            with mlflow_run_context(mlflow_config, run_name=model_id) as run_id:
+                runner = ScenarioRunner(scenario=spec, agent=agent)
+                result = runner.run(max_turns=max_turns)
+
+                # Log metrics to the active run (traces are already attached)
+                if run_id:
+                    exporter = MLflowExporter(mlflow_config)
+                    exporter.log_to_active_run(
+                        result=result,
+                        scenario_path=Path(scenario_path),
+                        scenario_name=spec.name,
+                        scenario_id=spec.id,
+                        agent_name=agent.config.name,
+                    )
+
+            return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+        # No MLflow - just run scenario
+        runner = ScenarioRunner(scenario=spec, agent=agent)
+        result = runner.run(max_turns=max_turns)
+        return {"model": model_id, "result": result, "agent_name": agent.config.name}
+
+    # Run models in parallel if multiple, otherwise just run single
+    results: list[Any] = []
+    if len(models_to_run) == 1:
+        results = [run_single_model(models_to_run[0])]
+    else:
+        from concurrent.futures import ThreadPoolExecutor, as_completed
+
+        with ThreadPoolExecutor(max_workers=len(models_to_run)) as executor:
+            futures = {executor.submit(run_single_model, m): m for m in models_to_run}
+            for future in as_completed(futures):
+                model_id = futures[future]
+                try:
+                    result_data = future.result()
+                    results.append(result_data)
+                    click.echo(f"✓ Completed: {model_id}")
+                except Exception as e:
+                    click.echo(f"✗ Failed: {model_id} - {e}", err=True)
+        click.echo("")
 
-    if output:
-        Path(output).write_text(result.to_json(indent=2))
-        click.echo(f"\nResults saved to: {output}")
-    elif pretty:
-        click.echo(result.pretty())
+    # Output results
+    if len(results) == 1:
+        result = results[0]["result"]
+        if output:
+            Path(output).write_text(result.to_json(indent=2))
+            click.echo(f"\nResults saved to: {output}")
+        elif pretty:
+            click.echo(result.pretty())
+        else:
+            click.echo(result.to_json(indent=2))
     else:
-        click.echo(result.to_json(indent=2))
+        # Multiple models - show summary
+        # Get max_score from spec (scoring config or sum of goal points)
+        max_score = spec.scoring.get("max_score", 0) if spec.scoring else 0
+        if not max_score and spec.goals:
+            max_score = sum(g.points for g in spec.goals)
+
+        click.echo("=== Results Summary ===")
+        for r in results:
+            model_name = r["model"]
+            res = r["result"]
+            score = getattr(res, "score", 0) or 0
+            pct = (score / max_score * 100) if max_score > 0 else 0
+            click.echo(f" {model_name}: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")
+
+        if output:
+            all_results = [{"model": r["model"], "result": r["result"].to_dict()} for r in results]
+            Path(output).write_text(json.dumps(all_results, indent=2))
+            click.echo(f"\nResults saved to: {output}")
 
 
 @main.command()
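
As a worked illustration of the summary line computed in the multi-model branch above (the goal points and the score are hypothetical; the formula mirrors the code):

```python
# Hypothetical numbers for one model in the summary.
goal_points = [1.0, 2.0, 2.0]        # spec.goals -> g.points
max_score = sum(goal_points)         # 5.0 when scoring.max_score is unset
score = 3.0                          # getattr(res, "score", 0) for this model
pct = (score / max_score * 100) if max_score > 0 else 0
print(f" openai/gpt-4o: {score:.1f}/{max_score:.1f} ({pct:.0f}%)")  # 3.0/5.0 (60%)
```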