hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py (CHANGED)
```diff
@@ -1,762 +1,876 @@
-"""HUD evaluation command for running tasks and datasets.
+"""HUD evaluation command for running tasks and datasets.
+
+Config Override Order: CLI arguments > .hud_eval.toml > defaults
+"""
 
 from __future__ import annotations
 
 import asyncio
 import logging
+import re
+import time
+import tomllib
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, ClassVar
 
+import questionary
 import typer
+from pydantic import BaseModel, Field, field_validator
+from rich import box
+from rich.table import Table
 
-import hud
-from hud.cli.utils.env_check import ensure_built, find_environment_dir
 from hud.settings import settings
-from hud.
+from hud.types import AgentType
+from hud.utils.env import resolve_env_vars
 from hud.utils.hud_console import HUDConsole
 
+# Pattern to detect AWS Bedrock inference profile ARNs
+_BEDROCK_ARN_PATTERN = re.compile(r"^arn:aws:bedrock:[a-z0-9-]+:\d+:inference-profile/.+$")
+
+
+def _is_bedrock_arn(model: str | None) -> bool:
+    """Check if a model string is a Bedrock inference profile ARN."""
+    return model is not None and bool(_BEDROCK_ARN_PATTERN.match(model))
+
+
 if TYPE_CHECKING:
-    from hud.
+    from hud.agents.base import MCPAgent
+
 logger = logging.getLogger(__name__)
 hud_console = HUDConsole()
 
+_CONFIG_PATH = ".hud_eval.toml"
 
-def get_available_models() -> list[dict[str, str | None]]:
-    """Fetch available models from the HUD API (only ready models).
-
-    Returns:
-        List of dicts with 'name', 'vllm_url', and 'base_model' keys
-    """
-    try:
-        from hud.cli.rl import rl_api
-
-        hud_console.info("Fetching your models from https://hud.so/models")
-        models = rl_api.list_models()
 
-
-
-
+@dataclass(frozen=True)
+class AgentPreset:
+    """A preset agent configuration combining agent type, model, and optional config."""
 
-
-
-
+    name: str
+    agent_type: AgentType
+    model: str | None = None
+    agent_config: dict[str, Any] | None = None
 
-        if ready_models:
-            hud_console.success(f"Found {len(ready_models)} ready models:")
-            for model in ready_models:
-                vllm_status = " (vLLM deployed)" if model.vllm_url else ""
-                hud_console.info(f" ✅ {model.name}{vllm_status}")
 
-
-
+# Built-in presets for the interactive picker
+_AGENT_PRESETS: list[AgentPreset] = [
+    # Native agents (use provider SDKs directly)
+    AgentPreset("Claude Sonnet 4.5", AgentType.CLAUDE, "claude-sonnet-4-5"),
+    AgentPreset("GPT-5", AgentType.OPENAI, "gpt-5"),
+    AgentPreset("Operator (OpenAI Computer Use)", AgentType.OPERATOR, "computer-use-preview"),
+    AgentPreset("Gemini 3 Pro Preview", AgentType.GEMINI, "gemini-3-pro-preview"),
+    AgentPreset(
+        "Gemini CUA (Gemini Computer Use)",
+        AgentType.GEMINI_CUA,
+        "gemini-2.5-computer-use-preview",
+    ),
+    # HUD Gateway presets (models via HUD Inference API)
+    AgentPreset(
+        "Grok 4-1 Fast (xAI)",
+        AgentType.OPENAI_COMPATIBLE,
+        "grok-4-1-fast",
+        {
+            "openai_compatible": {
+                "base_url": settings.hud_gateway_url,
+                "model_name": "Grok 4-1 Fast",
+            }
+        },
+    ),
+    AgentPreset(
+        "GLM-4.5V (Z-AI)",
+        AgentType.OPENAI_COMPATIBLE,
+        "z-ai/glm-4.5v",
+        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.5V"}},
+    ),
+]
+
+_DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
+# Command-line arguments override these settings
+
+[eval]
+# source = "hud-evals/SheetBench-50"
+# agent = "claude"
+# all = false  # Run all problems instead of just 1
+# max_concurrent = 30
+# max_steps = 10
+# group_size = 1
+# byok = false  # Remote only; use encrypted env vars on the platform.
+# task_ids = ["task_1", "task_2"]
+# verbose = true
+# very_verbose = true
+# auto_respond = true
+# gateway = false  # Route LLM API calls through HUD Gateway
+
+[agent]
+# allowed_tools = ["computer", "playwright"]
+# disallowed_tools = []
+
+[claude]
+# model = "claude-sonnet-4-5"
+# max_tokens = 16384
+# use_computer_beta = true
+
+[openai]
+# model = "gpt-4o"
+# temperature = 0.7
+# max_output_tokens = 4096
+
+[gemini]
+# model = "gemini-2.5-pro"
+# temperature = 1.0
+# top_p = 0.95
+
+[gemini_cua]
+# model = "gemini-2.5-computer-use-preview"
+# temperature = 1.0
+# top_p = 0.95
+# excluded_predefined_functions = []
+
+[openai_compatible]
+# base_url = "http://localhost:8000/v1"
+# model = "my-model"
+"""
+
+# Agent type -> (settings attr, env var name)
+_API_KEY_REQUIREMENTS: dict[AgentType, tuple[str, str]] = {
+    AgentType.CLAUDE: ("anthropic_api_key", "ANTHROPIC_API_KEY"),
+    AgentType.GEMINI: ("gemini_api_key", "GEMINI_API_KEY"),
+    AgentType.GEMINI_CUA: ("gemini_api_key", "GEMINI_API_KEY"),
+    AgentType.OPENAI: ("openai_api_key", "OPENAI_API_KEY"),
+    AgentType.OPERATOR: ("openai_api_key", "OPENAI_API_KEY"),
+}
+
+
+class EvalConfig(BaseModel):
+    """Configuration for hud eval command."""
+
+    # Class-level registry
+    _agent_classes: ClassVar[dict[AgentType, type["MCPAgent"]]] = {}
+
+    # Fields loaded from [eval] section
+    _EVAL_FIELDS: ClassVar[set[str]] = {
+        "source",
+        "agent_type",
+        "task_ids",
+        "all",
+        "max_concurrent",
+        "max_steps",
+        "verbose",
+        "very_verbose",
+        "group_size",
+        "byok",
+        "remote",
+        "auto_respond",
+        "quiet",
+        "gateway",
+    }
+    # Fields loaded from [agent] section
+    _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
+
+    # Eval settings
+    source: str | None = None
+    agent_type: AgentType | None = None
+    model: str | None = None
+    task_ids: list[str] | None = None
+    all: bool = False  # Run all problems instead of just 1
+    max_concurrent: int = 30
+    max_steps: int = 10
+    verbose: bool = False
+    very_verbose: bool = False
+    auto_respond: bool | None = None  # Continue without prompting
+    group_size: int = 1
+    byok: bool = False
+    remote: bool = False
+    quiet: bool = False  # Suppress opening browser for eval links
+    gateway: bool = False  # Use HUD Gateway for LLM API calls
+
+    # Base agent config (these merge with task's agent_config)
+    allowed_tools: list[str] | None = None
+    disallowed_tools: list[str] | None = None
+
+    agent_config: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("agent_type", mode="before")
+    @classmethod
+    def _parse_agent_type(cls, v: Any) -> AgentType | None:
+        """Convert string agent name to AgentType enum."""
+        if v is None:
+            return None
+        if isinstance(v, AgentType):
+            return v
+        if isinstance(v, str):
+            try:
+                return AgentType(v)
+            except ValueError:
+                valid = [e.value for e in AgentType]
+                raise ValueError(
+                    f"Invalid agent: {v}. Must be one of: {', '.join(valid)}"
+                ) from None
+        return v
+
+    def validate_api_keys(self) -> None:
+        """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
+        # BYOK requires remote execution (check before agent_type guard)
+        if self.byok and not self.remote:
+            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
+            raise typer.Exit(1)
 
-
-
-
-
-
-
-        hud_console.
-
+        if self.agent_type is None:
+            return
+
+        if self.remote:
+            if not settings.api_key:
+                hud_console.error("HUD_API_KEY is required for remote execution")
+                hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
+                raise typer.Exit(1)
+            return
+
+        # Gateway mode only requires HUD_API_KEY
+        if self.gateway:
+            if not settings.api_key:
+                hud_console.error("HUD_API_KEY is required for gateway mode")
+                hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
+                raise typer.Exit(1)
+            return
+
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE:
+            # Check both CLI --model and config file model
+            config_model = self.agent_config.get("openai_compatible", {}).get("model")
+            if not self.model and not config_model:
+                hud_console.error(
+                    "Model name is required for OpenAI compatible agent. "
+                    "Use --model or set model in [openai_compatible] section of .hud_eval.toml"
                 )
-
-
-
-
-
-
-        return []
-
-
-def build_agent(
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
-    *,
-    model: str | None = None,
-    allowed_tools: list[str] | None = None,
-    verbose: bool = False,
-    vllm_base_url: str | None = None,
-) -> Any:
-    """Create and return the requested agent type."""
-
-    # Import agents lazily to avoid dependency issues
-    if agent_type == "integration_test":
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        return IntegrationTestRunner(verbose=verbose)
-    elif agent_type == "vllm":
-        # Create a generic OpenAI agent for vLLM server
-        try:
-            from openai import AsyncOpenAI
-
-            from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-        except ImportError as e:
-            hud_console.error(
-                "OpenAI dependencies are not installed. "
-                "Please install with: pip install 'hud-python[agent]'"
-            )
-            raise typer.Exit(1) from e
-
-        # Determine the base URL to use
-        if vllm_base_url is not None:
-            # Use the provided vLLM URL (for custom/local servers)
-            base_url = vllm_base_url
-            hud_console.info(f"Using vLLM server at {base_url}")
-            api_key = (
-                settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
+                raise typer.Exit(1)
+        elif self.agent_type == AgentType.CLAUDE and _is_bedrock_arn(self.model):
+            missing_aws = (
+                not settings.aws_access_key_id
+                or not settings.aws_secret_access_key
+                or not settings.aws_region
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if missing_aws:
+                hud_console.error(
+                    "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
+                    "are required for AWS Bedrock"
+                )
+                raise typer.Exit(1)
+        elif self.agent_type in _API_KEY_REQUIREMENTS:
+            attr, env_var = _API_KEY_REQUIREMENTS[self.agent_type]
+            if not getattr(settings, attr, None):
+                hud_console.error(f"{env_var} is required for {self.agent_type.value} agent")
+                hud_console.info(f"Set it: hud set {env_var}=your-key-here")
+                raise typer.Exit(1)
+
+        if not settings.api_key:
+            hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
+
+    def get_agent_kwargs(self) -> dict[str, Any]:
+        """Build agent kwargs from config.
+
+        Model precedence:
+        1. CLI --model (highest priority)
+        2. [agent_type].model in TOML (per-agent config)
+        """
+        if self.agent_type is None:
+            raise ValueError("agent_type must be set before calling get_agent_kwargs()")
+
+        kwargs: dict[str, Any] = {}
+
+        if self.allowed_tools:
+            kwargs["allowed_tools"] = self.allowed_tools
+        if self.disallowed_tools:
+            kwargs["disallowed_tools"] = self.disallowed_tools
+
+        # Apply agent-specific config
+        agent_key = self.agent_type.value
+        if agent_key in self.agent_config:
+            agent_cfg = dict(self.agent_config[agent_key])
+            kwargs.update(agent_cfg)
+
+        # CLI --model always wins
+        if self.model:
+            kwargs["model"] = self.model
+
+        # For gateway base_url, inject HUD API key if not already set
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
+            base_url = kwargs.get("base_url", "")
+            if settings.hud_gateway_url in base_url and settings.api_key:
+                kwargs["api_key"] = settings.api_key
+
+        # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
+        # Check both model and checkpoint_name for ARN patterns
+        bedrock_arn_detected = _is_bedrock_arn(kwargs.get("model")) or _is_bedrock_arn(
+            kwargs.get("checkpoint_name")
         )
-
-
-
-
-
-        hud_console.error(
-            "OpenAI agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+        if self.agent_type == AgentType.CLAUDE and bedrock_arn_detected:
+            missing_aws = (
+                not settings.aws_access_key_id
+                or not settings.aws_secret_access_key
+                or not settings.aws_region
             )
-
+            if missing_aws:
+                hud_console.error(
+                    "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
+                    "are required for AWS Bedrock"
+                )
+                raise typer.Exit(1)
 
-
-        return OperatorAgent(
-            allowed_tools=allowed_tools,
-            verbose=verbose,
-        )
-    else:
-        return OperatorAgent(verbose=verbose)
+            from anthropic import AsyncAnthropicBedrock
 
-
-
-
-
-        hud_console.error(
-            "LiteLLM agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+            kwargs["model_client"] = AsyncAnthropicBedrock(
+                aws_access_key=settings.aws_access_key_id,
+                aws_secret_key=settings.aws_secret_access_key,
+                aws_region=settings.aws_region or "us-east-1",
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            verbose=verbose,
-        )
+            hud_console.info("🔧 Using AWS Bedrock (detected ARN in model)")
+
+        kwargs["verbose"] = self.verbose or self.very_verbose
+
+        if self.agent_type in (
+            AgentType.CLAUDE,
+            AgentType.OPENAI,
+            AgentType.OPERATOR,
+            AgentType.GEMINI,
+            AgentType.GEMINI_CUA,
+        ):
+            kwargs["validate_api_key"] = False
+
+        # Configure gateway mode - route LLM API calls through HUD gateway
+        if self.gateway:
+            hud_api_key = settings.api_key
+            if not hud_api_key:
+                raise typer.Exit(1)  # Already validated in validate_api_keys()
+
+            if self.agent_type == AgentType.CLAUDE:
+                from anthropic import AsyncAnthropic
+
+                kwargs["model_client"] = AsyncAnthropic(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for Claude API")
+            elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
+                from openai import AsyncOpenAI
 
+                kwargs["model_client"] = AsyncOpenAI(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for OpenAI API")
+            elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
+                from openai import AsyncOpenAI
 
-
-
-
-
-
-
-
-
-
-
-
-
+                kwargs["openai_client"] = AsyncOpenAI(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
+            elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
+                from google import genai
+                from google.genai.types import HttpOptions
+
+                kwargs["model_client"] = genai.Client(
+                    api_key="PLACEHOLDER",
+                    http_options=HttpOptions(
+                        api_version="v1beta",
+                        base_url=settings.hud_gateway_url,
+                        headers={"Authorization": f"Bearer {hud_api_key}"},
+                    ),
+                )
+                hud_console.info("🌐 Using HUD Gateway for Gemini API")
 
-
-    try:
-        from hud.utils.tasks import load_tasks
-    except ImportError as e:
-        hud_console.error(
-            "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
+        return kwargs
 
-
-
-
-
+    @classmethod
+    def load(cls, path: str = _CONFIG_PATH) -> EvalConfig:
+        """Load config from TOML file."""
+        p = Path(path)
+        if not p.exists():
+            p.write_text(_DEFAULT_CONFIG_TEMPLATE)
+            hud_console.info(f"Generated {_CONFIG_PATH}")
+            return cls()
 
-    # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
         try:
-
-
-            # Non-interactive for eval; warn but don't block
-            ensure_built(env_dir, interactive=False)
+            with open(p, "rb") as f:
+                toml_data = tomllib.load(f)
         except Exception as e:
-            hud_console.
+            hud_console.warning(f"Failed to parse {path}: {e}")
+            return cls()
 
-
-    task = tasks[0]
-    hud_console.info("Found 1 task, running as single task…")
+        toml_data = resolve_env_vars(toml_data)
 
-
-
-
-    tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
+        # Extract sections
+        eval_section = toml_data.get("eval", {})
+        agent_section = toml_data.get("agent", {})
 
-
-
-    raise typer.Exit(1)
+        # Build config data
+        data: dict[str, Any] = {}
 
-    #
-
-
-
-
+        # Eval settings (map 'agent' -> 'agent_type')
+        if "agent" in eval_section:
+            data["agent_type"] = eval_section["agent"]
+        for key in cls._EVAL_FIELDS:
+            if key in eval_section:
+                data[key] = eval_section[key]
 
-
-
-
-
-    if agent_type == "integration_test":
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        agent_class = IntegrationTestRunner
-        agent_config = {"verbose": verbose}
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "vllm":
-        # Special handling for vLLM
-        sample_agent = build_agent(
-            agent_type,
-            model=model,
-            allowed_tools=allowed_tools,
-            verbose=verbose,
-            vllm_base_url=vllm_base_url,
-        )
-        agent_config = {
-            "openai_client": sample_agent.oai,
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-
-        from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-
-        agent_class = GenericOpenAIChatAgent
-    elif agent_type == "openai":
-        from hud.agents import OperatorAgent
-
-        agent_class = OperatorAgent
-        agent_config = {"verbose": verbose}
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "litellm":
-        from hud.agents.lite_llm import LiteAgent
-
-        agent_class = LiteAgent
-        agent_config = {
-            "model_name": model or "gpt-4o-mini",
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "claude":
-        from hud.agents import ClaudeAgent
-
-        agent_class = ClaudeAgent
-        agent_config = {
-            "model": model or "claude-sonnet-4-20250514",
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    else:
-        raise ValueError(f"Invalid agent type: {agent_type}")
-
-    if group_size > 1:
-        hud_console.info(f"🔄 Running task with group_size={group_size}")
-        # Run with grouping
-        stats = await run_tasks_grouped(
-            tasks=[task],
-            agent_class=agent_class,
-            agent_config=agent_config,
-            group_size=group_size,
-            max_parallel_episodes=48,  # Same as RL default
-            max_steps=max_steps,
-            verbose=verbose,
-        )
-        display_group_statistics(stats, show_details=True)
-    else:
-        # Original single-run logic
-        with hud.trace(name=task_prompt):
-            agent = build_agent(
-                agent_type,
-                model=model,
-                allowed_tools=allowed_tools,
-                verbose=verbose,
-                vllm_base_url=vllm_base_url,
-            )
-            hud_console.info(task.prompt)
-            result = await agent.run(task, max_steps=max_steps)
-            hud_console.success(f"Reward: {result.reward}")
-
-
-async def run_full_dataset(
-    source: str,
-    *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
-    model: str | None = None,
-    allowed_tools: list[str] | None = None,
-    max_concurrent: int = 30,
-    max_steps: int = 10,
-    parallel: bool = False,
-    max_workers: int | None = None,
-    max_concurrent_per_worker: int = 25,
-    verbose: bool = False,
-    vllm_base_url: str | None = None,
-    group_size: int = 1,
-) -> list[Any]:
-    """Run evaluation across the entire dataset.
-
-    Uses either asyncio-based run_dataset or process-based parallel execution
-    depending on the parallel flag."""
-
-    # Import run_dataset lazily
-    try:
-        from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
-        from hud.utils.tasks import load_tasks
-    except ImportError as e:
-        hud_console.error(
-            "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
-
-    # Load tasks using unified loader
-    hud_console.info(f"📊 Loading tasks from: {source}…")
-    tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
-
-    if not tasks:
-        hud_console.error(f"No tasks found in: {source}")
-        raise typer.Exit(1)
+        # Agent base config
+        for key in cls._AGENT_FIELDS:
+            if key in agent_section:
+                data[key] = agent_section[key]
 
-
-
+        # Agent-specific configs (claude, openai, gemini, etc.)
+        agent_config: dict[str, Any] = {}
+        for agent_type in AgentType:
+            if agent_type.value in toml_data:
+                agent_config[agent_type.value] = toml_data[agent_type.value]
+        data["agent_config"] = agent_config
 
-    # Determine dataset name
-    path = Path(source)
-    dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
-
-    # Build agent class + config for run_dataset
-    if agent_type == "integration_test":  # --integration-test mode
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        agent_class = IntegrationTestRunner
-        agent_config = {"verbose": verbose}
-    elif agent_type == "vllm":
         try:
-
-
-
-
-
-
-
+            return cls.model_validate(data)
+        except Exception as e:
+            hud_console.warning(f"Invalid config: {e}")
+            return cls()
+
+    def merge_cli(
+        self,
+        agent: str | None = None,
+        config: list[str] | None = None,
+        allowed_tools: str | None = None,
+        disallowed_tools: str | None = None,
+        task_ids: str | None = None,
+        **cli_args: Any,
+    ) -> EvalConfig:
+        """Merge CLI args (non-None values override config)."""
+        overrides: dict[str, Any] = {}
+
+        if agent is not None:
+            overrides["agent_type"] = agent
+
+        # Parse comma-separated lists
+        if allowed_tools is not None:
+            overrides["allowed_tools"] = [t.strip() for t in allowed_tools.split(",") if t.strip()]
+        if disallowed_tools is not None:
+            overrides["disallowed_tools"] = [
+                t.strip() for t in disallowed_tools.split(",") if t.strip()
+            ]
+        if task_ids is not None:
+            overrides["task_ids"] = [t.strip() for t in task_ids.split(",") if t.strip()]
+
+        overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
+
+        for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
+            if cli_args.get(k) is True:
+                overrides[k] = True
+            elif k in overrides and cli_args.get(k) is False:
+                del overrides[k]
+
+        # --full is a shortcut for --all --auto-respond --max-steps 100
+        if overrides.get("full"):
+            overrides["all"] = True
+            if "auto_respond" not in overrides:
+                overrides["auto_respond"] = True
+            if "max_steps" not in overrides:
+                overrides["max_steps"] = 100
+
+        if config:
+            merged_agent_config = dict(self.agent_config)
+            for item in config:
+                if "=" in item:
+                    key, value = item.split("=", 1)
+                    key = key.strip()
+                    value = value.strip()
+
+                    # Parse value
+                    if value.lower() == "true":
+                        parsed_value: Any = True
+                    elif value.lower() == "false":
+                        parsed_value = False
+                    else:
+                        try:
+                            parsed_value = int(value)
+                        except ValueError:
+                            try:
+                                parsed_value = float(value)
+                            except ValueError:
+                                parsed_value = value
+
+                    # Handle namespaced keys (e.g., claude.max_tokens)
+                    if "." in key:
+                        agent_name, param = key.split(".", 1)
+                        if agent_name not in merged_agent_config:
+                            merged_agent_config[agent_name] = {}
+                        merged_agent_config[agent_name][param] = parsed_value
+                    else:
+                        # Non-namespaced: apply to current agent if set
+                        if self.agent_type:
+                            agent_name = self.agent_type.value
+                            if agent_name not in merged_agent_config:
+                                merged_agent_config[agent_name] = {}
+                            merged_agent_config[agent_name][key] = parsed_value
+
+            overrides["agent_config"] = merged_agent_config
+
+        return self.model_validate({**self.model_dump(), **overrides})
+
+    def resolve_agent_interactive(self) -> EvalConfig:
+        """Prompt user to select an agent preset if not set. Returns updated config."""
+        if self.agent_type is not None:
+            return self
+
+        # Build choices from presets
+        choices: list[dict[str, Any]] = [
+            {"name": preset.name, "value": preset} for preset in _AGENT_PRESETS
+        ]
+
+        selected: AgentPreset = hud_console.select("Select an agent:", choices=choices, default=0)  # type: ignore[arg-type]
+
+        # Merge preset into config
+        updates: dict[str, Any] = {"agent_type": selected.agent_type}
+        if selected.model:
+            updates["model"] = selected.model
+        if selected.agent_config:
+            # Merge preset's agent_config with existing
+            merged = dict(self.agent_config)
+            for key, value in selected.agent_config.items():
+                if key in merged:
+                    merged[key] = {**merged[key], **value}
+                else:
+                    merged[key] = value
+            updates["agent_config"] = merged
+
+        return self.model_validate({**self.model_dump(), **updates})
+
+    def display(self) -> None:
+        """Display settings in a table."""
+        table = Table(title="Evaluation Settings", title_style="bold cyan", box=box.ROUNDED)
+        table.add_column("Setting", style="yellow")
+        table.add_column("Value", style="green")
+
+        # Core settings
+        table.add_row("source", str(self.source or "—"))
+        table.add_row("agent", self.agent_type.value)  # type: ignore[union-attr]
+        if self.task_ids:
+            table.add_row(
+                "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
             )
-
-
-
-
-
-
-
-
-
-
+        table.add_row("all", str(self.all))
+        table.add_row("max_steps", str(self.max_steps))
+        if not self.remote:
+            table.add_row("max_concurrent", str(self.max_concurrent))
+        if self.group_size > 1:
+            table.add_row("group_size", str(self.group_size))
+        if self.auto_respond:
+            table.add_row("auto_respond", "[bold green]True[/bold green]")
+        if self.very_verbose:
+            table.add_row("very_verbose", "[bold green]True[/bold green]")
+        elif self.verbose:
+            table.add_row("verbose", "[bold green]True[/bold green]")
+        if self.remote:
+            table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
+        if self.gateway:
+            table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
+        if self.byok:
+            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
+
+        # Tool filters (only if set)
+        if self.allowed_tools:
+            table.add_row("allowed_tools", ", ".join(self.allowed_tools))
+        if self.disallowed_tools:
+            table.add_row("disallowed_tools", ", ".join(self.disallowed_tools))
+
+        # Agent config section
+        if self.agent_type:
+            table.add_row("", "")
+            table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
+
+            config_cls = self.agent_type.cls.config_cls
+            defaults = config_cls()
+            overrides = self.agent_config.get(self.agent_type.value, {})
+            skip = {
+                "model_client",
+                "model_name",
+                "validate_api_key",
+                "model_config",
+                "allowed_tools",
+                "disallowed_tools",
+                "system_prompt",
+                "response_tool_name",
+                "append_setup_output",
+                "initial_screenshot",
+            }
+
+            sensitive_fields = {"api_key", "api_secret", "token", "password", "secret"}
+
+            for name in config_cls.model_fields:
+                if name in skip:
+                    continue
+                # Always show model
+                if name == "model":
+                    if self.model:
+                        value = self.model
+                    elif overrides.get("model"):
+                        value = overrides["model"]
+                    else:
+                        value = getattr(defaults, "model", None)
+                    table.add_row(" model", str(value) if value else "—")
+                elif name in overrides:
+                    value = overrides[name]
+                    if name in sensitive_fields and value:
+                        display_value = f"{str(value)[:4]}****" if len(str(value)) > 4 else "****"
+                    else:
+                        display_value = str(value)
+                    table.add_row(f" {name}", display_value)
+
+        hud_console.console.print(table)
+
+
+# =============================================================================
+# Evaluation runner
+# =============================================================================
+
+
+async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
+    """Run evaluation with the given config using run_dataset()."""
+    from hud.datasets import load_tasks, run_dataset
+
+    if cfg.source is None or cfg.agent_type is None:
+        raise ValueError("source and agent_type must be set")
+
+    # Load tasks using unified loader (handles v4→v5 conversion automatically)
+    hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
+    tasks = load_tasks(cfg.source)
 
-
-
-
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "openai":
-        try:
-            from hud.agents import OperatorAgent
+    if not tasks:
+        hud_console.error(f"No tasks found in: {cfg.source}")
+        raise typer.Exit(1)
 
-
-
-
-
-
-
-
+    # Filter by task IDs if provided
+    if cfg.task_ids:
+        id_set = set(cfg.task_ids)
+        # Match by task.id or index
+        filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
+        if not filtered:
+            hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
+            raise typer.Exit(1)
+        hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
+        tasks = filtered
+    elif not cfg.all:
+        # Single task mode (no --all, --full, or --task-ids)
+        tasks = [tasks[0]]
+        hud_console.info("Using first task (run with --full or --task-ids for more)…")
+
+    hud_console.info(f"Loaded {len(tasks)} task(s)")
+
+    # Prepare agent kwargs
+    agent_kwargs = cfg.get_agent_kwargs()
+    auto_respond = cfg.auto_respond
+    if auto_respond:
+        agent_kwargs = {**agent_kwargs, "auto_respond": True}
+
+    max_steps = cfg.max_steps
+
+    # Remote execution - submit to HUD platform
+    if cfg.remote:
+        agent_kwargs = {
+            k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
+        }
+        # Create a job ID for tracking
+        import uuid
 
-
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
+        from hud.datasets.utils import submit_rollouts
 
-
-
-
+        job_id = str(uuid.uuid4())
+        hud_console.info(
+            f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
+        )
 
-
-
-
-
-
-
-
+        await submit_rollouts(
+            tasks=tasks,
+            job_id=job_id,
+            agent_type=cfg.agent_type,
+            agent_params=agent_kwargs,
+            max_steps=max_steps,
+            group_size=cfg.group_size,
+            use_byok=cfg.byok,
+        )
 
-
-
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
+        hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
+        return [], tasks
 
+    # Single task mode - show extra info
+    if len(tasks) == 1 and cfg.group_size == 1:
+        logging.getLogger("hud.agents").setLevel(logging.INFO)
+        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
+        # Get prompt from args (v4 tasks) or show scenario name
+        prompt = tasks[0].args.get("prompt") if tasks[0].args else tasks[0].scenario
+        if prompt:
+            hud_console.info(f"Prompt: {prompt}")
     else:
-
-
+        hud_console.info(
+            f"🚀 Running evaluation (max_concurrent: {cfg.max_concurrent}, "
+            f"group_size: {cfg.group_size})…"
+        )
 
-
-
-
-
-
-
-
+    # Run using run_dataset
+    results = await run_dataset(
+        tasks,
+        cfg.agent_type,
+        agent_params=agent_kwargs,
+        max_steps=max_steps,
+        max_concurrent=cfg.max_concurrent,
+        group_size=cfg.group_size,
+        quiet=cfg.quiet,
+    )
 
-
-
-
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-
-    # Use grouped evaluation if group_size > 1
-    if group_size > 1:
-        hud_console.info(f"🔄 Running dataset with group_size={group_size}")
-
-        # Run with job tracking
-        with hud.job(
-            name=f"Evaluation {dataset_name} (group_size={group_size})",
-            metadata={
-                "dataset": source,
-                "group_size": group_size,
-                "tasks": len(dataset_or_tasks),
-                "total_episodes": len(dataset_or_tasks) * group_size,
-            },
-        ) as job:
-            # Convert dicts to Task objects if needed
-            from hud.datasets import Task
-
-            tasks = []
-            for item in dataset_or_tasks:
-                if isinstance(item, dict):
-                    tasks.append(Task(**item))
-                else:
-                    tasks.append(item)
-
-            stats = await run_tasks_grouped(
-                tasks=tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                group_size=group_size,
-                max_parallel_episodes=max_concurrent
-                if not parallel
-                else max_concurrent_per_worker * (max_workers or 4),
-                max_steps=max_steps,
-                verbose=verbose,
-                job_id=job.id,
-            )
+    # Show reward for single task
+    if len(tasks) == 1 and cfg.group_size == 1 and results:
+        hud_console.success(f"Reward: {results[0].reward}")
 
-
-            display_group_statistics(stats, show_details=len(stats) <= 50)
+    return results, tasks
 
-        # Return stats for consistency with other modes
-        return stats
 
-
-
-
-            f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…"  # noqa: E501
-        )
-        if max_workers is None:
-            # Use auto-optimization (now the default run_dataset_parallel)
-            return await run_dataset_parallel(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-        else:
-            # Use manual configuration
-            return await run_dataset_parallel_manual(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-    else:
-        hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
-        return await run_dataset(
-            name=f"Evaluation {dataset_name}",
-            dataset=dataset_or_tasks,
-            agent_class=agent_class,
-            agent_config=agent_config,
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source},
-            max_steps=max_steps,
-        )
+# =============================================================================
+# CLI command
+# =============================================================================
 
 
 def eval_command(
-    source: str = typer.Argument(
-
-
+    source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
+    agent: str | None = typer.Argument(
+        None,
+        help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
     full: bool = typer.Option(
         False,
         "--full",
-        help="Run the entire dataset
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
     ),
-
-
-        "--
-        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
-    ),
-    model: str | None = typer.Option(
-        None,
-        "--model",
-        help="Model name for the chosen agent",
+    model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
+    config: list[str] | None = typer.Option(  # noqa: B008
+        None, "--config", "-c", help="Agent config: key=value"
    ),
+    # Task-overridable settings
     allowed_tools: str | None = typer.Option(
-        None,
-        "--allowed-tools",
-        help="Comma-separated list of allowed tools",
+        None, "--allowed-tools", help="Comma-separated allowed tools"
    ),
-
-
-        "--max-concurrent",
-        help="Concurrency level for asyncio mode (ignored in parallel mode)",
+    disallowed_tools: str | None = typer.Option(
+        None, "--disallowed-tools", help="Comma-separated disallowed tools"
    ),
-
-
-        "--max-
-        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    # Eval settings
+    max_concurrent: int | None = typer.Option(
+        None, "--max-concurrent", help="Max concurrent tasks"
    ),
-
+    max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
+    very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
+    auto_respond: bool = typer.Option(
         False,
-        "--
-        help="
+        "--auto-respond",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
    ),
-
-
-
-
-
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
+    group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
+    task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+    remote: bool = typer.Option(
+        False, "--remote", help="Submit tasks to platform for remote execution"
    ),
-
+    byok: bool = typer.Option(
         False,
-        "--
-        help="
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
    ),
-
-        False,
-        "--very-verbose",
-        "-vv",
-        help="Enable debug-level logs for maximum visibility",
-    ),
-    vllm_base_url: str | None = typer.Option(
-        None,
-        "--vllm-base-url",
-        help="Base URL for vLLM server (when using --agent vllm)",
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Suppress opening browser for eval links"
    ),
-
-
-        "--group-size",
-        help="Number of times to run each task (similar to RL training)",
-    ),
-    integration_test: bool = typer.Option(
-        False,
-        "--integration-test",
-        help=(
-            "Run integration_test_tool tool, where problem is setup, "
-            "actions are applied, and evaluation is performed, without "
-            "spinning up an agent"
-        ),
+    gateway: bool = typer.Option(
+        False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
```
|
|
640
792
|
|
|
641
793
|
Examples:
|
|
642
|
-
|
|
643
|
-
hud eval hud-evals/SheetBench-50
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
hud eval
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
794
|
+
hud eval tasks.json claude
|
|
795
|
+
hud eval hud-evals/SheetBench-50 claude --full
|
|
796
|
+
hud eval tasks.json claude --config max_tokens=32768
|
|
797
|
+
hud eval tasks.json openai --config temperature=0.7
|
|
798
|
+
hud eval tasks.json claude --full --remote # Remote execution
|
|
799
|
+
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
|
|
800
|
+
"""
|
|
801
|
+
hud_console.info("🔧 Initializing evaluation...")
|
|
802
|
+
|
|
803
|
+
# Load config and merge CLI args
|
|
804
|
+
cfg = EvalConfig.load().merge_cli(
|
|
805
|
+
source=source,
|
|
806
|
+
agent=agent,
|
|
807
|
+
model=model,
|
|
808
|
+
all=all,
|
|
809
|
+
full=full,
|
|
810
|
+
max_concurrent=max_concurrent,
|
|
811
|
+
max_steps=max_steps,
|
|
812
|
+
allowed_tools=allowed_tools,
|
|
813
|
+
disallowed_tools=disallowed_tools,
|
|
814
|
+
task_ids=task_ids,
|
|
815
|
+
verbose=verbose,
|
|
816
|
+
very_verbose=very_verbose,
|
|
817
|
+
auto_respond=auto_respond,
|
|
818
|
+
group_size=group_size,
|
|
819
|
+
config=config,
|
|
820
|
+
remote=remote,
|
|
821
|
+
byok=byok,
|
|
822
|
+
quiet=quiet,
|
|
823
|
+
gateway=gateway,
|
|
824
|
+
)
|
|
665
825
|
|
|
666
|
-
|
|
667
|
-
|
|
826
|
+
# Find source if not provided
|
|
827
|
+
if cfg.source is None:
|
|
828
|
+
try:
|
|
829
|
+
from hud.cli.utils.tasks import find_tasks_file
|
|
668
830
|
|
|
669
|
-
|
|
670
|
-
|
|
831
|
+
cfg = cfg.model_copy(
|
|
832
|
+
update={"source": find_tasks_file(None, msg="Select a tasks file")}
|
|
833
|
+
)
|
|
834
|
+
hud_console.success(f"Selected: {cfg.source}")
|
|
835
|
+
except Exception:
|
|
836
|
+
hud_console.error("No source provided and no task files found")
|
|
837
|
+
raise typer.Exit(1) from None
|
|
671
838
|
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
"""
|
|
675
|
-
from hud.settings import settings
|
|
839
|
+
# Resolve agent interactively if needed
|
|
840
|
+
cfg = cfg.resolve_agent_interactive()
|
|
676
841
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
681
|
-
datefmt="%H:%M:%S",
|
|
682
|
-
)
|
|
842
|
+
# Configure logging
|
|
843
|
+
if cfg.very_verbose:
|
|
844
|
+
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(message)s")
|
|
683
845
|
logging.getLogger("hud.agents").setLevel(logging.DEBUG)
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
logging.
|
|
687
|
-
|
|
688
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
689
|
-
datefmt="%H:%M:%S",
|
|
690
|
-
)
|
|
846
|
+
# Suppress noisy HTTP client logs
|
|
847
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
848
|
+
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|
849
|
+
elif cfg.verbose:
|
|
691
850
|
logging.getLogger("hud.agents").setLevel(logging.INFO)
|
|
692
|
-
logging.getLogger("hud.agents.base").setLevel(logging.INFO)
|
|
693
851
|
|
|
694
|
-
#
|
|
695
|
-
|
|
696
|
-
agent = "integration_test"
|
|
852
|
+
# Validate API keys
|
|
853
|
+
cfg.validate_api_keys()
|
|
697
854
|
|
|
698
|
-
#
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
|
|
704
|
-
)
|
|
705
|
-
raise typer.Exit(1)
|
|
706
|
-
elif agent == "openai" and not settings.openai_api_key:
|
|
707
|
-
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
708
|
-
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
|
|
855
|
+
# Display and confirm
|
|
856
|
+
cfg.display()
|
|
857
|
+
|
|
858
|
+
if not yes and not questionary.confirm("Proceed?", default=True, qmark="").ask():
|
|
859
|
+
hud_console.info("Cancelled.")
|
|
709
860
|
raise typer.Exit(1)
|
|
710
|
-
elif agent == "vllm":
|
|
711
|
-
if model:
|
|
712
|
-
hud_console.info(f"Using vLLM with model: {model}")
|
|
713
|
-
else:
|
|
714
|
-
hud_console.error("Model name is required for vLLM agent, specify with --model")
|
|
715
|
-
raise typer.Exit(1)
|
|
716
861
|
|
|
717
|
-
#
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
862
|
+
# Run
|
|
863
|
+
start_time = time.time()
|
|
864
|
+
try:
|
|
865
|
+
results, tasks = asyncio.run(_run_evaluation(cfg))
|
|
866
|
+
except ValueError as e:
|
|
867
|
+
hud_console.error(str(e))
|
|
868
|
+
raise typer.Exit(1) from None
|
|
869
|
+
elapsed = time.time() - start_time
|
|
722
870
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
[t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
|
|
726
|
-
)
|
|
871
|
+
if cfg.remote:
|
|
872
|
+
return
|
|
727
873
|
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
# Run evaluation
|
|
733
|
-
if full:
|
|
734
|
-
asyncio.run(
|
|
735
|
-
run_full_dataset(
|
|
736
|
-
source,
|
|
737
|
-
agent_type=agent,
|
|
738
|
-
model=model,
|
|
739
|
-
allowed_tools=allowed_tools_list,
|
|
740
|
-
max_concurrent=max_concurrent,
|
|
741
|
-
max_steps=max_steps,
|
|
742
|
-
parallel=parallel,
|
|
743
|
-
max_workers=max_workers,
|
|
744
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
745
|
-
verbose=very_verbose or verbose,
|
|
746
|
-
vllm_base_url=vllm_base_url,
|
|
747
|
-
group_size=group_size,
|
|
748
|
-
)
|
|
749
|
-
)
|
|
750
|
-
else:
|
|
751
|
-
asyncio.run(
|
|
752
|
-
run_single_task(
|
|
753
|
-
source,
|
|
754
|
-
agent_type=agent,
|
|
755
|
-
model=model,
|
|
756
|
-
allowed_tools=allowed_tools_list,
|
|
757
|
-
max_steps=max_steps,
|
|
758
|
-
verbose=very_verbose or verbose,
|
|
759
|
-
vllm_base_url=vllm_base_url,
|
|
760
|
-
group_size=group_size,
|
|
761
|
-
)
|
|
762
|
-
)
|
|
874
|
+
from hud.datasets import display_results
|
|
875
|
+
|
|
876
|
+
display_results(results, tasks=tasks, elapsed=elapsed, show_details=len(results) <= 50)
|