hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py (CHANGED)
```diff
@@ -1,762 +1,876 @@
-"""HUD evaluation command for running tasks and datasets.
+"""HUD evaluation command for running tasks and datasets.
+
+Config Override Order: CLI arguments > .hud_eval.toml > defaults
+"""
 
 from __future__ import annotations
 
 import asyncio
 import logging
+import re
+import time
+import tomllib
+from dataclasses import dataclass
 from pathlib import Path
-from typing import TYPE_CHECKING, Any,
+from typing import TYPE_CHECKING, Any, ClassVar
 
+import questionary
 import typer
+from pydantic import BaseModel, Field, field_validator
+from rich import box
+from rich.table import Table
 
-import hud
-from hud.cli.utils.env_check import ensure_built, find_environment_dir
 from hud.settings import settings
-from hud.
+from hud.types import AgentType
+from hud.utils.env import resolve_env_vars
 from hud.utils.hud_console import HUDConsole
 
+# Pattern to detect AWS Bedrock inference profile ARNs
+_BEDROCK_ARN_PATTERN = re.compile(r"^arn:aws:bedrock:[a-z0-9-]+:\d+:inference-profile/.+$")
+
+
+def _is_bedrock_arn(model: str | None) -> bool:
+    """Check if a model string is a Bedrock inference profile ARN."""
+    return model is not None and bool(_BEDROCK_ARN_PATTERN.match(model))
+
+
 if TYPE_CHECKING:
-    from hud.
+    from hud.agents.base import MCPAgent
+
 logger = logging.getLogger(__name__)
 hud_console = HUDConsole()
 
+_CONFIG_PATH = ".hud_eval.toml"
 
-def get_available_models() -> list[dict[str, str | None]]:
-    """Fetch available models from the HUD API (only ready models).
-
-    Returns:
-        List of dicts with 'name', 'vllm_url', and 'base_model' keys
-    """
-    try:
-        from hud.cli.rl import rl_api
-
-        hud_console.info("Fetching your models from https://hud.so/models")
-        models = rl_api.list_models()
 
-
-
-
+@dataclass(frozen=True)
+class AgentPreset:
+    """A preset agent configuration combining agent type, model, and optional config."""
 
-
-
-
+    name: str
+    agent_type: AgentType
+    model: str | None = None
+    agent_config: dict[str, Any] | None = None
 
-        if ready_models:
-            hud_console.success(f"Found {len(ready_models)} ready models:")
-            for model in ready_models:
-                vllm_status = " (vLLM deployed)" if model.vllm_url else ""
-                hud_console.info(f" ✅ {model.name}{vllm_status}")
 
-
-
+# Built-in presets for the interactive picker
+_AGENT_PRESETS: list[AgentPreset] = [
+    # Native agents (use provider SDKs directly)
+    AgentPreset("Claude Sonnet 4.5", AgentType.CLAUDE, "claude-sonnet-4-5"),
+    AgentPreset("GPT-5", AgentType.OPENAI, "gpt-5"),
+    AgentPreset("Operator (OpenAI Computer Use)", AgentType.OPERATOR, "computer-use-preview"),
+    AgentPreset("Gemini 3 Pro Preview", AgentType.GEMINI, "gemini-3-pro-preview"),
+    AgentPreset(
+        "Gemini CUA (Gemini Computer Use)",
+        AgentType.GEMINI_CUA,
+        "gemini-2.5-computer-use-preview",
+    ),
+    # HUD Gateway presets (models via HUD Inference API)
+    AgentPreset(
+        "Grok 4-1 Fast (xAI)",
+        AgentType.OPENAI_COMPATIBLE,
+        "grok-4-1-fast",
+        {
+            "openai_compatible": {
+                "base_url": settings.hud_gateway_url,
+                "model_name": "Grok 4-1 Fast",
+            }
+        },
+    ),
+    AgentPreset(
+        "GLM-4.5V (Z-AI)",
+        AgentType.OPENAI_COMPATIBLE,
+        "z-ai/glm-4.5v",
+        {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.5V"}},
+    ),
+]
+
+_DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
+# Command-line arguments override these settings
+
+[eval]
+# source = "hud-evals/SheetBench-50"
+# agent = "claude"
+# all = false  # Run all problems instead of just 1
+# max_concurrent = 30
+# max_steps = 10
+# group_size = 1
+# byok = false  # Remote only; use encrypted env vars on the platform.
+# task_ids = ["task_1", "task_2"]
+# verbose = true
+# very_verbose = true
+# auto_respond = true
+# gateway = false  # Route LLM API calls through HUD Gateway
+
+[agent]
+# allowed_tools = ["computer", "playwright"]
+# disallowed_tools = []
+
+[claude]
+# model = "claude-sonnet-4-5"
+# max_tokens = 16384
+# use_computer_beta = true
+
+[openai]
+# model = "gpt-4o"
+# temperature = 0.7
+# max_output_tokens = 4096
+
+[gemini]
+# model = "gemini-2.5-pro"
+# temperature = 1.0
+# top_p = 0.95
+
+[gemini_cua]
+# model = "gemini-2.5-computer-use-preview"
+# temperature = 1.0
+# top_p = 0.95
+# excluded_predefined_functions = []
+
+[openai_compatible]
+# base_url = "http://localhost:8000/v1"
+# model = "my-model"
+"""
+
+# Agent type -> (settings attr, env var name)
+_API_KEY_REQUIREMENTS: dict[AgentType, tuple[str, str]] = {
+    AgentType.CLAUDE: ("anthropic_api_key", "ANTHROPIC_API_KEY"),
+    AgentType.GEMINI: ("gemini_api_key", "GEMINI_API_KEY"),
+    AgentType.GEMINI_CUA: ("gemini_api_key", "GEMINI_API_KEY"),
+    AgentType.OPENAI: ("openai_api_key", "OPENAI_API_KEY"),
+    AgentType.OPERATOR: ("openai_api_key", "OPENAI_API_KEY"),
+}
+
+
+class EvalConfig(BaseModel):
+    """Configuration for hud eval command."""
+
+    # Class-level registry
+    _agent_classes: ClassVar[dict[AgentType, type["MCPAgent"]]] = {}
+
+    # Fields loaded from [eval] section
+    _EVAL_FIELDS: ClassVar[set[str]] = {
+        "source",
+        "agent_type",
+        "task_ids",
+        "all",
+        "max_concurrent",
+        "max_steps",
+        "verbose",
+        "very_verbose",
+        "group_size",
+        "byok",
+        "remote",
+        "auto_respond",
+        "quiet",
+        "gateway",
+    }
+    # Fields loaded from [agent] section
+    _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
+
+    # Eval settings
+    source: str | None = None
+    agent_type: AgentType | None = None
+    model: str | None = None
+    task_ids: list[str] | None = None
+    all: bool = False  # Run all problems instead of just 1
+    max_concurrent: int = 30
+    max_steps: int = 10
+    verbose: bool = False
+    very_verbose: bool = False
+    auto_respond: bool | None = None  # Continue without prompting
+    group_size: int = 1
+    byok: bool = False
+    remote: bool = False
+    quiet: bool = False  # Suppress opening browser for eval links
+    gateway: bool = False  # Use HUD Gateway for LLM API calls
+
+    # Base agent config (these merge with task's agent_config)
+    allowed_tools: list[str] | None = None
+    disallowed_tools: list[str] | None = None
+
+    agent_config: dict[str, Any] = Field(default_factory=dict)
+
+    @field_validator("agent_type", mode="before")
+    @classmethod
+    def _parse_agent_type(cls, v: Any) -> AgentType | None:
+        """Convert string agent name to AgentType enum."""
+        if v is None:
+            return None
+        if isinstance(v, AgentType):
+            return v
+        if isinstance(v, str):
+            try:
+                return AgentType(v)
+            except ValueError:
+                valid = [e.value for e in AgentType]
+                raise ValueError(
+                    f"Invalid agent: {v}. Must be one of: {', '.join(valid)}"
+                ) from None
+        return v
+
+    def validate_api_keys(self) -> None:
+        """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
+        # BYOK requires remote execution (check before agent_type guard)
+        if self.byok and not self.remote:
+            hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
+            raise typer.Exit(1)
 
-
-
-
-
-
-
-        hud_console.
-
+        if self.agent_type is None:
+            return
+
+        if self.remote:
+            if not settings.api_key:
+                hud_console.error("HUD_API_KEY is required for remote execution")
+                hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
+                raise typer.Exit(1)
+            return
+
+        # Gateway mode only requires HUD_API_KEY
+        if self.gateway:
+            if not settings.api_key:
+                hud_console.error("HUD_API_KEY is required for gateway mode")
+                hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
+                raise typer.Exit(1)
+            return
+
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE:
+            # Check both CLI --model and config file model
+            config_model = self.agent_config.get("openai_compatible", {}).get("model")
+            if not self.model and not config_model:
+                hud_console.error(
+                    "Model name is required for OpenAI compatible agent. "
+                    "Use --model or set model in [openai_compatible] section of .hud_eval.toml"
                 )
-
-
-
-
-
-
-        return []
-
-
-def build_agent(
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
-    *,
-    model: str | None = None,
-    allowed_tools: list[str] | None = None,
-    verbose: bool = False,
-    vllm_base_url: str | None = None,
-) -> Any:
-    """Create and return the requested agent type."""
-
-    # Import agents lazily to avoid dependency issues
-    if agent_type == "integration_test":
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        return IntegrationTestRunner(verbose=verbose)
-    elif agent_type == "vllm":
-        # Create a generic OpenAI agent for vLLM server
-        try:
-            from openai import AsyncOpenAI
-
-            from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-        except ImportError as e:
-            hud_console.error(
-                "OpenAI dependencies are not installed. "
-                "Please install with: pip install 'hud-python[agent]'"
-            )
-            raise typer.Exit(1) from e
-
-        # Determine the base URL to use
-        if vllm_base_url is not None:
-            # Use the provided vLLM URL (for custom/local servers)
-            base_url = vllm_base_url
-            hud_console.info(f"Using vLLM server at {base_url}")
-            api_key = (
-                settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
+                raise typer.Exit(1)
+        elif self.agent_type == AgentType.CLAUDE and _is_bedrock_arn(self.model):
+            missing_aws = (
+                not settings.aws_access_key_id
+                or not settings.aws_secret_access_key
+                or not settings.aws_region
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            if missing_aws:
+                hud_console.error(
+                    "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
+                    "are required for AWS Bedrock"
+                )
+                raise typer.Exit(1)
+        elif self.agent_type in _API_KEY_REQUIREMENTS:
+            attr, env_var = _API_KEY_REQUIREMENTS[self.agent_type]
+            if not getattr(settings, attr, None):
+                hud_console.error(f"{env_var} is required for {self.agent_type.value} agent")
+                hud_console.info(f"Set it: hud set {env_var}=your-key-here")
+                raise typer.Exit(1)
+
+        if not settings.api_key:
+            hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
+
+    def get_agent_kwargs(self) -> dict[str, Any]:
+        """Build agent kwargs from config.
+
+        Model precedence:
+        1. CLI --model (highest priority)
+        2. [agent_type].model in TOML (per-agent config)
+        """
+        if self.agent_type is None:
+            raise ValueError("agent_type must be set before calling get_agent_kwargs()")
+
+        kwargs: dict[str, Any] = {}
+
+        if self.allowed_tools:
+            kwargs["allowed_tools"] = self.allowed_tools
+        if self.disallowed_tools:
+            kwargs["disallowed_tools"] = self.disallowed_tools
+
+        # Apply agent-specific config
+        agent_key = self.agent_type.value
+        if agent_key in self.agent_config:
+            agent_cfg = dict(self.agent_config[agent_key])
+            kwargs.update(agent_cfg)
+
+        # CLI --model always wins
+        if self.model:
+            kwargs["model"] = self.model
+
+        # For gateway base_url, inject HUD API key if not already set
+        if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
+            base_url = kwargs.get("base_url", "")
+            if settings.hud_gateway_url in base_url and settings.api_key:
+                kwargs["api_key"] = settings.api_key
+
+        # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
+        # Check both model and checkpoint_name for ARN patterns
+        bedrock_arn_detected = _is_bedrock_arn(kwargs.get("model")) or _is_bedrock_arn(
+            kwargs.get("checkpoint_name")
         )
-
-
-
-
-
-        hud_console.error(
-            "OpenAI agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+        if self.agent_type == AgentType.CLAUDE and bedrock_arn_detected:
+            missing_aws = (
+                not settings.aws_access_key_id
+                or not settings.aws_secret_access_key
+                or not settings.aws_region
             )
-
+            if missing_aws:
+                hud_console.error(
+                    "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
+                    "are required for AWS Bedrock"
+                )
+                raise typer.Exit(1)
 
-
-        return OperatorAgent(
-            allowed_tools=allowed_tools,
-            verbose=verbose,
-        )
-    else:
-        return OperatorAgent(verbose=verbose)
+            from anthropic import AsyncAnthropicBedrock
 
-
-
-
-
-        hud_console.error(
-            "LiteLLM agent dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
+            kwargs["model_client"] = AsyncAnthropicBedrock(
+                aws_access_key=settings.aws_access_key_id,
+                aws_secret_key=settings.aws_secret_access_key,
+                aws_region=settings.aws_region or "us-east-1",
             )
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            verbose=verbose,
-        )
+            hud_console.info("🔧 Using AWS Bedrock (detected ARN in model)")
+
+        kwargs["verbose"] = self.verbose or self.very_verbose
+
+        if self.agent_type in (
+            AgentType.CLAUDE,
+            AgentType.OPENAI,
+            AgentType.OPERATOR,
+            AgentType.GEMINI,
+            AgentType.GEMINI_CUA,
+        ):
+            kwargs["validate_api_key"] = False
+
+        # Configure gateway mode - route LLM API calls through HUD gateway
+        if self.gateway:
+            hud_api_key = settings.api_key
+            if not hud_api_key:
+                raise typer.Exit(1)  # Already validated in validate_api_keys()
+
+            if self.agent_type == AgentType.CLAUDE:
+                from anthropic import AsyncAnthropic
+
+                kwargs["model_client"] = AsyncAnthropic(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for Claude API")
+            elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
+                from openai import AsyncOpenAI
 
+                kwargs["model_client"] = AsyncOpenAI(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for OpenAI API")
+            elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
+                from openai import AsyncOpenAI
 
-
-
-
-
-
-
-
-
-
-
-
-
+                kwargs["openai_client"] = AsyncOpenAI(
+                    api_key=hud_api_key,
+                    base_url=settings.hud_gateway_url,
+                )
+                hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
+            elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
+                from google import genai
+                from google.genai.types import HttpOptions
+
+                kwargs["model_client"] = genai.Client(
+                    api_key="PLACEHOLDER",
+                    http_options=HttpOptions(
+                        api_version="v1beta",
+                        base_url=settings.hud_gateway_url,
+                        headers={"Authorization": f"Bearer {hud_api_key}"},
+                    ),
+                )
+                hud_console.info("🌐 Using HUD Gateway for Gemini API")
 
-
-    try:
-        from hud.utils.tasks import load_tasks
-    except ImportError as e:
-        hud_console.error(
-            "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
+        return kwargs
 
-
-
-
-
+    @classmethod
+    def load(cls, path: str = _CONFIG_PATH) -> EvalConfig:
+        """Load config from TOML file."""
+        p = Path(path)
+        if not p.exists():
+            p.write_text(_DEFAULT_CONFIG_TEMPLATE)
+            hud_console.info(f"Generated {_CONFIG_PATH}")
+            return cls()
 
-    # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
         try:
-
-
-            # Non-interactive for eval; warn but don't block
-            ensure_built(env_dir, interactive=False)
+            with open(p, "rb") as f:
+                toml_data = tomllib.load(f)
         except Exception as e:
-            hud_console.
+            hud_console.warning(f"Failed to parse {path}: {e}")
+            return cls()
 
-
-    task = tasks[0]
-    hud_console.info("Found 1 task, running as single task…")
+        toml_data = resolve_env_vars(toml_data)
 
-
-
-
-    tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
+        # Extract sections
+        eval_section = toml_data.get("eval", {})
+        agent_section = toml_data.get("agent", {})
 
-
-
-    raise typer.Exit(1)
+        # Build config data
+        data: dict[str, Any] = {}
 
-    #
-
-
-
-
+        # Eval settings (map 'agent' -> 'agent_type')
+        if "agent" in eval_section:
+            data["agent_type"] = eval_section["agent"]
+        for key in cls._EVAL_FIELDS:
+            if key in eval_section:
+                data[key] = eval_section[key]
 
-
-
-
-
-    if agent_type == "integration_test":
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        agent_class = IntegrationTestRunner
-        agent_config = {"verbose": verbose}
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "vllm":
-        # Special handling for vLLM
-        sample_agent = build_agent(
-            agent_type,
-            model=model,
-            allowed_tools=allowed_tools,
-            verbose=verbose,
-            vllm_base_url=vllm_base_url,
-        )
-        agent_config = {
-            "openai_client": sample_agent.oai,
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-
-        from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
-
-        agent_class = GenericOpenAIChatAgent
-    elif agent_type == "openai":
-        from hud.agents import OperatorAgent
-
-        agent_class = OperatorAgent
-        agent_config = {"verbose": verbose}
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "litellm":
-        from hud.agents.lite_llm import LiteAgent
-
-        agent_class = LiteAgent
-        agent_config = {
-            "model_name": model or "gpt-4o-mini",
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "claude":
-        from hud.agents import ClaudeAgent
-
-        agent_class = ClaudeAgent
-        agent_config = {
-            "model": model or "claude-sonnet-4-20250514",
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    else:
-        raise ValueError(f"Invalid agent type: {agent_type}")
-
-    if group_size > 1:
-        hud_console.info(f"🔄 Running task with group_size={group_size}")
-        # Run with grouping
-        stats = await run_tasks_grouped(
-            tasks=[task],
-            agent_class=agent_class,
-            agent_config=agent_config,
-            group_size=group_size,
-            max_parallel_episodes=48,  # Same as RL default
-            max_steps=max_steps,
-            verbose=verbose,
-        )
-        display_group_statistics(stats, show_details=True)
-    else:
-        # Original single-run logic
-        with hud.trace(name=task_prompt):
-            agent = build_agent(
-                agent_type,
-                model=model,
-                allowed_tools=allowed_tools,
-                verbose=verbose,
-                vllm_base_url=vllm_base_url,
-            )
-            hud_console.info(task.prompt)
-            result = await agent.run(task, max_steps=max_steps)
-            hud_console.success(f"Reward: {result.reward}")
-
-
-async def run_full_dataset(
-    source: str,
-    *,
-    agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
-    model: str | None = None,
-    allowed_tools: list[str] | None = None,
-    max_concurrent: int = 30,
-    max_steps: int = 10,
-    parallel: bool = False,
-    max_workers: int | None = None,
-    max_concurrent_per_worker: int = 25,
-    verbose: bool = False,
-    vllm_base_url: str | None = None,
-    group_size: int = 1,
-) -> list[Any]:
-    """Run evaluation across the entire dataset.
-
-    Uses either asyncio-based run_dataset or process-based parallel execution
-    depending on the parallel flag."""
-
-    # Import run_dataset lazily
-    try:
-        from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
-        from hud.utils.tasks import load_tasks
-    except ImportError as e:
-        hud_console.error(
-            "Dataset dependencies are not installed. "
-            "Please install with: pip install 'hud-python[agent]'"
-        )
-        raise typer.Exit(1) from e
-
-    # Load tasks using unified loader
-    hud_console.info(f"📊 Loading tasks from: {source}…")
-    tasks: list[Task] = load_tasks(source)  # type: ignore[assignment]
-
-    if not tasks:
-        hud_console.error(f"No tasks found in: {source}")
-        raise typer.Exit(1)
+        # Agent base config
+        for key in cls._AGENT_FIELDS:
+            if key in agent_section:
+                data[key] = agent_section[key]
 
-
-
+        # Agent-specific configs (claude, openai, gemini, etc.)
+        agent_config: dict[str, Any] = {}
+        for agent_type in AgentType:
+            if agent_type.value in toml_data:
+                agent_config[agent_type.value] = toml_data[agent_type.value]
+        data["agent_config"] = agent_config
 
-    # Determine dataset name
-    path = Path(source)
-    dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
-
-    # Build agent class + config for run_dataset
-    if agent_type == "integration_test":  # --integration-test mode
-        from hud.agents.misc.integration_test_agent import IntegrationTestRunner
-
-        agent_class = IntegrationTestRunner
-        agent_config = {"verbose": verbose}
-    elif agent_type == "vllm":
         try:
-
-
-
-
-
-
-
+            return cls.model_validate(data)
+        except Exception as e:
+            hud_console.warning(f"Invalid config: {e}")
+            return cls()
+
+    def merge_cli(
+        self,
+        agent: str | None = None,
+        config: list[str] | None = None,
+        allowed_tools: str | None = None,
+        disallowed_tools: str | None = None,
+        task_ids: str | None = None,
+        **cli_args: Any,
+    ) -> EvalConfig:
+        """Merge CLI args (non-None values override config)."""
+        overrides: dict[str, Any] = {}
+
+        if agent is not None:
+            overrides["agent_type"] = agent
+
+        # Parse comma-separated lists
+        if allowed_tools is not None:
+            overrides["allowed_tools"] = [t.strip() for t in allowed_tools.split(",") if t.strip()]
+        if disallowed_tools is not None:
+            overrides["disallowed_tools"] = [
+                t.strip() for t in disallowed_tools.split(",") if t.strip()
+            ]
+        if task_ids is not None:
+            overrides["task_ids"] = [t.strip() for t in task_ids.split(",") if t.strip()]
+
+        overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
+
+        for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
+            if cli_args.get(k) is True:
+                overrides[k] = True
+            elif k in overrides and cli_args.get(k) is False:
+                del overrides[k]
+
+        # --full is a shortcut for --all --auto-respond --max-steps 100
+        if overrides.get("full"):
+            overrides["all"] = True
+            if "auto_respond" not in overrides:
+                overrides["auto_respond"] = True
+            if "max_steps" not in overrides:
+                overrides["max_steps"] = 100
+
+        if config:
+            merged_agent_config = dict(self.agent_config)
+            for item in config:
+                if "=" in item:
+                    key, value = item.split("=", 1)
+                    key = key.strip()
+                    value = value.strip()
+
+                    # Parse value
+                    if value.lower() == "true":
+                        parsed_value: Any = True
+                    elif value.lower() == "false":
+                        parsed_value = False
+                    else:
+                        try:
+                            parsed_value = int(value)
+                        except ValueError:
+                            try:
+                                parsed_value = float(value)
+                            except ValueError:
+                                parsed_value = value
+
+                    # Handle namespaced keys (e.g., claude.max_tokens)
+                    if "." in key:
+                        agent_name, param = key.split(".", 1)
+                        if agent_name not in merged_agent_config:
+                            merged_agent_config[agent_name] = {}
+                        merged_agent_config[agent_name][param] = parsed_value
+                    else:
+                        # Non-namespaced: apply to current agent if set
+                        if self.agent_type:
+                            agent_name = self.agent_type.value
+                            if agent_name not in merged_agent_config:
+                                merged_agent_config[agent_name] = {}
+                            merged_agent_config[agent_name][key] = parsed_value
+
+            overrides["agent_config"] = merged_agent_config
+
+        return self.model_validate({**self.model_dump(), **overrides})
+
+    def resolve_agent_interactive(self) -> EvalConfig:
+        """Prompt user to select an agent preset if not set. Returns updated config."""
+        if self.agent_type is not None:
+            return self
+
+        # Build choices from presets
+        choices: list[dict[str, Any]] = [
+            {"name": preset.name, "value": preset} for preset in _AGENT_PRESETS
+        ]
+
+        selected: AgentPreset = hud_console.select("Select an agent:", choices=choices, default=0)  # type: ignore[arg-type]
+
+        # Merge preset into config
+        updates: dict[str, Any] = {"agent_type": selected.agent_type}
+        if selected.model:
+            updates["model"] = selected.model
+        if selected.agent_config:
+            # Merge preset's agent_config with existing
+            merged = dict(self.agent_config)
+            for key, value in selected.agent_config.items():
+                if key in merged:
+                    merged[key] = {**merged[key], **value}
+                else:
+                    merged[key] = value
+            updates["agent_config"] = merged
+
+        return self.model_validate({**self.model_dump(), **updates})
+
+    def display(self) -> None:
+        """Display settings in a table."""
+        table = Table(title="Evaluation Settings", title_style="bold cyan", box=box.ROUNDED)
+        table.add_column("Setting", style="yellow")
+        table.add_column("Value", style="green")
+
+        # Core settings
+        table.add_row("source", str(self.source or "—"))
+        table.add_row("agent", self.agent_type.value)  # type: ignore[union-attr]
+        if self.task_ids:
+            table.add_row(
+                "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
             )
-
-
-
-
-
-
-
-
-
-
+        table.add_row("all", str(self.all))
+        table.add_row("max_steps", str(self.max_steps))
+        if not self.remote:
+            table.add_row("max_concurrent", str(self.max_concurrent))
+        if self.group_size > 1:
+            table.add_row("group_size", str(self.group_size))
+        if self.auto_respond:
+            table.add_row("auto_respond", "[bold green]True[/bold green]")
+        if self.very_verbose:
+            table.add_row("very_verbose", "[bold green]True[/bold green]")
+        elif self.verbose:
+            table.add_row("verbose", "[bold green]True[/bold green]")
+        if self.remote:
+            table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
+        if self.gateway:
+            table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
+        if self.byok:
+            table.add_row("byok", "[bold green]True[/bold green] (remote only)")
+
+        # Tool filters (only if set)
+        if self.allowed_tools:
+            table.add_row("allowed_tools", ", ".join(self.allowed_tools))
+        if self.disallowed_tools:
+            table.add_row("disallowed_tools", ", ".join(self.disallowed_tools))
+
+        # Agent config section
+        if self.agent_type:
+            table.add_row("", "")
+            table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
+
+            config_cls = self.agent_type.cls.config_cls
+            defaults = config_cls()
+            overrides = self.agent_config.get(self.agent_type.value, {})
+            skip = {
+                "model_client",
+                "model_name",
+                "validate_api_key",
+                "model_config",
+                "allowed_tools",
+                "disallowed_tools",
+                "system_prompt",
+                "response_tool_name",
+                "append_setup_output",
+                "initial_screenshot",
+            }
+
+            sensitive_fields = {"api_key", "api_secret", "token", "password", "secret"}
+
+            for name in config_cls.model_fields:
+                if name in skip:
+                    continue
+                # Always show model
+                if name == "model":
+                    if self.model:
+                        value = self.model
+                    elif overrides.get("model"):
+                        value = overrides["model"]
+                    else:
+                        value = getattr(defaults, "model", None)
+                    table.add_row(" model", str(value) if value else "—")
+                elif name in overrides:
+                    value = overrides[name]
+                    if name in sensitive_fields and value:
+                        display_value = f"{str(value)[:4]}****" if len(str(value)) > 4 else "****"
+                    else:
+                        display_value = str(value)
+                    table.add_row(f" {name}", display_value)
+
+        hud_console.console.print(table)
+
+
+# =============================================================================
+# Evaluation runner
+# =============================================================================
+
+
+async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
+    """Run evaluation with the given config using run_dataset()."""
+    from hud.datasets import load_tasks, run_dataset
+
+    if cfg.source is None or cfg.agent_type is None:
+        raise ValueError("source and agent_type must be set")
+
+    # Load tasks using unified loader (handles v4→v5 conversion automatically)
+    hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
+    tasks = load_tasks(cfg.source)
 
-
-
-
-            "model_name": sample_agent.model_name,
-            "verbose": verbose,
-            "completion_kwargs": sample_agent.completion_kwargs,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-    elif agent_type == "openai":
-        try:
-            from hud.agents import OperatorAgent
+    if not tasks:
+        hud_console.error(f"No tasks found in: {cfg.source}")
+        raise typer.Exit(1)
 
-
-
-
-
-
-
-
+    # Filter by task IDs if provided
+    if cfg.task_ids:
+        id_set = set(cfg.task_ids)
+        # Match by task.id or index
+        filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
+        if not filtered:
+            hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
+            raise typer.Exit(1)
+        hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
+        tasks = filtered
+    elif not cfg.all:
+        # Single task mode (no --all, --full, or --task-ids)
+        tasks = [tasks[0]]
+        hud_console.info("Using first task (run with --full or --task-ids for more)…")
+
+    hud_console.info(f"Loaded {len(tasks)} task(s)")
+
+    # Prepare agent kwargs
+    agent_kwargs = cfg.get_agent_kwargs()
+    auto_respond = cfg.auto_respond
+    if auto_respond:
+        agent_kwargs = {**agent_kwargs, "auto_respond": True}
+
+    max_steps = cfg.max_steps
+
+    # Remote execution - submit to HUD platform
+    if cfg.remote:
+        agent_kwargs = {
+            k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
+        }
+        # Create a job ID for tracking
+        import uuid
 
-
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
+        from hud.datasets.utils import submit_rollouts
 
-
-
-
+        job_id = str(uuid.uuid4())
+        hud_console.info(
+            f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
+        )
 
-
-
-
-
-
-
-
+        await submit_rollouts(
+            tasks=tasks,
+            job_id=job_id,
+            agent_type=cfg.agent_type,
+            agent_params=agent_kwargs,
+            max_steps=max_steps,
+            group_size=cfg.group_size,
+            use_byok=cfg.byok,
+        )
 
-
-
-            "verbose": verbose,
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
+        hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
+        return [], tasks
 
+    # Single task mode - show extra info
+    if len(tasks) == 1 and cfg.group_size == 1:
+        logging.getLogger("hud.agents").setLevel(logging.INFO)
+        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
+        # Get prompt from args (v4 tasks) or show scenario name
+        prompt = tasks[0].args.get("prompt") if tasks[0].args else tasks[0].scenario
+        if prompt:
+            hud_console.info(f"Prompt: {prompt}")
     else:
-
-
+        hud_console.info(
+            f"🚀 Running evaluation (max_concurrent: {cfg.max_concurrent}, "
+            f"group_size: {cfg.group_size})…"
+        )
 
-
-
-
-
-
-
-
+    # Run using run_dataset
+    results = await run_dataset(
+        tasks,
+        cfg.agent_type,
+        agent_params=agent_kwargs,
+        max_steps=max_steps,
+        max_concurrent=cfg.max_concurrent,
+        group_size=cfg.group_size,
+        quiet=cfg.quiet,
+    )
 
-
-
-
-        }
-        if allowed_tools:
-            agent_config["allowed_tools"] = allowed_tools
-
-    # Use grouped evaluation if group_size > 1
-    if group_size > 1:
-        hud_console.info(f"🔄 Running dataset with group_size={group_size}")
-
-        # Run with job tracking
-        with hud.job(
-            name=f"Evaluation {dataset_name} (group_size={group_size})",
-            metadata={
-                "dataset": source,
-                "group_size": group_size,
-                "tasks": len(dataset_or_tasks),
-                "total_episodes": len(dataset_or_tasks) * group_size,
-            },
-        ) as job:
-            # Convert dicts to Task objects if needed
-            from hud.datasets import Task
-
-            tasks = []
-            for item in dataset_or_tasks:
-                if isinstance(item, dict):
-                    tasks.append(Task(**item))
-                else:
-                    tasks.append(item)
-
-            stats = await run_tasks_grouped(
-                tasks=tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                group_size=group_size,
-                max_parallel_episodes=max_concurrent
-                if not parallel
-                else max_concurrent_per_worker * (max_workers or 4),
-                max_steps=max_steps,
-                verbose=verbose,
-                job_id=job.id,
-            )
+    # Show reward for single task
+    if len(tasks) == 1 and cfg.group_size == 1 and results:
+        hud_console.success(f"Reward: {results[0].reward}")
 
-
-            display_group_statistics(stats, show_details=len(stats) <= 50)
+    return results, tasks
 
-        # Return stats for consistency with other modes
-        return stats
 
-
-
-
-            f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…"  # noqa: E501
-        )
-        if max_workers is None:
-            # Use auto-optimization (now the default run_dataset_parallel)
-            return await run_dataset_parallel(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-        else:
-            # Use manual configuration
-            return await run_dataset_parallel_manual(
-                name=f"Evaluation {dataset_name}",
-                dataset=dataset_or_tasks,
-                agent_class=agent_class,
-                agent_config=agent_config,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
-                max_concurrent=max_concurrent,
-                metadata={"dataset": source, "parallel": True},
-                max_steps=max_steps,
-                auto_respond=True,
-            )
-    else:
-        hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
-        return await run_dataset(
-            name=f"Evaluation {dataset_name}",
-            dataset=dataset_or_tasks,
-            agent_class=agent_class,
-            agent_config=agent_config,
-            max_concurrent=max_concurrent,
-            metadata={"dataset": source},
-            max_steps=max_steps,
-        )
+# =============================================================================
+# CLI command
+# =============================================================================
 
 
 def eval_command(
-    source: str = typer.Argument(
-
-
+    source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
+    agent: str | None = typer.Argument(
+        None,
+        help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
     full: bool = typer.Option(
         False,
         "--full",
-        help="Run the entire dataset
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
     ),
-
-
-        "--
-        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
-    ),
-    model: str | None = typer.Option(
-        None,
-        "--model",
-        help="Model name for the chosen agent",
+    model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
+    config: list[str] | None = typer.Option(  # noqa: B008
+        None, "--config", "-c", help="Agent config: key=value"
    ),
+    # Task-overridable settings
     allowed_tools: str | None = typer.Option(
-        None,
-        "--allowed-tools",
-        help="Comma-separated list of allowed tools",
+        None, "--allowed-tools", help="Comma-separated allowed tools"
    ),
-
-
-        "--max-concurrent",
-        help="Concurrency level for asyncio mode (ignored in parallel mode)",
+    disallowed_tools: str | None = typer.Option(
+        None, "--disallowed-tools", help="Comma-separated disallowed tools"
    ),
-
-
-        "--max-
-        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    # Eval settings
+    max_concurrent: int | None = typer.Option(
+        None, "--max-concurrent", help="Max concurrent tasks"
    ),
-
+    max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
+    very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
+    auto_respond: bool = typer.Option(
         False,
-        "--
-        help="
+        "--auto-respond",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
    ),
-
-
-
-
-
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
+    group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
+    task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+    remote: bool = typer.Option(
+        False, "--remote", help="Submit tasks to platform for remote execution"
    ),
-
+    byok: bool = typer.Option(
         False,
-        "--
-        help="
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
    ),
-
-        False,
-        "--very-verbose",
-        "-vv",
-        help="Enable debug-level logs for maximum visibility",
-    ),
-    vllm_base_url: str | None = typer.Option(
-        None,
-        "--vllm-base-url",
-        help="Base URL for vLLM server (when using --agent vllm)",
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Suppress opening browser for eval links"
    ),
-
-
-        "--group-size",
-        help="Number of times to run each task (similar to RL training)",
-    ),
-    integration_test: bool = typer.Option(
-        False,
-        "--integration-test",
-        help=(
-            "Run integration_test_tool tool, where problem is setup, "
-            "actions are applied, and evaluation is performed, without "
-            "spinning up an agent"
-        ),
+    gateway: bool = typer.Option(
+        False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
    ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
```
|
|
640
792
|
|
|
641
793
|
Examples:
|
|
642
|
-
|
|
643
|
-
hud eval hud-evals/SheetBench-50
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
hud eval
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
794
|
+
hud eval tasks.json claude
|
|
795
|
+
hud eval hud-evals/SheetBench-50 claude --full
|
|
796
|
+
hud eval tasks.json claude --config max_tokens=32768
|
|
797
|
+
hud eval tasks.json openai --config temperature=0.7
|
|
798
|
+
hud eval tasks.json claude --full --remote # Remote execution
|
|
799
|
+
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
|
|
800
|
+
"""
|
|
801
|
+
hud_console.info("🔧 Initializing evaluation...")
|
|
802
|
+
|
|
803
|
+
# Load config and merge CLI args
|
|
804
|
+
cfg = EvalConfig.load().merge_cli(
|
|
805
|
+
source=source,
|
|
806
|
+
agent=agent,
|
|
807
|
+
model=model,
|
|
808
|
+
all=all,
|
|
809
|
+
full=full,
|
|
810
|
+
max_concurrent=max_concurrent,
|
|
811
|
+
max_steps=max_steps,
|
|
812
|
+
allowed_tools=allowed_tools,
|
|
813
|
+
disallowed_tools=disallowed_tools,
|
|
814
|
+
task_ids=task_ids,
|
|
815
|
+
verbose=verbose,
|
|
816
|
+
very_verbose=very_verbose,
|
|
817
|
+
auto_respond=auto_respond,
|
|
818
|
+
group_size=group_size,
|
|
819
|
+
config=config,
|
|
820
|
+
remote=remote,
|
|
821
|
+
byok=byok,
|
|
822
|
+
quiet=quiet,
|
|
823
|
+
gateway=gateway,
|
|
824
|
+
)
|
|
665
825
|
|
|
666
|
-
|
|
667
|
-
|
|
826
|
+
# Find source if not provided
|
|
827
|
+
if cfg.source is None:
|
|
828
|
+
try:
|
|
829
|
+
from hud.cli.utils.tasks import find_tasks_file
|
|
668
830
|
|
|
669
|
-
|
|
670
|
-
|
|
831
|
+
cfg = cfg.model_copy(
|
|
832
|
+
update={"source": find_tasks_file(None, msg="Select a tasks file")}
|
|
833
|
+
)
|
|
834
|
+
hud_console.success(f"Selected: {cfg.source}")
|
|
835
|
+
except Exception:
|
|
836
|
+
hud_console.error("No source provided and no task files found")
|
|
837
|
+
raise typer.Exit(1) from None
|
|
671
838
|
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
"""
|
|
675
|
-
from hud.settings import settings
|
|
839
|
+
# Resolve agent interactively if needed
|
|
840
|
+
cfg = cfg.resolve_agent_interactive()
|
|
676
841
|
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
681
|
-
datefmt="%H:%M:%S",
|
|
682
|
-
)
|
|
842
|
+
# Configure logging
|
|
843
|
+
if cfg.very_verbose:
|
|
844
|
+
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(message)s")
|
|
683
845
|
logging.getLogger("hud.agents").setLevel(logging.DEBUG)
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
logging.
|
|
687
|
-
|
|
688
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
689
|
-
datefmt="%H:%M:%S",
|
|
690
|
-
)
|
|
846
|
+
# Suppress noisy HTTP client logs
|
|
847
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
848
|
+
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|
849
|
+
elif cfg.verbose:
|
|
691
850
|
logging.getLogger("hud.agents").setLevel(logging.INFO)
|
|
692
|
-
logging.getLogger("hud.agents.base").setLevel(logging.INFO)
|
|
693
851
|
|
|
694
|
-
#
|
|
695
|
-
|
|
696
|
-
agent = "integration_test"
|
|
852
|
+
# Validate API keys
|
|
853
|
+
cfg.validate_api_keys()
|
|
697
854
|
|
|
698
|
-
#
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
|
|
704
|
-
)
|
|
705
|
-
raise typer.Exit(1)
|
|
706
|
-
elif agent == "openai" and not settings.openai_api_key:
|
|
707
|
-
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
708
|
-
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
|
|
855
|
+
# Display and confirm
|
|
856
|
+
cfg.display()
|
|
857
|
+
|
|
858
|
+
if not yes and not questionary.confirm("Proceed?", default=True, qmark="").ask():
|
|
859
|
+
hud_console.info("Cancelled.")
|
|
709
860
|
raise typer.Exit(1)
|
|
710
|
-
elif agent == "vllm":
|
|
711
|
-
if model:
|
|
712
|
-
hud_console.info(f"Using vLLM with model: {model}")
|
|
713
|
-
else:
|
|
714
|
-
hud_console.error("Model name is required for vLLM agent, specify with --model")
|
|
715
|
-
raise typer.Exit(1)
|
|
716
861
|
|
|
717
|
-
#
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
862
|
+
# Run
|
|
863
|
+
start_time = time.time()
|
|
864
|
+
try:
|
|
865
|
+
results, tasks = asyncio.run(_run_evaluation(cfg))
|
|
866
|
+
except ValueError as e:
|
|
867
|
+
hud_console.error(str(e))
|
|
868
|
+
raise typer.Exit(1) from None
|
|
869
|
+
elapsed = time.time() - start_time
|
|
722
870
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
[t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
|
|
726
|
-
)
|
|
871
|
+
if cfg.remote:
|
|
872
|
+
return
|
|
727
873
|
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
# Run evaluation
|
|
733
|
-
if full:
|
|
734
|
-
asyncio.run(
|
|
735
|
-
run_full_dataset(
|
|
736
|
-
source,
|
|
737
|
-
agent_type=agent,
|
|
738
|
-
model=model,
|
|
739
|
-
allowed_tools=allowed_tools_list,
|
|
740
|
-
max_concurrent=max_concurrent,
|
|
741
|
-
max_steps=max_steps,
|
|
742
|
-
parallel=parallel,
|
|
743
|
-
max_workers=max_workers,
|
|
744
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
745
|
-
verbose=very_verbose or verbose,
|
|
746
|
-
vllm_base_url=vllm_base_url,
|
|
747
|
-
group_size=group_size,
|
|
748
|
-
)
|
|
749
|
-
)
|
|
750
|
-
else:
|
|
751
|
-
asyncio.run(
|
|
752
|
-
run_single_task(
|
|
753
|
-
source,
|
|
754
|
-
agent_type=agent,
|
|
755
|
-
model=model,
|
|
756
|
-
allowed_tools=allowed_tools_list,
|
|
757
|
-
max_steps=max_steps,
|
|
758
|
-
verbose=very_verbose or verbose,
|
|
759
|
-
vllm_base_url=vllm_base_url,
|
|
760
|
-
group_size=group_size,
|
|
761
|
-
)
|
|
762
|
-
)
|
|
874
|
+
from hud.datasets import display_results
|
|
875
|
+
|
|
876
|
+
display_results(results, tasks=tasks, elapsed=elapsed, show_details=len(results) <= 50)
|