hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +70 -5
- hud/agents/base.py +238 -500
- hud/agents/claude.py +236 -247
- hud/agents/gateway.py +42 -0
- hud/agents/gemini.py +264 -0
- hud/agents/gemini_cua.py +324 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +48 -36
- hud/agents/openai.py +282 -296
- hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
- hud/agents/operator.py +199 -0
- hud/agents/resolver.py +70 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +381 -214
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +377 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_resolver.py +192 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/agents/types.py +148 -0
- hud/cli/__init__.py +493 -546
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +699 -113
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +889 -732
- hud/cli/eval.py +793 -667
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/pull.py +1 -1
- hud/cli/push.py +38 -13
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +110 -8
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push.py +1 -1
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +70 -1
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +45 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +326 -0
- hud/datasets/runner.py +198 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +52 -0
- hud/environment/connection.py +258 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +137 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +835 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +263 -0
- hud/environment/scenarios.py +620 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +205 -0
- hud/environment/tests/test_environment.py +593 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +242 -0
- hud/environment/tests/test_scenarios.py +1086 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +727 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +187 -0
- hud/eval/manager.py +533 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +372 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +291 -0
- hud/eval/types.py +65 -0
- hud/eval/utils.py +194 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +308 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +165 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +18 -2
- hud/tools/agent.py +223 -0
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +36 -3
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_agent_tool.py +355 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +194 -56
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +89 -18
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.13.dist-info/METADATA +264 -0
- hud_python-0.5.13.dist-info/RECORD +305 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py
CHANGED
|
@@ -1,762 +1,888 @@
|
|
|
1
|
-
"""HUD evaluation command for running tasks and datasets.
|
|
1
|
+
"""HUD evaluation command for running tasks and datasets.
|
|
2
|
+
|
|
3
|
+
Config Override Order: CLI arguments > .hud_eval.toml > defaults
|
|
4
|
+
"""
|
|
2
5
|
|
|
3
6
|
from __future__ import annotations
|
|
4
7
|
|
|
5
8
|
import asyncio
|
|
6
9
|
import logging
|
|
10
|
+
import re
|
|
11
|
+
import time
|
|
12
|
+
import tomllib
|
|
13
|
+
from dataclasses import dataclass
|
|
7
14
|
from pathlib import Path
|
|
8
|
-
from typing import TYPE_CHECKING, Any,
|
|
15
|
+
from typing import TYPE_CHECKING, Any, ClassVar
|
|
9
16
|
|
|
17
|
+
import questionary
|
|
10
18
|
import typer
|
|
19
|
+
from pydantic import BaseModel, Field, field_validator
|
|
20
|
+
from rich import box
|
|
21
|
+
from rich.table import Table
|
|
11
22
|
|
|
12
|
-
import hud
|
|
13
|
-
from hud.cli.utils.env_check import ensure_built, find_environment_dir
|
|
14
23
|
from hud.settings import settings
|
|
15
|
-
from hud.
|
|
24
|
+
from hud.types import AgentType
|
|
25
|
+
from hud.utils.env import resolve_env_vars
|
|
16
26
|
from hud.utils.hud_console import HUDConsole
|
|
17
27
|
|
|
28
|
+
# Pattern to detect AWS Bedrock inference profile ARNs
|
|
29
|
+
_BEDROCK_ARN_PATTERN = re.compile(r"^arn:aws:bedrock:[a-z0-9-]+:\d+:inference-profile/.+$")
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
def _is_bedrock_arn(model: str | None) -> bool:
|
|
33
|
+
"""Check if a model string is a Bedrock inference profile ARN."""
|
|
34
|
+
return model is not None and bool(_BEDROCK_ARN_PATTERN.match(model))
|
|
35
|
+
|
|
36
|
+
|
|
18
37
|
if TYPE_CHECKING:
|
|
19
|
-
from hud.
|
|
38
|
+
from hud.agents.base import MCPAgent
|
|
39
|
+
|
|
20
40
|
logger = logging.getLogger(__name__)
|
|
21
41
|
hud_console = HUDConsole()
|
|
22
42
|
|
|
43
|
+
_CONFIG_PATH = ".hud_eval.toml"
|
|
23
44
|
|
|
24
|
-
def get_available_models() -> list[dict[str, str | None]]:
|
|
25
|
-
"""Fetch available models from the HUD API (only ready models).
|
|
26
|
-
|
|
27
|
-
Returns:
|
|
28
|
-
List of dicts with 'name', 'vllm_url', and 'base_model' keys
|
|
29
|
-
"""
|
|
30
|
-
try:
|
|
31
|
-
from hud.cli.rl import rl_api
|
|
32
45
|
|
|
33
|
-
|
|
34
|
-
|
|
46
|
+
@dataclass(frozen=True)
|
|
47
|
+
class AgentPreset:
|
|
48
|
+
"""A preset agent configuration combining agent type, model, and optional config."""
|
|
35
49
|
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
50
|
+
name: str
|
|
51
|
+
agent_type: AgentType
|
|
52
|
+
model: str | None = None
|
|
53
|
+
agent_config: dict[str, Any] | None = None
|
|
39
54
|
|
|
40
|
-
# Count other statuses for informational purposes
|
|
41
|
-
training_count = sum(1 for m in models if m.status == "training")
|
|
42
|
-
# other_count = len(models) - len(ready_models) - training_count
|
|
43
55
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
56
|
+
# Built-in presets for the interactive picker
|
|
57
|
+
_AGENT_PRESETS: list[AgentPreset] = [
|
|
58
|
+
# Native agents (use provider SDKs directly)
|
|
59
|
+
AgentPreset("Claude Sonnet 4.5", AgentType.CLAUDE, "claude-sonnet-4-5"),
|
|
60
|
+
AgentPreset("GPT-5", AgentType.OPENAI, "gpt-5"),
|
|
61
|
+
AgentPreset("Operator (OpenAI Computer Use)", AgentType.OPERATOR, "computer-use-preview"),
|
|
62
|
+
AgentPreset("Gemini 3 Pro Preview", AgentType.GEMINI, "gemini-3-pro-preview"),
|
|
63
|
+
AgentPreset(
|
|
64
|
+
"Gemini CUA (Gemini Computer Use)",
|
|
65
|
+
AgentType.GEMINI_CUA,
|
|
66
|
+
"gemini-2.5-computer-use-preview",
|
|
67
|
+
),
|
|
68
|
+
# HUD Gateway presets (models via HUD Inference API)
|
|
69
|
+
AgentPreset(
|
|
70
|
+
"Grok 4-1 Fast (xAI)",
|
|
71
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
72
|
+
"grok-4-1-fast",
|
|
73
|
+
{
|
|
74
|
+
"openai_compatible": {
|
|
75
|
+
"base_url": settings.hud_gateway_url,
|
|
76
|
+
"model_name": "Grok 4-1 Fast",
|
|
77
|
+
}
|
|
78
|
+
},
|
|
79
|
+
),
|
|
80
|
+
AgentPreset(
|
|
81
|
+
"GLM-4.5V (Z-AI)",
|
|
82
|
+
AgentType.OPENAI_COMPATIBLE,
|
|
83
|
+
"z-ai/glm-4.5v",
|
|
84
|
+
{"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.5V"}},
|
|
85
|
+
),
|
|
86
|
+
]
|
|
87
|
+
|
|
88
|
+
_DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
|
|
89
|
+
# Command-line arguments override these settings
|
|
90
|
+
|
|
91
|
+
[eval]
|
|
92
|
+
# source = "hud-evals/SheetBench-50"
|
|
93
|
+
# agent = "claude"
|
|
94
|
+
# all = false # Run all problems instead of just 1
|
|
95
|
+
# max_concurrent = 30
|
|
96
|
+
# max_steps = 10
|
|
97
|
+
# group_size = 1
|
|
98
|
+
# byok = false # Remote only; use encrypted env vars on the platform.
|
|
99
|
+
# task_ids = ["task_1", "task_2"]
|
|
100
|
+
# verbose = true
|
|
101
|
+
# very_verbose = true
|
|
102
|
+
# auto_respond = true
|
|
103
|
+
# gateway = false # Route LLM API calls through HUD Gateway
|
|
104
|
+
|
|
105
|
+
[agent]
|
|
106
|
+
# allowed_tools = ["computer", "playwright"]
|
|
107
|
+
# disallowed_tools = []
|
|
108
|
+
|
|
109
|
+
[claude]
|
|
110
|
+
# model = "claude-sonnet-4-5"
|
|
111
|
+
# max_tokens = 16384
|
|
112
|
+
# use_computer_beta = true
|
|
113
|
+
|
|
114
|
+
[openai]
|
|
115
|
+
# model = "gpt-4o"
|
|
116
|
+
# temperature = 0.7
|
|
117
|
+
# max_output_tokens = 4096
|
|
118
|
+
|
|
119
|
+
[gemini]
|
|
120
|
+
# model = "gemini-2.5-pro"
|
|
121
|
+
# temperature = 1.0
|
|
122
|
+
# top_p = 0.95
|
|
123
|
+
|
|
124
|
+
[gemini_cua]
|
|
125
|
+
# model = "gemini-2.5-computer-use-preview"
|
|
126
|
+
# temperature = 1.0
|
|
127
|
+
# top_p = 0.95
|
|
128
|
+
# excluded_predefined_functions = []
|
|
129
|
+
|
|
130
|
+
[openai_compatible]
|
|
131
|
+
# base_url = "http://localhost:8000/v1"
|
|
132
|
+
# model = "my-model"
|
|
133
|
+
"""
|
|
134
|
+
|
|
135
|
+
# Agent type -> (settings attr, env var name)
|
|
136
|
+
_API_KEY_REQUIREMENTS: dict[AgentType, tuple[str, str]] = {
|
|
137
|
+
AgentType.CLAUDE: ("anthropic_api_key", "ANTHROPIC_API_KEY"),
|
|
138
|
+
AgentType.GEMINI: ("gemini_api_key", "GEMINI_API_KEY"),
|
|
139
|
+
AgentType.GEMINI_CUA: ("gemini_api_key", "GEMINI_API_KEY"),
|
|
140
|
+
AgentType.OPENAI: ("openai_api_key", "OPENAI_API_KEY"),
|
|
141
|
+
AgentType.OPERATOR: ("openai_api_key", "OPENAI_API_KEY"),
|
|
142
|
+
}
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
class EvalConfig(BaseModel):
|
|
146
|
+
"""Configuration for hud eval command."""
|
|
147
|
+
|
|
148
|
+
# Class-level registry
|
|
149
|
+
_agent_classes: ClassVar[dict[AgentType, type["MCPAgent"]]] = {}
|
|
150
|
+
|
|
151
|
+
# Fields loaded from [eval] section
|
|
152
|
+
_EVAL_FIELDS: ClassVar[set[str]] = {
|
|
153
|
+
"source",
|
|
154
|
+
"agent_type",
|
|
155
|
+
"task_ids",
|
|
156
|
+
"all",
|
|
157
|
+
"max_concurrent",
|
|
158
|
+
"max_steps",
|
|
159
|
+
"verbose",
|
|
160
|
+
"very_verbose",
|
|
161
|
+
"group_size",
|
|
162
|
+
"byok",
|
|
163
|
+
"remote",
|
|
164
|
+
"auto_respond",
|
|
165
|
+
"quiet",
|
|
166
|
+
"gateway",
|
|
167
|
+
"taskset",
|
|
168
|
+
}
|
|
169
|
+
# Fields loaded from [agent] section
|
|
170
|
+
_AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
|
|
171
|
+
|
|
172
|
+
# Eval settings
|
|
173
|
+
source: str | None = None
|
|
174
|
+
agent_type: AgentType | None = None
|
|
175
|
+
model: str | None = None
|
|
176
|
+
task_ids: list[str] | None = None
|
|
177
|
+
all: bool = False # Run all problems instead of just 1
|
|
178
|
+
max_concurrent: int = 30
|
|
179
|
+
max_steps: int = 10
|
|
180
|
+
verbose: bool = False
|
|
181
|
+
very_verbose: bool = False
|
|
182
|
+
auto_respond: bool | None = None # Continue without prompting
|
|
183
|
+
group_size: int = 1
|
|
184
|
+
byok: bool = False
|
|
185
|
+
remote: bool = False
|
|
186
|
+
quiet: bool = False # Suppress opening browser for eval links
|
|
187
|
+
gateway: bool = False # Use HUD Gateway for LLM API calls
|
|
188
|
+
taskset: str | None = None # Taskset slug to associate job with
|
|
189
|
+
|
|
190
|
+
# Base agent config (these merge with task's agent_config)
|
|
191
|
+
allowed_tools: list[str] | None = None
|
|
192
|
+
disallowed_tools: list[str] | None = None
|
|
193
|
+
|
|
194
|
+
agent_config: dict[str, Any] = Field(default_factory=dict)
|
|
195
|
+
|
|
196
|
+
@field_validator("agent_type", mode="before")
|
|
197
|
+
@classmethod
|
|
198
|
+
def _parse_agent_type(cls, v: Any) -> AgentType | None:
|
|
199
|
+
"""Convert string agent name to AgentType enum."""
|
|
200
|
+
if v is None:
|
|
201
|
+
return None
|
|
202
|
+
if isinstance(v, AgentType):
|
|
203
|
+
return v
|
|
204
|
+
if isinstance(v, str):
|
|
205
|
+
try:
|
|
206
|
+
return AgentType(v)
|
|
207
|
+
except ValueError:
|
|
208
|
+
valid = [e.value for e in AgentType]
|
|
209
|
+
raise ValueError(
|
|
210
|
+
f"Invalid agent: {v}. Must be one of: {', '.join(valid)}"
|
|
211
|
+
) from None
|
|
212
|
+
return v
|
|
213
|
+
|
|
214
|
+
def validate_api_keys(self) -> None:
|
|
215
|
+
"""Validate required API keys for the selected agent. Raises typer.Exit on failure."""
|
|
216
|
+
# BYOK requires remote execution (check before agent_type guard)
|
|
217
|
+
if self.byok and not self.remote:
|
|
218
|
+
hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
|
|
219
|
+
raise typer.Exit(1)
|
|
52
220
|
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
hud_console.
|
|
60
|
-
|
|
221
|
+
if self.agent_type is None:
|
|
222
|
+
return
|
|
223
|
+
|
|
224
|
+
if self.remote:
|
|
225
|
+
if not settings.api_key:
|
|
226
|
+
hud_console.error("HUD_API_KEY is required for remote execution")
|
|
227
|
+
hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
|
|
228
|
+
raise typer.Exit(1)
|
|
229
|
+
return
|
|
230
|
+
|
|
231
|
+
# Gateway mode only requires HUD_API_KEY
|
|
232
|
+
if self.gateway:
|
|
233
|
+
if not settings.api_key:
|
|
234
|
+
hud_console.error("HUD_API_KEY is required for gateway mode")
|
|
235
|
+
hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
|
|
236
|
+
raise typer.Exit(1)
|
|
237
|
+
return
|
|
238
|
+
|
|
239
|
+
if self.agent_type == AgentType.OPENAI_COMPATIBLE:
|
|
240
|
+
# Check both CLI --model and config file model
|
|
241
|
+
config_model = self.agent_config.get("openai_compatible", {}).get("model")
|
|
242
|
+
if not self.model and not config_model:
|
|
243
|
+
hud_console.error(
|
|
244
|
+
"Model name is required for OpenAI compatible agent. "
|
|
245
|
+
"Use --model or set model in [openai_compatible] section of .hud_eval.toml"
|
|
61
246
|
)
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
return []
|
|
69
|
-
|
|
70
|
-
|
|
71
|
-
def build_agent(
|
|
72
|
-
agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
|
|
73
|
-
*,
|
|
74
|
-
model: str | None = None,
|
|
75
|
-
allowed_tools: list[str] | None = None,
|
|
76
|
-
verbose: bool = False,
|
|
77
|
-
vllm_base_url: str | None = None,
|
|
78
|
-
) -> Any:
|
|
79
|
-
"""Create and return the requested agent type."""
|
|
80
|
-
|
|
81
|
-
# Import agents lazily to avoid dependency issues
|
|
82
|
-
if agent_type == "integration_test":
|
|
83
|
-
from hud.agents.misc.integration_test_agent import IntegrationTestRunner
|
|
84
|
-
|
|
85
|
-
return IntegrationTestRunner(verbose=verbose)
|
|
86
|
-
elif agent_type == "vllm":
|
|
87
|
-
# Create a generic OpenAI agent for vLLM server
|
|
88
|
-
try:
|
|
89
|
-
from openai import AsyncOpenAI
|
|
90
|
-
|
|
91
|
-
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
92
|
-
except ImportError as e:
|
|
93
|
-
hud_console.error(
|
|
94
|
-
"OpenAI dependencies are not installed. "
|
|
95
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
247
|
+
raise typer.Exit(1)
|
|
248
|
+
elif self.agent_type == AgentType.CLAUDE and _is_bedrock_arn(self.model):
|
|
249
|
+
missing_aws = (
|
|
250
|
+
not settings.aws_access_key_id
|
|
251
|
+
or not settings.aws_secret_access_key
|
|
252
|
+
or not settings.aws_region
|
|
96
253
|
)
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
)
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
254
|
+
if missing_aws:
|
|
255
|
+
hud_console.error(
|
|
256
|
+
"AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
|
|
257
|
+
"are required for AWS Bedrock"
|
|
258
|
+
)
|
|
259
|
+
raise typer.Exit(1)
|
|
260
|
+
elif self.agent_type in _API_KEY_REQUIREMENTS:
|
|
261
|
+
attr, env_var = _API_KEY_REQUIREMENTS[self.agent_type]
|
|
262
|
+
if not getattr(settings, attr, None):
|
|
263
|
+
hud_console.error(f"{env_var} is required for {self.agent_type.value} agent")
|
|
264
|
+
hud_console.info(f"Set it: hud set {env_var}=your-key-here")
|
|
265
|
+
raise typer.Exit(1)
|
|
266
|
+
|
|
267
|
+
if not settings.api_key:
|
|
268
|
+
hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
|
|
269
|
+
|
|
270
|
+
def get_agent_kwargs(self) -> dict[str, Any]:
|
|
271
|
+
"""Build agent kwargs from config.
|
|
272
|
+
|
|
273
|
+
Model precedence:
|
|
274
|
+
1. CLI --model (highest priority)
|
|
275
|
+
2. [agent_type].model in TOML (per-agent config)
|
|
276
|
+
"""
|
|
277
|
+
if self.agent_type is None:
|
|
278
|
+
raise ValueError("agent_type must be set before calling get_agent_kwargs()")
|
|
279
|
+
|
|
280
|
+
kwargs: dict[str, Any] = {}
|
|
281
|
+
|
|
282
|
+
if self.allowed_tools:
|
|
283
|
+
kwargs["allowed_tools"] = self.allowed_tools
|
|
284
|
+
if self.disallowed_tools:
|
|
285
|
+
kwargs["disallowed_tools"] = self.disallowed_tools
|
|
286
|
+
|
|
287
|
+
# Apply agent-specific config
|
|
288
|
+
agent_key = self.agent_type.value
|
|
289
|
+
if agent_key in self.agent_config:
|
|
290
|
+
agent_cfg = dict(self.agent_config[agent_key])
|
|
291
|
+
kwargs.update(agent_cfg)
|
|
292
|
+
|
|
293
|
+
# CLI --model always wins
|
|
294
|
+
if self.model:
|
|
295
|
+
kwargs["model"] = self.model
|
|
296
|
+
|
|
297
|
+
# For gateway base_url, inject HUD API key if not already set
|
|
298
|
+
if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
|
|
299
|
+
base_url = kwargs.get("base_url", "")
|
|
300
|
+
if settings.hud_gateway_url in base_url and settings.api_key:
|
|
301
|
+
kwargs["api_key"] = settings.api_key
|
|
302
|
+
|
|
303
|
+
# Auto-detect Bedrock when Claude is selected with a Bedrock ARN
|
|
304
|
+
# Check both model and checkpoint_name for ARN patterns
|
|
305
|
+
bedrock_arn_detected = _is_bedrock_arn(kwargs.get("model")) or _is_bedrock_arn(
|
|
306
|
+
kwargs.get("checkpoint_name")
|
|
128
307
|
)
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
hud_console.error(
|
|
135
|
-
"OpenAI agent dependencies are not installed. "
|
|
136
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
308
|
+
if self.agent_type == AgentType.CLAUDE and bedrock_arn_detected:
|
|
309
|
+
missing_aws = (
|
|
310
|
+
not settings.aws_access_key_id
|
|
311
|
+
or not settings.aws_secret_access_key
|
|
312
|
+
or not settings.aws_region
|
|
137
313
|
)
|
|
138
|
-
|
|
314
|
+
if missing_aws:
|
|
315
|
+
hud_console.error(
|
|
316
|
+
"AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
|
|
317
|
+
"are required for AWS Bedrock"
|
|
318
|
+
)
|
|
319
|
+
raise typer.Exit(1)
|
|
139
320
|
|
|
140
|
-
|
|
141
|
-
return OperatorAgent(
|
|
142
|
-
allowed_tools=allowed_tools,
|
|
143
|
-
verbose=verbose,
|
|
144
|
-
)
|
|
145
|
-
else:
|
|
146
|
-
return OperatorAgent(verbose=verbose)
|
|
321
|
+
from anthropic import AsyncAnthropicBedrock
|
|
147
322
|
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
hud_console.error(
|
|
153
|
-
"LiteLLM agent dependencies are not installed. "
|
|
154
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
323
|
+
kwargs["model_client"] = AsyncAnthropicBedrock(
|
|
324
|
+
aws_access_key=settings.aws_access_key_id,
|
|
325
|
+
aws_secret_key=settings.aws_secret_access_key,
|
|
326
|
+
aws_region=settings.aws_region or "us-east-1",
|
|
155
327
|
)
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
|
|
328
|
+
hud_console.info("🔧 Using AWS Bedrock (detected ARN in model)")
|
|
329
|
+
|
|
330
|
+
kwargs["verbose"] = self.verbose or self.very_verbose
|
|
331
|
+
|
|
332
|
+
if self.agent_type in (
|
|
333
|
+
AgentType.CLAUDE,
|
|
334
|
+
AgentType.OPENAI,
|
|
335
|
+
AgentType.OPERATOR,
|
|
336
|
+
AgentType.GEMINI,
|
|
337
|
+
AgentType.GEMINI_CUA,
|
|
338
|
+
):
|
|
339
|
+
kwargs["validate_api_key"] = False
|
|
340
|
+
|
|
341
|
+
# Configure gateway mode - route LLM API calls through HUD gateway
|
|
342
|
+
if self.gateway:
|
|
343
|
+
if not settings.api_key:
|
|
344
|
+
raise typer.Exit(1) # Already validated in validate_api_keys()
|
|
345
|
+
|
|
346
|
+
from hud.agents.gateway import build_gateway_client
|
|
347
|
+
|
|
348
|
+
# Map AgentType to provider
|
|
349
|
+
agent_to_provider = {
|
|
350
|
+
AgentType.CLAUDE: "anthropic",
|
|
351
|
+
AgentType.OPENAI: "openai",
|
|
352
|
+
AgentType.OPERATOR: "openai",
|
|
353
|
+
AgentType.GEMINI: "gemini",
|
|
354
|
+
AgentType.GEMINI_CUA: "gemini",
|
|
355
|
+
AgentType.OPENAI_COMPATIBLE: "openai",
|
|
356
|
+
}
|
|
357
|
+
provider = agent_to_provider.get(self.agent_type, "openai")
|
|
358
|
+
client = build_gateway_client(provider)
|
|
359
|
+
|
|
360
|
+
# OpenAI-compatible uses openai_client key
|
|
361
|
+
is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
|
|
362
|
+
kwargs["openai_client" if is_oai_compat else "model_client"] = client
|
|
363
|
+
hud_console.info(f"🌐 Using HUD Gateway for {provider} API")
|
|
364
|
+
|
|
365
|
+
return kwargs
|
|
366
|
+
|
|
367
|
+
@classmethod
|
|
368
|
+
def load(cls, path: str = _CONFIG_PATH) -> EvalConfig:
|
|
369
|
+
"""Load config from TOML file."""
|
|
370
|
+
p = Path(path)
|
|
371
|
+
if not p.exists():
|
|
372
|
+
p.write_text(_DEFAULT_CONFIG_TEMPLATE)
|
|
373
|
+
hud_console.info(f"Generated {_CONFIG_PATH}")
|
|
374
|
+
return cls()
|
|
163
375
|
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
171
|
-
)
|
|
172
|
-
raise typer.Exit(1) from e
|
|
376
|
+
try:
|
|
377
|
+
with open(p, "rb") as f:
|
|
378
|
+
toml_data = tomllib.load(f)
|
|
379
|
+
except Exception as e:
|
|
380
|
+
hud_console.warning(f"Failed to parse {path}: {e}")
|
|
381
|
+
return cls()
|
|
173
382
|
|
|
174
|
-
|
|
383
|
+
toml_data = resolve_env_vars(toml_data)
|
|
175
384
|
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
allowed_tools=allowed_tools,
|
|
180
|
-
verbose=verbose,
|
|
181
|
-
)
|
|
182
|
-
else:
|
|
183
|
-
return ClaudeAgent(
|
|
184
|
-
model=model,
|
|
185
|
-
verbose=verbose,
|
|
186
|
-
)
|
|
385
|
+
# Extract sections
|
|
386
|
+
eval_section = toml_data.get("eval", {})
|
|
387
|
+
agent_section = toml_data.get("agent", {})
|
|
187
388
|
|
|
389
|
+
# Build config data
|
|
390
|
+
data: dict[str, Any] = {}
|
|
188
391
|
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
max_steps: int = 10,
|
|
196
|
-
verbose: bool = False,
|
|
197
|
-
vllm_base_url: str | None = None,
|
|
198
|
-
group_size: int = 1,
|
|
199
|
-
) -> None:
|
|
200
|
-
"""Load one task and execute it, or detect if JSON contains a list and run as dataset."""
|
|
392
|
+
# Eval settings (map 'agent' -> 'agent_type')
|
|
393
|
+
if "agent" in eval_section:
|
|
394
|
+
data["agent_type"] = eval_section["agent"]
|
|
395
|
+
for key in cls._EVAL_FIELDS:
|
|
396
|
+
if key in eval_section:
|
|
397
|
+
data[key] = eval_section[key]
|
|
201
398
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
hud_console.error(
|
|
207
|
-
"Dataset dependencies are not installed. "
|
|
208
|
-
"Please install with: pip install 'hud-python\u27e6agent\u27e7'"
|
|
209
|
-
)
|
|
210
|
-
raise typer.Exit(1) from e
|
|
399
|
+
# Agent base config
|
|
400
|
+
for key in cls._AGENT_FIELDS:
|
|
401
|
+
if key in agent_section:
|
|
402
|
+
data[key] = agent_section[key]
|
|
211
403
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
404
|
+
# Agent-specific configs (claude, openai, gemini, etc.)
|
|
405
|
+
agent_config: dict[str, Any] = {}
|
|
406
|
+
for agent_type in AgentType:
|
|
407
|
+
if agent_type.value in toml_data:
|
|
408
|
+
agent_config[agent_type.value] = toml_data[agent_type.value]
|
|
409
|
+
data["agent_config"] = agent_config
|
|
216
410
|
|
|
217
|
-
# If tasks reference a local environment (nearby), ensure it's built/up-to-date.
|
|
218
411
|
try:
|
|
219
|
-
|
|
220
|
-
if env_dir is not None:
|
|
221
|
-
# Non-interactive for eval; warn but don't block
|
|
222
|
-
ensure_built(env_dir, interactive=False)
|
|
412
|
+
return cls.model_validate(data)
|
|
223
413
|
except Exception as e:
|
|
224
|
-
hud_console.
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
|
|
249
|
-
|
|
250
|
-
|
|
251
|
-
|
|
252
|
-
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
269
|
-
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
|
|
283
|
-
|
|
284
|
-
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
292
|
-
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
297
|
-
|
|
298
|
-
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
302
|
-
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
307
|
-
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
)
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
414
|
+
hud_console.warning(f"Invalid config: {e}")
|
|
415
|
+
return cls()
|
|
416
|
+
|
|
417
|
+
def merge_cli(
|
|
418
|
+
self,
|
|
419
|
+
agent: str | None = None,
|
|
420
|
+
config: list[str] | None = None,
|
|
421
|
+
allowed_tools: str | None = None,
|
|
422
|
+
disallowed_tools: str | None = None,
|
|
423
|
+
task_ids: str | None = None,
|
|
424
|
+
**cli_args: Any,
|
|
425
|
+
) -> EvalConfig:
|
|
426
|
+
"""Merge CLI args (non-None values override config)."""
|
|
427
|
+
overrides: dict[str, Any] = {}
|
|
428
|
+
|
|
429
|
+
if agent is not None:
|
|
430
|
+
overrides["agent_type"] = agent
|
|
431
|
+
|
|
432
|
+
# Parse comma-separated lists
|
|
433
|
+
if allowed_tools is not None:
|
|
434
|
+
overrides["allowed_tools"] = [t.strip() for t in allowed_tools.split(",") if t.strip()]
|
|
435
|
+
if disallowed_tools is not None:
|
|
436
|
+
overrides["disallowed_tools"] = [
|
|
437
|
+
t.strip() for t in disallowed_tools.split(",") if t.strip()
|
|
438
|
+
]
|
|
439
|
+
if task_ids is not None:
|
|
440
|
+
overrides["task_ids"] = [t.strip() for t in task_ids.split(",") if t.strip()]
|
|
441
|
+
|
|
442
|
+
overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
|
|
443
|
+
|
|
444
|
+
for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
|
|
445
|
+
if cli_args.get(k) is True:
|
|
446
|
+
overrides[k] = True
|
|
447
|
+
elif k in overrides and cli_args.get(k) is False:
|
|
448
|
+
del overrides[k]
|
|
449
|
+
|
|
450
|
+
# --full is a shortcut for --all --auto-respond --max-steps 100
|
|
451
|
+
if overrides.get("full"):
|
|
452
|
+
overrides["all"] = True
|
|
453
|
+
if "auto_respond" not in overrides:
|
|
454
|
+
overrides["auto_respond"] = True
|
|
455
|
+
if "max_steps" not in overrides:
|
|
456
|
+
overrides["max_steps"] = 100
|
|
457
|
+
|
|
458
|
+
if config:
|
|
459
|
+
merged_agent_config = dict(self.agent_config)
|
|
460
|
+
for item in config:
|
|
461
|
+
if "=" in item:
|
|
462
|
+
key, value = item.split("=", 1)
|
|
463
|
+
key = key.strip()
|
|
464
|
+
value = value.strip()
|
|
465
|
+
|
|
466
|
+
# Parse value
|
|
467
|
+
if value.lower() == "true":
|
|
468
|
+
parsed_value: Any = True
|
|
469
|
+
elif value.lower() == "false":
|
|
470
|
+
parsed_value = False
|
|
471
|
+
else:
|
|
472
|
+
try:
|
|
473
|
+
parsed_value = int(value)
|
|
474
|
+
except ValueError:
|
|
475
|
+
try:
|
|
476
|
+
parsed_value = float(value)
|
|
477
|
+
except ValueError:
|
|
478
|
+
parsed_value = value
|
|
479
|
+
|
|
480
|
+
# Handle namespaced keys (e.g., claude.max_tokens)
|
|
481
|
+
if "." in key:
|
|
482
|
+
agent_name, param = key.split(".", 1)
|
|
483
|
+
if agent_name not in merged_agent_config:
|
|
484
|
+
merged_agent_config[agent_name] = {}
|
|
485
|
+
merged_agent_config[agent_name][param] = parsed_value
|
|
486
|
+
else:
|
|
487
|
+
# Non-namespaced: apply to current agent if set
|
|
488
|
+
if self.agent_type:
|
|
489
|
+
agent_name = self.agent_type.value
|
|
490
|
+
if agent_name not in merged_agent_config:
|
|
491
|
+
merged_agent_config[agent_name] = {}
|
|
492
|
+
merged_agent_config[agent_name][key] = parsed_value
|
|
493
|
+
|
|
494
|
+
overrides["agent_config"] = merged_agent_config
|
|
495
|
+
|
|
496
|
+
return self.model_validate({**self.model_dump(), **overrides})
|
|
497
|
+
|
|
498
|
+
def resolve_agent_interactive(self) -> EvalConfig:
|
|
499
|
+
"""Prompt user to select an agent preset if not set. Returns updated config."""
|
|
500
|
+
if self.agent_type is not None:
|
|
501
|
+
return self
|
|
502
|
+
|
|
503
|
+
# Build choices from presets
|
|
504
|
+
choices: list[dict[str, Any]] = [
|
|
505
|
+
{"name": preset.name, "value": preset} for preset in _AGENT_PRESETS
|
|
506
|
+
]
|
|
507
|
+
|
|
508
|
+
selected: AgentPreset = hud_console.select("Select an agent:", choices=choices, default=0) # type: ignore[arg-type]
|
|
509
|
+
|
|
510
|
+
# Merge preset into config
|
|
511
|
+
updates: dict[str, Any] = {"agent_type": selected.agent_type}
|
|
512
|
+
if selected.model:
|
|
513
|
+
updates["model"] = selected.model
|
|
514
|
+
if selected.agent_config:
|
|
515
|
+
# Merge preset's agent_config with existing
|
|
516
|
+
merged = dict(self.agent_config)
|
|
517
|
+
for key, value in selected.agent_config.items():
|
|
518
|
+
if key in merged:
|
|
519
|
+
merged[key] = {**merged[key], **value}
|
|
520
|
+
else:
|
|
521
|
+
merged[key] = value
|
|
522
|
+
updates["agent_config"] = merged
|
|
523
|
+
|
|
524
|
+
return self.model_validate({**self.model_dump(), **updates})
|
|
525
|
+
|
|
526
|
+
def display(self) -> None:
|
|
527
|
+
"""Display settings in a table."""
|
|
528
|
+
table = Table(title="Evaluation Settings", title_style="bold cyan", box=box.ROUNDED)
|
|
529
|
+
table.add_column("Setting", style="yellow")
|
|
530
|
+
table.add_column("Value", style="green")
|
|
531
|
+
|
|
532
|
+
# Core settings
|
|
533
|
+
table.add_row("source", str(self.source or "—"))
|
|
534
|
+
table.add_row("agent", self.agent_type.value) # type: ignore[union-attr]
|
|
535
|
+
if self.task_ids:
|
|
536
|
+
table.add_row(
|
|
537
|
+
"task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
|
|
329
538
|
)
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
539
|
+
table.add_row("all", str(self.all))
|
|
540
|
+
table.add_row("max_steps", str(self.max_steps))
|
|
541
|
+
if not self.remote:
|
|
542
|
+
table.add_row("max_concurrent", str(self.max_concurrent))
|
|
543
|
+
if self.group_size > 1:
|
|
544
|
+
table.add_row("group_size", str(self.group_size))
|
|
545
|
+
if self.auto_respond:
|
|
546
|
+
table.add_row("auto_respond", "[bold green]True[/bold green]")
|
|
547
|
+
if self.very_verbose:
|
|
548
|
+
table.add_row("very_verbose", "[bold green]True[/bold green]")
|
|
549
|
+
elif self.verbose:
|
|
550
|
+
table.add_row("verbose", "[bold green]True[/bold green]")
|
|
551
|
+
if self.remote:
|
|
552
|
+
table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
|
|
553
|
+
if self.gateway:
|
|
554
|
+
table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
|
|
555
|
+
if self.byok:
|
|
556
|
+
table.add_row("byok", "[bold green]True[/bold green] (remote only)")
|
|
557
|
+
|
|
558
|
+
# Tool filters (only if set)
|
|
559
|
+
if self.allowed_tools:
|
|
560
|
+
table.add_row("allowed_tools", ", ".join(self.allowed_tools))
|
|
561
|
+
if self.disallowed_tools:
|
|
562
|
+
table.add_row("disallowed_tools", ", ".join(self.disallowed_tools))
|
|
563
|
+
|
|
564
|
+
# Agent config section
|
|
565
|
+
if self.agent_type:
|
|
566
|
+
table.add_row("", "")
|
|
567
|
+
table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
|
|
568
|
+
|
|
569
|
+
config_cls = self.agent_type.config_cls
|
|
570
|
+
defaults = config_cls()
|
|
571
|
+
overrides = self.agent_config.get(self.agent_type.value, {})
|
|
572
|
+
skip = {
|
|
573
|
+
"model_client",
|
|
574
|
+
"model_name",
|
|
575
|
+
"validate_api_key",
|
|
576
|
+
"model_config",
|
|
577
|
+
"allowed_tools",
|
|
578
|
+
"disallowed_tools",
|
|
579
|
+
"system_prompt",
|
|
580
|
+
"response_tool_name",
|
|
581
|
+
"append_setup_output",
|
|
582
|
+
"initial_screenshot",
|
|
583
|
+
}
|
|
584
|
+
|
|
585
|
+
sensitive_fields = {"api_key", "api_secret", "token", "password", "secret"}
|
|
586
|
+
|
|
587
|
+
for name in config_cls.model_fields:
|
|
588
|
+
if name in skip:
|
|
589
|
+
continue
|
|
590
|
+
# Always show model
|
|
591
|
+
if name == "model":
|
|
592
|
+
if self.model:
|
|
593
|
+
value = self.model
|
|
594
|
+
elif overrides.get("model"):
|
|
595
|
+
value = overrides["model"]
|
|
596
|
+
else:
|
|
597
|
+
value = getattr(defaults, "model", None)
|
|
598
|
+
table.add_row(" model", str(value) if value else "—")
|
|
599
|
+
elif name in overrides:
|
|
600
|
+
value = overrides[name]
|
|
601
|
+
if name in sensitive_fields and value:
|
|
602
|
+
display_value = f"{str(value)[:4]}****" if len(str(value)) > 4 else "****"
|
|
603
|
+
else:
|
|
604
|
+
display_value = str(value)
|
|
605
|
+
table.add_row(f" {name}", display_value)
|
|
606
|
+
|
|
607
|
+
hud_console.console.print(table)
|
|
608
|
+
|
|
609
|
+
|
|
610
|
+
# =============================================================================
|
|
611
|
+
# Evaluation runner
|
|
612
|
+
# =============================================================================
|
|
613
|
+
|
|
614
|
+
|
|
615
|
+
async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
|
|
616
|
+
"""Run evaluation with the given config using run_dataset()."""
|
|
617
|
+
from hud.datasets import load_tasks, run_dataset
|
|
618
|
+
|
|
619
|
+
if cfg.source is None or cfg.agent_type is None:
|
|
620
|
+
raise ValueError("source and agent_type must be set")
|
|
621
|
+
|
|
622
|
+
# Load tasks using unified loader (handles v4→v5 conversion automatically)
|
|
623
|
+
hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
|
|
624
|
+
tasks = load_tasks(cfg.source)
|
|
369
625
|
|
|
370
626
|
if not tasks:
|
|
371
|
-
hud_console.error(f"No tasks found in: {source}")
|
|
627
|
+
hud_console.error(f"No tasks found in: {cfg.source}")
|
|
372
628
|
raise typer.Exit(1)
|
|
373
629
|
|
|
374
|
-
#
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
|
|
382
|
-
|
|
383
|
-
|
|
630
|
+
# Filter by task IDs if provided
|
|
631
|
+
if cfg.task_ids:
|
|
632
|
+
id_set = set(cfg.task_ids)
|
|
633
|
+
# Match by task.id or index
|
|
634
|
+
filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
|
|
635
|
+
if not filtered:
|
|
636
|
+
hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
|
|
637
|
+
raise typer.Exit(1)
|
|
638
|
+
hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
|
|
639
|
+
tasks = filtered
|
|
640
|
+
elif not cfg.all:
|
|
641
|
+
# Single task mode (no --all, --full, or --task-ids)
|
|
642
|
+
tasks = [tasks[0]]
|
|
643
|
+
hud_console.info("Using first task (run with --full or --task-ids for more)…")
|
|
644
|
+
|
|
645
|
+
hud_console.info(f"Loaded {len(tasks)} task(s)")
|
|
646
|
+
|
|
647
|
+
# Prepare agent kwargs
|
|
648
|
+
agent_kwargs = cfg.get_agent_kwargs()
|
|
649
|
+
auto_respond = cfg.auto_respond
|
|
650
|
+
if auto_respond:
|
|
651
|
+
agent_kwargs = {**agent_kwargs, "auto_respond": True}
|
|
652
|
+
|
|
653
|
+
max_steps = cfg.max_steps
|
|
654
|
+
|
|
655
|
+
# Remote execution - submit to HUD platform
|
|
656
|
+
if cfg.remote:
|
|
657
|
+
agent_kwargs = {
|
|
658
|
+
k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
|
|
659
|
+
}
|
|
660
|
+
import uuid
|
|
384
661
|
|
|
385
|
-
|
|
386
|
-
|
|
387
|
-
elif agent_type == "vllm":
|
|
388
|
-
try:
|
|
389
|
-
from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
|
|
662
|
+
from hud.datasets.utils import submit_rollouts
|
|
663
|
+
from hud.eval.manager import _send_job_enter
|
|
390
664
|
|
|
391
|
-
|
|
392
|
-
|
|
393
|
-
|
|
394
|
-
"OpenAI dependencies are not installed. "
|
|
395
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
396
|
-
)
|
|
397
|
-
raise typer.Exit(1) from e
|
|
398
|
-
|
|
399
|
-
# Use build_agent to create a sample agent to get the config
|
|
400
|
-
sample_agent = build_agent(
|
|
401
|
-
agent_type,
|
|
402
|
-
model=model,
|
|
403
|
-
allowed_tools=allowed_tools,
|
|
404
|
-
verbose=verbose,
|
|
405
|
-
vllm_base_url=vllm_base_url,
|
|
665
|
+
job_id = str(uuid.uuid4())
|
|
666
|
+
hud_console.info(
|
|
667
|
+
f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
|
|
406
668
|
)
|
|
407
669
|
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
}
|
|
415
|
-
if allowed_tools:
|
|
416
|
-
agent_config["allowed_tools"] = allowed_tools
|
|
417
|
-
elif agent_type == "openai":
|
|
418
|
-
try:
|
|
419
|
-
from hud.agents import OperatorAgent
|
|
420
|
-
|
|
421
|
-
agent_class = OperatorAgent
|
|
422
|
-
except ImportError as e:
|
|
423
|
-
hud_console.error(
|
|
424
|
-
"OpenAI agent dependencies are not installed. "
|
|
425
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
670
|
+
if cfg.taskset:
|
|
671
|
+
tasks_to_create = [t for t in tasks if not t.id]
|
|
672
|
+
tasks_data = (
|
|
673
|
+
[t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
|
|
674
|
+
if tasks_to_create
|
|
675
|
+
else None
|
|
426
676
|
)
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
from hud.agents.lite_llm import LiteAgent
|
|
436
|
-
|
|
437
|
-
agent_class = LiteAgent
|
|
438
|
-
except ImportError as e:
|
|
439
|
-
hud_console.error(
|
|
440
|
-
"LiteLLM agent dependencies are not installed. "
|
|
441
|
-
"Please install with: pip install 'hud-python[agent]'"
|
|
677
|
+
ids = await _send_job_enter(
|
|
678
|
+
job_id=job_id,
|
|
679
|
+
name=f"eval ({cfg.source})" if cfg.source else "eval",
|
|
680
|
+
variants=None,
|
|
681
|
+
group=cfg.group_size,
|
|
682
|
+
api_key=None,
|
|
683
|
+
taskset=cfg.taskset,
|
|
684
|
+
tasks=tasks_data,
|
|
442
685
|
)
|
|
443
|
-
|
|
686
|
+
if ids:
|
|
687
|
+
if len(ids) != len(tasks_to_create):
|
|
688
|
+
hud_console.warning(
|
|
689
|
+
f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
|
|
690
|
+
f"received {len(ids)} IDs. Some tasks may not be linked."
|
|
691
|
+
)
|
|
692
|
+
for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
|
|
693
|
+
task_obj.id = task_version_id
|
|
694
|
+
|
|
695
|
+
await submit_rollouts(
|
|
696
|
+
tasks=tasks,
|
|
697
|
+
job_id=job_id,
|
|
698
|
+
agent_type=cfg.agent_type,
|
|
699
|
+
agent_params=agent_kwargs,
|
|
700
|
+
max_steps=max_steps,
|
|
701
|
+
group_size=cfg.group_size,
|
|
702
|
+
use_byok=cfg.byok,
|
|
703
|
+
)
|
|
444
704
|
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
"verbose": verbose,
|
|
448
|
-
}
|
|
449
|
-
if allowed_tools:
|
|
450
|
-
agent_config["allowed_tools"] = allowed_tools
|
|
705
|
+
hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
|
|
706
|
+
return [], tasks
|
|
451
707
|
|
|
708
|
+
# Single task mode - show extra info
|
|
709
|
+
if len(tasks) == 1 and cfg.group_size == 1:
|
|
710
|
+
logging.getLogger("hud.agents").setLevel(logging.INFO)
|
|
711
|
+
logging.getLogger("hud.agents.base").setLevel(logging.INFO)
|
|
712
|
+
# Get prompt from args (v4 tasks) or show scenario name
|
|
713
|
+
prompt = tasks[0].args.get("prompt") if tasks[0].args else tasks[0].scenario
|
|
714
|
+
if prompt:
|
|
715
|
+
hud_console.info(f"Prompt: {prompt}")
|
|
452
716
|
else:
|
|
453
|
-
|
|
454
|
-
|
|
717
|
+
hud_console.info(
|
|
718
|
+
f"🚀 Running evaluation (max_concurrent: {cfg.max_concurrent}, "
|
|
719
|
+
f"group_size: {cfg.group_size})…"
|
|
720
|
+
)
|
|
455
721
|
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
722
|
+
# Run using run_dataset
|
|
723
|
+
results = await run_dataset(
|
|
724
|
+
tasks,
|
|
725
|
+
cfg.agent_type,
|
|
726
|
+
agent_params=agent_kwargs,
|
|
727
|
+
max_steps=max_steps,
|
|
728
|
+
max_concurrent=cfg.max_concurrent,
|
|
729
|
+
group_size=cfg.group_size,
|
|
730
|
+
quiet=cfg.quiet,
|
|
731
|
+
taskset=cfg.taskset,
|
|
732
|
+
)
|
|
463
733
|
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
}
|
|
468
|
-
if allowed_tools:
|
|
469
|
-
agent_config["allowed_tools"] = allowed_tools
|
|
470
|
-
|
|
471
|
-
# Use grouped evaluation if group_size > 1
|
|
472
|
-
if group_size > 1:
|
|
473
|
-
hud_console.info(f"🔄 Running dataset with group_size={group_size}")
|
|
474
|
-
|
|
475
|
-
# Run with job tracking
|
|
476
|
-
with hud.job(
|
|
477
|
-
name=f"Evaluation {dataset_name} (group_size={group_size})",
|
|
478
|
-
metadata={
|
|
479
|
-
"dataset": source,
|
|
480
|
-
"group_size": group_size,
|
|
481
|
-
"tasks": len(dataset_or_tasks),
|
|
482
|
-
"total_episodes": len(dataset_or_tasks) * group_size,
|
|
483
|
-
},
|
|
484
|
-
) as job:
|
|
485
|
-
# Convert dicts to Task objects if needed
|
|
486
|
-
from hud.datasets import Task
|
|
487
|
-
|
|
488
|
-
tasks = []
|
|
489
|
-
for item in dataset_or_tasks:
|
|
490
|
-
if isinstance(item, dict):
|
|
491
|
-
tasks.append(Task(**item))
|
|
492
|
-
else:
|
|
493
|
-
tasks.append(item)
|
|
494
|
-
|
|
495
|
-
stats = await run_tasks_grouped(
|
|
496
|
-
tasks=tasks,
|
|
497
|
-
agent_class=agent_class,
|
|
498
|
-
agent_config=agent_config,
|
|
499
|
-
group_size=group_size,
|
|
500
|
-
max_parallel_episodes=max_concurrent
|
|
501
|
-
if not parallel
|
|
502
|
-
else max_concurrent_per_worker * (max_workers or 4),
|
|
503
|
-
max_steps=max_steps,
|
|
504
|
-
verbose=verbose,
|
|
505
|
-
job_id=job.id,
|
|
506
|
-
)
|
|
734
|
+
# Show reward for single task
|
|
735
|
+
if len(tasks) == 1 and cfg.group_size == 1 and results:
|
|
736
|
+
hud_console.success(f"Reward: {results[0].reward}")
|
|
507
737
|
|
|
508
|
-
|
|
509
|
-
display_group_statistics(stats, show_details=len(stats) <= 50)
|
|
738
|
+
return results, tasks
|
|
510
739
|
|
|
511
|
-
# Return stats for consistency with other modes
|
|
512
|
-
return stats
|
|
513
740
|
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
|
|
518
|
-
)
|
|
519
|
-
if max_workers is None:
|
|
520
|
-
# Use auto-optimization (now the default run_dataset_parallel)
|
|
521
|
-
return await run_dataset_parallel(
|
|
522
|
-
name=f"Evaluation {dataset_name}",
|
|
523
|
-
dataset=dataset_or_tasks,
|
|
524
|
-
agent_class=agent_class,
|
|
525
|
-
agent_config=agent_config,
|
|
526
|
-
max_concurrent=max_concurrent,
|
|
527
|
-
metadata={"dataset": source, "parallel": True},
|
|
528
|
-
max_steps=max_steps,
|
|
529
|
-
auto_respond=True,
|
|
530
|
-
)
|
|
531
|
-
else:
|
|
532
|
-
# Use manual configuration
|
|
533
|
-
return await run_dataset_parallel_manual(
|
|
534
|
-
name=f"Evaluation {dataset_name}",
|
|
535
|
-
dataset=dataset_or_tasks,
|
|
536
|
-
agent_class=agent_class,
|
|
537
|
-
agent_config=agent_config,
|
|
538
|
-
max_workers=max_workers,
|
|
539
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
540
|
-
max_concurrent=max_concurrent,
|
|
541
|
-
metadata={"dataset": source, "parallel": True},
|
|
542
|
-
max_steps=max_steps,
|
|
543
|
-
auto_respond=True,
|
|
544
|
-
)
|
|
545
|
-
else:
|
|
546
|
-
hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
|
|
547
|
-
return await run_dataset(
|
|
548
|
-
name=f"Evaluation {dataset_name}",
|
|
549
|
-
dataset=dataset_or_tasks,
|
|
550
|
-
agent_class=agent_class,
|
|
551
|
-
agent_config=agent_config,
|
|
552
|
-
max_concurrent=max_concurrent,
|
|
553
|
-
metadata={"dataset": source},
|
|
554
|
-
max_steps=max_steps,
|
|
555
|
-
)
|
|
741
|
+
# =============================================================================
|
|
742
|
+
# CLI command
|
|
743
|
+
# =============================================================================
|
|
556
744
|
|
|
557
745
|
|
|
558
746
|
def eval_command(
|
|
559
|
-
source: str = typer.Argument(
|
|
560
|
-
|
|
561
|
-
|
|
747
|
+
source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
|
|
748
|
+
agent: str | None = typer.Argument(
|
|
749
|
+
None,
|
|
750
|
+
help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test", # noqa: E501
|
|
562
751
|
),
|
|
752
|
+
all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
|
|
563
753
|
full: bool = typer.Option(
|
|
564
754
|
False,
|
|
565
755
|
"--full",
|
|
566
|
-
help="Run the entire dataset
|
|
756
|
+
help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
|
|
567
757
|
),
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
"--
|
|
571
|
-
help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
|
|
572
|
-
),
|
|
573
|
-
model: str | None = typer.Option(
|
|
574
|
-
None,
|
|
575
|
-
"--model",
|
|
576
|
-
help="Model name for the chosen agent",
|
|
758
|
+
model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
|
|
759
|
+
config: list[str] | None = typer.Option( # noqa: B008
|
|
760
|
+
None, "--config", "-c", help="Agent config: key=value"
|
|
577
761
|
),
|
|
762
|
+
# Task-overridable settings
|
|
578
763
|
allowed_tools: str | None = typer.Option(
|
|
579
|
-
None,
|
|
580
|
-
"--allowed-tools",
|
|
581
|
-
help="Comma-separated list of allowed tools",
|
|
764
|
+
None, "--allowed-tools", help="Comma-separated allowed tools"
|
|
582
765
|
),
|
|
583
|
-
|
|
584
|
-
|
|
585
|
-
"--max-concurrent",
|
|
586
|
-
help="Concurrency level for asyncio mode (ignored in parallel mode)",
|
|
766
|
+
disallowed_tools: str | None = typer.Option(
|
|
767
|
+
None, "--disallowed-tools", help="Comma-separated disallowed tools"
|
|
587
768
|
),
|
|
588
|
-
|
|
589
|
-
|
|
590
|
-
"--max-
|
|
591
|
-
help="Maximum steps per task (default: 10 for single, 50 for full)",
|
|
769
|
+
# Eval settings
|
|
770
|
+
max_concurrent: int | None = typer.Option(
|
|
771
|
+
None, "--max-concurrent", help="Max concurrent tasks"
|
|
592
772
|
),
|
|
593
|
-
|
|
773
|
+
max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
|
|
774
|
+
verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
|
|
775
|
+
very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
|
|
776
|
+
auto_respond: bool = typer.Option(
|
|
594
777
|
False,
|
|
595
|
-
"--
|
|
596
|
-
help="
|
|
778
|
+
"--auto-respond",
|
|
779
|
+
help="Automatically prompt the agent to continue if it does not respond with a tool call",
|
|
597
780
|
),
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
602
|
-
|
|
603
|
-
max_concurrent_per_worker: int = typer.Option(
|
|
604
|
-
20,
|
|
605
|
-
"--max-concurrent-per-worker",
|
|
606
|
-
help="Maximum concurrent tasks per worker in parallel mode",
|
|
781
|
+
group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
|
|
782
|
+
task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
|
|
783
|
+
yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
|
|
784
|
+
remote: bool = typer.Option(
|
|
785
|
+
False, "--remote", help="Submit tasks to platform for remote execution"
|
|
607
786
|
),
|
|
608
|
-
|
|
787
|
+
byok: bool = typer.Option(
|
|
609
788
|
False,
|
|
610
|
-
"--
|
|
611
|
-
help="
|
|
789
|
+
"--byok",
|
|
790
|
+
help="Remote only: use BYOK keys from encrypted env vars for inference",
|
|
612
791
|
),
|
|
613
|
-
|
|
614
|
-
False,
|
|
615
|
-
"--very-verbose",
|
|
616
|
-
"-vv",
|
|
617
|
-
help="Enable debug-level logs for maximum visibility",
|
|
792
|
+
quiet: bool = typer.Option(
|
|
793
|
+
False, "--quiet", "-q", help="Suppress opening browser for eval links"
|
|
618
794
|
),
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
"--vllm-base-url",
|
|
622
|
-
help="Base URL for vLLM server (when using --agent vllm)",
|
|
795
|
+
gateway: bool = typer.Option(
|
|
796
|
+
False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
|
|
623
797
|
),
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
"--group-size",
|
|
627
|
-
help="Number of times to run each task (similar to RL training)",
|
|
628
|
-
),
|
|
629
|
-
integration_test: bool = typer.Option(
|
|
630
|
-
False,
|
|
631
|
-
"--integration-test",
|
|
632
|
-
help=(
|
|
633
|
-
"Run integration_test_tool tool, where problem is setup, "
|
|
634
|
-
"actions are applied, and evaluation is performed, without "
|
|
635
|
-
"spinning up an agent"
|
|
636
|
-
),
|
|
798
|
+
taskset: str | None = typer.Option(
|
|
799
|
+
None, "--taskset", "-t", help="Taskset slug to associate job with"
|
|
637
800
|
),
|
|
638
801
|
) -> None:
|
|
639
802
|
"""🚀 Run evaluation on datasets or individual tasks with agents.
|
|
640
803
|
|
|
641
804
|
Examples:
|
|
642
|
-
|
|
643
|
-
hud eval hud-evals/SheetBench-50
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
hud eval
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
805
|
+
hud eval tasks.json claude
|
|
806
|
+
hud eval hud-evals/SheetBench-50 claude --full
|
|
807
|
+
hud eval tasks.json claude --config max_tokens=32768
|
|
808
|
+
hud eval tasks.json openai --config temperature=0.7
|
|
809
|
+
hud eval tasks.json claude --full --remote # Remote execution
|
|
810
|
+
hud eval tasks.json claude --gateway # Route LLM calls through HUD Gateway
|
|
811
|
+
"""
|
|
812
|
+
hud_console.info("🔧 Initializing evaluation...")
|
|
813
|
+
|
|
814
|
+
# Load config and merge CLI args
|
|
815
|
+
cfg = EvalConfig.load().merge_cli(
|
|
816
|
+
source=source,
|
|
817
|
+
agent=agent,
|
|
818
|
+
model=model,
|
|
819
|
+
all=all,
|
|
820
|
+
full=full,
|
|
821
|
+
max_concurrent=max_concurrent,
|
|
822
|
+
max_steps=max_steps,
|
|
823
|
+
allowed_tools=allowed_tools,
|
|
824
|
+
disallowed_tools=disallowed_tools,
|
|
825
|
+
task_ids=task_ids,
|
|
826
|
+
verbose=verbose,
|
|
827
|
+
very_verbose=very_verbose,
|
|
828
|
+
auto_respond=auto_respond,
|
|
829
|
+
group_size=group_size,
|
|
830
|
+
config=config,
|
|
831
|
+
remote=remote,
|
|
832
|
+
byok=byok,
|
|
833
|
+
quiet=quiet,
|
|
834
|
+
gateway=gateway,
|
|
835
|
+
taskset=taskset,
|
|
836
|
+
)
|
|
662
837
|
|
|
663
|
-
|
|
664
|
-
|
|
838
|
+
# Find source if not provided
|
|
839
|
+
if cfg.source is None:
|
|
840
|
+
try:
|
|
841
|
+
from hud.cli.utils.tasks import find_tasks_file
|
|
665
842
|
|
|
666
|
-
|
|
667
|
-
|
|
843
|
+
cfg = cfg.model_copy(
|
|
844
|
+
update={"source": find_tasks_file(None, msg="Select a tasks file")}
|
|
845
|
+
)
|
|
846
|
+
hud_console.success(f"Selected: {cfg.source}")
|
|
847
|
+
except Exception:
|
|
848
|
+
hud_console.error("No source provided and no task files found")
|
|
849
|
+
raise typer.Exit(1) from None
|
|
668
850
|
|
|
669
|
-
|
|
670
|
-
|
|
851
|
+
# Resolve agent interactively if needed
|
|
852
|
+
cfg = cfg.resolve_agent_interactive()
|
|
671
853
|
|
|
672
|
-
|
|
673
|
-
|
|
674
|
-
|
|
675
|
-
from hud.settings import settings
|
|
676
|
-
|
|
677
|
-
if very_verbose:
|
|
678
|
-
logging.basicConfig(
|
|
679
|
-
level=logging.DEBUG,
|
|
680
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
681
|
-
datefmt="%H:%M:%S",
|
|
682
|
-
)
|
|
854
|
+
# Configure logging
|
|
855
|
+
if cfg.very_verbose:
|
|
856
|
+
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(message)s")
|
|
683
857
|
logging.getLogger("hud.agents").setLevel(logging.DEBUG)
|
|
684
|
-
|
|
685
|
-
|
|
686
|
-
logging.
|
|
687
|
-
|
|
688
|
-
format="%(asctime)s - %(name)s - %(message)s",
|
|
689
|
-
datefmt="%H:%M:%S",
|
|
690
|
-
)
|
|
858
|
+
# Suppress noisy HTTP client logs
|
|
859
|
+
logging.getLogger("httpx").setLevel(logging.WARNING)
|
|
860
|
+
logging.getLogger("httpcore").setLevel(logging.WARNING)
|
|
861
|
+
elif cfg.verbose:
|
|
691
862
|
logging.getLogger("hud.agents").setLevel(logging.INFO)
|
|
692
|
-
logging.getLogger("hud.agents.base").setLevel(logging.INFO)
|
|
693
863
|
|
|
694
|
-
#
|
|
695
|
-
|
|
696
|
-
agent = "integration_test"
|
|
864
|
+
# Validate API keys
|
|
865
|
+
cfg.validate_api_keys()
|
|
697
866
|
|
|
698
|
-
#
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
"Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
|
|
704
|
-
)
|
|
705
|
-
raise typer.Exit(1)
|
|
706
|
-
elif agent == "openai" and not settings.openai_api_key:
|
|
707
|
-
hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
|
|
708
|
-
hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
|
|
867
|
+
# Display and confirm
|
|
868
|
+
cfg.display()
|
|
869
|
+
|
|
870
|
+
if not yes and not questionary.confirm("Proceed?", default=True, qmark="").ask():
|
|
871
|
+
hud_console.info("Cancelled.")
|
|
709
872
|
raise typer.Exit(1)
|
|
710
|
-
elif agent == "vllm":
|
|
711
|
-
if model:
|
|
712
|
-
hud_console.info(f"Using vLLM with model: {model}")
|
|
713
|
-
else:
|
|
714
|
-
hud_console.error("Model name is required for vLLM agent, specify with --model")
|
|
715
|
-
raise typer.Exit(1)
|
|
716
873
|
|
|
717
|
-
#
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
721
|
-
|
|
874
|
+
# Run
|
|
875
|
+
start_time = time.time()
|
|
876
|
+
try:
|
|
877
|
+
results, tasks = asyncio.run(_run_evaluation(cfg))
|
|
878
|
+
except ValueError as e:
|
|
879
|
+
hud_console.error(str(e))
|
|
880
|
+
raise typer.Exit(1) from None
|
|
881
|
+
elapsed = time.time() - start_time
|
|
722
882
|
|
|
723
|
-
|
|
724
|
-
|
|
725
|
-
[t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
|
|
726
|
-
)
|
|
883
|
+
if cfg.remote:
|
|
884
|
+
return
|
|
727
885
|
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
|
|
731
|
-
|
|
732
|
-
# Run evaluation
|
|
733
|
-
if full:
|
|
734
|
-
asyncio.run(
|
|
735
|
-
run_full_dataset(
|
|
736
|
-
source,
|
|
737
|
-
agent_type=agent,
|
|
738
|
-
model=model,
|
|
739
|
-
allowed_tools=allowed_tools_list,
|
|
740
|
-
max_concurrent=max_concurrent,
|
|
741
|
-
max_steps=max_steps,
|
|
742
|
-
parallel=parallel,
|
|
743
|
-
max_workers=max_workers,
|
|
744
|
-
max_concurrent_per_worker=max_concurrent_per_worker,
|
|
745
|
-
verbose=very_verbose or verbose,
|
|
746
|
-
vllm_base_url=vllm_base_url,
|
|
747
|
-
group_size=group_size,
|
|
748
|
-
)
|
|
749
|
-
)
|
|
750
|
-
else:
|
|
751
|
-
asyncio.run(
|
|
752
|
-
run_single_task(
|
|
753
|
-
source,
|
|
754
|
-
agent_type=agent,
|
|
755
|
-
model=model,
|
|
756
|
-
allowed_tools=allowed_tools_list,
|
|
757
|
-
max_steps=max_steps,
|
|
758
|
-
verbose=very_verbose or verbose,
|
|
759
|
-
vllm_base_url=vllm_base_url,
|
|
760
|
-
group_size=group_size,
|
|
761
|
-
)
|
|
762
|
-
)
|
|
886
|
+
from hud.datasets import display_results
|
|
887
|
+
|
|
888
|
+
display_results(results, tasks=tasks, elapsed=elapsed, show_details=len(results) <= 50)
|