hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
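The largest single change is the rewrite of hud/cli/eval.py, reproduced below: the ad-hoc build_agent / run_single_task / run_full_dataset trio is replaced by a pydantic EvalConfig that layers CLI arguments over .hud_eval.toml over defaults. A minimal sketch of how the new pieces compose (method names are taken from the diff; the exact call order inside eval_command is an assumption):

    from hud.cli.eval import EvalConfig

    # Load .hud_eval.toml, generating a commented template on first run (per EvalConfig.load)
    cfg = EvalConfig.load()
    # Non-None CLI values override the file; "model" and "max_steps" pass through **cli_args
    cfg = cfg.merge_cli(agent="claude", model="claude-sonnet-4-5", max_steps=50)
    # Prompts with _AGENT_PRESETS only if no agent is set yet
    cfg = cfg.resolve_agent_interactive()
    # Raises typer.Exit if the selected provider's keys are missing
    cfg.validate_api_keys()
    # kwargs for the selected agent class, gateway- and Bedrock-aware
    agent_kwargs = cfg.get_agent_kwargs()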
hud/cli/eval.py CHANGED
@@ -1,762 +1,876 @@
1
- """HUD evaluation command for running tasks and datasets."""
1
+ """HUD evaluation command for running tasks and datasets.
2
+
3
+ Config Override Order: CLI arguments > .hud_eval.toml > defaults
4
+ """
2
5
 
3
6
  from __future__ import annotations
4
7
 
5
8
  import asyncio
6
9
  import logging
10
+ import re
11
+ import time
12
+ import tomllib
13
+ from dataclasses import dataclass
7
14
  from pathlib import Path
8
- from typing import TYPE_CHECKING, Any, Literal
15
+ from typing import TYPE_CHECKING, Any, ClassVar
9
16
 
17
+ import questionary
10
18
  import typer
19
+ from pydantic import BaseModel, Field, field_validator
20
+ from rich import box
21
+ from rich.table import Table
11
22
 
12
- import hud
13
- from hud.cli.utils.env_check import ensure_built, find_environment_dir
14
23
  from hud.settings import settings
15
- from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
24
+ from hud.types import AgentType
25
+ from hud.utils.env import resolve_env_vars
16
26
  from hud.utils.hud_console import HUDConsole
17
27
 
28
+ # Pattern to detect AWS Bedrock inference profile ARNs
29
+ _BEDROCK_ARN_PATTERN = re.compile(r"^arn:aws:bedrock:[a-z0-9-]+:\d+:inference-profile/.+$")
30
+
31
+
32
+ def _is_bedrock_arn(model: str | None) -> bool:
33
+ """Check if a model string is a Bedrock inference profile ARN."""
34
+ return model is not None and bool(_BEDROCK_ARN_PATTERN.match(model))
35
+
36
+
18
37
  if TYPE_CHECKING:
19
- from hud.types import Task
38
+ from hud.agents.base import MCPAgent
39
+
20
40
  logger = logging.getLogger(__name__)
21
41
  hud_console = HUDConsole()
22
42
 
43
+ _CONFIG_PATH = ".hud_eval.toml"
23
44
 
24
- def get_available_models() -> list[dict[str, str | None]]:
25
- """Fetch available models from the HUD API (only ready models).
26
-
27
- Returns:
28
- List of dicts with 'name', 'vllm_url', and 'base_model' keys
29
- """
30
- try:
31
- from hud.cli.rl import rl_api
32
-
33
- hud_console.info("Fetching your models from https://hud.so/models")
34
- models = rl_api.list_models()
35
45
 
36
- # Filter for ready models only and sort by recency
37
- ready_models = [m for m in models if m.status == "ready"]
38
- ready_models.sort(key=lambda m: m.created_at or "", reverse=True)
46
+ @dataclass(frozen=True)
47
+ class AgentPreset:
48
+ """A preset agent configuration combining agent type, model, and optional config."""
39
49
 
40
- # Count other statuses for informational purposes
41
- training_count = sum(1 for m in models if m.status == "training")
42
- # other_count = len(models) - len(ready_models) - training_count
50
+ name: str
51
+ agent_type: AgentType
52
+ model: str | None = None
53
+ agent_config: dict[str, Any] | None = None
43
54
 
44
- if ready_models:
45
- hud_console.success(f"Found {len(ready_models)} ready models:")
46
- for model in ready_models:
47
- vllm_status = " (vLLM deployed)" if model.vllm_url else ""
48
- hud_console.info(f" ✅ {model.name}{vllm_status}")
49
55
 
50
- if training_count > 0:
51
- hud_console.info(f"\n({training_count} models currently training)")
56
+ # Built-in presets for the interactive picker
57
+ _AGENT_PRESETS: list[AgentPreset] = [
58
+ # Native agents (use provider SDKs directly)
59
+ AgentPreset("Claude Sonnet 4.5", AgentType.CLAUDE, "claude-sonnet-4-5"),
60
+ AgentPreset("GPT-5", AgentType.OPENAI, "gpt-5"),
61
+ AgentPreset("Operator (OpenAI Computer Use)", AgentType.OPERATOR, "computer-use-preview"),
62
+ AgentPreset("Gemini 3 Pro Preview", AgentType.GEMINI, "gemini-3-pro-preview"),
63
+ AgentPreset(
64
+ "Gemini CUA (Gemini Computer Use)",
65
+ AgentType.GEMINI_CUA,
66
+ "gemini-2.5-computer-use-preview",
67
+ ),
68
+ # HUD Gateway presets (models via HUD Inference API)
69
+ AgentPreset(
70
+ "Grok 4-1 Fast (xAI)",
71
+ AgentType.OPENAI_COMPATIBLE,
72
+ "grok-4-1-fast",
73
+ {
74
+ "openai_compatible": {
75
+ "base_url": settings.hud_gateway_url,
76
+ "model_name": "Grok 4-1 Fast",
77
+ }
78
+ },
79
+ ),
80
+ AgentPreset(
81
+ "GLM-4.5V (Z-AI)",
82
+ AgentType.OPENAI_COMPATIBLE,
83
+ "z-ai/glm-4.5v",
84
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.5V"}},
85
+ ),
86
+ ]
87
+
88
+ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
89
+ # Command-line arguments override these settings
90
+
91
+ [eval]
92
+ # source = "hud-evals/SheetBench-50"
93
+ # agent = "claude"
94
+ # all = false # Run all problems instead of just 1
95
+ # max_concurrent = 30
96
+ # max_steps = 10
97
+ # group_size = 1
98
+ # byok = false # Remote only; use encrypted env vars on the platform.
99
+ # task_ids = ["task_1", "task_2"]
100
+ # verbose = true
101
+ # very_verbose = true
102
+ # auto_respond = true
103
+ # gateway = false # Route LLM API calls through HUD Gateway
104
+
105
+ [agent]
106
+ # allowed_tools = ["computer", "playwright"]
107
+ # disallowed_tools = []
108
+
109
+ [claude]
110
+ # model = "claude-sonnet-4-5"
111
+ # max_tokens = 16384
112
+ # use_computer_beta = true
113
+
114
+ [openai]
115
+ # model = "gpt-4o"
116
+ # temperature = 0.7
117
+ # max_output_tokens = 4096
118
+
119
+ [gemini]
120
+ # model = "gemini-2.5-pro"
121
+ # temperature = 1.0
122
+ # top_p = 0.95
123
+
124
+ [gemini_cua]
125
+ # model = "gemini-2.5-computer-use-preview"
126
+ # temperature = 1.0
127
+ # top_p = 0.95
128
+ # excluded_predefined_functions = []
129
+
130
+ [openai_compatible]
131
+ # base_url = "http://localhost:8000/v1"
132
+ # model = "my-model"
133
+ """
134
+
135
+ # Agent type -> (settings attr, env var name)
136
+ _API_KEY_REQUIREMENTS: dict[AgentType, tuple[str, str]] = {
137
+ AgentType.CLAUDE: ("anthropic_api_key", "ANTHROPIC_API_KEY"),
138
+ AgentType.GEMINI: ("gemini_api_key", "GEMINI_API_KEY"),
139
+ AgentType.GEMINI_CUA: ("gemini_api_key", "GEMINI_API_KEY"),
140
+ AgentType.OPENAI: ("openai_api_key", "OPENAI_API_KEY"),
141
+ AgentType.OPERATOR: ("openai_api_key", "OPENAI_API_KEY"),
142
+ }
143
+
144
+
145
+ class EvalConfig(BaseModel):
146
+ """Configuration for hud eval command."""
147
+
148
+ # Class-level registry
149
+ _agent_classes: ClassVar[dict[AgentType, type["MCPAgent"]]] = {}
150
+
151
+ # Fields loaded from [eval] section
152
+ _EVAL_FIELDS: ClassVar[set[str]] = {
153
+ "source",
154
+ "agent_type",
155
+ "task_ids",
156
+ "all",
157
+ "max_concurrent",
158
+ "max_steps",
159
+ "verbose",
160
+ "very_verbose",
161
+ "group_size",
162
+ "byok",
163
+ "remote",
164
+ "auto_respond",
165
+ "quiet",
166
+ "gateway",
167
+ }
168
+ # Fields loaded from [agent] section
169
+ _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
170
+
171
+ # Eval settings
172
+ source: str | None = None
173
+ agent_type: AgentType | None = None
174
+ model: str | None = None
175
+ task_ids: list[str] | None = None
176
+ all: bool = False # Run all problems instead of just 1
177
+ max_concurrent: int = 30
178
+ max_steps: int = 10
179
+ verbose: bool = False
180
+ very_verbose: bool = False
181
+ auto_respond: bool | None = None # Continue without prompting
182
+ group_size: int = 1
183
+ byok: bool = False
184
+ remote: bool = False
185
+ quiet: bool = False # Suppress opening browser for eval links
186
+ gateway: bool = False # Use HUD Gateway for LLM API calls
187
+
188
+ # Base agent config (these merge with task's agent_config)
189
+ allowed_tools: list[str] | None = None
190
+ disallowed_tools: list[str] | None = None
191
+
192
+ agent_config: dict[str, Any] = Field(default_factory=dict)
193
+
194
+ @field_validator("agent_type", mode="before")
195
+ @classmethod
196
+ def _parse_agent_type(cls, v: Any) -> AgentType | None:
197
+ """Convert string agent name to AgentType enum."""
198
+ if v is None:
199
+ return None
200
+ if isinstance(v, AgentType):
201
+ return v
202
+ if isinstance(v, str):
203
+ try:
204
+ return AgentType(v)
205
+ except ValueError:
206
+ valid = [e.value for e in AgentType]
207
+ raise ValueError(
208
+ f"Invalid agent: {v}. Must be one of: {', '.join(valid)}"
209
+ ) from None
210
+ return v
211
+
212
+ def validate_api_keys(self) -> None:
213
+ """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
214
+ # BYOK requires remote execution (check before agent_type guard)
215
+ if self.byok and not self.remote:
216
+ hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
217
+ raise typer.Exit(1)
52
218
 
53
- return [
54
- {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model}
55
- for model in ready_models
56
- ]
57
- else:
58
- if training_count > 0:
59
- hud_console.warning(
60
- f"No ready models found. You have {training_count} models currently training."
219
+ if self.agent_type is None:
220
+ return
221
+
222
+ if self.remote:
223
+ if not settings.api_key:
224
+ hud_console.error("HUD_API_KEY is required for remote execution")
225
+ hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
226
+ raise typer.Exit(1)
227
+ return
228
+
229
+ # Gateway mode only requires HUD_API_KEY
230
+ if self.gateway:
231
+ if not settings.api_key:
232
+ hud_console.error("HUD_API_KEY is required for gateway mode")
233
+ hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
234
+ raise typer.Exit(1)
235
+ return
236
+
237
+ if self.agent_type == AgentType.OPENAI_COMPATIBLE:
238
+ # Check both CLI --model and config file model
239
+ config_model = self.agent_config.get("openai_compatible", {}).get("model")
240
+ if not self.model and not config_model:
241
+ hud_console.error(
242
+ "Model name is required for OpenAI compatible agent. "
243
+ "Use --model or set model in [openai_compatible] section of .hud_eval.toml"
61
244
  )
62
- else:
63
- hud_console.warning("No models found in your account.")
64
- return []
65
- except Exception as e:
66
- hud_console.debug(f"Error fetching models: {e}")
67
- # Don't show the error to the user, just proceed without HUD models
68
- return []
69
-
70
-
71
- def build_agent(
72
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
73
- *,
74
- model: str | None = None,
75
- allowed_tools: list[str] | None = None,
76
- verbose: bool = False,
77
- vllm_base_url: str | None = None,
78
- ) -> Any:
79
- """Create and return the requested agent type."""
80
-
81
- # Import agents lazily to avoid dependency issues
82
- if agent_type == "integration_test":
83
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
84
-
85
- return IntegrationTestRunner(verbose=verbose)
86
- elif agent_type == "vllm":
87
- # Create a generic OpenAI agent for vLLM server
88
- try:
89
- from openai import AsyncOpenAI
90
-
91
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
92
- except ImportError as e:
93
- hud_console.error(
94
- "OpenAI dependencies are not installed. "
95
- "Please install with: pip install 'hud-python[agent]'"
96
- )
97
- raise typer.Exit(1) from e
98
-
99
- # Determine the base URL to use
100
- if vllm_base_url is not None:
101
- # Use the provided vLLM URL (for custom/local servers)
102
- base_url = vllm_base_url
103
- hud_console.info(f"Using vLLM server at {base_url}")
104
- api_key = (
105
- settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
245
+ raise typer.Exit(1)
246
+ elif self.agent_type == AgentType.CLAUDE and _is_bedrock_arn(self.model):
247
+ missing_aws = (
248
+ not settings.aws_access_key_id
249
+ or not settings.aws_secret_access_key
250
+ or not settings.aws_region
106
251
  )
107
- else:
108
- # Default to localhost
109
- base_url = "http://localhost:8000/v1"
110
- api_key = "token-abc123"
111
-
112
- # Create OpenAI client for vLLM
113
- openai_client = AsyncOpenAI(
114
- base_url=base_url,
115
- api_key=api_key,
116
- timeout=30.0,
117
- )
118
-
119
- return GenericOpenAIChatAgent(
120
- openai_client=openai_client,
121
- model_name=model or "served-model", # Default model name
122
- verbose=verbose,
123
- completion_kwargs={
124
- "temperature": 0.7,
125
- "max_tokens": 2048,
126
- "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto",
127
- },
252
+ if missing_aws:
253
+ hud_console.error(
254
+ "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
255
+ "are required for AWS Bedrock"
256
+ )
257
+ raise typer.Exit(1)
258
+ elif self.agent_type in _API_KEY_REQUIREMENTS:
259
+ attr, env_var = _API_KEY_REQUIREMENTS[self.agent_type]
260
+ if not getattr(settings, attr, None):
261
+ hud_console.error(f"{env_var} is required for {self.agent_type.value} agent")
262
+ hud_console.info(f"Set it: hud set {env_var}=your-key-here")
263
+ raise typer.Exit(1)
264
+
265
+ if not settings.api_key:
266
+ hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
267
+
268
+ def get_agent_kwargs(self) -> dict[str, Any]:
269
+ """Build agent kwargs from config.
270
+
271
+ Model precedence:
272
+ 1. CLI --model (highest priority)
273
+ 2. [agent_type].model in TOML (per-agent config)
274
+ """
275
+ if self.agent_type is None:
276
+ raise ValueError("agent_type must be set before calling get_agent_kwargs()")
277
+
278
+ kwargs: dict[str, Any] = {}
279
+
280
+ if self.allowed_tools:
281
+ kwargs["allowed_tools"] = self.allowed_tools
282
+ if self.disallowed_tools:
283
+ kwargs["disallowed_tools"] = self.disallowed_tools
284
+
285
+ # Apply agent-specific config
286
+ agent_key = self.agent_type.value
287
+ if agent_key in self.agent_config:
288
+ agent_cfg = dict(self.agent_config[agent_key])
289
+ kwargs.update(agent_cfg)
290
+
291
+ # CLI --model always wins
292
+ if self.model:
293
+ kwargs["model"] = self.model
294
+
295
+ # For gateway base_url, inject HUD API key if not already set
296
+ if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
297
+ base_url = kwargs.get("base_url", "")
298
+ if settings.hud_gateway_url in base_url and settings.api_key:
299
+ kwargs["api_key"] = settings.api_key
300
+
301
+ # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
302
+ # Check both model and checkpoint_name for ARN patterns
303
+ bedrock_arn_detected = _is_bedrock_arn(kwargs.get("model")) or _is_bedrock_arn(
304
+ kwargs.get("checkpoint_name")
128
305
  )
129
-
130
- elif agent_type == "openai":
131
- try:
132
- from hud.agents import OperatorAgent
133
- except ImportError as e:
134
- hud_console.error(
135
- "OpenAI agent dependencies are not installed. "
136
- "Please install with: pip install 'hud-python[agent]'"
306
+ if self.agent_type == AgentType.CLAUDE and bedrock_arn_detected:
307
+ missing_aws = (
308
+ not settings.aws_access_key_id
309
+ or not settings.aws_secret_access_key
310
+ or not settings.aws_region
137
311
  )
138
- raise typer.Exit(1) from e
312
+ if missing_aws:
313
+ hud_console.error(
314
+ "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
315
+ "are required for AWS Bedrock"
316
+ )
317
+ raise typer.Exit(1)
139
318
 
140
- if allowed_tools:
141
- return OperatorAgent(
142
- allowed_tools=allowed_tools,
143
- verbose=verbose,
144
- )
145
- else:
146
- return OperatorAgent(verbose=verbose)
319
+ from anthropic import AsyncAnthropicBedrock
147
320
 
148
- elif agent_type == "litellm":
149
- try:
150
- from hud.agents.lite_llm import LiteAgent
151
- except ImportError as e:
152
- hud_console.error(
153
- "LiteLLM agent dependencies are not installed. "
154
- "Please install with: pip install 'hud-python[agent]'"
321
+ kwargs["model_client"] = AsyncAnthropicBedrock(
322
+ aws_access_key=settings.aws_access_key_id,
323
+ aws_secret_key=settings.aws_secret_access_key,
324
+ aws_region=settings.aws_region or "us-east-1",
155
325
  )
156
- raise typer.Exit(1) from e
157
-
158
- return LiteAgent(
159
- model_name=model or "gpt-4o-mini",
160
- allowed_tools=allowed_tools,
161
- verbose=verbose,
162
- )
163
-
164
- # Fallback Claude agent (Anthropic)
165
- try:
166
- from hud.agents import ClaudeAgent
167
- except ImportError as e:
168
- hud_console.error(
169
- "Claude agent dependencies are not installed. "
170
- "Please install with: pip install 'hud-python[agent]'"
171
- )
172
- raise typer.Exit(1) from e
173
-
174
- model = model or "claude-sonnet-4-20250514"
175
-
176
- if allowed_tools:
177
- return ClaudeAgent(
178
- model=model,
179
- allowed_tools=allowed_tools,
180
- verbose=verbose,
181
- )
182
- else:
183
- return ClaudeAgent(
184
- model=model,
185
- verbose=verbose,
186
- )
326
+ hud_console.info("🔧 Using AWS Bedrock (detected ARN in model)")
327
+
328
+ kwargs["verbose"] = self.verbose or self.very_verbose
329
+
330
+ if self.agent_type in (
331
+ AgentType.CLAUDE,
332
+ AgentType.OPENAI,
333
+ AgentType.OPERATOR,
334
+ AgentType.GEMINI,
335
+ AgentType.GEMINI_CUA,
336
+ ):
337
+ kwargs["validate_api_key"] = False
338
+
339
+ # Configure gateway mode - route LLM API calls through HUD gateway
340
+ if self.gateway:
341
+ hud_api_key = settings.api_key
342
+ if not hud_api_key:
343
+ raise typer.Exit(1) # Already validated in validate_api_keys()
344
+
345
+ if self.agent_type == AgentType.CLAUDE:
346
+ from anthropic import AsyncAnthropic
347
+
348
+ kwargs["model_client"] = AsyncAnthropic(
349
+ api_key=hud_api_key,
350
+ base_url=settings.hud_gateway_url,
351
+ )
352
+ hud_console.info("🌐 Using HUD Gateway for Claude API")
353
+ elif self.agent_type in (AgentType.OPENAI, AgentType.OPERATOR):
354
+ from openai import AsyncOpenAI
187
355
 
356
+ kwargs["model_client"] = AsyncOpenAI(
357
+ api_key=hud_api_key,
358
+ base_url=settings.hud_gateway_url,
359
+ )
360
+ hud_console.info("🌐 Using HUD Gateway for OpenAI API")
361
+ elif self.agent_type == AgentType.OPENAI_COMPATIBLE:
362
+ from openai import AsyncOpenAI
188
363
 
189
- async def run_single_task(
190
- source: str,
191
- *,
192
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
193
- model: str | None = None,
194
- allowed_tools: list[str] | None = None,
195
- max_steps: int = 10,
196
- verbose: bool = False,
197
- vllm_base_url: str | None = None,
198
- group_size: int = 1,
199
- ) -> None:
200
- """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
364
+ kwargs["openai_client"] = AsyncOpenAI(
365
+ api_key=hud_api_key,
366
+ base_url=settings.hud_gateway_url,
367
+ )
368
+ hud_console.info("🌐 Using HUD Gateway for OpenAI-compatible API")
369
+ elif self.agent_type in (AgentType.GEMINI, AgentType.GEMINI_CUA):
370
+ from google import genai
371
+ from google.genai.types import HttpOptions
372
+
373
+ kwargs["model_client"] = genai.Client(
374
+ api_key="PLACEHOLDER",
375
+ http_options=HttpOptions(
376
+ api_version="v1beta",
377
+ base_url=settings.hud_gateway_url,
378
+ headers={"Authorization": f"Bearer {hud_api_key}"},
379
+ ),
380
+ )
381
+ hud_console.info("🌐 Using HUD Gateway for Gemini API")
201
382
 
202
- # Import Task and run_dataset lazily
203
- try:
204
- from hud.utils.tasks import load_tasks
205
- except ImportError as e:
206
- hud_console.error(
207
- "Dataset dependencies are not installed. "
208
- "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
209
- )
210
- raise typer.Exit(1) from e
383
+ return kwargs
211
384
 
212
- path = Path(source)
213
- if path.exists() and (path.suffix in [".json", ".jsonl"]):
214
- hud_console.info("📊 Loading task file")
215
- tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment]
385
+ @classmethod
386
+ def load(cls, path: str = _CONFIG_PATH) -> EvalConfig:
387
+ """Load config from TOML file."""
388
+ p = Path(path)
389
+ if not p.exists():
390
+ p.write_text(_DEFAULT_CONFIG_TEMPLATE)
391
+ hud_console.info(f"Generated {_CONFIG_PATH}")
392
+ return cls()
216
393
 
217
- # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
218
394
  try:
219
- env_dir = find_environment_dir(path)
220
- if env_dir is not None:
221
- # Non-interactive for eval; warn but don't block
222
- ensure_built(env_dir, interactive=False)
395
+ with open(p, "rb") as f:
396
+ toml_data = tomllib.load(f)
223
397
  except Exception as e:
224
- hud_console.debug(f"Eval preflight env check skipped: {e}")
398
+ hud_console.warning(f"Failed to parse {path}: {e}")
399
+ return cls()
225
400
 
226
- # Single task - use the first (and only) task
227
- task = tasks[0]
228
- hud_console.info("Found 1 task, running as single task…")
401
+ toml_data = resolve_env_vars(toml_data)
229
402
 
230
- else:
231
- # Load from HuggingFace dataset or non-file source
232
- hud_console.info(f"📊 Loading tasks from: {source}…")
233
- tasks: list[Task] = load_tasks(source) # type: ignore[assignment]
403
+ # Extract sections
404
+ eval_section = toml_data.get("eval", {})
405
+ agent_section = toml_data.get("agent", {})
234
406
 
235
- if not tasks:
236
- hud_console.error(f"No tasks found in: {source}")
237
- raise typer.Exit(1)
407
+ # Build config data
408
+ data: dict[str, Any] = {}
238
409
 
239
- # Single task - use the first task
240
- task = tasks[0]
241
- hud_console.info(
242
- "Using first task from dataset (run with --full to run the entire dataset)..."
243
- )
410
+ # Eval settings (map 'agent' -> 'agent_type')
411
+ if "agent" in eval_section:
412
+ data["agent_type"] = eval_section["agent"]
413
+ for key in cls._EVAL_FIELDS:
414
+ if key in eval_section:
415
+ data[key] = eval_section[key]
244
416
 
245
- task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
246
-
247
- # Use grouped evaluation if group_size > 1
248
- agent_config: dict[str, Any] = {}
249
- if agent_type == "integration_test":
250
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
251
-
252
- agent_class = IntegrationTestRunner
253
- agent_config = {"verbose": verbose}
254
- if allowed_tools:
255
- agent_config["allowed_tools"] = allowed_tools
256
- elif agent_type == "vllm":
257
- # Special handling for vLLM
258
- sample_agent = build_agent(
259
- agent_type,
260
- model=model,
261
- allowed_tools=allowed_tools,
262
- verbose=verbose,
263
- vllm_base_url=vllm_base_url,
264
- )
265
- agent_config = {
266
- "openai_client": sample_agent.oai,
267
- "model_name": sample_agent.model_name,
268
- "verbose": verbose,
269
- "completion_kwargs": sample_agent.completion_kwargs,
270
- }
271
- if allowed_tools:
272
- agent_config["allowed_tools"] = allowed_tools
273
-
274
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
275
-
276
- agent_class = GenericOpenAIChatAgent
277
- elif agent_type == "openai":
278
- from hud.agents import OperatorAgent
279
-
280
- agent_class = OperatorAgent
281
- agent_config = {"verbose": verbose}
282
- if allowed_tools:
283
- agent_config["allowed_tools"] = allowed_tools
284
- elif agent_type == "litellm":
285
- from hud.agents.lite_llm import LiteAgent
286
-
287
- agent_class = LiteAgent
288
- agent_config = {
289
- "model_name": model or "gpt-4o-mini",
290
- "verbose": verbose,
291
- }
292
- if allowed_tools:
293
- agent_config["allowed_tools"] = allowed_tools
294
- elif agent_type == "claude":
295
- from hud.agents import ClaudeAgent
296
-
297
- agent_class = ClaudeAgent
298
- agent_config = {
299
- "model": model or "claude-sonnet-4-20250514",
300
- "verbose": verbose,
301
- }
302
- if allowed_tools:
303
- agent_config["allowed_tools"] = allowed_tools
304
- else:
305
- raise ValueError(f"Invalid agent type: {agent_type}")
306
-
307
- if group_size > 1:
308
- hud_console.info(f"🔄 Running task with group_size={group_size}")
309
- # Run with grouping
310
- stats = await run_tasks_grouped(
311
- tasks=[task],
312
- agent_class=agent_class,
313
- agent_config=agent_config,
314
- group_size=group_size,
315
- max_parallel_episodes=48, # Same as RL default
316
- max_steps=max_steps,
317
- verbose=verbose,
318
- )
319
- display_group_statistics(stats, show_details=True)
320
- else:
321
- # Original single-run logic
322
- with hud.trace(name=task_prompt):
323
- agent = build_agent(
324
- agent_type,
325
- model=model,
326
- allowed_tools=allowed_tools,
327
- verbose=verbose,
328
- vllm_base_url=vllm_base_url,
329
- )
330
- hud_console.info(task.prompt)
331
- result = await agent.run(task, max_steps=max_steps)
332
- hud_console.success(f"Reward: {result.reward}")
333
-
334
-
335
- async def run_full_dataset(
336
- source: str,
337
- *,
338
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
339
- model: str | None = None,
340
- allowed_tools: list[str] | None = None,
341
- max_concurrent: int = 30,
342
- max_steps: int = 10,
343
- parallel: bool = False,
344
- max_workers: int | None = None,
345
- max_concurrent_per_worker: int = 25,
346
- verbose: bool = False,
347
- vllm_base_url: str | None = None,
348
- group_size: int = 1,
349
- ) -> list[Any]:
350
- """Run evaluation across the entire dataset.
351
-
352
- Uses either asyncio-based run_dataset or process-based parallel execution
353
- depending on the parallel flag."""
354
-
355
- # Import run_dataset lazily
356
- try:
357
- from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
358
- from hud.utils.tasks import load_tasks
359
- except ImportError as e:
360
- hud_console.error(
361
- "Dataset dependencies are not installed. "
362
- "Please install with: pip install 'hud-python[agent]'"
363
- )
364
- raise typer.Exit(1) from e
365
-
366
- # Load tasks using unified loader
367
- hud_console.info(f"📊 Loading tasks from: {source}…")
368
- tasks: list[Task] = load_tasks(source) # type: ignore[assignment]
369
-
370
- if not tasks:
371
- hud_console.error(f"No tasks found in: {source}")
372
- raise typer.Exit(1)
417
+ # Agent base config
418
+ for key in cls._AGENT_FIELDS:
419
+ if key in agent_section:
420
+ data[key] = agent_section[key]
373
421
 
374
- # Convert Task objects to dicts for dataset runners
375
- dataset_or_tasks = [task.model_dump() for task in tasks]
422
+ # Agent-specific configs (claude, openai, gemini, etc.)
423
+ agent_config: dict[str, Any] = {}
424
+ for agent_type in AgentType:
425
+ if agent_type.value in toml_data:
426
+ agent_config[agent_type.value] = toml_data[agent_type.value]
427
+ data["agent_config"] = agent_config
376
428
 
377
- # Determine dataset name
378
- path = Path(source)
379
- dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
380
-
381
- # Build agent class + config for run_dataset
382
- if agent_type == "integration_test": # --integration-test mode
383
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
384
-
385
- agent_class = IntegrationTestRunner
386
- agent_config = {"verbose": verbose}
387
- elif agent_type == "vllm":
388
429
  try:
389
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
390
-
391
- agent_class = GenericOpenAIChatAgent
392
- except ImportError as e:
393
- hud_console.error(
394
- "OpenAI dependencies are not installed. "
395
- "Please install with: pip install 'hud-python[agent]'"
430
+ return cls.model_validate(data)
431
+ except Exception as e:
432
+ hud_console.warning(f"Invalid config: {e}")
433
+ return cls()
434
+
435
+ def merge_cli(
436
+ self,
437
+ agent: str | None = None,
438
+ config: list[str] | None = None,
439
+ allowed_tools: str | None = None,
440
+ disallowed_tools: str | None = None,
441
+ task_ids: str | None = None,
442
+ **cli_args: Any,
443
+ ) -> EvalConfig:
444
+ """Merge CLI args (non-None values override config)."""
445
+ overrides: dict[str, Any] = {}
446
+
447
+ if agent is not None:
448
+ overrides["agent_type"] = agent
449
+
450
+ # Parse comma-separated lists
451
+ if allowed_tools is not None:
452
+ overrides["allowed_tools"] = [t.strip() for t in allowed_tools.split(",") if t.strip()]
453
+ if disallowed_tools is not None:
454
+ overrides["disallowed_tools"] = [
455
+ t.strip() for t in disallowed_tools.split(",") if t.strip()
456
+ ]
457
+ if task_ids is not None:
458
+ overrides["task_ids"] = [t.strip() for t in task_ids.split(",") if t.strip()]
459
+
460
+ overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
461
+
462
+ for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
463
+ if cli_args.get(k) is True:
464
+ overrides[k] = True
465
+ elif k in overrides and cli_args.get(k) is False:
466
+ del overrides[k]
467
+
468
+ # --full is a shortcut for --all --auto-respond --max-steps 100
469
+ if overrides.get("full"):
470
+ overrides["all"] = True
471
+ if "auto_respond" not in overrides:
472
+ overrides["auto_respond"] = True
473
+ if "max_steps" not in overrides:
474
+ overrides["max_steps"] = 100
475
+
476
+ if config:
477
+ merged_agent_config = dict(self.agent_config)
478
+ for item in config:
479
+ if "=" in item:
480
+ key, value = item.split("=", 1)
481
+ key = key.strip()
482
+ value = value.strip()
483
+
484
+ # Parse value
485
+ if value.lower() == "true":
486
+ parsed_value: Any = True
487
+ elif value.lower() == "false":
488
+ parsed_value = False
489
+ else:
490
+ try:
491
+ parsed_value = int(value)
492
+ except ValueError:
493
+ try:
494
+ parsed_value = float(value)
495
+ except ValueError:
496
+ parsed_value = value
497
+
498
+ # Handle namespaced keys (e.g., claude.max_tokens)
499
+ if "." in key:
500
+ agent_name, param = key.split(".", 1)
501
+ if agent_name not in merged_agent_config:
502
+ merged_agent_config[agent_name] = {}
503
+ merged_agent_config[agent_name][param] = parsed_value
504
+ else:
505
+ # Non-namespaced: apply to current agent if set
506
+ if self.agent_type:
507
+ agent_name = self.agent_type.value
508
+ if agent_name not in merged_agent_config:
509
+ merged_agent_config[agent_name] = {}
510
+ merged_agent_config[agent_name][key] = parsed_value
511
+
512
+ overrides["agent_config"] = merged_agent_config
513
+
514
+ return self.model_validate({**self.model_dump(), **overrides})
515
+
516
+ def resolve_agent_interactive(self) -> EvalConfig:
517
+ """Prompt user to select an agent preset if not set. Returns updated config."""
518
+ if self.agent_type is not None:
519
+ return self
520
+
521
+ # Build choices from presets
522
+ choices: list[dict[str, Any]] = [
523
+ {"name": preset.name, "value": preset} for preset in _AGENT_PRESETS
524
+ ]
525
+
526
+ selected: AgentPreset = hud_console.select("Select an agent:", choices=choices, default=0) # type: ignore[arg-type]
527
+
528
+ # Merge preset into config
529
+ updates: dict[str, Any] = {"agent_type": selected.agent_type}
530
+ if selected.model:
531
+ updates["model"] = selected.model
532
+ if selected.agent_config:
533
+ # Merge preset's agent_config with existing
534
+ merged = dict(self.agent_config)
535
+ for key, value in selected.agent_config.items():
536
+ if key in merged:
537
+ merged[key] = {**merged[key], **value}
538
+ else:
539
+ merged[key] = value
540
+ updates["agent_config"] = merged
541
+
542
+ return self.model_validate({**self.model_dump(), **updates})
543
+
544
+ def display(self) -> None:
545
+ """Display settings in a table."""
546
+ table = Table(title="Evaluation Settings", title_style="bold cyan", box=box.ROUNDED)
547
+ table.add_column("Setting", style="yellow")
548
+ table.add_column("Value", style="green")
549
+
550
+ # Core settings
551
+ table.add_row("source", str(self.source or "—"))
552
+ table.add_row("agent", self.agent_type.value) # type: ignore[union-attr]
553
+ if self.task_ids:
554
+ table.add_row(
555
+ "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
396
556
  )
397
- raise typer.Exit(1) from e
398
-
399
- # Use build_agent to create a sample agent to get the config
400
- sample_agent = build_agent(
401
- agent_type,
402
- model=model,
403
- allowed_tools=allowed_tools,
404
- verbose=verbose,
405
- vllm_base_url=vllm_base_url,
406
- )
557
+ table.add_row("all", str(self.all))
558
+ table.add_row("max_steps", str(self.max_steps))
559
+ if not self.remote:
560
+ table.add_row("max_concurrent", str(self.max_concurrent))
561
+ if self.group_size > 1:
562
+ table.add_row("group_size", str(self.group_size))
563
+ if self.auto_respond:
564
+ table.add_row("auto_respond", "[bold green]True[/bold green]")
565
+ if self.very_verbose:
566
+ table.add_row("very_verbose", "[bold green]True[/bold green]")
567
+ elif self.verbose:
568
+ table.add_row("verbose", "[bold green]True[/bold green]")
569
+ if self.remote:
570
+ table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
571
+ if self.gateway:
572
+ table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
573
+ if self.byok:
574
+ table.add_row("byok", "[bold green]True[/bold green] (remote only)")
575
+
576
+ # Tool filters (only if set)
577
+ if self.allowed_tools:
578
+ table.add_row("allowed_tools", ", ".join(self.allowed_tools))
579
+ if self.disallowed_tools:
580
+ table.add_row("disallowed_tools", ", ".join(self.disallowed_tools))
581
+
582
+ # Agent config section
583
+ if self.agent_type:
584
+ table.add_row("", "")
585
+ table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
586
+
587
+ config_cls = self.agent_type.cls.config_cls
588
+ defaults = config_cls()
589
+ overrides = self.agent_config.get(self.agent_type.value, {})
590
+ skip = {
591
+ "model_client",
592
+ "model_name",
593
+ "validate_api_key",
594
+ "model_config",
595
+ "allowed_tools",
596
+ "disallowed_tools",
597
+ "system_prompt",
598
+ "response_tool_name",
599
+ "append_setup_output",
600
+ "initial_screenshot",
601
+ }
602
+
603
+ sensitive_fields = {"api_key", "api_secret", "token", "password", "secret"}
604
+
605
+ for name in config_cls.model_fields:
606
+ if name in skip:
607
+ continue
608
+ # Always show model
609
+ if name == "model":
610
+ if self.model:
611
+ value = self.model
612
+ elif overrides.get("model"):
613
+ value = overrides["model"]
614
+ else:
615
+ value = getattr(defaults, "model", None)
616
+ table.add_row(" model", str(value) if value else "—")
617
+ elif name in overrides:
618
+ value = overrides[name]
619
+ if name in sensitive_fields and value:
620
+ display_value = f"{str(value)[:4]}****" if len(str(value)) > 4 else "****"
621
+ else:
622
+ display_value = str(value)
623
+ table.add_row(f" {name}", display_value)
624
+
625
+ hud_console.console.print(table)
626
+
627
+
628
+ # =============================================================================
629
+ # Evaluation runner
630
+ # =============================================================================
631
+
632
+
633
+ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
634
+ """Run evaluation with the given config using run_dataset()."""
635
+ from hud.datasets import load_tasks, run_dataset
636
+
637
+ if cfg.source is None or cfg.agent_type is None:
638
+ raise ValueError("source and agent_type must be set")
639
+
640
+ # Load tasks using unified loader (handles v4→v5 conversion automatically)
641
+ hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
642
+ tasks = load_tasks(cfg.source)
407
643
 
408
- # Extract the config from the sample agent
409
- agent_config: dict[str, Any] = {
410
- "openai_client": sample_agent.oai,
411
- "model_name": sample_agent.model_name,
412
- "verbose": verbose,
413
- "completion_kwargs": sample_agent.completion_kwargs,
414
- }
415
- if allowed_tools:
416
- agent_config["allowed_tools"] = allowed_tools
417
- elif agent_type == "openai":
418
- try:
419
- from hud.agents import OperatorAgent
644
+ if not tasks:
645
+ hud_console.error(f"No tasks found in: {cfg.source}")
646
+ raise typer.Exit(1)
420
647
 
421
- agent_class = OperatorAgent
422
- except ImportError as e:
423
- hud_console.error(
424
- "OpenAI agent dependencies are not installed. "
425
- "Please install with: pip install 'hud-python[agent]'"
426
- )
427
- raise typer.Exit(1) from e
648
+ # Filter by task IDs if provided
649
+ if cfg.task_ids:
650
+ id_set = set(cfg.task_ids)
651
+ # Match by task.id or index
652
+ filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
653
+ if not filtered:
654
+ hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
655
+ raise typer.Exit(1)
656
+ hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
657
+ tasks = filtered
658
+ elif not cfg.all:
659
+ # Single task mode (no --all, --full, or --task-ids)
660
+ tasks = [tasks[0]]
661
+ hud_console.info("Using first task (run with --full or --task-ids for more)…")
662
+
663
+ hud_console.info(f"Loaded {len(tasks)} task(s)")
664
+
665
+ # Prepare agent kwargs
666
+ agent_kwargs = cfg.get_agent_kwargs()
667
+ auto_respond = cfg.auto_respond
668
+ if auto_respond:
669
+ agent_kwargs = {**agent_kwargs, "auto_respond": True}
670
+
671
+ max_steps = cfg.max_steps
672
+
673
+ # Remote execution - submit to HUD platform
674
+ if cfg.remote:
675
+ agent_kwargs = {
676
+ k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
677
+ }
678
+ # Create a job ID for tracking
679
+ import uuid
428
680
 
429
- agent_config = {"verbose": verbose}
430
- if allowed_tools:
431
- agent_config["allowed_tools"] = allowed_tools
681
+ from hud.datasets.utils import submit_rollouts
432
682
 
433
- elif agent_type == "litellm":
434
- try:
435
- from hud.agents.lite_llm import LiteAgent
683
+ job_id = str(uuid.uuid4())
684
+ hud_console.info(
685
+ f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
686
+ )
436
687
 
437
- agent_class = LiteAgent
438
- except ImportError as e:
439
- hud_console.error(
440
- "LiteLLM agent dependencies are not installed. "
441
- "Please install with: pip install 'hud-python[agent]'"
442
- )
443
- raise typer.Exit(1) from e
688
+ await submit_rollouts(
689
+ tasks=tasks,
690
+ job_id=job_id,
691
+ agent_type=cfg.agent_type,
692
+ agent_params=agent_kwargs,
693
+ max_steps=max_steps,
694
+ group_size=cfg.group_size,
695
+ use_byok=cfg.byok,
696
+ )
444
697
 
445
- agent_config = {
446
- "model_name": model or "gpt-4o-mini",
447
- "verbose": verbose,
448
- }
449
- if allowed_tools:
450
- agent_config["allowed_tools"] = allowed_tools
698
+ hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
699
+ return [], tasks
451
700
 
701
+ # Single task mode - show extra info
702
+ if len(tasks) == 1 and cfg.group_size == 1:
703
+ logging.getLogger("hud.agents").setLevel(logging.INFO)
704
+ logging.getLogger("hud.agents.base").setLevel(logging.INFO)
705
+ # Get prompt from args (v4 tasks) or show scenario name
706
+ prompt = tasks[0].args.get("prompt") if tasks[0].args else tasks[0].scenario
707
+ if prompt:
708
+ hud_console.info(f"Prompt: {prompt}")
452
709
  else:
453
- try:
454
- from hud.agents import ClaudeAgent
710
+ hud_console.info(
711
+ f"🚀 Running evaluation (max_concurrent: {cfg.max_concurrent}, "
712
+ f"group_size: {cfg.group_size})…"
713
+ )
455
714
 
456
- agent_class = ClaudeAgent
457
- except ImportError as e:
458
- hud_console.error(
459
- "Claude agent dependencies are not installed. "
460
- "Please install with: pip install 'hud-python[agent]'"
461
- )
462
- raise typer.Exit(1) from e
715
+ # Run using run_dataset
716
+ results = await run_dataset(
717
+ tasks,
718
+ cfg.agent_type,
719
+ agent_params=agent_kwargs,
720
+ max_steps=max_steps,
721
+ max_concurrent=cfg.max_concurrent,
722
+ group_size=cfg.group_size,
723
+ quiet=cfg.quiet,
724
+ )
463
725
 
464
- agent_config = {
465
- "model": model or "claude-sonnet-4-20250514",
466
- "verbose": verbose,
467
- }
468
- if allowed_tools:
469
- agent_config["allowed_tools"] = allowed_tools
470
-
471
- # Use grouped evaluation if group_size > 1
472
- if group_size > 1:
473
- hud_console.info(f"🔄 Running dataset with group_size={group_size}")
474
-
475
- # Run with job tracking
476
- with hud.job(
477
- name=f"Evaluation {dataset_name} (group_size={group_size})",
478
- metadata={
479
- "dataset": source,
480
- "group_size": group_size,
481
- "tasks": len(dataset_or_tasks),
482
- "total_episodes": len(dataset_or_tasks) * group_size,
483
- },
484
- ) as job:
485
- # Convert dicts to Task objects if needed
486
- from hud.datasets import Task
487
-
488
- tasks = []
489
- for item in dataset_or_tasks:
490
- if isinstance(item, dict):
491
- tasks.append(Task(**item))
492
- else:
493
- tasks.append(item)
494
-
495
- stats = await run_tasks_grouped(
496
- tasks=tasks,
497
- agent_class=agent_class,
498
- agent_config=agent_config,
499
- group_size=group_size,
500
- max_parallel_episodes=max_concurrent
501
- if not parallel
502
- else max_concurrent_per_worker * (max_workers or 4),
503
- max_steps=max_steps,
504
- verbose=verbose,
505
- job_id=job.id,
506
- )
726
+ # Show reward for single task
727
+ if len(tasks) == 1 and cfg.group_size == 1 and results:
728
+ hud_console.success(f"Reward: {results[0].reward}")
507
729
 
508
- # Display results
509
- display_group_statistics(stats, show_details=len(stats) <= 50)
730
+ return results, tasks
510
731
 
511
- # Return stats for consistency with other modes
512
- return stats
513
732
 
514
- # Original logic for non-grouped evaluation
515
- elif parallel:
516
- hud_console.info(
517
- f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
518
- )
519
- if max_workers is None:
520
- # Use auto-optimization (now the default run_dataset_parallel)
521
- return await run_dataset_parallel(
522
- name=f"Evaluation {dataset_name}",
523
- dataset=dataset_or_tasks,
524
- agent_class=agent_class,
525
- agent_config=agent_config,
526
- max_concurrent=max_concurrent,
527
- metadata={"dataset": source, "parallel": True},
528
- max_steps=max_steps,
529
- auto_respond=True,
530
- )
531
- else:
532
- # Use manual configuration
533
- return await run_dataset_parallel_manual(
534
- name=f"Evaluation {dataset_name}",
535
- dataset=dataset_or_tasks,
536
- agent_class=agent_class,
537
- agent_config=agent_config,
538
- max_workers=max_workers,
539
- max_concurrent_per_worker=max_concurrent_per_worker,
540
- max_concurrent=max_concurrent,
541
- metadata={"dataset": source, "parallel": True},
542
- max_steps=max_steps,
543
- auto_respond=True,
544
- )
545
- else:
546
- hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
547
- return await run_dataset(
548
- name=f"Evaluation {dataset_name}",
549
- dataset=dataset_or_tasks,
550
- agent_class=agent_class,
551
- agent_config=agent_config,
552
- max_concurrent=max_concurrent,
553
- metadata={"dataset": source},
554
- max_steps=max_steps,
555
- )
733
+ # =============================================================================
734
+ # CLI command
735
+ # =============================================================================
 
 
 def eval_command(
-    source: str = typer.Argument(
-        ...,
-        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)",  # noqa: E501
+    source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
+    agent: str | None = typer.Argument(
+        None,
+        help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
     full: bool = typer.Option(
         False,
         "--full",
-        help="Run the entire dataset (omit for single-task debug mode)",
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
     ),
-    agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
-        "claude",
-        "--agent",
-        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
-    ),
-    model: str | None = typer.Option(
-        None,
-        "--model",
-        help="Model name for the chosen agent",
+    model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
+    config: list[str] | None = typer.Option(  # noqa: B008
+        None, "--config", "-c", help="Agent config: key=value"
     ),
+    # Task-overridable settings
     allowed_tools: str | None = typer.Option(
-        None,
-        "--allowed-tools",
-        help="Comma-separated list of allowed tools",
+        None, "--allowed-tools", help="Comma-separated allowed tools"
     ),
-    max_concurrent: int = typer.Option(
-        30,
-        "--max-concurrent",
-        help="Concurrency level for asyncio mode (ignored in parallel mode)",
+    disallowed_tools: str | None = typer.Option(
+        None, "--disallowed-tools", help="Comma-separated disallowed tools"
     ),
-    max_steps: int | None = typer.Option(
-        None,
-        "--max-steps",
-        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    # Eval settings
+    max_concurrent: int | None = typer.Option(
+        None, "--max-concurrent", help="Max concurrent tasks"
     ),
-    parallel: bool = typer.Option(
+    max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
+    very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
+    auto_respond: bool = typer.Option(
         False,
-        "--parallel",
-        help="Use process-based parallel execution for large datasets (100+ tasks)",
+        "--auto-respond",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
-    max_workers: int | None = typer.Option(
-        None,
-        "--max-workers",
-        help="Number of worker processes for parallel mode (auto-optimized if not set)",
-    ),
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
+    group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
+    task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+    remote: bool = typer.Option(
+        False, "--remote", help="Submit tasks to platform for remote execution"
     ),
-    verbose: bool = typer.Option(
+    byok: bool = typer.Option(
         False,
-        "--verbose",
-        help="Enable verbose output from the agent",
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
     ),
-    very_verbose: bool = typer.Option(
-        False,
-        "--very-verbose",
-        "-vv",
-        help="Enable debug-level logs for maximum visibility",
-    ),
-    vllm_base_url: str | None = typer.Option(
-        None,
-        "--vllm-base-url",
-        help="Base URL for vLLM server (when using --agent vllm)",
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
-    group_size: int = typer.Option(
-        1,
-        "--group-size",
-        help="Number of times to run each task (similar to RL training)",
-    ),
-    integration_test: bool = typer.Option(
-        False,
-        "--integration-test",
-        help=(
-            "Run integration_test_tool tool, where problem is setup, "
-            "actions are applied, and evaluation is performed, without "
-            "spinning up an agent"
-        ),
+    gateway: bool = typer.Option(
+        False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.
 
     Examples:
-        # Evaluate a single task from SheetBench
-        hud eval hud-evals/SheetBench-50
-
-        # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
-        hud eval hud-evals/SheetBench-50 --full --agent claude
-
-        # Run large dataset with PARALLEL execution (auto-optimized)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
-
-        # Parallel mode with manual configuration (16 workers, 25 tasks each)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
-
-        # Limit total concurrent tasks to prevent rate limits
-        hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
-
-        # Run a single task from a JSON file
-        hud eval task.json
-
-        # Run multiple tasks from a JSON file with parallel execution
-        hud eval tasks.json --full --parallel
-
-        # Run with OpenAI Operator agent
-        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+        hud eval tasks.json claude
+        hud eval hud-evals/SheetBench-50 claude --full
+        hud eval tasks.json claude --config max_tokens=32768
+        hud eval tasks.json openai --config temperature=0.7
+        hud eval tasks.json claude --full --remote  # Remote execution
+        hud eval tasks.json claude --gateway  # Route LLM calls through HUD Gateway
+    """
+    hud_console.info("🔧 Initializing evaluation...")
+
+    # Load config and merge CLI args
+    cfg = EvalConfig.load().merge_cli(
+        source=source,
+        agent=agent,
+        model=model,
+        all=all,
+        full=full,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+        allowed_tools=allowed_tools,
+        disallowed_tools=disallowed_tools,
+        task_ids=task_ids,
+        verbose=verbose,
+        very_verbose=very_verbose,
+        auto_respond=auto_respond,
+        group_size=group_size,
+        config=config,
+        remote=remote,
+        byok=byok,
+        quiet=quiet,
+        gateway=gateway,
+    )
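`EvalConfig` itself is not defined in this hunk. Given the `load()`/`merge_cli()` chain here and the later `cfg.model_copy(...)` call, a plausible shape is a pydantic v2 model where explicitly passed CLI values override loaded defaults (field names below come from the call above; everything else is an assumption):

```python
# Assumed shape of EvalConfig; the real model lives elsewhere in hud/cli.
from pydantic import BaseModel

class EvalConfig(BaseModel):
    source: str | None = None
    agent: str | None = None
    model: str | None = None
    max_steps: int | None = None
    # ...remaining fields mirror the merge_cli() keywords above

    @classmethod
    def load(cls) -> "EvalConfig":
        return cls()  # e.g. read persisted defaults from disk

    def merge_cli(self, **cli: object) -> "EvalConfig":
        # Only CLI values that were actually provided override loaded ones.
        return self.model_copy(update={k: v for k, v in cli.items() if v is not None})
```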
 
-        # Use local vLLM server (default: localhost:8000)
-        hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct
+    # Find source if not provided
+    if cfg.source is None:
+        try:
+            from hud.cli.utils.tasks import find_tasks_file
 
-        # Use custom vLLM server URL
-        hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1
+            cfg = cfg.model_copy(
+                update={"source": find_tasks_file(None, msg="Select a tasks file")}
+            )
+            hud_console.success(f"Selected: {cfg.source}")
+        except Exception:
+            hud_console.error("No source provided and no task files found")
+            raise typer.Exit(1) from None
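`find_tasks_file` is imported from `hud.cli.utils.tasks` but not shown in this diff. Judging by the prompt string and the fallback behavior, a rough reconstruction might be (hypothetical; the real helper may behave differently):

```python
# Hypothetical reconstruction of find_tasks_file; the shipped version lives in
# hud/cli/utils/tasks.py and may differ.
from pathlib import Path
import questionary

def find_tasks_file(path: str | None, msg: str = "Select a tasks file") -> str:
    if path:
        return path
    candidates = sorted(str(p) for p in Path.cwd().glob("*.json*"))  # .json / .jsonl
    if not candidates:
        raise FileNotFoundError("no task files found")
    return questionary.select(msg, choices=candidates).ask()
```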
 
-        # Run with verbose output for debugging
-        hud eval task.json --verbose
-    """
-    from hud.settings import settings
+    # Resolve agent interactively if needed
+    cfg = cfg.resolve_agent_interactive()
 
-    if very_verbose:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
+    # Configure logging
+    if cfg.very_verbose:
+        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(message)s")
         logging.getLogger("hud.agents").setLevel(logging.DEBUG)
-        logging.getLogger("hud.agents.base").setLevel(logging.DEBUG)
-    elif verbose:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
+        # Suppress noisy HTTP client logs
+        logging.getLogger("httpx").setLevel(logging.WARNING)
+        logging.getLogger("httpcore").setLevel(logging.WARNING)
+    elif cfg.verbose:
         logging.getLogger("hud.agents").setLevel(logging.INFO)
-        logging.getLogger("hud.agents.base").setLevel(logging.INFO)
 
-    # We pass integration_test as the agent_type
-    if integration_test:
-        agent = "integration_test"
+    # Validate API keys
+    cfg.validate_api_keys()
 
-    # Check for required API keys
-    if agent == "claude":
-        if not settings.anthropic_api_key:
-            hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
-            hud_console.info(
-                "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
-            )
-            raise typer.Exit(1)
-    elif agent == "openai" and not settings.openai_api_key:
-        hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
-        hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
+    # Display and confirm
+    cfg.display()
+
+    if not yes and not questionary.confirm("Proceed?", default=True, qmark="").ask():
+        hud_console.info("Cancelled.")
         raise typer.Exit(1)
  raise typer.Exit(1)
710
- elif agent == "vllm":
711
- if model:
712
- hud_console.info(f"Using vLLM with model: {model}")
713
- else:
714
- hud_console.error("Model name is required for vLLM agent, specify with --model")
715
- raise typer.Exit(1)
716
861
 
717
- # Check for HUD_API_KEY if using HUD services
718
- if not settings.api_key:
719
- hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
720
- hud_console.info("Get your API key at: https://hud.so")
721
- hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here")
862
+ # Run
863
+ start_time = time.time()
864
+ try:
865
+ results, tasks = asyncio.run(_run_evaluation(cfg))
866
+ except ValueError as e:
867
+ hud_console.error(str(e))
868
+ raise typer.Exit(1) from None
869
+ elapsed = time.time() - start_time
722
870
 
723
- # Parse allowed tools
724
- allowed_tools_list = (
725
- [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
726
- )
871
+ if cfg.remote:
872
+ return
727
873
 
728
- # Set default max_steps if not provided
729
- if max_steps is None:
730
- max_steps = 50 if full else 10
731
-
732
- # Run evaluation
733
- if full:
734
- asyncio.run(
735
- run_full_dataset(
736
- source,
737
- agent_type=agent,
738
- model=model,
739
- allowed_tools=allowed_tools_list,
740
- max_concurrent=max_concurrent,
741
- max_steps=max_steps,
742
- parallel=parallel,
743
- max_workers=max_workers,
744
- max_concurrent_per_worker=max_concurrent_per_worker,
745
- verbose=very_verbose or verbose,
746
- vllm_base_url=vllm_base_url,
747
- group_size=group_size,
748
- )
749
- )
750
- else:
751
- asyncio.run(
752
- run_single_task(
753
- source,
754
- agent_type=agent,
755
- model=model,
756
- allowed_tools=allowed_tools_list,
757
- max_steps=max_steps,
758
- verbose=very_verbose or verbose,
759
- vllm_base_url=vllm_base_url,
760
- group_size=group_size,
761
- )
762
- )
874
+ from hud.datasets import display_results
875
+
876
+ display_results(results, tasks=tasks, elapsed=elapsed, show_details=len(results) <= 50)