hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/cli/eval.py CHANGED
@@ -1,762 +1,888 @@
1
- """HUD evaluation command for running tasks and datasets."""
1
+ """HUD evaluation command for running tasks and datasets.
2
+
3
+ Config Override Order: CLI arguments > .hud_eval.toml > defaults
4
+ """
2
5
 
3
6
  from __future__ import annotations
4
7
 
5
8
  import asyncio
6
9
  import logging
10
+ import re
11
+ import time
12
+ import tomllib
13
+ from dataclasses import dataclass
7
14
  from pathlib import Path
8
- from typing import TYPE_CHECKING, Any, Literal
15
+ from typing import TYPE_CHECKING, Any, ClassVar
9
16
 
17
+ import questionary
10
18
  import typer
19
+ from pydantic import BaseModel, Field, field_validator
20
+ from rich import box
21
+ from rich.table import Table
11
22
 
12
- import hud
13
- from hud.cli.utils.env_check import ensure_built, find_environment_dir
14
23
  from hud.settings import settings
15
- from hud.utils.group_eval import display_group_statistics, run_tasks_grouped
24
+ from hud.types import AgentType
25
+ from hud.utils.env import resolve_env_vars
16
26
  from hud.utils.hud_console import HUDConsole
17
27
 
28
+ # Pattern to detect AWS Bedrock inference profile ARNs
29
+ _BEDROCK_ARN_PATTERN = re.compile(r"^arn:aws:bedrock:[a-z0-9-]+:\d+:inference-profile/.+$")
30
+
31
+
32
+ def _is_bedrock_arn(model: str | None) -> bool:
33
+ """Check if a model string is a Bedrock inference profile ARN."""
34
+ return model is not None and bool(_BEDROCK_ARN_PATTERN.match(model))
35
+
36
+
18
37
  if TYPE_CHECKING:
19
- from hud.types import Task
38
+ from hud.agents.base import MCPAgent
39
+
20
40
  logger = logging.getLogger(__name__)
21
41
  hud_console = HUDConsole()
22
42
 
43
+ _CONFIG_PATH = ".hud_eval.toml"
23
44
 
24
- def get_available_models() -> list[dict[str, str | None]]:
25
- """Fetch available models from the HUD API (only ready models).
26
-
27
- Returns:
28
- List of dicts with 'name', 'vllm_url', and 'base_model' keys
29
- """
30
- try:
31
- from hud.cli.rl import rl_api
32
45
 
33
- hud_console.info("Fetching your models from https://hud.so/models")
34
- models = rl_api.list_models()
46
+ @dataclass(frozen=True)
47
+ class AgentPreset:
48
+ """A preset agent configuration combining agent type, model, and optional config."""
35
49
 
36
- # Filter for ready models only and sort by recency
37
- ready_models = [m for m in models if m.status == "ready"]
38
- ready_models.sort(key=lambda m: m.created_at or "", reverse=True)
50
+ name: str
51
+ agent_type: AgentType
52
+ model: str | None = None
53
+ agent_config: dict[str, Any] | None = None
39
54
 
40
- # Count other statuses for informational purposes
41
- training_count = sum(1 for m in models if m.status == "training")
42
- # other_count = len(models) - len(ready_models) - training_count
43
55
 
44
- if ready_models:
45
- hud_console.success(f"Found {len(ready_models)} ready models:")
46
- for model in ready_models:
47
- vllm_status = " (vLLM deployed)" if model.vllm_url else ""
48
- hud_console.info(f" {model.name}{vllm_status}")
49
-
50
- if training_count > 0:
51
- hud_console.info(f"\n({training_count} models currently training)")
56
+ # Built-in presets for the interactive picker
57
+ _AGENT_PRESETS: list[AgentPreset] = [
58
+ # Native agents (use provider SDKs directly)
59
+ AgentPreset("Claude Sonnet 4.5", AgentType.CLAUDE, "claude-sonnet-4-5"),
60
+ AgentPreset("GPT-5", AgentType.OPENAI, "gpt-5"),
61
+ AgentPreset("Operator (OpenAI Computer Use)", AgentType.OPERATOR, "computer-use-preview"),
62
+ AgentPreset("Gemini 3 Pro Preview", AgentType.GEMINI, "gemini-3-pro-preview"),
63
+ AgentPreset(
64
+ "Gemini CUA (Gemini Computer Use)",
65
+ AgentType.GEMINI_CUA,
66
+ "gemini-2.5-computer-use-preview",
67
+ ),
68
+ # HUD Gateway presets (models via HUD Inference API)
69
+ AgentPreset(
70
+ "Grok 4-1 Fast (xAI)",
71
+ AgentType.OPENAI_COMPATIBLE,
72
+ "grok-4-1-fast",
73
+ {
74
+ "openai_compatible": {
75
+ "base_url": settings.hud_gateway_url,
76
+ "model_name": "Grok 4-1 Fast",
77
+ }
78
+ },
79
+ ),
80
+ AgentPreset(
81
+ "GLM-4.5V (Z-AI)",
82
+ AgentType.OPENAI_COMPATIBLE,
83
+ "z-ai/glm-4.5v",
84
+ {"openai_compatible": {"base_url": settings.hud_gateway_url, "model_name": "GLM-4.5V"}},
85
+ ),
86
+ ]
87
+
88
+ _DEFAULT_CONFIG_TEMPLATE = """# HUD Eval Configuration
89
+ # Command-line arguments override these settings
90
+
91
+ [eval]
92
+ # source = "hud-evals/SheetBench-50"
93
+ # agent = "claude"
94
+ # all = false # Run all problems instead of just 1
95
+ # max_concurrent = 30
96
+ # max_steps = 10
97
+ # group_size = 1
98
+ # byok = false # Remote only; use encrypted env vars on the platform.
99
+ # task_ids = ["task_1", "task_2"]
100
+ # verbose = true
101
+ # very_verbose = true
102
+ # auto_respond = true
103
+ # gateway = false # Route LLM API calls through HUD Gateway
104
+
105
+ [agent]
106
+ # allowed_tools = ["computer", "playwright"]
107
+ # disallowed_tools = []
108
+
109
+ [claude]
110
+ # model = "claude-sonnet-4-5"
111
+ # max_tokens = 16384
112
+ # use_computer_beta = true
113
+
114
+ [openai]
115
+ # model = "gpt-4o"
116
+ # temperature = 0.7
117
+ # max_output_tokens = 4096
118
+
119
+ [gemini]
120
+ # model = "gemini-2.5-pro"
121
+ # temperature = 1.0
122
+ # top_p = 0.95
123
+
124
+ [gemini_cua]
125
+ # model = "gemini-2.5-computer-use-preview"
126
+ # temperature = 1.0
127
+ # top_p = 0.95
128
+ # excluded_predefined_functions = []
129
+
130
+ [openai_compatible]
131
+ # base_url = "http://localhost:8000/v1"
132
+ # model = "my-model"
133
+ """
134
+
135
+ # Agent type -> (settings attr, env var name)
136
+ _API_KEY_REQUIREMENTS: dict[AgentType, tuple[str, str]] = {
137
+ AgentType.CLAUDE: ("anthropic_api_key", "ANTHROPIC_API_KEY"),
138
+ AgentType.GEMINI: ("gemini_api_key", "GEMINI_API_KEY"),
139
+ AgentType.GEMINI_CUA: ("gemini_api_key", "GEMINI_API_KEY"),
140
+ AgentType.OPENAI: ("openai_api_key", "OPENAI_API_KEY"),
141
+ AgentType.OPERATOR: ("openai_api_key", "OPENAI_API_KEY"),
142
+ }
143
+
144
+
145
+ class EvalConfig(BaseModel):
146
+ """Configuration for hud eval command."""
147
+
148
+ # Class-level registry
149
+ _agent_classes: ClassVar[dict[AgentType, type["MCPAgent"]]] = {}
150
+
151
+ # Fields loaded from [eval] section
152
+ _EVAL_FIELDS: ClassVar[set[str]] = {
153
+ "source",
154
+ "agent_type",
155
+ "task_ids",
156
+ "all",
157
+ "max_concurrent",
158
+ "max_steps",
159
+ "verbose",
160
+ "very_verbose",
161
+ "group_size",
162
+ "byok",
163
+ "remote",
164
+ "auto_respond",
165
+ "quiet",
166
+ "gateway",
167
+ "taskset",
168
+ }
169
+ # Fields loaded from [agent] section
170
+ _AGENT_FIELDS: ClassVar[set[str]] = {"allowed_tools", "disallowed_tools"}
171
+
172
+ # Eval settings
173
+ source: str | None = None
174
+ agent_type: AgentType | None = None
175
+ model: str | None = None
176
+ task_ids: list[str] | None = None
177
+ all: bool = False # Run all problems instead of just 1
178
+ max_concurrent: int = 30
179
+ max_steps: int = 10
180
+ verbose: bool = False
181
+ very_verbose: bool = False
182
+ auto_respond: bool | None = None # Continue without prompting
183
+ group_size: int = 1
184
+ byok: bool = False
185
+ remote: bool = False
186
+ quiet: bool = False # Suppress opening browser for eval links
187
+ gateway: bool = False # Use HUD Gateway for LLM API calls
188
+ taskset: str | None = None # Taskset slug to associate job with
189
+
190
+ # Base agent config (these merge with task's agent_config)
191
+ allowed_tools: list[str] | None = None
192
+ disallowed_tools: list[str] | None = None
193
+
194
+ agent_config: dict[str, Any] = Field(default_factory=dict)
195
+
196
+ @field_validator("agent_type", mode="before")
197
+ @classmethod
198
+ def _parse_agent_type(cls, v: Any) -> AgentType | None:
199
+ """Convert string agent name to AgentType enum."""
200
+ if v is None:
201
+ return None
202
+ if isinstance(v, AgentType):
203
+ return v
204
+ if isinstance(v, str):
205
+ try:
206
+ return AgentType(v)
207
+ except ValueError:
208
+ valid = [e.value for e in AgentType]
209
+ raise ValueError(
210
+ f"Invalid agent: {v}. Must be one of: {', '.join(valid)}"
211
+ ) from None
212
+ return v
213
+
214
+ def validate_api_keys(self) -> None:
215
+ """Validate required API keys for the selected agent. Raises typer.Exit on failure."""
216
+ # BYOK requires remote execution (check before agent_type guard)
217
+ if self.byok and not self.remote:
218
+ hud_console.error("--byok requires --remote (BYOK only works with remote execution)")
219
+ raise typer.Exit(1)
52
220
 
53
- return [
54
- {"name": model.name, "vllm_url": model.vllm_url, "base_model": model.base_model}
55
- for model in ready_models
56
- ]
57
- else:
58
- if training_count > 0:
59
- hud_console.warning(
60
- f"No ready models found. You have {training_count} models currently training."
221
+ if self.agent_type is None:
222
+ return
223
+
224
+ if self.remote:
225
+ if not settings.api_key:
226
+ hud_console.error("HUD_API_KEY is required for remote execution")
227
+ hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
228
+ raise typer.Exit(1)
229
+ return
230
+
231
+ # Gateway mode only requires HUD_API_KEY
232
+ if self.gateway:
233
+ if not settings.api_key:
234
+ hud_console.error("HUD_API_KEY is required for gateway mode")
235
+ hud_console.info("Set it: hud set HUD_API_KEY=your-key-here")
236
+ raise typer.Exit(1)
237
+ return
238
+
239
+ if self.agent_type == AgentType.OPENAI_COMPATIBLE:
240
+ # Check both CLI --model and config file model
241
+ config_model = self.agent_config.get("openai_compatible", {}).get("model")
242
+ if not self.model and not config_model:
243
+ hud_console.error(
244
+ "Model name is required for OpenAI compatible agent. "
245
+ "Use --model or set model in [openai_compatible] section of .hud_eval.toml"
61
246
  )
62
- else:
63
- hud_console.warning("No models found in your account.")
64
- return []
65
- except Exception as e:
66
- hud_console.debug(f"Error fetching models: {e}")
67
- # Don't show the error to the user, just proceed without HUD models
68
- return []
69
-
70
-
71
- def build_agent(
72
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"],
73
- *,
74
- model: str | None = None,
75
- allowed_tools: list[str] | None = None,
76
- verbose: bool = False,
77
- vllm_base_url: str | None = None,
78
- ) -> Any:
79
- """Create and return the requested agent type."""
80
-
81
- # Import agents lazily to avoid dependency issues
82
- if agent_type == "integration_test":
83
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
84
-
85
- return IntegrationTestRunner(verbose=verbose)
86
- elif agent_type == "vllm":
87
- # Create a generic OpenAI agent for vLLM server
88
- try:
89
- from openai import AsyncOpenAI
90
-
91
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
92
- except ImportError as e:
93
- hud_console.error(
94
- "OpenAI dependencies are not installed. "
95
- "Please install with: pip install 'hud-python[agent]'"
247
+ raise typer.Exit(1)
248
+ elif self.agent_type == AgentType.CLAUDE and _is_bedrock_arn(self.model):
249
+ missing_aws = (
250
+ not settings.aws_access_key_id
251
+ or not settings.aws_secret_access_key
252
+ or not settings.aws_region
96
253
  )
97
- raise typer.Exit(1) from e
98
-
99
- # Determine the base URL to use
100
- if vllm_base_url is not None:
101
- # Use the provided vLLM URL (for custom/local servers)
102
- base_url = vllm_base_url
103
- hud_console.info(f"Using vLLM server at {base_url}")
104
- api_key = (
105
- settings.api_key if base_url.startswith(settings.hud_rl_url) else "token-abc123"
106
- )
107
- else:
108
- # Default to localhost
109
- base_url = "http://localhost:8000/v1"
110
- api_key = "token-abc123"
111
-
112
- # Create OpenAI client for vLLM
113
- openai_client = AsyncOpenAI(
114
- base_url=base_url,
115
- api_key=api_key,
116
- timeout=30.0,
117
- )
118
-
119
- return GenericOpenAIChatAgent(
120
- openai_client=openai_client,
121
- model_name=model or "served-model", # Default model name
122
- verbose=verbose,
123
- completion_kwargs={
124
- "temperature": 0.7,
125
- "max_tokens": 2048,
126
- "tool_choice": "required", # if self.actor_config.force_tool_choice else "auto",
127
- },
254
+ if missing_aws:
255
+ hud_console.error(
256
+ "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
257
+ "are required for AWS Bedrock"
258
+ )
259
+ raise typer.Exit(1)
260
+ elif self.agent_type in _API_KEY_REQUIREMENTS:
261
+ attr, env_var = _API_KEY_REQUIREMENTS[self.agent_type]
262
+ if not getattr(settings, attr, None):
263
+ hud_console.error(f"{env_var} is required for {self.agent_type.value} agent")
264
+ hud_console.info(f"Set it: hud set {env_var}=your-key-here")
265
+ raise typer.Exit(1)
266
+
267
+ if not settings.api_key:
268
+ hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
269
+
270
+ def get_agent_kwargs(self) -> dict[str, Any]:
271
+ """Build agent kwargs from config.
272
+
273
+ Model precedence:
274
+ 1. CLI --model (highest priority)
275
+ 2. [agent_type].model in TOML (per-agent config)
276
+ """
277
+ if self.agent_type is None:
278
+ raise ValueError("agent_type must be set before calling get_agent_kwargs()")
279
+
280
+ kwargs: dict[str, Any] = {}
281
+
282
+ if self.allowed_tools:
283
+ kwargs["allowed_tools"] = self.allowed_tools
284
+ if self.disallowed_tools:
285
+ kwargs["disallowed_tools"] = self.disallowed_tools
286
+
287
+ # Apply agent-specific config
288
+ agent_key = self.agent_type.value
289
+ if agent_key in self.agent_config:
290
+ agent_cfg = dict(self.agent_config[agent_key])
291
+ kwargs.update(agent_cfg)
292
+
293
+ # CLI --model always wins
294
+ if self.model:
295
+ kwargs["model"] = self.model
296
+
297
+ # For gateway base_url, inject HUD API key if not already set
298
+ if self.agent_type == AgentType.OPENAI_COMPATIBLE and "api_key" not in kwargs:
299
+ base_url = kwargs.get("base_url", "")
300
+ if settings.hud_gateway_url in base_url and settings.api_key:
301
+ kwargs["api_key"] = settings.api_key
302
+
303
+ # Auto-detect Bedrock when Claude is selected with a Bedrock ARN
304
+ # Check both model and checkpoint_name for ARN patterns
305
+ bedrock_arn_detected = _is_bedrock_arn(kwargs.get("model")) or _is_bedrock_arn(
306
+ kwargs.get("checkpoint_name")
128
307
  )
129
-
130
- elif agent_type == "openai":
131
- try:
132
- from hud.agents import OperatorAgent
133
- except ImportError as e:
134
- hud_console.error(
135
- "OpenAI agent dependencies are not installed. "
136
- "Please install with: pip install 'hud-python[agent]'"
308
+ if self.agent_type == AgentType.CLAUDE and bedrock_arn_detected:
309
+ missing_aws = (
310
+ not settings.aws_access_key_id
311
+ or not settings.aws_secret_access_key
312
+ or not settings.aws_region
137
313
  )
138
- raise typer.Exit(1) from e
314
+ if missing_aws:
315
+ hud_console.error(
316
+ "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, and AWS_REGION "
317
+ "are required for AWS Bedrock"
318
+ )
319
+ raise typer.Exit(1)
139
320
 
140
- if allowed_tools:
141
- return OperatorAgent(
142
- allowed_tools=allowed_tools,
143
- verbose=verbose,
144
- )
145
- else:
146
- return OperatorAgent(verbose=verbose)
321
+ from anthropic import AsyncAnthropicBedrock
147
322
 
148
- elif agent_type == "litellm":
149
- try:
150
- from hud.agents.lite_llm import LiteAgent
151
- except ImportError as e:
152
- hud_console.error(
153
- "LiteLLM agent dependencies are not installed. "
154
- "Please install with: pip install 'hud-python[agent]'"
323
+ kwargs["model_client"] = AsyncAnthropicBedrock(
324
+ aws_access_key=settings.aws_access_key_id,
325
+ aws_secret_key=settings.aws_secret_access_key,
326
+ aws_region=settings.aws_region or "us-east-1",
155
327
  )
156
- raise typer.Exit(1) from e
157
-
158
- return LiteAgent(
159
- model_name=model or "gpt-4o-mini",
160
- allowed_tools=allowed_tools,
161
- verbose=verbose,
162
- )
328
+ hud_console.info("🔧 Using AWS Bedrock (detected ARN in model)")
329
+
330
+ kwargs["verbose"] = self.verbose or self.very_verbose
331
+
332
+ if self.agent_type in (
333
+ AgentType.CLAUDE,
334
+ AgentType.OPENAI,
335
+ AgentType.OPERATOR,
336
+ AgentType.GEMINI,
337
+ AgentType.GEMINI_CUA,
338
+ ):
339
+ kwargs["validate_api_key"] = False
340
+
341
+ # Configure gateway mode - route LLM API calls through HUD gateway
342
+ if self.gateway:
343
+ if not settings.api_key:
344
+ raise typer.Exit(1) # Already validated in validate_api_keys()
345
+
346
+ from hud.agents.gateway import build_gateway_client
347
+
348
+ # Map AgentType to provider
349
+ agent_to_provider = {
350
+ AgentType.CLAUDE: "anthropic",
351
+ AgentType.OPENAI: "openai",
352
+ AgentType.OPERATOR: "openai",
353
+ AgentType.GEMINI: "gemini",
354
+ AgentType.GEMINI_CUA: "gemini",
355
+ AgentType.OPENAI_COMPATIBLE: "openai",
356
+ }
357
+ provider = agent_to_provider.get(self.agent_type, "openai")
358
+ client = build_gateway_client(provider)
359
+
360
+ # OpenAI-compatible uses openai_client key
361
+ is_oai_compat = self.agent_type == AgentType.OPENAI_COMPATIBLE
362
+ kwargs["openai_client" if is_oai_compat else "model_client"] = client
363
+ hud_console.info(f"🌐 Using HUD Gateway for {provider} API")
364
+
365
+ return kwargs
366
+
367
+ @classmethod
368
+ def load(cls, path: str = _CONFIG_PATH) -> EvalConfig:
369
+ """Load config from TOML file."""
370
+ p = Path(path)
371
+ if not p.exists():
372
+ p.write_text(_DEFAULT_CONFIG_TEMPLATE)
373
+ hud_console.info(f"Generated {_CONFIG_PATH}")
374
+ return cls()
163
375
 
164
- # Fallback Claude agent (Anthropic)
165
- try:
166
- from hud.agents import ClaudeAgent
167
- except ImportError as e:
168
- hud_console.error(
169
- "Claude agent dependencies are not installed. "
170
- "Please install with: pip install 'hud-python[agent]'"
171
- )
172
- raise typer.Exit(1) from e
376
+ try:
377
+ with open(p, "rb") as f:
378
+ toml_data = tomllib.load(f)
379
+ except Exception as e:
380
+ hud_console.warning(f"Failed to parse {path}: {e}")
381
+ return cls()
173
382
 
174
- model = model or "claude-sonnet-4-20250514"
383
+ toml_data = resolve_env_vars(toml_data)
175
384
 
176
- if allowed_tools:
177
- return ClaudeAgent(
178
- model=model,
179
- allowed_tools=allowed_tools,
180
- verbose=verbose,
181
- )
182
- else:
183
- return ClaudeAgent(
184
- model=model,
185
- verbose=verbose,
186
- )
385
+ # Extract sections
386
+ eval_section = toml_data.get("eval", {})
387
+ agent_section = toml_data.get("agent", {})
187
388
 
389
+ # Build config data
390
+ data: dict[str, Any] = {}
188
391
 
189
- async def run_single_task(
190
- source: str,
191
- *,
192
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
193
- model: str | None = None,
194
- allowed_tools: list[str] | None = None,
195
- max_steps: int = 10,
196
- verbose: bool = False,
197
- vllm_base_url: str | None = None,
198
- group_size: int = 1,
199
- ) -> None:
200
- """Load one task and execute it, or detect if JSON contains a list and run as dataset."""
392
+ # Eval settings (map 'agent' -> 'agent_type')
393
+ if "agent" in eval_section:
394
+ data["agent_type"] = eval_section["agent"]
395
+ for key in cls._EVAL_FIELDS:
396
+ if key in eval_section:
397
+ data[key] = eval_section[key]
201
398
 
202
- # Import Task and run_dataset lazily
203
- try:
204
- from hud.utils.tasks import load_tasks
205
- except ImportError as e:
206
- hud_console.error(
207
- "Dataset dependencies are not installed. "
208
- "Please install with: pip install 'hud-python\u27e6agent\u27e7'"
209
- )
210
- raise typer.Exit(1) from e
399
+ # Agent base config
400
+ for key in cls._AGENT_FIELDS:
401
+ if key in agent_section:
402
+ data[key] = agent_section[key]
211
403
 
212
- path = Path(source)
213
- if path.exists() and (path.suffix in [".json", ".jsonl"]):
214
- hud_console.info("📊 Loading task file…")
215
- tasks: list[Task] = load_tasks(str(path)) # type: ignore[assignment]
404
+ # Agent-specific configs (claude, openai, gemini, etc.)
405
+ agent_config: dict[str, Any] = {}
406
+ for agent_type in AgentType:
407
+ if agent_type.value in toml_data:
408
+ agent_config[agent_type.value] = toml_data[agent_type.value]
409
+ data["agent_config"] = agent_config
216
410
 
217
- # If tasks reference a local environment (nearby), ensure it's built/up-to-date.
218
411
  try:
219
- env_dir = find_environment_dir(path)
220
- if env_dir is not None:
221
- # Non-interactive for eval; warn but don't block
222
- ensure_built(env_dir, interactive=False)
412
+ return cls.model_validate(data)
223
413
  except Exception as e:
224
- hud_console.debug(f"Eval preflight env check skipped: {e}")
225
-
226
- # Single task - use the first (and only) task
227
- task = tasks[0]
228
- hud_console.info("Found 1 task, running as single task…")
229
-
230
- else:
231
- # Load from HuggingFace dataset or non-file source
232
- hud_console.info(f"📊 Loading tasks from: {source}…")
233
- tasks: list[Task] = load_tasks(source) # type: ignore[assignment]
234
-
235
- if not tasks:
236
- hud_console.error(f"No tasks found in: {source}")
237
- raise typer.Exit(1)
238
-
239
- # Single task - use the first task
240
- task = tasks[0]
241
- hud_console.info(
242
- "Using first task from dataset (run with --full to run the entire dataset)..."
243
- )
244
-
245
- task_prompt = task.prompt[:50] + "..." if len(task.prompt) > 50 else task.prompt
246
-
247
- # Use grouped evaluation if group_size > 1
248
- agent_config: dict[str, Any] = {}
249
- if agent_type == "integration_test":
250
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
251
-
252
- agent_class = IntegrationTestRunner
253
- agent_config = {"verbose": verbose}
254
- if allowed_tools:
255
- agent_config["allowed_tools"] = allowed_tools
256
- elif agent_type == "vllm":
257
- # Special handling for vLLM
258
- sample_agent = build_agent(
259
- agent_type,
260
- model=model,
261
- allowed_tools=allowed_tools,
262
- verbose=verbose,
263
- vllm_base_url=vllm_base_url,
264
- )
265
- agent_config = {
266
- "openai_client": sample_agent.oai,
267
- "model_name": sample_agent.model_name,
268
- "verbose": verbose,
269
- "completion_kwargs": sample_agent.completion_kwargs,
270
- }
271
- if allowed_tools:
272
- agent_config["allowed_tools"] = allowed_tools
273
-
274
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
275
-
276
- agent_class = GenericOpenAIChatAgent
277
- elif agent_type == "openai":
278
- from hud.agents import OperatorAgent
279
-
280
- agent_class = OperatorAgent
281
- agent_config = {"verbose": verbose}
282
- if allowed_tools:
283
- agent_config["allowed_tools"] = allowed_tools
284
- elif agent_type == "litellm":
285
- from hud.agents.lite_llm import LiteAgent
286
-
287
- agent_class = LiteAgent
288
- agent_config = {
289
- "model_name": model or "gpt-4o-mini",
290
- "verbose": verbose,
291
- }
292
- if allowed_tools:
293
- agent_config["allowed_tools"] = allowed_tools
294
- elif agent_type == "claude":
295
- from hud.agents import ClaudeAgent
296
-
297
- agent_class = ClaudeAgent
298
- agent_config = {
299
- "model": model or "claude-sonnet-4-20250514",
300
- "verbose": verbose,
301
- }
302
- if allowed_tools:
303
- agent_config["allowed_tools"] = allowed_tools
304
- else:
305
- raise ValueError(f"Invalid agent type: {agent_type}")
306
-
307
- if group_size > 1:
308
- hud_console.info(f"🔄 Running task with group_size={group_size}")
309
- # Run with grouping
310
- stats = await run_tasks_grouped(
311
- tasks=[task],
312
- agent_class=agent_class,
313
- agent_config=agent_config,
314
- group_size=group_size,
315
- max_parallel_episodes=48, # Same as RL default
316
- max_steps=max_steps,
317
- verbose=verbose,
318
- )
319
- display_group_statistics(stats, show_details=True)
320
- else:
321
- # Original single-run logic
322
- with hud.trace(name=task_prompt):
323
- agent = build_agent(
324
- agent_type,
325
- model=model,
326
- allowed_tools=allowed_tools,
327
- verbose=verbose,
328
- vllm_base_url=vllm_base_url,
414
+ hud_console.warning(f"Invalid config: {e}")
415
+ return cls()
416
+
417
+ def merge_cli(
418
+ self,
419
+ agent: str | None = None,
420
+ config: list[str] | None = None,
421
+ allowed_tools: str | None = None,
422
+ disallowed_tools: str | None = None,
423
+ task_ids: str | None = None,
424
+ **cli_args: Any,
425
+ ) -> EvalConfig:
426
+ """Merge CLI args (non-None values override config)."""
427
+ overrides: dict[str, Any] = {}
428
+
429
+ if agent is not None:
430
+ overrides["agent_type"] = agent
431
+
432
+ # Parse comma-separated lists
433
+ if allowed_tools is not None:
434
+ overrides["allowed_tools"] = [t.strip() for t in allowed_tools.split(",") if t.strip()]
435
+ if disallowed_tools is not None:
436
+ overrides["disallowed_tools"] = [
437
+ t.strip() for t in disallowed_tools.split(",") if t.strip()
438
+ ]
439
+ if task_ids is not None:
440
+ overrides["task_ids"] = [t.strip() for t in task_ids.split(",") if t.strip()]
441
+
442
+ overrides.update({k: v for k, v in cli_args.items() if v is not None and v is not False})
443
+
444
+ for k in ("all", "verbose", "very_verbose", "remote", "quiet", "gateway"):
445
+ if cli_args.get(k) is True:
446
+ overrides[k] = True
447
+ elif k in overrides and cli_args.get(k) is False:
448
+ del overrides[k]
449
+
450
+ # --full is a shortcut for --all --auto-respond --max-steps 100
451
+ if overrides.get("full"):
452
+ overrides["all"] = True
453
+ if "auto_respond" not in overrides:
454
+ overrides["auto_respond"] = True
455
+ if "max_steps" not in overrides:
456
+ overrides["max_steps"] = 100
457
+
458
+ if config:
459
+ merged_agent_config = dict(self.agent_config)
460
+ for item in config:
461
+ if "=" in item:
462
+ key, value = item.split("=", 1)
463
+ key = key.strip()
464
+ value = value.strip()
465
+
466
+ # Parse value
467
+ if value.lower() == "true":
468
+ parsed_value: Any = True
469
+ elif value.lower() == "false":
470
+ parsed_value = False
471
+ else:
472
+ try:
473
+ parsed_value = int(value)
474
+ except ValueError:
475
+ try:
476
+ parsed_value = float(value)
477
+ except ValueError:
478
+ parsed_value = value
479
+
480
+ # Handle namespaced keys (e.g., claude.max_tokens)
481
+ if "." in key:
482
+ agent_name, param = key.split(".", 1)
483
+ if agent_name not in merged_agent_config:
484
+ merged_agent_config[agent_name] = {}
485
+ merged_agent_config[agent_name][param] = parsed_value
486
+ else:
487
+ # Non-namespaced: apply to current agent if set
488
+ if self.agent_type:
489
+ agent_name = self.agent_type.value
490
+ if agent_name not in merged_agent_config:
491
+ merged_agent_config[agent_name] = {}
492
+ merged_agent_config[agent_name][key] = parsed_value
493
+
494
+ overrides["agent_config"] = merged_agent_config
495
+
496
+ return self.model_validate({**self.model_dump(), **overrides})
497
+
498
+ def resolve_agent_interactive(self) -> EvalConfig:
499
+ """Prompt user to select an agent preset if not set. Returns updated config."""
500
+ if self.agent_type is not None:
501
+ return self
502
+
503
+ # Build choices from presets
504
+ choices: list[dict[str, Any]] = [
505
+ {"name": preset.name, "value": preset} for preset in _AGENT_PRESETS
506
+ ]
507
+
508
+ selected: AgentPreset = hud_console.select("Select an agent:", choices=choices, default=0) # type: ignore[arg-type]
509
+
510
+ # Merge preset into config
511
+ updates: dict[str, Any] = {"agent_type": selected.agent_type}
512
+ if selected.model:
513
+ updates["model"] = selected.model
514
+ if selected.agent_config:
515
+ # Merge preset's agent_config with existing
516
+ merged = dict(self.agent_config)
517
+ for key, value in selected.agent_config.items():
518
+ if key in merged:
519
+ merged[key] = {**merged[key], **value}
520
+ else:
521
+ merged[key] = value
522
+ updates["agent_config"] = merged
523
+
524
+ return self.model_validate({**self.model_dump(), **updates})
525
+
526
+ def display(self) -> None:
527
+ """Display settings in a table."""
528
+ table = Table(title="Evaluation Settings", title_style="bold cyan", box=box.ROUNDED)
529
+ table.add_column("Setting", style="yellow")
530
+ table.add_column("Value", style="green")
531
+
532
+ # Core settings
533
+ table.add_row("source", str(self.source or "—"))
534
+ table.add_row("agent", self.agent_type.value) # type: ignore[union-attr]
535
+ if self.task_ids:
536
+ table.add_row(
537
+ "task_ids", ", ".join(self.task_ids[:5]) + ("..." if len(self.task_ids) > 5 else "")
329
538
  )
330
- hud_console.info(task.prompt)
331
- result = await agent.run(task, max_steps=max_steps)
332
- hud_console.success(f"Reward: {result.reward}")
333
-
334
-
335
- async def run_full_dataset(
336
- source: str,
337
- *,
338
- agent_type: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = "claude",
339
- model: str | None = None,
340
- allowed_tools: list[str] | None = None,
341
- max_concurrent: int = 30,
342
- max_steps: int = 10,
343
- parallel: bool = False,
344
- max_workers: int | None = None,
345
- max_concurrent_per_worker: int = 25,
346
- verbose: bool = False,
347
- vllm_base_url: str | None = None,
348
- group_size: int = 1,
349
- ) -> list[Any]:
350
- """Run evaluation across the entire dataset.
351
-
352
- Uses either asyncio-based run_dataset or process-based parallel execution
353
- depending on the parallel flag."""
354
-
355
- # Import run_dataset lazily
356
- try:
357
- from hud.datasets import run_dataset, run_dataset_parallel, run_dataset_parallel_manual
358
- from hud.utils.tasks import load_tasks
359
- except ImportError as e:
360
- hud_console.error(
361
- "Dataset dependencies are not installed. "
362
- "Please install with: pip install 'hud-python[agent]'"
363
- )
364
- raise typer.Exit(1) from e
365
-
366
- # Load tasks using unified loader
367
- hud_console.info(f"📊 Loading tasks from: {source}…")
368
- tasks: list[Task] = load_tasks(source) # type: ignore[assignment]
539
+ table.add_row("all", str(self.all))
540
+ table.add_row("max_steps", str(self.max_steps))
541
+ if not self.remote:
542
+ table.add_row("max_concurrent", str(self.max_concurrent))
543
+ if self.group_size > 1:
544
+ table.add_row("group_size", str(self.group_size))
545
+ if self.auto_respond:
546
+ table.add_row("auto_respond", "[bold green]True[/bold green]")
547
+ if self.very_verbose:
548
+ table.add_row("very_verbose", "[bold green]True[/bold green]")
549
+ elif self.verbose:
550
+ table.add_row("verbose", "[bold green]True[/bold green]")
551
+ if self.remote:
552
+ table.add_row("remote", "[bold green]True[/bold green] (submitting to platform)")
553
+ if self.gateway:
554
+ table.add_row("gateway", "[bold green]True[/bold green] (routing via HUD Gateway)")
555
+ if self.byok:
556
+ table.add_row("byok", "[bold green]True[/bold green] (remote only)")
557
+
558
+ # Tool filters (only if set)
559
+ if self.allowed_tools:
560
+ table.add_row("allowed_tools", ", ".join(self.allowed_tools))
561
+ if self.disallowed_tools:
562
+ table.add_row("disallowed_tools", ", ".join(self.disallowed_tools))
563
+
564
+ # Agent config section
565
+ if self.agent_type:
566
+ table.add_row("", "")
567
+ table.add_row(f"[dim]{self.agent_type.value} config[/dim]", "")
568
+
569
+ config_cls = self.agent_type.config_cls
570
+ defaults = config_cls()
571
+ overrides = self.agent_config.get(self.agent_type.value, {})
572
+ skip = {
573
+ "model_client",
574
+ "model_name",
575
+ "validate_api_key",
576
+ "model_config",
577
+ "allowed_tools",
578
+ "disallowed_tools",
579
+ "system_prompt",
580
+ "response_tool_name",
581
+ "append_setup_output",
582
+ "initial_screenshot",
583
+ }
584
+
585
+ sensitive_fields = {"api_key", "api_secret", "token", "password", "secret"}
586
+
587
+ for name in config_cls.model_fields:
588
+ if name in skip:
589
+ continue
590
+ # Always show model
591
+ if name == "model":
592
+ if self.model:
593
+ value = self.model
594
+ elif overrides.get("model"):
595
+ value = overrides["model"]
596
+ else:
597
+ value = getattr(defaults, "model", None)
598
+ table.add_row(" model", str(value) if value else "—")
599
+ elif name in overrides:
600
+ value = overrides[name]
601
+ if name in sensitive_fields and value:
602
+ display_value = f"{str(value)[:4]}****" if len(str(value)) > 4 else "****"
603
+ else:
604
+ display_value = str(value)
605
+ table.add_row(f" {name}", display_value)
606
+
607
+ hud_console.console.print(table)
608
+
609
+
610
+ # =============================================================================
611
+ # Evaluation runner
612
+ # =============================================================================
613
+
614
+
615
+ async def _run_evaluation(cfg: EvalConfig) -> tuple[list[Any], list[Any]]:
616
+ """Run evaluation with the given config using run_dataset()."""
617
+ from hud.datasets import load_tasks, run_dataset
618
+
619
+ if cfg.source is None or cfg.agent_type is None:
620
+ raise ValueError("source and agent_type must be set")
621
+
622
+ # Load tasks using unified loader (handles v4→v5 conversion automatically)
623
+ hud_console.info(f"📊 Loading tasks from: {cfg.source}…")
624
+ tasks = load_tasks(cfg.source)
369
625
 
370
626
  if not tasks:
371
- hud_console.error(f"No tasks found in: {source}")
627
+ hud_console.error(f"No tasks found in: {cfg.source}")
372
628
  raise typer.Exit(1)
373
629
 
374
- # Convert Task objects to dicts for dataset runners
375
- dataset_or_tasks = [task.model_dump() for task in tasks]
376
-
377
- # Determine dataset name
378
- path = Path(source)
379
- dataset_name = f"Dataset: {path.name}" if path.exists() else source.split("/")[-1]
380
-
381
- # Build agent class + config for run_dataset
382
- if agent_type == "integration_test": # --integration-test mode
383
- from hud.agents.misc.integration_test_agent import IntegrationTestRunner
630
+ # Filter by task IDs if provided
631
+ if cfg.task_ids:
632
+ id_set = set(cfg.task_ids)
633
+ # Match by task.id or index
634
+ filtered = [t for i, t in enumerate(tasks) if t.id in id_set or str(i) in id_set]
635
+ if not filtered:
636
+ hud_console.error(f"No tasks found matching IDs: {', '.join(cfg.task_ids)}")
637
+ raise typer.Exit(1)
638
+ hud_console.info(f"Filtered to {len(filtered)} task(s) by ID")
639
+ tasks = filtered
640
+ elif not cfg.all:
641
+ # Single task mode (no --all, --full, or --task-ids)
642
+ tasks = [tasks[0]]
643
+ hud_console.info("Using first task (run with --full or --task-ids for more)…")
644
+
645
+ hud_console.info(f"Loaded {len(tasks)} task(s)")
646
+
647
+ # Prepare agent kwargs
648
+ agent_kwargs = cfg.get_agent_kwargs()
649
+ auto_respond = cfg.auto_respond
650
+ if auto_respond:
651
+ agent_kwargs = {**agent_kwargs, "auto_respond": True}
652
+
653
+ max_steps = cfg.max_steps
654
+
655
+ # Remote execution - submit to HUD platform
656
+ if cfg.remote:
657
+ agent_kwargs = {
658
+ k: v for k, v in agent_kwargs.items() if k not in ("api_key", "model_client")
659
+ }
660
+ import uuid
384
661
 
385
- agent_class = IntegrationTestRunner
386
- agent_config = {"verbose": verbose}
387
- elif agent_type == "vllm":
388
- try:
389
- from hud.agents.openai_chat_generic import GenericOpenAIChatAgent
662
+ from hud.datasets.utils import submit_rollouts
663
+ from hud.eval.manager import _send_job_enter
390
664
 
391
- agent_class = GenericOpenAIChatAgent
392
- except ImportError as e:
393
- hud_console.error(
394
- "OpenAI dependencies are not installed. "
395
- "Please install with: pip install 'hud-python[agent]'"
396
- )
397
- raise typer.Exit(1) from e
398
-
399
- # Use build_agent to create a sample agent to get the config
400
- sample_agent = build_agent(
401
- agent_type,
402
- model=model,
403
- allowed_tools=allowed_tools,
404
- verbose=verbose,
405
- vllm_base_url=vllm_base_url,
665
+ job_id = str(uuid.uuid4())
666
+ hud_console.info(
667
+ f"Submitting {len(tasks)} task(s) for remote execution (job_id: {job_id})…"
406
668
  )
407
669
 
408
- # Extract the config from the sample agent
409
- agent_config: dict[str, Any] = {
410
- "openai_client": sample_agent.oai,
411
- "model_name": sample_agent.model_name,
412
- "verbose": verbose,
413
- "completion_kwargs": sample_agent.completion_kwargs,
414
- }
415
- if allowed_tools:
416
- agent_config["allowed_tools"] = allowed_tools
417
- elif agent_type == "openai":
418
- try:
419
- from hud.agents import OperatorAgent
420
-
421
- agent_class = OperatorAgent
422
- except ImportError as e:
423
- hud_console.error(
424
- "OpenAI agent dependencies are not installed. "
425
- "Please install with: pip install 'hud-python[agent]'"
670
+ if cfg.taskset:
671
+ tasks_to_create = [t for t in tasks if not t.id]
672
+ tasks_data = (
673
+ [t.model_dump(mode="json", exclude_none=True) for t in tasks_to_create]
674
+ if tasks_to_create
675
+ else None
426
676
  )
427
- raise typer.Exit(1) from e
428
-
429
- agent_config = {"verbose": verbose}
430
- if allowed_tools:
431
- agent_config["allowed_tools"] = allowed_tools
432
-
433
- elif agent_type == "litellm":
434
- try:
435
- from hud.agents.lite_llm import LiteAgent
436
-
437
- agent_class = LiteAgent
438
- except ImportError as e:
439
- hud_console.error(
440
- "LiteLLM agent dependencies are not installed. "
441
- "Please install with: pip install 'hud-python[agent]'"
677
+ ids = await _send_job_enter(
678
+ job_id=job_id,
679
+ name=f"eval ({cfg.source})" if cfg.source else "eval",
680
+ variants=None,
681
+ group=cfg.group_size,
682
+ api_key=None,
683
+ taskset=cfg.taskset,
684
+ tasks=tasks_data,
442
685
  )
443
- raise typer.Exit(1) from e
686
+ if ids:
687
+ if len(ids) != len(tasks_to_create):
688
+ hud_console.warning(
689
+ f"Task count mismatch: sent {len(tasks_to_create)} tasks, "
690
+ f"received {len(ids)} IDs. Some tasks may not be linked."
691
+ )
692
+ for task_obj, task_version_id in zip(tasks_to_create, ids, strict=False):
693
+ task_obj.id = task_version_id
694
+
695
+ await submit_rollouts(
696
+ tasks=tasks,
697
+ job_id=job_id,
698
+ agent_type=cfg.agent_type,
699
+ agent_params=agent_kwargs,
700
+ max_steps=max_steps,
701
+ group_size=cfg.group_size,
702
+ use_byok=cfg.byok,
703
+ )
444
704
 
445
- agent_config = {
446
- "model_name": model or "gpt-4o-mini",
447
- "verbose": verbose,
448
- }
449
- if allowed_tools:
450
- agent_config["allowed_tools"] = allowed_tools
705
+ hud_console.success(f"Tasks submitted. View at: https://hud.ai/jobs/{job_id}")
706
+ return [], tasks
451
707
 
708
+ # Single task mode - show extra info
709
+ if len(tasks) == 1 and cfg.group_size == 1:
710
+ logging.getLogger("hud.agents").setLevel(logging.INFO)
711
+ logging.getLogger("hud.agents.base").setLevel(logging.INFO)
712
+ # Get prompt from args (v4 tasks) or show scenario name
713
+ prompt = tasks[0].args.get("prompt") if tasks[0].args else tasks[0].scenario
714
+ if prompt:
715
+ hud_console.info(f"Prompt: {prompt}")
452
716
  else:
453
- try:
454
- from hud.agents import ClaudeAgent
717
+ hud_console.info(
718
+ f"🚀 Running evaluation (max_concurrent: {cfg.max_concurrent}, "
719
+ f"group_size: {cfg.group_size})…"
720
+ )
455
721
 
456
- agent_class = ClaudeAgent
457
- except ImportError as e:
458
- hud_console.error(
459
- "Claude agent dependencies are not installed. "
460
- "Please install with: pip install 'hud-python[agent]'"
461
- )
462
- raise typer.Exit(1) from e
722
+ # Run using run_dataset
723
+ results = await run_dataset(
724
+ tasks,
725
+ cfg.agent_type,
726
+ agent_params=agent_kwargs,
727
+ max_steps=max_steps,
728
+ max_concurrent=cfg.max_concurrent,
729
+ group_size=cfg.group_size,
730
+ quiet=cfg.quiet,
731
+ taskset=cfg.taskset,
732
+ )
463
733
 
464
- agent_config = {
465
- "model": model or "claude-sonnet-4-20250514",
466
- "verbose": verbose,
467
- }
468
- if allowed_tools:
469
- agent_config["allowed_tools"] = allowed_tools
470
-
471
- # Use grouped evaluation if group_size > 1
472
- if group_size > 1:
473
- hud_console.info(f"🔄 Running dataset with group_size={group_size}")
474
-
475
- # Run with job tracking
476
- with hud.job(
477
- name=f"Evaluation {dataset_name} (group_size={group_size})",
478
- metadata={
479
- "dataset": source,
480
- "group_size": group_size,
481
- "tasks": len(dataset_or_tasks),
482
- "total_episodes": len(dataset_or_tasks) * group_size,
483
- },
484
- ) as job:
485
- # Convert dicts to Task objects if needed
486
- from hud.datasets import Task
487
-
488
- tasks = []
489
- for item in dataset_or_tasks:
490
- if isinstance(item, dict):
491
- tasks.append(Task(**item))
492
- else:
493
- tasks.append(item)
494
-
495
- stats = await run_tasks_grouped(
496
- tasks=tasks,
497
- agent_class=agent_class,
498
- agent_config=agent_config,
499
- group_size=group_size,
500
- max_parallel_episodes=max_concurrent
501
- if not parallel
502
- else max_concurrent_per_worker * (max_workers or 4),
503
- max_steps=max_steps,
504
- verbose=verbose,
505
- job_id=job.id,
506
- )
734
+ # Show reward for single task
735
+ if len(tasks) == 1 and cfg.group_size == 1 and results:
736
+ hud_console.success(f"Reward: {results[0].reward}")
507
737
 
508
- # Display results
509
- display_group_statistics(stats, show_details=len(stats) <= 50)
738
+ return results, tasks
510
739
 
511
- # Return stats for consistency with other modes
512
- return stats
513
740
 
514
- # Original logic for non-grouped evaluation
515
- elif parallel:
516
- hud_console.info(
517
- f"🚀 Running PARALLEL evaluation (workers: {max_workers or 'auto'}, max_concurrent: {max_concurrent})…" # noqa: E501
518
- )
519
- if max_workers is None:
520
- # Use auto-optimization (now the default run_dataset_parallel)
521
- return await run_dataset_parallel(
522
- name=f"Evaluation {dataset_name}",
523
- dataset=dataset_or_tasks,
524
- agent_class=agent_class,
525
- agent_config=agent_config,
526
- max_concurrent=max_concurrent,
527
- metadata={"dataset": source, "parallel": True},
528
- max_steps=max_steps,
529
- auto_respond=True,
530
- )
531
- else:
532
- # Use manual configuration
533
- return await run_dataset_parallel_manual(
534
- name=f"Evaluation {dataset_name}",
535
- dataset=dataset_or_tasks,
536
- agent_class=agent_class,
537
- agent_config=agent_config,
538
- max_workers=max_workers,
539
- max_concurrent_per_worker=max_concurrent_per_worker,
540
- max_concurrent=max_concurrent,
541
- metadata={"dataset": source, "parallel": True},
542
- max_steps=max_steps,
543
- auto_respond=True,
544
- )
545
- else:
546
- hud_console.info(f"🚀 Running evaluation (max_concurrent: {max_concurrent})…")
547
- return await run_dataset(
548
- name=f"Evaluation {dataset_name}",
549
- dataset=dataset_or_tasks,
550
- agent_class=agent_class,
551
- agent_config=agent_config,
552
- max_concurrent=max_concurrent,
553
- metadata={"dataset": source},
554
- max_steps=max_steps,
555
- )
741
+ # =============================================================================
742
+ # CLI command
743
+ # =============================================================================
556
744
 
557
745
 
558
746
  def eval_command(
-    source: str = typer.Argument(
-        ...,
-        help="HuggingFace dataset identifier (e.g. 'hud-evals/SheetBench-50'), JSON file (array of tasks), or JSONL file (one task per line)",  # noqa: E501
+    source: str | None = typer.Argument(None, help="HuggingFace dataset or task JSON file"),
+    agent: str | None = typer.Argument(
+        None,
+        help="Agent: claude, openai, operator, gemini, gemini_cua, openai_compatible, integration_test",  # noqa: E501
     ),
+    all: bool = typer.Option(False, "--all", help="Run all problems instead of just 1"),
     full: bool = typer.Option(
         False,
         "--full",
-        help="Run the entire dataset (omit for single-task debug mode)",
+        help="Run the entire dataset. Shortcut for --all --auto-respond --max-steps 100",
     ),
-    agent: Literal["claude", "openai", "vllm", "litellm", "integration_test"] = typer.Option(
-        "claude",
-        "--agent",
-        help="Agent backend to use (claude, openai, vllm for local server, or litellm)",
-    ),
-    model: str | None = typer.Option(
-        None,
-        "--model",
-        help="Model name for the chosen agent",
+    model: str | None = typer.Option(None, "--model", "-m", help="Model name"),
+    config: list[str] | None = typer.Option(  # noqa: B008
+        None, "--config", "-c", help="Agent config: key=value"
     ),
+    # Task-overridable settings
     allowed_tools: str | None = typer.Option(
-        None,
-        "--allowed-tools",
-        help="Comma-separated list of allowed tools",
+        None, "--allowed-tools", help="Comma-separated allowed tools"
     ),
-    max_concurrent: int = typer.Option(
-        30,
-        "--max-concurrent",
-        help="Concurrency level for asyncio mode (ignored in parallel mode)",
+    disallowed_tools: str | None = typer.Option(
+        None, "--disallowed-tools", help="Comma-separated disallowed tools"
     ),
-    max_steps: int | None = typer.Option(
-        None,
-        "--max-steps",
-        help="Maximum steps per task (default: 10 for single, 50 for full)",
+    # Eval settings
+    max_concurrent: int | None = typer.Option(
+        None, "--max-concurrent", help="Max concurrent tasks"
     ),
-    parallel: bool = typer.Option(
+    max_steps: int | None = typer.Option(None, "--max-steps", help="Max steps per task"),
+    verbose: bool = typer.Option(False, "--verbose", "-v", help="Verbose output"),
+    very_verbose: bool = typer.Option(False, "--very-verbose", "-vv", help="Debug logs"),
+    auto_respond: bool = typer.Option(
         False,
-        "--parallel",
-        help="Use process-based parallel execution for large datasets (100+ tasks)",
+        "--auto-respond",
+        help="Automatically prompt the agent to continue if it does not respond with a tool call",
     ),
-    max_workers: int | None = typer.Option(
-        None,
-        "--max-workers",
-        help="Number of worker processes for parallel mode (auto-optimized if not set)",
-    ),
-    max_concurrent_per_worker: int = typer.Option(
-        20,
-        "--max-concurrent-per-worker",
-        help="Maximum concurrent tasks per worker in parallel mode",
+    group_size: int | None = typer.Option(None, "--group-size", help="Runs per task"),
+    task_ids: str | None = typer.Option(None, "--task-ids", help="Comma-separated task IDs to run"),
+    yes: bool = typer.Option(False, "--yes", "-y", help="Skip confirmation"),
+    remote: bool = typer.Option(
+        False, "--remote", help="Submit tasks to platform for remote execution"
     ),
-    verbose: bool = typer.Option(
+    byok: bool = typer.Option(
         False,
-        "--verbose",
-        help="Enable verbose output from the agent",
+        "--byok",
+        help="Remote only: use BYOK keys from encrypted env vars for inference",
     ),
-    very_verbose: bool = typer.Option(
-        False,
-        "--very-verbose",
-        "-vv",
-        help="Enable debug-level logs for maximum visibility",
+    quiet: bool = typer.Option(
+        False, "--quiet", "-q", help="Suppress opening browser for eval links"
     ),
-    vllm_base_url: str | None = typer.Option(
-        None,
-        "--vllm-base-url",
-        help="Base URL for vLLM server (when using --agent vllm)",
+    gateway: bool = typer.Option(
+        False, "--gateway", "-g", help="Route LLM API calls through HUD Gateway"
     ),
-    group_size: int = typer.Option(
-        1,
-        "--group-size",
-        help="Number of times to run each task (similar to RL training)",
-    ),
-    integration_test: bool = typer.Option(
-        False,
-        "--integration-test",
-        help=(
-            "Run integration_test_tool tool, where problem is setup, "
-            "actions are applied, and evaluation is performed, without "
-            "spinning up an agent"
-        ),
+    taskset: str | None = typer.Option(
+        None, "--taskset", "-t", help="Taskset slug to associate job with"
     ),
 ) -> None:
     """🚀 Run evaluation on datasets or individual tasks with agents.

     Examples:
-        # Evaluate a single task from SheetBench
-        hud eval hud-evals/SheetBench-50
-
-        # Evaluate the FULL SheetBench dataset with Claude (asyncio mode)
-        hud eval hud-evals/SheetBench-50 --full --agent claude
-
-        # Run large dataset with PARALLEL execution (auto-optimized)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel
-
-        # Parallel mode with manual configuration (16 workers, 25 tasks each)
-        hud eval hud-evals/OSWorld-Verified-XLang --full --parallel --max-workers 16
-
-        # Limit total concurrent tasks to prevent rate limits
-        hud eval hud-evals/SheetBench-50 --full --parallel --max-concurrent 20
-
-        # Run a single task from a JSON file
-        hud eval task.json
-
-        # Run multiple tasks from a JSON file with parallel execution
-        hud eval tasks.json --full --parallel
+        hud eval tasks.json claude
+        hud eval hud-evals/SheetBench-50 claude --full
+        hud eval tasks.json claude --config max_tokens=32768
+        hud eval tasks.json openai --config temperature=0.7
+        hud eval tasks.json claude --full --remote  # Remote execution
+        hud eval tasks.json claude --gateway  # Route LLM calls through HUD Gateway
+    """
+    hud_console.info("🔧 Initializing evaluation...")
+
+    # Load config and merge CLI args
+    cfg = EvalConfig.load().merge_cli(
+        source=source,
+        agent=agent,
+        model=model,
+        all=all,
+        full=full,
+        max_concurrent=max_concurrent,
+        max_steps=max_steps,
+        allowed_tools=allowed_tools,
+        disallowed_tools=disallowed_tools,
+        task_ids=task_ids,
+        verbose=verbose,
+        very_verbose=very_verbose,
+        auto_respond=auto_respond,
+        group_size=group_size,
+        config=config,
+        remote=remote,
+        byok=byok,
+        quiet=quiet,
+        gateway=gateway,
+        taskset=taskset,
+    )

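How `EvalConfig.load().merge_cli(...)` resolves precedence is not shown in this hunk. A plausible reading, sketched with hypothetical helpers (`parse_kv` and `merge` are stand-ins, not hud-python APIs), is that the persisted config loads first and only explicitly passed CLI values override it, with `--config key=value` pairs parsed into a dict:

    # Hypothetical sketch of "file config first, explicit CLI flags win"
    def parse_kv(pairs: list[str] | None) -> dict[str, str]:
        out: dict[str, str] = {}
        for pair in pairs or []:
            key, _, value = pair.partition("=")
            out[key.strip()] = value.strip()
        return out

    def merge(file_cfg: dict, **cli: object) -> dict:
        merged = dict(file_cfg)
        # None means "flag not given", so file values survive
        merged.update({k: v for k, v in cli.items() if v is not None})
        return merged

    cfg = merge({"agent": "claude"}, model=None, agent_config=parse_kv(["max_tokens=32768"]))
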
-        # Run with OpenAI Operator agent
-        hud eval hud-evals/OSWorld-Gold-Beta --agent openai
+    # Find source if not provided
+    if cfg.source is None:
+        try:
+            from hud.cli.utils.tasks import find_tasks_file

-        # Use local vLLM server (default: localhost:8000)
-        hud eval task.json --agent vllm --model Qwen/Qwen2.5-VL-3B-Instruct
+            cfg = cfg.model_copy(
+                update={"source": find_tasks_file(None, msg="Select a tasks file")}
+            )
+            hud_console.success(f"Selected: {cfg.source}")
+        except Exception:
+            hud_console.error("No source provided and no task files found")
+            raise typer.Exit(1) from None

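The `cfg.model_copy(update=...)` call follows Pydantic v2's immutable-update idiom: it returns a new model instance with only the named fields replaced. A self-contained illustration (a generic Pydantic example, not hud code):

    from pydantic import BaseModel

    class Cfg(BaseModel):
        source: str | None = None
        agent: str = "claude"

    cfg = Cfg()
    updated = cfg.model_copy(update={"source": "tasks.json"})
    assert cfg.source is None  # original is untouched
    assert updated.source == "tasks.json"
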
-        # Use custom vLLM server URL
-        hud eval task.json --agent vllm --vllm-base-url http://192.168.1.100:8000/v1
+    # Resolve agent interactively if needed
+    cfg = cfg.resolve_agent_interactive()

-        # Run with verbose output for debugging
-        hud eval task.json --verbose
-    """
-    from hud.settings import settings
-
-    if very_verbose:
-        logging.basicConfig(
-            level=logging.DEBUG,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
+    # Configure logging
+    if cfg.very_verbose:
+        logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(message)s")
         logging.getLogger("hud.agents").setLevel(logging.DEBUG)
-        logging.getLogger("hud.agents.base").setLevel(logging.DEBUG)
-    elif verbose:
-        logging.basicConfig(
-            level=logging.INFO,
-            format="%(asctime)s - %(name)s - %(message)s",
-            datefmt="%H:%M:%S",
-        )
+        # Suppress noisy HTTP client logs
+        logging.getLogger("httpx").setLevel(logging.WARNING)
+        logging.getLogger("httpcore").setLevel(logging.WARNING)
+    elif cfg.verbose:
         logging.getLogger("hud.agents").setLevel(logging.INFO)
-        logging.getLogger("hud.agents.base").setLevel(logging.INFO)

-    # We pass integration_test as the agent_type
-    if integration_test:
-        agent = "integration_test"
+    # Validate API keys
+    cfg.validate_api_keys()

-    # Check for required API keys
-    if agent == "claude":
-        if not settings.anthropic_api_key:
-            hud_console.error("ANTHROPIC_API_KEY is required for Claude agent")
-            hud_console.info(
-                "Set it in your environment or run: hud set ANTHROPIC_API_KEY=your-key-here"
-            )
-            raise typer.Exit(1)
-    elif agent == "openai" and not settings.openai_api_key:
-        hud_console.error("OPENAI_API_KEY is required for OpenAI agent")
-        hud_console.info("Set it in your environment or run: hud set OPENAI_API_KEY=your-key-here")
+    # Display and confirm
+    cfg.display()
+
+    if not yes and not questionary.confirm("Proceed?", default=True, qmark="").ask():
+        hud_console.info("Cancelled.")
         raise typer.Exit(1)
-    elif agent == "vllm":
-        if model:
-            hud_console.info(f"Using vLLM with model: {model}")
-        else:
-            hud_console.error("Model name is required for vLLM agent, specify with --model")
-            raise typer.Exit(1)

-    # Check for HUD_API_KEY if using HUD services
-    if not settings.api_key:
-        hud_console.warning("HUD_API_KEY not set. Some features may be limited.")
-        hud_console.info("Get your API key at: https://hud.so")
-        hud_console.info("Set it in your environment or run: hud set HUD_API_KEY=your-key-here")
+    # Run
+    start_time = time.time()
+    try:
+        results, tasks = asyncio.run(_run_evaluation(cfg))
+    except ValueError as e:
+        hud_console.error(str(e))
+        raise typer.Exit(1) from None
+    elapsed = time.time() - start_time

-    # Parse allowed tools
-    allowed_tools_list = (
-        [t.strip() for t in allowed_tools.split(",") if t.strip()] if allowed_tools else None
-    )
+    if cfg.remote:
+        return

-    # Set default max_steps if not provided
-    if max_steps is None:
-        max_steps = 50 if full else 10
-
-    # Run evaluation
-    if full:
-        asyncio.run(
-            run_full_dataset(
-                source,
-                agent_type=agent,
-                model=model,
-                allowed_tools=allowed_tools_list,
-                max_concurrent=max_concurrent,
-                max_steps=max_steps,
-                parallel=parallel,
-                max_workers=max_workers,
-                max_concurrent_per_worker=max_concurrent_per_worker,
-                verbose=very_verbose or verbose,
-                vllm_base_url=vllm_base_url,
-                group_size=group_size,
-            )
-        )
-    else:
-        asyncio.run(
-            run_single_task(
-                source,
-                agent_type=agent,
-                model=model,
-                allowed_tools=allowed_tools_list,
-                max_steps=max_steps,
-                verbose=very_verbose or verbose,
-                vllm_base_url=vllm_base_url,
-                group_size=group_size,
-            )
-        )
+    from hud.datasets import display_results
+
+    display_results(results, tasks=tasks, elapsed=elapsed, show_details=len(results) <= 50)