hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
@@ -3,96 +3,91 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
- from typing import Any, ClassVar
6
+ from typing import TYPE_CHECKING, Any, ClassVar
7
+
8
+ from pydantic import ConfigDict, field_validator
7
9
 
8
- from hud import instrument
9
10
  from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
10
11
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult
11
-
12
- from .openai_chat_generic import GenericOpenAIChatAgent
13
-
14
-
15
- class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
16
- """OpenAI agent that uses a separate grounding model for element detection.
17
-
18
- This agent:
19
- - Exposes only a synthetic "computer" tool to the planning model
20
- - Intercepts tool calls to ground element descriptions to coordinates
21
- - Converts grounded results to real computer tool calls
22
- - Maintains screenshot state for grounding operations
23
-
24
- The architecture separates concerns:
25
- - Planning model (GPT-4o etc) focuses on high-level reasoning
26
- - Grounding model (Qwen2-VL etc) handles visual element detection
27
- """
28
-
29
- metadata: ClassVar[dict[str, Any]] = {}
30
-
31
- def __init__(
32
- self,
33
- *,
34
- grounder_config: GrounderConfig,
35
- model_name: str = "gpt-4o-mini",
36
- allowed_tools: list[str] | None = None,
37
- append_setup_output: bool = False,
38
- system_prompt: str | None = None,
39
- **kwargs: Any,
40
- ) -> None:
41
- """Initialize the grounded OpenAI agent.
42
-
43
- Args:
44
- grounder_config: Configuration for the grounding model
45
- openai_client: OpenAI client for the planning model
46
- model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
47
- real_computer_tool_name: Name of the actual computer tool to execute
48
- **kwargs: Additional arguments passed to GenericOpenAIChatAgent
49
- """
50
- # Set defaults for grounded agent
51
- if allowed_tools is None:
52
- allowed_tools = ["computer"]
53
-
54
- if system_prompt is None:
55
- system_prompt = (
56
- "You are a helpful AI assistant that can control the computer "
57
- "through visual interaction.\n\n"
58
- "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
59
- "1. First, describe what you see on the screen\n"
60
- "2. Explain what you plan to do and why\n"
61
- "3. Then use the computer tool with natural language descriptions\n\n"
62
- "For example:\n"
63
- "- 'I can see a login form with username and password fields. "
64
- "I need to click on the username field first.'\n"
65
- "- 'There's a blue submit button at the bottom. "
66
- "I'll click on it to submit the form.'\n"
67
- "- 'I notice a red close button in the top right corner. "
68
- "I'll click it to close this dialog.'\n\n"
69
- "Use descriptive element descriptions like:\n"
70
- "- Colors: 'red button', 'blue link', 'green checkmark'\n"
71
- "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
72
- "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
73
- "- Element type: 'text field', 'dropdown menu', 'checkbox'"
74
- )
75
-
76
- super().__init__(
77
- model_name=model_name,
78
- allowed_tools=allowed_tools,
79
- append_setup_output=append_setup_output,
80
- system_prompt=system_prompt,
81
- **kwargs,
82
- )
83
-
84
- self.grounder = Grounder(grounder_config)
85
- self.grounded_tool = None
86
-
87
- async def initialize(self, task: Any = None) -> None:
88
- """Initialize the agent and create the grounded tool with mcp_client."""
89
- # Call parent initialization first
90
- await super().initialize(task)
91
-
92
- if self.mcp_client is None:
93
- raise ValueError("mcp_client must be initialized before creating grounded tool")
12
+ from hud.utils.types import with_signature
13
+
14
+ if TYPE_CHECKING:
15
+ from hud.types import BaseAgentConfig
16
+ from .base import BaseCreateParams
17
+ from .openai_chat import OpenAIChatAgent, OpenAIChatConfig
18
+
19
+ DEFAULT_GROUNDED_PROMPT = (
20
+ "You are a helpful AI assistant that can control the computer through visual "
21
+ "interaction.\n\n"
22
+ "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
23
+ "1. First, describe what you see on the screen.\n"
24
+ "2. Explain what you plan to do and why.\n"
25
+ "3. Then use the computer tool with natural language descriptions.\n\n"
26
+ "Use descriptive element descriptions:\n"
27
+ '- Colors ("red button", "blue link")\n'
28
+ '- Position ("top right corner", "left sidebar")\n'
29
+ '- Text content ("Submit button", "Login link")\n'
30
+ '- Element type ("text field", "dropdown")'
31
+ )
32
+
33
+
34
+ class GroundedOpenAIConfig(OpenAIChatConfig):
35
+ """Configuration for grounded OpenAI chat agent."""
36
+
37
+ model_config = ConfigDict(arbitrary_types_allowed=True)
38
+
39
+ grounder_config: GrounderConfig
40
+ model: str = "gpt-4o-mini"
41
+ allowed_tools: list[str] | None = None # Default set in validator
42
+ append_setup_output: bool = False
43
+ system_prompt: str | None = DEFAULT_GROUNDED_PROMPT
44
+
45
+ @field_validator("grounder_config", mode="before")
46
+ @classmethod
47
+ def _coerce_grounder_config(cls, value: GrounderConfig | dict[str, Any]) -> GrounderConfig:
48
+ if isinstance(value, GrounderConfig):
49
+ return value
50
+ if isinstance(value, dict):
51
+ return GrounderConfig(**value)
52
+
53
+ @field_validator("allowed_tools", mode="before")
54
+ @classmethod
55
+ def _default_allowed_tools(cls, value: list[str] | None) -> list[str] | None:
56
+ if value is None:
57
+ return ["computer"]
58
+ return value
59
+
60
+
61
+ class GroundedOpenAICreateParams(BaseCreateParams, GroundedOpenAIConfig):
62
+ pass
63
+
64
+
65
+ class GroundedOpenAIChatAgent(OpenAIChatAgent):
66
+ """OpenAI chat agent that pipes 'computer' tool calls through a vision grounder."""
67
+
68
+ metadata: ClassVar[dict[str, Any] | None] = None
69
+ config_cls: ClassVar[type[BaseAgentConfig]] = GroundedOpenAIConfig
70
+
71
+ @with_signature(GroundedOpenAICreateParams)
72
+ @classmethod
73
+ def create(cls, **kwargs: Any) -> GroundedOpenAIChatAgent: # pyright: ignore[reportIncompatibleMethodOverride]
74
+ from .base import MCPAgent
75
+
76
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
77
+
78
+ def __init__(self, params: GroundedOpenAICreateParams | None = None, **kwargs: Any) -> None:
79
+ super().__init__(params, **kwargs) # type: ignore[arg-type]
80
+ self.config: GroundedOpenAIConfig # type: ignore[assignment]
81
+
82
+ self.grounder = Grounder(self.config.grounder_config)
83
+ self.grounded_tool: GroundedComputerTool | None = None
84
+
85
+ def _on_tools_ready(self) -> None:
86
+ """Create the grounded tool after context is bound."""
87
+ if self.ctx is None:
88
+ raise ValueError("ctx must be set before creating grounded tool")
94
89
  self.grounded_tool = GroundedComputerTool(
95
- grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
90
+ grounder=self.grounder, ctx=self.ctx, computer_tool_name="computer"
96
91
  )
97
92
 
98
93
  def get_tool_schemas(self) -> list[Any]:
@@ -108,11 +103,6 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
108
103
  return []
109
104
  return [self.grounded_tool.get_openai_tool_schema()]
110
105
 
111
- @instrument(
112
- span_type="agent",
113
- record_args=False,
114
- record_result=True,
115
- )
116
106
  async def get_response(self, messages: Any) -> AgentResponse:
117
107
  """Get response from the planning model and handle grounded tool calls.
118
108
 
@@ -142,11 +132,9 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
142
132
  )
143
133
 
144
134
  if not has_image:
145
- if self.mcp_client is None:
146
- raise ValueError("mcp_client is not initialized")
147
- screenshot_result = await self.mcp_client.call_tool(
148
- MCPToolCall(name="computer", arguments={"action": "screenshot"})
149
- )
135
+ if self.ctx is None:
136
+ raise ValueError("ctx is not initialized")
137
+ screenshot_result = await self.ctx.call_tool(("computer", {"action": "screenshot"}))
150
138
 
151
139
  for block in screenshot_result.content:
152
140
  # Check for ImageContent type from MCP
@@ -169,8 +157,8 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
169
157
  protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
170
158
  extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
171
159
 
172
- response = await self.oai.chat.completions.create(
173
- model=self.model_name,
160
+ response = await self.oai.chat.completions.create( # type: ignore
161
+ model=self.config.model,
174
162
  messages=messages,
175
163
  tools=tool_schemas,
176
164
  parallel_tool_calls=False,
@@ -193,6 +181,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
193
181
  if not msg.tool_calls:
194
182
  return AgentResponse(
195
183
  content=msg.content or "",
184
+ reasoning=msg.reasoning_content,
196
185
  tool_calls=[],
197
186
  done=choice.finish_reason in ("stop", "length"),
198
187
  raw=response,
@@ -203,6 +192,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
203
192
  if tc.function.name != "computer":
204
193
  return AgentResponse(
205
194
  content=f"Error: Model called unexpected tool '{tc.function.name}'",
195
+ reasoning=msg.reasoning_content,
206
196
  tool_calls=[],
207
197
  done=True,
208
198
  raw=response,
@@ -213,13 +203,21 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
213
203
  args = json.loads(tc.function.arguments or "{}")
214
204
  except json.JSONDecodeError:
215
205
  return AgentResponse(
216
- content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
206
+ content="Error: Invalid tool arguments",
207
+ reasoning=msg.reasoning_content,
208
+ tool_calls=[],
209
+ done=True,
210
+ raw=response,
217
211
  )
218
212
 
219
213
  tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
220
214
 
221
215
  return AgentResponse(
222
- content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
216
+ content=msg.content or "",
217
+ reasoning=msg.reasoning_content,
218
+ tool_calls=[tool_call],
219
+ done=False,
220
+ raw=response,
223
221
  )
224
222
 
225
223
  async def call_tools(
@@ -1,41 +1,72 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
3
+ from typing import TYPE_CHECKING, Any, ClassVar
4
4
 
5
- from hud.agents.base import MCPAgent, find_reward
6
- from hud.types import AgentResponse, Task, Trace
5
+ from hud.agents.base import MCPAgent
6
+ from hud.types import AgentResponse, BaseAgentConfig, Trace
7
+
8
+ if TYPE_CHECKING:
9
+ from hud.eval.context import EvalContext
7
10
 
8
11
 
9
12
  class IntegrationTestRunner(MCPAgent):
13
+ """Special agent that runs integration tests by executing tools directly.
14
+
15
+ Unlike regular agents, this doesn't run an LLM loop - it executes
16
+ integration_test_tool and evaluate_tool in sequence to verify tool behavior.
17
+ """
18
+
19
+ metadata: ClassVar[dict[str, Any] | None] = {}
20
+ config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
21
+
10
22
  def __init__(self, **kwargs: Any) -> None:
11
23
  kwargs["auto_trace"] = False
12
24
  super().__init__(**kwargs)
13
- self.metadata = {}
14
25
 
15
- async def run(self, task: Task, max_steps: int = 10) -> Trace:
26
+ async def run(
27
+ self,
28
+ ctx: EvalContext,
29
+ *,
30
+ max_steps: int = 10,
31
+ ) -> Trace:
32
+ """Run integration test by executing tools directly.
33
+
34
+ The EvalContext should have integration_test_tool and evaluate_tool
35
+ configured in its metadata or environment setup.
36
+ """
37
+ from hud.eval.context import EvalContext
38
+
39
+ if not isinstance(ctx, EvalContext):
40
+ raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
41
+
42
+ self.ctx = ctx
43
+
16
44
  try:
17
- # Initialize using base to set up client and telemetry correctly
18
- await self.initialize(task)
45
+ # Initialize tools from context
46
+ if not self._initialized:
47
+ await self._initialize_from_ctx(ctx)
19
48
 
20
- # Validate task shape
21
- if not getattr(task, "integration_test_tool", None):
49
+ self.console.info(f"Full system prompt: {self.system_prompt}")
50
+
51
+ # For integration tests, we expect the context's environment to have
52
+ # _setup_calls, _integration_test_calls, and _evaluate_calls configured
53
+ env = ctx
54
+
55
+ # Run integration test tool (stored in environment metadata or separate list)
56
+ integration_test_calls = getattr(env, "_integration_test_calls", [])
57
+ if not integration_test_calls:
22
58
  raise ValueError(
23
- "--integration-test requires task.integration_test_tool (single call)"
59
+ "--integration-test requires integration_test_tool to be configured"
24
60
  )
25
- elif not getattr(task, "evaluate_tool", None):
26
- raise ValueError("--integration-test requires task.evaluate_tool (single call)")
27
-
28
- if task.setup_tool:
29
- _ = await self.call_tools(task.setup_tool)
30
61
 
31
- _ = await self.call_tools(task.integration_test_tool)
32
- evaluate_result = await self.call_tools(task.evaluate_tool)
62
+ for name, args in integration_test_calls:
63
+ await ctx.call_tool((name, args))
33
64
 
34
- reward = float(find_reward(evaluate_result[0])) if evaluate_result else 0.0
65
+ # The evaluate phase runs automatically when ctx exits,
66
+ # but we can also get the reward from ctx.reward after
67
+ return Trace(done=True, reward=ctx.reward or 0.0, info={})
35
68
 
36
- return Trace(done=True, reward=reward, info={})
37
69
  finally:
38
- # Ensure resources are cleaned up so the CLI can exit cleanly
39
70
  await self._cleanup()
40
71
 
41
72
  # Stub implementations to satisfy abstract base class; not used in --integration-test path
@@ -1,14 +1,37 @@
1
1
  from __future__ import annotations
2
2
 
3
- import os
3
+ import logging
4
4
  from typing import Literal
5
5
 
6
6
  from openai import AsyncOpenAI
7
7
 
8
8
  from hud.settings import settings
9
+ from hud.telemetry import instrument
10
+
11
+ logger = logging.getLogger(__name__)
9
12
 
10
13
  ResponseType = Literal["STOP", "CONTINUE"]
11
14
 
15
+ DEFAULT_SYSTEM_PROMPT = """\
16
+ You are an assistant that helps determine the appropriate response to an agent's message.
17
+
18
+ You will receive messages from an agent that is performing tasks for a user.
19
+ Your job is to analyze these messages and respond with one of the following:
20
+
21
+ - STOP: If the agent indicates it has successfully completed a task or is stuck,
22
+ struggling or says it cannot complete the task, even if phrased as a question
23
+ like "I have entered the right values into this form. Would you like me to do
24
+ anything else?" or "Here is the website. Is there any other information you
25
+ need?" or if the agent has strongly determined it wants to stop the task like
26
+ "The task is infeasible. Can I help you with something else?"
27
+
28
+ - CONTINUE: If the agent is asking for clarification before proceeding with a task
29
+ like "I'm about to clear cookies from this website. Would you like me to proceed?"
30
+ or "I've entered the right values into this form. Would you like me to continue
31
+ with the rest of the task?"
32
+
33
+ Respond ONLY with one of these two options."""
34
+
12
35
 
13
36
  class ResponseAgent:
14
37
  """
@@ -17,49 +40,36 @@ class ResponseAgent:
17
40
  """
18
41
 
19
42
  def __init__(
20
- self, api_key: str | None = None, model: str = "gpt-4o", system_prompt: str | None = None
43
+ self,
44
+ model: str = "gpt-4o",
45
+ system_prompt: str | None = None,
21
46
  ) -> None:
22
47
  """
23
48
  Initialize the ResponseAgent.
24
49
 
25
50
  Args:
26
- api_key: The API key to use for the OpenAI client
27
- model: The model to use for the OpenAI client (default: "gpt-4o")
28
- system_prompt: The system prompt to use for the OpenAI client
51
+ model: The model to use via HUD inference gateway (default: "gpt-4o").
52
+ Supports any model available through inference.hud.ai.
53
+ system_prompt: Optional custom system prompt for determining responses.
29
54
  """
30
- self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
31
- if not self.api_key:
55
+ api_key = settings.api_key
56
+ if not api_key:
32
57
  raise ValueError(
33
- "OpenAI API key must be provided or set as OPENAI_API_KEY environment variable"
58
+ "HUD API key is required for auto_respond. Set HUD_API_KEY environment variable."
34
59
  )
35
60
 
36
- self.client = AsyncOpenAI(api_key=self.api_key)
37
- self.model = model
38
-
39
- self.system_prompt = (
40
- system_prompt
41
- or """
42
- You are an assistant that helps determine the appropriate response to an agent's message.
43
-
44
- You will receive messages from an agent that is performing tasks for a user.
45
- Your job is to analyze these messages and respond with one of the following:
46
-
47
- - STOP: If the agent indicates it has successfully completed a task or is stuck,
48
- struggling or says it cannot complete the task, even if phrased as a question
49
- like "I have entered the right values into this form. Would you like me to do
50
- anything else?" or "Here is the website. Is there any other information you
51
- need?" or if the agent has strongly determined it wants to stop the task like
52
- "The task is infeasible. Can I help you with something else?"
53
-
54
- - CONTINUE: If the agent is asking for clarification before proceeding with a task
55
- like "I'm about to clear cookies from this website. Would you like me to proceed?"
56
- or "I've entered the right values into this form. Would you like me to continue
57
- with the rest of the task?"
58
-
59
- Respond ONLY with one of these two options.
60
- """
61
+ self.client: AsyncOpenAI = AsyncOpenAI(
62
+ base_url=settings.hud_gateway_url,
63
+ api_key=api_key,
61
64
  )
65
+ self.model = model
66
+ self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
62
67
 
68
+ @instrument(
69
+ category="agent",
70
+ name="response_agent",
71
+ internal_type="user-message",
72
+ )
63
73
  async def determine_response(self, agent_message: str) -> ResponseType:
64
74
  """
65
75
  Determine whether the agent should stop or continue based on its message.
@@ -80,8 +90,9 @@ class ResponseAgent:
80
90
  "content": f"Agent message: {agent_message}\n\nWhat is the appropriate response?", # noqa: E501
81
91
  },
82
92
  ],
83
- temperature=0.1, # Low temperature for more deterministic responses
84
- max_tokens=5, # We only need a short response
93
+ temperature=0.1,
94
+ max_tokens=5,
95
+ extra_headers={"Trace-Id": ""},
85
96
  )
86
97
 
87
98
  response_text = response.choices[0].message.content
@@ -96,5 +107,6 @@ class ResponseAgent:
96
107
  else:
97
108
  return "CONTINUE"
98
109
 
99
- except Exception:
110
+ except Exception as e:
111
+ logger.warning("Auto-respond failed: %s", e)
100
112
  return "CONTINUE" # Default to continue on error