hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274) hide show
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -3,96 +3,91 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  import json
6
- from typing import Any, ClassVar
6
+ from typing import TYPE_CHECKING, Any, ClassVar
7
+
8
+ from pydantic import ConfigDict, field_validator
7
9
 
8
- from hud import instrument
9
10
  from hud.tools.grounding import GroundedComputerTool, Grounder, GrounderConfig
10
11
  from hud.types import AgentResponse, MCPToolCall, MCPToolResult
11
-
12
- from .openai_chat_generic import GenericOpenAIChatAgent
13
-
14
-
15
- class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
16
- """OpenAI agent that uses a separate grounding model for element detection.
17
-
18
- This agent:
19
- - Exposes only a synthetic "computer" tool to the planning model
20
- - Intercepts tool calls to ground element descriptions to coordinates
21
- - Converts grounded results to real computer tool calls
22
- - Maintains screenshot state for grounding operations
23
-
24
- The architecture separates concerns:
25
- - Planning model (GPT-4o etc) focuses on high-level reasoning
26
- - Grounding model (Qwen2-VL etc) handles visual element detection
27
- """
28
-
29
- metadata: ClassVar[dict[str, Any]] = {}
30
-
31
- def __init__(
32
- self,
33
- *,
34
- grounder_config: GrounderConfig,
35
- model_name: str = "gpt-4o-mini",
36
- allowed_tools: list[str] | None = None,
37
- append_setup_output: bool = False,
38
- system_prompt: str | None = None,
39
- **kwargs: Any,
40
- ) -> None:
41
- """Initialize the grounded OpenAI agent.
42
-
43
- Args:
44
- grounder_config: Configuration for the grounding model
45
- openai_client: OpenAI client for the planning model
46
- model: Name of the OpenAI model to use for planning (e.g., "gpt-4o", "gpt-4o-mini")
47
- real_computer_tool_name: Name of the actual computer tool to execute
48
- **kwargs: Additional arguments passed to GenericOpenAIChatAgent
49
- """
50
- # Set defaults for grounded agent
51
- if allowed_tools is None:
52
- allowed_tools = ["computer"]
53
-
54
- if system_prompt is None:
55
- system_prompt = (
56
- "You are a helpful AI assistant that can control the computer "
57
- "through visual interaction.\n\n"
58
- "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
59
- "1. First, describe what you see on the screen\n"
60
- "2. Explain what you plan to do and why\n"
61
- "3. Then use the computer tool with natural language descriptions\n\n"
62
- "For example:\n"
63
- "- 'I can see a login form with username and password fields. "
64
- "I need to click on the username field first.'\n"
65
- "- 'There's a blue submit button at the bottom. "
66
- "I'll click on it to submit the form.'\n"
67
- "- 'I notice a red close button in the top right corner. "
68
- "I'll click it to close this dialog.'\n\n"
69
- "Use descriptive element descriptions like:\n"
70
- "- Colors: 'red button', 'blue link', 'green checkmark'\n"
71
- "- Position: 'top right corner', 'bottom of the page', 'left sidebar'\n"
72
- "- Text content: 'Submit button', 'Login link', 'Cancel option'\n"
73
- "- Element type: 'text field', 'dropdown menu', 'checkbox'"
74
- )
75
-
76
- super().__init__(
77
- model_name=model_name,
78
- allowed_tools=allowed_tools,
79
- append_setup_output=append_setup_output,
80
- system_prompt=system_prompt,
81
- **kwargs,
82
- )
83
-
84
- self.grounder = Grounder(grounder_config)
85
- self.grounded_tool = None
86
-
87
- async def initialize(self, task: Any = None) -> None:
88
- """Initialize the agent and create the grounded tool with mcp_client."""
89
- # Call parent initialization first
90
- await super().initialize(task)
91
-
92
- if self.mcp_client is None:
93
- raise ValueError("mcp_client must be initialized before creating grounded tool")
12
+ from hud.utils.types import with_signature
13
+
14
+ if TYPE_CHECKING:
15
+ from hud.types import BaseAgentConfig
16
+ from .base import BaseCreateParams
17
+ from .openai_chat import OpenAIChatAgent, OpenAIChatConfig
18
+
19
+ DEFAULT_GROUNDED_PROMPT = (
20
+ "You are a helpful AI assistant that can control the computer through visual "
21
+ "interaction.\n\n"
22
+ "IMPORTANT: Always explain your reasoning and observations before taking actions:\n"
23
+ "1. First, describe what you see on the screen.\n"
24
+ "2. Explain what you plan to do and why.\n"
25
+ "3. Then use the computer tool with natural language descriptions.\n\n"
26
+ "Use descriptive element descriptions:\n"
27
+ '- Colors ("red button", "blue link")\n'
28
+ '- Position ("top right corner", "left sidebar")\n'
29
+ '- Text content ("Submit button", "Login link")\n'
30
+ '- Element type ("text field", "dropdown")'
31
+ )
32
+
33
+
34
+ class GroundedOpenAIConfig(OpenAIChatConfig):
35
+ """Configuration for grounded OpenAI chat agent."""
36
+
37
+ model_config = ConfigDict(arbitrary_types_allowed=True)
38
+
39
+ grounder_config: GrounderConfig
40
+ model: str = "gpt-4o-mini"
41
+ allowed_tools: list[str] | None = None # Default set in validator
42
+ append_setup_output: bool = False
43
+ system_prompt: str | None = DEFAULT_GROUNDED_PROMPT
44
+
45
+ @field_validator("grounder_config", mode="before")
46
+ @classmethod
47
+ def _coerce_grounder_config(cls, value: GrounderConfig | dict[str, Any]) -> GrounderConfig:
48
+ if isinstance(value, GrounderConfig):
49
+ return value
50
+ if isinstance(value, dict):
51
+ return GrounderConfig(**value)
52
+
53
+ @field_validator("allowed_tools", mode="before")
54
+ @classmethod
55
+ def _default_allowed_tools(cls, value: list[str] | None) -> list[str] | None:
56
+ if value is None:
57
+ return ["computer"]
58
+ return value
59
+
60
+
61
+ class GroundedOpenAICreateParams(BaseCreateParams, GroundedOpenAIConfig):
62
+ pass
63
+
64
+
65
+ class GroundedOpenAIChatAgent(OpenAIChatAgent):
66
+ """OpenAI chat agent that pipes 'computer' tool calls through a vision grounder."""
67
+
68
+ metadata: ClassVar[dict[str, Any] | None] = None
69
+ config_cls: ClassVar[type[BaseAgentConfig]] = GroundedOpenAIConfig
70
+
71
+ @with_signature(GroundedOpenAICreateParams)
72
+ @classmethod
73
+ def create(cls, **kwargs: Any) -> GroundedOpenAIChatAgent: # pyright: ignore[reportIncompatibleMethodOverride]
74
+ from .base import MCPAgent
75
+
76
+ return MCPAgent.create.__func__(cls, **kwargs) # type: ignore[return-value]
77
+
78
+ def __init__(self, params: GroundedOpenAICreateParams | None = None, **kwargs: Any) -> None:
79
+ super().__init__(params, **kwargs) # type: ignore[arg-type]
80
+ self.config: GroundedOpenAIConfig # type: ignore[assignment]
81
+
82
+ self.grounder = Grounder(self.config.grounder_config)
83
+ self.grounded_tool: GroundedComputerTool | None = None
84
+
85
+ def _on_tools_ready(self) -> None:
86
+ """Create the grounded tool after context is bound."""
87
+ if self.ctx is None:
88
+ raise ValueError("ctx must be set before creating grounded tool")
94
89
  self.grounded_tool = GroundedComputerTool(
95
- grounder=self.grounder, mcp_client=self.mcp_client, computer_tool_name="computer"
90
+ grounder=self.grounder, ctx=self.ctx, computer_tool_name="computer"
96
91
  )
97
92
 
98
93
  def get_tool_schemas(self) -> list[Any]:
@@ -108,11 +103,6 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
108
103
  return []
109
104
  return [self.grounded_tool.get_openai_tool_schema()]
110
105
 
111
- @instrument(
112
- span_type="agent",
113
- record_args=False,
114
- record_result=True,
115
- )
116
106
  async def get_response(self, messages: Any) -> AgentResponse:
117
107
  """Get response from the planning model and handle grounded tool calls.
118
108
 
@@ -142,11 +132,9 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
142
132
  )
143
133
 
144
134
  if not has_image:
145
- if self.mcp_client is None:
146
- raise ValueError("mcp_client is not initialized")
147
- screenshot_result = await self.mcp_client.call_tool(
148
- MCPToolCall(name="computer", arguments={"action": "screenshot"})
149
- )
135
+ if self.ctx is None:
136
+ raise ValueError("ctx is not initialized")
137
+ screenshot_result = await self.ctx.call_tool(("computer", {"action": "screenshot"}))
150
138
 
151
139
  for block in screenshot_result.content:
152
140
  # Check for ImageContent type from MCP
@@ -169,8 +157,8 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
169
157
  protected_keys = {"model", "messages", "tools", "parallel_tool_calls"}
170
158
  extra = {k: v for k, v in (self.completion_kwargs or {}).items() if k not in protected_keys}
171
159
 
172
- response = await self.oai.chat.completions.create(
173
- model=self.model_name,
160
+ response = await self.oai.chat.completions.create( # type: ignore
161
+ model=self.config.model,
174
162
  messages=messages,
175
163
  tools=tool_schemas,
176
164
  parallel_tool_calls=False,
@@ -193,6 +181,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
193
181
  if not msg.tool_calls:
194
182
  return AgentResponse(
195
183
  content=msg.content or "",
184
+ reasoning=msg.reasoning_content,
196
185
  tool_calls=[],
197
186
  done=choice.finish_reason in ("stop", "length"),
198
187
  raw=response,
@@ -203,6 +192,7 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
203
192
  if tc.function.name != "computer":
204
193
  return AgentResponse(
205
194
  content=f"Error: Model called unexpected tool '{tc.function.name}'",
195
+ reasoning=msg.reasoning_content,
206
196
  tool_calls=[],
207
197
  done=True,
208
198
  raw=response,
@@ -213,13 +203,21 @@ class GroundedOpenAIChatAgent(GenericOpenAIChatAgent):
213
203
  args = json.loads(tc.function.arguments or "{}")
214
204
  except json.JSONDecodeError:
215
205
  return AgentResponse(
216
- content="Error: Invalid tool arguments", tool_calls=[], done=True, raw=response
206
+ content="Error: Invalid tool arguments",
207
+ reasoning=msg.reasoning_content,
208
+ tool_calls=[],
209
+ done=True,
210
+ raw=response,
217
211
  )
218
212
 
219
213
  tool_call = MCPToolCall(name="computer", arguments=args, id=tc.id)
220
214
 
221
215
  return AgentResponse(
222
- content=msg.content or "", tool_calls=[tool_call], done=False, raw=response
216
+ content=msg.content or "",
217
+ reasoning=msg.reasoning_content,
218
+ tool_calls=[tool_call],
219
+ done=False,
220
+ raw=response,
223
221
  )
224
222
 
225
223
  async def call_tools(
@@ -1,41 +1,72 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import Any
3
+ from typing import TYPE_CHECKING, Any, ClassVar
4
4
 
5
- from hud.agents.base import MCPAgent, find_reward
6
- from hud.types import AgentResponse, Task, Trace
5
+ from hud.agents.base import MCPAgent
6
+ from hud.types import AgentResponse, BaseAgentConfig, Trace
7
+
8
+ if TYPE_CHECKING:
9
+ from hud.eval.context import EvalContext
7
10
 
8
11
 
9
12
  class IntegrationTestRunner(MCPAgent):
13
+ """Special agent that runs integration tests by executing tools directly.
14
+
15
+ Unlike regular agents, this doesn't run an LLM loop - it executes
16
+ integration_test_tool and evaluate_tool in sequence to verify tool behavior.
17
+ """
18
+
19
+ metadata: ClassVar[dict[str, Any] | None] = {}
20
+ config_cls: ClassVar[type[BaseAgentConfig]] = BaseAgentConfig
21
+
10
22
  def __init__(self, **kwargs: Any) -> None:
11
23
  kwargs["auto_trace"] = False
12
24
  super().__init__(**kwargs)
13
- self.metadata = {}
14
25
 
15
- async def run(self, task: Task, max_steps: int = 10) -> Trace:
26
+ async def run(
27
+ self,
28
+ ctx: EvalContext,
29
+ *,
30
+ max_steps: int = 10,
31
+ ) -> Trace:
32
+ """Run integration test by executing tools directly.
33
+
34
+ The EvalContext should have integration_test_tool and evaluate_tool
35
+ configured in its metadata or environment setup.
36
+ """
37
+ from hud.eval.context import EvalContext
38
+
39
+ if not isinstance(ctx, EvalContext):
40
+ raise TypeError(f"ctx must be EvalContext, got {type(ctx).__name__}")
41
+
42
+ self.ctx = ctx
43
+
16
44
  try:
17
- # Initialize using base to set up client and telemetry correctly
18
- await self.initialize(task)
45
+ # Initialize tools from context
46
+ if not self._initialized:
47
+ await self._initialize_from_ctx(ctx)
19
48
 
20
- # Validate task shape
21
- if not getattr(task, "integration_test_tool", None):
49
+ self.console.info(f"Full system prompt: {self.system_prompt}")
50
+
51
+ # For integration tests, we expect the context's environment to have
52
+ # _setup_calls, _integration_test_calls, and _evaluate_calls configured
53
+ env = ctx
54
+
55
+ # Run integration test tool (stored in environment metadata or separate list)
56
+ integration_test_calls = getattr(env, "_integration_test_calls", [])
57
+ if not integration_test_calls:
22
58
  raise ValueError(
23
- "--integration-test requires task.integration_test_tool (single call)"
59
+ "--integration-test requires integration_test_tool to be configured"
24
60
  )
25
- elif not getattr(task, "evaluate_tool", None):
26
- raise ValueError("--integration-test requires task.evaluate_tool (single call)")
27
-
28
- if task.setup_tool:
29
- _ = await self.call_tools(task.setup_tool)
30
61
 
31
- _ = await self.call_tools(task.integration_test_tool)
32
- evaluate_result = await self.call_tools(task.evaluate_tool)
62
+ for name, args in integration_test_calls:
63
+ await ctx.call_tool((name, args))
33
64
 
34
- reward = float(find_reward(evaluate_result[0])) if evaluate_result else 0.0
65
+ # The evaluate phase runs automatically when ctx exits,
66
+ # but we can also get the reward from ctx.reward after
67
+ return Trace(done=True, reward=ctx.reward or 0.0, info={})
35
68
 
36
- return Trace(done=True, reward=reward, info={})
37
69
  finally:
38
- # Ensure resources are cleaned up so the CLI can exit cleanly
39
70
  await self._cleanup()
40
71
 
41
72
  # Stub implementations to satisfy abstract base class; not used in --integration-test path
@@ -1,14 +1,36 @@
1
1
  from __future__ import annotations
2
2
 
3
- import os
3
+ import logging
4
4
  from typing import Literal
5
5
 
6
6
  from openai import AsyncOpenAI
7
7
 
8
8
  from hud.settings import settings
9
9
 
10
+ logger = logging.getLogger(__name__)
11
+
10
12
  ResponseType = Literal["STOP", "CONTINUE"]
11
13
 
14
+ DEFAULT_SYSTEM_PROMPT = """\
15
+ You are an assistant that helps determine the appropriate response to an agent's message.
16
+
17
+ You will receive messages from an agent that is performing tasks for a user.
18
+ Your job is to analyze these messages and respond with one of the following:
19
+
20
+ - STOP: If the agent indicates it has successfully completed a task or is stuck,
21
+ struggling or says it cannot complete the task, even if phrased as a question
22
+ like "I have entered the right values into this form. Would you like me to do
23
+ anything else?" or "Here is the website. Is there any other information you
24
+ need?" or if the agent has strongly determined it wants to stop the task like
25
+ "The task is infeasible. Can I help you with something else?"
26
+
27
+ - CONTINUE: If the agent is asking for clarification before proceeding with a task
28
+ like "I'm about to clear cookies from this website. Would you like me to proceed?"
29
+ or "I've entered the right values into this form. Would you like me to continue
30
+ with the rest of the task?"
31
+
32
+ Respond ONLY with one of these two options."""
33
+
12
34
 
13
35
  class ResponseAgent:
14
36
  """
@@ -17,48 +39,30 @@ class ResponseAgent:
17
39
  """
18
40
 
19
41
  def __init__(
20
- self, api_key: str | None = None, model: str = "gpt-4o", system_prompt: str | None = None
42
+ self,
43
+ model: str = "gpt-4o",
44
+ system_prompt: str | None = None,
21
45
  ) -> None:
22
46
  """
23
47
  Initialize the ResponseAgent.
24
48
 
25
49
  Args:
26
- api_key: The API key to use for the OpenAI client
27
- model: The model to use for the OpenAI client (default: "gpt-4o")
28
- system_prompt: The system prompt to use for the OpenAI client
50
+ model: The model to use via HUD inference gateway (default: "gpt-4o").
51
+ Supports any model available through inference.hud.ai.
52
+ system_prompt: Optional custom system prompt for determining responses.
29
53
  """
30
- self.api_key = api_key or settings.openai_api_key or os.environ.get("OPENAI_API_KEY")
31
- if not self.api_key:
54
+ api_key = settings.api_key
55
+ if not api_key:
32
56
  raise ValueError(
33
- "OpenAI API key must be provided or set as OPENAI_API_KEY environment variable"
57
+ "HUD API key is required for auto_respond. Set HUD_API_KEY environment variable."
34
58
  )
35
59
 
36
- self.client = AsyncOpenAI(api_key=self.api_key)
37
- self.model = model
38
-
39
- self.system_prompt = (
40
- system_prompt
41
- or """
42
- You are an assistant that helps determine the appropriate response to an agent's message.
43
-
44
- You will receive messages from an agent that is performing tasks for a user.
45
- Your job is to analyze these messages and respond with one of the following:
46
-
47
- - STOP: If the agent indicates it has successfully completed a task or is stuck,
48
- struggling or says it cannot complete the task, even if phrased as a question
49
- like "I have entered the right values into this form. Would you like me to do
50
- anything else?" or "Here is the website. Is there any other information you
51
- need?" or if the agent has strongly determined it wants to stop the task like
52
- "The task is infeasible. Can I help you with something else?"
53
-
54
- - CONTINUE: If the agent is asking for clarification before proceeding with a task
55
- like "I'm about to clear cookies from this website. Would you like me to proceed?"
56
- or "I've entered the right values into this form. Would you like me to continue
57
- with the rest of the task?"
58
-
59
- Respond ONLY with one of these two options.
60
- """
60
+ self.client: AsyncOpenAI = AsyncOpenAI(
61
+ base_url=settings.hud_gateway_url,
62
+ api_key=api_key,
61
63
  )
64
+ self.model = model
65
+ self.system_prompt = system_prompt or DEFAULT_SYSTEM_PROMPT
62
66
 
63
67
  async def determine_response(self, agent_message: str) -> ResponseType:
64
68
  """
@@ -80,8 +84,8 @@ class ResponseAgent:
80
84
  "content": f"Agent message: {agent_message}\n\nWhat is the appropriate response?", # noqa: E501
81
85
  },
82
86
  ],
83
- temperature=0.1, # Low temperature for more deterministic responses
84
- max_tokens=5, # We only need a short response
87
+ temperature=0.1,
88
+ max_tokens=5,
85
89
  )
86
90
 
87
91
  response_text = response.choices[0].message.content
@@ -96,5 +100,6 @@ class ResponseAgent:
96
100
  else:
97
101
  return "CONTINUE"
98
102
 
99
- except Exception:
103
+ except Exception as e:
104
+ logger.warning("Auto-respond failed: %s", e)
100
105
  return "CONTINUE" # Default to continue on error