hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (274)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +11 -5
  3. hud/agents/base.py +220 -500
  4. hud/agents/claude.py +200 -240
  5. hud/agents/gemini.py +275 -0
  6. hud/agents/gemini_cua.py +335 -0
  7. hud/agents/grounded_openai.py +98 -100
  8. hud/agents/misc/integration_test_agent.py +51 -20
  9. hud/agents/misc/response_agent.py +41 -36
  10. hud/agents/openai.py +291 -292
  11. hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
  12. hud/agents/operator.py +211 -0
  13. hud/agents/tests/conftest.py +133 -0
  14. hud/agents/tests/test_base.py +300 -622
  15. hud/agents/tests/test_base_runtime.py +233 -0
  16. hud/agents/tests/test_claude.py +379 -210
  17. hud/agents/tests/test_client.py +9 -10
  18. hud/agents/tests/test_gemini.py +369 -0
  19. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  20. hud/agents/tests/test_openai.py +376 -140
  21. hud/agents/tests/test_operator.py +362 -0
  22. hud/agents/tests/test_run_eval.py +179 -0
  23. hud/cli/__init__.py +461 -545
  24. hud/cli/analyze.py +43 -5
  25. hud/cli/build.py +664 -110
  26. hud/cli/debug.py +8 -5
  27. hud/cli/dev.py +882 -734
  28. hud/cli/eval.py +782 -668
  29. hud/cli/flows/dev.py +167 -0
  30. hud/cli/flows/init.py +191 -0
  31. hud/cli/flows/tasks.py +153 -56
  32. hud/cli/flows/templates.py +151 -0
  33. hud/cli/flows/tests/__init__.py +1 -0
  34. hud/cli/flows/tests/test_dev.py +126 -0
  35. hud/cli/init.py +60 -58
  36. hud/cli/push.py +29 -11
  37. hud/cli/rft.py +311 -0
  38. hud/cli/rft_status.py +145 -0
  39. hud/cli/tests/test_analyze.py +5 -5
  40. hud/cli/tests/test_analyze_metadata.py +3 -2
  41. hud/cli/tests/test_analyze_module.py +120 -0
  42. hud/cli/tests/test_build.py +108 -6
  43. hud/cli/tests/test_build_failure.py +41 -0
  44. hud/cli/tests/test_build_module.py +50 -0
  45. hud/cli/tests/test_cli_init.py +6 -1
  46. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  47. hud/cli/tests/test_cli_root.py +140 -0
  48. hud/cli/tests/test_convert.py +361 -0
  49. hud/cli/tests/test_debug.py +12 -10
  50. hud/cli/tests/test_dev.py +197 -0
  51. hud/cli/tests/test_eval.py +251 -0
  52. hud/cli/tests/test_eval_bedrock.py +51 -0
  53. hud/cli/tests/test_init.py +124 -0
  54. hud/cli/tests/test_main_module.py +11 -5
  55. hud/cli/tests/test_mcp_server.py +12 -100
  56. hud/cli/tests/test_push_happy.py +74 -0
  57. hud/cli/tests/test_push_wrapper.py +23 -0
  58. hud/cli/tests/test_registry.py +1 -1
  59. hud/cli/tests/test_utils.py +1 -1
  60. hud/cli/{rl → utils}/celebrate.py +14 -12
  61. hud/cli/utils/config.py +18 -1
  62. hud/cli/utils/docker.py +130 -4
  63. hud/cli/utils/env_check.py +9 -9
  64. hud/cli/utils/git.py +136 -0
  65. hud/cli/utils/interactive.py +39 -5
  66. hud/cli/utils/metadata.py +69 -0
  67. hud/cli/utils/runner.py +1 -1
  68. hud/cli/utils/server.py +2 -2
  69. hud/cli/utils/source_hash.py +3 -3
  70. hud/cli/utils/tasks.py +4 -1
  71. hud/cli/utils/tests/__init__.py +0 -0
  72. hud/cli/utils/tests/test_config.py +58 -0
  73. hud/cli/utils/tests/test_docker.py +93 -0
  74. hud/cli/utils/tests/test_docker_hints.py +71 -0
  75. hud/cli/utils/tests/test_env_check.py +74 -0
  76. hud/cli/utils/tests/test_environment.py +42 -0
  77. hud/cli/utils/tests/test_git.py +142 -0
  78. hud/cli/utils/tests/test_interactive_module.py +60 -0
  79. hud/cli/utils/tests/test_local_runner.py +50 -0
  80. hud/cli/utils/tests/test_logging_utils.py +23 -0
  81. hud/cli/utils/tests/test_metadata.py +49 -0
  82. hud/cli/utils/tests/test_package_runner.py +35 -0
  83. hud/cli/utils/tests/test_registry_utils.py +49 -0
  84. hud/cli/utils/tests/test_remote_runner.py +25 -0
  85. hud/cli/utils/tests/test_runner_modules.py +52 -0
  86. hud/cli/utils/tests/test_source_hash.py +36 -0
  87. hud/cli/utils/tests/test_tasks.py +80 -0
  88. hud/cli/utils/version_check.py +258 -0
  89. hud/cli/{rl → utils}/viewer.py +2 -2
  90. hud/clients/README.md +12 -11
  91. hud/clients/__init__.py +4 -3
  92. hud/clients/base.py +166 -26
  93. hud/clients/environment.py +51 -0
  94. hud/clients/fastmcp.py +13 -6
  95. hud/clients/mcp_use.py +40 -15
  96. hud/clients/tests/test_analyze_scenarios.py +206 -0
  97. hud/clients/tests/test_protocol.py +9 -3
  98. hud/datasets/__init__.py +23 -20
  99. hud/datasets/loader.py +327 -0
  100. hud/datasets/runner.py +192 -105
  101. hud/datasets/tests/__init__.py +0 -0
  102. hud/datasets/tests/test_loader.py +221 -0
  103. hud/datasets/tests/test_utils.py +315 -0
  104. hud/datasets/utils.py +270 -90
  105. hud/environment/__init__.py +50 -0
  106. hud/environment/connection.py +206 -0
  107. hud/environment/connectors/__init__.py +33 -0
  108. hud/environment/connectors/base.py +68 -0
  109. hud/environment/connectors/local.py +177 -0
  110. hud/environment/connectors/mcp_config.py +109 -0
  111. hud/environment/connectors/openai.py +101 -0
  112. hud/environment/connectors/remote.py +172 -0
  113. hud/environment/environment.py +694 -0
  114. hud/environment/integrations/__init__.py +45 -0
  115. hud/environment/integrations/adk.py +67 -0
  116. hud/environment/integrations/anthropic.py +196 -0
  117. hud/environment/integrations/gemini.py +92 -0
  118. hud/environment/integrations/langchain.py +82 -0
  119. hud/environment/integrations/llamaindex.py +68 -0
  120. hud/environment/integrations/openai.py +238 -0
  121. hud/environment/mock.py +306 -0
  122. hud/environment/router.py +112 -0
  123. hud/environment/scenarios.py +493 -0
  124. hud/environment/tests/__init__.py +1 -0
  125. hud/environment/tests/test_connection.py +317 -0
  126. hud/environment/tests/test_connectors.py +218 -0
  127. hud/environment/tests/test_environment.py +161 -0
  128. hud/environment/tests/test_integrations.py +257 -0
  129. hud/environment/tests/test_local_connectors.py +201 -0
  130. hud/environment/tests/test_scenarios.py +280 -0
  131. hud/environment/tests/test_tools.py +208 -0
  132. hud/environment/types.py +23 -0
  133. hud/environment/utils/__init__.py +35 -0
  134. hud/environment/utils/formats.py +215 -0
  135. hud/environment/utils/schema.py +171 -0
  136. hud/environment/utils/tool_wrappers.py +113 -0
  137. hud/eval/__init__.py +67 -0
  138. hud/eval/context.py +674 -0
  139. hud/eval/display.py +299 -0
  140. hud/eval/instrument.py +185 -0
  141. hud/eval/manager.py +466 -0
  142. hud/eval/parallel.py +268 -0
  143. hud/eval/task.py +340 -0
  144. hud/eval/tests/__init__.py +1 -0
  145. hud/eval/tests/test_context.py +178 -0
  146. hud/eval/tests/test_eval.py +210 -0
  147. hud/eval/tests/test_manager.py +152 -0
  148. hud/eval/tests/test_parallel.py +168 -0
  149. hud/eval/tests/test_task.py +145 -0
  150. hud/eval/types.py +63 -0
  151. hud/eval/utils.py +183 -0
  152. hud/patches/__init__.py +19 -0
  153. hud/patches/mcp_patches.py +151 -0
  154. hud/patches/warnings.py +54 -0
  155. hud/samples/browser.py +4 -4
  156. hud/server/__init__.py +2 -1
  157. hud/server/low_level.py +2 -1
  158. hud/server/router.py +164 -0
  159. hud/server/server.py +567 -80
  160. hud/server/tests/test_mcp_server_integration.py +11 -11
  161. hud/server/tests/test_mcp_server_more.py +1 -1
  162. hud/server/tests/test_server_extra.py +2 -0
  163. hud/settings.py +45 -3
  164. hud/shared/exceptions.py +36 -10
  165. hud/shared/hints.py +26 -1
  166. hud/shared/requests.py +15 -3
  167. hud/shared/tests/test_exceptions.py +40 -31
  168. hud/shared/tests/test_hints.py +167 -0
  169. hud/telemetry/__init__.py +20 -19
  170. hud/telemetry/exporter.py +201 -0
  171. hud/telemetry/instrument.py +158 -253
  172. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  173. hud/telemetry/tests/test_exporter.py +258 -0
  174. hud/telemetry/tests/test_instrument.py +401 -0
  175. hud/tools/__init__.py +16 -2
  176. hud/tools/apply_patch.py +639 -0
  177. hud/tools/base.py +54 -4
  178. hud/tools/bash.py +2 -2
  179. hud/tools/computer/__init__.py +4 -0
  180. hud/tools/computer/anthropic.py +2 -2
  181. hud/tools/computer/gemini.py +385 -0
  182. hud/tools/computer/hud.py +23 -6
  183. hud/tools/computer/openai.py +20 -21
  184. hud/tools/computer/qwen.py +434 -0
  185. hud/tools/computer/settings.py +37 -0
  186. hud/tools/edit.py +3 -7
  187. hud/tools/executors/base.py +4 -2
  188. hud/tools/executors/pyautogui.py +1 -1
  189. hud/tools/grounding/grounded_tool.py +13 -18
  190. hud/tools/grounding/grounder.py +10 -31
  191. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  192. hud/tools/jupyter.py +330 -0
  193. hud/tools/playwright.py +18 -3
  194. hud/tools/shell.py +308 -0
  195. hud/tools/tests/test_apply_patch.py +718 -0
  196. hud/tools/tests/test_computer.py +4 -9
  197. hud/tools/tests/test_computer_actions.py +24 -2
  198. hud/tools/tests/test_jupyter_tool.py +181 -0
  199. hud/tools/tests/test_shell.py +596 -0
  200. hud/tools/tests/test_submit.py +85 -0
  201. hud/tools/tests/test_types.py +193 -0
  202. hud/tools/types.py +21 -1
  203. hud/types.py +167 -57
  204. hud/utils/__init__.py +2 -0
  205. hud/utils/env.py +67 -0
  206. hud/utils/hud_console.py +61 -3
  207. hud/utils/mcp.py +15 -58
  208. hud/utils/strict_schema.py +162 -0
  209. hud/utils/tests/test_init.py +1 -2
  210. hud/utils/tests/test_mcp.py +1 -28
  211. hud/utils/tests/test_pretty_errors.py +186 -0
  212. hud/utils/tests/test_tool_shorthand.py +154 -0
  213. hud/utils/tests/test_version.py +1 -1
  214. hud/utils/types.py +20 -0
  215. hud/version.py +1 -1
  216. hud_python-0.5.1.dist-info/METADATA +264 -0
  217. hud_python-0.5.1.dist-info/RECORD +299 -0
  218. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
  219. hud/agents/langchain.py +0 -261
  220. hud/agents/lite_llm.py +0 -72
  221. hud/cli/rl/__init__.py +0 -180
  222. hud/cli/rl/config.py +0 -101
  223. hud/cli/rl/display.py +0 -133
  224. hud/cli/rl/gpu.py +0 -63
  225. hud/cli/rl/gpu_utils.py +0 -321
  226. hud/cli/rl/local_runner.py +0 -595
  227. hud/cli/rl/presets.py +0 -96
  228. hud/cli/rl/remote_runner.py +0 -463
  229. hud/cli/rl/rl_api.py +0 -150
  230. hud/cli/rl/vllm.py +0 -177
  231. hud/cli/rl/wait_utils.py +0 -89
  232. hud/datasets/parallel.py +0 -687
  233. hud/misc/__init__.py +0 -1
  234. hud/misc/claude_plays_pokemon.py +0 -292
  235. hud/otel/__init__.py +0 -35
  236. hud/otel/collector.py +0 -142
  237. hud/otel/config.py +0 -181
  238. hud/otel/context.py +0 -570
  239. hud/otel/exporters.py +0 -369
  240. hud/otel/instrumentation.py +0 -135
  241. hud/otel/processors.py +0 -121
  242. hud/otel/tests/__init__.py +0 -1
  243. hud/otel/tests/test_processors.py +0 -197
  244. hud/rl/README.md +0 -30
  245. hud/rl/__init__.py +0 -1
  246. hud/rl/actor.py +0 -176
  247. hud/rl/buffer.py +0 -405
  248. hud/rl/chat_template.jinja +0 -101
  249. hud/rl/config.py +0 -192
  250. hud/rl/distributed.py +0 -132
  251. hud/rl/learner.py +0 -637
  252. hud/rl/tests/__init__.py +0 -1
  253. hud/rl/tests/test_learner.py +0 -186
  254. hud/rl/train.py +0 -382
  255. hud/rl/types.py +0 -101
  256. hud/rl/utils/start_vllm_server.sh +0 -30
  257. hud/rl/utils.py +0 -524
  258. hud/rl/vllm_adapter.py +0 -143
  259. hud/telemetry/job.py +0 -352
  260. hud/telemetry/replay.py +0 -74
  261. hud/telemetry/tests/test_replay.py +0 -40
  262. hud/telemetry/tests/test_trace.py +0 -63
  263. hud/telemetry/trace.py +0 -158
  264. hud/utils/agent_factories.py +0 -86
  265. hud/utils/async_utils.py +0 -65
  266. hud/utils/group_eval.py +0 -223
  267. hud/utils/progress.py +0 -149
  268. hud/utils/tasks.py +0 -127
  269. hud/utils/tests/test_async_utils.py +0 -173
  270. hud/utils/tests/test_progress.py +0 -261
  271. hud_python-0.4.45.dist-info/METADATA +0 -552
  272. hud_python-0.4.45.dist-info/RECORD +0 -228
  273. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
  274. {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/tools/base.py CHANGED
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import logging
3
4
  from abc import ABC, abstractmethod
4
5
  from typing import TYPE_CHECKING, Any, cast
5
6
 
@@ -8,7 +9,7 @@ from fastmcp import FastMCP
8
9
  from hud.tools.types import ContentBlock, EvaluationResult
9
10
 
10
11
  if TYPE_CHECKING:
11
- from collections.abc import Callable
12
+ from collections.abc import Awaitable, Callable
12
13
 
13
14
  from fastmcp.tools import FunctionTool
14
15
  from fastmcp.tools.tool import Tool, ToolResult
@@ -16,6 +17,8 @@ if TYPE_CHECKING:
16
17
  # Basic result types for tools
17
18
  BaseResult = list[ContentBlock] | EvaluationResult
18
19
 
20
+ logger = logging.getLogger(__name__)
21
+
19
22
 
20
23
  class BaseTool(ABC):
21
24
  """
@@ -58,6 +61,10 @@ class BaseTool(ABC):
58
61
  self.title = title or self.__class__.__name__.replace("Tool", "").replace("_", " ").title()
59
62
  self.description = description or (self.__doc__.strip() if self.__doc__ else None)
60
63
  self.meta = meta
64
+ self._callbacks: dict[
65
+ str,
66
+ list[Callable[..., Awaitable[Any]]],
67
+ ] = {} # {"event_name": [callback_functions]}
61
68
 
62
69
  # Expose attributes FastMCP expects when registering an instance directly
63
70
  self.__name__ = self.name # FastMCP uses fn.__name__ if name param omitted
@@ -100,13 +107,48 @@ class BaseTool(ABC):
100
107
  )
101
108
  return self._mcp_tool
102
109
 
110
+ def add_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
111
+ """Register a callback function for specific event
112
+
113
+ Args:
114
+ event_type: (Required) Specific event name to trigger callback
115
+ e.g. "after_click", "before_navigate"
116
+ callback: (Required) Async function to call. Must be defined by `async def f(...)`
117
+ """
118
+ if event_type not in self._callbacks:
119
+ self._callbacks[event_type] = []
120
+ self._callbacks[event_type].append(callback)
121
+
122
+ def remove_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
123
+ """Remove a registered callback
124
+ Args:
125
+ event_type: (Required) Specific event name to trigger callback
126
+ e.g. "after_click", "before_navigate"
127
+ callback: (Required) Function to remove from callback list.
128
+ """
129
+ if (event_type in self._callbacks) and (callback in self._callbacks[event_type]):
130
+ self._callbacks[event_type].remove(callback)
131
+
132
+ async def _trigger_callbacks(self, event_type: str, **kwargs: Any) -> None:
133
+ """Trigger all registered callback functions of an event type"""
134
+ callback_list = self._callbacks.get(event_type, [])
135
+ for callback in callback_list:
136
+ try:
137
+ await callback(**kwargs)
138
+ except Exception as e:
139
+ logger.warning("Callback failed for %s: %s", event_type, e)
140
+
103
141
 
104
142
  # Prefix for internal tool names
105
143
  _INTERNAL_PREFIX = "int_"
106
144
 
107
145
 
108
146
  class BaseHub(FastMCP):
109
- """A composition-friendly FastMCP server that holds an internal tool dispatcher."""
147
+ """A composition-friendly FastMCP server that holds an internal tool dispatcher.
148
+
149
+ Note: BaseHub can be used standalone or to wrap existing routers. For the newer
150
+ FastAPI-like pattern, consider using HiddenRouter from hud.server instead.
151
+ """
110
152
 
111
153
  env: Any
112
154
 
@@ -129,6 +171,10 @@ class BaseHub(FastMCP):
129
171
  Optional long-lived environment object. Stored on the server
130
172
  instance (``layer.env``) and therefore available to every request
131
173
  via ``ctx.fastmcp.env``.
174
+ title:
175
+ Optional title for the dispatcher tool.
176
+ description:
177
+ Optional description for the dispatcher tool.
132
178
  meta:
133
179
  Metadata to include in MCP tool listing.
134
180
  """
@@ -370,8 +416,12 @@ class BaseHub(FastMCP):
370
416
  }
371
417
 
372
418
  # Override _list_tools to hide internal tools when mounted
373
- async def _list_tools(self) -> list[Tool]:
374
- """Override _list_tools to hide internal tools when mounted."""
419
+ async def _list_tools(self, context: Any = None) -> list[Tool]:
420
+ """Override _list_tools to hide internal tools when mounted.
421
+
422
+ Args:
423
+ context: MiddlewareContext passed by FastMCP (optional for backwards compat)
424
+ """
375
425
  return [
376
426
  tool
377
427
  for key, tool in self._tool_manager._tools.items()
hud/tools/bash.py CHANGED
@@ -3,7 +3,7 @@ from __future__ import annotations
3
3
  import asyncio
4
4
  import os
5
5
  import sys
6
- from typing import TYPE_CHECKING, Any
6
+ from typing import TYPE_CHECKING
7
7
 
8
8
  from .base import BaseTool
9
9
  from .types import ContentResult, ToolError
@@ -140,7 +140,7 @@ class BashTool(BaseTool):
140
140
  self.env = value
141
141
 
142
142
  async def __call__(
143
- self, command: str | None = None, restart: bool = False, **kwargs: Any
143
+ self, command: str | None = None, restart: bool = False
144
144
  ) -> list[ContentBlock]:
145
145
  if restart:
146
146
  if self.session:
hud/tools/computer/__init__.py CHANGED
@@ -3,13 +3,17 @@
3
3
  from __future__ import annotations
4
4
 
5
5
  from .anthropic import AnthropicComputerTool
6
+ from .gemini import GeminiComputerTool
6
7
  from .hud import HudComputerTool
7
8
  from .openai import OpenAIComputerTool
9
+ from .qwen import QwenComputerTool
8
10
  from .settings import computer_settings
9
11
 
10
12
  __all__ = [
11
13
  "AnthropicComputerTool",
14
+ "GeminiComputerTool",
12
15
  "HudComputerTool",
13
16
  "OpenAIComputerTool",
17
+ "QwenComputerTool",
14
18
  "computer_settings",
15
19
  ]
hud/tools/computer/anthropic.py CHANGED
@@ -141,13 +141,13 @@ class AnthropicComputerTool(HudComputerTool):
141
141
  async def __call__(
142
142
  self,
143
143
  action: str = Field(..., description="The action to perform on the computer"),
144
- coordinate: list[int] | tuple[int, int] | None = Field(
144
+ coordinate: list[int] | None = Field(
145
145
  None, description="The coordinate to interact with on the computer [x, y]"
146
146
  ),
147
147
  text: str | None = Field(
148
148
  None, description="The text to type on the computer or key to press"
149
149
  ),
150
- start_coordinate: list[int] | tuple[int, int] | None = Field(
150
+ start_coordinate: list[int] | None = Field(
151
151
  None, description="The starting coordinate for drag actions [x, y]"
152
152
  ),
153
153
  scroll_direction: str | None = Field(
hud/tools/computer/gemini.py ADDED
@@ -0,0 +1,385 @@
1
+ from __future__ import annotations
2
+
3
+ import logging
4
+ import platform
5
+ from typing import TYPE_CHECKING, Any, Literal
6
+
7
+ from mcp import ErrorData, McpError
8
+ from mcp.types import INVALID_PARAMS, ContentBlock
9
+ from pydantic import Field
10
+
11
+ from hud.tools.types import ContentResult
12
+
13
+ from .hud import HudComputerTool
14
+ from .settings import computer_settings
15
+
16
+ if TYPE_CHECKING:
17
+ from hud.tools.executors.base import BaseExecutor
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
+ ACTION_FIELD = Field(..., description="Gemini Computer Use action to perform")
23
+ X_FIELD = Field(None, description="X coordinate (pixels in agent space)")
24
+ Y_FIELD = Field(None, description="Y coordinate (pixels in agent space)")
25
+ TEXT_FIELD = Field(None, description="Text to type")
26
+ PRESS_ENTER_FIELD = Field(None, description="Whether to press Enter after typing (type_text_at)")
27
+ CLEAR_BEFORE_TYPING_FIELD = Field(
28
+ None, description="Whether to select-all before typing (type_text_at)"
29
+ )
30
+ DIRECTION_FIELD = Field(None, description="Scroll direction for scroll_document/scroll_at")
31
+ MAGNITUDE_FIELD = Field(None, description="Scroll magnitude (pixels in agent space)")
32
+ URL_FIELD = Field(None, description="Target URL for navigate")
33
+ KEYS_FIELD = Field(None, description="Keys for key_combination")
34
+ DESTINATION_X_FIELD = Field(None, description="Destination X for drag_and_drop (agent space)")
35
+ DESTINATION_Y_FIELD = Field(None, description="Destination Y for drag_and_drop (agent space)")
36
+ TAKE_SCREENSHOT_ON_CLICK_FIELD = Field(
37
+ True, description="Whether to include a screenshot for interactive actions"
38
+ )
39
+
40
+
41
+ class GeminiComputerTool(HudComputerTool):
42
+ """
43
+ Gemini Computer Use tool for interacting with a computer via MCP.
44
+
45
+ Maps Gemini's predefined function names (open_web_browser, click_at, hover_at,
46
+ type_text_at, scroll_document, scroll_at, wait_5_seconds, go_back, go_forward,
47
+ search, navigate, key_combination, drag_and_drop) to executor actions.
48
+ """
49
+
50
+ def __init__(
51
+ self,
52
+ # Define within environment based on platform
53
+ executor: BaseExecutor | None = None,
54
+ platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
55
+ display_num: int | None = None,
56
+ # Overrides for what dimensions the agent thinks it operates in
57
+ width: int = computer_settings.GEMINI_COMPUTER_WIDTH,
58
+ height: int = computer_settings.GEMINI_COMPUTER_HEIGHT,
59
+ rescale_images: bool = computer_settings.GEMINI_RESCALE_IMAGES,
60
+ # What the agent sees as the tool's name, title, and description
61
+ name: str | None = None,
62
+ title: str | None = None,
63
+ description: str | None = None,
64
+ **kwargs: Any,
65
+ ) -> None:
66
+ """
67
+ Initialize with Gemini's default dimensions.
68
+ """
69
+ super().__init__(
70
+ executor=executor,
71
+ platform_type=platform_type,
72
+ display_num=display_num,
73
+ width=width,
74
+ height=height,
75
+ rescale_images=rescale_images,
76
+ name=name or "gemini_computer",
77
+ title=title or "Gemini Computer Tool",
78
+ description=description or "Control computer with mouse, keyboard, and screenshots",
79
+ **kwargs,
80
+ )
81
+
82
+ async def __call__(
83
+ self,
84
+ action: str = ACTION_FIELD,
85
+ # Common coordinates
86
+ x: int | None = X_FIELD,
87
+ y: int | None = Y_FIELD,
88
+ # Text input
89
+ text: str | None = TEXT_FIELD,
90
+ press_enter: bool | None = PRESS_ENTER_FIELD,
91
+ clear_before_typing: bool | None = CLEAR_BEFORE_TYPING_FIELD,
92
+ # Scroll parameters
93
+ direction: Literal["up", "down", "left", "right"] | None = DIRECTION_FIELD,
94
+ magnitude: int | None = MAGNITUDE_FIELD,
95
+ # Navigation
96
+ url: str | None = URL_FIELD,
97
+ # Key combos
98
+ keys: list[str] | str | None = KEYS_FIELD,
99
+ # Drag parameters
100
+ destination_x: int | None = DESTINATION_X_FIELD,
101
+ destination_y: int | None = DESTINATION_Y_FIELD,
102
+ # Behavior
103
+ take_screenshot_on_click: bool = TAKE_SCREENSHOT_ON_CLICK_FIELD,
104
+ ) -> list[ContentBlock]:
105
+ """
106
+ Handle Gemini Computer Use API calls by mapping to executor actions.
107
+
108
+ Returns:
109
+ List of MCP content blocks
110
+ """
111
+ logger.info("GeminiComputerTool received action: %s", action)
112
+
113
+ # Helper to finalize ContentResult: rescale if requested and ensure URL metadata
114
+ async def _finalize(
115
+ result: ContentResult, requested_url: str | None = None
116
+ ) -> list[ContentBlock]:
117
+ if result.base64_image and self.rescale_images:
118
+ try:
119
+ result.base64_image = await self._rescale_screenshot(result.base64_image)
120
+ except Exception as e:
121
+ logger.warning("Failed to rescale screenshot: %s", e)
122
+ # Always include URL metadata if provided; otherwise default to about:blank
123
+ result.url = requested_url or result.url or "about:blank"
124
+ return result.to_content_blocks()
125
+
126
+ # Scale coordinates helper
127
+ def _scale(xv: int | None, yv: int | None) -> tuple[int | None, int | None]:
128
+ return self._scale_coordinates(xv, yv)
129
+
130
+ # Gemini emits coordinates/magnitudes in a 0-1000 normalized space.
131
+ def _denormalize(value: float | None, axis: Literal["x", "y"]) -> int | None:
132
+ if value is None:
133
+ return None
134
+ try:
135
+ numeric = float(value)
136
+ except (TypeError, ValueError):
137
+ try:
138
+ return int(value) # type: ignore[arg-type]
139
+ except (TypeError, ValueError):
140
+ return None
141
+
142
+ # Treat values within the normalized range (including defaults like 800).
143
+ if 0 <= numeric <= 1000:
144
+ target = self.width if axis == "x" else self.height
145
+ numeric = numeric / 1000 * target
146
+
147
+ return round(numeric)
148
+
149
+ def _scale_distance(value: int | None, axis: Literal["x", "y"]) -> int | None:
150
+ if value is None:
151
+ return None
152
+ scale = self.scale_x if axis == "x" else self.scale_y
153
+ if scale != 1.0:
154
+ return round(value / scale)
155
+ return value
156
+
157
+ # Map actions
158
+ if action == "open_web_browser":
159
+ screenshot = await self.executor.screenshot()
160
+ if screenshot:
161
+ result = ContentResult(base64_image=screenshot, url="about:blank")
162
+ else:
163
+ result = ContentResult(error="Failed to take screenshot", url="about:blank")
164
+ return await _finalize(result)
165
+
166
+ elif action == "click_at":
167
+ if x is None or y is None:
168
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
169
+ dx = _denormalize(x, "x")
170
+ dy = _denormalize(y, "y")
171
+ sx, sy = _scale(dx, dy)
172
+ result = await self.executor.click(x=sx, y=sy)
173
+ return await _finalize(result)
174
+
175
+ elif action == "hover_at":
176
+ if x is None or y is None:
177
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
178
+ dx = _denormalize(x, "x")
179
+ dy = _denormalize(y, "y")
180
+ sx, sy = _scale(dx, dy)
181
+ result = await self.executor.move(x=sx, y=sy)
182
+ return await _finalize(result)
183
+
184
+ elif action == "type_text_at":
185
+ if x is None or y is None:
186
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
187
+ if text is None:
188
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required"))
189
+
190
+ dx = _denormalize(x, "x")
191
+ dy = _denormalize(y, "y")
192
+ sx, sy = _scale(dx, dy)
193
+
194
+ # Focus the field
195
+ await self.executor.move(x=sx, y=sy, take_screenshot=False)
196
+ await self.executor.click(x=sx, y=sy, take_screenshot=False)
197
+
198
+ # Clear existing text if requested
199
+ if clear_before_typing is None or clear_before_typing:
200
+ is_mac = platform.system().lower() == "darwin"
201
+ combo = ["cmd", "a"] if is_mac else ["ctrl", "a"]
202
+ await self.executor.press(keys=combo, take_screenshot=False)
203
+ delete_key = "backspace" if is_mac else "delete"
204
+ await self.executor.press(keys=[delete_key], take_screenshot=False)
205
+
206
+ # Type (optionally press enter after)
207
+ result = await self.executor.write(text=text, enter_after=bool(press_enter))
208
+ return await _finalize(result)
209
+
210
+ elif action == "scroll_document":
211
+ if direction is None:
212
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
213
+ # Default magnitude similar to reference implementation
214
+ mag = magnitude if magnitude is not None else 800
215
+ # Convert to environment units while preserving sign
216
+ if direction in ("down", "up"):
217
+ distance = _denormalize(mag, "y")
218
+ if distance is None:
219
+ raise McpError(
220
+ ErrorData(
221
+ code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
222
+ )
223
+ )
224
+ distance = _scale_distance(distance, "y")
225
+ if distance is None:
226
+ raise McpError(
227
+ ErrorData(
228
+ code=INVALID_PARAMS,
229
+ message="Unable to determine scroll magnitude",
230
+ )
231
+ )
232
+ scroll_y = distance if direction == "down" else -distance
233
+ scroll_x = None
234
+ elif direction in ("right", "left"):
235
+ distance = _denormalize(mag, "x")
236
+ if distance is None:
237
+ raise McpError(
238
+ ErrorData(
239
+ code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
240
+ )
241
+ )
242
+ distance = _scale_distance(distance, "x")
243
+ if distance is None:
244
+ raise McpError(
245
+ ErrorData(
246
+ code=INVALID_PARAMS,
247
+ message="Unable to determine scroll magnitude",
248
+ )
249
+ )
250
+ scroll_x = distance if direction == "right" else -distance
251
+ scroll_y = None
252
+ else:
253
+ raise McpError(
254
+ ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
255
+ )
256
+ result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
257
+ return await _finalize(result)
258
+
259
+ elif action == "scroll_at":
260
+ if direction is None:
261
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
262
+ if x is None or y is None:
263
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
264
+ mag = magnitude if magnitude is not None else 800
265
+ dx = _denormalize(x, "x")
266
+ dy = _denormalize(y, "y")
267
+ sx, sy = _scale(dx, dy)
268
+ if direction in ("down", "up"):
269
+ distance = _denormalize(mag, "y")
270
+ if distance is None:
271
+ raise McpError(
272
+ ErrorData(
273
+ code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
274
+ )
275
+ )
276
+ distance = _scale_distance(distance, "y")
277
+ if distance is None:
278
+ raise McpError(
279
+ ErrorData(
280
+ code=INVALID_PARAMS,
281
+ message="Unable to determine scroll magnitude",
282
+ )
283
+ )
284
+ scroll_y = distance if direction == "down" else -distance
285
+ scroll_x = None
286
+ elif direction in ("right", "left"):
287
+ distance = _denormalize(mag, "x")
288
+ if distance is None:
289
+ raise McpError(
290
+ ErrorData(
291
+ code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
292
+ )
293
+ )
294
+ distance = _scale_distance(distance, "x")
295
+ if distance is None:
296
+ raise McpError(
297
+ ErrorData(
298
+ code=INVALID_PARAMS,
299
+ message="Unable to determine scroll magnitude",
300
+ )
301
+ )
302
+ scroll_x = distance if direction == "right" else -distance
303
+ scroll_y = None
304
+ else:
305
+ raise McpError(
306
+ ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
307
+ )
308
+ result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
309
+ return await _finalize(result)
310
+
311
+ elif action == "wait_5_seconds":
312
+ result = await self.executor.wait(time=5000)
313
+ return await _finalize(result)
314
+
315
+ elif action == "go_back":
316
+ is_mac = platform.system().lower() == "darwin"
317
+ combo = ["cmd", "["] if is_mac else ["alt", "left"]
318
+ result = await self.executor.press(keys=combo)
319
+ return await _finalize(result)
320
+
321
+ elif action == "go_forward":
322
+ is_mac = platform.system().lower() == "darwin"
323
+ combo = ["cmd", "]"] if is_mac else ["alt", "right"]
324
+ result = await self.executor.press(keys=combo)
325
+ return await _finalize(result)
326
+
327
+ elif action == "search":
328
+ # Best-effort navigate to a default search page
329
+ target = url or "https://www.google.com"
330
+ is_mac = platform.system().lower() == "darwin"
331
+ await self.executor.press(
332
+ keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
333
+ )
334
+ result = await self.executor.write(text=target, enter_after=True)
335
+ return await _finalize(result, requested_url=target)
336
+
337
+ elif action == "navigate":
338
+ if not url:
339
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="url is required"))
340
+ is_mac = platform.system().lower() == "darwin"
341
+ await self.executor.press(
342
+ keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
343
+ )
344
+ result = await self.executor.write(text=url, enter_after=True)
345
+ return await _finalize(result, requested_url=url)
346
+
347
+ elif action == "key_combination":
348
+ if keys is None:
349
+ raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required"))
350
+ if isinstance(keys, str):
351
+ # Accept formats like "ctrl+c" or "ctrl+shift+t"
352
+ key_list = [k.strip() for k in keys.split("+") if k.strip()]
353
+ else:
354
+ key_list = keys
355
+ result = await self.executor.press(keys=key_list)
356
+ return await _finalize(result)
357
+
358
+ elif action == "drag_and_drop":
359
+ if x is None or y is None or destination_x is None or destination_y is None:
360
+ raise McpError(
361
+ ErrorData(
362
+ code=INVALID_PARAMS,
363
+ message="x, y, destination_x, and destination_y are required",
364
+ )
365
+ )
366
+ sx_norm = _denormalize(x, "x")
367
+ sy_norm = _denormalize(y, "y")
368
+ dx_norm = _denormalize(destination_x, "x")
369
+ dy_norm = _denormalize(destination_y, "y")
370
+ sx, sy = _scale(sx_norm, sy_norm)
371
+ dx_scaled, dy_scaled = _scale(dx_norm, dy_norm)
372
+ # Build a two-point path
373
+ path = [] # type: list[tuple[int, int]]
374
+ if (
375
+ sx is not None
376
+ and sy is not None
377
+ and dx_scaled is not None
378
+ and dy_scaled is not None
379
+ ):
380
+ path = [(sx, sy), (dx_scaled, dy_scaled)]
381
+ result = await self.executor.drag(path=path)
382
+ return await _finalize(result)
383
+
384
+ else:
385
+ raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
hud/tools/computer/hud.py CHANGED
@@ -13,7 +13,7 @@ from hud.tools.base import BaseTool
13
13
  from hud.tools.executors.base import BaseExecutor
14
14
  from hud.tools.executors.pyautogui import PyAutoGUIExecutor
15
15
  from hud.tools.executors.xdo import XDOExecutor
16
- from hud.tools.types import ContentResult, ToolError
16
+ from hud.tools.types import ContentResult, Coordinate, ToolError
17
17
 
18
18
  from .settings import computer_settings
19
19
 
@@ -231,7 +231,23 @@ class HudComputerTool(BaseTool):
231
231
 
232
232
  async def __call__(
233
233
  self,
234
- action: str = Field(..., description="The action name (click, press, write, move, etc.)"),
234
+ action: Literal[
235
+ "click",
236
+ "press",
237
+ "keydown",
238
+ "keyup",
239
+ "write",
240
+ "scroll",
241
+ "move",
242
+ "wait",
243
+ "drag",
244
+ "response",
245
+ "screenshot",
246
+ "position",
247
+ "hold_key",
248
+ "mouse_down",
249
+ "mouse_up",
250
+ ] = Field(..., description="The action name (click, press, write, move, etc.)"),
235
251
  # Click parameters
236
252
  x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
237
253
  y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
@@ -254,8 +270,8 @@ class HudComputerTool(BaseTool):
254
270
  offset_x: int | None = Field(None, description="X offset for relative move"),
255
271
  offset_y: int | None = Field(None, description="Y offset for relative move"),
256
272
  # Drag parameters
257
- path: list[tuple[int, int]] | None = Field(
258
- None, description="Path for drag actions as list of (x, y) coordinates"
273
+ path: list[Coordinate] | None = Field(
274
+ None, description="Path for drag actions as list of {x, y} coordinates"
259
275
  ),
260
276
  # Wait parameter
261
277
  time: int | None = Field(None, description="Time in milliseconds for wait action"),
@@ -332,8 +348,9 @@ class HudComputerTool(BaseTool):
332
348
  elif action == "drag":
333
349
  if path is None:
334
350
  raise ToolError("path parameter is required for drag")
335
- # Scale path from client space to screen space
336
- scaled_path = self._scale_path(path)
351
+ # Convert Coordinate objects to tuples and scale from client space to screen space
352
+ path_tuples = [(point.x, point.y) for point in path]
353
+ scaled_path = self._scale_path(path_tuples)
337
354
  result = await self.executor.drag(
338
355
  path=scaled_path, pattern=pattern, hold_keys=hold_keys
339
356
  )