hud-python 0.4.45__py3-none-any.whl → 0.5.13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (282)
  1. hud/__init__.py +27 -7
  2. hud/agents/__init__.py +70 -5
  3. hud/agents/base.py +238 -500
  4. hud/agents/claude.py +236 -247
  5. hud/agents/gateway.py +42 -0
  6. hud/agents/gemini.py +264 -0
  7. hud/agents/gemini_cua.py +324 -0
  8. hud/agents/grounded_openai.py +98 -100
  9. hud/agents/misc/integration_test_agent.py +51 -20
  10. hud/agents/misc/response_agent.py +48 -36
  11. hud/agents/openai.py +282 -296
  12. hud/agents/{openai_chat_generic.py → openai_chat.py} +63 -33
  13. hud/agents/operator.py +199 -0
  14. hud/agents/resolver.py +70 -0
  15. hud/agents/tests/conftest.py +133 -0
  16. hud/agents/tests/test_base.py +300 -622
  17. hud/agents/tests/test_base_runtime.py +233 -0
  18. hud/agents/tests/test_claude.py +381 -214
  19. hud/agents/tests/test_client.py +9 -10
  20. hud/agents/tests/test_gemini.py +369 -0
  21. hud/agents/tests/test_grounded_openai_agent.py +65 -50
  22. hud/agents/tests/test_openai.py +377 -140
  23. hud/agents/tests/test_operator.py +362 -0
  24. hud/agents/tests/test_resolver.py +192 -0
  25. hud/agents/tests/test_run_eval.py +179 -0
  26. hud/agents/types.py +148 -0
  27. hud/cli/__init__.py +493 -546
  28. hud/cli/analyze.py +43 -5
  29. hud/cli/build.py +699 -113
  30. hud/cli/debug.py +8 -5
  31. hud/cli/dev.py +889 -732
  32. hud/cli/eval.py +793 -667
  33. hud/cli/flows/dev.py +167 -0
  34. hud/cli/flows/init.py +191 -0
  35. hud/cli/flows/tasks.py +153 -56
  36. hud/cli/flows/templates.py +151 -0
  37. hud/cli/flows/tests/__init__.py +1 -0
  38. hud/cli/flows/tests/test_dev.py +126 -0
  39. hud/cli/init.py +60 -58
  40. hud/cli/pull.py +1 -1
  41. hud/cli/push.py +38 -13
  42. hud/cli/rft.py +311 -0
  43. hud/cli/rft_status.py +145 -0
  44. hud/cli/tests/test_analyze.py +5 -5
  45. hud/cli/tests/test_analyze_metadata.py +3 -2
  46. hud/cli/tests/test_analyze_module.py +120 -0
  47. hud/cli/tests/test_build.py +110 -8
  48. hud/cli/tests/test_build_failure.py +41 -0
  49. hud/cli/tests/test_build_module.py +50 -0
  50. hud/cli/tests/test_cli_init.py +6 -1
  51. hud/cli/tests/test_cli_more_wrappers.py +30 -0
  52. hud/cli/tests/test_cli_root.py +140 -0
  53. hud/cli/tests/test_convert.py +361 -0
  54. hud/cli/tests/test_debug.py +12 -10
  55. hud/cli/tests/test_dev.py +197 -0
  56. hud/cli/tests/test_eval.py +251 -0
  57. hud/cli/tests/test_eval_bedrock.py +51 -0
  58. hud/cli/tests/test_init.py +124 -0
  59. hud/cli/tests/test_main_module.py +11 -5
  60. hud/cli/tests/test_mcp_server.py +12 -100
  61. hud/cli/tests/test_push.py +1 -1
  62. hud/cli/tests/test_push_happy.py +74 -0
  63. hud/cli/tests/test_push_wrapper.py +23 -0
  64. hud/cli/tests/test_registry.py +1 -1
  65. hud/cli/tests/test_utils.py +1 -1
  66. hud/cli/{rl → utils}/celebrate.py +14 -12
  67. hud/cli/utils/config.py +18 -1
  68. hud/cli/utils/docker.py +130 -4
  69. hud/cli/utils/env_check.py +9 -9
  70. hud/cli/utils/git.py +136 -0
  71. hud/cli/utils/interactive.py +39 -5
  72. hud/cli/utils/metadata.py +70 -1
  73. hud/cli/utils/runner.py +1 -1
  74. hud/cli/utils/server.py +2 -2
  75. hud/cli/utils/source_hash.py +3 -3
  76. hud/cli/utils/tasks.py +4 -1
  77. hud/cli/utils/tests/__init__.py +0 -0
  78. hud/cli/utils/tests/test_config.py +58 -0
  79. hud/cli/utils/tests/test_docker.py +93 -0
  80. hud/cli/utils/tests/test_docker_hints.py +71 -0
  81. hud/cli/utils/tests/test_env_check.py +74 -0
  82. hud/cli/utils/tests/test_environment.py +42 -0
  83. hud/cli/utils/tests/test_git.py +142 -0
  84. hud/cli/utils/tests/test_interactive_module.py +60 -0
  85. hud/cli/utils/tests/test_local_runner.py +50 -0
  86. hud/cli/utils/tests/test_logging_utils.py +23 -0
  87. hud/cli/utils/tests/test_metadata.py +49 -0
  88. hud/cli/utils/tests/test_package_runner.py +35 -0
  89. hud/cli/utils/tests/test_registry_utils.py +49 -0
  90. hud/cli/utils/tests/test_remote_runner.py +25 -0
  91. hud/cli/utils/tests/test_runner_modules.py +52 -0
  92. hud/cli/utils/tests/test_source_hash.py +36 -0
  93. hud/cli/utils/tests/test_tasks.py +80 -0
  94. hud/cli/utils/version_check.py +258 -0
  95. hud/cli/{rl → utils}/viewer.py +2 -2
  96. hud/clients/README.md +12 -11
  97. hud/clients/__init__.py +4 -3
  98. hud/clients/base.py +166 -26
  99. hud/clients/environment.py +51 -0
  100. hud/clients/fastmcp.py +13 -6
  101. hud/clients/mcp_use.py +45 -15
  102. hud/clients/tests/test_analyze_scenarios.py +206 -0
  103. hud/clients/tests/test_protocol.py +9 -3
  104. hud/datasets/__init__.py +23 -20
  105. hud/datasets/loader.py +326 -0
  106. hud/datasets/runner.py +198 -105
  107. hud/datasets/tests/__init__.py +0 -0
  108. hud/datasets/tests/test_loader.py +221 -0
  109. hud/datasets/tests/test_utils.py +315 -0
  110. hud/datasets/utils.py +270 -90
  111. hud/environment/__init__.py +52 -0
  112. hud/environment/connection.py +258 -0
  113. hud/environment/connectors/__init__.py +33 -0
  114. hud/environment/connectors/base.py +68 -0
  115. hud/environment/connectors/local.py +177 -0
  116. hud/environment/connectors/mcp_config.py +137 -0
  117. hud/environment/connectors/openai.py +101 -0
  118. hud/environment/connectors/remote.py +172 -0
  119. hud/environment/environment.py +835 -0
  120. hud/environment/integrations/__init__.py +45 -0
  121. hud/environment/integrations/adk.py +67 -0
  122. hud/environment/integrations/anthropic.py +196 -0
  123. hud/environment/integrations/gemini.py +92 -0
  124. hud/environment/integrations/langchain.py +82 -0
  125. hud/environment/integrations/llamaindex.py +68 -0
  126. hud/environment/integrations/openai.py +238 -0
  127. hud/environment/mock.py +306 -0
  128. hud/environment/router.py +263 -0
  129. hud/environment/scenarios.py +620 -0
  130. hud/environment/tests/__init__.py +1 -0
  131. hud/environment/tests/test_connection.py +317 -0
  132. hud/environment/tests/test_connectors.py +205 -0
  133. hud/environment/tests/test_environment.py +593 -0
  134. hud/environment/tests/test_integrations.py +257 -0
  135. hud/environment/tests/test_local_connectors.py +242 -0
  136. hud/environment/tests/test_scenarios.py +1086 -0
  137. hud/environment/tests/test_tools.py +208 -0
  138. hud/environment/types.py +23 -0
  139. hud/environment/utils/__init__.py +35 -0
  140. hud/environment/utils/formats.py +215 -0
  141. hud/environment/utils/schema.py +171 -0
  142. hud/environment/utils/tool_wrappers.py +113 -0
  143. hud/eval/__init__.py +67 -0
  144. hud/eval/context.py +727 -0
  145. hud/eval/display.py +299 -0
  146. hud/eval/instrument.py +187 -0
  147. hud/eval/manager.py +533 -0
  148. hud/eval/parallel.py +268 -0
  149. hud/eval/task.py +372 -0
  150. hud/eval/tests/__init__.py +1 -0
  151. hud/eval/tests/test_context.py +178 -0
  152. hud/eval/tests/test_eval.py +210 -0
  153. hud/eval/tests/test_manager.py +152 -0
  154. hud/eval/tests/test_parallel.py +168 -0
  155. hud/eval/tests/test_task.py +291 -0
  156. hud/eval/types.py +65 -0
  157. hud/eval/utils.py +194 -0
  158. hud/patches/__init__.py +19 -0
  159. hud/patches/mcp_patches.py +308 -0
  160. hud/patches/warnings.py +54 -0
  161. hud/samples/browser.py +4 -4
  162. hud/server/__init__.py +2 -1
  163. hud/server/low_level.py +2 -1
  164. hud/server/router.py +164 -0
  165. hud/server/server.py +567 -80
  166. hud/server/tests/test_mcp_server_integration.py +11 -11
  167. hud/server/tests/test_mcp_server_more.py +1 -1
  168. hud/server/tests/test_server_extra.py +2 -0
  169. hud/settings.py +45 -3
  170. hud/shared/exceptions.py +36 -10
  171. hud/shared/hints.py +26 -1
  172. hud/shared/requests.py +15 -3
  173. hud/shared/tests/test_exceptions.py +40 -31
  174. hud/shared/tests/test_hints.py +167 -0
  175. hud/telemetry/__init__.py +20 -19
  176. hud/telemetry/exporter.py +201 -0
  177. hud/telemetry/instrument.py +165 -253
  178. hud/telemetry/tests/test_eval_telemetry.py +356 -0
  179. hud/telemetry/tests/test_exporter.py +258 -0
  180. hud/telemetry/tests/test_instrument.py +401 -0
  181. hud/tools/__init__.py +18 -2
  182. hud/tools/agent.py +223 -0
  183. hud/tools/apply_patch.py +639 -0
  184. hud/tools/base.py +54 -4
  185. hud/tools/bash.py +2 -2
  186. hud/tools/computer/__init__.py +36 -3
  187. hud/tools/computer/anthropic.py +2 -2
  188. hud/tools/computer/gemini.py +385 -0
  189. hud/tools/computer/hud.py +23 -6
  190. hud/tools/computer/openai.py +20 -21
  191. hud/tools/computer/qwen.py +434 -0
  192. hud/tools/computer/settings.py +37 -0
  193. hud/tools/edit.py +3 -7
  194. hud/tools/executors/base.py +4 -2
  195. hud/tools/executors/pyautogui.py +1 -1
  196. hud/tools/grounding/grounded_tool.py +13 -18
  197. hud/tools/grounding/grounder.py +10 -31
  198. hud/tools/grounding/tests/test_grounded_tool.py +26 -44
  199. hud/tools/jupyter.py +330 -0
  200. hud/tools/playwright.py +18 -3
  201. hud/tools/shell.py +308 -0
  202. hud/tools/tests/test_agent_tool.py +355 -0
  203. hud/tools/tests/test_apply_patch.py +718 -0
  204. hud/tools/tests/test_computer.py +4 -9
  205. hud/tools/tests/test_computer_actions.py +24 -2
  206. hud/tools/tests/test_jupyter_tool.py +181 -0
  207. hud/tools/tests/test_shell.py +596 -0
  208. hud/tools/tests/test_submit.py +85 -0
  209. hud/tools/tests/test_types.py +193 -0
  210. hud/tools/types.py +21 -1
  211. hud/types.py +194 -56
  212. hud/utils/__init__.py +2 -0
  213. hud/utils/env.py +67 -0
  214. hud/utils/hud_console.py +89 -18
  215. hud/utils/mcp.py +15 -58
  216. hud/utils/strict_schema.py +162 -0
  217. hud/utils/tests/test_init.py +1 -2
  218. hud/utils/tests/test_mcp.py +1 -28
  219. hud/utils/tests/test_pretty_errors.py +186 -0
  220. hud/utils/tests/test_tool_shorthand.py +154 -0
  221. hud/utils/tests/test_version.py +1 -1
  222. hud/utils/types.py +20 -0
  223. hud/version.py +1 -1
  224. hud_python-0.5.13.dist-info/METADATA +264 -0
  225. hud_python-0.5.13.dist-info/RECORD +305 -0
  226. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/WHEEL +1 -1
  227. hud/agents/langchain.py +0 -261
  228. hud/agents/lite_llm.py +0 -72
  229. hud/cli/rl/__init__.py +0 -180
  230. hud/cli/rl/config.py +0 -101
  231. hud/cli/rl/display.py +0 -133
  232. hud/cli/rl/gpu.py +0 -63
  233. hud/cli/rl/gpu_utils.py +0 -321
  234. hud/cli/rl/local_runner.py +0 -595
  235. hud/cli/rl/presets.py +0 -96
  236. hud/cli/rl/remote_runner.py +0 -463
  237. hud/cli/rl/rl_api.py +0 -150
  238. hud/cli/rl/vllm.py +0 -177
  239. hud/cli/rl/wait_utils.py +0 -89
  240. hud/datasets/parallel.py +0 -687
  241. hud/misc/__init__.py +0 -1
  242. hud/misc/claude_plays_pokemon.py +0 -292
  243. hud/otel/__init__.py +0 -35
  244. hud/otel/collector.py +0 -142
  245. hud/otel/config.py +0 -181
  246. hud/otel/context.py +0 -570
  247. hud/otel/exporters.py +0 -369
  248. hud/otel/instrumentation.py +0 -135
  249. hud/otel/processors.py +0 -121
  250. hud/otel/tests/__init__.py +0 -1
  251. hud/otel/tests/test_processors.py +0 -197
  252. hud/rl/README.md +0 -30
  253. hud/rl/__init__.py +0 -1
  254. hud/rl/actor.py +0 -176
  255. hud/rl/buffer.py +0 -405
  256. hud/rl/chat_template.jinja +0 -101
  257. hud/rl/config.py +0 -192
  258. hud/rl/distributed.py +0 -132
  259. hud/rl/learner.py +0 -637
  260. hud/rl/tests/__init__.py +0 -1
  261. hud/rl/tests/test_learner.py +0 -186
  262. hud/rl/train.py +0 -382
  263. hud/rl/types.py +0 -101
  264. hud/rl/utils/start_vllm_server.sh +0 -30
  265. hud/rl/utils.py +0 -524
  266. hud/rl/vllm_adapter.py +0 -143
  267. hud/telemetry/job.py +0 -352
  268. hud/telemetry/replay.py +0 -74
  269. hud/telemetry/tests/test_replay.py +0 -40
  270. hud/telemetry/tests/test_trace.py +0 -63
  271. hud/telemetry/trace.py +0 -158
  272. hud/utils/agent_factories.py +0 -86
  273. hud/utils/async_utils.py +0 -65
  274. hud/utils/group_eval.py +0 -223
  275. hud/utils/progress.py +0 -149
  276. hud/utils/tasks.py +0 -127
  277. hud/utils/tests/test_async_utils.py +0 -173
  278. hud/utils/tests/test_progress.py +0 -261
  279. hud_python-0.4.45.dist-info/METADATA +0 -552
  280. hud_python-0.4.45.dist-info/RECORD +0 -228
  281. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/entry_points.txt +0 -0
  282. {hud_python-0.4.45.dist-info → hud_python-0.5.13.dist-info}/licenses/LICENSE +0 -0
hud/agents/gateway.py ADDED
@@ -0,0 +1,42 @@
1
+ """Gateway client utilities for HUD inference gateway."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from typing import Any
6
+
7
+
8
def build_gateway_client(provider: str) -> Any:
    """Create a provider SDK client that talks to the HUD inference gateway.

    Args:
        provider: Provider name, matched case-insensitively. ``"anthropic"``
            and ``"gemini"`` get their own SDK clients; every other value is
            treated as OpenAI-compatible.

    Returns:
        A client whose base URL is ``settings.hud_gateway_url`` and which
        authenticates with ``settings.api_key``.
    """
    from hud.settings import settings

    provider_key = provider.lower()

    if provider_key == "anthropic":
        from anthropic import AsyncAnthropic

        client: Any = AsyncAnthropic(
            api_key=settings.api_key,
            base_url=settings.hud_gateway_url,
        )
    elif provider_key == "gemini":
        from google import genai
        from google.genai.types import HttpOptions

        # The HUD key travels in the Authorization header; the SDK's own
        # api_key argument only receives a placeholder value.
        gateway_options = HttpOptions(
            api_version="v1beta",
            base_url=settings.hud_gateway_url,
            headers={"Authorization": f"Bearer {settings.api_key}"},
        )
        client = genai.Client(api_key="PLACEHOLDER", http_options=gateway_options)
    else:
        # OpenAI-compatible (openai, azure, together, groq, fireworks, etc.)
        from openai import AsyncOpenAI

        client = AsyncOpenAI(
            api_key=settings.api_key,
            base_url=settings.hud_gateway_url,
        )

    return client
hud/agents/gemini.py ADDED
@@ -0,0 +1,264 @@
1
+ """Gemini MCP Agent implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, ClassVar, cast
7
+
8
+ import mcp.types as types
9
+ from google import genai
10
+ from google.genai import types as genai_types
11
+
12
+ from hud.settings import settings
13
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
14
+ from hud.utils.hud_console import HUDConsole
15
+ from hud.utils.types import with_signature
16
+
17
+ from .base import MCPAgent
18
+ from .types import GeminiConfig, GeminiCreateParams
19
+
20
+ logger = logging.getLogger(__name__)
21
+
22
+
23
class GeminiAgent(MCPAgent):
    """
    Gemini agent that uses MCP servers for tool execution.

    This agent uses Gemini's native tool calling capabilities but executes
    tools through MCP servers instead of direct implementation.
    """

    metadata: ClassVar[dict[str, Any] | None] = None
    config_cls: ClassVar[type[BaseAgentConfig]] = GeminiConfig

    @with_signature(GeminiCreateParams)
    @classmethod
    def create(cls, **kwargs: Any) -> GeminiAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Build an agent by delegating to ``MCPAgent.create`` with this class."""
        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]

    def __init__(self, params: GeminiCreateParams | None = None, **kwargs: Any) -> None:
        """Resolve the Gemini client and cache the generation settings.

        The client is taken from ``config.model_client`` when provided.
        Otherwise a HUD gateway client is built if ``settings.api_key`` is
        set, falling back to a direct ``genai.Client`` using
        ``settings.gemini_api_key``.

        Raises:
            ValueError: If no client is configured and neither key is set, or
                if ``config.validate_api_key`` is enabled and the probe
                request fails.
        """
        super().__init__(params, **kwargs)
        self.config: GeminiConfig

        model_client = self.config.model_client
        if model_client is None:
            # Default to HUD gateway when HUD_API_KEY is available
            if settings.api_key:
                from hud.agents.gateway import build_gateway_client

                model_client = build_gateway_client("gemini")
            elif settings.gemini_api_key:
                model_client = genai.Client(api_key=settings.gemini_api_key)
            else:
                raise ValueError(
                    "No API key found. Set HUD_API_KEY for HUD gateway, "
                    "or GEMINI_API_KEY for direct Gemini access."
                )

        if self.config.validate_api_key:
            try:
                # Fetch only the first model. Exhausting the pager with
                # page_size=1 would issue one request per available model.
                probe = model_client.models.list(
                    config=genai_types.ListModelsConfig(page_size=1)
                )
                next(iter(probe), None)
            except Exception as e:
                raise ValueError(f"Gemini API key is invalid: {e}") from e

        self.gemini_client: genai.Client = model_client
        self.temperature = self.config.temperature
        self.top_p = self.config.top_p
        self.top_k = self.config.top_k
        self.max_output_tokens = self.config.max_output_tokens
        self.hud_console = HUDConsole(logger=logger)

        # Track mapping from Gemini tool names to MCP tool names
        self._gemini_to_mcp_tool_map: dict[str, str] = {}
        self.gemini_tools: genai_types.ToolListUnion = []

    def _on_tools_ready(self) -> None:
        """Build Gemini-specific tool mappings after tools are discovered."""
        self._convert_tools_for_gemini()

    async def get_system_messages(self) -> list[genai_types.Content]:
        """Return no messages; the system prompt is passed in ``get_response``."""
        return []

    async def format_blocks(self, blocks: list[types.ContentBlock]) -> list[genai_types.Content]:
        """Convert MCP content blocks into a single Gemini user message.

        Text and image blocks become Gemini parts; any other block type is
        dropped after logging a warning.

        Args:
            blocks: MCP content blocks to send to the model.

        Returns:
            A one-element list holding a ``user`` content with the parts.
        """
        import base64

        gemini_parts: list[genai_types.Part] = []

        for block in blocks:
            if isinstance(block, types.TextContent):
                gemini_parts.append(genai_types.Part(text=block.text))
            elif isinstance(block, types.ImageContent):
                # MCP carries the image as a base64 string; Gemini takes raw bytes
                image_bytes = base64.b64decode(block.data)
                gemini_parts.append(
                    genai_types.Part.from_bytes(data=image_bytes, mime_type=block.mimeType)
                )
            else:
                # For other types, try to handle but log a warning
                self.hud_console.log(f"Unknown content block type: {type(block)}", level="warning")

        return [genai_types.Content(role="user", parts=gemini_parts)]

    async def get_response(self, messages: list[genai_types.Content]) -> AgentResponse:
        """Get response from Gemini including any tool calls.

        Args:
            messages: Conversation so far. The first candidate's content is
                appended to this list in place.

        Returns:
            An ``AgentResponse`` with the concatenated text, any thinking
            text as ``reasoning``, and the extracted tool calls. ``done`` is
            False only when at least one tool call was extracted.
        """
        generate_config = genai_types.GenerateContentConfig(
            temperature=self.temperature,
            top_p=self.top_p,
            top_k=self.top_k,
            max_output_tokens=self.max_output_tokens,
            tools=self.gemini_tools,
            system_instruction=self.system_prompt,
        )

        # Use async API to avoid blocking the event loop
        response = await self.gemini_client.aio.models.generate_content(
            model=self.config.model,
            contents=cast("Any", messages),
            config=generate_config,
        )

        # Append assistant response (including any function_call) so that
        # subsequent FunctionResponse messages correspond to a prior FunctionCall
        if response.candidates and response.candidates[0].content:
            messages.append(response.candidates[0].content)

        result = AgentResponse(content="", tool_calls=[], done=True)
        collected_tool_calls: list[MCPToolCall] = []

        if not response.candidates:
            self.hud_console.warning("Response has no candidates")
            return result

        candidate = response.candidates[0]

        # Extract text content and function calls
        text_content = ""
        thinking_content = ""

        if candidate.content and candidate.content.parts:
            for part in candidate.content.parts:
                if part.function_call:
                    tool_call = self._extract_tool_call(part)
                    if tool_call is not None:
                        collected_tool_calls.append(tool_call)
                elif part.thought is True and part.text:
                    # Thought parts are kept apart from the visible answer
                    if thinking_content:
                        thinking_content += "\n"
                    thinking_content += part.text
                elif part.text:
                    text_content += part.text

        # Assign collected tool calls and mark done status
        if collected_tool_calls:
            result.tool_calls = collected_tool_calls
            result.done = False

        result.content = text_content
        if thinking_content:
            result.reasoning = thinking_content

        return result

    def _extract_tool_call(self, part: genai_types.Part) -> MCPToolCall | None:
        """Extract an MCPToolCall from a function call part.

        Subclasses can override to customize tool call extraction (e.g., normalizing
        computer use calls to a different schema).

        Returns:
            The tool call, or None when *part* carries no function call.
        """
        if not part.function_call:
            return None

        func_name = part.function_call.name or ""
        # Unknown names fall through unchanged
        mcp_tool_name = self._gemini_to_mcp_tool_map.get(func_name, func_name)
        raw_args = dict(part.function_call.args) if part.function_call.args else {}

        return MCPToolCall(
            name=mcp_tool_name,
            arguments=raw_args,
        )

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[genai_types.Content]:
        """Format tool results into Gemini messages.

        Each result becomes a ``FunctionResponse``. Errors carry an ``error``
        entry; successes carry ``success`` plus the first text block as
        ``output``.

        Args:
            tool_calls: The calls that were executed.
            tool_results: Their results, in the same order and of the same
                length (``zip`` is strict).

        Returns:
            A one-element list holding a ``user`` content with one part per
            function response.
        """
        function_responses = []

        for tool_call, result in zip(tool_calls, tool_results, strict=True):
            # Prefer the Gemini-side name when the call carries one
            gemini_name = getattr(tool_call, "gemini_name", tool_call.name)

            response_dict: dict[str, Any] = {}

            if result.isError:
                # Use the first text block as the error message, if any
                error_msg = "Tool execution failed"
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        error_msg = content.text
                        break
                response_dict["error"] = error_msg
            else:
                response_dict["success"] = True
                # Only the first text block is forwarded as output
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        response_dict["output"] = content.text
                        break

            function_response = genai_types.FunctionResponse(
                name=gemini_name,
                response=response_dict,
            )
            function_responses.append(function_response)

        # Return as a user message containing all function responses
        return [
            genai_types.Content(
                role="user",
                parts=[genai_types.Part(function_response=fr) for fr in function_responses],
            )
        ]

    async def create_user_message(self, text: str) -> genai_types.Content:
        """Create a user message in Gemini's format."""
        return genai_types.Content(role="user", parts=[genai_types.Part(text=text)])

    def _convert_tools_for_gemini(self) -> genai_types.ToolListUnion:
        """Convert MCP tools to Gemini tool format.

        Rebuilds ``self.gemini_tools`` and the name mapping from scratch,
        skipping tools for which ``_to_gemini_tool`` returns None.

        Returns:
            The rebuilt ``self.gemini_tools`` list.
        """
        self._gemini_to_mcp_tool_map = {}  # Reset mapping
        self.gemini_tools = []

        for tool in self.get_available_tools():
            gemini_tool = self._to_gemini_tool(tool)
            if gemini_tool is None:
                continue

            self._gemini_to_mcp_tool_map[tool.name] = tool.name
            self.gemini_tools.append(gemini_tool)

        return self.gemini_tools

    def _to_gemini_tool(self, tool: types.Tool) -> genai_types.Tool | None:
        """Convert a single MCP tool to Gemini tool format.

        Subclasses can override to customize tool conversion (e.g., for computer use).

        Raises:
            ValueError: If the tool lacks a description or an input schema.
        """
        if tool.description is None or tool.inputSchema is None:
            raise ValueError(f"MCP tool {tool.name} requires both a description and inputSchema.")
        function_decl = genai_types.FunctionDeclaration(
            name=tool.name,
            description=tool.description,
            parameters_json_schema=tool.inputSchema,
        )
        return genai_types.Tool(function_declarations=[function_decl])
hud/agents/gemini_cua.py ADDED
@@ -0,0 +1,324 @@
1
+ """Gemini Computer Use Agent implementation."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import Any, ClassVar
7
+
8
+ import mcp.types as types
9
+ from google.genai import types as genai_types
10
+
11
+ from hud.tools.computer.settings import computer_settings
12
+ from hud.types import AgentResponse, BaseAgentConfig, MCPToolCall, MCPToolResult
13
+ from hud.utils.types import with_signature
14
+
15
+ from .base import MCPAgent
16
+ from .gemini import GeminiAgent
17
+ from .types import GeminiCUAConfig, GeminiCUACreateParams
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ # Names of Gemini's predefined computer-use functions. GeminiCUAAgent routes
+ # calls with these names to its computer tool and matches function responses
+ # against them when trimming old screenshots.
22
+ PREDEFINED_COMPUTER_USE_FUNCTIONS = [
23
+ "open_web_browser",
24
+ "click_at",
25
+ "hover_at",
26
+ "type_text_at",
27
+ "scroll_document",
28
+ "scroll_at",
29
+ "wait_5_seconds",
30
+ "go_back",
31
+ "go_forward",
32
+ "search",
33
+ "navigate",
34
+ "key_combination",
35
+ "drag_and_drop",
36
+ ]
37
+
38
+ # Instructions GeminiCUAAgent.__init__ appends to the system prompt, or uses
+ # as the whole prompt when none is set.
+ GEMINI_CUA_INSTRUCTIONS = """
39
+ You are an autonomous computer-using agent. Follow these guidelines:
40
+
41
+ 1. NEVER ask for confirmation. Complete all tasks autonomously.
42
+ 2. Do NOT send messages like "I need to confirm before..." or "Do you want me to
43
+ continue?" - just proceed.
44
+ 3. When the user asks you to interact with something (like clicking a chat or typing
45
+ a message), DO IT without asking.
46
+ 4. Only use the formal safety check mechanism for truly dangerous operations (like
47
+ deleting important files).
48
+ 5. For normal tasks like clicking buttons, typing in chat boxes, filling forms -
49
+ JUST DO IT.
50
+ 6. The user has already given you permission by running this agent. No further
51
+ confirmation is needed.
52
+ 7. Be decisive and action-oriented. Complete the requested task fully.
53
+
54
+ Remember: You are expected to complete tasks autonomously. The user trusts you to do
55
+ what they asked.
56
+ """.strip()
57
+
58
+
59
class GeminiCUAAgent(GeminiAgent):
    """
    Gemini Computer Use Agent that extends GeminiAgent with computer use capabilities.

    This agent uses Gemini's native computer use capabilities but executes
    tools through MCP servers instead of direct implementation.
    """

    metadata: ClassVar[dict[str, Any] | None] = {
        "display_width": computer_settings.GEMINI_COMPUTER_WIDTH,
        "display_height": computer_settings.GEMINI_COMPUTER_HEIGHT,
    }
    required_tools: ClassVar[list[str]] = ["gemini_computer"]
    config_cls: ClassVar[type[BaseAgentConfig]] = GeminiCUAConfig

    @with_signature(GeminiCUACreateParams)
    @classmethod
    def create(cls, **kwargs: Any) -> GeminiCUAAgent:  # pyright: ignore[reportIncompatibleMethodOverride]
        """Build an agent by delegating to ``MCPAgent.create`` with this class."""
        return MCPAgent.create.__func__(cls, **kwargs)  # type: ignore[return-value]

    def __init__(self, params: GeminiCUACreateParams | None = None, **kwargs: Any) -> None:
        """Initialise the base Gemini agent, then add computer-use settings.

        Copies the excluded predefined functions from the config, reads the
        screenshot retention limit from ``computer_settings``, and appends
        ``GEMINI_CUA_INSTRUCTIONS`` to the system prompt (or uses it as the
        whole prompt when none is set).
        """
        super().__init__(params, **kwargs)  # type: ignore[arg-type]
        self.config: GeminiCUAConfig  # type: ignore[assignment]

        self._computer_tool_name = "gemini_computer"
        self.excluded_predefined_functions = list(self.config.excluded_predefined_functions)

        # Context management: Maximum number of recent turns to keep screenshots for
        # Configurable via GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS environment variable
        self.max_recent_turn_with_screenshots = (
            computer_settings.GEMINI_MAX_RECENT_TURN_WITH_SCREENSHOTS
        )

        # Add computer use instructions
        if self.system_prompt:
            self.system_prompt = f"{self.system_prompt}\n\n{GEMINI_CUA_INSTRUCTIONS}"
        else:
            self.system_prompt = GEMINI_CUA_INSTRUCTIONS

    def _to_gemini_tool(self, tool: types.Tool) -> genai_types.Tool | None:
        """Convert a single MCP tool to Gemini tool format.

        Handles gemini_computer tool specially by using Gemini's native ComputerUse.

        Returns:
            A ``ComputerUse`` tool for the ``gemini_computer`` tool, None for
            any other tool named ``computer`` or ending in ``_computer``, and
            the parent's conversion for everything else.
        """
        if tool.name == self._computer_tool_name:
            # Use Gemini's native computer use capability
            return genai_types.Tool(
                computer_use=genai_types.ComputerUse(
                    environment=genai_types.Environment.ENVIRONMENT_BROWSER,
                    excluded_predefined_functions=self.excluded_predefined_functions,
                )
            )

        # Other computer tools are not exposed to Gemini at all
        if tool.name == "computer" or tool.name.endswith("_computer"):
            return None

        # For non-computer tools, use the parent implementation
        return super()._to_gemini_tool(tool)

    async def get_response(self, messages: list[genai_types.Content]) -> AgentResponse:
        """Get response from Gemini including any tool calls.

        Extends parent to trim old screenshots before making API call.
        """
        # Trim screenshots from older turns to manage context growth
        self._remove_old_screenshots(messages)

        return await super().get_response(messages)

    async def format_tool_results(
        self, tool_calls: list[MCPToolCall], tool_results: list[MCPToolResult]
    ) -> list[genai_types.Content]:
        """Format tool results into Gemini messages.

        Handles computer tool results specially with screenshots and URLs.

        Args:
            tool_calls: The calls that were executed.
            tool_results: Their results, in the same order and of the same
                length (``zip`` is strict).

        Returns:
            A one-element list holding a ``user`` content with one part per
            function response.
        """
        import base64

        url_marker = "__URL__:"
        function_responses = []

        for tool_call, result in zip(tool_calls, tool_results, strict=True):
            # Prefer the Gemini-side name when the call carries one
            gemini_name = getattr(tool_call, "gemini_name", tool_call.name)

            # Check if this is a computer use tool call
            is_computer_call = tool_call.name == self._computer_tool_name

            response_dict: dict[str, Any] = {}
            url = None

            if result.isError:
                # Use the first non-URL text block as the error message
                error_msg = "Tool execution failed"
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        # Check if this is a URL metadata block
                        if content.text.startswith(url_marker):
                            # Strip only the leading marker, never a later match
                            url = content.text.removeprefix(url_marker)
                        else:
                            error_msg = content.text
                            break
                response_dict["error"] = error_msg
                # For the Gemini CUA agent, if a nonexistent computer tool is called, it
                # won't technically count as a computer tool call, but we still need to
                # return a url
                response_dict["url"] = url if url else "about:blank"
            else:
                response_dict["success"] = True

            # Extract URL and screenshot from content (for computer use)
            screenshot_parts = []
            if is_computer_call:
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        # Check if this is a URL metadata block
                        if content.text.startswith(url_marker):
                            url = content.text.removeprefix(url_marker)
                    elif isinstance(content, types.ImageContent):
                        # Decode base64 string to bytes for FunctionResponseBlob
                        image_bytes = base64.b64decode(content.data)
                        screenshot_parts.append(
                            genai_types.FunctionResponsePart(
                                inline_data=genai_types.FunctionResponseBlob(
                                    mime_type=content.mimeType or "image/png",
                                    data=image_bytes,
                                )
                            )
                        )

                # Add URL to response dict (required by Gemini Computer Use model)
                # URL must ALWAYS be present per Gemini API requirements
                response_dict["url"] = url if url else "about:blank"

                # For Gemini Computer Use actions, always acknowledge safety decisions
                requires_ack = False
                if tool_call.arguments:
                    requires_ack = bool(tool_call.arguments.get("safety_decision"))
                if requires_ack:
                    response_dict["safety_acknowledgement"] = True
            else:
                # For non-computer tools, add text content to response
                for content in result.content:
                    if isinstance(content, types.TextContent):
                        response_dict["output"] = content.text
                        break

            function_response = genai_types.FunctionResponse(
                name=gemini_name,
                response=response_dict,
                parts=screenshot_parts if screenshot_parts else None,
            )
            function_responses.append(function_response)

        # Return as a user message containing all function responses
        return [
            genai_types.Content(
                role="user",
                parts=[genai_types.Part(function_response=fr) for fr in function_responses],
            )
        ]

    @staticmethod
    def _coerce_point(value: Any) -> tuple[int, int] | None:
        """Return ``(x, y)`` as ints from a 2+ element list/tuple, else None."""
        if not (isinstance(value, list | tuple) and len(value) >= 2):
            return None
        try:
            # Convert both before returning so a bad second value cannot
            # leave a half-populated coordinate behind.
            return int(value[0]), int(value[1])
        except (TypeError, ValueError):
            return None

    def _extract_tool_call(self, part: genai_types.Part) -> MCPToolCall | None:
        """Extract an MCPToolCall from a function call part.

        Routes predefined Gemini Computer Use functions to the gemini_computer tool
        and normalizes the arguments to MCP tool schema.

        Returns:
            The tool call, or None when *part* carries no function call.
        """
        if not part.function_call:
            return None

        func_name = part.function_call.name or ""
        raw_args = dict(part.function_call.args) if part.function_call.args else {}

        # Non-computer tools: use parent implementation
        if func_name not in PREDEFINED_COMPUTER_USE_FUNCTIONS:
            return super()._extract_tool_call(part)

        # Normalize Gemini Computer Use calls to MCP tool schema
        # Ensure 'action' is present and equals the Gemini function name
        normalized_args: dict[str, Any] = {"action": func_name}

        # Coordinate arrays → x/y
        point = self._coerce_point(raw_args.get("coordinate") or raw_args.get("coordinates"))
        if point is not None:
            normalized_args["x"], normalized_args["y"] = point

        # Destination coordinate arrays → destination_x/destination_y
        destination = self._coerce_point(
            raw_args.get("destination")
            or raw_args.get("destination_coordinate")
            or raw_args.get("destinationCoordinate")
        )
        if destination is not None:
            normalized_args["destination_x"], normalized_args["destination_y"] = destination

        # Pass through supported fields if present (including direct coords).
        # Direct x/y values override ones derived from coordinate arrays.
        for key in (
            "text",
            "press_enter",
            "clear_before_typing",
            "safety_decision",
            "direction",
            "magnitude",
            "url",
            "keys",
            "x",
            "y",
            "destination_x",
            "destination_y",
        ):
            if key in raw_args:
                normalized_args[key] = raw_args[key]

        return MCPToolCall(
            name=self._computer_tool_name,
            arguments=normalized_args,
            gemini_name=func_name,  # type: ignore[arg-type]
        )

    @staticmethod
    def _is_screenshot_response(part: genai_types.Part) -> bool:
        """Return True for a predefined computer-use response that has parts."""
        response = part.function_response
        return bool(
            response and response.parts and response.name in PREDEFINED_COMPUTER_USE_FUNCTIONS
        )

    def _remove_old_screenshots(self, messages: list[genai_types.Content]) -> None:
        """
        Remove screenshots from old turns to manage context length.
        Keeps only the last N turns with screenshots (configured via
        self.max_recent_turn_with_screenshots).

        Mutates *messages* in place by clearing ``function_response.parts``.
        """
        turns_with_screenshots = 0

        # Walk newest-first so the most recent turns are the ones kept
        for content in reversed(messages):
            if content.role != "user" or not content.parts:
                continue

            screenshot_parts = [
                part for part in content.parts if self._is_screenshot_response(part)
            ]
            if not screenshot_parts:
                continue

            turns_with_screenshots += 1
            if turns_with_screenshots > self.max_recent_turn_with_screenshots:
                for part in screenshot_parts:
                    if part.function_response:
                        # Clear the parts (screenshots)
                        part.function_response.parts = None