hud-python 0.4.45__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- hud/__init__.py +27 -7
- hud/agents/__init__.py +11 -5
- hud/agents/base.py +220 -500
- hud/agents/claude.py +200 -240
- hud/agents/gemini.py +275 -0
- hud/agents/gemini_cua.py +335 -0
- hud/agents/grounded_openai.py +98 -100
- hud/agents/misc/integration_test_agent.py +51 -20
- hud/agents/misc/response_agent.py +41 -36
- hud/agents/openai.py +291 -292
- hud/agents/{openai_chat_generic.py → openai_chat.py} +80 -34
- hud/agents/operator.py +211 -0
- hud/agents/tests/conftest.py +133 -0
- hud/agents/tests/test_base.py +300 -622
- hud/agents/tests/test_base_runtime.py +233 -0
- hud/agents/tests/test_claude.py +379 -210
- hud/agents/tests/test_client.py +9 -10
- hud/agents/tests/test_gemini.py +369 -0
- hud/agents/tests/test_grounded_openai_agent.py +65 -50
- hud/agents/tests/test_openai.py +376 -140
- hud/agents/tests/test_operator.py +362 -0
- hud/agents/tests/test_run_eval.py +179 -0
- hud/cli/__init__.py +461 -545
- hud/cli/analyze.py +43 -5
- hud/cli/build.py +664 -110
- hud/cli/debug.py +8 -5
- hud/cli/dev.py +882 -734
- hud/cli/eval.py +782 -668
- hud/cli/flows/dev.py +167 -0
- hud/cli/flows/init.py +191 -0
- hud/cli/flows/tasks.py +153 -56
- hud/cli/flows/templates.py +151 -0
- hud/cli/flows/tests/__init__.py +1 -0
- hud/cli/flows/tests/test_dev.py +126 -0
- hud/cli/init.py +60 -58
- hud/cli/push.py +29 -11
- hud/cli/rft.py +311 -0
- hud/cli/rft_status.py +145 -0
- hud/cli/tests/test_analyze.py +5 -5
- hud/cli/tests/test_analyze_metadata.py +3 -2
- hud/cli/tests/test_analyze_module.py +120 -0
- hud/cli/tests/test_build.py +108 -6
- hud/cli/tests/test_build_failure.py +41 -0
- hud/cli/tests/test_build_module.py +50 -0
- hud/cli/tests/test_cli_init.py +6 -1
- hud/cli/tests/test_cli_more_wrappers.py +30 -0
- hud/cli/tests/test_cli_root.py +140 -0
- hud/cli/tests/test_convert.py +361 -0
- hud/cli/tests/test_debug.py +12 -10
- hud/cli/tests/test_dev.py +197 -0
- hud/cli/tests/test_eval.py +251 -0
- hud/cli/tests/test_eval_bedrock.py +51 -0
- hud/cli/tests/test_init.py +124 -0
- hud/cli/tests/test_main_module.py +11 -5
- hud/cli/tests/test_mcp_server.py +12 -100
- hud/cli/tests/test_push_happy.py +74 -0
- hud/cli/tests/test_push_wrapper.py +23 -0
- hud/cli/tests/test_registry.py +1 -1
- hud/cli/tests/test_utils.py +1 -1
- hud/cli/{rl → utils}/celebrate.py +14 -12
- hud/cli/utils/config.py +18 -1
- hud/cli/utils/docker.py +130 -4
- hud/cli/utils/env_check.py +9 -9
- hud/cli/utils/git.py +136 -0
- hud/cli/utils/interactive.py +39 -5
- hud/cli/utils/metadata.py +69 -0
- hud/cli/utils/runner.py +1 -1
- hud/cli/utils/server.py +2 -2
- hud/cli/utils/source_hash.py +3 -3
- hud/cli/utils/tasks.py +4 -1
- hud/cli/utils/tests/__init__.py +0 -0
- hud/cli/utils/tests/test_config.py +58 -0
- hud/cli/utils/tests/test_docker.py +93 -0
- hud/cli/utils/tests/test_docker_hints.py +71 -0
- hud/cli/utils/tests/test_env_check.py +74 -0
- hud/cli/utils/tests/test_environment.py +42 -0
- hud/cli/utils/tests/test_git.py +142 -0
- hud/cli/utils/tests/test_interactive_module.py +60 -0
- hud/cli/utils/tests/test_local_runner.py +50 -0
- hud/cli/utils/tests/test_logging_utils.py +23 -0
- hud/cli/utils/tests/test_metadata.py +49 -0
- hud/cli/utils/tests/test_package_runner.py +35 -0
- hud/cli/utils/tests/test_registry_utils.py +49 -0
- hud/cli/utils/tests/test_remote_runner.py +25 -0
- hud/cli/utils/tests/test_runner_modules.py +52 -0
- hud/cli/utils/tests/test_source_hash.py +36 -0
- hud/cli/utils/tests/test_tasks.py +80 -0
- hud/cli/utils/version_check.py +258 -0
- hud/cli/{rl → utils}/viewer.py +2 -2
- hud/clients/README.md +12 -11
- hud/clients/__init__.py +4 -3
- hud/clients/base.py +166 -26
- hud/clients/environment.py +51 -0
- hud/clients/fastmcp.py +13 -6
- hud/clients/mcp_use.py +40 -15
- hud/clients/tests/test_analyze_scenarios.py +206 -0
- hud/clients/tests/test_protocol.py +9 -3
- hud/datasets/__init__.py +23 -20
- hud/datasets/loader.py +327 -0
- hud/datasets/runner.py +192 -105
- hud/datasets/tests/__init__.py +0 -0
- hud/datasets/tests/test_loader.py +221 -0
- hud/datasets/tests/test_utils.py +315 -0
- hud/datasets/utils.py +270 -90
- hud/environment/__init__.py +50 -0
- hud/environment/connection.py +206 -0
- hud/environment/connectors/__init__.py +33 -0
- hud/environment/connectors/base.py +68 -0
- hud/environment/connectors/local.py +177 -0
- hud/environment/connectors/mcp_config.py +109 -0
- hud/environment/connectors/openai.py +101 -0
- hud/environment/connectors/remote.py +172 -0
- hud/environment/environment.py +694 -0
- hud/environment/integrations/__init__.py +45 -0
- hud/environment/integrations/adk.py +67 -0
- hud/environment/integrations/anthropic.py +196 -0
- hud/environment/integrations/gemini.py +92 -0
- hud/environment/integrations/langchain.py +82 -0
- hud/environment/integrations/llamaindex.py +68 -0
- hud/environment/integrations/openai.py +238 -0
- hud/environment/mock.py +306 -0
- hud/environment/router.py +112 -0
- hud/environment/scenarios.py +493 -0
- hud/environment/tests/__init__.py +1 -0
- hud/environment/tests/test_connection.py +317 -0
- hud/environment/tests/test_connectors.py +218 -0
- hud/environment/tests/test_environment.py +161 -0
- hud/environment/tests/test_integrations.py +257 -0
- hud/environment/tests/test_local_connectors.py +201 -0
- hud/environment/tests/test_scenarios.py +280 -0
- hud/environment/tests/test_tools.py +208 -0
- hud/environment/types.py +23 -0
- hud/environment/utils/__init__.py +35 -0
- hud/environment/utils/formats.py +215 -0
- hud/environment/utils/schema.py +171 -0
- hud/environment/utils/tool_wrappers.py +113 -0
- hud/eval/__init__.py +67 -0
- hud/eval/context.py +674 -0
- hud/eval/display.py +299 -0
- hud/eval/instrument.py +185 -0
- hud/eval/manager.py +466 -0
- hud/eval/parallel.py +268 -0
- hud/eval/task.py +340 -0
- hud/eval/tests/__init__.py +1 -0
- hud/eval/tests/test_context.py +178 -0
- hud/eval/tests/test_eval.py +210 -0
- hud/eval/tests/test_manager.py +152 -0
- hud/eval/tests/test_parallel.py +168 -0
- hud/eval/tests/test_task.py +145 -0
- hud/eval/types.py +63 -0
- hud/eval/utils.py +183 -0
- hud/patches/__init__.py +19 -0
- hud/patches/mcp_patches.py +151 -0
- hud/patches/warnings.py +54 -0
- hud/samples/browser.py +4 -4
- hud/server/__init__.py +2 -1
- hud/server/low_level.py +2 -1
- hud/server/router.py +164 -0
- hud/server/server.py +567 -80
- hud/server/tests/test_mcp_server_integration.py +11 -11
- hud/server/tests/test_mcp_server_more.py +1 -1
- hud/server/tests/test_server_extra.py +2 -0
- hud/settings.py +45 -3
- hud/shared/exceptions.py +36 -10
- hud/shared/hints.py +26 -1
- hud/shared/requests.py +15 -3
- hud/shared/tests/test_exceptions.py +40 -31
- hud/shared/tests/test_hints.py +167 -0
- hud/telemetry/__init__.py +20 -19
- hud/telemetry/exporter.py +201 -0
- hud/telemetry/instrument.py +158 -253
- hud/telemetry/tests/test_eval_telemetry.py +356 -0
- hud/telemetry/tests/test_exporter.py +258 -0
- hud/telemetry/tests/test_instrument.py +401 -0
- hud/tools/__init__.py +16 -2
- hud/tools/apply_patch.py +639 -0
- hud/tools/base.py +54 -4
- hud/tools/bash.py +2 -2
- hud/tools/computer/__init__.py +4 -0
- hud/tools/computer/anthropic.py +2 -2
- hud/tools/computer/gemini.py +385 -0
- hud/tools/computer/hud.py +23 -6
- hud/tools/computer/openai.py +20 -21
- hud/tools/computer/qwen.py +434 -0
- hud/tools/computer/settings.py +37 -0
- hud/tools/edit.py +3 -7
- hud/tools/executors/base.py +4 -2
- hud/tools/executors/pyautogui.py +1 -1
- hud/tools/grounding/grounded_tool.py +13 -18
- hud/tools/grounding/grounder.py +10 -31
- hud/tools/grounding/tests/test_grounded_tool.py +26 -44
- hud/tools/jupyter.py +330 -0
- hud/tools/playwright.py +18 -3
- hud/tools/shell.py +308 -0
- hud/tools/tests/test_apply_patch.py +718 -0
- hud/tools/tests/test_computer.py +4 -9
- hud/tools/tests/test_computer_actions.py +24 -2
- hud/tools/tests/test_jupyter_tool.py +181 -0
- hud/tools/tests/test_shell.py +596 -0
- hud/tools/tests/test_submit.py +85 -0
- hud/tools/tests/test_types.py +193 -0
- hud/tools/types.py +21 -1
- hud/types.py +167 -57
- hud/utils/__init__.py +2 -0
- hud/utils/env.py +67 -0
- hud/utils/hud_console.py +61 -3
- hud/utils/mcp.py +15 -58
- hud/utils/strict_schema.py +162 -0
- hud/utils/tests/test_init.py +1 -2
- hud/utils/tests/test_mcp.py +1 -28
- hud/utils/tests/test_pretty_errors.py +186 -0
- hud/utils/tests/test_tool_shorthand.py +154 -0
- hud/utils/tests/test_version.py +1 -1
- hud/utils/types.py +20 -0
- hud/version.py +1 -1
- hud_python-0.5.1.dist-info/METADATA +264 -0
- hud_python-0.5.1.dist-info/RECORD +299 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/WHEEL +1 -1
- hud/agents/langchain.py +0 -261
- hud/agents/lite_llm.py +0 -72
- hud/cli/rl/__init__.py +0 -180
- hud/cli/rl/config.py +0 -101
- hud/cli/rl/display.py +0 -133
- hud/cli/rl/gpu.py +0 -63
- hud/cli/rl/gpu_utils.py +0 -321
- hud/cli/rl/local_runner.py +0 -595
- hud/cli/rl/presets.py +0 -96
- hud/cli/rl/remote_runner.py +0 -463
- hud/cli/rl/rl_api.py +0 -150
- hud/cli/rl/vllm.py +0 -177
- hud/cli/rl/wait_utils.py +0 -89
- hud/datasets/parallel.py +0 -687
- hud/misc/__init__.py +0 -1
- hud/misc/claude_plays_pokemon.py +0 -292
- hud/otel/__init__.py +0 -35
- hud/otel/collector.py +0 -142
- hud/otel/config.py +0 -181
- hud/otel/context.py +0 -570
- hud/otel/exporters.py +0 -369
- hud/otel/instrumentation.py +0 -135
- hud/otel/processors.py +0 -121
- hud/otel/tests/__init__.py +0 -1
- hud/otel/tests/test_processors.py +0 -197
- hud/rl/README.md +0 -30
- hud/rl/__init__.py +0 -1
- hud/rl/actor.py +0 -176
- hud/rl/buffer.py +0 -405
- hud/rl/chat_template.jinja +0 -101
- hud/rl/config.py +0 -192
- hud/rl/distributed.py +0 -132
- hud/rl/learner.py +0 -637
- hud/rl/tests/__init__.py +0 -1
- hud/rl/tests/test_learner.py +0 -186
- hud/rl/train.py +0 -382
- hud/rl/types.py +0 -101
- hud/rl/utils/start_vllm_server.sh +0 -30
- hud/rl/utils.py +0 -524
- hud/rl/vllm_adapter.py +0 -143
- hud/telemetry/job.py +0 -352
- hud/telemetry/replay.py +0 -74
- hud/telemetry/tests/test_replay.py +0 -40
- hud/telemetry/tests/test_trace.py +0 -63
- hud/telemetry/trace.py +0 -158
- hud/utils/agent_factories.py +0 -86
- hud/utils/async_utils.py +0 -65
- hud/utils/group_eval.py +0 -223
- hud/utils/progress.py +0 -149
- hud/utils/tasks.py +0 -127
- hud/utils/tests/test_async_utils.py +0 -173
- hud/utils/tests/test_progress.py +0 -261
- hud_python-0.4.45.dist-info/METADATA +0 -552
- hud_python-0.4.45.dist-info/RECORD +0 -228
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.45.dist-info → hud_python-0.5.1.dist-info}/licenses/LICENSE +0 -0
hud/tools/base.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
import logging
|
|
3
4
|
from abc import ABC, abstractmethod
|
|
4
5
|
from typing import TYPE_CHECKING, Any, cast
|
|
5
6
|
|
|
@@ -8,7 +9,7 @@ from fastmcp import FastMCP
|
|
|
8
9
|
from hud.tools.types import ContentBlock, EvaluationResult
|
|
9
10
|
|
|
10
11
|
if TYPE_CHECKING:
|
|
11
|
-
from collections.abc import Callable
|
|
12
|
+
from collections.abc import Awaitable, Callable
|
|
12
13
|
|
|
13
14
|
from fastmcp.tools import FunctionTool
|
|
14
15
|
from fastmcp.tools.tool import Tool, ToolResult
|
|
@@ -16,6 +17,8 @@ if TYPE_CHECKING:
|
|
|
16
17
|
# Basic result types for tools
|
|
17
18
|
BaseResult = list[ContentBlock] | EvaluationResult
|
|
18
19
|
|
|
20
|
+
logger = logging.getLogger(__name__)
|
|
21
|
+
|
|
19
22
|
|
|
20
23
|
class BaseTool(ABC):
|
|
21
24
|
"""
|
|
@@ -58,6 +61,10 @@ class BaseTool(ABC):
|
|
|
58
61
|
self.title = title or self.__class__.__name__.replace("Tool", "").replace("_", " ").title()
|
|
59
62
|
self.description = description or (self.__doc__.strip() if self.__doc__ else None)
|
|
60
63
|
self.meta = meta
|
|
64
|
+
self._callbacks: dict[
|
|
65
|
+
str,
|
|
66
|
+
list[Callable[..., Awaitable[Any]]],
|
|
67
|
+
] = {} # {"event_name": [callback_functions]}
|
|
61
68
|
|
|
62
69
|
# Expose attributes FastMCP expects when registering an instance directly
|
|
63
70
|
self.__name__ = self.name # FastMCP uses fn.__name__ if name param omitted
|
|
@@ -100,13 +107,48 @@ class BaseTool(ABC):
|
|
|
100
107
|
)
|
|
101
108
|
return self._mcp_tool
|
|
102
109
|
|
|
110
|
+
def add_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
|
|
111
|
+
"""Register a callback function for specific event
|
|
112
|
+
|
|
113
|
+
Args:
|
|
114
|
+
event_type: (Required) Specific event name to trigger callback
|
|
115
|
+
e.g. "after_click", "before_navigate"
|
|
116
|
+
callback: (Required) Async function to call. Must be defined by `async def f(...)`
|
|
117
|
+
"""
|
|
118
|
+
if event_type not in self._callbacks:
|
|
119
|
+
self._callbacks[event_type] = []
|
|
120
|
+
self._callbacks[event_type].append(callback)
|
|
121
|
+
|
|
122
|
+
def remove_callback(self, event_type: str, callback: Callable[..., Awaitable[Any]]) -> None:
|
|
123
|
+
"""Remove a registered callback
|
|
124
|
+
Args:
|
|
125
|
+
event_type: (Required) Specific event name to trigger callback
|
|
126
|
+
e.g. "after_click", "before_navigate"
|
|
127
|
+
callback: (Required) Function to remove from callback list.
|
|
128
|
+
"""
|
|
129
|
+
if (event_type in self._callbacks) and (callback in self._callbacks[event_type]):
|
|
130
|
+
self._callbacks[event_type].remove(callback)
|
|
131
|
+
|
|
132
|
+
async def _trigger_callbacks(self, event_type: str, **kwargs: Any) -> None:
|
|
133
|
+
"""Trigger all registered callback functions of an event type"""
|
|
134
|
+
callback_list = self._callbacks.get(event_type, [])
|
|
135
|
+
for callback in callback_list:
|
|
136
|
+
try:
|
|
137
|
+
await callback(**kwargs)
|
|
138
|
+
except Exception as e:
|
|
139
|
+
logger.warning("Callback failed for %s: %s", event_type, e)
|
|
140
|
+
|
|
103
141
|
|
|
104
142
|
# Prefix for internal tool names
|
|
105
143
|
_INTERNAL_PREFIX = "int_"
|
|
106
144
|
|
|
107
145
|
|
|
108
146
|
class BaseHub(FastMCP):
|
|
109
|
-
"""A composition-friendly FastMCP server that holds an internal tool dispatcher.
|
|
147
|
+
"""A composition-friendly FastMCP server that holds an internal tool dispatcher.
|
|
148
|
+
|
|
149
|
+
Note: BaseHub can be used standalone or to wrap existing routers. For the newer
|
|
150
|
+
FastAPI-like pattern, consider using HiddenRouter from hud.server instead.
|
|
151
|
+
"""
|
|
110
152
|
|
|
111
153
|
env: Any
|
|
112
154
|
|
|
@@ -129,6 +171,10 @@ class BaseHub(FastMCP):
|
|
|
129
171
|
Optional long-lived environment object. Stored on the server
|
|
130
172
|
instance (``layer.env``) and therefore available to every request
|
|
131
173
|
via ``ctx.fastmcp.env``.
|
|
174
|
+
title:
|
|
175
|
+
Optional title for the dispatcher tool.
|
|
176
|
+
description:
|
|
177
|
+
Optional description for the dispatcher tool.
|
|
132
178
|
meta:
|
|
133
179
|
Metadata to include in MCP tool listing.
|
|
134
180
|
"""
|
|
@@ -370,8 +416,12 @@ class BaseHub(FastMCP):
|
|
|
370
416
|
}
|
|
371
417
|
|
|
372
418
|
# Override _list_tools to hide internal tools when mounted
|
|
373
|
-
async def _list_tools(self) -> list[Tool]:
|
|
374
|
-
"""Override _list_tools to hide internal tools when mounted.
|
|
419
|
+
async def _list_tools(self, context: Any = None) -> list[Tool]:
|
|
420
|
+
"""Override _list_tools to hide internal tools when mounted.
|
|
421
|
+
|
|
422
|
+
Args:
|
|
423
|
+
context: MiddlewareContext passed by FastMCP (optional for backwards compat)
|
|
424
|
+
"""
|
|
375
425
|
return [
|
|
376
426
|
tool
|
|
377
427
|
for key, tool in self._tool_manager._tools.items()
|
hud/tools/bash.py
CHANGED
|
@@ -3,7 +3,7 @@ from __future__ import annotations
|
|
|
3
3
|
import asyncio
|
|
4
4
|
import os
|
|
5
5
|
import sys
|
|
6
|
-
from typing import TYPE_CHECKING
|
|
6
|
+
from typing import TYPE_CHECKING
|
|
7
7
|
|
|
8
8
|
from .base import BaseTool
|
|
9
9
|
from .types import ContentResult, ToolError
|
|
@@ -140,7 +140,7 @@ class BashTool(BaseTool):
|
|
|
140
140
|
self.env = value
|
|
141
141
|
|
|
142
142
|
async def __call__(
|
|
143
|
-
self, command: str | None = None, restart: bool = False
|
|
143
|
+
self, command: str | None = None, restart: bool = False
|
|
144
144
|
) -> list[ContentBlock]:
|
|
145
145
|
if restart:
|
|
146
146
|
if self.session:
|
hud/tools/computer/__init__.py
CHANGED
|
@@ -3,13 +3,17 @@
|
|
|
3
3
|
from __future__ import annotations
|
|
4
4
|
|
|
5
5
|
from .anthropic import AnthropicComputerTool
|
|
6
|
+
from .gemini import GeminiComputerTool
|
|
6
7
|
from .hud import HudComputerTool
|
|
7
8
|
from .openai import OpenAIComputerTool
|
|
9
|
+
from .qwen import QwenComputerTool
|
|
8
10
|
from .settings import computer_settings
|
|
9
11
|
|
|
10
12
|
__all__ = [
|
|
11
13
|
"AnthropicComputerTool",
|
|
14
|
+
"GeminiComputerTool",
|
|
12
15
|
"HudComputerTool",
|
|
13
16
|
"OpenAIComputerTool",
|
|
17
|
+
"QwenComputerTool",
|
|
14
18
|
"computer_settings",
|
|
15
19
|
]
|
hud/tools/computer/anthropic.py
CHANGED
|
@@ -141,13 +141,13 @@ class AnthropicComputerTool(HudComputerTool):
|
|
|
141
141
|
async def __call__(
|
|
142
142
|
self,
|
|
143
143
|
action: str = Field(..., description="The action to perform on the computer"),
|
|
144
|
-
coordinate: list[int] |
|
|
144
|
+
coordinate: list[int] | None = Field(
|
|
145
145
|
None, description="The coordinate to interact with on the computer [x, y]"
|
|
146
146
|
),
|
|
147
147
|
text: str | None = Field(
|
|
148
148
|
None, description="The text to type on the computer or key to press"
|
|
149
149
|
),
|
|
150
|
-
start_coordinate: list[int] |
|
|
150
|
+
start_coordinate: list[int] | None = Field(
|
|
151
151
|
None, description="The starting coordinate for drag actions [x, y]"
|
|
152
152
|
),
|
|
153
153
|
scroll_direction: str | None = Field(
|
|
@@ -0,0 +1,385 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import logging
|
|
4
|
+
import platform
|
|
5
|
+
from typing import TYPE_CHECKING, Any, Literal
|
|
6
|
+
|
|
7
|
+
from mcp import ErrorData, McpError
|
|
8
|
+
from mcp.types import INVALID_PARAMS, ContentBlock
|
|
9
|
+
from pydantic import Field
|
|
10
|
+
|
|
11
|
+
from hud.tools.types import ContentResult
|
|
12
|
+
|
|
13
|
+
from .hud import HudComputerTool
|
|
14
|
+
from .settings import computer_settings
|
|
15
|
+
|
|
16
|
+
if TYPE_CHECKING:
|
|
17
|
+
from hud.tools.executors.base import BaseExecutor
|
|
18
|
+
|
|
19
|
+
logger = logging.getLogger(__name__)
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
ACTION_FIELD = Field(..., description="Gemini Computer Use action to perform")
|
|
23
|
+
X_FIELD = Field(None, description="X coordinate (pixels in agent space)")
|
|
24
|
+
Y_FIELD = Field(None, description="Y coordinate (pixels in agent space)")
|
|
25
|
+
TEXT_FIELD = Field(None, description="Text to type")
|
|
26
|
+
PRESS_ENTER_FIELD = Field(None, description="Whether to press Enter after typing (type_text_at)")
|
|
27
|
+
CLEAR_BEFORE_TYPING_FIELD = Field(
|
|
28
|
+
None, description="Whether to select-all before typing (type_text_at)"
|
|
29
|
+
)
|
|
30
|
+
DIRECTION_FIELD = Field(None, description="Scroll direction for scroll_document/scroll_at")
|
|
31
|
+
MAGNITUDE_FIELD = Field(None, description="Scroll magnitude (pixels in agent space)")
|
|
32
|
+
URL_FIELD = Field(None, description="Target URL for navigate")
|
|
33
|
+
KEYS_FIELD = Field(None, description="Keys for key_combination")
|
|
34
|
+
DESTINATION_X_FIELD = Field(None, description="Destination X for drag_and_drop (agent space)")
|
|
35
|
+
DESTINATION_Y_FIELD = Field(None, description="Destination Y for drag_and_drop (agent space)")
|
|
36
|
+
TAKE_SCREENSHOT_ON_CLICK_FIELD = Field(
|
|
37
|
+
True, description="Whether to include a screenshot for interactive actions"
|
|
38
|
+
)
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GeminiComputerTool(HudComputerTool):
|
|
42
|
+
"""
|
|
43
|
+
Gemini Computer Use tool for interacting with a computer via MCP.
|
|
44
|
+
|
|
45
|
+
Maps Gemini's predefined function names (open_web_browser, click_at, hover_at,
|
|
46
|
+
type_text_at, scroll_document, scroll_at, wait_5_seconds, go_back, go_forward,
|
|
47
|
+
search, navigate, key_combination, drag_and_drop) to executor actions.
|
|
48
|
+
"""
|
|
49
|
+
|
|
50
|
+
def __init__(
|
|
51
|
+
self,
|
|
52
|
+
# Define within environment based on platform
|
|
53
|
+
executor: BaseExecutor | None = None,
|
|
54
|
+
platform_type: Literal["auto", "xdo", "pyautogui"] = "auto",
|
|
55
|
+
display_num: int | None = None,
|
|
56
|
+
# Overrides for what dimensions the agent thinks it operates in
|
|
57
|
+
width: int = computer_settings.GEMINI_COMPUTER_WIDTH,
|
|
58
|
+
height: int = computer_settings.GEMINI_COMPUTER_HEIGHT,
|
|
59
|
+
rescale_images: bool = computer_settings.GEMINI_RESCALE_IMAGES,
|
|
60
|
+
# What the agent sees as the tool's name, title, and description
|
|
61
|
+
name: str | None = None,
|
|
62
|
+
title: str | None = None,
|
|
63
|
+
description: str | None = None,
|
|
64
|
+
**kwargs: Any,
|
|
65
|
+
) -> None:
|
|
66
|
+
"""
|
|
67
|
+
Initialize with Gemini's default dimensions.
|
|
68
|
+
"""
|
|
69
|
+
super().__init__(
|
|
70
|
+
executor=executor,
|
|
71
|
+
platform_type=platform_type,
|
|
72
|
+
display_num=display_num,
|
|
73
|
+
width=width,
|
|
74
|
+
height=height,
|
|
75
|
+
rescale_images=rescale_images,
|
|
76
|
+
name=name or "gemini_computer",
|
|
77
|
+
title=title or "Gemini Computer Tool",
|
|
78
|
+
description=description or "Control computer with mouse, keyboard, and screenshots",
|
|
79
|
+
**kwargs,
|
|
80
|
+
)
|
|
81
|
+
|
|
82
|
+
async def __call__(
|
|
83
|
+
self,
|
|
84
|
+
action: str = ACTION_FIELD,
|
|
85
|
+
# Common coordinates
|
|
86
|
+
x: int | None = X_FIELD,
|
|
87
|
+
y: int | None = Y_FIELD,
|
|
88
|
+
# Text input
|
|
89
|
+
text: str | None = TEXT_FIELD,
|
|
90
|
+
press_enter: bool | None = PRESS_ENTER_FIELD,
|
|
91
|
+
clear_before_typing: bool | None = CLEAR_BEFORE_TYPING_FIELD,
|
|
92
|
+
# Scroll parameters
|
|
93
|
+
direction: Literal["up", "down", "left", "right"] | None = DIRECTION_FIELD,
|
|
94
|
+
magnitude: int | None = MAGNITUDE_FIELD,
|
|
95
|
+
# Navigation
|
|
96
|
+
url: str | None = URL_FIELD,
|
|
97
|
+
# Key combos
|
|
98
|
+
keys: list[str] | str | None = KEYS_FIELD,
|
|
99
|
+
# Drag parameters
|
|
100
|
+
destination_x: int | None = DESTINATION_X_FIELD,
|
|
101
|
+
destination_y: int | None = DESTINATION_Y_FIELD,
|
|
102
|
+
# Behavior
|
|
103
|
+
take_screenshot_on_click: bool = TAKE_SCREENSHOT_ON_CLICK_FIELD,
|
|
104
|
+
) -> list[ContentBlock]:
|
|
105
|
+
"""
|
|
106
|
+
Handle Gemini Computer Use API calls by mapping to executor actions.
|
|
107
|
+
|
|
108
|
+
Returns:
|
|
109
|
+
List of MCP content blocks
|
|
110
|
+
"""
|
|
111
|
+
logger.info("GeminiComputerTool received action: %s", action)
|
|
112
|
+
|
|
113
|
+
# Helper to finalize ContentResult: rescale if requested and ensure URL metadata
|
|
114
|
+
async def _finalize(
|
|
115
|
+
result: ContentResult, requested_url: str | None = None
|
|
116
|
+
) -> list[ContentBlock]:
|
|
117
|
+
if result.base64_image and self.rescale_images:
|
|
118
|
+
try:
|
|
119
|
+
result.base64_image = await self._rescale_screenshot(result.base64_image)
|
|
120
|
+
except Exception as e:
|
|
121
|
+
logger.warning("Failed to rescale screenshot: %s", e)
|
|
122
|
+
# Always include URL metadata if provided; otherwise default to about:blank
|
|
123
|
+
result.url = requested_url or result.url or "about:blank"
|
|
124
|
+
return result.to_content_blocks()
|
|
125
|
+
|
|
126
|
+
# Scale coordinates helper
|
|
127
|
+
def _scale(xv: int | None, yv: int | None) -> tuple[int | None, int | None]:
|
|
128
|
+
return self._scale_coordinates(xv, yv)
|
|
129
|
+
|
|
130
|
+
# Gemini emits coordinates/magnitudes in a 0-1000 normalized space.
|
|
131
|
+
def _denormalize(value: float | None, axis: Literal["x", "y"]) -> int | None:
|
|
132
|
+
if value is None:
|
|
133
|
+
return None
|
|
134
|
+
try:
|
|
135
|
+
numeric = float(value)
|
|
136
|
+
except (TypeError, ValueError):
|
|
137
|
+
try:
|
|
138
|
+
return int(value) # type: ignore[arg-type]
|
|
139
|
+
except (TypeError, ValueError):
|
|
140
|
+
return None
|
|
141
|
+
|
|
142
|
+
# Treat values within the normalized range (including defaults like 800).
|
|
143
|
+
if 0 <= numeric <= 1000:
|
|
144
|
+
target = self.width if axis == "x" else self.height
|
|
145
|
+
numeric = numeric / 1000 * target
|
|
146
|
+
|
|
147
|
+
return round(numeric)
|
|
148
|
+
|
|
149
|
+
def _scale_distance(value: int | None, axis: Literal["x", "y"]) -> int | None:
|
|
150
|
+
if value is None:
|
|
151
|
+
return None
|
|
152
|
+
scale = self.scale_x if axis == "x" else self.scale_y
|
|
153
|
+
if scale != 1.0:
|
|
154
|
+
return round(value / scale)
|
|
155
|
+
return value
|
|
156
|
+
|
|
157
|
+
# Map actions
|
|
158
|
+
if action == "open_web_browser":
|
|
159
|
+
screenshot = await self.executor.screenshot()
|
|
160
|
+
if screenshot:
|
|
161
|
+
result = ContentResult(base64_image=screenshot, url="about:blank")
|
|
162
|
+
else:
|
|
163
|
+
result = ContentResult(error="Failed to take screenshot", url="about:blank")
|
|
164
|
+
return await _finalize(result)
|
|
165
|
+
|
|
166
|
+
elif action == "click_at":
|
|
167
|
+
if x is None or y is None:
|
|
168
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
169
|
+
dx = _denormalize(x, "x")
|
|
170
|
+
dy = _denormalize(y, "y")
|
|
171
|
+
sx, sy = _scale(dx, dy)
|
|
172
|
+
result = await self.executor.click(x=sx, y=sy)
|
|
173
|
+
return await _finalize(result)
|
|
174
|
+
|
|
175
|
+
elif action == "hover_at":
|
|
176
|
+
if x is None or y is None:
|
|
177
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
178
|
+
dx = _denormalize(x, "x")
|
|
179
|
+
dy = _denormalize(y, "y")
|
|
180
|
+
sx, sy = _scale(dx, dy)
|
|
181
|
+
result = await self.executor.move(x=sx, y=sy)
|
|
182
|
+
return await _finalize(result)
|
|
183
|
+
|
|
184
|
+
elif action == "type_text_at":
|
|
185
|
+
if x is None or y is None:
|
|
186
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
187
|
+
if text is None:
|
|
188
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="text is required"))
|
|
189
|
+
|
|
190
|
+
dx = _denormalize(x, "x")
|
|
191
|
+
dy = _denormalize(y, "y")
|
|
192
|
+
sx, sy = _scale(dx, dy)
|
|
193
|
+
|
|
194
|
+
# Focus the field
|
|
195
|
+
await self.executor.move(x=sx, y=sy, take_screenshot=False)
|
|
196
|
+
await self.executor.click(x=sx, y=sy, take_screenshot=False)
|
|
197
|
+
|
|
198
|
+
# Clear existing text if requested
|
|
199
|
+
if clear_before_typing is None or clear_before_typing:
|
|
200
|
+
is_mac = platform.system().lower() == "darwin"
|
|
201
|
+
combo = ["cmd", "a"] if is_mac else ["ctrl", "a"]
|
|
202
|
+
await self.executor.press(keys=combo, take_screenshot=False)
|
|
203
|
+
delete_key = "backspace" if is_mac else "delete"
|
|
204
|
+
await self.executor.press(keys=[delete_key], take_screenshot=False)
|
|
205
|
+
|
|
206
|
+
# Type (optionally press enter after)
|
|
207
|
+
result = await self.executor.write(text=text, enter_after=bool(press_enter))
|
|
208
|
+
return await _finalize(result)
|
|
209
|
+
|
|
210
|
+
elif action == "scroll_document":
|
|
211
|
+
if direction is None:
|
|
212
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
|
|
213
|
+
# Default magnitude similar to reference implementation
|
|
214
|
+
mag = magnitude if magnitude is not None else 800
|
|
215
|
+
# Convert to environment units while preserving sign
|
|
216
|
+
if direction in ("down", "up"):
|
|
217
|
+
distance = _denormalize(mag, "y")
|
|
218
|
+
if distance is None:
|
|
219
|
+
raise McpError(
|
|
220
|
+
ErrorData(
|
|
221
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
222
|
+
)
|
|
223
|
+
)
|
|
224
|
+
distance = _scale_distance(distance, "y")
|
|
225
|
+
if distance is None:
|
|
226
|
+
raise McpError(
|
|
227
|
+
ErrorData(
|
|
228
|
+
code=INVALID_PARAMS,
|
|
229
|
+
message="Unable to determine scroll magnitude",
|
|
230
|
+
)
|
|
231
|
+
)
|
|
232
|
+
scroll_y = distance if direction == "down" else -distance
|
|
233
|
+
scroll_x = None
|
|
234
|
+
elif direction in ("right", "left"):
|
|
235
|
+
distance = _denormalize(mag, "x")
|
|
236
|
+
if distance is None:
|
|
237
|
+
raise McpError(
|
|
238
|
+
ErrorData(
|
|
239
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
240
|
+
)
|
|
241
|
+
)
|
|
242
|
+
distance = _scale_distance(distance, "x")
|
|
243
|
+
if distance is None:
|
|
244
|
+
raise McpError(
|
|
245
|
+
ErrorData(
|
|
246
|
+
code=INVALID_PARAMS,
|
|
247
|
+
message="Unable to determine scroll magnitude",
|
|
248
|
+
)
|
|
249
|
+
)
|
|
250
|
+
scroll_x = distance if direction == "right" else -distance
|
|
251
|
+
scroll_y = None
|
|
252
|
+
else:
|
|
253
|
+
raise McpError(
|
|
254
|
+
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
|
|
255
|
+
)
|
|
256
|
+
result = await self.executor.scroll(scroll_x=scroll_x, scroll_y=scroll_y)
|
|
257
|
+
return await _finalize(result)
|
|
258
|
+
|
|
259
|
+
elif action == "scroll_at":
|
|
260
|
+
if direction is None:
|
|
261
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="direction is required"))
|
|
262
|
+
if x is None or y is None:
|
|
263
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="x and y are required"))
|
|
264
|
+
mag = magnitude if magnitude is not None else 800
|
|
265
|
+
dx = _denormalize(x, "x")
|
|
266
|
+
dy = _denormalize(y, "y")
|
|
267
|
+
sx, sy = _scale(dx, dy)
|
|
268
|
+
if direction in ("down", "up"):
|
|
269
|
+
distance = _denormalize(mag, "y")
|
|
270
|
+
if distance is None:
|
|
271
|
+
raise McpError(
|
|
272
|
+
ErrorData(
|
|
273
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
274
|
+
)
|
|
275
|
+
)
|
|
276
|
+
distance = _scale_distance(distance, "y")
|
|
277
|
+
if distance is None:
|
|
278
|
+
raise McpError(
|
|
279
|
+
ErrorData(
|
|
280
|
+
code=INVALID_PARAMS,
|
|
281
|
+
message="Unable to determine scroll magnitude",
|
|
282
|
+
)
|
|
283
|
+
)
|
|
284
|
+
scroll_y = distance if direction == "down" else -distance
|
|
285
|
+
scroll_x = None
|
|
286
|
+
elif direction in ("right", "left"):
|
|
287
|
+
distance = _denormalize(mag, "x")
|
|
288
|
+
if distance is None:
|
|
289
|
+
raise McpError(
|
|
290
|
+
ErrorData(
|
|
291
|
+
code=INVALID_PARAMS, message="Unable to determine scroll magnitude"
|
|
292
|
+
)
|
|
293
|
+
)
|
|
294
|
+
distance = _scale_distance(distance, "x")
|
|
295
|
+
if distance is None:
|
|
296
|
+
raise McpError(
|
|
297
|
+
ErrorData(
|
|
298
|
+
code=INVALID_PARAMS,
|
|
299
|
+
message="Unable to determine scroll magnitude",
|
|
300
|
+
)
|
|
301
|
+
)
|
|
302
|
+
scroll_x = distance if direction == "right" else -distance
|
|
303
|
+
scroll_y = None
|
|
304
|
+
else:
|
|
305
|
+
raise McpError(
|
|
306
|
+
ErrorData(code=INVALID_PARAMS, message=f"Invalid direction: {direction}")
|
|
307
|
+
)
|
|
308
|
+
result = await self.executor.scroll(x=sx, y=sy, scroll_x=scroll_x, scroll_y=scroll_y)
|
|
309
|
+
return await _finalize(result)
|
|
310
|
+
|
|
311
|
+
elif action == "wait_5_seconds":
|
|
312
|
+
result = await self.executor.wait(time=5000)
|
|
313
|
+
return await _finalize(result)
|
|
314
|
+
|
|
315
|
+
elif action == "go_back":
|
|
316
|
+
is_mac = platform.system().lower() == "darwin"
|
|
317
|
+
combo = ["cmd", "["] if is_mac else ["alt", "left"]
|
|
318
|
+
result = await self.executor.press(keys=combo)
|
|
319
|
+
return await _finalize(result)
|
|
320
|
+
|
|
321
|
+
elif action == "go_forward":
|
|
322
|
+
is_mac = platform.system().lower() == "darwin"
|
|
323
|
+
combo = ["cmd", "]"] if is_mac else ["alt", "right"]
|
|
324
|
+
result = await self.executor.press(keys=combo)
|
|
325
|
+
return await _finalize(result)
|
|
326
|
+
|
|
327
|
+
elif action == "search":
|
|
328
|
+
# Best-effort navigate to a default search page
|
|
329
|
+
target = url or "https://www.google.com"
|
|
330
|
+
is_mac = platform.system().lower() == "darwin"
|
|
331
|
+
await self.executor.press(
|
|
332
|
+
keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
|
|
333
|
+
)
|
|
334
|
+
result = await self.executor.write(text=target, enter_after=True)
|
|
335
|
+
return await _finalize(result, requested_url=target)
|
|
336
|
+
|
|
337
|
+
elif action == "navigate":
|
|
338
|
+
if not url:
|
|
339
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="url is required"))
|
|
340
|
+
is_mac = platform.system().lower() == "darwin"
|
|
341
|
+
await self.executor.press(
|
|
342
|
+
keys=["cmd", "l"] if is_mac else ["ctrl", "l"], take_screenshot=False
|
|
343
|
+
)
|
|
344
|
+
result = await self.executor.write(text=url, enter_after=True)
|
|
345
|
+
return await _finalize(result, requested_url=url)
|
|
346
|
+
|
|
347
|
+
elif action == "key_combination":
|
|
348
|
+
if keys is None:
|
|
349
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message="keys is required"))
|
|
350
|
+
if isinstance(keys, str):
|
|
351
|
+
# Accept formats like "ctrl+c" or "ctrl+shift+t"
|
|
352
|
+
key_list = [k.strip() for k in keys.split("+") if k.strip()]
|
|
353
|
+
else:
|
|
354
|
+
key_list = keys
|
|
355
|
+
result = await self.executor.press(keys=key_list)
|
|
356
|
+
return await _finalize(result)
|
|
357
|
+
|
|
358
|
+
elif action == "drag_and_drop":
|
|
359
|
+
if x is None or y is None or destination_x is None or destination_y is None:
|
|
360
|
+
raise McpError(
|
|
361
|
+
ErrorData(
|
|
362
|
+
code=INVALID_PARAMS,
|
|
363
|
+
message="x, y, destination_x, and destination_y are required",
|
|
364
|
+
)
|
|
365
|
+
)
|
|
366
|
+
sx_norm = _denormalize(x, "x")
|
|
367
|
+
sy_norm = _denormalize(y, "y")
|
|
368
|
+
dx_norm = _denormalize(destination_x, "x")
|
|
369
|
+
dy_norm = _denormalize(destination_y, "y")
|
|
370
|
+
sx, sy = _scale(sx_norm, sy_norm)
|
|
371
|
+
dx_scaled, dy_scaled = _scale(dx_norm, dy_norm)
|
|
372
|
+
# Build a two-point path
|
|
373
|
+
path = [] # type: list[tuple[int, int]]
|
|
374
|
+
if (
|
|
375
|
+
sx is not None
|
|
376
|
+
and sy is not None
|
|
377
|
+
and dx_scaled is not None
|
|
378
|
+
and dy_scaled is not None
|
|
379
|
+
):
|
|
380
|
+
path = [(sx, sy), (dx_scaled, dy_scaled)]
|
|
381
|
+
result = await self.executor.drag(path=path)
|
|
382
|
+
return await _finalize(result)
|
|
383
|
+
|
|
384
|
+
else:
|
|
385
|
+
raise McpError(ErrorData(code=INVALID_PARAMS, message=f"Unknown action: {action}"))
|
hud/tools/computer/hud.py
CHANGED
|
@@ -13,7 +13,7 @@ from hud.tools.base import BaseTool
|
|
|
13
13
|
from hud.tools.executors.base import BaseExecutor
|
|
14
14
|
from hud.tools.executors.pyautogui import PyAutoGUIExecutor
|
|
15
15
|
from hud.tools.executors.xdo import XDOExecutor
|
|
16
|
-
from hud.tools.types import ContentResult, ToolError
|
|
16
|
+
from hud.tools.types import ContentResult, Coordinate, ToolError
|
|
17
17
|
|
|
18
18
|
from .settings import computer_settings
|
|
19
19
|
|
|
@@ -231,7 +231,23 @@ class HudComputerTool(BaseTool):
|
|
|
231
231
|
|
|
232
232
|
async def __call__(
|
|
233
233
|
self,
|
|
234
|
-
action: str = Field(..., description="The action name (click, press, write, move, etc.)"),
|
|
234
|
+
action: Literal[
|
|
235
|
+
"click",
|
|
236
|
+
"press",
|
|
237
|
+
"keydown",
|
|
238
|
+
"keyup",
|
|
239
|
+
"write",
|
|
240
|
+
"scroll",
|
|
241
|
+
"move",
|
|
242
|
+
"wait",
|
|
243
|
+
"drag",
|
|
244
|
+
"response",
|
|
245
|
+
"screenshot",
|
|
246
|
+
"position",
|
|
247
|
+
"hold_key",
|
|
248
|
+
"mouse_down",
|
|
249
|
+
"mouse_up",
|
|
250
|
+
] = Field(..., description="The action name (click, press, write, move, etc.)"),
|
|
235
251
|
# Click parameters
|
|
236
252
|
x: int | None = Field(None, description="X coordinate for click/move/scroll actions"),
|
|
237
253
|
y: int | None = Field(None, description="Y coordinate for click/move/scroll actions"),
|
|
@@ -254,8 +270,8 @@ class HudComputerTool(BaseTool):
|
|
|
254
270
|
offset_x: int | None = Field(None, description="X offset for relative move"),
|
|
255
271
|
offset_y: int | None = Field(None, description="Y offset for relative move"),
|
|
256
272
|
# Drag parameters
|
|
257
|
-
path: list[tuple[int, int]] | None = Field(
|
|
258
|
-
None, description="Path for drag actions as list of (x, y) coordinates"
|
|
273
|
+
path: list[Coordinate] | None = Field(
|
|
274
|
+
None, description="Path for drag actions as list of {x, y} coordinates"
|
|
259
275
|
),
|
|
260
276
|
# Wait parameter
|
|
261
277
|
time: int | None = Field(None, description="Time in milliseconds for wait action"),
|
|
@@ -332,8 +348,9 @@ class HudComputerTool(BaseTool):
|
|
|
332
348
|
elif action == "drag":
|
|
333
349
|
if path is None:
|
|
334
350
|
raise ToolError("path parameter is required for drag")
|
|
335
|
-
# Scale path from client space to screen space
|
|
336
|
-
scaled_path = self._scale_path(path)
|
|
351
|
+
# Convert Coordinate objects to tuples and scale from client space to screen space
|
|
352
|
+
path_tuples = [(point.x, point.y) for point in path]
|
|
353
|
+
scaled_path = self._scale_path(path_tuples)
|
|
337
354
|
result = await self.executor.drag(
|
|
338
355
|
path=scaled_path, pattern=pattern, hold_keys=hold_keys
|
|
339
356
|
)
|