hud-python 0.4.21__py3-none-any.whl → 0.4.23__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of hud-python might be problematic. Click here for more details.
- hud/agents/base.py +37 -37
- hud/agents/claude.py +11 -6
- hud/agents/grounded_openai.py +282 -0
- hud/agents/misc/response_agent.py +3 -2
- hud/agents/openai.py +2 -2
- hud/agents/openai_chat_generic.py +3 -1
- hud/agents/tests/test_client.py +6 -1
- hud/agents/tests/test_grounded_openai_agent.py +155 -0
- hud/cli/__init__.py +34 -24
- hud/cli/analyze.py +27 -26
- hud/cli/build.py +50 -46
- hud/cli/debug.py +7 -7
- hud/cli/dev.py +107 -99
- hud/cli/eval.py +33 -31
- hud/cli/hf.py +53 -53
- hud/cli/init.py +28 -28
- hud/cli/list_func.py +22 -22
- hud/cli/pull.py +36 -36
- hud/cli/push.py +76 -74
- hud/cli/remove.py +42 -40
- hud/cli/rl/__init__.py +2 -2
- hud/cli/rl/init.py +41 -41
- hud/cli/rl/pod.py +97 -91
- hud/cli/rl/ssh.py +42 -40
- hud/cli/rl/train.py +75 -73
- hud/cli/rl/utils.py +10 -10
- hud/cli/tests/test_analyze.py +1 -1
- hud/cli/tests/test_analyze_metadata.py +2 -2
- hud/cli/tests/test_pull.py +45 -45
- hud/cli/tests/test_push.py +31 -29
- hud/cli/tests/test_registry.py +15 -15
- hud/cli/utils/environment.py +11 -11
- hud/cli/utils/interactive.py +18 -18
- hud/cli/utils/logging.py +12 -12
- hud/cli/utils/metadata.py +12 -12
- hud/cli/utils/registry.py +5 -5
- hud/cli/utils/runner.py +23 -23
- hud/cli/utils/server.py +16 -16
- hud/settings.py +6 -0
- hud/shared/hints.py +7 -7
- hud/tools/executors/tests/test_base_executor.py +1 -1
- hud/tools/executors/xdo.py +1 -1
- hud/tools/grounding/__init__.py +13 -0
- hud/tools/grounding/config.py +54 -0
- hud/tools/grounding/grounded_tool.py +314 -0
- hud/tools/grounding/grounder.py +302 -0
- hud/tools/grounding/tests/__init__.py +1 -0
- hud/tools/grounding/tests/test_grounded_tool.py +196 -0
- hud/tools/tests/test_playwright_tool.py +1 -1
- hud/tools/tests/test_tools_init.py +1 -1
- hud/tools/tests/test_utils.py +2 -2
- hud/types.py +4 -4
- hud/utils/__init__.py +3 -3
- hud/utils/agent_factories.py +86 -0
- hud/utils/{design.py → hud_console.py} +39 -33
- hud/utils/pretty_errors.py +6 -6
- hud/utils/tests/test_version.py +1 -1
- hud/version.py +1 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/METADATA +3 -1
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/RECORD +63 -54
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/WHEEL +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/entry_points.txt +0 -0
- {hud_python-0.4.21.dist-info → hud_python-0.4.23.dist-info}/licenses/LICENSE +0 -0
hud/cli/utils/server.py
CHANGED
|
@@ -7,7 +7,7 @@ from typing import Any
|
|
|
7
7
|
|
|
8
8
|
from fastmcp import FastMCP
|
|
9
9
|
|
|
10
|
-
from hud.utils.
|
|
10
|
+
from hud.utils.hud_console import HUDConsole
|
|
11
11
|
|
|
12
12
|
from .docker import generate_container_name, remove_container
|
|
13
13
|
|
|
@@ -24,7 +24,7 @@ class MCPServerManager:
|
|
|
24
24
|
"""
|
|
25
25
|
self.image = image
|
|
26
26
|
self.docker_args = docker_args or []
|
|
27
|
-
self.
|
|
27
|
+
self.console = HUDConsole()
|
|
28
28
|
self.container_name = self._generate_container_name()
|
|
29
29
|
|
|
30
30
|
def _generate_container_name(self) -> str:
|
|
@@ -155,7 +155,7 @@ class MCPServerManager:
|
|
|
155
155
|
pass # Normal cancellation
|
|
156
156
|
except Exception as e:
|
|
157
157
|
if verbose:
|
|
158
|
-
self.
|
|
158
|
+
self.console.error(f"Server error: {e}")
|
|
159
159
|
raise
|
|
160
160
|
|
|
161
161
|
|
|
@@ -174,16 +174,16 @@ async def run_server_with_interactive(
|
|
|
174
174
|
from .interactive import run_interactive_mode
|
|
175
175
|
from .logging import find_free_port
|
|
176
176
|
|
|
177
|
-
|
|
177
|
+
hud_console = HUDConsole()
|
|
178
178
|
|
|
179
179
|
# Find available port
|
|
180
180
|
actual_port = find_free_port(port)
|
|
181
181
|
if actual_port is None:
|
|
182
|
-
|
|
182
|
+
hud_console.error(f"No available ports found starting from {port}")
|
|
183
183
|
return
|
|
184
184
|
|
|
185
185
|
if actual_port != port:
|
|
186
|
-
|
|
186
|
+
hud_console.warning(f"Port {port} in use, using port {actual_port} instead")
|
|
187
187
|
|
|
188
188
|
# Clean up any existing container
|
|
189
189
|
server_manager.cleanup_container()
|
|
@@ -198,16 +198,16 @@ async def run_server_with_interactive(
|
|
|
198
198
|
proxy = server_manager.create_proxy(config, f"HUD Interactive - {server_manager.image}")
|
|
199
199
|
|
|
200
200
|
# Show header
|
|
201
|
-
|
|
202
|
-
|
|
201
|
+
hud_console.info("") # Empty line
|
|
202
|
+
hud_console.header("HUD MCP Server - Interactive Mode", icon="🎮")
|
|
203
203
|
|
|
204
204
|
# Show configuration
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
205
|
+
hud_console.section_title("Server Information")
|
|
206
|
+
hud_console.info(f"Image: {server_manager.image}")
|
|
207
|
+
hud_console.info(f"Port: {actual_port}")
|
|
208
|
+
hud_console.info(f"URL: http://localhost:{actual_port}/mcp")
|
|
209
|
+
hud_console.info(f"Container: {server_manager.container_name}")
|
|
210
|
+
hud_console.info("")
|
|
211
211
|
|
|
212
212
|
# Create event to signal server is ready
|
|
213
213
|
server_ready = asyncio.Event()
|
|
@@ -236,7 +236,7 @@ async def run_server_with_interactive(
|
|
|
236
236
|
await run_interactive_mode(server_url, verbose=verbose)
|
|
237
237
|
|
|
238
238
|
except KeyboardInterrupt:
|
|
239
|
-
|
|
239
|
+
hud_console.info("\n👋 Shutting down...")
|
|
240
240
|
finally:
|
|
241
241
|
# Cancel server task
|
|
242
242
|
if server_task and not server_task.done():
|
|
@@ -244,7 +244,7 @@ async def run_server_with_interactive(
|
|
|
244
244
|
try:
|
|
245
245
|
await server_task
|
|
246
246
|
except asyncio.CancelledError:
|
|
247
|
-
|
|
247
|
+
hud_console.error("Server task cancelled")
|
|
248
248
|
|
|
249
249
|
# Clean up container
|
|
250
250
|
server_manager.cleanup_container()
|
hud/settings.py
CHANGED
|
@@ -44,6 +44,12 @@ class Settings(BaseSettings):
|
|
|
44
44
|
validation_alias="OPENAI_API_KEY",
|
|
45
45
|
)
|
|
46
46
|
|
|
47
|
+
openrouter_api_key: str | None = Field(
|
|
48
|
+
default=None,
|
|
49
|
+
description="API key for OpenRouter models",
|
|
50
|
+
validation_alias="OPENROUTER_API_KEY",
|
|
51
|
+
)
|
|
52
|
+
|
|
47
53
|
wandb_api_key: str | None = Field(
|
|
48
54
|
default=None,
|
|
49
55
|
description="API key for Weights & Biases",
|
hud/shared/hints.py
CHANGED
|
@@ -144,9 +144,9 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
|
|
|
144
144
|
|
|
145
145
|
try:
|
|
146
146
|
if design is None:
|
|
147
|
-
from hud.utils.
|
|
147
|
+
from hud.utils.hud_console import hud_console as default_design # lazy import
|
|
148
148
|
|
|
149
|
-
|
|
149
|
+
hud_console = default_design
|
|
150
150
|
except Exception:
|
|
151
151
|
# If design is unavailable (non-CLI contexts), silently skip rendering
|
|
152
152
|
return
|
|
@@ -155,23 +155,23 @@ def render_hints(hints: Iterable[Hint] | None, *, design: Any | None = None) ->
|
|
|
155
155
|
try:
|
|
156
156
|
# Compact rendering - skip title if same as message
|
|
157
157
|
if hint.title and hint.title != hint.message:
|
|
158
|
-
|
|
158
|
+
hud_console.warning(f"{hint.title}: {hint.message}")
|
|
159
159
|
else:
|
|
160
|
-
|
|
160
|
+
hud_console.warning(hint.message)
|
|
161
161
|
|
|
162
162
|
# Tips as bullet points
|
|
163
163
|
if hint.tips:
|
|
164
164
|
for tip in hint.tips:
|
|
165
|
-
|
|
165
|
+
hud_console.info(f" • {tip}")
|
|
166
166
|
|
|
167
167
|
# Only show command examples if provided
|
|
168
168
|
if hint.command_examples:
|
|
169
169
|
for cmd in hint.command_examples:
|
|
170
|
-
|
|
170
|
+
hud_console.command_example(cmd)
|
|
171
171
|
|
|
172
172
|
# Only show docs URL if provided
|
|
173
173
|
if hint.docs_url:
|
|
174
|
-
|
|
174
|
+
hud_console.link(hint.docs_url)
|
|
175
175
|
except Exception:
|
|
176
176
|
logger.warning("Failed to render hint: %s", hint)
|
|
177
177
|
continue
|
|
hud/tools/executors/tests/test_base_executor.py
CHANGED
|
@@ -361,5 +361,5 @@ class TestLazyImports:
|
|
|
361
361
|
"""Test lazy import with invalid attribute name."""
|
|
362
362
|
import hud.tools.executors as executors_module
|
|
363
363
|
|
|
364
|
-
with pytest.raises(AttributeError, match="module '.*' has no attribute 'InvalidExecutor'"):
|
|
364
|
+
with pytest.raises(AttributeError, match=r"module '.*' has no attribute 'InvalidExecutor'"):
|
|
365
365
|
_ = executors_module.InvalidExecutor
|
hud/tools/executors/xdo.py
CHANGED
|
@@ -175,7 +175,7 @@ class XDOExecutor(BaseExecutor):
|
|
|
175
175
|
|
|
176
176
|
screenshot_cmd = f"{self._display_prefix}scrot -p {screenshot_path}"
|
|
177
177
|
|
|
178
|
-
returncode, _,
|
|
178
|
+
returncode, _, _stderr = await run(screenshot_cmd)
|
|
179
179
|
|
|
180
180
|
if returncode == 0 and screenshot_path.exists():
|
|
181
181
|
try:
|
|
hud/tools/grounding/__init__.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Grounding module for visual element detection and coordinate resolution."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from .config import GrounderConfig
|
|
6
|
+
from .grounded_tool import GroundedComputerTool
|
|
7
|
+
from .grounder import Grounder
|
|
8
|
+
|
|
9
|
+
__all__ = [
|
|
10
|
+
"GroundedComputerTool",
|
|
11
|
+
"Grounder",
|
|
12
|
+
"GrounderConfig",
|
|
13
|
+
]
|
|
hud/tools/grounding/config.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
"""Configuration for grounding models."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
# Default system prompt sent to the grounding model; it asks for a single
# "(x, y)" point, which is the shape the default ``parser_regex`` matches.
SYSTEM_PROMPT = (
    "You are a visual grounding model. Given an image and a description, "
    "return ONLY the center pixel coordinates of the described element as a "
    "single point in parentheses format: (x, y). Do not return bounding boxes "
    "or multiple coordinates."
)


@dataclass
class GrounderConfig:
    """Configuration for grounding model clients.

    Attributes:
        api_base: Base URL for the grounding model API endpoint
        model: Model identifier to use for grounding
        api_key: API key for authentication (default: "EMPTY" for local models)
        system_prompt: System prompt to guide the grounding model
        output_format: Format for coordinate output ("pixels", "norm_0_1", "norm_0_999")
        parser_regex: Regular expression to parse coordinates from model output
        resize: Image resizing configuration dictionary

    Raises:
        ValueError: If ``output_format`` is not one of the supported values,
            if ``api_base`` or ``model`` is empty, or if ``parser_regex`` is
            not a valid regular expression.
    """

    api_base: str
    model: str
    api_key: str = "EMPTY"
    system_prompt: str = SYSTEM_PROMPT
    output_format: str = "pixels"  # "pixels" | "norm_0_1" | "norm_0_999"
    parser_regex: str = r"\((\d+),\s*(\d+)\)"
    # A fresh dict per instance (default_factory) so configs never share state.
    resize: dict[str, Any] = field(
        default_factory=lambda: {
            "enabled": True,
            "min_pixels": 3136,
            "max_pixels": 4096 * 2160,
            "factor": 28,
        }
    )

    def __post_init__(self) -> None:
        """Validate configuration after initialization."""
        # Imported locally so this block does not depend on a module-level import.
        import re

        if self.output_format not in ("pixels", "norm_0_1", "norm_0_999"):
            raise ValueError(f"Invalid output_format: {self.output_format}")

        if not self.api_base:
            raise ValueError("api_base is required")

        if not self.model:
            raise ValueError("model is required")

        # Fail fast on a malformed pattern instead of at coordinate-parsing time.
        try:
            re.compile(self.parser_regex)
        except re.error as exc:
            raise ValueError(f"Invalid parser_regex {self.parser_regex!r}: {exc}") from exc
|
|
hud/tools/grounding/grounded_tool.py
ADDED
|
@@ -0,0 +1,314 @@
|
|
|
1
|
+
"""Grounded computer tool that resolves element descriptions to coordinates."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import logging
|
|
6
|
+
from typing import Any
|
|
7
|
+
|
|
8
|
+
from mcp import ErrorData, McpError
|
|
9
|
+
from mcp.types import INVALID_PARAMS, ContentBlock
|
|
10
|
+
|
|
11
|
+
from hud.clients.base import AgentMCPClient # noqa: TC001
|
|
12
|
+
from hud.tools.grounding.grounder import Grounder # noqa: TC001
|
|
13
|
+
from hud.types import MCPToolCall
|
|
14
|
+
|
|
15
|
+
logger = logging.getLogger(__name__)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class GroundedComputerTool:
    """Computer tool wrapper that grounds element descriptions to coordinates.

    This tool acts as a local wrapper that:
    1. Accepts natural language element descriptions from the agent
    2. Uses a grounding model to resolve descriptions to coordinates on the
       screenshot supplied by the caller
    3. Calls the environment's computer tool via MCP with resolved coordinates
    4. Returns the result to the agent

    This allows the agent to use element descriptions while ensuring all
    computer actions happen in the correct environment.
    """

    # Actions forwarded to the environment as-is; no coordinates are involved.
    _PASSTHROUGH_ACTIONS = (
        "screenshot",
        "type",
        "keypress",
        "wait",
        "get_current_url",
        "get_dimensions",
        "get_environment",
    )
    # Actions that target one point resolved from ``element_description``.
    _POINT_ACTIONS = ("click", "double_click", "move", "scroll")

    def __init__(
        self,
        *,
        grounder: Grounder,
        mcp_client: AgentMCPClient,
        computer_tool_name: str = "computer",
    ) -> None:
        """Initialize the grounded computer tool.

        Args:
            grounder: Grounder instance for visual grounding
            mcp_client: MCP client to call the environment's computer tool
            computer_tool_name: Name of the computer tool in the environment
        """
        self._grounder = grounder
        self._mcp_client = mcp_client
        self._computer_tool_name = computer_tool_name

    def get_openai_tool_schema(self) -> dict:
        """Get the OpenAI tool schema for the grounded computer tool.

        Returns:
            Dictionary containing the tool schema in OpenAI format
        """
        return {
            "type": "function",
            "function": {
                "name": "computer",
                "description": (
                    "Control a computer by interacting with UI elements. This tool uses "
                    "element descriptions to locate and interact with UI elements on the "
                    "screen (e.g., 'red submit button', 'search text field', 'hamburger menu "
                    "icon', 'close button in top right corner')."
                ),
                "parameters": {
                    "type": "object",
                    "properties": {
                        "action": {
                            "type": "string",
                            "enum": [
                                "click",
                                "double_click",
                                "move",
                                "scroll",
                                "drag",
                                "type",
                                "keypress",
                                "wait",
                                "screenshot",
                                "get_current_url",
                                "get_dimensions",
                                "get_environment",
                            ],
                            "description": "The action to perform",
                        },
                        "element_description": {
                            "type": "string",
                            "description": (
                                "Natural language description of the element for "
                                "click/move/scroll actions"
                            ),
                        },
                        "start_element_description": {
                            "type": "string",
                            "description": "Description of the start element for drag actions",
                        },
                        "end_element_description": {
                            "type": "string",
                            "description": "Description of the end element for drag actions",
                        },
                        "text": {"type": "string", "description": "Text to type"},
                        "keys": {
                            "type": "array",
                            "items": {"type": "string"},
                            "description": "Keys to press (e.g., ['ctrl', 'a'] for Ctrl+A)",
                        },
                        "button": {
                            "type": "string",
                            "enum": ["left", "right", "middle"],
                            "description": "Mouse button to use",
                        },
                        "scroll_x": {"type": "integer", "description": "Horizontal scroll amount"},
                        "scroll_y": {"type": "integer", "description": "Vertical scroll amount"},
                    },
                    "required": ["action"],
                },
            },
        }

    @staticmethod
    def _invalid_params(message: str) -> McpError:
        """Build (but do not raise) an ``INVALID_PARAMS`` MCP error."""
        return McpError(ErrorData(code=INVALID_PARAMS, message=message))

    async def _call_computer(
        self, computer_args: dict[str, Any], extra: dict[str, Any]
    ) -> list[ContentBlock]:
        """Invoke the environment's computer tool and return its content blocks.

        ``extra`` is merged last, so caller-supplied keyword arguments take
        precedence over the arguments assembled by this wrapper.
        """
        result = await self._mcp_client.call_tool(
            MCPToolCall(name=self._computer_tool_name, arguments={**computer_args, **extra})
        )
        return result.content

    async def _ground(
        self, screenshot_b64: str | None, description: str, not_found_message: str
    ) -> Any:
        """Resolve ``description`` to coordinates on the given screenshot.

        Raises:
            McpError: If no screenshot was supplied, or the grounder returned
                a falsy result (``not_found_message`` is used in that case).
        """
        if not screenshot_b64:
            raise self._invalid_params("No screenshot available for grounding")

        coords = await self._grounder.predict_click(
            image_b64=screenshot_b64, instruction=description
        )
        if not coords:
            raise self._invalid_params(not_found_message)
        return coords

    async def __call__(
        self,
        action: str,
        # Screenshot from conversation
        screenshot_b64: str | None = None,
        # Grounding-specific parameters
        element_description: str | None = None,
        start_element_description: str | None = None,
        end_element_description: str | None = None,
        # Pass-through parameters
        text: str | None = None,
        keys: list[str] | None = None,
        button: str | None = None,
        scroll_x: int | None = None,
        scroll_y: int | None = None,
        **kwargs: Any,
    ) -> list[ContentBlock]:
        """Execute a computer action, grounding element descriptions to coordinates first.

        This method calls the environment's computer tool through MCP to ensure
        actions happen in the correct environment.

        Args:
            action: The action to perform
            screenshot_b64: Base64 screenshot the grounder searches; required
                for click/double_click/move/scroll/drag actions
            element_description: Description of element for click/move/scroll actions
            start_element_description: Start element for drag actions
            end_element_description: End element for drag actions
            text: Text to type for type actions
            keys: Keys to press for keypress actions
            button: Mouse button (left, right, middle)
            scroll_x: Horizontal scroll amount
            scroll_y: Vertical scroll amount
            **kwargs: Additional arguments forwarded to the environment tool

        Returns:
            List of ContentBlocks with action results from the environment

        Raises:
            McpError: With code ``INVALID_PARAMS`` when a required description
                or screenshot is missing, an element cannot be located, the
                action is unsupported, or any other exception occurs (the
                original exception is chained as the cause).
        """
        try:
            # For actions that don't need grounding, call environment tool directly
            if action in self._PASSTHROUGH_ACTIONS:
                computer_args: dict[str, Any] = {"action": action}
                if text is not None:
                    computer_args["text"] = text
                if keys is not None:
                    computer_args["keys"] = keys
                return await self._call_computer(computer_args, kwargs)

            # For actions that need coordinates, we need to ground element descriptions
            if action in self._POINT_ACTIONS:
                if not element_description:
                    raise self._invalid_params(
                        f"element_description is required for {action} action"
                    )

                x, y = await self._ground(
                    screenshot_b64,
                    element_description,
                    f"Could not locate element: '{element_description}'. "
                    "Try a more specific description or different identifying "
                    "features (color, position, text, etc.)",
                )

                # Execute action with resolved coordinates
                computer_args = {"action": action, "x": x, "y": y}
                if button:
                    computer_args["button"] = button
                if scroll_x is not None:
                    computer_args["scroll_x"] = scroll_x
                if scroll_y is not None:
                    computer_args["scroll_y"] = scroll_y
                return await self._call_computer(computer_args, kwargs)

            if action == "drag":
                if not start_element_description or not end_element_description:
                    raise self._invalid_params(
                        "start_element_description and end_element_description "
                        "are required for drag action"
                    )

                # Ground both start and end points against the same screenshot
                start_coords = await self._ground(
                    screenshot_b64,
                    start_element_description,
                    f"Could not locate start element: '{start_element_description}'. "
                    "Try a more specific description or different identifying features.",
                )
                end_coords = await self._ground(
                    screenshot_b64,
                    end_element_description,
                    f"Could not locate end element: '{end_element_description}'. "
                    "Try a more specific description or different identifying features.",
                )

                # Execute drag with resolved coordinates
                computer_args = {
                    "action": "drag",
                    "path": [
                        (start_coords[0], start_coords[1]),
                        (end_coords[0], end_coords[1]),
                    ],
                }
                if button:
                    computer_args["button"] = button
                return await self._call_computer(computer_args, kwargs)

            raise self._invalid_params(f"Unsupported action: {action}")

        except McpError:
            # Re-raise MCP errors untouched so their specific message survives
            raise
        except Exception as e:
            logger.error("Grounded tool failed: %s", e)
            raise McpError(
                ErrorData(code=INVALID_PARAMS, message=f"Grounding failed: {e!s}")
            ) from e
|