cube-computer-tool 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,12 @@
1
+ Metadata-Version: 2.3
2
+ Name: cube-computer-tool
3
+ Version: 0.1.0
4
+ Summary: Generic desktop computer tool for CUBE VM-based benchmarks
5
+ Requires-Dist: cube-standard
6
+ Requires-Dist: pillow>=9.0
7
+ Requires-Dist: requests>=2.28
8
+ Requires-Dist: requests-toolbelt>=0.10
9
+ Requires-Dist: tqdm>=4.60
10
+ Requires-Dist: pytest>=8.0.0 ; extra == 'dev'
11
+ Requires-Python: >=3.12
12
+ Provides-Extra: dev
@@ -0,0 +1,38 @@
1
+ [project]
2
+ name = "cube-computer-tool"
3
+ version = "0.1.0"
4
+ description = "Generic desktop computer tool for CUBE VM-based benchmarks"
5
+ requires-python = ">=3.12"
6
+ dependencies = [
7
+ "cube-standard",
8
+ "pillow>=9.0",
9
+ "requests>=2.28",
10
+ "requests-toolbelt>=0.10",
11
+ "tqdm>=4.60",
12
+ ]
13
+
14
+ [project.optional-dependencies]
15
+ dev = [
16
+ "pytest>=8.0.0",
17
+ ]
18
+
19
+ [build-system]
20
+ requires = ["uv_build>=0.6.0,<0.7.0"]
21
+ build-backend = "uv_build"
22
+
23
+ [tool.uv.build-backend]
24
+ module-name = "cube_computer_tool"
25
+
26
+ [tool.ruff]
27
+ fix = true
28
+ line-length = 120
29
+ indent-width = 4
30
+
31
+ [tool.ruff.format]
32
+ quote-style = "double"
33
+ indent-style = "space"
34
+ skip-magic-trailing-comma = false
35
+ line-ending = "auto"
36
+
37
+ [tool.ruff.lint]
38
+ extend-select = ["I"]
@@ -0,0 +1,18 @@
1
+ """cube-computer-tool: generic desktop computer tool for CUBE VM-based benchmarks."""
2
+
3
+ from cube_computer_tool.axtree import linearize_accessibility_tree, tag_screenshot
4
+ from cube_computer_tool.computer import ActionSpace, Computer13, ComputerBase, ComputerConfig, PyAutoGUIComputer
5
+ from cube_computer_tool.guest_agent import GuestAgent
6
+ from cube_computer_tool.pyautogui_utils import fix_pyautogui_less_than_bug
7
+
8
+ __all__ = [
9
+ "ActionSpace",
10
+ "Computer13",
11
+ "ComputerBase",
12
+ "ComputerConfig",
13
+ "PyAutoGUIComputer",
14
+ "GuestAgent",
15
+ "fix_pyautogui_less_than_bug",
16
+ "linearize_accessibility_tree",
17
+ "tag_screenshot",
18
+ ]
@@ -0,0 +1,283 @@
1
+ """Accessibility tree processing utilities for desktop VM benchmarks.
2
+
3
+ Provides two modes of converting raw XML accessibility trees to LLM-friendly formats:
4
+
5
+ linearize_accessibility_tree(xml_str, platform) -> str
6
+ Convert XML accessibility tree to a tab-separated table.
7
+
8
+ tag_screenshot(screenshot_bytes, xml_str, platform) -> (marks, drew_nodes, tagged_bytes, element_list)
9
+ Draw numbered bounding boxes on a screenshot (Set-of-Marks).
10
+
11
+ Originally ported from desktop_env / kusha/AgentLab2 osworld_axtree.py.
12
+ """
13
+
14
+ from __future__ import annotations
15
+
16
+ import io
17
+ import xml.etree.ElementTree as ET
18
+ from typing import List, Tuple
19
+
20
+ from PIL import Image, ImageDraw, ImageFont
21
+
22
+ # XML namespace URLs for accessibility tree attributes
23
+ attributes_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/attributes"
24
+ attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
25
+ state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
26
+ state_ns_windows = "https://accessibility.windows.example.org/ns/state"
27
+ component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
28
+ component_ns_windows = "https://accessibility.windows.example.org/ns/component"
29
+ value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
30
+ value_ns_windows = "https://accessibility.windows.example.org/ns/value"
31
+ class_ns_windows = "https://accessibility.windows.example.org/ns/class"
32
+
33
+
34
+ def _get_ns(platform: str) -> tuple[str, str, str, str]:
35
+ """Return (attributes_ns, state_ns, component_ns, value_ns) for the given platform."""
36
+ if platform == "ubuntu":
37
+ return attributes_ns_ubuntu, state_ns_ubuntu, component_ns_ubuntu, value_ns_ubuntu
38
+ if platform == "windows":
39
+ return attributes_ns_windows, state_ns_windows, component_ns_windows, value_ns_windows
40
+ raise ValueError(f"Invalid platform '{platform}': must be 'ubuntu' or 'windows'")
41
+
42
+
43
def judge_node(node: ET.Element, platform: str = "ubuntu", check_image: bool = False) -> bool:
    """Return True if this accessibility tree node should be included in the output.

    A node is kept only when all of the following hold:
      * its tag is one of the recognised element kinds (prefix, suffix, or exact match);
      * it is visible ("showing" and "visible" on ubuntu, "visible" on windows);
      * it is enabled, editable, expandable, or checkable;
      * it has a name, non-empty text, or (with check_image) an image="true" attribute;
      * its screen coordinates are non-negative and its size is strictly positive.

    Args:
        node: Element taken from the parsed accessibility tree.
        platform: "ubuntu" or "windows"; selects the attribute namespaces.
        check_image: When True, nodes flagged image="true" count as having content.

    Raises:
        ValueError: if platform is not supported (raised by _get_ns).
    """
    # Function-scope import keeps this block self-contained.
    import ast

    _, _state_ns, _component_ns, _ = _get_ns(platform)

    def _state(flag: str) -> bool:
        """True when the namespaced state attribute is the literal string "true"."""
        return node.get(f"{{{_state_ns}}}{flag}", "false") == "true"

    tag = node.tag
    recognised_tag = (
        tag.startswith("document")
        or tag.endswith(
            (
                "item",
                "button",
                "heading",
                "label",
                "scrollbar",
                "searchbox",
                "textbox",
                "link",
                "tabelement",
                "textfield",
                "textarea",
                "menu",
            )
        )
        or tag
        in {
            "alert",
            "canvas",
            "check-box",
            "combo-box",
            "entry",
            "icon",
            "image",
            "paragraph",
            "scroll-bar",
            "section",
            "slider",
            "static",
            "table-cell",
            "terminal",
            "text",
            "netuiribbontab",
            "start",
            "trayclockwclass",
            "traydummysearchcontrol",
            "uiimage",
            "uiproperty",
            "uiribboncommandbar",
        }
    )
    if not recognised_tag:
        return False

    # _get_ns() has already rejected every platform other than ubuntu/windows.
    if platform == "ubuntu":
        visible = _state("showing") and _state("visible")
    else:
        visible = _state("visible")
    if not visible:
        return False

    if not (_state("enabled") or _state("editable") or _state("expandable") or _state("checkable")):
        return False

    has_content = (
        node.get("name", "") != ""
        or bool(node.text)
        or (check_image and node.get("image", "false") == "true")
    )
    if not has_content:
        return False

    # The geometry attributes come from the guest VM, so they are parsed with
    # ast.literal_eval (literals only) rather than eval(), which would execute
    # arbitrary expressions embedded in the XML.
    try:
        coords = ast.literal_eval(node.get(f"{{{_component_ns}}}screencoord", "(-1, -1)"))
        sizes = ast.literal_eval(node.get(f"{{{_component_ns}}}size", "(-1, -1)"))
        return bool(coords[0] >= 0 and coords[1] >= 0 and sizes[0] > 0 and sizes[1] > 0)
    except (ValueError, SyntaxError, TypeError, IndexError):
        # Malformed geometry: exclude this node instead of aborting the whole tree walk.
        return False
119
+
120
+
121
def filter_nodes(root: ET.Element, platform: str = "ubuntu", check_image: bool = False) -> list[ET.Element]:
    """Collect every node under root (root included) that judge_node accepts.

    Nodes are returned in the document order produced by root.iter().
    """
    accepted: list[ET.Element] = []
    for candidate in root.iter():
        if judge_node(candidate, platform, check_image):
            accepted.append(candidate)
    return accepted
124
+
125
+
126
def draw_bounding_boxes(
    nodes: list[ET.Element],
    image_file_content: bytes,
    down_sampling_ratio: float = 1.0,
    platform: str = "ubuntu",
) -> Tuple[list, list, str, bytes]:
    """Draw numbered bounding boxes on a screenshot for the given accessibility nodes.

    Args:
        nodes: Accessibility nodes to annotate (typically the output of filter_nodes).
        image_file_content: Encoded screenshot bytes readable by PIL.
        down_sampling_ratio: Scale factor applied to the image and to the drawn boxes.
        platform: "ubuntu" or "windows"; selects the attribute namespaces.

    Returns:
        marks: list of [x, y, w, h] bounding boxes (original coords)
        drew_nodes: list of ET.Element nodes that were actually drawn
        text_informations: tab-separated table of node info (index/tag/name/text)
        image_content: annotated screenshot as PNG bytes

    Raises:
        ValueError: if platform is not supported (raised by _get_ns).
    """
    _, _, _component_ns, _value_ns = _get_ns(platform)

    def _parse_pair(raw: str) -> Tuple[int, int]:
        """Parse "(a, b)" into two ints, tolerating any spacing around the comma.

        Raises ValueError when the text is not exactly two integers.
        """
        first, second = (int(part) for part in raw.strip().strip("()").split(","))
        return first, second

    def _quote(value: str) -> str:
        """CSV-style quoting: wrap in double quotes (doubling inner ones) only when needed."""
        return value if '"' not in value else '"{:}"'.format(value.replace('"', '""'))

    image = Image.open(io.BytesIO(image_file_content))
    scaled = float(down_sampling_ratio) != 1.0
    if scaled:
        image = image.resize(
            (
                int(image.size[0] * down_sampling_ratio),
                int(image.size[1] * down_sampling_ratio),
            )
        )
    draw = ImageDraw.Draw(image)
    marks: list = []
    drew_nodes: list = []
    text_informations: List[str] = ["index\ttag\tname\ttext"]

    try:
        font = ImageFont.truetype("arial.ttf", 15)
    except IOError:
        font = ImageFont.load_default()

    index = 1
    for _node in nodes:
        coords_str = _node.attrib.get(f"{{{_component_ns}}}screencoord")
        size_str = _node.attrib.get(f"{{{_component_ns}}}size")
        if not coords_str or not size_str:
            continue
        try:
            original_coords = _parse_pair(coords_str)
            original_size = _parse_pair(size_str)
            coords, size = original_coords, original_size

            if scaled:
                coords = tuple(int(c * down_sampling_ratio) for c in coords)
                size = tuple(int(s * down_sampling_ratio) for s in size)

            if size[0] <= 0 or size[1] <= 0:
                raise ValueError(f"Size must be positive, got: {size}")

            # With a strictly positive size the bottom-right corner always lies
            # beyond the top-left one, so no further ordering check is needed.
            bottom_right = (coords[0] + size[0], coords[1] + size[1])

            # Skip single-colour (blank) regions
            cropped = image.crop((*coords, *bottom_right))
            if len(set(cropped.getdata())) == 1:
                continue

            draw.rectangle([coords, bottom_right], outline="red", width=1)
            text_pos = (coords[0], bottom_right[1])
            text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_pos, str(index), font=font, anchor="lb")
            draw.rectangle(text_bbox, fill="black")
            draw.text(text_pos, str(index), font=font, anchor="lb", fill="white")

            marks.append([original_coords[0], original_coords[1], original_size[0], original_size[1]])
            drew_nodes.append(_node)

            # Build node text for the element table
            if _node.text:
                node_text = _quote(_node.text)
            elif _node.get(f"{{{class_ns_windows}}}class", "").endswith("EditWrapper") and _node.get(
                f"{{{_value_ns}}}value"
            ):
                node_text = _quote(_node.get(f"{{{_value_ns}}}value", ""))
            else:
                node_text = '""'

            text_informations.append(f"{index}\t{_node.tag}\t{_node.get('name', '')}\t{node_text}")
            index += 1

        except (ValueError, SyntaxError):
            # Deliberate best-effort: a node with unusable geometry (or one the
            # drawing calls reject with ValueError) is skipped so the remaining
            # nodes are still annotated.
            continue

    out = io.BytesIO()
    image.save(out, format="PNG")
    return marks, drew_nodes, "\n".join(text_informations), out.getvalue()
217
+
218
+
219
def linearize_accessibility_tree(accessibility_tree: str, platform: str = "ubuntu") -> str:
    """Render the filtered accessibility tree as a tab-separated table.

    The first row is the header; each following row describes one node kept by
    filter_nodes, with the columns:
    tag, name, text, class, description, position (top-left x&y), size (w&h)

    Args:
        accessibility_tree: Raw XML string from the VM guest agent.
        platform: "ubuntu" or "windows"

    Returns:
        Tab-separated table as a single string.
    """
    attr_ns, _, comp_ns, val_ns = _get_ns(platform)

    # Fully qualified attribute keys, built once rather than per node.
    windows_class_key = f"{{{class_ns_windows}}}class"
    value_key = f"{{{val_ns}}}value"
    class_key = f"{{{attr_ns}}}class" if platform == "ubuntu" else windows_class_key
    description_key = f"{{{attr_ns}}}description"
    position_key = f"{{{comp_ns}}}screencoord"
    size_key = f"{{{comp_ns}}}size"

    def _quote(value: str) -> str:
        # Values containing a double quote are wrapped, with inner quotes doubled.
        if '"' in value:
            return '"{:}"'.format(value.replace('"', '""'))
        return value

    table = ["tag\tname\ttext\tclass\tdescription\tposition (top-left x&y)\tsize (w&h)"]
    for element in filter_nodes(ET.fromstring(accessibility_tree), platform):
        if element.text:
            cell_text = _quote(element.text)
        elif element.get(windows_class_key, "").endswith("EditWrapper") and element.get(value_key):
            # Nodes whose Windows class ends in "EditWrapper" carry their text in the value attribute.
            cell_text = _quote(element.get(value_key, ""))
        else:
            cell_text = '""'

        columns = (
            element.tag,
            element.get("name", ""),
            cell_text,
            element.get(class_key, ""),
            element.get(description_key, ""),
            element.get(position_key, ""),
            element.get(size_key, ""),
        )
        table.append("\t".join(f"{column}" for column in columns))

    return "\n".join(table)
263
+
264
+
265
def tag_screenshot(
    screenshot: bytes, accessibility_tree: str, platform: str = "ubuntu"
) -> Tuple[list, list, bytes, str]:
    """Produce a Set-of-Marks screenshot: numbered boxes around the filtered elements.

    Args:
        screenshot: PNG screenshot bytes
        accessibility_tree: XML string from the VM guest agent
        platform: "ubuntu" or "windows"

    Returns:
        marks: list of [x, y, w, h] for each drawn element
        drew_nodes: ET.Element nodes that were drawn
        tagged_screenshot: annotated PNG bytes
        element_list: tab-separated element table (index/tag/name/text)
    """
    tree_root = ET.fromstring(accessibility_tree)
    candidates = filter_nodes(tree_root, platform=platform, check_image=True)
    drawn = draw_bounding_boxes(candidates, screenshot, platform=platform)
    # draw_bounding_boxes returns the table before the image; this function
    # returns them in the opposite order.
    marks, drew_nodes, element_list, tagged_screenshot = drawn
    return marks, drew_nodes, tagged_screenshot, element_list
@@ -0,0 +1,436 @@
1
+ """
2
+ Computer tool — CUBE tool for VM-based desktop automation.
3
+
4
+ Two variants selected by ComputerConfig.action_space:
5
+ Computer13 — 13 mouse/keyboard primitives + wait/done/fail
6
+ PyAutoGUIComputer — run_pyautogui() code execution + wait/done/fail
7
+
8
+ The tool receives a live VM handle (cube.vm.VM) at construction time.
9
+ VM lifecycle management (launch, reset, stop) is the caller's responsibility —
10
+ typically OSWorldTask or another benchmark-specific Task subclass.
11
+ """
12
+
13
+ import logging
14
+ import time
15
+ from enum import Enum
16
+ from io import BytesIO
17
+ from urllib.parse import urlparse
18
+
19
+ from cube.container import Container
20
+ from cube.core import Action, Content, ImageContent, Observation, StepError, TextContent
21
+ from cube.tool import Tool, ToolConfig, tool_action
22
+ from cube.vm import VM
23
+ from PIL import Image
24
+
25
+ from cube_computer_tool.guest_agent import GuestAgent
26
+ from cube_computer_tool.pyautogui_utils import fix_pyautogui_less_than_bug
27
+
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
+ # ---------------------------------------------------------------------------
32
+ # Enums
33
+ # ---------------------------------------------------------------------------
34
+
35
+
36
class ActionSpace(str, Enum):
    """Action space variants for the Computer tool."""

    # Mixing in str lets members compare equal to their plain string values.
    COMPUTER_13 = "computer_13"  # selects Computer13 in ComputerConfig.make()
    PYAUTOGUI = "pyautogui"  # selects PyAutoGUIComputer in ComputerConfig.make()
41
+
42
+
43
+ # ---------------------------------------------------------------------------
44
+ # Config
45
+ # ---------------------------------------------------------------------------
46
+
47
+
48
class ComputerConfig(ToolConfig):
    """Serializable configuration for Computer tool variants.

    action_space selects the tool variant:
        "computer_13" → Computer13 (13 mouse/keyboard primitives + wait/done/fail)
        "pyautogui" → PyAutoGUIComputer (run_pyautogui + wait/done/fail)

    VM lifecycle (launch/reset/stop) is managed externally and passed in via
    ComputerConfig.make(vm=...). The config itself holds only tool-behaviour
    settings: observation options and action space selection.
    """

    # Which tool class make() instantiates.
    action_space: ActionSpace = ActionSpace.COMPUTER_13
    cache_dir: str = ""
    # Include the accessibility tree / terminal output in each observation.
    require_a11y_tree: bool = True
    require_terminal: bool = False
    # Append a fresh observation to every action result except done/fail.
    observe_after_action: bool = True

    def make(self, container: Container | None = None, vm: VM | None = None) -> "ComputerBase":
        # The container argument is accepted but never used; warn so callers notice.
        if container is not None:
            logger.warning(
                "ComputerConfig.make() received a cube Container, but the Computer tool "
                "uses a VM handle (cube.vm.VM). The container argument will be ignored."
            )
        tool_cls = PyAutoGUIComputer if self.action_space == ActionSpace.PYAUTOGUI else Computer13
        return tool_cls(self, vm=vm)
75
+
76
+
77
+ # ---------------------------------------------------------------------------
78
+ # ComputerBase — shared VM observation and task helpers
79
+ # ---------------------------------------------------------------------------
80
+
81
+
82
class ComputerBase(Tool):
    """
    Shared base for Computer13 and PyAutoGUIComputer.

    Provides VM observation retrieval (screenshot, axtree, terminal) and the
    three terminal @tool_action signals shared by both action spaces:
    wait, done, fail.

    Subclasses add the action-space-specific @tool_action methods.

    The VM is passed in at construction time (vm: VM). If vm is None,
    the tool can still be constructed but will fail when attempting to
    observe or act — useful for deferred VM launch patterns.
    """

    def __init__(self, config: ComputerConfig, vm: VM | None = None) -> None:
        # NOTE(review): Tool.__init__ is not called here — confirm the base class needs no setup.
        self.config = config
        self._vm: VM | None = vm
        self._guest: GuestAgent | None = None
        self._current_task_config: dict | None = None
        self._last_marks: list[list[int]] = []
        self._is_done: bool = False
        self._action_history: list = []

        if vm is not None:
            self._connect_guest(vm)

    def attach_vm(self, vm: VM) -> None:
        """Attach a live VM handle after construction (for deferred-launch patterns)."""
        self._vm = vm
        self._connect_guest(vm)

    def _connect_guest(self, vm: VM) -> None:
        """Parse the VM endpoint and create the GuestAgent HTTP client.

        Accepts full URLs ("http://10.0.0.5:5000") as well as bare "host" or
        "host:port" endpoints. A missing host falls back to "localhost" and a
        missing port to 5000.
        """
        endpoint = vm.endpoint
        # urlparse() only recognises a network location introduced by "//".
        # Without it, "10.0.0.5:5000" is parsed as a path and "myhost:5000" as
        # scheme "myhost", so the hostname would silently become "localhost".
        if isinstance(endpoint, str) and endpoint and "://" not in endpoint and not endpoint.startswith("//"):
            endpoint = f"//{endpoint}"
        parsed = urlparse(endpoint)
        host = parsed.hostname or "localhost"
        port = parsed.port or 5000
        self._guest = GuestAgent(host=host, port=port)

    def execute_action(self, action: Action) -> Observation | StepError:
        """Execute action; append full VM observation if observe_after_action=True."""
        action_obs = super().execute_action(action)

        # done/fail end the episode, so no follow-up observation is fetched for them.
        # NOTE(review): if the base call returns a StepError, "+=" with an
        # Observation may not be supported — confirm against cube.core.
        if self.config.observe_after_action and action.name not in ("done", "fail"):
            action_obs += self.get_observation()

        return action_obs

    def get_observation(self) -> Observation:
        """Read current screen state from the VM and return as Observation.

        Raises:
            RuntimeError: if no VM has been attached yet.
        """
        if self._guest is None:
            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")
        # The screenshot is always fetched; the other two only when enabled in the config.
        raw_obs = {
            "screenshot": self._guest.get_screenshot(),
            "accessibility_tree": self._guest.get_accessibility_tree() if self.config.require_a11y_tree else None,
            "terminal": self._guest.get_terminal_output() if self.config.require_terminal else None,
        }
        return self._convert_observation(raw_obs)

    def _convert_observation(self, raw_obs: dict) -> Observation:
        """Convert VM observation dict to a cube Observation.

        Missing or empty entries are omitted from the resulting contents.
        """
        contents: list[Content] = []

        if raw_obs.get("screenshot"):
            img = Image.open(BytesIO(raw_obs["screenshot"])).convert("RGB")
            contents.append(ImageContent(data=img, name="screenshot"))

        if raw_obs.get("accessibility_tree"):
            contents.append(TextContent(data=raw_obs["accessibility_tree"], name="accessibility_tree"))

        if raw_obs.get("terminal"):
            contents.append(TextContent(data=raw_obs["terminal"], name="terminal"))

        return Observation(contents=contents)

    def _execute_desktop_action(self, action_dict: dict | str) -> str:
        """Send an action to the guest VM and return a success string.

        A dict is sent as a structured action; anything else is stringified and
        run as a Python command. The action is recorded in the history afterwards.

        Raises:
            RuntimeError: if no VM has been attached yet.
        """
        if self._guest is None:
            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")
        if isinstance(action_dict, dict):
            self._guest.execute_action(action_dict)
        else:
            self._guest.execute_python_command(str(action_dict))
        self._action_history.append(action_dict)
        return "Success"

    def update_marks(self, marks: list[list[int]]) -> None:
        """Store SoM bounding-box marks for tag_N variable resolution in run_pyautogui."""
        self._last_marks = marks

    def reset(self) -> None:
        """Reset tool state between tasks (cube AbstractTool.reset() override)."""
        # The VM handle and guest client are kept; only per-task state is cleared.
        self._last_marks = []
        self._is_done = False
        self._action_history = []

    def close(self) -> None:
        """Release tool resources. Does NOT stop the VM — caller owns VM lifecycle."""
        logger.info("Closing ComputerBase tool")

    @tool_action
    def wait(self) -> str:
        """Wait one step without taking any action."""
        self._action_history.append("WAIT")
        return "Success"

    @tool_action
    def done(self) -> str:
        """Signal that the task has been completed successfully."""
        self._is_done = True
        self._action_history.append("DONE")
        return "Task marked as done"

    @tool_action
    def fail(self) -> str:
        """Signal that the task cannot be completed (infeasible or failed)."""
        self._is_done = True
        self._action_history.append("FAIL")
        return "Task marked as failed"
201
+
202
+
203
+ # ---------------------------------------------------------------------------
204
+ # Computer13 — 13 mouse/keyboard primitives
205
+ # ---------------------------------------------------------------------------
206
+
207
+
208
class Computer13(ComputerBase):
    """
    Desktop/VM computer tool with the computer_13 action space.

    Exposes 13 mouse/keyboard primitives as @tool_action methods, plus the
    shared wait/done/fail terminal signals inherited from ComputerBase.
    """

    # Every primitive builds an {"action_type": ..., "parameters": ...} dict and
    # forwards it to ComputerBase._execute_desktop_action, which returns "Success".

    @tool_action
    def click(
        self,
        button: str = "left",
        x: int = -1,
        y: int = -1,
        num_clicks: int = 1,
    ) -> str:
        """Click the mouse button at optional coordinates.

        Parameters
        ----------
        button : str
            Mouse button — "left", "right", or "middle"
        x : int
            X coordinate to click at (-1 = use current cursor position)
        y : int
            Y coordinate to click at (-1 = use current cursor position)
        num_clicks : int
            Number of clicks (1 for single, 2 for double, etc.)
        """
        params: dict = {"button": button, "num_clicks": num_clicks}
        # Negative coordinates mean "not given" and are left out of the payload.
        if x >= 0:
            params["x"] = x
        if y >= 0:
            params["y"] = y
        return self._execute_desktop_action({"action_type": "CLICK", "parameters": params})

    @tool_action
    def double_click(self, x: int = -1, y: int = -1) -> str:
        """Double-click the mouse at optional coordinates.

        Parameters
        ----------
        x : int
            X coordinate (-1 = use current cursor position)
        y : int
            Y coordinate (-1 = use current cursor position)
        """
        return self.click(x=x, y=y, num_clicks=2)

    @tool_action
    def right_click(self, x: int = -1, y: int = -1) -> str:
        """Right-click the mouse at optional coordinates.

        Parameters
        ----------
        x : int
            X coordinate (-1 = use current cursor position)
        y : int
            Y coordinate (-1 = use current cursor position)
        """
        return self.click(button="right", x=x, y=y)

    @tool_action
    def mouse_down(self, button: str = "left") -> str:
        """Press and hold a mouse button.

        Parameters
        ----------
        button : str
            Mouse button — "left", "right", or "middle"
        """
        return self._execute_desktop_action({"action_type": "MOUSE_DOWN", "parameters": {"button": button}})

    @tool_action
    def mouse_up(self, button: str = "left") -> str:
        """Release a held mouse button.

        Parameters
        ----------
        button : str
            Mouse button — "left", "right", or "middle"
        """
        return self._execute_desktop_action({"action_type": "MOUSE_UP", "parameters": {"button": button}})

    @tool_action
    def move_to(self, x: int, y: int) -> str:
        """Move the mouse cursor to pixel coordinates without clicking.

        Parameters
        ----------
        x : int
            Target X coordinate
        y : int
            Target Y coordinate
        """
        return self._execute_desktop_action({"action_type": "MOVE_TO", "parameters": {"x": x, "y": y}})

    @tool_action
    def drag_to(self, x: int, y: int) -> str:
        """Click-and-drag from the current cursor position to (x, y).

        Parameters
        ----------
        x : int
            Target X coordinate
        y : int
            Target Y coordinate
        """
        return self._execute_desktop_action({"action_type": "DRAG_TO", "parameters": {"x": x, "y": y}})

    @tool_action
    def scroll(self, dx: int, dy: int) -> str:
        """Scroll the mouse wheel.

        Parameters
        ----------
        dx : int
            Horizontal scroll amount (positive = right)
        dy : int
            Vertical scroll amount (positive = down)
        """
        return self._execute_desktop_action({"action_type": "SCROLL", "parameters": {"dx": dx, "dy": dy}})

    @tool_action
    def typing(self, text: str) -> str:
        """Type text into the currently focused element.

        Parameters
        ----------
        text : str
            The text to type
        """
        return self._execute_desktop_action({"action_type": "TYPING", "parameters": {"text": text}})

    @tool_action
    def press(self, key: str) -> str:
        """Press and release a single key.

        Parameters
        ----------
        key : str
            Key name (e.g. "enter", "esc", "tab", "backspace", "space")
        """
        return self._execute_desktop_action({"action_type": "PRESS", "parameters": {"key": key}})

    @tool_action
    def key_down(self, key: str) -> str:
        """Press a key down without releasing it.

        Parameters
        ----------
        key : str
            Key name (e.g. "ctrl", "shift", "alt")
        """
        return self._execute_desktop_action({"action_type": "KEY_DOWN", "parameters": {"key": key}})

    @tool_action
    def key_up(self, key: str) -> str:
        """Release a previously held key.

        Parameters
        ----------
        key : str
            Key name (e.g. "ctrl", "shift", "alt")
        """
        return self._execute_desktop_action({"action_type": "KEY_UP", "parameters": {"key": key}})

    @tool_action
    def hotkey(self, keys: str) -> str:
        """Press a key combination simultaneously (e.g. Ctrl+C).

        Parameters
        ----------
        keys : str
            Key names joined by '+' (e.g. "ctrl+c", "ctrl+shift+t")
        """
        # A non-string value (e.g. an already-split list) is forwarded unchanged.
        key_list = keys
        if isinstance(keys, str):
            # Trim whitespace around each name so "ctrl + c" works like "ctrl+c".
            # A piece that is only whitespace is kept as-is, because " " is itself
            # a key name (the space bar).
            key_list = [piece.strip() or piece for piece in keys.split("+")]
        return self._execute_desktop_action({"action_type": "HOTKEY", "parameters": {"keys": key_list}})
387
+
388
+
389
+ # ---------------------------------------------------------------------------
390
+ # PyAutoGUIComputer — pyautogui code execution action space
391
+ # ---------------------------------------------------------------------------
392
+
393
+
394
class PyAutoGUIComputer(ComputerBase):
    """
    Desktop/VM computer tool with the pyautogui action space.

    Exposes run_pyautogui() as a @tool_action method, plus the shared
    wait/done/fail terminal signals inherited from ComputerBase.

    The agent writes Python code using pyautogui; SoM tag_N variables
    (center coordinates of numbered bounding boxes) are prepended automatically
    so agents can reference screen elements by index.
    """

    @tool_action
    def run_pyautogui(self, code: str) -> str:
        """Execute Python code using pyautogui in the VM.

        Parameters
        ----------
        code : str
            Python code to execute (e.g. "pyautogui.click(100, 200)"). If SoM
            bounding boxes are available, tag_1, tag_2, ... variables are
            prepended as center coordinates (e.g. "pyautogui.click(*tag_3)").
        """
        guest = self._guest
        if guest is None:
            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")

        # One "tag_N = (cx, cy)" line per stored [x, y, w, h] mark, numbered from 1.
        preamble = "".join(
            f"tag_{number} = ({int(x + w // 2)}, {int(y + h // 2)})\n"
            for number, (x, y, w, h) in enumerate(self._last_marks, start=1)
        )

        outcome = guest.execute_python_command(fix_pyautogui_less_than_bug(preamble + code))
        time.sleep(2)  # replicate desktop_env.step()'s default pause

        if outcome:
            exit_status = outcome.get("returncode", 0)
            message = outcome.get("error", "") or outcome.get("stderr", "")
            # Only a non-zero return code together with error text counts as a failure.
            if exit_status != 0 and message:
                return f"Error executing code:\n{message.strip()}"

        # Only the agent's own code is recorded, not the generated tag preamble.
        self._action_history.append(code)
        return "Success"
@@ -0,0 +1,426 @@
1
+ """HTTP client for the Flask guest agent running inside a desktop VM.
2
+
3
+ The guest agent server runs at port 5000 inside the VM and exposes endpoints
4
+ for screenshots, accessibility trees, command execution, and file I/O.
5
+
6
+ Originally ported from desktop_env.controllers.python.PythonController.
7
+ """
8
+
9
+ import json
10
+ import logging
11
+ import random
12
+ import time
13
+ import traceback
14
+ from typing import Any
15
+
16
+ import requests
17
+
18
+ logger = logging.getLogger(__name__)
19
+
20
+ # fmt: off
21
+ _KEYBOARD_KEYS = [
22
+ "\t", "\n", "\r", " ", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*",
23
+ "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8",
24
+ "9", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`",
25
+ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
26
+ "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|",
27
+ "}", "~", "accept", "add", "alt", "altleft", "altright", "apps",
28
+ "backspace", "browserback", "browserfavorites", "browserforward",
29
+ "browserhome", "browserrefresh", "browsersearch", "browserstop",
30
+ "capslock", "clear", "convert", "ctrl", "ctrlleft", "ctrlright",
31
+ "decimal", "del", "delete", "divide", "down", "end", "enter", "esc",
32
+ "escape", "execute", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8",
33
+ "f9", "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18",
34
+ "f19", "f20", "final", "fn", "hanguel", "hangul", "hanja", "help",
35
+ "home", "insert", "junja", "kana", "kanji", "launchapp1", "launchapp2",
36
+ "launchmail", "launchmediaselect", "left", "modechange", "multiply",
37
+ "nexttrack", "nonconvert", "num0", "num1", "num2", "num3", "num4",
38
+ "num5", "num6", "num7", "num8", "num9", "numlock", "pagedown", "pageup",
39
+ "pause", "pgdn", "pgup", "playpause", "prevtrack", "print", "printscreen",
40
+ "prntscrn", "prtsc", "prtscr", "return", "right", "scrolllock", "select",
41
+ "separator", "shift", "shiftleft", "shiftright", "sleep", "space", "stop",
42
+ "subtract", "tab", "up", "volumedown", "volumemute", "volumeup", "win",
43
+ "winleft", "winright", "yen", "command", "option", "optionleft",
44
+ "optionright",
45
+ ]
46
+ # fmt: on
47
+
48
+ _PYAUTOGUI_PREFIX = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"
49
+
50
+ _RETRY_TIMES = 3
51
+ _RETRY_INTERVAL = 5
52
+
53
+
54
+ class GuestAgent:
55
+ """HTTP client for the Flask agent server running inside a desktop VM.
56
+
57
+ Parameters
58
+ ----------
59
+ host : str
60
+ Hostname or IP of the server (typically "localhost" with port-forwarded QEMU).
61
+ port : int
62
+ Host port mapped to the guest's Flask server (default 5000).
63
+ """
64
+
65
+ def __init__(self, host: str = "localhost", port: int = 5000) -> None:
66
+ self.host = host
67
+ self.port = port
68
+ self._base_url = f"http://{host}:{port}"
69
+
70
+ # ------------------------------------------------------------------
71
+ # Observation retrieval
72
+ # ------------------------------------------------------------------
73
+
74
+ def get_screenshot(self) -> bytes | None:
75
+ """Return raw PNG/JPEG bytes of the current screen, or None on failure."""
76
+ for attempt in range(_RETRY_TIMES):
77
+ try:
78
+ resp = requests.get(self._base_url + "/screenshot", timeout=10)
79
+ if resp.status_code == 200 and self._is_valid_image(resp.headers.get("Content-Type", ""), resp.content):
80
+ return resp.content
81
+ logger.error("Invalid screenshot response (attempt %d/%d)", attempt + 1, _RETRY_TIMES)
82
+ except Exception as exc:
83
+ logger.error("Screenshot error: %s", exc)
84
+ time.sleep(_RETRY_INTERVAL)
85
+ logger.error("Failed to get screenshot after %d attempts", _RETRY_TIMES)
86
+ return None
87
+
88
+ def get_accessibility_tree(self) -> str | None:
89
+ """Return the XML accessibility tree string, or None on failure."""
90
+ for _ in range(_RETRY_TIMES):
91
+ try:
92
+ resp = requests.get(self._base_url + "/accessibility")
93
+ if resp.status_code == 200:
94
+ return resp.json()["AT"]
95
+ logger.error("Accessibility tree error: %d", resp.status_code)
96
+ except Exception as exc:
97
+ logger.error("Accessibility tree error: %s", exc)
98
+ time.sleep(_RETRY_INTERVAL)
99
+ logger.error("Failed to get accessibility tree")
100
+ return None
101
+
102
+ def get_terminal_output(self) -> str | None:
103
+ """Return the terminal output string, or None on failure."""
104
+ for _ in range(_RETRY_TIMES):
105
+ try:
106
+ resp = requests.get(self._base_url + "/terminal")
107
+ if resp.status_code == 200:
108
+ return resp.json()["output"]
109
+ logger.error("Terminal output error: %d", resp.status_code)
110
+ except Exception as exc:
111
+ logger.error("Terminal output error: %s", exc)
112
+ time.sleep(_RETRY_INTERVAL)
113
+ logger.error("Failed to get terminal output")
114
+ return None
115
+
116
+ def get_file(self, file_path: str) -> bytes | None:
117
+ """Download a file from the VM by path, or None on failure."""
118
+ for _ in range(_RETRY_TIMES):
119
+ try:
120
+ resp = requests.post(self._base_url + "/file", data={"file_path": file_path})
121
+ if resp.status_code == 200:
122
+ return resp.content
123
+ logger.error("Get file error: %d", resp.status_code)
124
+ except Exception as exc:
125
+ logger.error("Get file error: %s", exc)
126
+ time.sleep(_RETRY_INTERVAL)
127
+ logger.error("Failed to get file: %s", file_path)
128
+ return None
129
+
130
+ # ------------------------------------------------------------------
131
+ # Command execution
132
+ # ------------------------------------------------------------------
133
+
134
+ def execute_python_command(self, command: str) -> dict[str, Any] | None:
135
+ """Execute a Python command via pyautogui prefix inside the VM."""
136
+ command_list = ["python", "-c", _PYAUTOGUI_PREFIX.format(command=command)]
137
+ payload = json.dumps({"command": command_list, "shell": False})
138
+
139
+ for _ in range(_RETRY_TIMES):
140
+ try:
141
+ resp = requests.post(
142
+ self._base_url + "/execute",
143
+ headers={"Content-Type": "application/json"},
144
+ data=payload,
145
+ timeout=90,
146
+ )
147
+ if resp.status_code == 200:
148
+ return resp.json()
149
+ logger.error("Execute python error: %d", resp.status_code)
150
+ except requests.exceptions.ReadTimeout:
151
+ break
152
+ except Exception as exc:
153
+ logger.error("Execute python error: %s", exc)
154
+ time.sleep(_RETRY_INTERVAL)
155
+ logger.error("Failed to execute python command")
156
+ return None
157
+
158
+ def run_python_script(self, script: str) -> dict[str, Any] | None:
159
+ """Execute a Python script file inside the VM via /run_python."""
160
+ payload = json.dumps({"code": script})
161
+
162
+ for _ in range(_RETRY_TIMES):
163
+ try:
164
+ resp = requests.post(
165
+ self._base_url + "/run_python",
166
+ headers={"Content-Type": "application/json"},
167
+ data=payload,
168
+ timeout=90,
169
+ )
170
+ if resp.status_code == 200:
171
+ return resp.json()
172
+ return {
173
+ "status": "error",
174
+ "message": "Request failed",
175
+ "output": None,
176
+ "error": resp.json().get("error"),
177
+ }
178
+ except requests.exceptions.ReadTimeout:
179
+ break
180
+ except Exception:
181
+ logger.error("Run python script error: %s", traceback.format_exc())
182
+ time.sleep(_RETRY_INTERVAL)
183
+ return {"status": "error", "message": "Retry limit reached", "output": "", "error": "Retry limit reached."}
184
+
185
+ def run_bash_script(self, script: str, timeout: int = 30, working_dir: str | None = None) -> dict[str, Any] | None:
186
+ """Execute a bash script inside the VM via /run_bash_script."""
187
+ payload = json.dumps({"script": script, "timeout": timeout, "working_dir": working_dir})
188
+
189
+ for _ in range(_RETRY_TIMES):
190
+ try:
191
+ resp = requests.post(
192
+ self._base_url + "/run_bash_script",
193
+ headers={"Content-Type": "application/json"},
194
+ data=payload,
195
+ timeout=timeout + 100,
196
+ )
197
+ if resp.status_code == 200:
198
+ return resp.json()
199
+ logger.error("Run bash script error: %d %s", resp.status_code, resp.text)
200
+ except requests.exceptions.ReadTimeout:
201
+ return {"status": "error", "output": "", "error": f"Timed out after {timeout}s", "returncode": -1}
202
+ except Exception as exc:
203
+ logger.error("Run bash script error: %s", exc)
204
+ time.sleep(_RETRY_INTERVAL)
205
+ return {"status": "error", "output": "", "error": f"Failed after {_RETRY_TIMES} retries", "returncode": -1}
206
+
207
+ def execute_action(self, action: dict[str, Any]) -> None:
208
+ """Dispatch a computer_13 action dict to the appropriate pyautogui command.
209
+
210
+ Mirrors the dispatch table in desktop_env.controllers.python.PythonController.execute_action.
211
+ """
212
+ if action in ("WAIT", "FAIL", "DONE"):
213
+ return
214
+
215
+ action_type: str = action["action_type"]
216
+ parameters: dict = action.get("parameters") or {k: v for k, v in action.items() if k != "action_type"}
217
+
218
+ move_mode = random.choice(
219
+ [
220
+ "pyautogui.easeInQuad",
221
+ "pyautogui.easeOutQuad",
222
+ "pyautogui.easeInOutQuad",
223
+ "pyautogui.easeInBounce",
224
+ "pyautogui.easeInElastic",
225
+ ]
226
+ )
227
+ duration = random.uniform(0.5, 1)
228
+
229
+ if action_type == "MOVE_TO":
230
+ if not parameters:
231
+ self.execute_python_command("pyautogui.moveTo()")
232
+ elif "x" in parameters and "y" in parameters:
233
+ self.execute_python_command(
234
+ f"pyautogui.moveTo({parameters['x']}, {parameters['y']}, {duration}, {move_mode})"
235
+ )
236
+ else:
237
+ raise ValueError(f"Unknown MOVE_TO parameters: {parameters}")
238
+
239
+ elif action_type == "CLICK":
240
+ if not parameters:
241
+ self.execute_python_command("pyautogui.click()")
242
+ elif "button" in parameters and "x" in parameters and "y" in parameters:
243
+ btn, x, y = parameters["button"], parameters["x"], parameters["y"]
244
+ nc = parameters.get("num_clicks")
245
+ if nc:
246
+ self.execute_python_command(f"pyautogui.click(button='{btn}', x={x}, y={y}, clicks={nc})")
247
+ else:
248
+ self.execute_python_command(f"pyautogui.click(button='{btn}', x={x}, y={y})")
249
+ elif "button" in parameters:
250
+ btn = parameters["button"]
251
+ nc = parameters.get("num_clicks")
252
+ if nc:
253
+ self.execute_python_command(f"pyautogui.click(button='{btn}', clicks={nc})")
254
+ else:
255
+ self.execute_python_command(f"pyautogui.click(button='{btn}')")
256
+ elif "x" in parameters and "y" in parameters:
257
+ x, y = parameters["x"], parameters["y"]
258
+ nc = parameters.get("num_clicks")
259
+ if nc:
260
+ self.execute_python_command(f"pyautogui.click(x={x}, y={y}, clicks={nc})")
261
+ else:
262
+ self.execute_python_command(f"pyautogui.click(x={x}, y={y})")
263
+ else:
264
+ raise ValueError(f"Unknown CLICK parameters: {parameters}")
265
+
266
+ elif action_type == "MOUSE_DOWN":
267
+ btn = parameters.get("button", "left")
268
+ self.execute_python_command(f"pyautogui.mouseDown(button='{btn}')")
269
+
270
+ elif action_type == "MOUSE_UP":
271
+ btn = parameters.get("button", "left")
272
+ self.execute_python_command(f"pyautogui.mouseUp(button='{btn}')")
273
+
274
+ elif action_type == "RIGHT_CLICK":
275
+ if "x" in parameters and "y" in parameters:
276
+ self.execute_python_command(f"pyautogui.rightClick(x={parameters['x']}, y={parameters['y']})")
277
+ else:
278
+ self.execute_python_command("pyautogui.rightClick()")
279
+
280
+ elif action_type == "DOUBLE_CLICK":
281
+ if "x" in parameters and "y" in parameters:
282
+ self.execute_python_command(f"pyautogui.doubleClick(x={parameters['x']}, y={parameters['y']})")
283
+ else:
284
+ self.execute_python_command("pyautogui.doubleClick()")
285
+
286
+ elif action_type == "DRAG_TO":
287
+ self.execute_python_command(
288
+ f"pyautogui.dragTo({parameters['x']}, {parameters['y']}, duration=1.0, button='left', mouseDownUp=True)"
289
+ )
290
+
291
+ elif action_type == "SCROLL":
292
+ dx = parameters.get("dx")
293
+ dy = parameters.get("dy")
294
+ if dx is not None:
295
+ self.execute_python_command(f"pyautogui.hscroll({dx})")
296
+ if dy is not None:
297
+ self.execute_python_command(f"pyautogui.vscroll({dy})")
298
+
299
+ elif action_type == "TYPING":
300
+ text = parameters["text"]
301
+ self.execute_python_command("pyautogui.typewrite({:})".format(repr(text)))
302
+
303
+ elif action_type == "PRESS":
304
+ key = parameters["key"]
305
+ if key.lower() not in _KEYBOARD_KEYS:
306
+ raise ValueError(f"Key must be one of the known keyboard keys, got: {key!r}")
307
+ self.execute_python_command(f"pyautogui.press('{key}')")
308
+
309
+ elif action_type == "KEY_DOWN":
310
+ key = parameters["key"]
311
+ if key.lower() not in _KEYBOARD_KEYS:
312
+ raise ValueError(f"Key must be one of the known keyboard keys, got: {key!r}")
313
+ self.execute_python_command(f"pyautogui.keyDown('{key}')")
314
+
315
+ elif action_type == "KEY_UP":
316
+ key = parameters["key"]
317
+ if key.lower() not in _KEYBOARD_KEYS:
318
+ raise ValueError(f"Key must be one of the known keyboard keys, got: {key!r}")
319
+ self.execute_python_command(f"pyautogui.keyUp('{key}')")
320
+
321
+ elif action_type == "HOTKEY":
322
+ keys: list[str] = parameters["keys"]
323
+ if not isinstance(keys, list):
324
+ raise ValueError("HOTKEY keys must be a list")
325
+ for k in keys:
326
+ if k.lower() not in _KEYBOARD_KEYS:
327
+ raise ValueError(f"Key must be one of the known keyboard keys, got: {k!r}")
328
+ self.execute_python_command("pyautogui.hotkey('" + "', '".join(keys) + "')")
329
+
330
+ elif action_type in ("WAIT", "FAIL", "DONE"):
331
+ pass
332
+
333
+ else:
334
+ raise ValueError(f"Unknown action type: {action_type!r}")
335
+
336
+ # ------------------------------------------------------------------
337
+ # VM info
338
+ # ------------------------------------------------------------------
339
+
340
+ def get_vm_platform(self) -> str:
341
+ """Return the platform string (e.g. 'Linux', 'Windows')."""
342
+ result = self.execute_python_command("import platform; print(platform.system())")
343
+ if result and result.get("output"):
344
+ return result["output"].strip()
345
+ return ""
346
+
347
+ def get_vm_screen_size(self) -> dict[str, Any] | None:
348
+ """Return the VM screen size dict."""
349
+ for _ in range(_RETRY_TIMES):
350
+ try:
351
+ resp = requests.post(self._base_url + "/screen_size")
352
+ if resp.status_code == 200:
353
+ return resp.json()
354
+ except Exception as exc:
355
+ logger.error("Screen size error: %s", exc)
356
+ time.sleep(_RETRY_INTERVAL)
357
+ return None
358
+
359
+ def get_vm_window_size(self, app_class_name: str) -> dict[str, Any] | None:
360
+ """Return the window size for an application by class name."""
361
+ for _ in range(_RETRY_TIMES):
362
+ try:
363
+ resp = requests.post(self._base_url + "/window_size", data={"app_class_name": app_class_name})
364
+ if resp.status_code == 200:
365
+ return resp.json()
366
+ except Exception as exc:
367
+ logger.error("Window size error: %s", exc)
368
+ time.sleep(_RETRY_INTERVAL)
369
+ return None
370
+
371
+ def get_vm_desktop_path(self) -> str | None:
372
+ """Return the desktop directory path inside the VM."""
373
+ for _ in range(_RETRY_TIMES):
374
+ try:
375
+ resp = requests.post(self._base_url + "/desktop_path")
376
+ if resp.status_code == 200:
377
+ return resp.json()["desktop_path"]
378
+ except Exception as exc:
379
+ logger.error("Desktop path error: %s", exc)
380
+ time.sleep(_RETRY_INTERVAL)
381
+ return None
382
+
383
+ def get_vm_directory_tree(self, path: str) -> dict[str, Any] | None:
384
+ """Return the directory tree for the given path inside the VM."""
385
+ payload = json.dumps({"path": path})
386
+ for _ in range(_RETRY_TIMES):
387
+ try:
388
+ resp = requests.post(
389
+ self._base_url + "/list_directory",
390
+ headers={"Content-Type": "application/json"},
391
+ data=payload,
392
+ )
393
+ if resp.status_code == 200:
394
+ return resp.json()["directory_tree"]
395
+ except Exception as exc:
396
+ logger.error("Directory tree error: %s", exc)
397
+ time.sleep(_RETRY_INTERVAL)
398
+ return None
399
+
400
+ def get_vm_wallpaper(self) -> bytes | None:
401
+ """Return the current desktop wallpaper image bytes."""
402
+ for _ in range(_RETRY_TIMES):
403
+ try:
404
+ resp = requests.post(self._base_url + "/wallpaper")
405
+ if resp.status_code == 200:
406
+ return resp.content
407
+ except Exception as exc:
408
+ logger.error("Wallpaper error: %s", exc)
409
+ time.sleep(_RETRY_INTERVAL)
410
+ return None
411
+
412
+ # ------------------------------------------------------------------
413
+ # Internal helpers
414
+ # ------------------------------------------------------------------
415
+
416
+ @staticmethod
417
+ def _is_valid_image(content_type: str, data: bytes | None) -> bool:
418
+ if not isinstance(data, (bytes, bytearray)) or not data:
419
+ return False
420
+ if len(data) >= 8 and data[:8] == b"\x89PNG\r\n\x1a\n":
421
+ return True
422
+ if len(data) >= 3 and data[:3] == b"\xff\xd8\xff":
423
+ return True
424
+ if content_type and any(t in content_type for t in ("image/png", "image/jpeg", "image/jpg")):
425
+ return True
426
+ return False
@@ -0,0 +1,64 @@
1
+ """PyAutoGUI utilities for desktop VM tool execution."""
2
+
3
+ import re
4
+
5
+
6
# pyautogui.press('<') / press("<"), including the \u003c-escaped spelling.
_PRESS_LESS_THAN_RE = re.compile(r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)')

# pyautogui.typewrite('...') / typewrite("...") with one quoted argument on one line.
_TYPEWRITE_CALL_RE = re.compile(r'pyautogui\.typewrite\((["\'])(.*?)\1\)')

# Scans the body of a string literal. Group "lt" is a '<', either literal or
# written as an escape (\u003c, \U0000003c, \x3c, \074, \74). The second
# alternative consumes any other backslash pair so that, for example, the
# text "\\u003c" (an escaped backslash followed by "u003c") is not mistaken
# for an escaped '<'.
_LESS_THAN_TOKEN_RE = re.compile(r"(?P<lt><|\\(?:u003[cC]|U0000003[cC]|x3[cC]|074|74(?![0-7])))|\\.")

_SHIFT_COMMA_HOTKEY = 'pyautogui.hotkey("shift", ",")'


def _split_on_less_than(content: str) -> list[str]:
    """Split a string-literal body at every '<', leaving all other text verbatim."""
    pieces = []
    start = 0
    for token in _LESS_THAN_TOKEN_RE.finditer(content):
        if token.group("lt") is None:
            continue  # ordinary escape sequence: stays inside the current piece
        pieces.append(content[start : token.start()])
        start = token.end()
    pieces.append(content[start:])
    return pieces


def _rewrite_typewrite_call(match: re.Match) -> str:
    """Replace one typewrite() call by typewrite()/hotkey() calls if it types a '<'."""
    quote_char = match.group(1)
    pieces = _split_on_less_than(match.group(2))
    if len(pieces) == 1:
        return match.group(0)

    calls = []
    for index, piece in enumerate(pieces):
        if index:
            calls.append(_SHIFT_COMMA_HOTKEY)
        if piece:
            calls.append(f"pyautogui.typewrite({quote_char}{piece}{quote_char})")
    return "; ".join(calls)


def fix_pyautogui_less_than_bug(command: str) -> str:
    """Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.

    This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
    References:
    - https://github.com/asweigart/pyautogui/issues/198
    - https://github.com/xlang-ai/OSWorld/issues/257

    Two call shapes are rewritten: ``pyautogui.press('<')`` and
    ``pyautogui.typewrite('...')`` with a single quoted argument. The text
    around each '<' is copied into the new typewrite() calls exactly as
    written, so escape sequences and non-ASCII characters in the literal are
    preserved rather than decoded.

    Parameters
    ----------
    command : str
        The original pyautogui command string.

    Returns
    -------
    str
        The fixed command with '<' characters handled properly.
    """
    command = _PRESS_LESS_THAN_RE.sub(lambda _match: _SHIFT_COMMA_HOTKEY, command)
    return _TYPEWRITE_CALL_RE.sub(_rewrite_typewrite_call, command)