PyPI - cube-computer-tool - Versions diffs - 0.1.0__tar.gz - Mend

cube-computer-tool 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (7) hide show

cube_computer_tool-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,12 @@
+Metadata-Version: 2.3
+Name: cube-computer-tool
+Version: 0.1.0
+Summary: Generic desktop computer tool for CUBE VM-based benchmarks
+Requires-Dist: cube-standard
+Requires-Dist: pillow>=9.0
+Requires-Dist: requests>=2.28
+Requires-Dist: requests-toolbelt>=0.10
+Requires-Dist: tqdm>=4.60
+Requires-Dist: pytest>=8.0.0 ; extra == 'dev'
+Requires-Python: >=3.12
+Provides-Extra: dev

cube_computer_tool-0.1.0/pyproject.toml ADDED Viewed

@@ -0,0 +1,38 @@
+[project]
+name = "cube-computer-tool"
+version = "0.1.0"
+description = "Generic desktop computer tool for CUBE VM-based benchmarks"
+requires-python = ">=3.12"
+dependencies = [
+    "cube-standard",
+    "pillow>=9.0",
+    "requests>=2.28",
+    "requests-toolbelt>=0.10",
+    "tqdm>=4.60",
+]
+[project.optional-dependencies]
+dev = [
+    "pytest>=8.0.0",
+]
+[build-system]
+requires = ["uv_build>=0.6.0,<0.7.0"]
+build-backend = "uv_build"
+[tool.uv.build-backend]
+module-name = "cube_computer_tool"
+[tool.ruff]
+fix = true
+line-length = 120
+indent-width = 4
+[tool.ruff.format]
+quote-style = "double"
+indent-style = "space"
+skip-magic-trailing-comma = false
+line-ending = "auto"
+[tool.ruff.lint]
+extend-select = ["I"]

cube_computer_tool-0.1.0/src/cube_computer_tool/__init__.py ADDED Viewed

@@ -0,0 +1,18 @@
+"""cube-computer-tool: generic desktop computer tool for CUBE VM-based benchmarks."""
+from cube_computer_tool.axtree import linearize_accessibility_tree, tag_screenshot
+from cube_computer_tool.computer import ActionSpace, Computer13, ComputerBase, ComputerConfig, PyAutoGUIComputer
+from cube_computer_tool.guest_agent import GuestAgent
+from cube_computer_tool.pyautogui_utils import fix_pyautogui_less_than_bug
+# Public API of the package: each name below is one of the symbols imported above
+# from the axtree, computer, guest_agent and pyautogui_utils submodules.
+__all__ = [
+    "ActionSpace",
+    "Computer13",
+    "ComputerBase",
+    "ComputerConfig",
+    "PyAutoGUIComputer",
+    "GuestAgent",
+    "fix_pyautogui_less_than_bug",
+    "linearize_accessibility_tree",
+    "tag_screenshot",
+]

cube_computer_tool-0.1.0/src/cube_computer_tool/axtree.py ADDED Viewed

@@ -0,0 +1,283 @@
+"""Accessibility tree processing utilities for desktop VM benchmarks.
+Provides two modes of converting raw XML accessibility trees to LLM-friendly formats:
+    linearize_accessibility_tree(xml_str, platform) -> str
+        Convert XML accessibility tree to a tab-separated table.
+    tag_screenshot(screenshot_bytes, xml_str, platform) -> (marks, drew_nodes, tagged_bytes, element_list)
+        Draw numbered bounding boxes on a screenshot (Set-of-Marks).
+Originally ported from desktop_env / kusha/AgentLab2 osworld_axtree.py.
+"""
+from __future__ import annotations
+import io
+import xml.etree.ElementTree as ET
+from typing import List, Tuple
+from PIL import Image, ImageDraw, ImageFont
+# XML namespace URLs for accessibility tree attributes.
+# Attribute names are looked up on nodes in ElementTree's "{namespace}name" form,
+# so these strings must match the namespaces used in the incoming XML exactly.
+# There is one attributes/state/component/value namespace per supported platform.
+attributes_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/attributes"
+attributes_ns_windows = "https://accessibility.windows.example.org/ns/attributes"
+state_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/state"
+state_ns_windows = "https://accessibility.windows.example.org/ns/state"
+component_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/component"
+component_ns_windows = "https://accessibility.windows.example.org/ns/component"
+value_ns_ubuntu = "https://accessibility.ubuntu.example.org/ns/value"
+value_ns_windows = "https://accessibility.windows.example.org/ns/value"
+# Only Windows has a dedicated "class" namespace; for Ubuntu the class attribute is
+# read from the attributes namespace instead (see linearize_accessibility_tree).
+class_ns_windows = "https://accessibility.windows.example.org/ns/class"
+def _get_ns(platform: str) -> tuple[str, str, str, str]:
+    """Return (attributes_ns, state_ns, component_ns, value_ns) for the given platform."""
+    if platform == "ubuntu":
+        return attributes_ns_ubuntu, state_ns_ubuntu, component_ns_ubuntu, value_ns_ubuntu
+    if platform == "windows":
+        return attributes_ns_windows, state_ns_windows, component_ns_windows, value_ns_windows
+    raise ValueError(f"Invalid platform '{platform}': must be 'ubuntu' or 'windows'")
+def judge_node(node: ET.Element, platform: str = "ubuntu", check_image: bool = False) -> bool:
+    """Return True if this accessibility tree node should be included in the output.
+    Filters to visible, enabled, and interactable nodes that have a name or text.
+    """
+    _, _state_ns, _component_ns, _ = _get_ns(platform)
+    keeps: bool = (
+        node.tag.startswith("document")
+        or node.tag.endswith("item")
+        or node.tag.endswith("button")
+        or node.tag.endswith("heading")
+        or node.tag.endswith("label")
+        or node.tag.endswith("scrollbar")
+        or node.tag.endswith("searchbox")
+        or node.tag.endswith("textbox")
+        or node.tag.endswith("link")
+        or node.tag.endswith("tabelement")
+        or node.tag.endswith("textfield")
+        or node.tag.endswith("textarea")
+        or node.tag.endswith("menu")
+        or node.tag
+        in {
+            "alert",
+            "canvas",
+            "check-box",
+            "combo-box",
+            "entry",
+            "icon",
+            "image",
+            "paragraph",
+            "scroll-bar",
+            "section",
+            "slider",
+            "static",
+            "table-cell",
+            "terminal",
+            "text",
+            "netuiribbontab",
+            "start",
+            "trayclockwclass",
+            "traydummysearchcontrol",
+            "uiimage",
+            "uiproperty",
+            "uiribboncommandbar",
+        }
+    )
+    keeps = (
+        keeps
+        and (
+            platform == "ubuntu"
+            and node.get(f"{{{_state_ns}}}showing", "false") == "true"
+            and node.get(f"{{{_state_ns}}}visible", "false") == "true"
+            or platform == "windows"
+            and node.get(f"{{{_state_ns}}}visible", "false") == "true"
+        )
+        and (
+            node.get(f"{{{_state_ns}}}enabled", "false") == "true"
+            or node.get(f"{{{_state_ns}}}editable", "false") == "true"
+            or node.get(f"{{{_state_ns}}}expandable", "false") == "true"
+            or node.get(f"{{{_state_ns}}}checkable", "false") == "true"
+        )
+        and (
+            node.get("name", "") != ""
+            or node.text is not None
+            and len(node.text) > 0
+            or check_image
+            and node.get("image", "false") == "true"
+        )
+    )
+    coords: Tuple[int, int] = eval(node.get(f"{{{_component_ns}}}screencoord", "(-1, -1)"))
+    sizes: Tuple[int, int] = eval(node.get(f"{{{_component_ns}}}size", "(-1, -1)"))
+    keeps = keeps and coords[0] >= 0 and coords[1] >= 0 and sizes[0] > 0 and sizes[1] > 0
+    return keeps
+def filter_nodes(root: ET.Element, platform: str = "ubuntu", check_image: bool = False) -> list[ET.Element]:
+    """Return all visible and interactable nodes from the accessibility tree."""
+    return [node for node in root.iter() if judge_node(node, platform, check_image)]
+def draw_bounding_boxes(
+    nodes: list[ET.Element],
+    image_file_content: bytes,
+    down_sampling_ratio: float = 1.0,
+    platform: str = "ubuntu",
+) -> Tuple[list, list, str, bytes]:
+    """Draw numbered bounding boxes on a screenshot for the given accessibility nodes.
+    Args:
+        nodes:               accessibility nodes to annotate (typically the output of filter_nodes)
+        image_file_content:  encoded screenshot bytes readable by PIL
+        down_sampling_ratio: scale factor applied to the image and to the drawn boxes;
+                             the returned marks stay in original (unscaled) coordinates
+        platform:            "ubuntu" or "windows"; selects the attribute namespaces
+    Returns:
+        marks:             list of [x, y, w, h] bounding boxes (original coords)
+        drew_nodes:        list of ET.Element nodes that were actually drawn
+        text_informations: tab-separated table of node info (index/tag/name/text)
+        image_content:     annotated screenshot as PNG bytes
+    Raises:
+        ValueError: if ``platform`` is not supported (raised by _get_ns).
+    """
+    # _state_ns is unpacked but not used in this function.
+    _, _state_ns, _component_ns, _value_ns = _get_ns(platform)
+    image = Image.open(io.BytesIO(image_file_content))
+    if float(down_sampling_ratio) != 1.0:
+        image = image.resize(
+            (
+                int(image.size[0] * down_sampling_ratio),
+                int(image.size[1] * down_sampling_ratio),
+            )
+        )
+    draw = ImageDraw.Draw(image)
+    marks: list = []
+    drew_nodes: list = []
+    text_informations: List[str] = ["index\ttag\tname\ttext"]
+    # Prefer Arial at 15pt; fall back to PIL's built-in font when it cannot be loaded.
+    try:
+        font = ImageFont.truetype("arial.ttf", 15)
+    except IOError:
+        font = ImageFont.load_default()
+    # Labels are 1-based and only advance for nodes that are actually drawn.
+    index = 1
+    for _node in nodes:
+        coords_str = _node.attrib.get(f"{{{_component_ns}}}screencoord")
+        size_str = _node.attrib.get(f"{{{_component_ns}}}size")
+        # Nodes without both geometry attributes cannot be drawn.
+        if not coords_str or not size_str:
+            continue
+        try:
+            # Geometry is expected in the textual form "(a, b)".
+            # NOTE(review): a value with a single component would raise IndexError below,
+            # which is not in the except clause — confirm the guest never emits that.
+            coords = tuple(map(int, coords_str.strip("()").split(", ")))
+            size = tuple(map(int, size_str.strip("()").split(", ")))
+            # Keep the unscaled geometry for the returned marks.
+            original_coords = coords
+            original_size = size
+            if float(down_sampling_ratio) != 1.0:
+                coords = tuple(int(c * down_sampling_ratio) for c in coords)
+                size = tuple(int(s * down_sampling_ratio) for s in size)
+            if size[0] <= 0 or size[1] <= 0:
+                raise ValueError(f"Size must be positive, got: {size}")
+            bottom_right = (coords[0] + size[0], coords[1] + size[1])
+            if bottom_right[0] < coords[0] or bottom_right[1] < coords[1]:
+                raise ValueError(f"Invalid coordinates: coords={coords}, size={size}")
+            # Skip single-colour (blank) regions
+            # The crop is taken from the image being annotated, so boxes and labels drawn
+            # for earlier nodes are part of the pixels examined here.
+            cropped = image.crop((*coords, *bottom_right))
+            if len(set(list(cropped.getdata()))) == 1:
+                continue
+            draw.rectangle([coords, bottom_right], outline="red", width=1)
+            # The index label is anchored at the bottom-left corner of the box
+            # (white text on a black background).
+            text_pos = (coords[0], bottom_right[1])
+            text_bbox: Tuple[int, int, int, int] = draw.textbbox(text_pos, str(index), font=font, anchor="lb")
+            draw.rectangle(text_bbox, fill="black")
+            draw.text(text_pos, str(index), font=font, anchor="lb", fill="white")
+            marks.append([original_coords[0], original_coords[1], original_size[0], original_size[1]])
+            drew_nodes.append(_node)
+            # Build node text for the element table
+            # Text containing a double quote is wrapped in quotes with inner quotes doubled.
+            if _node.text:
+                node_text = _node.text if '"' not in _node.text else '"{:}"'.format(_node.text.replace('"', '""'))
+            elif _node.get(f"{{{class_ns_windows}}}class", "").endswith("EditWrapper") and _node.get(
+                f"{{{_value_ns}}}value"
+            ):
+                # Nodes whose Windows class ends with "EditWrapper" take their text from the value attribute.
+                raw = _node.get(f"{{{_value_ns}}}value", "")
+                node_text = raw if '"' not in raw else '"{:}"'.format(raw.replace('"', '""'))
+            else:
+                node_text = '""'
+            text_informations.append(f"{index}\t{_node.tag}\t{_node.get('name', '')}\t{node_text}")
+            index += 1
+        except (ValueError, SyntaxError):
+            # Nodes with unparseable or invalid geometry are skipped silently; the index
+            # is not advanced for them.
+            pass
+    out = io.BytesIO()
+    image.save(out, format="PNG")
+    return marks, drew_nodes, "\n".join(text_informations), out.getvalue()
+def linearize_accessibility_tree(accessibility_tree: str, platform: str = "ubuntu") -> str:
+    """Convert an XML accessibility tree to a tab-separated table for the agent.
+    Columns: tag, name, text, class, description, position (top-left x&y), size (w&h)
+    Args:
+        accessibility_tree: Raw XML string from the VM guest agent.
+        platform: "ubuntu" or "windows"
+    Returns:
+        Tab-separated table as a single string.
+    """
+    _attributes_ns, _state_ns, _component_ns, _value_ns = _get_ns(platform)
+    filtered_nodes = filter_nodes(ET.fromstring(accessibility_tree), platform)
+    rows = ["tag\tname\ttext\tclass\tdescription\tposition (top-left x&y)\tsize (w&h)"]
+    for node in filtered_nodes:
+        if node.text:
+            text = node.text if '"' not in node.text else '"{:}"'.format(node.text.replace('"', '""'))
+        elif node.get(f"{{{class_ns_windows}}}class", "").endswith("EditWrapper") and node.get(f"{{{_value_ns}}}value"):
+            raw = node.get(f"{{{_value_ns}}}value", "")
+            text = raw if '"' not in raw else '"{:}"'.format(raw.replace('"', '""'))
+        else:
+            text = '""'
+        cls = (
+            node.get(f"{{{_attributes_ns}}}class", "")
+            if platform == "ubuntu"
+            else node.get(f"{{{class_ns_windows}}}class", "")
+        )
+        rows.append(
+            "{}\t{}\t{}\t{}\t{}\t{}\t{}".format(
+                node.tag,
+                node.get("name", ""),
+                text,
+                cls,
+                node.get(f"{{{_attributes_ns}}}description", ""),
+                node.get(f"{{{_component_ns}}}screencoord", ""),
+                node.get(f"{{{_component_ns}}}size", ""),
+            )
+        )
+    return "\n".join(rows)
+def tag_screenshot(
+    screenshot: bytes, accessibility_tree: str, platform: str = "ubuntu"
+) -> Tuple[list, list, bytes, str]:
+    """Annotate a screenshot with numbered bounding boxes for interactive elements.
+    Args:
+        screenshot: PNG screenshot bytes
+        accessibility_tree: XML string from the VM guest agent
+        platform: "ubuntu" or "windows"
+    Returns:
+        marks:             list of [x, y, w, h] for each drawn element
+        drew_nodes:        ET.Element nodes that were drawn
+        tagged_screenshot: annotated PNG bytes
+        element_list:      tab-separated element table (index/tag/name/text)
+    """
+    nodes = filter_nodes(ET.fromstring(accessibility_tree), platform=platform, check_image=True)
+    marks, drew_nodes, element_list, tagged_screenshot = draw_bounding_boxes(nodes, screenshot, platform=platform)
+    return marks, drew_nodes, tagged_screenshot, element_list

cube_computer_tool-0.1.0/src/cube_computer_tool/computer.py ADDED Viewed

@@ -0,0 +1,436 @@
+"""
+Computer tool — CUBE tool for VM-based desktop automation.
+Two variants selected by ComputerConfig.action_space:
+    Computer13        — 13 mouse/keyboard primitives + wait/done/fail
+    PyAutoGUIComputer — run_pyautogui() code execution + wait/done/fail
+The tool receives a live VM handle (cube.vm.VM) at construction time.
+VM lifecycle management (launch, reset, stop) is the caller's responsibility —
+typically OSWorldTask or another benchmark-specific Task subclass.
+"""
+import logging
+import time
+from enum import Enum
+from io import BytesIO
+from urllib.parse import urlparse
+from cube.container import Container
+from cube.core import Action, Content, ImageContent, Observation, StepError, TextContent
+from cube.tool import Tool, ToolConfig, tool_action
+from cube.vm import VM
+from PIL import Image
+from cube_computer_tool.guest_agent import GuestAgent
+from cube_computer_tool.pyautogui_utils import fix_pyautogui_less_than_bug
+logger = logging.getLogger(__name__)
+# ---------------------------------------------------------------------------
+# Enums
+# ---------------------------------------------------------------------------
+class ActionSpace(str, Enum):
+    """Action space variants for the Computer tool."""
+    # Members are also str instances, so they compare equal to their plain string values.
+    # ComputerConfig.make() builds PyAutoGUIComputer for PYAUTOGUI and Computer13 otherwise.
+    COMPUTER_13 = "computer_13"
+    PYAUTOGUI = "pyautogui"
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+class ComputerConfig(ToolConfig):
+    """Serializable configuration for Computer tool variants.
+    action_space selects the tool variant:
+      "computer_13" → Computer13 (13 mouse/keyboard primitives + wait/done/fail)
+      "pyautogui"   → PyAutoGUIComputer (run_pyautogui + wait/done/fail)
+    VM lifecycle (launch/reset/stop) is managed externally and passed in via
+    ComputerConfig.make(vm=...). The config itself holds only tool-behaviour
+    settings: observation options and action space selection.
+    """
+    action_space: ActionSpace = ActionSpace.COMPUTER_13
+    cache_dir: str = ""
+    require_a11y_tree: bool = True
+    require_terminal: bool = False
+    observe_after_action: bool = True
+    def make(self, container: Container | None = None, vm: VM | None = None) -> "ComputerBase":
+        if container is not None:
+            logger.warning(
+                "ComputerConfig.make() received a cube Container, but the Computer tool "
+                "uses a VM handle (cube.vm.VM). The container argument will be ignored."
+            )
+        if self.action_space == ActionSpace.PYAUTOGUI:
+            return PyAutoGUIComputer(self, vm=vm)
+        return Computer13(self, vm=vm)
+# ---------------------------------------------------------------------------
+# ComputerBase — shared VM observation and task helpers
+# ---------------------------------------------------------------------------
+class ComputerBase(Tool):
+    """
+    Shared base for Computer13 and PyAutoGUIComputer.
+    Provides VM observation retrieval (screenshot, axtree, terminal) and the
+    three terminal @tool_action signals shared by both action spaces:
+    wait, done, fail.
+    Subclasses add the action-space-specific @tool_action methods.
+    The VM is passed in at construction time (vm: VM). If vm is None,
+    the tool can still be constructed but will fail when attempting to
+    observe or act — useful for deferred VM launch patterns.
+    """
+    # NOTE(review): docstrings of @tool_action methods are presumably consumed by the
+    # cube framework to describe the tool — confirm before rewording them.
+    def __init__(self, config: ComputerConfig, vm: VM | None = None) -> None:
+        """Store the config, initialise per-task state and connect to ``vm`` if one is given."""
+        self.config = config
+        self._vm: VM | None = vm
+        # HTTP client for the guest agent; stays None until a VM is attached.
+        self._guest: GuestAgent | None = None
+        # Initialised to None here; not assigned anywhere else in this class.
+        self._current_task_config: dict | None = None
+        # Bounding boxes stored via update_marks(); cleared by reset().
+        self._last_marks: list[list[int]] = []
+        self._is_done: bool = False
+        self._action_history: list = []
+        if vm is not None:
+            self._connect_guest(vm)
+    def attach_vm(self, vm: VM) -> None:
+        """Attach a live VM handle after construction (for deferred-launch patterns)."""
+        self._vm = vm
+        self._connect_guest(vm)
+    def _connect_guest(self, vm: VM) -> None:
+        """Parse the VM endpoint and create the GuestAgent HTTP client."""
+        parsed = urlparse(vm.endpoint)
+        # Fall back to localhost:5000 when the endpoint carries no host or port.
+        host = parsed.hostname or "localhost"
+        port = parsed.port or 5000
+        self._guest = GuestAgent(host=host, port=port)
+    def execute_action(self, action: Action) -> Observation | StepError:
+        """Execute action; append full VM observation if observe_after_action=True."""
+        action_obs = super().execute_action(action)
+        # The terminal signals "done" and "fail" never trigger a follow-up observation.
+        # NOTE(review): "+=" is applied to whatever super() returned, including a
+        # StepError — confirm that type supports it.
+        if self.config.observe_after_action and action.name not in ("done", "fail"):
+            action_obs += self.get_observation()
+        return action_obs
+    def get_observation(self) -> Observation:
+        """Read current screen state from the VM and return as Observation."""
+        if self._guest is None:
+            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")
+        # The screenshot is always fetched; the accessibility tree and terminal output
+        # are only requested when enabled in the config.
+        raw_obs = {
+            "screenshot": self._guest.get_screenshot(),
+            "accessibility_tree": self._guest.get_accessibility_tree() if self.config.require_a11y_tree else None,
+            "terminal": self._guest.get_terminal_output() if self.config.require_terminal else None,
+        }
+        return self._convert_observation(raw_obs)
+    def _convert_observation(self, raw_obs: dict) -> Observation:
+        """Convert VM observation dict to a cube Observation."""
+        # Missing or falsy entries (None, empty bytes/str) are left out of the result.
+        contents: list[Content] = []
+        if raw_obs.get("screenshot"):
+            # Screenshot bytes are decoded with PIL and normalised to RGB.
+            img = Image.open(BytesIO(raw_obs["screenshot"])).convert("RGB")
+            contents.append(ImageContent(data=img, name="screenshot"))
+        if raw_obs.get("accessibility_tree"):
+            contents.append(TextContent(data=raw_obs["accessibility_tree"], name="accessibility_tree"))
+        if raw_obs.get("terminal"):
+            contents.append(TextContent(data=raw_obs["terminal"], name="terminal"))
+        return Observation(contents=contents)
+    def _execute_desktop_action(self, action_dict: dict | str) -> str:
+        """Send an action to the guest VM and return a success string."""
+        if self._guest is None:
+            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")
+        # Dicts are sent as structured actions; anything else is sent as a Python command string.
+        if isinstance(action_dict, dict):
+            self._guest.execute_action(action_dict)
+        else:
+            self._guest.execute_python_command(str(action_dict))
+        # Recorded only after the guest call returned without raising.
+        self._action_history.append(action_dict)
+        return "Success"
+    def update_marks(self, marks: list[list[int]]) -> None:
+        """Store SoM bounding-box marks for tag_N variable resolution in run_pyautogui."""
+        self._last_marks = marks
+    def reset(self) -> None:
+        """Reset tool state between tasks (cube AbstractTool.reset() override)."""
+        # The VM handle and guest client are kept; only per-task state is cleared.
+        self._last_marks = []
+        self._is_done = False
+        self._action_history = []
+    def close(self) -> None:
+        """Release tool resources. Does NOT stop the VM — caller owns VM lifecycle."""
+        logger.info("Closing ComputerBase tool")
+    @tool_action
+    def wait(self) -> str:
+        """Wait one step without taking any action."""
+        # Nothing is sent to the guest; the step is only recorded in the history.
+        self._action_history.append("WAIT")
+        return "Success"
+    @tool_action
+    def done(self) -> str:
+        """Signal that the task has been completed successfully."""
+        self._is_done = True
+        self._action_history.append("DONE")
+        return "Task marked as done"
+    @tool_action
+    def fail(self) -> str:
+        """Signal that the task cannot be completed (infeasible or failed)."""
+        # Like done(), this sets _is_done; the two outcomes differ only in the
+        # history entry and the returned message.
+        self._is_done = True
+        self._action_history.append("FAIL")
+        return "Task marked as failed"
+# ---------------------------------------------------------------------------
+# Computer13 — 13 mouse/keyboard primitives
+# ---------------------------------------------------------------------------
+class Computer13(ComputerBase):
+    """
+    Desktop/VM computer tool with the computer_13 action space.
+    Exposes 13 mouse/keyboard primitives as @tool_action methods, plus the
+    shared wait/done/fail terminal signals inherited from ComputerBase.
+    """
+    # Every primitive builds an {"action_type": ..., "parameters": {...}} dict and hands
+    # it to ComputerBase._execute_desktop_action, which returns "Success".
+    # NOTE(review): the method docstrings are presumably consumed by @tool_action to
+    # describe the tool — confirm before rewording them.
+    @tool_action
+    def click(
+        self,
+        button: str = "left",
+        x: int = -1,
+        y: int = -1,
+        num_clicks: int = 1,
+    ) -> str:
+        """Click the mouse button at optional coordinates.
+        Parameters
+        ----------
+        button : str
+            Mouse button — "left", "right", or "middle"
+        x : int
+            X coordinate to click at (-1 = use current cursor position)
+        y : int
+            Y coordinate to click at (-1 = use current cursor position)
+        num_clicks : int
+            Number of clicks (1 for single, 2 for double, etc.)
+        """
+        params: dict = {"button": button, "num_clicks": num_clicks}
+        # Negative coordinates are treated as "not given" and left out of the payload;
+        # x and y are checked independently, so only one of them may be sent.
+        if x >= 0:
+            params["x"] = x
+        if y >= 0:
+            params["y"] = y
+        return self._execute_desktop_action({"action_type": "CLICK", "parameters": params})
+    @tool_action
+    def double_click(self, x: int = -1, y: int = -1) -> str:
+        """Double-click the mouse at optional coordinates.
+        Parameters
+        ----------
+        x : int
+            X coordinate (-1 = use current cursor position)
+        y : int
+            Y coordinate (-1 = use current cursor position)
+        """
+        # Implemented as a CLICK action with num_clicks=2 (left button).
+        return self.click(x=x, y=y, num_clicks=2)
+    @tool_action
+    def right_click(self, x: int = -1, y: int = -1) -> str:
+        """Right-click the mouse at optional coordinates.
+        Parameters
+        ----------
+        x : int
+            X coordinate (-1 = use current cursor position)
+        y : int
+            Y coordinate (-1 = use current cursor position)
+        """
+        # Implemented as a single CLICK action with button="right".
+        return self.click(button="right", x=x, y=y)
+    @tool_action
+    def mouse_down(self, button: str = "left") -> str:
+        """Press and hold a mouse button.
+        Parameters
+        ----------
+        button : str
+            Mouse button — "left", "right", or "middle"
+        """
+        return self._execute_desktop_action({"action_type": "MOUSE_DOWN", "parameters": {"button": button}})
+    @tool_action
+    def mouse_up(self, button: str = "left") -> str:
+        """Release a held mouse button.
+        Parameters
+        ----------
+        button : str
+            Mouse button — "left", "right", or "middle"
+        """
+        return self._execute_desktop_action({"action_type": "MOUSE_UP", "parameters": {"button": button}})
+    @tool_action
+    def move_to(self, x: int, y: int) -> str:
+        """Move the mouse cursor to pixel coordinates without clicking.
+        Parameters
+        ----------
+        x : int
+            Target X coordinate
+        y : int
+            Target Y coordinate
+        """
+        return self._execute_desktop_action({"action_type": "MOVE_TO", "parameters": {"x": x, "y": y}})
+    @tool_action
+    def drag_to(self, x: int, y: int) -> str:
+        """Click-and-drag from the current cursor position to (x, y).
+        Parameters
+        ----------
+        x : int
+            Target X coordinate
+        y : int
+            Target Y coordinate
+        """
+        return self._execute_desktop_action({"action_type": "DRAG_TO", "parameters": {"x": x, "y": y}})
+    @tool_action
+    def scroll(self, dx: int, dy: int) -> str:
+        """Scroll the mouse wheel.
+        Parameters
+        ----------
+        dx : int
+            Horizontal scroll amount (positive = right)
+        dy : int
+            Vertical scroll amount (positive = down)
+        """
+        return self._execute_desktop_action({"action_type": "SCROLL", "parameters": {"dx": dx, "dy": dy}})
+    @tool_action
+    def typing(self, text: str) -> str:
+        """Type text into the currently focused element.
+        Parameters
+        ----------
+        text : str
+            The text to type
+        """
+        return self._execute_desktop_action({"action_type": "TYPING", "parameters": {"text": text}})
+    @tool_action
+    def press(self, key: str) -> str:
+        """Press and release a single key.
+        Parameters
+        ----------
+        key : str
+            Key name (e.g. "enter", "esc", "tab", "backspace", "space")
+        """
+        return self._execute_desktop_action({"action_type": "PRESS", "parameters": {"key": key}})
+    @tool_action
+    def key_down(self, key: str) -> str:
+        """Press a key down without releasing it.
+        Parameters
+        ----------
+        key : str
+            Key name (e.g. "ctrl", "shift", "alt")
+        """
+        return self._execute_desktop_action({"action_type": "KEY_DOWN", "parameters": {"key": key}})
+    @tool_action
+    def key_up(self, key: str) -> str:
+        """Release a previously held key.
+        Parameters
+        ----------
+        key : str
+            Key name (e.g. "ctrl", "shift", "alt")
+        """
+        return self._execute_desktop_action({"action_type": "KEY_UP", "parameters": {"key": key}})
+    @tool_action
+    def hotkey(self, keys: str) -> str:
+        """Press a key combination simultaneously (e.g. Ctrl+C).
+        Parameters
+        ----------
+        keys : str
+            Key names joined by '+' (e.g. "ctrl+c", "ctrl+shift+t")
+        """
+        # A string is split on "+" into a list of key names; a non-string value is passed
+        # through unchanged. Because "+" is the separator, a combination that contains
+        # the "+" key itself yields empty entries in the list.
+        if isinstance(keys, str):
+            keys = keys.split("+")
+        return self._execute_desktop_action({"action_type": "HOTKEY", "parameters": {"keys": keys}})
+# ---------------------------------------------------------------------------
+# PyAutoGUIComputer — pyautogui code execution action space
+# ---------------------------------------------------------------------------
+class PyAutoGUIComputer(ComputerBase):
+    """
+    Desktop/VM computer tool with the pyautogui action space.
+    Exposes run_pyautogui() as a @tool_action method, plus the shared
+    wait/done/fail terminal signals inherited from ComputerBase.
+    The agent writes Python code using pyautogui; SoM tag_N variables
+    (center coordinates of numbered bounding boxes) are prepended automatically
+    so agents can reference screen elements by index.
+    """
+    @tool_action
+    def run_pyautogui(self, code: str) -> str:
+        """Execute Python code using pyautogui in the VM.
+        Parameters
+        ----------
+        code : str
+            Python code to execute (e.g. "pyautogui.click(100, 200)"). If SoM
+            bounding boxes are available, tag_1, tag_2, ... variables are
+            prepended as center coordinates (e.g. "pyautogui.click(*tag_3)").
+        """
+        if self._guest is None:
+            raise RuntimeError("No VM attached — call attach_vm() or pass vm= to ComputerConfig.make()")
+        tag_vars = ""
+        for i, mark in enumerate(self._last_marks):
+            x, y, w, h = mark
+            tag_vars += f"tag_{i + 1} = ({int(x + w // 2)}, {int(y + h // 2)})\n"
+        fixed_code = fix_pyautogui_less_than_bug(tag_vars + code)
+        result = self._guest.execute_python_command(fixed_code)
+        time.sleep(2)  # replicate desktop_env.step()'s default pause
+        if result:
+            returncode = result.get("returncode", 0)
+            error = result.get("error", "") or result.get("stderr", "")
+            if returncode != 0 and error:
+                return f"Error executing code:\n{error.strip()}"
+        self._action_history.append(code)
+        return "Success"

cube_computer_tool-0.1.0/src/cube_computer_tool/guest_agent.py ADDED Viewed

@@ -0,0 +1,426 @@
+"""HTTP client for the Flask guest agent running inside a desktop VM.
+The guest agent server runs at port 5000 inside the VM and exposes endpoints
+for screenshots, accessibility trees, command execution, and file I/O.
+Originally ported from desktop_env.controllers.python.PythonController.
+"""
+import json
+import logging
+import random
+import time
+import traceback
+from typing import Any
+import requests
+logger = logging.getLogger(__name__)
+# Key names accepted by GuestAgent.execute_action for PRESS / KEY_DOWN / KEY_UP / HOTKEY.
+# Membership is checked against the lower-cased key name.
+# fmt: off
+_KEYBOARD_KEYS = [
+    "\t", "\n", "\r", " ", "!", '"', "#", "$", "%", "&", "'", "(", ")", "*",
+    "+", ",", "-", ".", "/", "0", "1", "2", "3", "4", "5", "6", "7", "8",
+    "9", ":", ";", "<", "=", ">", "?", "@", "[", "\\", "]", "^", "_", "`",
+    "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
+    "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "{", "|",
+    "}", "~", "accept", "add", "alt", "altleft", "altright", "apps",
+    "backspace", "browserback", "browserfavorites", "browserforward",
+    "browserhome", "browserrefresh", "browsersearch", "browserstop",
+    "capslock", "clear", "convert", "ctrl", "ctrlleft", "ctrlright",
+    "decimal", "del", "delete", "divide", "down", "end", "enter", "esc",
+    "escape", "execute", "f1", "f2", "f3", "f4", "f5", "f6", "f7", "f8",
+    "f9", "f10", "f11", "f12", "f13", "f14", "f15", "f16", "f17", "f18",
+    "f19", "f20", "final", "fn", "hanguel", "hangul", "hanja", "help",
+    "home", "insert", "junja", "kana", "kanji", "launchapp1", "launchapp2",
+    "launchmail", "launchmediaselect", "left", "modechange", "multiply",
+    "nexttrack", "nonconvert", "num0", "num1", "num2", "num3", "num4",
+    "num5", "num6", "num7", "num8", "num9", "numlock", "pagedown", "pageup",
+    "pause", "pgdn", "pgup", "playpause", "prevtrack", "print", "printscreen",
+    "prntscrn", "prtsc", "prtscr", "return", "right", "scrolllock", "select",
+    "separator", "shift", "shiftleft", "shiftright", "sleep", "space", "stop",
+    "subtract", "tab", "up", "volumedown", "volumemute", "volumeup", "win",
+    "winleft", "winright", "yen", "command", "option", "optionleft",
+    "optionright",
+]
+# fmt: on
+# Template used by execute_python_command: imports pyautogui and time and sets
+# pyautogui.FAILSAFE = False before the caller's command, which is appended after the last ";".
+_PYAUTOGUI_PREFIX = "import pyautogui; import time; pyautogui.FAILSAFE = False; {command}"
+# Number of attempts each GuestAgent request loop makes before giving up.
+_RETRY_TIMES = 3
+# Seconds passed to time.sleep() between attempts.
+_RETRY_INTERVAL = 5
+class GuestAgent:
+    """HTTP client for the Flask agent server running inside a desktop VM.
+    Parameters
+    ----------
+    host : str
+        Hostname or IP of the server (typically "localhost" with port-forwarded QEMU).
+    port : int
+        Host port mapped to the guest's Flask server (default 5000).
+    """
+    def __init__(self, host: str = "localhost", port: int = 5000) -> None:
+        # Keep the raw endpoint parts and precompute the URL every request method is built on.
+        self.host, self.port = host, port
+        self._base_url = f"http://{host}:{port}"
+    # ------------------------------------------------------------------
+    # Observation retrieval
+    # ------------------------------------------------------------------
+    def get_screenshot(self) -> bytes | None:
+        """Return raw PNG/JPEG bytes of the current screen, or None on failure.
+        Makes up to _RETRY_TIMES GET requests to /screenshot (10 s timeout each). A response is
+        accepted only when its status is 200 and _is_valid_image() approves the payload.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.get(self._base_url + "/screenshot", timeout=10)
+                if resp.status_code == 200 and self._is_valid_image(resp.headers.get("Content-Type", ""), resp.content):
+                    return resp.content
+                logger.error("Invalid screenshot response (attempt %d/%d)", attempt, _RETRY_TIMES)
+            except Exception as exc:
+                logger.error("Screenshot error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        logger.error("Failed to get screenshot after %d attempts", _RETRY_TIMES)
+        return None
+    def get_accessibility_tree(self) -> str | None:
+        """Return the XML accessibility tree string, or None on failure.
+        Makes up to _RETRY_TIMES GET requests to /accessibility and returns the "AT" field of the
+        first 200 JSON response. Other statuses and any exception are logged and retried.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                # NOTE(review): no timeout is passed, so an unresponsive guest blocks here — confirm intended
+                resp = requests.get(self._base_url + "/accessibility")
+                if resp.status_code == 200:
+                    return resp.json()["AT"]
+                logger.error("Accessibility tree error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Accessibility tree error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        logger.error("Failed to get accessibility tree")
+        return None
+    def get_terminal_output(self) -> str | None:
+        """Return the terminal output string, or None on failure.
+        Makes up to _RETRY_TIMES GET requests to /terminal and returns the "output" field of the
+        first 200 JSON response. Other statuses and any exception are logged and retried.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.get(self._base_url + "/terminal")
+                if resp.status_code == 200:
+                    return resp.json()["output"]
+                logger.error("Terminal output error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Terminal output error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        logger.error("Failed to get terminal output")
+        return None
+    def get_file(self, file_path: str) -> bytes | None:
+        """Download a file from the VM by path, or None on failure.
+        POSTs the path as form field "file_path" to /file, up to _RETRY_TIMES times, and returns
+        the body of the first 200 response. Other statuses and any exception are logged and retried.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(self._base_url + "/file", data={"file_path": file_path})
+                if resp.status_code == 200:
+                    return resp.content
+                logger.error("Get file error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Get file error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        logger.error("Failed to get file: %s", file_path)
+        return None
+    # ------------------------------------------------------------------
+    # Command execution
+    # ------------------------------------------------------------------
+    def execute_python_command(self, command: str) -> dict[str, Any] | None:
+        """Execute a Python command via pyautogui prefix inside the VM.
+        The command is substituted into _PYAUTOGUI_PREFIX and sent to /execute as
+        ["python", "-c", <code>] with shell=False and a 90 s timeout.
+        Returns
+        -------
+        dict or None
+            The decoded JSON body of the first 200 response, or None when every attempt failed
+            or the request hit a read timeout.
+        """
+        command_list = ["python", "-c", _PYAUTOGUI_PREFIX.format(command=command)]
+        payload = json.dumps({"command": command_list, "shell": False})
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(
+                    self._base_url + "/execute",
+                    headers={"Content-Type": "application/json"},
+                    data=payload,
+                    timeout=90,
+                )
+                if resp.status_code == 200:
+                    return resp.json()
+                logger.error("Execute python error: %d", resp.status_code)
+            except requests.exceptions.ReadTimeout:
+                # a read timeout ends the loop without re-sending the command
+                break
+            except Exception as exc:
+                logger.error("Execute python error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        logger.error("Failed to execute python command")
+        return None
+    def run_python_script(self, script: str) -> dict[str, Any] | None:
+        """Execute a Python script file inside the VM via /run_python.
+        Returns
+        -------
+        dict
+            The decoded JSON body on a 200 response. Otherwise a dict with "status": "error" plus
+            "message", "output" and "error" keys: immediately for a non-200 response or a read
+            timeout, or after _RETRY_TIMES attempts that each raised another exception.
+        """
+        payload = json.dumps({"code": script})
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(
+                    self._base_url + "/run_python",
+                    headers={"Content-Type": "application/json"},
+                    data=payload,
+                    timeout=90,
+                )
+                if resp.status_code == 200:
+                    return resp.json()
+                try:
+                    server_error = resp.json().get("error")
+                except (ValueError, AttributeError):
+                    # the error body is not a JSON object; report the raw text instead of retrying
+                    server_error = resp.text
+                return {
+                    "status": "error",
+                    "message": "Request failed",
+                    "output": None,
+                    "error": server_error,
+                }
+            except requests.exceptions.ReadTimeout:
+                # not retried; report the timeout itself rather than an exhausted retry budget
+                return {"status": "error", "message": "Request timed out", "output": "", "error": "Timed out after 90s."}
+            except Exception:
+                logger.error("Run python script error: %s", traceback.format_exc())
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return {"status": "error", "message": "Retry limit reached", "output": "", "error": "Retry limit reached."}
+    def run_bash_script(self, script: str, timeout: int = 30, working_dir: str | None = None) -> dict[str, Any] | None:
+        """Execute a bash script inside the VM via /run_bash_script.
+        Parameters
+        ----------
+        script : str
+            Script text sent in the "script" field.
+        timeout : int
+            Value sent in the "timeout" field; the HTTP request itself waits timeout + 100 seconds.
+        working_dir : str or None
+            Value sent in the "working_dir" field.
+        Returns
+        -------
+        dict
+            The decoded JSON body on a 200 response, otherwise a dict with "status": "error",
+            empty "output", an "error" message and "returncode": -1.
+        """
+        payload = json.dumps({"script": script, "timeout": timeout, "working_dir": working_dir})
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(
+                    self._base_url + "/run_bash_script",
+                    headers={"Content-Type": "application/json"},
+                    data=payload,
+                    timeout=timeout + 100,
+                )
+                if resp.status_code == 200:
+                    return resp.json()
+                logger.error("Run bash script error: %d %s", resp.status_code, resp.text)
+            except requests.exceptions.ReadTimeout:
+                # a read timeout is reported immediately and never retried
+                return {"status": "error", "output": "", "error": f"Timed out after {timeout}s", "returncode": -1}
+            except Exception as exc:
+                logger.error("Run bash script error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return {"status": "error", "output": "", "error": f"Failed after {_RETRY_TIMES} retries", "returncode": -1}
+    def execute_action(self, action: dict[str, Any]) -> None:
+        """Dispatch a computer_13 action dict to the appropriate pyautogui command.
+        Mirrors the dispatch table in desktop_env.controllers.python.PythonController.execute_action.
+        Every string argument (button names, key names, typed text) is embedded with repr(), so
+        quotes, backslashes and control characters such as "'", "\\" or a newline key still yield
+        a valid Python literal in the generated command.
+        Parameters
+        ----------
+        action : dict
+            Must contain "action_type". Arguments are read from action["parameters"] when that is
+            present and non-empty, otherwise from the remaining top-level keys. The bare strings
+            "WAIT", "FAIL" and "DONE" are also accepted and ignored.
+        Raises
+        ------
+        ValueError
+            For an unknown action type, an unsupported MOVE_TO/CLICK parameter combination, a key
+            name that is not in _KEYBOARD_KEYS, or HOTKEY keys that are not a list.
+        KeyError
+            If a required parameter ("x"/"y" for DRAG_TO, "text" for TYPING, "key", "keys") is missing.
+        """
+        if action in ("WAIT", "FAIL", "DONE"):
+            return
+        action_type: str = action["action_type"]
+        parameters: dict = action.get("parameters") or {k: v for k, v in action.items() if k != "action_type"}
+        run = self.execute_python_command
+        has_xy = "x" in parameters and "y" in parameters
+        if action_type == "MOVE_TO":
+            if not parameters:
+                run("pyautogui.moveTo()")
+            elif has_xy:
+                # easing function and duration are randomized per move; nothing else needs them
+                move_mode = random.choice(
+                    [
+                        "pyautogui.easeInQuad",
+                        "pyautogui.easeOutQuad",
+                        "pyautogui.easeInOutQuad",
+                        "pyautogui.easeInBounce",
+                        "pyautogui.easeInElastic",
+                    ]
+                )
+                duration = random.uniform(0.5, 1)
+                run(f"pyautogui.moveTo({parameters['x']}, {parameters['y']}, {duration}, {move_mode})")
+            else:
+                raise ValueError(f"Unknown MOVE_TO parameters: {parameters}")
+        elif action_type == "CLICK":
+            if not parameters:
+                run("pyautogui.click()")
+            elif "button" in parameters or has_xy:
+                # keyword order matches the original commands: button, then x/y, then clicks
+                click_args = []
+                if "button" in parameters:
+                    click_args.append(f"button={str(parameters['button'])!r}")
+                if has_xy:
+                    click_args.append(f"x={parameters['x']}, y={parameters['y']}")
+                num_clicks = parameters.get("num_clicks")
+                if num_clicks:
+                    click_args.append(f"clicks={num_clicks}")
+                run("pyautogui.click(" + ", ".join(click_args) + ")")
+            else:
+                raise ValueError(f"Unknown CLICK parameters: {parameters}")
+        elif action_type in ("MOUSE_DOWN", "MOUSE_UP"):
+            func = "mouseDown" if action_type == "MOUSE_DOWN" else "mouseUp"
+            button = parameters.get("button", "left")
+            run(f"pyautogui.{func}(button={str(button)!r})")
+        elif action_type in ("RIGHT_CLICK", "DOUBLE_CLICK"):
+            func = "rightClick" if action_type == "RIGHT_CLICK" else "doubleClick"
+            if has_xy:
+                run(f"pyautogui.{func}(x={parameters['x']}, y={parameters['y']})")
+            else:
+                run(f"pyautogui.{func}()")
+        elif action_type == "DRAG_TO":
+            run(f"pyautogui.dragTo({parameters['x']}, {parameters['y']}, duration=1.0, button='left', mouseDownUp=True)")
+        elif action_type == "SCROLL":
+            dx = parameters.get("dx")
+            dy = parameters.get("dy")
+            if dx is not None:
+                run(f"pyautogui.hscroll({dx})")
+            if dy is not None:
+                run(f"pyautogui.vscroll({dy})")
+        elif action_type == "TYPING":
+            run(f"pyautogui.typewrite({parameters['text']!r})")
+        elif action_type in ("PRESS", "KEY_DOWN", "KEY_UP"):
+            key = parameters["key"]
+            if key.lower() not in _KEYBOARD_KEYS:
+                raise ValueError(f"Key must be one of the known keyboard keys, got: {key!r}")
+            func = {"PRESS": "press", "KEY_DOWN": "keyDown", "KEY_UP": "keyUp"}[action_type]
+            run(f"pyautogui.{func}({key!r})")
+        elif action_type == "HOTKEY":
+            keys: list[str] = parameters["keys"]
+            if not isinstance(keys, list):
+                raise ValueError("HOTKEY keys must be a list")
+            for k in keys:
+                if k.lower() not in _KEYBOARD_KEYS:
+                    raise ValueError(f"Key must be one of the known keyboard keys, got: {k!r}")
+            run("pyautogui.hotkey(" + ", ".join(repr(k) for k in keys) + ")")
+        elif action_type in ("WAIT", "FAIL", "DONE"):
+            pass
+        else:
+            raise ValueError(f"Unknown action type: {action_type!r}")
+    # ------------------------------------------------------------------
+    # VM info
+    # ------------------------------------------------------------------
+    def get_vm_platform(self) -> str:
+        """Return the platform string (e.g. 'Linux', 'Windows').
+        Runs platform.system() inside the VM and returns its stripped output, or an empty
+        string when the command produced no result or no output.
+        """
+        response = self.execute_python_command("import platform; print(platform.system())")
+        if not response:
+            return ""
+        stdout = response.get("output")
+        return stdout.strip() if stdout else ""
+    def get_vm_screen_size(self) -> dict[str, Any] | None:
+        """Return the VM screen size dict.
+        POSTs to /screen_size up to _RETRY_TIMES times and returns the decoded JSON body of the
+        first 200 response, or None when every attempt failed.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(self._base_url + "/screen_size")
+                if resp.status_code == 200:
+                    return resp.json()
+                logger.error("Screen size error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Screen size error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return None
+    def get_vm_window_size(self, app_class_name: str) -> dict[str, Any] | None:
+        """Return the window size for an application by class name.
+        POSTs form field "app_class_name" to /window_size up to _RETRY_TIMES times and returns
+        the decoded JSON body of the first 200 response, or None when every attempt failed.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(self._base_url + "/window_size", data={"app_class_name": app_class_name})
+                if resp.status_code == 200:
+                    return resp.json()
+                logger.error("Window size error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Window size error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return None
+    def get_vm_desktop_path(self) -> str | None:
+        """Return the desktop directory path inside the VM.
+        POSTs to /desktop_path up to _RETRY_TIMES times and returns the "desktop_path" field of
+        the first 200 JSON response, or None when every attempt failed.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(self._base_url + "/desktop_path")
+                if resp.status_code == 200:
+                    return resp.json()["desktop_path"]
+                logger.error("Desktop path error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Desktop path error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return None
+    def get_vm_directory_tree(self, path: str) -> dict[str, Any] | None:
+        """Return the directory tree for the given path inside the VM.
+        POSTs {"path": path} as JSON to /list_directory up to _RETRY_TIMES times and returns the
+        "directory_tree" field of the first 200 JSON response, or None when every attempt failed.
+        """
+        payload = json.dumps({"path": path})
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(
+                    self._base_url + "/list_directory",
+                    headers={"Content-Type": "application/json"},
+                    data=payload,
+                )
+                if resp.status_code == 200:
+                    return resp.json()["directory_tree"]
+                logger.error("Directory tree error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Directory tree error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return None
+    def get_vm_wallpaper(self) -> bytes | None:
+        """Return the current desktop wallpaper image bytes.
+        POSTs to /wallpaper up to _RETRY_TIMES times and returns the body of the first 200
+        response, or None when every attempt failed.
+        """
+        for attempt in range(1, _RETRY_TIMES + 1):
+            try:
+                resp = requests.post(self._base_url + "/wallpaper")
+                if resp.status_code == 200:
+                    return resp.content
+                logger.error("Wallpaper error: %d", resp.status_code)
+            except Exception as exc:
+                logger.error("Wallpaper error: %s", exc)
+            if attempt < _RETRY_TIMES:
+                # back off only when another attempt follows; do not delay the final failure
+                time.sleep(_RETRY_INTERVAL)
+        return None
+    # ------------------------------------------------------------------
+    # Internal helpers
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _is_valid_image(content_type: str, data: bytes | None) -> bool:
+        """Return True when data looks like a PNG or JPEG image.
+        Non-empty bytes/bytearray data is accepted if it starts with the PNG or JPEG magic
+        bytes, or failing that if content_type mentions a PNG/JPEG MIME type.
+        """
+        if not (isinstance(data, (bytes, bytearray)) and data):
+            return False
+        png_magic = b"\x89PNG\r\n\x1a\n"
+        jpeg_magic = b"\xff\xd8\xff"
+        if data.startswith((png_magic, jpeg_magic)):
+            return True
+        # unknown signature: fall back to what the server declared
+        known_mimes = ("image/png", "image/jpeg", "image/jpg")
+        return bool(content_type) and any(mime in content_type for mime in known_mimes)

cube_computer_tool-0.1.0/src/cube_computer_tool/pyautogui_utils.py ADDED Viewed

@@ -0,0 +1,64 @@
+"""PyAutoGUI utilities for desktop VM tool execution."""
+import re
+def fix_pyautogui_less_than_bug(command: str) -> str:
+    """Fix PyAutoGUI '<' character bug by converting it to hotkey("shift", ',') calls.
+    This fixes the known PyAutoGUI issue where typing '<' produces '>' instead.
+    Two call shapes are rewritten: press('<') / press('\\u003c'), and typewrite('...') with a
+    single quoted string argument whose text contains '<'. A typewrite call is split into
+    typewrite / hotkey / typewrite statements joined with "; ". Calls without '<' are left
+    untouched.
+    References:
+    - https://github.com/asweigart/pyautogui/issues/198
+    - https://github.com/xlang-ai/OSWorld/issues/257
+    Parameters
+    ----------
+    command : str
+        The original pyautogui command string.
+    Returns
+    -------
+    str
+        The fixed command with '<' characters handled properly.
+    """
+    hotkey_call = 'pyautogui.hotkey("shift", ",")'
+    # Pattern to match press('<') or press('\u003c') calls
+    press_pattern = r'pyautogui\.press\(["\'](?:<|\\u003c)["\']\)'
+    command = re.sub(press_pattern, lambda _match: hotkey_call, command)
+    # Pattern to match typewrite calls with quoted strings
+    typewrite_pattern = r'pyautogui\.typewrite\((["\'])(.*?)\1\)'
+    def process_typewrite_match(match: re.Match) -> str:
+        quote_char = match.group(1)
+        content = match.group(2)
+        try:
+            # latin-1 + backslashreplace keeps non-ASCII text intact through unicode_escape;
+            # encoding as UTF-8 first would turn e.g. "é" into mojibake.
+            content = content.encode("latin-1", "backslashreplace").decode("unicode_escape")
+            decoded = True
+        except UnicodeDecodeError:
+            decoded = False
+        if "<" not in content:
+            return match.group(0)
+        def typewrite_call(text: str) -> str:
+            if decoded:
+                # escapes were resolved above, so re-escape via repr() to emit a valid literal
+                return f"pyautogui.typewrite({text!r})"
+            # undecodable source text is passed through verbatim inside its original quotes
+            return f"pyautogui.typewrite({quote_char}{text}{quote_char})"
+        calls = []
+        for index, part in enumerate(content.split("<")):
+            if index:
+                # every split point stands for one '<' keystroke
+                calls.append(hotkey_call)
+            if part:
+                calls.append(typewrite_call(part))
+        return "; ".join(calls)
+    command = re.sub(typewrite_pattern, process_typewrite_match, command)
+    return command