PyPI - autoglm-gui - Versions diffs - 0.4.14__py3-none-any.whl → 1.0.1__py3-none-any.whl - Mend

autoglm-gui 0.4.14__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (41) hide show

AutoGLM_GUI/api/devices.py +49 -0
AutoGLM_GUI/schemas.py +16 -0
AutoGLM_GUI/static/assets/{about-29B5FDM8.js → about-BOnRPlKQ.js} +1 -1
AutoGLM_GUI/static/assets/chat-CGW6uMKB.js +149 -0
AutoGLM_GUI/static/assets/{index-mVNV0VwM.js → index-CRFVU0eu.js} +1 -1
AutoGLM_GUI/static/assets/{index-wu8Wjf12.js → index-DH-Dl4tK.js} +5 -5
AutoGLM_GUI/static/assets/index-DzUQ89YC.css +1 -0
AutoGLM_GUI/static/index.html +2 -2
{autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/METADATA +3 -3
autoglm_gui-1.0.1.dist-info/RECORD +73 -0
phone_agent/__init__.py +3 -2
phone_agent/actions/handler.py +124 -31
phone_agent/actions/handler_ios.py +278 -0
phone_agent/adb/connection.py +14 -5
phone_agent/adb/device.py +47 -16
phone_agent/agent.py +8 -8
phone_agent/agent_ios.py +277 -0
phone_agent/config/__init__.py +18 -0
phone_agent/config/apps.py +1 -1
phone_agent/config/apps_harmonyos.py +256 -0
phone_agent/config/apps_ios.py +339 -0
phone_agent/config/i18n.py +8 -0
phone_agent/config/timing.py +167 -0
phone_agent/device_factory.py +166 -0
phone_agent/hdc/__init__.py +53 -0
phone_agent/hdc/connection.py +384 -0
phone_agent/hdc/device.py +269 -0
phone_agent/hdc/input.py +145 -0
phone_agent/hdc/screenshot.py +127 -0
phone_agent/model/client.py +104 -4
phone_agent/xctest/__init__.py +47 -0
phone_agent/xctest/connection.py +379 -0
phone_agent/xctest/device.py +472 -0
phone_agent/xctest/input.py +311 -0
phone_agent/xctest/screenshot.py +226 -0
AutoGLM_GUI/static/assets/chat-DTN2oKtA.js +0 -149
AutoGLM_GUI/static/assets/index-Dy550Qqg.css +0 -1
autoglm_gui-0.4.14.dist-info/RECORD +0 -57
{autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/WHEEL +0 -0
{autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/entry_points.txt +0 -0
{autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/licenses/LICENSE +0 -0

phone_agent/actions/handler_ios.py ADDED Viewed

@@ -0,0 +1,278 @@
+"""Action handler for iOS automation using WebDriverAgent."""
+import time
+from dataclasses import dataclass
+from typing import Any, Callable
+from phone_agent.xctest import (
+    back,
+    double_tap,
+    home,
+    launch_app,
+    long_press,
+    swipe,
+    tap,
+)
+from phone_agent.xctest.input import clear_text, hide_keyboard, type_text
+@dataclass
+class ActionResult:
+    """Result of an action execution."""
+    success: bool  # False when the action was unknown, failed, or was cancelled
+    should_finish: bool  # True for "finish", unknown action types and cancelled taps; presumably stops the caller's loop - confirm against caller
+    message: str | None = None  # optional human-readable detail (finish text or failure reason)
+    requires_confirmation: bool = False  # NOTE(review): never set by the code visible in this diff - confirm who reads it
+class IOSActionHandler:
+    """
+    Handles execution of actions from AI model output for iOS devices.
+    Every device operation is delegated to the phone_agent.xctest helpers,
+    which receive the configured WebDriverAgent URL and session ID.
+    Args:
+        wda_url: WebDriverAgent URL.
+        session_id: Optional WDA session ID.
+        confirmation_callback: Optional callback for sensitive action confirmation.
+            Should return True to proceed, False to cancel.
+        takeover_callback: Optional callback for takeover requests (login, captcha).
+    """
+    def __init__(
+        self,
+        wda_url: str = "http://localhost:8100",
+        session_id: str | None = None,
+        confirmation_callback: Callable[[str], bool] | None = None,
+        takeover_callback: Callable[[str], None] | None = None,
+    ):
+        self.wda_url = wda_url
+        self.session_id = session_id
+        # Fall back to the console-based prompts when no callback is supplied.
+        self.confirmation_callback = confirmation_callback or self._default_confirmation
+        self.takeover_callback = takeover_callback or self._default_takeover
+    def execute(
+        self, action: dict[str, Any], screen_width: int, screen_height: int
+    ) -> ActionResult:
+        """
+        Execute an action from the AI model.
+        Args:
+            action: The action dictionary from the model. Its "_metadata" key
+                selects "finish" or "do"; for "do" the "action" key names the
+                handler to run.
+            screen_width: Current screen width in pixels.
+            screen_height: Current screen height in pixels.
+        Returns:
+            ActionResult indicating success and whether to finish. Exceptions
+            raised by a handler are converted into a failed, non-finishing result.
+        """
+        action_type = action.get("_metadata")
+        if action_type == "finish":
+            return ActionResult(
+                success=True, should_finish=True, message=action.get("message")
+            )
+        if action_type != "do":
+            return ActionResult(
+                success=False,
+                should_finish=True,
+                message=f"Unknown action type: {action_type}",
+            )
+        action_name = action.get("action")
+        handler_method = self._get_handler(action_name)
+        if handler_method is None:
+            return ActionResult(
+                success=False,
+                should_finish=False,
+                message=f"Unknown action: {action_name}",
+            )
+        try:
+            return handler_method(action, screen_width, screen_height)
+        except Exception as e:
+            # Boundary handler: a failing device call must not abort the caller;
+            # the error text is reported through the result instead.
+            return ActionResult(
+                success=False, should_finish=False, message=f"Action failed: {e}"
+            )
+    def _get_handler(self, action_name: str | None) -> Callable | None:
+        """Get the handler method for an action, or None if the name is unknown."""
+        handlers = {
+            "Launch": self._handle_launch,
+            "Tap": self._handle_tap,
+            "Type": self._handle_type,
+            "Type_Name": self._handle_type,
+            "Swipe": self._handle_swipe,
+            "Back": self._handle_back,
+            "Home": self._handle_home,
+            "Double Tap": self._handle_double_tap,
+            "Long Press": self._handle_long_press,
+            "Wait": self._handle_wait,
+            "Take_over": self._handle_takeover,
+            "Note": self._handle_note,
+            "Call_API": self._handle_call_api,
+            "Interact": self._handle_interact,
+        }
+        return handlers.get(action_name)
+    @staticmethod
+    def _is_point(value: Any) -> bool:
+        """Return True if value is a list/tuple holding at least an x and a y."""
+        return isinstance(value, (list, tuple)) and len(value) >= 2
+    def _convert_relative_to_absolute(
+        self, element: list[int], screen_width: int, screen_height: int
+    ) -> tuple[int, int]:
+        """Convert relative coordinates (0-1000) to absolute pixels."""
+        x = int(element[0] / 1000 * screen_width)
+        y = int(element[1] / 1000 * screen_height)
+        return x, y
+    def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle app launch action; fails when no app name is given or launch_app returns falsy."""
+        app_name = action.get("app")
+        if not app_name:
+            return ActionResult(False, False, "No app name specified")
+        success = launch_app(app_name, wda_url=self.wda_url, session_id=self.session_id)
+        if success:
+            return ActionResult(True, False)
+        return ActionResult(False, False, f"App not found: {app_name}")
+    def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
+        """
+        Handle tap action.
+        When the action carries a "message", the confirmation callback is asked
+        first; a refusal cancels the tap and finishes the task.
+        """
+        element = action.get("element")
+        if not self._is_point(element):
+            return ActionResult(False, False, "No element coordinates")
+        x, y = self._convert_relative_to_absolute(element, width, height)
+        # Check for sensitive operation
+        if "message" in action:
+            if not self.confirmation_callback(action["message"]):
+                return ActionResult(
+                    success=False,
+                    should_finish=True,
+                    message="User cancelled sensitive operation",
+                )
+        # Log only once the tap is really going to happen, so a cancelled
+        # operation is not reported as performed.
+        print(f"Physically tap on ({x}, {y})")
+        tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
+        return ActionResult(True, False)
+    def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
+        """
+        Handle text input action.
+        Clears the current text, types action["text"] (empty string if absent)
+        and hides the keyboard, pausing 0.5 s after each step.
+        """
+        text = action.get("text", "")
+        # Clear existing text and type new text
+        clear_text(wda_url=self.wda_url, session_id=self.session_id)
+        time.sleep(0.5)
+        type_text(text, wda_url=self.wda_url, session_id=self.session_id)
+        time.sleep(0.5)
+        # Hide keyboard after typing
+        hide_keyboard(wda_url=self.wda_url, session_id=self.session_id)
+        time.sleep(0.5)
+        return ActionResult(True, False)
+    def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle swipe action between the relative "start" and "end" points."""
+        start = action.get("start")
+        end = action.get("end")
+        if not self._is_point(start) or not self._is_point(end):
+            return ActionResult(False, False, "Missing swipe coordinates")
+        start_x, start_y = self._convert_relative_to_absolute(start, width, height)
+        end_x, end_y = self._convert_relative_to_absolute(end, width, height)
+        print(f"Physically scroll from ({start_x}, {start_y}) to ({end_x}, {end_y})")
+        swipe(
+            start_x,
+            start_y,
+            end_x,
+            end_y,
+            wda_url=self.wda_url,
+            session_id=self.session_id,
+        )
+        return ActionResult(True, False)
+    def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle back navigation by calling the xctest back helper."""
+        back(wda_url=self.wda_url, session_id=self.session_id)
+        return ActionResult(True, False)
+    def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle home button action."""
+        home(wda_url=self.wda_url, session_id=self.session_id)
+        return ActionResult(True, False)
+    def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle double tap action."""
+        element = action.get("element")
+        if not self._is_point(element):
+            return ActionResult(False, False, "No element coordinates")
+        x, y = self._convert_relative_to_absolute(element, width, height)
+        double_tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
+        return ActionResult(True, False)
+    def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle long press action."""
+        element = action.get("element")
+        if not self._is_point(element):
+            return ActionResult(False, False, "No element coordinates")
+        x, y = self._convert_relative_to_absolute(element, width, height)
+        # NOTE(review): duration is presumably in seconds - confirm against
+        # phone_agent.xctest.long_press.
+        long_press(
+            x,
+            y,
+            duration=3.0,
+            wda_url=self.wda_url,
+            session_id=self.session_id,
+        )
+        return ActionResult(True, False)
+    def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
+        """
+        Handle wait action.
+        Accepts a number or a string such as "2 seconds", "2 second" or "2s".
+        Anything unparsable, negative or non-finite falls back to 1 second.
+        """
+        # str() lets numeric durations through instead of failing on .replace().
+        text = str(action.get("duration", "1 seconds")).strip().lower()
+        for unit in ("seconds", "second", "secs", "sec", "s"):
+            if text.endswith(unit):
+                text = text[: -len(unit)].strip()
+                break
+        try:
+            duration = float(text)
+        except ValueError:
+            duration = 1.0
+        # time.sleep() rejects negative, NaN and infinite values; NaN fails
+        # every comparison, so this single range test covers all three.
+        if not (0 <= duration < float("inf")):
+            duration = 1.0
+        time.sleep(duration)
+        return ActionResult(True, False)
+    def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle takeover request (login, captcha, etc.) by invoking the takeover callback."""
+        message = action.get("message", "User intervention required")
+        self.takeover_callback(message)
+        return ActionResult(True, False)
+    def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle note action (placeholder for content recording)."""
+        # This action is typically used for recording page content
+        # Implementation depends on specific requirements
+        return ActionResult(True, False)
+    def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle API call action (placeholder for summarization)."""
+        # This action is typically used for content summarization
+        # Implementation depends on specific requirements
+        return ActionResult(True, False)
+    def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
+        """Handle interaction request (user choice needed)."""
+        # This action signals that user input is needed
+        return ActionResult(True, False, message="User interaction required")
+    @staticmethod
+    def _default_confirmation(message: str) -> bool:
+        """Default confirmation callback using console input; accepts Y or YES, any case."""
+        response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
+        # Ignore surrounding whitespace so " y" or "y " still counts as consent.
+        return response.strip().upper() in {"Y", "YES"}
+    @staticmethod
+    def _default_takeover(message: str) -> None:
+        """Default takeover callback using console input."""
+        input(f"{message}\nPress Enter after completing manual operation...")

phone_agent/adb/connection.py CHANGED Viewed

@@ -5,6 +5,8 @@ import time
 from dataclasses import dataclass
 from enum import Enum
+from phone_agent.config.timing import TIMING_CONFIG
 class ConnectionType(Enum):
     """Type of ADB connection."""
@@ -106,7 +108,9 @@ class ADBConnection:
             if address:
                 cmd.append(address)
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
+            )
             output = result.stdout + result.stderr
             return True, output.strip() or "Disconnected"
@@ -238,12 +242,14 @@ class ADBConnection:
                 cmd.extend(["-s", device_id])
             cmd.extend(["tcpip", str(port)])
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=10
+            )
             output = result.stdout + result.stderr
             if "restarting" in output.lower() or result.returncode == 0:
-                time.sleep(2)  # Wait for ADB to restart
+                time.sleep(TIMING_CONFIG.connection.adb_restart_delay)
                 return True, f"TCP/IP mode enabled on port {port}"
             else:
                 return False, output.strip()
@@ -267,7 +273,9 @@ class ADBConnection:
                 cmd.extend(["-s", device_id])
             cmd.extend(["shell", "ip", "route"])
-            result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
+            result = subprocess.run(
+                cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
+            )
             # Parse IP from route output
             for line in result.stdout.split("\n"):
@@ -283,6 +291,7 @@ class ADBConnection:
                 cmd[:-1] + ["shell", "ip", "addr", "show", "wlan0"],
                 capture_output=True,
                 text=True,
+                encoding="utf-8",
                 timeout=5,
             )
@@ -311,7 +320,7 @@ class ADBConnection:
                 [self.adb_path, "kill-server"], capture_output=True, timeout=5
             )
-            time.sleep(1)
+            time.sleep(TIMING_CONFIG.connection.server_restart_delay)
             # Start server
             subprocess.run(

phone_agent/adb/device.py CHANGED Viewed

@@ -4,6 +4,7 @@ import subprocess
 import time
 from phone_agent.config.apps import APP_PACKAGES
+from phone_agent.config.timing import TIMING_CONFIG
 def get_current_app(device_id: str | None = None) -> str:
@@ -19,9 +20,14 @@ def get_current_app(device_id: str | None = None) -> str:
     adb_prefix = _get_adb_prefix(device_id)
     result = subprocess.run(
-        adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True
+        adb_prefix + ["shell", "dumpsys", "window"],
+        capture_output=True,
+        text=True,
+        encoding="utf-8",
     )
     output = result.stdout
+    if not output:
+        raise ValueError("No output from dumpsys window")
     # Parse window focus info
     for line in output.split("\n"):
@@ -33,7 +39,9 @@ def get_current_app(device_id: str | None = None) -> str:
     return "System Home"
-def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> None:
+def tap(
+    x: int, y: int, device_id: str | None = None, delay: float | None = None
+) -> None:
     """
     Tap at the specified coordinates.
@@ -41,8 +49,11 @@ def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> Non
         x: X coordinate.
         y: Y coordinate.
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after tap.
+        delay: Delay in seconds after tap. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_tap_delay
     adb_prefix = _get_adb_prefix(device_id)
     subprocess.run(
@@ -52,7 +63,7 @@ def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> Non
 def double_tap(
-    x: int, y: int, device_id: str | None = None, delay: float = 1.0
+    x: int, y: int, device_id: str | None = None, delay: float | None = None
 ) -> None:
     """
     Double tap at the specified coordinates.
@@ -61,14 +72,17 @@ def double_tap(
         x: X coordinate.
         y: Y coordinate.
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after double tap.
+        delay: Delay in seconds after double tap. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_double_tap_delay
     adb_prefix = _get_adb_prefix(device_id)
     subprocess.run(
         adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
     )
-    time.sleep(0.1)
+    time.sleep(TIMING_CONFIG.device.double_tap_interval)
     subprocess.run(
         adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
     )
@@ -80,7 +94,7 @@ def long_press(
     y: int,
     duration_ms: int = 3000,
     device_id: str | None = None,
-    delay: float = 1.0,
+    delay: float | None = None,
 ) -> None:
     """
     Long press at the specified coordinates.
@@ -90,8 +104,11 @@ def long_press(
         y: Y coordinate.
         duration_ms: Duration of press in milliseconds.
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after long press.
+        delay: Delay in seconds after long press. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_long_press_delay
     adb_prefix = _get_adb_prefix(device_id)
     subprocess.run(
@@ -109,7 +126,7 @@ def swipe(
     end_y: int,
     duration_ms: int | None = None,
     device_id: str | None = None,
-    delay: float = 1.0,
+    delay: float | None = None,
 ) -> None:
     """
     Swipe from start to end coordinates.
@@ -121,8 +138,11 @@ def swipe(
         end_y: Ending Y coordinate.
         duration_ms: Duration of swipe in milliseconds (auto-calculated if None).
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after swipe.
+        delay: Delay in seconds after swipe. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_swipe_delay
     adb_prefix = _get_adb_prefix(device_id)
     if duration_ms is None:
@@ -148,14 +168,17 @@ def swipe(
     time.sleep(delay)
-def back(device_id: str | None = None, delay: float = 1.0) -> None:
+def back(device_id: str | None = None, delay: float | None = None) -> None:
     """
     Press the back button.
     Args:
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after pressing back.
+        delay: Delay in seconds after pressing back. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_back_delay
     adb_prefix = _get_adb_prefix(device_id)
     subprocess.run(
@@ -164,14 +187,17 @@ def back(device_id: str | None = None, delay: float = 1.0) -> None:
     time.sleep(delay)
-def home(device_id: str | None = None, delay: float = 1.0) -> None:
+def home(device_id: str | None = None, delay: float | None = None) -> None:
     """
     Press the home button.
     Args:
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after pressing home.
+        delay: Delay in seconds after pressing home. If None, uses configured default.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_home_delay
     adb_prefix = _get_adb_prefix(device_id)
     subprocess.run(
@@ -180,18 +206,23 @@ def home(device_id: str | None = None, delay: float = 1.0) -> None:
     time.sleep(delay)
-def launch_app(app_name: str, device_id: str | None = None, delay: float = 1.0) -> bool:
+def launch_app(
+    app_name: str, device_id: str | None = None, delay: float | None = None
+) -> bool:
     """
     Launch an app by name.
     Args:
         app_name: The app name (must be in APP_PACKAGES).
         device_id: Optional ADB device ID.
-        delay: Delay in seconds after launching.
+        delay: Delay in seconds after launching. If None, uses configured default.
     Returns:
         True if app was launched, False if app not found.
     """
+    if delay is None:
+        delay = TIMING_CONFIG.device.default_launch_delay
     if app_name not in APP_PACKAGES:
         return False

phone_agent/agent.py CHANGED Viewed

@@ -7,8 +7,8 @@ from typing import Any, Callable
 from phone_agent.actions import ActionHandler
 from phone_agent.actions.handler import finish, parse_action
-from phone_agent.adb import get_current_app, get_screenshot
 from phone_agent.config import get_messages, get_system_prompt
+from phone_agent.device_factory import get_device_factory
 from phone_agent.model import ModelClient, ModelConfig
 from phone_agent.model.client import MessageBuilder
@@ -140,8 +140,9 @@ class PhoneAgent:
         self._step_count += 1
         # Capture current screen state
-        screenshot = get_screenshot(self.agent_config.device_id)
-        current_app = get_current_app(self.agent_config.device_id)
+        device_factory = get_device_factory()
+        screenshot = device_factory.get_screenshot(self.agent_config.device_id)
+        current_app = device_factory.get_current_app(self.agent_config.device_id)
         # Build messages
         if is_first:
@@ -169,6 +170,10 @@ class PhoneAgent:
         # Get model response
         try:
+            msgs = get_messages(self.agent_config.lang)
+            print("\n" + "=" * 50)
+            print(f"💭 {msgs['thinking']}:")
+            print("-" * 50)
             response = self.model_client.request(self._context)
         except Exception as e:
             if self.agent_config.verbose:
@@ -191,11 +196,6 @@ class PhoneAgent:
         if self.agent_config.verbose:
             # Print thinking process
-            msgs = get_messages(self.agent_config.lang)
-            print("\n" + "=" * 50)
-            print(f"💭 {msgs['thinking']}:")
-            print("-" * 50)
-            print(response.thinking)
             print("-" * 50)
             print(f"🎯 {msgs['action']}:")
             print(json.dumps(action, ensure_ascii=False, indent=2))

autoglm-gui 0.4.14__py3-none-any.whl → 1.0.1__py3-none-any.whl

autoglm-gui 0.4.14__py3-none-any.whl → 1.0.1__py3-none-any.whl