PyPI - autoglm-gui - Versions diffs - 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl - Mend

autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (135) hide show

AutoGLM_GUI/__init__.py +11 -0
AutoGLM_GUI/__main__.py +26 -4
AutoGLM_GUI/actions/__init__.py +6 -0
phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
AutoGLM_GUI/actions/types.py +15 -0
{phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
{phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
{phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
{phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
AutoGLM_GUI/adb/screenshot.py +11 -0
{phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
AutoGLM_GUI/adb_plus/screenshot.py +22 -1
AutoGLM_GUI/adb_plus/serial.py +38 -20
AutoGLM_GUI/adb_plus/touch.py +4 -9
AutoGLM_GUI/agents/__init__.py +43 -12
AutoGLM_GUI/agents/events.py +19 -0
AutoGLM_GUI/agents/factory.py +31 -38
AutoGLM_GUI/agents/glm/__init__.py +7 -0
AutoGLM_GUI/agents/glm/agent.py +297 -0
AutoGLM_GUI/agents/glm/message_builder.py +81 -0
AutoGLM_GUI/agents/glm/parser.py +110 -0
{phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
{phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
AutoGLM_GUI/agents/mai/__init__.py +28 -0
AutoGLM_GUI/agents/mai/agent.py +408 -0
AutoGLM_GUI/agents/mai/parser.py +254 -0
AutoGLM_GUI/agents/mai/prompts.py +103 -0
AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
AutoGLM_GUI/agents/protocols.py +12 -8
AutoGLM_GUI/agents/stream_runner.py +193 -0
AutoGLM_GUI/api/__init__.py +40 -21
AutoGLM_GUI/api/agents.py +181 -239
AutoGLM_GUI/api/control.py +9 -6
AutoGLM_GUI/api/devices.py +102 -12
AutoGLM_GUI/api/history.py +104 -0
AutoGLM_GUI/api/layered_agent.py +67 -15
AutoGLM_GUI/api/media.py +64 -1
AutoGLM_GUI/api/scheduled_tasks.py +98 -0
AutoGLM_GUI/config.py +81 -0
AutoGLM_GUI/config_manager.py +68 -51
AutoGLM_GUI/device_manager.py +248 -29
AutoGLM_GUI/device_protocol.py +1 -1
AutoGLM_GUI/devices/adb_device.py +5 -10
AutoGLM_GUI/devices/mock_device.py +4 -2
AutoGLM_GUI/devices/remote_device.py +8 -3
AutoGLM_GUI/history_manager.py +164 -0
AutoGLM_GUI/model/__init__.py +5 -0
AutoGLM_GUI/model/message_builder.py +69 -0
AutoGLM_GUI/model/types.py +24 -0
AutoGLM_GUI/models/__init__.py +10 -0
AutoGLM_GUI/models/history.py +140 -0
AutoGLM_GUI/models/scheduled_task.py +71 -0
AutoGLM_GUI/parsers/__init__.py +22 -0
AutoGLM_GUI/parsers/base.py +50 -0
AutoGLM_GUI/parsers/phone_parser.py +58 -0
AutoGLM_GUI/phone_agent_manager.py +62 -396
AutoGLM_GUI/platform_utils.py +26 -0
AutoGLM_GUI/prompt_config.py +15 -0
AutoGLM_GUI/prompts/__init__.py +32 -0
AutoGLM_GUI/scheduler_manager.py +350 -0
AutoGLM_GUI/schemas.py +246 -72
AutoGLM_GUI/scrcpy_stream.py +142 -24
AutoGLM_GUI/socketio_server.py +100 -27
AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
AutoGLM_GUI/static/index.html +2 -2
AutoGLM_GUI/types.py +17 -0
{autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
autoglm_gui-1.5.1.dist-info/RECORD +118 -0
AutoGLM_GUI/agents/mai_adapter.py +0 -627
AutoGLM_GUI/api/dual_model.py +0 -317
AutoGLM_GUI/device_adapter.py +0 -263
AutoGLM_GUI/dual_model/__init__.py +0 -53
AutoGLM_GUI/dual_model/decision_model.py +0 -664
AutoGLM_GUI/dual_model/dual_agent.py +0 -917
AutoGLM_GUI/dual_model/protocols.py +0 -354
AutoGLM_GUI/dual_model/vision_model.py +0 -442
AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
AutoGLM_GUI/phone_agent_patches.py +0 -147
AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
autoglm_gui-1.4.1.dist-info/RECORD +0 -117
mai_agent/base.py +0 -137
mai_agent/mai_grounding_agent.py +0 -263
mai_agent/mai_naivigation_agent.py +0 -526
mai_agent/prompt.py +0 -148
mai_agent/unified_memory.py +0 -67
mai_agent/utils.py +0 -73
phone_agent/__init__.py +0 -12
phone_agent/actions/__init__.py +0 -5
phone_agent/actions/handler.py +0 -400
phone_agent/adb/screenshot.py +0 -108
phone_agent/agent.py +0 -253
phone_agent/agent_ios.py +0 -277
phone_agent/config/__init__.py +0 -53
phone_agent/config/apps_harmonyos.py +0 -256
phone_agent/config/apps_ios.py +0 -339
phone_agent/config/prompts.py +0 -80
phone_agent/device_factory.py +0 -166
phone_agent/hdc/__init__.py +0 -53
phone_agent/hdc/connection.py +0 -384
phone_agent/hdc/device.py +0 -269
phone_agent/hdc/input.py +0 -145
phone_agent/hdc/screenshot.py +0 -127
phone_agent/model/__init__.py +0 -5
phone_agent/model/client.py +0 -290
phone_agent/xctest/__init__.py +0 -47
phone_agent/xctest/connection.py +0 -379
phone_agent/xctest/device.py +0 -472
phone_agent/xctest/input.py +0 -311
phone_agent/xctest/screenshot.py +0 -226
{phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
{phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
{autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
{autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
{autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0

AutoGLM_GUI/dual_model/vision_model.py DELETED Viewed

@@ -1,442 +0,0 @@
-"""
-视觉小模型适配器
-适配 autoglm-phone 等视觉模型，提供屏幕识别和动作执行能力
-"""
-from dataclasses import dataclass
-from typing import Callable, Optional
-from phone_agent.model.client import ModelClient, ModelConfig, MessageBuilder
-from phone_agent.actions.handler import ActionHandler, parse_action
-from phone_agent.device_factory import get_device_factory
-from AutoGLM_GUI.logger import logger
-from .protocols import VISION_DESCRIBE_PROMPT
-@dataclass
-class ScreenDescription:
-    """屏幕描述结果"""
-    description: str  # 屏幕文字描述
-    current_app: str  # 当前应用
-    elements: list[str]  # 识别到的主要元素
-    raw_response: str = ""
-@dataclass
-class ExecutionResult:
-    """动作执行结果"""
-    success: bool
-    action_type: str  # 执行的动作类型
-    target: str  # 目标描述
-    position: Optional[tuple[int, int]] = None  # 点击位置(如果有)
-    message: str = ""
-    finished: bool = False
-class VisionModel:
-    """
-    视觉小模型 - 负责屏幕识别和动作执行
-    使用 autoglm-phone 等视觉模型，识别屏幕内容并执行具体操作。
-    在双模型协作中，充当"眼睛"和"手"的角色。
-    """
-    def __init__(
-        self,
-        model_config: ModelConfig,
-        device_id: str,
-        confirmation_callback: Optional[Callable[[str], bool]] = None,
-        takeover_callback: Optional[Callable[[str], None]] = None,
-    ):
-        self.model_config = model_config
-        self.device_id = device_id
-        self.model_client = ModelClient(model_config)
-        self.action_handler = ActionHandler(
-            device_id=device_id,
-            confirmation_callback=confirmation_callback,
-            takeover_callback=takeover_callback,
-        )
-        self.device_factory = get_device_factory()
-        logger.info(f"视觉小模型初始化: {model_config.model_name}, 设备: {device_id}")
-    def capture_screenshot(self) -> tuple[str, int, int]:
-        """
-        截取当前屏幕
-        Returns:
-            (base64_string, width, height)
-        """
-        logger.debug("正在截取屏幕...")
-        screenshot = self.device_factory.get_screenshot(self.device_id)
-        logger.debug(f"截图完成: {screenshot.width}x{screenshot.height}")
-        return (
-            screenshot.base64_data,
-            screenshot.width,
-            screenshot.height,
-        )
-    def describe_screen(
-        self,
-        screenshot_base64: Optional[str] = None,
-        on_thinking: Optional[Callable[[str], None]] = None,
-    ) -> ScreenDescription:
-        """
-        识别并描述屏幕内容
-        让视觉模型描述当前屏幕，生成文字描述供决策大模型使用。
-        Args:
-            screenshot_base64: 可选的截图base64，不提供则自动截取
-            on_thinking: 思考过程回调
-        Returns:
-            ScreenDescription: 屏幕描述结果
-        """
-        logger.info("正在识别屏幕内容...")
-        # 获取截图
-        if screenshot_base64 is None:
-            screenshot_base64, width, height = self.capture_screenshot()
-        # 获取当前应用
-        current_app = self.device_factory.get_current_app(self.device_id)
-        # 构建消息，要求模型描述屏幕
-        messages = [
-            MessageBuilder.create_system_message(
-                "你是一个屏幕内容识别助手。请详细描述屏幕内容，帮助另一个AI做出操作决策。"
-            ),
-            MessageBuilder.create_user_message(
-                text=f"""请描述这个屏幕的内容。
-当前应用: {current_app}
-{VISION_DESCRIBE_PROMPT}
-请以结构化的方式描述屏幕内容。""",
-                image_base64=screenshot_base64,
-            ),
-        ]
-        # 调用视觉模型
-        try:
-            response = self.model_client.request(messages)
-            # 解析描述
-            description = (
-                response.thinking if response.thinking else response.raw_content
-            )
-            # 提取元素列表（简单解析）
-            elements = self._extract_elements(description)
-            result = ScreenDescription(
-                description=description,
-                current_app=current_app,
-                elements=elements,
-                raw_response=response.raw_content,
-            )
-            logger.info(f"屏幕识别完成: {current_app}, 识别到 {len(elements)} 个元素")
-            return result
-        except Exception as e:
-            logger.error(f"屏幕识别失败: {e}")
-            # 返回基础描述
-            return ScreenDescription(
-                description=f"当前应用: {current_app}，屏幕识别失败: {e}",
-                current_app=current_app,
-                elements=[],
-            )
-    def execute_decision(
-        self,
-        decision: dict,
-        screenshot_base64: Optional[str] = None,
-        on_thinking: Optional[Callable[[str], None]] = None,
-    ) -> ExecutionResult:
-        """
-        根据大模型的决策执行操作
-        将大模型的高级决策转换为具体的屏幕操作。
-        Args:
-            decision: 大模型的决策，包含 action, target, content 等
-            screenshot_base64: 当前截图(用于定位元素)
-            on_thinking: 思考过程回调
-        Returns:
-            ExecutionResult: 执行结果
-        """
-        action_type = decision.get("action", "")
-        target = decision.get("target", "")
-        content = decision.get("content")
-        logger.info(f"执行决策: {action_type} -> {target}")
-        # 获取截图和尺寸
-        if screenshot_base64 is None:
-            screenshot_base64, width, height = self.capture_screenshot()
-        else:
-            screenshot = self.device_factory.get_screenshot(self.device_id)
-            width, height = screenshot.width, screenshot.height
-        # 处理完成动作
-        if action_type == "finish":
-            return ExecutionResult(
-                success=True,
-                action_type="finish",
-                target="",
-                message=decision.get("reasoning", "任务完成"),
-                finished=True,
-            )
-        # 对于需要定位的操作，调用视觉模型找到具体位置
-        if action_type in ["tap", "swipe", "long_press", "double_tap"]:
-            position = self._find_element_position(
-                target, screenshot_base64, width, height, on_thinking
-            )
-            if position is None:
-                return ExecutionResult(
-                    success=False,
-                    action_type=action_type,
-                    target=target,
-                    message=f"无法定位元素: {target}",
-                )
-            # 执行点击操作
-            if action_type == "tap":
-                action_dict = {
-                    "_metadata": "do",
-                    "action": "Tap",
-                    "element": list(position),
-                }
-            elif action_type == "double_tap":
-                action_dict = {
-                    "_metadata": "do",
-                    "action": "Double Tap",
-                    "element": list(position),
-                }
-            elif action_type == "long_press":
-                action_dict = {
-                    "_metadata": "do",
-                    "action": "Long Press",
-                    "element": list(position),
-                }
-            else:
-                action_dict = {
-                    "_metadata": "do",
-                    "action": "Tap",
-                    "element": list(position),
-                }
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type=action_type,
-                target=target,
-                position=(
-                    int(position[0] * width / 1000),
-                    int(position[1] * height / 1000),
-                ),
-                message=result.message or "",
-                finished=result.should_finish,
-            )
-        # 处理输入操作
-        elif action_type == "type":
-            if not content:
-                return ExecutionResult(
-                    success=False,
-                    action_type="type",
-                    target=target,
-                    message="输入内容为空",
-                )
-            action_dict = {
-                "_metadata": "do",
-                "action": "Type",
-                "text": content,
-            }
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type="type",
-                target=target,
-                message=f"输入: {content[:50]}..."
-                if len(content) > 50
-                else f"输入: {content}",
-            )
-        # 处理滑动操作
-        elif action_type == "scroll":
-            direction = decision.get("direction", "up")
-            # 根据方向计算滑动坐标
-            if direction == "up":
-                start = [500, 700]
-                end = [500, 300]
-            elif direction == "down":
-                start = [500, 300]
-                end = [500, 700]
-            elif direction == "left":
-                start = [700, 500]
-                end = [300, 500]
-            else:  # right
-                start = [300, 500]
-                end = [700, 500]
-            action_dict = {
-                "_metadata": "do",
-                "action": "Swipe",
-                "start": start,
-                "end": end,
-            }
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type="scroll",
-                target=f"滚动{direction}",
-            )
-        # 处理返回操作
-        elif action_type == "back":
-            action_dict = {"_metadata": "do", "action": "Back"}
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type="back",
-                target="返回",
-            )
-        # 处理Home键
-        elif action_type == "home":
-            action_dict = {"_metadata": "do", "action": "Home"}
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type="home",
-                target="主页",
-            )
-        # 处理启动应用
-        elif action_type == "launch":
-            app_name = target or decision.get("app", "")
-            action_dict = {
-                "_metadata": "do",
-                "action": "Launch",
-                "app": app_name,
-            }
-            result = self.action_handler.execute(action_dict, width, height)
-            return ExecutionResult(
-                success=result.success,
-                action_type="launch",
-                target=app_name,
-                message=result.message or "",
-            )
-        else:
-            logger.warning(f"未知的动作类型: {action_type}")
-            return ExecutionResult(
-                success=False,
-                action_type=action_type,
-                target=target,
-                message=f"未知的动作类型: {action_type}",
-            )
-    def _find_element_position(
-        self,
-        target_description: str,
-        screenshot_base64: str,
-        _width: int,
-        _height: int,
-        _on_thinking: Optional[Callable[[str], None]] = None,
-    ) -> Optional[tuple[int, int]]:
-        """
-        使用视觉模型定位元素
-        Args:
-            target_description: 目标元素描述
-            screenshot_base64: 截图base64
-            width: 屏幕宽度
-            height: 屏幕高度
-            on_thinking: 思考过程回调
-        Returns:
-            (x, y) 归一化坐标(0-1000)，或 None
-        """
-        logger.debug(f"正在定位元素: {target_description}")
-        # 构建定位请求
-        messages = [
-            MessageBuilder.create_system_message(
-                """你是一个屏幕元素定位助手。根据用户描述的目标元素，找到它在屏幕上的位置。
-请以以下格式返回:
-do(action="Tap", element=[x, y])
-其中 x 和 y 是 0-1000 范围的归一化坐标。
-- x=0 表示最左边，x=1000 表示最右边
-- y=0 表示最上边，y=1000 表示最下边
-只返回坐标，不要其他解释。"""
-            ),
-            MessageBuilder.create_user_message(
-                text=f"请找到并点击: {target_description}",
-                image_base64=screenshot_base64,
-            ),
-        ]
-        try:
-            response = self.model_client.request(messages)
-            # 解析响应获取坐标
-            action = parse_action(response.action)
-            if action.get("_metadata") == "do" and "element" in action:
-                element = action["element"]
-                if isinstance(element, list) and len(element) >= 2:
-                    x, y = int(element[0]), int(element[1])
-                    logger.info(f"元素定位成功: ({x}, {y})")
-                    return (x, y)
-            logger.warning(f"无法从响应中解析坐标: {response.action}")
-            return None
-        except Exception as e:
-            logger.error(f"元素定位失败: {e}")
-            return None
-    def _extract_elements(self, description: str) -> list[str]:
-        """从描述中提取主要元素列表"""
-        elements = []
-        # 简单的关键词提取
-        keywords = ["按钮", "图标", "文本", "输入框", "搜索", "导航", "菜单", "列表"]
-        lines = description.split("\n")
-        for line in lines:
-            line = line.strip()
-            if any(kw in line for kw in keywords):
-                # 清理并添加
-                if len(line) < 100:  # 避免太长的描述
-                    elements.append(line)
-        return elements[:10]  # 最多返回10个元素
-    def get_current_app(self) -> str:
-        """获取当前应用"""
-        return self.device_factory.get_current_app(self.device_id)

autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl