autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AutoGLM_GUI/__init__.py +11 -0
- AutoGLM_GUI/__main__.py +26 -4
- AutoGLM_GUI/actions/__init__.py +6 -0
- phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
- AutoGLM_GUI/actions/types.py +15 -0
- {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
- {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
- {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
- {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
- AutoGLM_GUI/adb/screenshot.py +11 -0
- {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
- AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
- AutoGLM_GUI/adb_plus/screenshot.py +22 -1
- AutoGLM_GUI/adb_plus/serial.py +38 -20
- AutoGLM_GUI/adb_plus/touch.py +4 -9
- AutoGLM_GUI/agents/__init__.py +43 -12
- AutoGLM_GUI/agents/events.py +19 -0
- AutoGLM_GUI/agents/factory.py +31 -38
- AutoGLM_GUI/agents/glm/__init__.py +7 -0
- AutoGLM_GUI/agents/glm/agent.py +297 -0
- AutoGLM_GUI/agents/glm/message_builder.py +81 -0
- AutoGLM_GUI/agents/glm/parser.py +110 -0
- {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
- {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
- AutoGLM_GUI/agents/mai/__init__.py +28 -0
- AutoGLM_GUI/agents/mai/agent.py +408 -0
- AutoGLM_GUI/agents/mai/parser.py +254 -0
- AutoGLM_GUI/agents/mai/prompts.py +103 -0
- AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
- AutoGLM_GUI/agents/protocols.py +12 -8
- AutoGLM_GUI/agents/stream_runner.py +193 -0
- AutoGLM_GUI/api/__init__.py +40 -21
- AutoGLM_GUI/api/agents.py +181 -239
- AutoGLM_GUI/api/control.py +9 -6
- AutoGLM_GUI/api/devices.py +102 -12
- AutoGLM_GUI/api/history.py +104 -0
- AutoGLM_GUI/api/layered_agent.py +67 -15
- AutoGLM_GUI/api/media.py +64 -1
- AutoGLM_GUI/api/scheduled_tasks.py +98 -0
- AutoGLM_GUI/config.py +81 -0
- AutoGLM_GUI/config_manager.py +68 -51
- AutoGLM_GUI/device_manager.py +248 -29
- AutoGLM_GUI/device_protocol.py +1 -1
- AutoGLM_GUI/devices/adb_device.py +5 -10
- AutoGLM_GUI/devices/mock_device.py +4 -2
- AutoGLM_GUI/devices/remote_device.py +8 -3
- AutoGLM_GUI/history_manager.py +164 -0
- AutoGLM_GUI/model/__init__.py +5 -0
- AutoGLM_GUI/model/message_builder.py +69 -0
- AutoGLM_GUI/model/types.py +24 -0
- AutoGLM_GUI/models/__init__.py +10 -0
- AutoGLM_GUI/models/history.py +140 -0
- AutoGLM_GUI/models/scheduled_task.py +71 -0
- AutoGLM_GUI/parsers/__init__.py +22 -0
- AutoGLM_GUI/parsers/base.py +50 -0
- AutoGLM_GUI/parsers/phone_parser.py +58 -0
- AutoGLM_GUI/phone_agent_manager.py +62 -396
- AutoGLM_GUI/platform_utils.py +26 -0
- AutoGLM_GUI/prompt_config.py +15 -0
- AutoGLM_GUI/prompts/__init__.py +32 -0
- AutoGLM_GUI/scheduler_manager.py +350 -0
- AutoGLM_GUI/schemas.py +246 -72
- AutoGLM_GUI/scrcpy_stream.py +142 -24
- AutoGLM_GUI/socketio_server.py +100 -27
- AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
- AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
- AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
- AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
- AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
- AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
- AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
- AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
- AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
- AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
- AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
- AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
- AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
- AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
- AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
- AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
- AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
- AutoGLM_GUI/static/index.html +2 -2
- AutoGLM_GUI/types.py +17 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
- autoglm_gui-1.5.1.dist-info/RECORD +118 -0
- AutoGLM_GUI/agents/mai_adapter.py +0 -627
- AutoGLM_GUI/api/dual_model.py +0 -317
- AutoGLM_GUI/device_adapter.py +0 -263
- AutoGLM_GUI/dual_model/__init__.py +0 -53
- AutoGLM_GUI/dual_model/decision_model.py +0 -664
- AutoGLM_GUI/dual_model/dual_agent.py +0 -917
- AutoGLM_GUI/dual_model/protocols.py +0 -354
- AutoGLM_GUI/dual_model/vision_model.py +0 -442
- AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
- AutoGLM_GUI/phone_agent_patches.py +0 -147
- AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
- AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
- AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
- AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
- AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
- autoglm_gui-1.4.1.dist-info/RECORD +0 -117
- mai_agent/base.py +0 -137
- mai_agent/mai_grounding_agent.py +0 -263
- mai_agent/mai_naivigation_agent.py +0 -526
- mai_agent/prompt.py +0 -148
- mai_agent/unified_memory.py +0 -67
- mai_agent/utils.py +0 -73
- phone_agent/__init__.py +0 -12
- phone_agent/actions/__init__.py +0 -5
- phone_agent/actions/handler.py +0 -400
- phone_agent/adb/screenshot.py +0 -108
- phone_agent/agent.py +0 -253
- phone_agent/agent_ios.py +0 -277
- phone_agent/config/__init__.py +0 -53
- phone_agent/config/apps_harmonyos.py +0 -256
- phone_agent/config/apps_ios.py +0 -339
- phone_agent/config/prompts.py +0 -80
- phone_agent/device_factory.py +0 -166
- phone_agent/hdc/__init__.py +0 -53
- phone_agent/hdc/connection.py +0 -384
- phone_agent/hdc/device.py +0 -269
- phone_agent/hdc/input.py +0 -145
- phone_agent/hdc/screenshot.py +0 -127
- phone_agent/model/__init__.py +0 -5
- phone_agent/model/client.py +0 -290
- phone_agent/xctest/__init__.py +0 -47
- phone_agent/xctest/connection.py +0 -379
- phone_agent/xctest/device.py +0 -472
- phone_agent/xctest/input.py +0 -311
- phone_agent/xctest/screenshot.py +0 -226
- {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
- {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,627 +0,0 @@
|
|
|
1
|
-
"""MAI Agent adapter for AutoGLM-GUI.
|
|
2
|
-
|
|
3
|
-
This module provides an adapter that wraps mai_agent.MAIUINaivigationAgent
|
|
4
|
-
to make it compatible with the PhoneAgent interface used in AutoGLM-GUI.
|
|
5
|
-
"""
|
|
6
|
-
|
|
7
|
-
from __future__ import annotations
|
|
8
|
-
|
|
9
|
-
import base64
|
|
10
|
-
import re
|
|
11
|
-
import sys
|
|
12
|
-
from dataclasses import dataclass
|
|
13
|
-
from io import BytesIO
|
|
14
|
-
from pathlib import Path
|
|
15
|
-
from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple
|
|
16
|
-
|
|
17
|
-
from PIL import Image
|
|
18
|
-
|
|
19
|
-
from phone_agent.actions.handler import ActionHandler
|
|
20
|
-
from phone_agent.agent import AgentConfig, StepResult
|
|
21
|
-
from phone_agent.device_factory import get_device_factory
|
|
22
|
-
from phone_agent.model import ModelConfig
|
|
23
|
-
|
|
24
|
-
from AutoGLM_GUI.logger import logger
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
# Add mai_agent to sys.path for import
|
|
28
|
-
# mai_agent uses top-level imports (e.g., "from base import BaseAgent")
|
|
29
|
-
# which require the mai_agent directory to be in Python path
|
|
30
|
-
def _ensure_mai_agent_importable() -> None:
    """Put the ``mai_agent`` directory on ``sys.path`` when it is not importable.

    Returns immediately if ``mai_naivigation_agent`` can already be imported.
    Otherwise the following candidate directories are probed in order, and the
    first one that exists is prepended to ``sys.path``:

    1. ``<sys._MEIPASS>/mai_agent`` when running frozen (PyInstaller).
    2. ``mai_agent`` beside the directory that contains the ``AutoGLM_GUI``
       package (wheel installation).
    3. ``mai_agent`` three directory levels above this file (development
       checkout).

    A warning is logged when none of the candidates exists.
    """
    try:
        import mai_naivigation_agent  # type: ignore[import-not-found]  # noqa: F401
    except ImportError:
        pass
    else:
        # Already importable: nothing to do.
        return

    candidates = []

    # 1. PyInstaller bundle: data files are unpacked under sys._MEIPASS.
    if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
        candidates.append(Path(getattr(sys, "_MEIPASS")) / "mai_agent")

    # 2. Wheel installation: look next to the installed package directory.
    try:
        import AutoGLM_GUI

        candidates.append(Path(AutoGLM_GUI.__file__).parent.parent / "mai_agent")
    except (ImportError, AttributeError):
        pass

    # 3. Development checkout: this file lives at AutoGLM_GUI/agents/mai_adapter.py.
    candidates.append(Path(__file__).parent.parent.parent / "mai_agent")

    found = next(
        (path for path in candidates if path.exists() and path.is_dir()),
        None,
    )
    if found is None:
        logger.warning(
            "mai_agent directory not found. MAI Agent functionality may not work."
        )
        return

    mai_path_str = str(found)
    if mai_path_str not in sys.path:
        sys.path.insert(0, mai_path_str)
        logger.debug(f"Added {mai_path_str} to sys.path for mai_agent imports")
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
# Executed at import time so that the later imports of mai_naivigation_agent
# (the TYPE_CHECKING import below and the import inside
# MAIAgentAdapter.__init__) can be resolved.
_ensure_mai_agent_importable()

if TYPE_CHECKING:
    # Type-checker-only import; the runtime import happens inside
    # MAIAgentAdapter.__init__.
    from mai_naivigation_agent import MAIUINaivigationAgent  # type: ignore[import-not-found]
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
@dataclass
class MAIAgentConfig:
    """MAI Agent specific configuration.

    Every field has a default, so ``MAIAgentConfig()`` is a valid configuration.

    Attributes:
        history_n: Number of historical screenshots to include in context.
            Passed to the MAI agent as ``runtime_conf["history_n"]``.
        max_pixels: Maximum pixels for image resizing (optional). Only added
            to the MAI agent's ``runtime_conf`` when truthy.
        min_pixels: Minimum pixels for image resizing (optional). Only added
            to the MAI agent's ``runtime_conf`` when truthy.
        tools: MCP tools list (optional, not implemented yet). Passed through
            unchanged as the ``tools`` argument of the MAI agent.
        use_mai_prompt: Whether to use MAI's native prompt format.
            NOTE(review): not read by the adapter code in this module;
            confirm where it is consumed.
    """

    history_n: int = 3
    max_pixels: Optional[int] = None
    min_pixels: Optional[int] = None
    tools: Optional[list[dict[str, Any]]] = None
    use_mai_prompt: bool = False
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
class MAIAgentAdapter:
    """Adapter that gives the MAI agent a PhoneAgent-compatible interface.

    This adapter wraps ``mai_agent.MAIUINaivigationAgent`` and provides:
    - Compatible ``run()``, ``step()`` and ``reset()`` methods
    - Action format conversion (MAI -> PhoneAgent)
    - Coordinate conversion onto the 0-1000 scale used by ``ActionHandler``:
      the ``coordinate`` field is treated as normalized [0, 1], while drag
      ``start_coordinate`` / ``end_coordinate`` are treated as being on MAI's
      0-999 scale
    - Reuse of the existing ``ActionHandler`` for ADB operations
    - A read-only ``context`` view built from the MAI agent's ``traj_memory``

    Example:
        >>> adapter = MAIAgentAdapter(model_config, agent_config, mai_config)
        >>> result = adapter.run("Open Settings")
        >>> print(result)
    """

    # Upper bound of the PhoneAgent coordinate scale (valid range is 0..1000).
    _COORD_MAX = 1000
    # Scale of drag coordinates that the MAI agent leaves un-normalized (0..999).
    _MAI_SCALE_FACTOR = 999
    # Total swipe length, in PhoneAgent coordinate units.
    _SWIPE_DISTANCE = 300
    # Extracted thinking text longer than this is truncated with "...".
    _MAX_THINKING_CHARS = 500

    def __init__(
        self,
        model_config: ModelConfig,
        agent_config: AgentConfig,
        mai_config: MAIAgentConfig,
        confirmation_callback: Optional[Callable[[str], bool]] = None,
        takeover_callback: Optional[Callable[[str], None]] = None,
        on_thinking_chunk: Optional[Callable[[str], None]] = None,
    ):
        """Initialize the MAI Agent adapter.

        Args:
            model_config: Model configuration; ``base_url``, ``model_name``,
                ``temperature``, ``top_p`` and ``max_tokens`` are read here.
            agent_config: Agent configuration; ``device_id`` is read here and
                ``max_steps`` is read by ``run()``.
            mai_config: MAI-specific configuration.
            confirmation_callback: Callback for sensitive action confirmation,
                forwarded to ``ActionHandler``.
            takeover_callback: Callback for takeover requests, forwarded to
                ``ActionHandler``.
            on_thinking_chunk: Callback for streaming thinking chunks. It is
                stored on the instance; no method of this class invokes it.
        """
        self.model_config = model_config
        self.agent_config = agent_config
        self.mai_config = mai_config

        # Imported lazily: the module only becomes importable after
        # _ensure_mai_agent_importable() has adjusted sys.path.
        from mai_naivigation_agent import MAIUINaivigationAgent  # type: ignore[import-not-found]

        runtime_conf = {
            "history_n": mai_config.history_n,
            "temperature": model_config.temperature,
            "top_k": -1,  # MAI default
            "top_p": model_config.top_p,
            "max_tokens": model_config.max_tokens,
        }

        # Pixel limits are only forwarded when explicitly configured.
        if mai_config.max_pixels:
            runtime_conf["max_pixels"] = mai_config.max_pixels
        if mai_config.min_pixels:
            runtime_conf["min_pixels"] = mai_config.min_pixels

        self.mai_agent: MAIUINaivigationAgent = MAIUINaivigationAgent(
            llm_base_url=model_config.base_url,
            model_name=model_config.model_name,
            runtime_conf=runtime_conf,
            tools=mai_config.tools,
        )

        # Create action handler (reuse from phone_agent)
        self.action_handler = ActionHandler(
            device_id=agent_config.device_id,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        # State management
        self._step_count = 0
        self._current_instruction = ""
        self._on_thinking_chunk = on_thinking_chunk

        logger.info(
            f"MAI Agent adapter initialized for device {agent_config.device_id} "
            f"using model {model_config.model_name}"
        )

    def run(self, task: str) -> str:
        """Run the agent until the task finishes or ``max_steps`` is reached.

        Resets the wrapped MAI agent and the step counter before starting.

        Args:
            task: Natural language description of the task.

        Returns:
            The message of the finishing step (``"Task completed"`` when that
            message is empty), or ``"Max steps reached"``.
        """
        self._current_instruction = task
        self.mai_agent.reset()
        self._step_count = 0

        while self._step_count < self.agent_config.max_steps:
            result = self._execute_step(is_first=(self._step_count == 0))

            if result.finished:
                return result.message or "Task completed"

            self._step_count += 1

        return "Max steps reached"

    def step(self, task: Optional[str] = None) -> StepResult:
        """Execute a single step.

        Args:
            task: Task description. Required while the step counter is 0;
                ignored on later steps.

        Returns:
            StepResult containing the action and thinking.

        Raises:
            ValueError: If this is the first step and ``task`` is empty.
        """
        is_first = self._step_count == 0

        if is_first:
            if not task:
                raise ValueError("Task is required for the first step")
            self._current_instruction = task
            # Only reset the MAI agent when it has no recorded trajectory.
            if len(self.mai_agent.traj_memory.steps) == 0:
                self.mai_agent.reset()

        result = self._execute_step(is_first=is_first)
        self._step_count += 1
        return result

    def reset(self) -> None:
        """Reset the wrapped MAI agent, the step counter and the instruction."""
        self.mai_agent.reset()
        self._step_count = 0
        self._current_instruction = ""

    def _execute_step(self, is_first: bool) -> StepResult:
        """Capture a screenshot, ask the MAI agent for an action and execute it.

        Args:
            is_first: Whether this is the first step. Currently not read by
                this method; kept for the callers' signature.

        Returns:
            StepResult for this step. Prediction and action-execution errors
            are reported as a failed, finished StepResult rather than raised.
            Errors from the screenshot capture/decoding and from action
            conversion are not caught here and propagate to the caller.
        """
        # 1. Get current screenshot
        device_factory = get_device_factory()
        screenshot = device_factory.get_screenshot(self.agent_config.device_id)

        # 2. Convert base64_data to PIL Image
        # The Screenshot object contains base64_data, not pil_image
        image_data = base64.b64decode(screenshot.base64_data)
        pil_image = Image.open(BytesIO(image_data))

        # 3. Build observation dictionary
        obs = {
            "screenshot": pil_image,
            "accessibility_tree": None,  # Not supported yet
        }

        # 4. Call MAI agent predict
        # IMPORTANT: Always pass self._current_instruction, not just on the first step.
        # MAI agent's _build_messages uses instruction to populate the primary user message,
        # and does not re-inject it from history. Without the instruction in subsequent steps,
        # the model would lose track of the task goal as history grows.
        try:
            prediction_text, action_dict = self.mai_agent.predict(
                instruction=self._current_instruction,
                obs=obs,
            )
        except Exception as e:
            logger.error(f"MAI agent prediction failed: {e}")
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking="",
                message=f"Prediction error: {e}",
            )

        # 5. Extract thinking from prediction_text
        thinking = self._extract_thinking(prediction_text)

        # 6. Convert action format
        # NOTE: for the "enter" system button this call already sends the
        # key event to the device (see _convert_action).
        converted_action = self._convert_action(action_dict)

        # 7. Execute action
        try:
            action_result = self.action_handler.execute(
                converted_action,
                screenshot.width,
                screenshot.height,
            )
        except Exception as e:
            logger.error(f"Action execution failed: {e}")
            return StepResult(
                success=False,
                finished=True,
                action=converted_action,
                thinking=thinking,
                message=f"Action error: {e}",
            )

        # 8. Check if finished
        finished = (
            converted_action.get("_metadata") == "finish" or action_result.should_finish
        )

        return StepResult(
            success=action_result.success,
            finished=finished,
            action=converted_action,
            thinking=thinking,
            message=action_result.message,
        )

    def _convert_action(self, mai_action: dict[str, Any]) -> dict[str, Any]:
        """Convert MAI action format to PhoneAgent format.

        MAI format: {"action": "click", "coordinate": [x, y]}
        PhoneAgent format: {"_metadata": "do", "action": "Tap", "element": [x, y]}

        Coordinate conversion: the ``coordinate`` field is treated as
        normalized [0, 1]; drag ``start_coordinate`` / ``end_coordinate`` are
        treated as being on the 0-999 scale. Both are mapped to 0-1000.

        Side effect: for ``system_button`` with button ``"enter"`` the ADB key
        event is sent from inside this method, and a short Wait action is
        returned for the ActionHandler to execute.

        Args:
            mai_action: Action dictionary from MAI agent.

        Returns:
            Converted action dictionary for PhoneAgent. Unrecognized actions
            (including tap-type actions without a coordinate) are converted
            to a ``finish`` action.
        """
        action_type = mai_action.get("action")

        # Terminate action
        if action_type == "terminate":
            status = mai_action.get("status", "success")
            return {
                "_metadata": "finish",
                "message": "Task completed" if status == "success" else "Task failed",
            }

        # Answer action (no operation)
        if action_type == "answer":
            return {
                "_metadata": "finish",
                "message": mai_action.get("text", ""),
            }

        # Wait action
        if action_type == "wait":
            return {
                "_metadata": "do",
                "action": "Wait",
                "duration": "1 seconds",
            }

        # System button
        if action_type == "system_button":
            button_name = mai_action.get("button", "")

            # Special handling for Enter key
            # ActionHandler doesn't have an "Enter" handler, so we handle it directly here
            if button_name == "enter":
                # Use platform_utils to run ADB keyevent command
                from AutoGLM_GUI.platform_utils import run_cmd_silently_sync

                adb_prefix = (
                    ["adb", "-s", self.agent_config.device_id]
                    if self.agent_config.device_id
                    else ["adb"]
                )
                run_cmd_silently_sync(
                    adb_prefix + ["shell", "input", "keyevent", "KEYCODE_ENTER"],
                    timeout=5,
                )
                # Return a Wait action to indicate success
                return {
                    "_metadata": "do",
                    "action": "Wait",
                    "duration": "0.5 seconds",
                }

            # Other system buttons use standard handlers
            action_map = {
                "back": "Back",
                "home": "Home",
            }
            if button_name not in action_map:
                # Keep the historical fallback, but make it visible in the logs.
                logger.warning(
                    f"Unmapped MAI system button: {button_name!r}; falling back to Back"
                )
            return {
                "_metadata": "do",
                "action": action_map.get(button_name, "Back"),
            }

        # Tap-type actions (require a normalized [0, 1] coordinate)
        tap_actions = {
            "click": "Tap",
            "long_press": "Long Press",
            "double_click": "Double Tap",
        }
        coordinate = mai_action.get("coordinate")
        if coordinate and action_type in tap_actions:
            return {
                "_metadata": "do",
                "action": tap_actions[action_type],
                "element": [
                    self._convert_coordinate(coordinate[0]),
                    self._convert_coordinate(coordinate[1]),
                ],
            }

        # Swipe action
        if action_type == "swipe":
            direction = mai_action.get("direction", "up")
            # Default to the normalized screen center [0.5, 0.5], not [500, 500]
            center = coordinate or [0.5, 0.5]
            x = self._convert_coordinate(center[0])
            y = self._convert_coordinate(center[1])

            start, end = self._calculate_swipe_coordinates(direction, x, y)

            return {
                "_metadata": "do",
                "action": "Swipe",
                "start": start,
                "end": end,
            }

        # Drag action
        if action_type == "drag":
            # "or" also covers keys that are present but set to None.
            start_coord = mai_action.get("start_coordinate") or [0, 0]
            end_coord = mai_action.get("end_coordinate") or [0, 0]

            # IMPORTANT: start_coordinate and end_coordinate are NOT normalized by MAI.
            # They remain in SCALE_FACTOR range [0, 999], unlike the "coordinate" field
            # which is normalized to [0, 1]. We must use the scale factor conversion.
            start = [
                self._convert_coordinate_from_scale_factor(start_coord[0]),
                self._convert_coordinate_from_scale_factor(start_coord[1]),
            ]
            end = [
                self._convert_coordinate_from_scale_factor(end_coord[0]),
                self._convert_coordinate_from_scale_factor(end_coord[1]),
            ]

            return {
                "_metadata": "do",
                "action": "Swipe",
                "start": start,
                "end": end,
            }

        # Text input
        if action_type == "type":
            return {
                "_metadata": "do",
                "action": "Type",
                "text": mai_action.get("text", ""),
            }

        # Open app
        if action_type == "open":
            return {
                "_metadata": "do",
                "action": "Launch",
                "app": mai_action.get("text", ""),
            }

        # Unknown action - treat as finish
        logger.warning(f"Unknown MAI action type: {action_type}")
        return {
            "_metadata": "finish",
            "message": f"Unknown action: {action_type}",
        }

    def _convert_coordinate(self, coord: float) -> int:
        """Convert a normalized coordinate to the PhoneAgent scale.

        Args:
            coord: Coordinate normalized to [0, 1].

        Returns:
            Coordinate on the PhoneAgent scale [0, 1000], truncated to int.

        Example:
            >>> _convert_coordinate(0.5)  # Center of screen
            500
        """
        return int(coord * 1000)

    def _convert_coordinate_from_scale_factor(self, coord: float) -> int:
        """Convert a 0-999 scaled coordinate to the PhoneAgent scale.

        Used for drag actions, whose start/end coordinates are not normalized.

        Args:
            coord: Coordinate on the MAI SCALE_FACTOR scale [0, 999].

        Returns:
            Coordinate on the PhoneAgent scale [0, 1000], truncated to int.

        Example:
            >>> _convert_coordinate_from_scale_factor(500)  # Center of screen
            500
        """
        return int(coord * 1000 / self._MAI_SCALE_FACTOR)

    def _calculate_swipe_coordinates(
        self, direction: str, x: int, y: int
    ) -> Tuple[list[int], list[int]]:
        """Calculate swipe start/end points around a center point.

        The swipe is ``_SWIPE_DISTANCE`` units long and centered on (x, y).
        Both points are clamped to the valid 0-1000 range so that a swipe
        centered near a screen edge never yields off-screen coordinates.

        Args:
            direction: Swipe direction (up, down, left, right). Any other
                value yields a zero-length swipe at the center.
            x: Center X coordinate (0-1000 scale).
            y: Center Y coordinate (0-1000 scale).

        Returns:
            Tuple of [start_x, start_y] and [end_x, end_y].
        """
        half = self._SWIPE_DISTANCE // 2
        # (dx, dy) is the offset from the center to the END point.
        offsets = {
            "up": (0, -half),
            "down": (0, half),
            "left": (-half, 0),
            "right": (half, 0),
        }
        dx, dy = offsets.get(direction, (0, 0))

        upper = self._COORD_MAX
        start = [min(max(x - dx, 0), upper), min(max(y - dy, 0), upper)]
        end = [min(max(x + dx, 0), upper), min(max(y + dy, 0), upper)]
        return start, end

    def _extract_thinking(self, prediction_text: str) -> str:
        """Extract thinking content from agent response.

        Two formats are recognized, tried in this order:
        1. MAI Agent format: ``<thinking>reasoning</thinking>`` followed by a
           ``<tool_call>`` section.
        2. GLM Agent format: reasoning wrapped in a triple-backtick fence,
           followed by an ``<answer>`` section.

        Args:
            prediction_text: Full prediction text from agent.

        Returns:
            The stripped thinking content, cut to ``_MAX_THINKING_CHARS``
            characters plus ``"..."`` when longer. Empty string when the
            text is empty/None or no thinking section is found.
        """
        if not prediction_text:
            return ""

        limit = self._MAX_THINKING_CHARS
        for pattern in (r"<thinking>(.*?)</thinking>", r"```(.*?)```"):
            match = re.search(pattern, prediction_text, re.DOTALL)
            if match:
                thinking = match.group(1).strip()
                # Truncate if too long (MAI Agent can produce very long reasoning)
                if len(thinking) > limit:
                    thinking = thinking[:limit] + "..."
                return thinking

        return ""

    @property
    def context(self) -> list[dict[str, Any]]:
        """Return trajectory history in PhoneAgent format (read-only).

        Builds one assistant message per step of the MAI agent's
        ``traj_memory`` that has a non-empty ``thought``; steps without a
        thought are skipped.

        Returns:
            List of message dictionaries with ``role`` and ``content`` keys.
        """
        return [
            {
                "role": "assistant",
                "content": f"<thinking>\n{step.thought}\n</thinking>\n<answer>\n{step.action}\n</answer>",
            }
            for step in self.mai_agent.traj_memory.steps
            if step.thought
        ]

    @property
    def step_count(self) -> int:
        """Return current step count."""
        return self._step_count
|