autoglm-gui 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AutoGLM_GUI/__init__.py +11 -0
- AutoGLM_GUI/__main__.py +26 -8
- AutoGLM_GUI/actions/__init__.py +6 -0
- AutoGLM_GUI/actions/handler.py +196 -0
- AutoGLM_GUI/actions/types.py +15 -0
- AutoGLM_GUI/adb/__init__.py +53 -0
- AutoGLM_GUI/adb/apps.py +227 -0
- AutoGLM_GUI/adb/connection.py +323 -0
- AutoGLM_GUI/adb/device.py +171 -0
- AutoGLM_GUI/adb/input.py +67 -0
- AutoGLM_GUI/adb/screenshot.py +11 -0
- AutoGLM_GUI/adb/timing.py +167 -0
- AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
- AutoGLM_GUI/adb_plus/qr_pair.py +8 -8
- AutoGLM_GUI/adb_plus/screenshot.py +22 -1
- AutoGLM_GUI/adb_plus/serial.py +38 -20
- AutoGLM_GUI/adb_plus/touch.py +4 -9
- AutoGLM_GUI/agents/__init__.py +51 -0
- AutoGLM_GUI/agents/events.py +19 -0
- AutoGLM_GUI/agents/factory.py +153 -0
- AutoGLM_GUI/agents/glm/__init__.py +7 -0
- AutoGLM_GUI/agents/glm/agent.py +292 -0
- AutoGLM_GUI/agents/glm/message_builder.py +81 -0
- AutoGLM_GUI/agents/glm/parser.py +110 -0
- AutoGLM_GUI/agents/glm/prompts_en.py +77 -0
- AutoGLM_GUI/agents/glm/prompts_zh.py +75 -0
- AutoGLM_GUI/agents/mai/__init__.py +28 -0
- AutoGLM_GUI/agents/mai/agent.py +405 -0
- AutoGLM_GUI/agents/mai/parser.py +254 -0
- AutoGLM_GUI/agents/mai/prompts.py +103 -0
- AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
- AutoGLM_GUI/agents/protocols.py +27 -0
- AutoGLM_GUI/agents/stream_runner.py +188 -0
- AutoGLM_GUI/api/__init__.py +71 -11
- AutoGLM_GUI/api/agents.py +190 -229
- AutoGLM_GUI/api/control.py +9 -6
- AutoGLM_GUI/api/devices.py +112 -28
- AutoGLM_GUI/api/health.py +13 -0
- AutoGLM_GUI/api/history.py +78 -0
- AutoGLM_GUI/api/layered_agent.py +306 -181
- AutoGLM_GUI/api/mcp.py +11 -10
- AutoGLM_GUI/api/media.py +64 -1
- AutoGLM_GUI/api/scheduled_tasks.py +98 -0
- AutoGLM_GUI/api/version.py +23 -10
- AutoGLM_GUI/api/workflows.py +2 -1
- AutoGLM_GUI/config.py +72 -14
- AutoGLM_GUI/config_manager.py +98 -27
- AutoGLM_GUI/device_adapter.py +263 -0
- AutoGLM_GUI/device_manager.py +248 -29
- AutoGLM_GUI/device_protocol.py +266 -0
- AutoGLM_GUI/devices/__init__.py +49 -0
- AutoGLM_GUI/devices/adb_device.py +200 -0
- AutoGLM_GUI/devices/mock_device.py +185 -0
- AutoGLM_GUI/devices/remote_device.py +177 -0
- AutoGLM_GUI/exceptions.py +3 -3
- AutoGLM_GUI/history_manager.py +164 -0
- AutoGLM_GUI/i18n.py +81 -0
- AutoGLM_GUI/metrics.py +13 -20
- AutoGLM_GUI/model/__init__.py +5 -0
- AutoGLM_GUI/model/message_builder.py +69 -0
- AutoGLM_GUI/model/types.py +24 -0
- AutoGLM_GUI/models/__init__.py +10 -0
- AutoGLM_GUI/models/history.py +96 -0
- AutoGLM_GUI/models/scheduled_task.py +71 -0
- AutoGLM_GUI/parsers/__init__.py +22 -0
- AutoGLM_GUI/parsers/base.py +50 -0
- AutoGLM_GUI/parsers/phone_parser.py +58 -0
- AutoGLM_GUI/phone_agent_manager.py +118 -367
- AutoGLM_GUI/platform_utils.py +31 -2
- AutoGLM_GUI/prompt_config.py +15 -0
- AutoGLM_GUI/prompts/__init__.py +32 -0
- AutoGLM_GUI/scheduler_manager.py +304 -0
- AutoGLM_GUI/schemas.py +272 -63
- AutoGLM_GUI/scrcpy_stream.py +159 -37
- AutoGLM_GUI/server.py +3 -1
- AutoGLM_GUI/socketio_server.py +114 -29
- AutoGLM_GUI/state.py +10 -30
- AutoGLM_GUI/static/assets/{about-DeclntHg.js → about-BQm96DAl.js} +1 -1
- AutoGLM_GUI/static/assets/alert-dialog-B42XxGPR.js +1 -0
- AutoGLM_GUI/static/assets/chat-C0L2gQYG.js +129 -0
- AutoGLM_GUI/static/assets/circle-alert-D4rSJh37.js +1 -0
- AutoGLM_GUI/static/assets/dialog-DZ78cEcj.js +45 -0
- AutoGLM_GUI/static/assets/history-DFBv7TGc.js +1 -0
- AutoGLM_GUI/static/assets/index-Bzyv2yQ2.css +1 -0
- AutoGLM_GUI/static/assets/{index-zQ4KKDHt.js → index-CmZSnDqc.js} +1 -1
- AutoGLM_GUI/static/assets/index-CssG-3TH.js +11 -0
- AutoGLM_GUI/static/assets/label-BCUzE_nm.js +1 -0
- AutoGLM_GUI/static/assets/logs-eoFxn5of.js +1 -0
- AutoGLM_GUI/static/assets/popover-DLsuV5Sx.js +1 -0
- AutoGLM_GUI/static/assets/scheduled-tasks-MyqGJvy_.js +1 -0
- AutoGLM_GUI/static/assets/square-pen-zGWYrdfj.js +1 -0
- AutoGLM_GUI/static/assets/textarea-BX6y7uM5.js +1 -0
- AutoGLM_GUI/static/assets/workflows-CYFs6ssC.js +1 -0
- AutoGLM_GUI/static/index.html +2 -2
- AutoGLM_GUI/types.py +142 -0
- {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/METADATA +178 -92
- autoglm_gui-1.5.0.dist-info/RECORD +157 -0
- mai_agent/base.py +137 -0
- mai_agent/mai_grounding_agent.py +263 -0
- mai_agent/mai_naivigation_agent.py +526 -0
- mai_agent/prompt.py +148 -0
- mai_agent/unified_memory.py +67 -0
- mai_agent/utils.py +73 -0
- AutoGLM_GUI/api/dual_model.py +0 -311
- AutoGLM_GUI/dual_model/__init__.py +0 -53
- AutoGLM_GUI/dual_model/decision_model.py +0 -664
- AutoGLM_GUI/dual_model/dual_agent.py +0 -917
- AutoGLM_GUI/dual_model/protocols.py +0 -354
- AutoGLM_GUI/dual_model/vision_model.py +0 -442
- AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
- AutoGLM_GUI/phone_agent_patches.py +0 -146
- AutoGLM_GUI/static/assets/chat-Iut2yhSw.js +0 -125
- AutoGLM_GUI/static/assets/dialog-BfdcBs1x.js +0 -45
- AutoGLM_GUI/static/assets/index-5hCCwHA7.css +0 -1
- AutoGLM_GUI/static/assets/index-DHF1NZh0.js +0 -12
- AutoGLM_GUI/static/assets/workflows-xiplap-r.js +0 -1
- autoglm_gui-1.4.0.dist-info/RECORD +0 -100
- {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/WHEEL +0 -0
- {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/entry_points.txt +0 -0
- {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/licenses/LICENSE +0 -0
|
@@ -0,0 +1,405 @@
|
|
|
1
|
+
"""Internal MAI Agent Implementation
|
|
2
|
+
|
|
3
|
+
完全内部化实现的 MAI Agent,替代第三方 mai_agent 依赖。
|
|
4
|
+
|
|
5
|
+
核心特性:
|
|
6
|
+
- 多图像历史上下文(保留最近 N 张截图)
|
|
7
|
+
- XML 格式的思考过程和动作输出
|
|
8
|
+
- 999 坐标系统归一化
|
|
9
|
+
- 自动重试机制
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
import base64
|
|
13
|
+
import time
|
|
14
|
+
import traceback
|
|
15
|
+
from io import BytesIO
|
|
16
|
+
from typing import Any, Callable
|
|
17
|
+
|
|
18
|
+
from openai import OpenAI
|
|
19
|
+
from PIL import Image
|
|
20
|
+
|
|
21
|
+
from AutoGLM_GUI.actions import ActionHandler, ActionResult
|
|
22
|
+
from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
|
|
23
|
+
from AutoGLM_GUI.device_protocol import DeviceProtocol
|
|
24
|
+
from AutoGLM_GUI.logger import logger
|
|
25
|
+
from AutoGLM_GUI.model import MessageBuilder
|
|
26
|
+
|
|
27
|
+
from .traj_memory import TrajMemory, TrajStep
|
|
28
|
+
from .parser import MAIParseError, MAIParser
|
|
29
|
+
from .prompts import MAI_MOBILE_SYSTEM_PROMPT
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
class InternalMAIAgent:
    """Self-contained MAI agent loop: screenshot -> LLM -> parse -> execute.

    Keeps a trajectory memory of the last steps (screenshots, thoughts,
    actions) and replays the most recent ``history_n - 1`` of them as
    multimodal context on every model call.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        agent_config: AgentConfig,
        device: DeviceProtocol,
        history_n: int = 3,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
        thinking_callback: Callable[[str], None] | None = None,
    ):
        """Create an agent bound to one device and one model endpoint.

        Args:
            model_config: Endpoint, credentials and sampling parameters.
            agent_config: Loop behaviour (max_steps, verbose, system_prompt).
            device: Target device implementing DeviceProtocol.
            history_n: Number of screenshots kept as context (current one
                plus ``history_n - 1`` historical ones).
            confirmation_callback: Asked before sensitive actions.
            takeover_callback: Invoked when the model requests manual takeover.
            thinking_callback: Receives streamed thinking text chunks.
        """
        self.model_config = model_config
        self.agent_config = agent_config
        self.history_n = history_n

        self.openai_client = OpenAI(
            base_url=model_config.base_url,
            api_key=model_config.api_key,
            timeout=120,
        )
        self.parser = MAIParser()

        self.device = device
        self.action_handler = ActionHandler(
            device=self.device,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        self.traj_memory = TrajMemory(task_goal="", task_id="", steps=[])
        self._step_count = 0
        self._is_running = False
        self._thinking_callback = thinking_callback

        # Cumulative performance counters, reported when a task finishes.
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        self._total_tokens = 0

    def run(self, task: str) -> str:
        """Run ``task`` to completion (or until max_steps) and return a summary."""
        self.traj_memory = TrajMemory(task_goal=task, task_id="", steps=[])
        self._step_count = 0
        self._is_running = True

        try:
            result = self._execute_step(task, is_first=True)

            if result.finished:
                return result.message or "Task completed"

            while self._step_count < self.agent_config.max_steps and self._is_running:
                result = self._execute_step(is_first=False)

                if result.finished:
                    return result.message or "Task completed"

            return "Max steps reached"
        finally:
            self._is_running = False

    def step(self, task: str | None = None) -> StepResult:
        """Execute one step; ``task`` is required only on the first call.

        Raises:
            ValueError: If this is the first step and no task was given.
        """
        is_first = len(self.traj_memory.steps) == 0

        if is_first and not task:
            raise ValueError("Task is required for the first step")

        if is_first:
            self.traj_memory.task_goal = task or ""

        return self._execute_step(task, is_first)

    def reset(self) -> None:
        """Clear trajectory memory, counters and running state."""
        self.traj_memory.clear()
        self._step_count = 0
        self._is_running = False
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        self._total_tokens = 0

    def abort(self) -> None:
        """Stop the run loop after the current step completes."""
        self._is_running = False
        logger.info("InternalMAIAgent aborted by user")

    def _stream_request(
        self,
        messages: list[dict[str, Any]],
        on_thinking_chunk: Callable[[str], None] | None = None,
    ) -> str:
        """Stream a chat completion, forwarding only the thinking phase.

        Text before the first ``</thinking>`` / ``<tool_call>`` marker is
        streamed to ``on_thinking_chunk``; everything after is accumulated
        silently. Returns the full raw response text.
        """
        stream = self.openai_client.chat.completions.create(
            messages=messages,  # type: ignore[arg-type]
            model=self.model_config.model_name,
            max_tokens=self.model_config.max_tokens,
            temperature=self.model_config.temperature,
            top_p=self.model_config.top_p,
            frequency_penalty=self.model_config.frequency_penalty,
            extra_body=self.model_config.extra_body,
            stream=True,
        )

        raw_content = ""
        buffer = ""
        action_markers = ["</thinking>", "<tool_call>"]
        in_action_phase = False

        for chunk in stream:
            if len(chunk.choices) == 0:
                continue
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                raw_content += content

                if in_action_phase:
                    continue

                buffer += content

                marker_found = False
                for marker in action_markers:
                    if marker in buffer:
                        # Flush the text preceding the marker, then stop
                        # forwarding: the action phase is not user-facing.
                        thinking_part = buffer.split(marker, 1)[0]
                        if on_thinking_chunk:
                            on_thinking_chunk(thinking_part)
                        in_action_phase = True
                        marker_found = True
                        break

                if marker_found:
                    continue

                # Hold the buffer if it ends with a partial marker prefix,
                # so a marker split across chunks is never leaked.
                is_potential_marker = False
                for marker in action_markers:
                    for i in range(1, len(marker)):
                        if buffer.endswith(marker[:i]):
                            is_potential_marker = True
                            break
                    if is_potential_marker:
                        break

                if not is_potential_marker:
                    if on_thinking_chunk:
                        on_thinking_chunk(buffer)
                    buffer = ""

        return raw_content

    def _execute_step(
        self, user_prompt: str | None = None, is_first: bool = False
    ) -> StepResult:
        """Run one observe -> think -> act cycle and return its StepResult."""
        self._step_count += 1

        screenshot = self.device.get_screenshot()
        current_app = self.device.get_current_app()

        screenshot_bytes = base64.b64decode(screenshot.base64_data)
        pil_image = Image.open(BytesIO(screenshot_bytes))

        if is_first:
            instruction = user_prompt or self.traj_memory.task_goal
        else:
            instruction = self.traj_memory.task_goal

        screen_info = MessageBuilder.build_screen_info(current_app)

        messages = self._build_messages(
            instruction=instruction,
            screen_info=screen_info,
            current_screenshot_base64=screenshot.base64_data,
        )

        max_retries = 3
        raw_content = ""
        thinking = ""
        raw_action = None
        converted_action = None

        for attempt in range(max_retries):
            try:
                if self.agent_config.verbose:
                    retry_info = (
                        f" (尝试 {attempt + 1}/{max_retries})" if attempt > 0 else ""
                    )
                    print("\n" + "=" * 50)
                    print(f"💭 步骤 {self._step_count}{retry_info} - 思考中...")
                    print("-" * 50)

                # In verbose mode with no external sink, echo thinking to stdout.
                callback = self._thinking_callback
                if callback is None and self.agent_config.verbose:

                    def print_chunk(chunk: str) -> None:
                        print(chunk, end="", flush=True)

                    callback = print_chunk

                llm_start = time.time()
                raw_content = self._stream_request(messages, on_thinking_chunk=callback)
                llm_time = time.time() - llm_start
                self._total_llm_time += llm_time

                if self.agent_config.verbose:
                    print(f"\n⏱️ LLM 耗时: {llm_time:.2f}s")

                parsed = self.parser.parse_with_thinking(raw_content)
                thinking = parsed["thinking"]
                raw_action = parsed["raw_action"]
                converted_action = parsed["converted_action"]

                break

            except MAIParseError as e:
                # Malformed model output: retry up to max_retries, then give up.
                if self.agent_config.verbose:
                    logger.warning(f"解析失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Parse error after {max_retries} retries: {e}",
                    )
                continue

            except Exception as e:
                # Transport/model failures are retried the same way.
                if self.agent_config.verbose:
                    logger.warning(
                        f"模型调用失败 (尝试 {attempt + 1}/{max_retries}): {e}"
                    )
                if attempt == max_retries - 1:
                    if self.agent_config.verbose:
                        traceback.print_exc()
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Model error after {max_retries} retries: {e}",
                    )
                continue

        if not raw_content or raw_action is None or converted_action is None:
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking=thinking,
                message="Failed to get valid response after retries",
            )

        if self.agent_config.verbose:
            print()
            print("-" * 50)
            print("🎯 动作:")
            print(f" 原始: {raw_action}")
            print(f" 转换: {converted_action}")
            print("=" * 50 + "\n")

        # Record the step before executing, so memory reflects the decision
        # even if the action itself fails.
        traj_step = TrajStep(
            screenshot=pil_image,
            accessibility_tree=None,
            prediction=raw_content,
            action=raw_action,
            conclusion="",
            thought=thinking,
            step_index=self._step_count - 1,
            agent_type="InternalMAIAgent",
            model_name=self.model_config.model_name,
            screenshot_bytes=screenshot_bytes,
            structured_action={"action_json": raw_action},
        )
        self.traj_memory.add_step(traj_step)

        try:
            action_start = time.time()
            result = self.action_handler.execute(
                converted_action, screenshot.width, screenshot.height
            )
            action_time = time.time() - action_start
            self._total_action_time += action_time

            if self.agent_config.verbose:
                print(f"⚡ 动作执行耗时: {action_time:.2f}s")
        except Exception as e:
            if self.agent_config.verbose:
                traceback.print_exc()
            result = ActionResult(success=False, should_finish=True, message=str(e))

        finished = converted_action.get("_metadata") == "finish" or result.should_finish

        if finished and self.agent_config.verbose:
            print("\n" + "🎉 " + "=" * 48)
            print(
                f"✅ 任务完成: {result.message or converted_action.get('message', '完成')}"
            )
            print("=" * 50)
            print("\n📊 性能统计:")
            print(f" 总步数: {self._step_count}")
            print(f" 总 LLM 耗时: {self._total_llm_time:.2f}s")
            print(f" 总动作耗时: {self._total_action_time:.2f}s")
            print(
                f" 平均每步耗时: {(self._total_llm_time + self._total_action_time) / self._step_count:.2f}s"
            )
            if self._total_tokens > 0:
                print(f" 总 Token 使用: {self._total_tokens}")
            print("=" * 50 + "\n")

        return StepResult(
            success=result.success,
            finished=finished,
            action=converted_action,
            thinking=thinking,
            message=result.message or converted_action.get("message"),
        )

    def _build_messages(
        self, instruction: str, screen_info: str, current_screenshot_base64: str
    ) -> list[dict[str, Any]]:
        """Assemble the chat history: system + task, then (screenshot,
        assistant-reply) pairs for past steps, then the current screenshot.
        """
        # Hoisted out of the history loop (was re-imported every iteration).
        import json

        system_prompt = self.agent_config.system_prompt or MAI_MOBILE_SYSTEM_PROMPT

        messages: list[dict[str, Any]] = [
            MessageBuilder.create_system_message(system_prompt),
            MessageBuilder.create_user_message(f"{instruction}\n\n{screen_info}"),
        ]

        history_images = self.traj_memory.get_history_images(self.history_n - 1)
        history_thoughts = self.traj_memory.get_history_thoughts(self.history_n - 1)
        history_actions = self.traj_memory.get_history_actions(self.history_n - 1)

        for img_bytes, thought, action in zip(
            history_images, history_thoughts, history_actions
        ):
            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
            messages.append(
                MessageBuilder.create_user_message(
                    text=screen_info, image_base64=img_base64
                )
            )

            # Replay the past step as an assistant turn in the exact XML +
            # JSON shape the model is expected to emit.
            tool_call_dict = {
                "name": "mobile_use",
                "arguments": action,
            }
            tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
            assistant_content = (
                f"<thinking>\n{thought}\n</thinking>\n"
                f"<tool_call>\n{tool_call_json}\n</tool_call>"
            )
            messages.append(MessageBuilder.create_assistant_message(assistant_content))

        messages.append(
            MessageBuilder.create_user_message(
                text=screen_info, image_base64=current_screenshot_base64
            )
        )

        return messages

    @property
    def context(self) -> list[dict[str, Any]]:
        """Lightweight view of the trajectory: step index, thought, action."""
        return [
            {
                "step": step.step_index,
                "thought": step.thought,
                "action": step.action,
            }
            for step in self.traj_memory.steps
        ]

    @property
    def step_count(self) -> int:
        """Number of steps executed in the current task."""
        return self._step_count

    @property
    def is_running(self) -> bool:
        """True while a run() loop is active."""
        return self._is_running
|
@@ -0,0 +1,254 @@
|
|
|
1
|
+
"""MAI Agent parser using XML tags and JSON.
|
|
2
|
+
|
|
3
|
+
从 mai_agent 的 XML 格式中提取 thinking 和 action,并转换为
|
|
4
|
+
AutoGLM_GUI 的标准格式。
|
|
5
|
+
|
|
6
|
+
迁移说明:基于原有实现增强,添加 parse_with_thinking 方法。
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
import json
|
|
10
|
+
import re
|
|
11
|
+
from typing import Any
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
SCALE_FACTOR = 999
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class MAIParseError(ValueError):
|
|
18
|
+
pass
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
class MAIParser:
|
|
22
|
+
"""Parse MAI Agent XML + JSON format outputs.
|
|
23
|
+
|
|
24
|
+
Handles format like:
|
|
25
|
+
<thinking>Reasoning process</thinking>
|
|
26
|
+
<tool_call>{"name": "mobile_use", "arguments": {...}}</tool_call>
|
|
27
|
+
|
|
28
|
+
Converts MAI-specific actions to standard ActionHandler format.
|
|
29
|
+
Coordinate scale: 0-999 (automatically converted to 0-1000)
|
|
30
|
+
"""
|
|
31
|
+
|
|
32
|
+
@property
|
|
33
|
+
def coordinate_scale(self) -> int:
|
|
34
|
+
return 999
|
|
35
|
+
|
|
36
|
+
def parse_with_thinking(self, raw_response: str) -> dict[str, Any]:
|
|
37
|
+
text = raw_response.strip()
|
|
38
|
+
|
|
39
|
+
if "</think>" in text and "</thinking>" not in text:
|
|
40
|
+
text = text.replace("</think>", "</thinking>")
|
|
41
|
+
text = "<thinking>" + text
|
|
42
|
+
|
|
43
|
+
pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
|
|
44
|
+
match = re.search(pattern, text, re.DOTALL)
|
|
45
|
+
|
|
46
|
+
if not match:
|
|
47
|
+
raise MAIParseError("Failed to find <thinking> and <tool_call> tags")
|
|
48
|
+
|
|
49
|
+
thinking = match.group(1).strip().strip('"')
|
|
50
|
+
tool_call_str = match.group(2).strip().strip('"')
|
|
51
|
+
|
|
52
|
+
try:
|
|
53
|
+
tool_call = json.loads(tool_call_str)
|
|
54
|
+
except json.JSONDecodeError as e:
|
|
55
|
+
raise MAIParseError(f"Invalid JSON in tool_call: {e}") from e
|
|
56
|
+
|
|
57
|
+
mai_action = tool_call.get("arguments", {})
|
|
58
|
+
|
|
59
|
+
if "coordinate" in mai_action:
|
|
60
|
+
mai_action["coordinate"] = self._normalize_coordinate_to_0_1(
|
|
61
|
+
mai_action["coordinate"]
|
|
62
|
+
)
|
|
63
|
+
|
|
64
|
+
return {
|
|
65
|
+
"thinking": thinking,
|
|
66
|
+
"raw_action": mai_action,
|
|
67
|
+
"converted_action": self._convert_action(mai_action),
|
|
68
|
+
}
|
|
69
|
+
|
|
70
|
+
def _normalize_coordinate_to_0_1(
|
|
71
|
+
self, coordinate: list[int | float]
|
|
72
|
+
) -> list[float]:
|
|
73
|
+
if len(coordinate) == 2:
|
|
74
|
+
x, y = coordinate
|
|
75
|
+
elif len(coordinate) == 4:
|
|
76
|
+
x1, y1, x2, y2 = coordinate
|
|
77
|
+
x = (x1 + x2) / 2
|
|
78
|
+
y = (y1 + y2) / 2
|
|
79
|
+
else:
|
|
80
|
+
raise MAIParseError(
|
|
81
|
+
f"Invalid coordinate format: expected 2 or 4 values, got {len(coordinate)}"
|
|
82
|
+
)
|
|
83
|
+
|
|
84
|
+
return [x / SCALE_FACTOR, y / SCALE_FACTOR]
|
|
85
|
+
|
|
86
|
+
def parse(self, raw_response: str) -> dict[str, Any]:
|
|
87
|
+
"""Parse MAI agent XML+JSON output.
|
|
88
|
+
|
|
89
|
+
Args:
|
|
90
|
+
raw_response: Model output containing <thinking> and <tool_call> tags.
|
|
91
|
+
|
|
92
|
+
Returns:
|
|
93
|
+
Standardized action dictionary with coordinates converted to 0-1000 scale.
|
|
94
|
+
|
|
95
|
+
Raises:
|
|
96
|
+
ValueError: If parsing fails or content is invalid JSON.
|
|
97
|
+
"""
|
|
98
|
+
text = raw_response.strip()
|
|
99
|
+
|
|
100
|
+
if "</think>" in text and "</thinking>" not in text:
|
|
101
|
+
text = text.replace("</think>", "</thinking>")
|
|
102
|
+
text = "<thinking>" + text
|
|
103
|
+
|
|
104
|
+
pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
|
|
105
|
+
match = re.search(pattern, text, re.DOTALL)
|
|
106
|
+
|
|
107
|
+
if not match:
|
|
108
|
+
raise ValueError("Failed to find <thinking> and <tool_call> tags")
|
|
109
|
+
|
|
110
|
+
tool_call_str = match.group(2).strip().strip('"')
|
|
111
|
+
|
|
112
|
+
try:
|
|
113
|
+
tool_call = json.loads(tool_call_str)
|
|
114
|
+
except json.JSONDecodeError as e:
|
|
115
|
+
raise ValueError(f"Invalid JSON in tool_call: {e}") from e
|
|
116
|
+
|
|
117
|
+
mai_action = tool_call.get("arguments", {})
|
|
118
|
+
return self._convert_action(mai_action)
|
|
119
|
+
|
|
120
|
+
def _convert_action(self, mai_action: dict[str, Any]) -> dict[str, Any]:
|
|
121
|
+
"""Convert MAI action format to standard ActionHandler format.
|
|
122
|
+
|
|
123
|
+
MAI format: {"action": "click", "coordinate": [x, y]}
|
|
124
|
+
Standard format: {"_metadata": "do", "action": "Tap", "element": [x, y]}
|
|
125
|
+
"""
|
|
126
|
+
action_type = mai_action.get("action")
|
|
127
|
+
|
|
128
|
+
if action_type == "terminate":
|
|
129
|
+
status = mai_action.get("status", "success")
|
|
130
|
+
return {
|
|
131
|
+
"_metadata": "finish",
|
|
132
|
+
"message": "Task completed" if status == "success" else "Task failed",
|
|
133
|
+
}
|
|
134
|
+
|
|
135
|
+
if action_type == "answer":
|
|
136
|
+
return {
|
|
137
|
+
"_metadata": "finish",
|
|
138
|
+
"message": mai_action.get("text", ""),
|
|
139
|
+
}
|
|
140
|
+
|
|
141
|
+
if action_type == "wait":
|
|
142
|
+
return {
|
|
143
|
+
"_metadata": "do",
|
|
144
|
+
"action": "Wait",
|
|
145
|
+
"duration": "1 seconds",
|
|
146
|
+
}
|
|
147
|
+
|
|
148
|
+
if action_type == "system_button":
|
|
149
|
+
button_name = mai_action.get("button", "")
|
|
150
|
+
action_map = {
|
|
151
|
+
"back": "Back",
|
|
152
|
+
"home": "Home",
|
|
153
|
+
"enter": "Enter",
|
|
154
|
+
}
|
|
155
|
+
return {
|
|
156
|
+
"_metadata": "do",
|
|
157
|
+
"action": action_map.get(button_name, "Back"),
|
|
158
|
+
}
|
|
159
|
+
|
|
160
|
+
coordinate = mai_action.get("coordinate")
|
|
161
|
+
if coordinate:
|
|
162
|
+
x = self._convert_coordinate(coordinate[0])
|
|
163
|
+
y = self._convert_coordinate(coordinate[1])
|
|
164
|
+
|
|
165
|
+
if action_type == "click":
|
|
166
|
+
return {
|
|
167
|
+
"_metadata": "do",
|
|
168
|
+
"action": "Tap",
|
|
169
|
+
"element": [x, y],
|
|
170
|
+
}
|
|
171
|
+
elif action_type == "long_press":
|
|
172
|
+
return {
|
|
173
|
+
"_metadata": "do",
|
|
174
|
+
"action": "Long Press",
|
|
175
|
+
"element": [x, y],
|
|
176
|
+
}
|
|
177
|
+
elif action_type == "double_click":
|
|
178
|
+
return {
|
|
179
|
+
"_metadata": "do",
|
|
180
|
+
"action": "Double Tap",
|
|
181
|
+
"element": [x, y],
|
|
182
|
+
}
|
|
183
|
+
|
|
184
|
+
if action_type == "swipe":
|
|
185
|
+
direction = mai_action.get("direction", "up")
|
|
186
|
+
coordinate = mai_action.get("coordinate") or [0.5, 0.5]
|
|
187
|
+
x = self._convert_coordinate(coordinate[0])
|
|
188
|
+
y = self._convert_coordinate(coordinate[1])
|
|
189
|
+
|
|
190
|
+
start, end = self._calculate_swipe_coordinates(direction, x, y)
|
|
191
|
+
return {
|
|
192
|
+
"_metadata": "do",
|
|
193
|
+
"action": "Swipe",
|
|
194
|
+
"start": start,
|
|
195
|
+
"end": end,
|
|
196
|
+
}
|
|
197
|
+
|
|
198
|
+
if action_type == "drag":
|
|
199
|
+
start_coord = mai_action.get("start_coordinate", [0, 0])
|
|
200
|
+
end_coord = mai_action.get("end_coordinate", [0, 0])
|
|
201
|
+
|
|
202
|
+
start = [
|
|
203
|
+
self._convert_coordinate_from_scale_factor(start_coord[0]),
|
|
204
|
+
self._convert_coordinate_from_scale_factor(start_coord[1]),
|
|
205
|
+
]
|
|
206
|
+
end = [
|
|
207
|
+
self._convert_coordinate_from_scale_factor(end_coord[0]),
|
|
208
|
+
self._convert_coordinate_from_scale_factor(end_coord[1]),
|
|
209
|
+
]
|
|
210
|
+
return {
|
|
211
|
+
"_metadata": "do",
|
|
212
|
+
"action": "Swipe",
|
|
213
|
+
"start": start,
|
|
214
|
+
"end": end,
|
|
215
|
+
}
|
|
216
|
+
|
|
217
|
+
if action_type == "type":
|
|
218
|
+
return {
|
|
219
|
+
"_metadata": "do",
|
|
220
|
+
"action": "Type",
|
|
221
|
+
"text": mai_action.get("text", ""),
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
if action_type == "open":
|
|
225
|
+
return {
|
|
226
|
+
"_metadata": "do",
|
|
227
|
+
"action": "Launch",
|
|
228
|
+
"app": mai_action.get("app", ""),
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
raise ValueError(f"Unknown MAI action type: {action_type}")
|
|
232
|
+
|
|
233
|
+
def _convert_coordinate(self, value: float) -> int:
|
|
234
|
+
"""Convert MAI normalized coordinate [0, 1] to standard scale [0, 1000]."""
|
|
235
|
+
return int(value * 1000)
|
|
236
|
+
|
|
237
|
+
def _convert_coordinate_from_scale_factor(self, value: float) -> int:
|
|
238
|
+
"""Convert MAI scale factor coordinate [0, 999] to standard scale [0, 1000]."""
|
|
239
|
+
return int((value / SCALE_FACTOR) * 1000)
|
|
240
|
+
|
|
241
|
+
def _calculate_swipe_coordinates(
|
|
242
|
+
self, direction: str, x: int, y: int
|
|
243
|
+
) -> tuple[list[int], list[int]]:
|
|
244
|
+
"""Calculate start and end coordinates for swipe based on direction."""
|
|
245
|
+
swipe_distance = 300
|
|
246
|
+
|
|
247
|
+
direction_map = {
|
|
248
|
+
"up": ([x, y + swipe_distance], [x, y - swipe_distance]),
|
|
249
|
+
"down": ([x, y - swipe_distance], [x, y + swipe_distance]),
|
|
250
|
+
"left": ([x + swipe_distance, y], [x - swipe_distance, y]),
|
|
251
|
+
"right": ([x - swipe_distance, y], [x + swipe_distance, y]),
|
|
252
|
+
}
|
|
253
|
+
|
|
254
|
+
return direction_map.get(direction, ([x, y], [x, y]))
|