autoglm-gui 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104):
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. AutoGLM_GUI/actions/handler.py +196 -0
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. AutoGLM_GUI/adb/__init__.py +53 -0
  7. AutoGLM_GUI/adb/apps.py +227 -0
  8. AutoGLM_GUI/adb/connection.py +323 -0
  9. AutoGLM_GUI/adb/device.py +171 -0
  10. AutoGLM_GUI/adb/input.py +67 -0
  11. AutoGLM_GUI/adb/screenshot.py +11 -0
  12. AutoGLM_GUI/adb/timing.py +167 -0
  13. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  14. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  15. AutoGLM_GUI/adb_plus/serial.py +38 -20
  16. AutoGLM_GUI/adb_plus/touch.py +4 -9
  17. AutoGLM_GUI/agents/__init__.py +43 -12
  18. AutoGLM_GUI/agents/events.py +19 -0
  19. AutoGLM_GUI/agents/factory.py +31 -38
  20. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  21. AutoGLM_GUI/agents/glm/agent.py +292 -0
  22. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  23. AutoGLM_GUI/agents/glm/parser.py +110 -0
  24. AutoGLM_GUI/agents/glm/prompts_en.py +77 -0
  25. AutoGLM_GUI/agents/glm/prompts_zh.py +75 -0
  26. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  27. AutoGLM_GUI/agents/mai/agent.py +405 -0
  28. AutoGLM_GUI/agents/mai/parser.py +254 -0
  29. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  30. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  31. AutoGLM_GUI/agents/protocols.py +12 -8
  32. AutoGLM_GUI/agents/stream_runner.py +188 -0
  33. AutoGLM_GUI/api/__init__.py +40 -21
  34. AutoGLM_GUI/api/agents.py +157 -240
  35. AutoGLM_GUI/api/control.py +9 -6
  36. AutoGLM_GUI/api/devices.py +102 -12
  37. AutoGLM_GUI/api/history.py +78 -0
  38. AutoGLM_GUI/api/layered_agent.py +67 -15
  39. AutoGLM_GUI/api/media.py +64 -1
  40. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  41. AutoGLM_GUI/config.py +81 -0
  42. AutoGLM_GUI/config_manager.py +68 -51
  43. AutoGLM_GUI/device_manager.py +248 -29
  44. AutoGLM_GUI/device_protocol.py +1 -1
  45. AutoGLM_GUI/devices/adb_device.py +5 -10
  46. AutoGLM_GUI/devices/mock_device.py +4 -2
  47. AutoGLM_GUI/devices/remote_device.py +8 -3
  48. AutoGLM_GUI/history_manager.py +164 -0
  49. AutoGLM_GUI/i18n.py +81 -0
  50. AutoGLM_GUI/model/__init__.py +5 -0
  51. AutoGLM_GUI/model/message_builder.py +69 -0
  52. AutoGLM_GUI/model/types.py +24 -0
  53. AutoGLM_GUI/models/__init__.py +10 -0
  54. AutoGLM_GUI/models/history.py +96 -0
  55. AutoGLM_GUI/models/scheduled_task.py +71 -0
  56. AutoGLM_GUI/parsers/__init__.py +22 -0
  57. AutoGLM_GUI/parsers/base.py +50 -0
  58. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  59. AutoGLM_GUI/phone_agent_manager.py +62 -396
  60. AutoGLM_GUI/platform_utils.py +26 -0
  61. AutoGLM_GUI/prompt_config.py +15 -0
  62. AutoGLM_GUI/prompts/__init__.py +32 -0
  63. AutoGLM_GUI/scheduler_manager.py +304 -0
  64. AutoGLM_GUI/schemas.py +234 -72
  65. AutoGLM_GUI/scrcpy_stream.py +142 -24
  66. AutoGLM_GUI/socketio_server.py +100 -27
  67. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-BQm96DAl.js} +1 -1
  68. AutoGLM_GUI/static/assets/alert-dialog-B42XxGPR.js +1 -0
  69. AutoGLM_GUI/static/assets/chat-C0L2gQYG.js +129 -0
  70. AutoGLM_GUI/static/assets/circle-alert-D4rSJh37.js +1 -0
  71. AutoGLM_GUI/static/assets/dialog-DZ78cEcj.js +45 -0
  72. AutoGLM_GUI/static/assets/history-DFBv7TGc.js +1 -0
  73. AutoGLM_GUI/static/assets/index-Bzyv2yQ2.css +1 -0
  74. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-CmZSnDqc.js} +1 -1
  75. AutoGLM_GUI/static/assets/index-CssG-3TH.js +11 -0
  76. AutoGLM_GUI/static/assets/label-BCUzE_nm.js +1 -0
  77. AutoGLM_GUI/static/assets/logs-eoFxn5of.js +1 -0
  78. AutoGLM_GUI/static/assets/popover-DLsuV5Sx.js +1 -0
  79. AutoGLM_GUI/static/assets/scheduled-tasks-MyqGJvy_.js +1 -0
  80. AutoGLM_GUI/static/assets/square-pen-zGWYrdfj.js +1 -0
  81. AutoGLM_GUI/static/assets/textarea-BX6y7uM5.js +1 -0
  82. AutoGLM_GUI/static/assets/workflows-CYFs6ssC.js +1 -0
  83. AutoGLM_GUI/static/index.html +2 -2
  84. AutoGLM_GUI/types.py +17 -0
  85. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/METADATA +137 -130
  86. autoglm_gui-1.5.0.dist-info/RECORD +157 -0
  87. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  88. AutoGLM_GUI/api/dual_model.py +0 -317
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/WHEEL +0 -0
  103. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/entry_points.txt +0 -0
  104. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,405 @@
1
+ """Internal MAI Agent Implementation
2
+
3
+ 完全内部化实现的 MAI Agent,替代第三方 mai_agent 依赖。
4
+
5
+ 核心特性:
6
+ - 多图像历史上下文(保留最近 N 张截图)
7
+ - XML 格式的思考过程和动作输出
8
+ - 999 坐标系统归一化
9
+ - 自动重试机制
10
+ """
11
+
12
+ import base64
13
+ import time
14
+ import traceback
15
+ from io import BytesIO
16
+ from typing import Any, Callable
17
+
18
+ from openai import OpenAI
19
+ from PIL import Image
20
+
21
+ from AutoGLM_GUI.actions import ActionHandler, ActionResult
22
+ from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
23
+ from AutoGLM_GUI.device_protocol import DeviceProtocol
24
+ from AutoGLM_GUI.logger import logger
25
+ from AutoGLM_GUI.model import MessageBuilder
26
+
27
+ from .traj_memory import TrajMemory, TrajStep
28
+ from .parser import MAIParseError, MAIParser
29
+ from .prompts import MAI_MOBILE_SYSTEM_PROMPT
30
+
31
+
32
class InternalMAIAgent:
    """Internally implemented MAI agent that drives a device through an LLM.

    Replaces the third-party ``mai_agent`` dependency. Core features:

    - multi-image history context (keeps the most recent ``history_n`` screenshots)
    - XML-formatted thinking/action output (``<thinking>`` + ``<tool_call>``)
    - 0-999 coordinate-grid normalization (delegated to :class:`MAIParser`)
    - automatic retry on model-call and parse failures
    """

    def __init__(
        self,
        model_config: ModelConfig,
        agent_config: AgentConfig,
        device: DeviceProtocol,
        history_n: int = 3,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
        thinking_callback: Callable[[str], None] | None = None,
    ):
        """Create the agent.

        Args:
            model_config: LLM endpoint/model parameters.
            agent_config: Agent behaviour settings (``max_steps``, ``verbose``, ...).
            device: Device abstraction used for screenshots and input actions.
            history_n: Number of screenshots kept as multi-image history context.
            confirmation_callback: Optional hook consulted before sensitive actions.
            takeover_callback: Optional hook invoked when manual takeover is needed.
            thinking_callback: Optional sink for streamed "thinking" text chunks.
        """
        self.model_config = model_config
        self.agent_config = agent_config
        self.history_n = history_n

        self.openai_client = OpenAI(
            base_url=model_config.base_url,
            api_key=model_config.api_key,
            timeout=120,
        )
        self.parser = MAIParser()

        self.device = device
        self.action_handler = ActionHandler(
            device=self.device,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        self.traj_memory = TrajMemory(task_goal="", task_id="", steps=[])
        self._step_count = 0
        self._is_running = False
        self._thinking_callback = thinking_callback

        # Aggregate statistics reported when a task finishes.
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        # NOTE(review): _total_tokens is never incremented anywhere in this
        # class; the final report only prints it when > 0.
        self._total_tokens = 0

    def run(self, task: str) -> str:
        """Run *task* to completion (or until ``max_steps``) and return a summary.

        Resets trajectory memory, then loops single steps until the agent
        reports ``finished``, the step budget is exhausted, or :meth:`abort`
        clears the running flag.
        """
        self.traj_memory = TrajMemory(task_goal=task, task_id="", steps=[])
        self._step_count = 0
        self._is_running = True

        try:
            result = self._execute_step(task, is_first=True)

            if result.finished:
                return result.message or "Task completed"

            while self._step_count < self.agent_config.max_steps and self._is_running:
                result = self._execute_step(is_first=False)

                if result.finished:
                    return result.message or "Task completed"

            return "Max steps reached"
        finally:
            self._is_running = False

    def step(self, task: str | None = None) -> StepResult:
        """Execute a single agent step.

        Args:
            task: The task goal; required on the first step, ignored afterwards.

        Raises:
            ValueError: If *task* is missing on the first step.
        """
        is_first = len(self.traj_memory.steps) == 0

        if is_first and not task:
            raise ValueError("Task is required for the first step")

        if is_first:
            self.traj_memory.task_goal = task or ""

        return self._execute_step(task, is_first)

    def reset(self) -> None:
        """Clear trajectory memory, counters and timing statistics."""
        self.traj_memory.clear()
        self._step_count = 0
        self._is_running = False
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        self._total_tokens = 0

    def abort(self) -> None:
        """Request cooperative cancellation of a running task."""
        self._is_running = False
        logger.info("InternalMAIAgent aborted by user")

    def _stream_request(
        self,
        messages: list[dict[str, Any]],
        on_thinking_chunk: Callable[[str], None] | None = None,
    ) -> str:
        """Stream a chat completion, forwarding thinking text incrementally.

        Text before the first ``</thinking>`` / ``<tool_call>`` marker is
        treated as "thinking" and forwarded chunk-wise to *on_thinking_chunk*;
        everything after the marker is accumulated silently.

        Returns:
            The complete raw response text (thinking and action parts).
        """
        stream = self.openai_client.chat.completions.create(
            messages=messages,  # type: ignore[arg-type]
            model=self.model_config.model_name,
            max_tokens=self.model_config.max_tokens,
            temperature=self.model_config.temperature,
            top_p=self.model_config.top_p,
            frequency_penalty=self.model_config.frequency_penalty,
            extra_body=self.model_config.extra_body,
            stream=True,
        )

        raw_content = ""
        buffer = ""
        action_markers = ["</thinking>", "<tool_call>"]
        in_action_phase = False

        for chunk in stream:
            if len(chunk.choices) == 0:
                continue
            if chunk.choices[0].delta.content is not None:
                content = chunk.choices[0].delta.content
                raw_content += content

                if in_action_phase:
                    continue

                buffer += content

                # A full marker arrived: emit the thinking text before it and
                # stop forwarding from here on.
                marker_found = False
                for marker in action_markers:
                    if marker in buffer:
                        thinking_part = buffer.split(marker, 1)[0]
                        if on_thinking_chunk:
                            on_thinking_chunk(thinking_part)
                        in_action_phase = True
                        marker_found = True
                        break

                if marker_found:
                    continue

                # Hold the buffer back if it ends with a *prefix* of a marker;
                # the remainder of the marker may arrive in the next chunk.
                is_potential_marker = False
                for marker in action_markers:
                    for i in range(1, len(marker)):
                        if buffer.endswith(marker[:i]):
                            is_potential_marker = True
                            break
                    if is_potential_marker:
                        break

                if not is_potential_marker:
                    if on_thinking_chunk:
                        on_thinking_chunk(buffer)
                    buffer = ""

        # Fix: if the stream ends without ever reaching an action marker,
        # flush the residual thinking text (it was previously dropped).
        if not in_action_phase and buffer and on_thinking_chunk:
            on_thinking_chunk(buffer)

        return raw_content

    def _execute_step(
        self, user_prompt: str | None = None, is_first: bool = False
    ) -> StepResult:
        """Run one observe -> think -> act cycle and return its :class:`StepResult`.

        Retries the LLM call / parsing up to 3 times; the resulting action is
        executed via :class:`ActionHandler` and appended to trajectory memory.
        """
        self._step_count += 1

        # Observe: capture the current screen and foreground app.
        screenshot = self.device.get_screenshot()
        current_app = self.device.get_current_app()

        screenshot_bytes = base64.b64decode(screenshot.base64_data)
        pil_image = Image.open(BytesIO(screenshot_bytes))

        if is_first:
            instruction = user_prompt or self.traj_memory.task_goal
        else:
            instruction = self.traj_memory.task_goal

        screen_info = MessageBuilder.build_screen_info(current_app)

        messages = self._build_messages(
            instruction=instruction,
            screen_info=screen_info,
            current_screenshot_base64=screenshot.base64_data,
        )

        max_retries = 3
        raw_content = ""
        thinking = ""
        raw_action = None
        converted_action = None

        for attempt in range(max_retries):
            try:
                if self.agent_config.verbose:
                    retry_info = (
                        f" (尝试 {attempt + 1}/{max_retries})" if attempt > 0 else ""
                    )
                    print("\n" + "=" * 50)
                    print(f"💭 步骤 {self._step_count}{retry_info} - 思考中...")
                    print("-" * 50)

                # Prefer the injected callback; in verbose mode fall back to
                # echoing thinking chunks straight to stdout.
                callback = self._thinking_callback
                if callback is None and self.agent_config.verbose:

                    def print_chunk(chunk: str) -> None:
                        print(chunk, end="", flush=True)

                    callback = print_chunk

                llm_start = time.time()
                raw_content = self._stream_request(messages, on_thinking_chunk=callback)
                llm_time = time.time() - llm_start
                self._total_llm_time += llm_time

                if self.agent_config.verbose:
                    print(f"\n⏱️ LLM 耗时: {llm_time:.2f}s")

                parsed = self.parser.parse_with_thinking(raw_content)
                thinking = parsed["thinking"]
                raw_action = parsed["raw_action"]
                converted_action = parsed["converted_action"]

                break

            except MAIParseError as e:
                if self.agent_config.verbose:
                    logger.warning(f"解析失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Parse error after {max_retries} retries: {e}",
                    )
                continue

            except Exception as e:
                if self.agent_config.verbose:
                    logger.warning(
                        f"模型调用失败 (尝试 {attempt + 1}/{max_retries}): {e}"
                    )
                if attempt == max_retries - 1:
                    if self.agent_config.verbose:
                        traceback.print_exc()
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Model error after {max_retries} retries: {e}",
                    )
                continue

        if not raw_content or raw_action is None or converted_action is None:
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking=thinking,
                message="Failed to get valid response after retries",
            )

        if self.agent_config.verbose:
            print()
            print("-" * 50)
            print("🎯 动作:")
            print(f" 原始: {raw_action}")
            print(f" 转换: {converted_action}")
            print("=" * 50 + "\n")

        # Record this step before executing, so memory reflects the decision
        # even if the action itself fails.
        traj_step = TrajStep(
            screenshot=pil_image,
            accessibility_tree=None,
            prediction=raw_content,
            action=raw_action,
            conclusion="",
            thought=thinking,
            step_index=self._step_count - 1,
            agent_type="InternalMAIAgent",
            model_name=self.model_config.model_name,
            screenshot_bytes=screenshot_bytes,
            structured_action={"action_json": raw_action},
        )
        self.traj_memory.add_step(traj_step)

        try:
            action_start = time.time()
            result = self.action_handler.execute(
                converted_action, screenshot.width, screenshot.height
            )
            action_time = time.time() - action_start
            self._total_action_time += action_time

            if self.agent_config.verbose:
                print(f"⚡ 动作执行耗时: {action_time:.2f}s")
        except Exception as e:
            # Action failures terminate the task with the error as its message.
            if self.agent_config.verbose:
                traceback.print_exc()
            result = ActionResult(success=False, should_finish=True, message=str(e))

        finished = converted_action.get("_metadata") == "finish" or result.should_finish

        if finished and self.agent_config.verbose:
            print("\n" + "🎉 " + "=" * 48)
            print(
                f"✅ 任务完成: {result.message or converted_action.get('message', '完成')}"
            )
            print("=" * 50)
            print("\n📊 性能统计:")
            print(f" 总步数: {self._step_count}")
            print(f" 总 LLM 耗时: {self._total_llm_time:.2f}s")
            print(f" 总动作耗时: {self._total_action_time:.2f}s")
            print(
                f" 平均每步耗时: {(self._total_llm_time + self._total_action_time) / self._step_count:.2f}s"
            )
            if self._total_tokens > 0:
                print(f" 总 Token 使用: {self._total_tokens}")
            print("=" * 50 + "\n")

        return StepResult(
            success=result.success,
            finished=finished,
            action=converted_action,
            thinking=thinking,
            message=result.message or converted_action.get("message"),
        )

    def _build_messages(
        self, instruction: str, screen_info: str, current_screenshot_base64: str
    ) -> list[dict[str, Any]]:
        """Build the chat message list sent to the model.

        Layout: system prompt, task instruction, then up to ``history_n - 1``
        replayed (user screenshot, assistant ``<thinking>``/``<tool_call>``)
        turns, and finally the current screenshot.
        """
        # Fix: hoisted out of the history loop (it was re-imported on every
        # iteration in the original).
        import json

        system_prompt = self.agent_config.system_prompt or MAI_MOBILE_SYSTEM_PROMPT

        messages: list[dict[str, Any]] = [
            MessageBuilder.create_system_message(system_prompt),
            MessageBuilder.create_user_message(f"{instruction}\n\n{screen_info}"),
        ]

        history_images = self.traj_memory.get_history_images(self.history_n - 1)
        history_thoughts = self.traj_memory.get_history_thoughts(self.history_n - 1)
        history_actions = self.traj_memory.get_history_actions(self.history_n - 1)

        for img_bytes, thought, action in zip(
            history_images, history_thoughts, history_actions
        ):
            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
            messages.append(
                MessageBuilder.create_user_message(
                    text=screen_info, image_base64=img_base64
                )
            )

            # Replay the assistant turn in the exact XML + compact-JSON shape
            # the parser expects.
            tool_call_dict = {
                "name": "mobile_use",
                "arguments": action,
            }
            tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
            assistant_content = (
                f"<thinking>\n{thought}\n</thinking>\n"
                f"<tool_call>\n{tool_call_json}\n</tool_call>"
            )
            messages.append(MessageBuilder.create_assistant_message(assistant_content))

        messages.append(
            MessageBuilder.create_user_message(
                text=screen_info, image_base64=current_screenshot_base64
            )
        )

        return messages

    @property
    def context(self) -> list[dict[str, Any]]:
        """Lightweight view of the trajectory: step index, thought and action."""
        return [
            {
                "step": step.step_index,
                "thought": step.thought,
                "action": step.action,
            }
            for step in self.traj_memory.steps
        ]

    @property
    def step_count(self) -> int:
        """Number of steps executed so far in the current task."""
        return self._step_count

    @property
    def is_running(self) -> bool:
        """Whether a task is currently executing (cleared by :meth:`abort`)."""
        return self._is_running
@@ -0,0 +1,254 @@
1
+ """MAI Agent parser using XML tags and JSON.
2
+
3
+ 从 mai_agent 的 XML 格式中提取 thinking 和 action,并转换为
4
+ AutoGLM_GUI 的标准格式。
5
+
6
+ 迁移说明:基于原有实现增强,添加 parse_with_thinking 方法。
7
+ """
8
+
9
+ import json
10
+ import re
11
+ from typing import Any
12
+
13
+
14
+ SCALE_FACTOR = 999
15
+
16
+
17
+ class MAIParseError(ValueError):
18
+ pass
19
+
20
+
21
+ class MAIParser:
22
+ """Parse MAI Agent XML + JSON format outputs.
23
+
24
+ Handles format like:
25
+ <thinking>Reasoning process</thinking>
26
+ <tool_call>{"name": "mobile_use", "arguments": {...}}</tool_call>
27
+
28
+ Converts MAI-specific actions to standard ActionHandler format.
29
+ Coordinate scale: 0-999 (automatically converted to 0-1000)
30
+ """
31
+
32
+ @property
33
+ def coordinate_scale(self) -> int:
34
+ return 999
35
+
36
+ def parse_with_thinking(self, raw_response: str) -> dict[str, Any]:
37
+ text = raw_response.strip()
38
+
39
+ if "</think>" in text and "</thinking>" not in text:
40
+ text = text.replace("</think>", "</thinking>")
41
+ text = "<thinking>" + text
42
+
43
+ pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
44
+ match = re.search(pattern, text, re.DOTALL)
45
+
46
+ if not match:
47
+ raise MAIParseError("Failed to find <thinking> and <tool_call> tags")
48
+
49
+ thinking = match.group(1).strip().strip('"')
50
+ tool_call_str = match.group(2).strip().strip('"')
51
+
52
+ try:
53
+ tool_call = json.loads(tool_call_str)
54
+ except json.JSONDecodeError as e:
55
+ raise MAIParseError(f"Invalid JSON in tool_call: {e}") from e
56
+
57
+ mai_action = tool_call.get("arguments", {})
58
+
59
+ if "coordinate" in mai_action:
60
+ mai_action["coordinate"] = self._normalize_coordinate_to_0_1(
61
+ mai_action["coordinate"]
62
+ )
63
+
64
+ return {
65
+ "thinking": thinking,
66
+ "raw_action": mai_action,
67
+ "converted_action": self._convert_action(mai_action),
68
+ }
69
+
70
+ def _normalize_coordinate_to_0_1(
71
+ self, coordinate: list[int | float]
72
+ ) -> list[float]:
73
+ if len(coordinate) == 2:
74
+ x, y = coordinate
75
+ elif len(coordinate) == 4:
76
+ x1, y1, x2, y2 = coordinate
77
+ x = (x1 + x2) / 2
78
+ y = (y1 + y2) / 2
79
+ else:
80
+ raise MAIParseError(
81
+ f"Invalid coordinate format: expected 2 or 4 values, got {len(coordinate)}"
82
+ )
83
+
84
+ return [x / SCALE_FACTOR, y / SCALE_FACTOR]
85
+
86
+ def parse(self, raw_response: str) -> dict[str, Any]:
87
+ """Parse MAI agent XML+JSON output.
88
+
89
+ Args:
90
+ raw_response: Model output containing <thinking> and <tool_call> tags.
91
+
92
+ Returns:
93
+ Standardized action dictionary with coordinates converted to 0-1000 scale.
94
+
95
+ Raises:
96
+ ValueError: If parsing fails or content is invalid JSON.
97
+ """
98
+ text = raw_response.strip()
99
+
100
+ if "</think>" in text and "</thinking>" not in text:
101
+ text = text.replace("</think>", "</thinking>")
102
+ text = "<thinking>" + text
103
+
104
+ pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
105
+ match = re.search(pattern, text, re.DOTALL)
106
+
107
+ if not match:
108
+ raise ValueError("Failed to find <thinking> and <tool_call> tags")
109
+
110
+ tool_call_str = match.group(2).strip().strip('"')
111
+
112
+ try:
113
+ tool_call = json.loads(tool_call_str)
114
+ except json.JSONDecodeError as e:
115
+ raise ValueError(f"Invalid JSON in tool_call: {e}") from e
116
+
117
+ mai_action = tool_call.get("arguments", {})
118
+ return self._convert_action(mai_action)
119
+
120
+ def _convert_action(self, mai_action: dict[str, Any]) -> dict[str, Any]:
121
+ """Convert MAI action format to standard ActionHandler format.
122
+
123
+ MAI format: {"action": "click", "coordinate": [x, y]}
124
+ Standard format: {"_metadata": "do", "action": "Tap", "element": [x, y]}
125
+ """
126
+ action_type = mai_action.get("action")
127
+
128
+ if action_type == "terminate":
129
+ status = mai_action.get("status", "success")
130
+ return {
131
+ "_metadata": "finish",
132
+ "message": "Task completed" if status == "success" else "Task failed",
133
+ }
134
+
135
+ if action_type == "answer":
136
+ return {
137
+ "_metadata": "finish",
138
+ "message": mai_action.get("text", ""),
139
+ }
140
+
141
+ if action_type == "wait":
142
+ return {
143
+ "_metadata": "do",
144
+ "action": "Wait",
145
+ "duration": "1 seconds",
146
+ }
147
+
148
+ if action_type == "system_button":
149
+ button_name = mai_action.get("button", "")
150
+ action_map = {
151
+ "back": "Back",
152
+ "home": "Home",
153
+ "enter": "Enter",
154
+ }
155
+ return {
156
+ "_metadata": "do",
157
+ "action": action_map.get(button_name, "Back"),
158
+ }
159
+
160
+ coordinate = mai_action.get("coordinate")
161
+ if coordinate:
162
+ x = self._convert_coordinate(coordinate[0])
163
+ y = self._convert_coordinate(coordinate[1])
164
+
165
+ if action_type == "click":
166
+ return {
167
+ "_metadata": "do",
168
+ "action": "Tap",
169
+ "element": [x, y],
170
+ }
171
+ elif action_type == "long_press":
172
+ return {
173
+ "_metadata": "do",
174
+ "action": "Long Press",
175
+ "element": [x, y],
176
+ }
177
+ elif action_type == "double_click":
178
+ return {
179
+ "_metadata": "do",
180
+ "action": "Double Tap",
181
+ "element": [x, y],
182
+ }
183
+
184
+ if action_type == "swipe":
185
+ direction = mai_action.get("direction", "up")
186
+ coordinate = mai_action.get("coordinate") or [0.5, 0.5]
187
+ x = self._convert_coordinate(coordinate[0])
188
+ y = self._convert_coordinate(coordinate[1])
189
+
190
+ start, end = self._calculate_swipe_coordinates(direction, x, y)
191
+ return {
192
+ "_metadata": "do",
193
+ "action": "Swipe",
194
+ "start": start,
195
+ "end": end,
196
+ }
197
+
198
+ if action_type == "drag":
199
+ start_coord = mai_action.get("start_coordinate", [0, 0])
200
+ end_coord = mai_action.get("end_coordinate", [0, 0])
201
+
202
+ start = [
203
+ self._convert_coordinate_from_scale_factor(start_coord[0]),
204
+ self._convert_coordinate_from_scale_factor(start_coord[1]),
205
+ ]
206
+ end = [
207
+ self._convert_coordinate_from_scale_factor(end_coord[0]),
208
+ self._convert_coordinate_from_scale_factor(end_coord[1]),
209
+ ]
210
+ return {
211
+ "_metadata": "do",
212
+ "action": "Swipe",
213
+ "start": start,
214
+ "end": end,
215
+ }
216
+
217
+ if action_type == "type":
218
+ return {
219
+ "_metadata": "do",
220
+ "action": "Type",
221
+ "text": mai_action.get("text", ""),
222
+ }
223
+
224
+ if action_type == "open":
225
+ return {
226
+ "_metadata": "do",
227
+ "action": "Launch",
228
+ "app": mai_action.get("app", ""),
229
+ }
230
+
231
+ raise ValueError(f"Unknown MAI action type: {action_type}")
232
+
233
+ def _convert_coordinate(self, value: float) -> int:
234
+ """Convert MAI normalized coordinate [0, 1] to standard scale [0, 1000]."""
235
+ return int(value * 1000)
236
+
237
+ def _convert_coordinate_from_scale_factor(self, value: float) -> int:
238
+ """Convert MAI scale factor coordinate [0, 999] to standard scale [0, 1000]."""
239
+ return int((value / SCALE_FACTOR) * 1000)
240
+
241
+ def _calculate_swipe_coordinates(
242
+ self, direction: str, x: int, y: int
243
+ ) -> tuple[list[int], list[int]]:
244
+ """Calculate start and end coordinates for swipe based on direction."""
245
+ swipe_distance = 300
246
+
247
+ direction_map = {
248
+ "up": ([x, y + swipe_distance], [x, y - swipe_distance]),
249
+ "down": ([x, y - swipe_distance], [x, y + swipe_distance]),
250
+ "left": ([x + swipe_distance, y], [x - swipe_distance, y]),
251
+ "right": ([x - swipe_distance, y], [x + swipe_distance, y]),
252
+ }
253
+
254
+ return direction_map.get(direction, ([x, y], [x, y]))