autoglm-gui 1.4.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (120) hide show
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -8
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. AutoGLM_GUI/actions/handler.py +196 -0
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. AutoGLM_GUI/adb/__init__.py +53 -0
  7. AutoGLM_GUI/adb/apps.py +227 -0
  8. AutoGLM_GUI/adb/connection.py +323 -0
  9. AutoGLM_GUI/adb/device.py +171 -0
  10. AutoGLM_GUI/adb/input.py +67 -0
  11. AutoGLM_GUI/adb/screenshot.py +11 -0
  12. AutoGLM_GUI/adb/timing.py +167 -0
  13. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  14. AutoGLM_GUI/adb_plus/qr_pair.py +8 -8
  15. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  16. AutoGLM_GUI/adb_plus/serial.py +38 -20
  17. AutoGLM_GUI/adb_plus/touch.py +4 -9
  18. AutoGLM_GUI/agents/__init__.py +51 -0
  19. AutoGLM_GUI/agents/events.py +19 -0
  20. AutoGLM_GUI/agents/factory.py +153 -0
  21. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  22. AutoGLM_GUI/agents/glm/agent.py +292 -0
  23. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  24. AutoGLM_GUI/agents/glm/parser.py +110 -0
  25. AutoGLM_GUI/agents/glm/prompts_en.py +77 -0
  26. AutoGLM_GUI/agents/glm/prompts_zh.py +75 -0
  27. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  28. AutoGLM_GUI/agents/mai/agent.py +405 -0
  29. AutoGLM_GUI/agents/mai/parser.py +254 -0
  30. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  31. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  32. AutoGLM_GUI/agents/protocols.py +27 -0
  33. AutoGLM_GUI/agents/stream_runner.py +188 -0
  34. AutoGLM_GUI/api/__init__.py +71 -11
  35. AutoGLM_GUI/api/agents.py +190 -229
  36. AutoGLM_GUI/api/control.py +9 -6
  37. AutoGLM_GUI/api/devices.py +112 -28
  38. AutoGLM_GUI/api/health.py +13 -0
  39. AutoGLM_GUI/api/history.py +78 -0
  40. AutoGLM_GUI/api/layered_agent.py +306 -181
  41. AutoGLM_GUI/api/mcp.py +11 -10
  42. AutoGLM_GUI/api/media.py +64 -1
  43. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  44. AutoGLM_GUI/api/version.py +23 -10
  45. AutoGLM_GUI/api/workflows.py +2 -1
  46. AutoGLM_GUI/config.py +72 -14
  47. AutoGLM_GUI/config_manager.py +98 -27
  48. AutoGLM_GUI/device_adapter.py +263 -0
  49. AutoGLM_GUI/device_manager.py +248 -29
  50. AutoGLM_GUI/device_protocol.py +266 -0
  51. AutoGLM_GUI/devices/__init__.py +49 -0
  52. AutoGLM_GUI/devices/adb_device.py +200 -0
  53. AutoGLM_GUI/devices/mock_device.py +185 -0
  54. AutoGLM_GUI/devices/remote_device.py +177 -0
  55. AutoGLM_GUI/exceptions.py +3 -3
  56. AutoGLM_GUI/history_manager.py +164 -0
  57. AutoGLM_GUI/i18n.py +81 -0
  58. AutoGLM_GUI/metrics.py +13 -20
  59. AutoGLM_GUI/model/__init__.py +5 -0
  60. AutoGLM_GUI/model/message_builder.py +69 -0
  61. AutoGLM_GUI/model/types.py +24 -0
  62. AutoGLM_GUI/models/__init__.py +10 -0
  63. AutoGLM_GUI/models/history.py +96 -0
  64. AutoGLM_GUI/models/scheduled_task.py +71 -0
  65. AutoGLM_GUI/parsers/__init__.py +22 -0
  66. AutoGLM_GUI/parsers/base.py +50 -0
  67. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  68. AutoGLM_GUI/phone_agent_manager.py +118 -367
  69. AutoGLM_GUI/platform_utils.py +31 -2
  70. AutoGLM_GUI/prompt_config.py +15 -0
  71. AutoGLM_GUI/prompts/__init__.py +32 -0
  72. AutoGLM_GUI/scheduler_manager.py +304 -0
  73. AutoGLM_GUI/schemas.py +272 -63
  74. AutoGLM_GUI/scrcpy_stream.py +159 -37
  75. AutoGLM_GUI/server.py +3 -1
  76. AutoGLM_GUI/socketio_server.py +114 -29
  77. AutoGLM_GUI/state.py +10 -30
  78. AutoGLM_GUI/static/assets/{about-DeclntHg.js → about-BQm96DAl.js} +1 -1
  79. AutoGLM_GUI/static/assets/alert-dialog-B42XxGPR.js +1 -0
  80. AutoGLM_GUI/static/assets/chat-C0L2gQYG.js +129 -0
  81. AutoGLM_GUI/static/assets/circle-alert-D4rSJh37.js +1 -0
  82. AutoGLM_GUI/static/assets/dialog-DZ78cEcj.js +45 -0
  83. AutoGLM_GUI/static/assets/history-DFBv7TGc.js +1 -0
  84. AutoGLM_GUI/static/assets/index-Bzyv2yQ2.css +1 -0
  85. AutoGLM_GUI/static/assets/{index-zQ4KKDHt.js → index-CmZSnDqc.js} +1 -1
  86. AutoGLM_GUI/static/assets/index-CssG-3TH.js +11 -0
  87. AutoGLM_GUI/static/assets/label-BCUzE_nm.js +1 -0
  88. AutoGLM_GUI/static/assets/logs-eoFxn5of.js +1 -0
  89. AutoGLM_GUI/static/assets/popover-DLsuV5Sx.js +1 -0
  90. AutoGLM_GUI/static/assets/scheduled-tasks-MyqGJvy_.js +1 -0
  91. AutoGLM_GUI/static/assets/square-pen-zGWYrdfj.js +1 -0
  92. AutoGLM_GUI/static/assets/textarea-BX6y7uM5.js +1 -0
  93. AutoGLM_GUI/static/assets/workflows-CYFs6ssC.js +1 -0
  94. AutoGLM_GUI/static/index.html +2 -2
  95. AutoGLM_GUI/types.py +142 -0
  96. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/METADATA +178 -92
  97. autoglm_gui-1.5.0.dist-info/RECORD +157 -0
  98. mai_agent/base.py +137 -0
  99. mai_agent/mai_grounding_agent.py +263 -0
  100. mai_agent/mai_naivigation_agent.py +526 -0
  101. mai_agent/prompt.py +148 -0
  102. mai_agent/unified_memory.py +67 -0
  103. mai_agent/utils.py +73 -0
  104. AutoGLM_GUI/api/dual_model.py +0 -311
  105. AutoGLM_GUI/dual_model/__init__.py +0 -53
  106. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  107. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  108. AutoGLM_GUI/dual_model/protocols.py +0 -354
  109. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  110. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  111. AutoGLM_GUI/phone_agent_patches.py +0 -146
  112. AutoGLM_GUI/static/assets/chat-Iut2yhSw.js +0 -125
  113. AutoGLM_GUI/static/assets/dialog-BfdcBs1x.js +0 -45
  114. AutoGLM_GUI/static/assets/index-5hCCwHA7.css +0 -1
  115. AutoGLM_GUI/static/assets/index-DHF1NZh0.js +0 -12
  116. AutoGLM_GUI/static/assets/workflows-xiplap-r.js +0 -1
  117. autoglm_gui-1.4.0.dist-info/RECORD +0 -100
  118. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/WHEEL +0 -0
  119. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/entry_points.txt +0 -0
  120. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,526 @@
1
+ # Copyright (c) 2025, Alibaba Cloud and its affiliates;
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ """
15
+ MAI Mobile Agent - A GUI automation agent for mobile devices.
16
+
17
+ This module provides the MAIMobileAgent class that uses vision-language models
18
+ to interact with mobile device interfaces based on natural language instructions.
19
+ """
20
+
21
+ import copy
22
+ import json
23
+ import re
24
+ import traceback
25
+ from io import BytesIO
26
+ from typing import Any, Dict, List, Optional, Tuple
27
+
28
+ import numpy as np
29
+ from openai import OpenAI
30
+ from PIL import Image
31
+
32
+ from base import BaseAgent
33
+ from prompt import MAI_MOBILE_SYS_PROMPT, MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP
34
+ from unified_memory import TrajStep
35
+ from utils import pil_to_base64, safe_pil_to_bytes
36
+
37
+ # Constants
38
+ SCALE_FACTOR = 999
39
+
40
+
41
def mask_image_urls_for_logging(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Return a deep copy of *messages* that is safe to log.

    Every ``image_url`` entry has its URL replaced by the placeholder
    "[IMAGE_DATA]" so base64-encoded screenshots do not flood the logs.

    Args:
        messages: OpenAI-style message dicts whose "content" may be a list of
            parts, some of which are ``{"type": "image_url", "image_url": {...}}``.

    Returns:
        Deep-copied messages with image URLs masked; the input is untouched.
    """
    masked = copy.deepcopy(messages)
    for msg in masked:
        parts = msg.get("content", [])
        # Plain-string content (or anything non-list) cannot hold images.
        if not isinstance(parts, list):
            continue
        for part in parts:
            if isinstance(part, dict) and "image_url" in part:
                part["image_url"]["url"] = "[IMAGE_DATA]"
    return masked
59
+
60
+
61
def parse_tagged_text(text: str) -> Dict[str, Any]:
    """
    Parse text containing XML-style tags to extract thinking and tool_call content.

    Args:
        text: Text containing <thinking> and <tool_call> tags. Output from
            "thinking" models that emit a bare ``</think>`` closer (with no
            opening tag) is normalized first.

    Returns:
        Dictionary with keys:
        - "thinking": Content inside <thinking> tags (str or None)
        - "tool_call": Parsed JSON content inside <tool_call> tags (dict or None)

    Raises:
        ValueError: If tool_call content is not valid JSON.
    """
    # Thinking-model output uses </think> with no opening tag; rewrite it
    # into the <thinking>...</thinking> form the regex below expects.
    if "</think>" in text and "</thinking>" not in text:
        text = "<thinking>" + text.replace("</think>", "</thinking>")

    # Non-greedy matching so multiple tags do not over-capture; re.DOTALL
    # lets '.' span newlines inside the tag bodies.
    pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"

    result: Dict[str, Any] = {
        "thinking": None,
        "tool_call": None,
    }

    match = re.search(pattern, text, re.DOTALL)
    if match:
        result = {
            "thinking": match.group(1).strip().strip('"'),
            "tool_call": match.group(2).strip().strip('"'),
        }

    # Parse tool_call as JSON.  Chain the original JSONDecodeError so the
    # error position within the payload survives in the traceback.
    if result["tool_call"]:
        try:
            result["tool_call"] = json.loads(result["tool_call"])
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in tool_call: {e}") from e

    return result
105
+
106
+
107
def parse_action_to_structure_output(text: str) -> Dict[str, Any]:
    """
    Parse model output text into structured action format.

    Args:
        text: Raw model output containing thinking and tool_call tags.

    Returns:
        Dictionary with keys:
        - "thinking": The model's reasoning process (str or None)
        - "action_json": Parsed action with normalized coordinates

    Raises:
        ValueError: If no parsable tool_call is present in *text*, or the
            coordinate list does not have 2 or 4 values.

    Note:
        Coordinates are normalized to [0, 1] range by dividing by SCALE_FACTOR.
        A 4-value coordinate (bounding box) is collapsed to its center point.
    """
    results = parse_tagged_text(text.strip())
    thinking = results["thinking"]
    tool_call = results["tool_call"]

    # Fail with a clear message instead of an opaque TypeError on
    # ``None["arguments"]`` when the model output had no <tool_call> tag.
    if not isinstance(tool_call, dict) or "arguments" not in tool_call:
        raise ValueError(f"No parsable tool_call found in model output: {text!r}")

    action = tool_call["arguments"]

    # Normalize coordinates from SCALE_FACTOR range to [0, 1]
    if "coordinate" in action:
        coordinates = action["coordinate"]
        if len(coordinates) == 2:
            point_x, point_y = coordinates
        elif len(coordinates) == 4:
            # Bounding box: use its center point.
            x1, y1, x2, y2 = coordinates
            point_x = (x1 + x2) / 2
            point_y = (y1 + y2) / 2
        else:
            raise ValueError(
                f"Invalid coordinate format: expected 2 or 4 values, got {len(coordinates)}"
            )
        action["coordinate"] = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR]

    return {
        "thinking": thinking,
        "action_json": action,
    }
150
+
151
+
152
class MAIUINaivigationAgent(BaseAgent):
    """
    Mobile automation agent using vision-language models.

    This agent processes screenshots and natural language instructions to
    generate GUI actions for mobile device automation.  Conversation history
    (thinking + tool_call responses, plus the most recent screenshots) is
    replayed into every request to give the model multi-step context.

    Attributes:
        llm_base_url: Base URL for the LLM API endpoint.
        model_name: Name of the model to use for predictions.
        runtime_conf: Configuration dictionary for runtime parameters.
        history_n: Number of history steps to include in context.
        tools: Optional MCP tool definitions injected into the system prompt.
    """

    def __init__(
        self,
        llm_base_url: str,
        model_name: str,
        runtime_conf: Optional[Dict[str, Any]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
    ):
        """
        Initialize the agent.

        Args:
            llm_base_url: Base URL for the LLM API endpoint.
            model_name: Name of the model to use.
            runtime_conf: Optional configuration dictionary with keys:
                - history_n: Number of history images to include (default: 3)
                - max_pixels: Maximum pixels for image processing
                - min_pixels: Minimum pixels for image processing
                - temperature: Sampling temperature (default: 0.0)
                - top_k: Top-k sampling parameter (default: -1)
                - top_p: Top-p sampling parameter (default: 1.0)
                - max_tokens: Maximum tokens in response (default: 2048)
                NOTE(review): max_pixels/min_pixels are accepted but not read
                anywhere in this class — confirm they are consumed elsewhere.
            tools: Optional list of MCP tool definitions. Each tool should be a dict
                with 'name', 'description', and 'parameters' keys.
        """
        super().__init__()

        # Store MCP tools; an empty list disables the MCP prompt variant.
        self.tools = tools or []

        # Set default configuration; caller-supplied values take precedence.
        default_conf = {
            "history_n": 3,
            "temperature": 0.0,
            "top_k": -1,
            "top_p": 1.0,
            "max_tokens": 2048,
        }
        self.runtime_conf = {**default_conf, **(runtime_conf or {})}

        self.llm_base_url = llm_base_url
        self.model_name = model_name
        # api_key="empty" targets self-hosted OpenAI-compatible endpoints
        # that do not validate credentials.
        self.llm = OpenAI(
            base_url=self.llm_base_url,
            api_key="empty",
        )

        # Extract frequently used config values
        self.temperature = self.runtime_conf["temperature"]
        self.top_k = self.runtime_conf["top_k"]
        self.top_p = self.runtime_conf["top_p"]
        self.max_tokens = self.runtime_conf["max_tokens"]
        self.history_n = self.runtime_conf["history_n"]

    @property
    def system_prompt(self) -> str:
        """
        Generate the system prompt based on available MCP tools.

        Returns:
            System prompt string, with MCP tools section if tools are configured.
        """
        if self.tools:
            # One JSON object per line; ensure_ascii=False keeps non-ASCII
            # tool descriptions readable inside the prompt.
            tools_str = "\n".join(
                [json.dumps(tool, ensure_ascii=False) for tool in self.tools]
            )
            return MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP.render(tools=tools_str)
        return MAI_MOBILE_SYS_PROMPT

    @property
    def history_responses(self) -> List[str]:
        """
        Generate formatted history responses for context.

        Rebuilds each stored step into the exact <thinking>/<tool_call> text
        format the model originally emitted, so it can be replayed as
        assistant turns.

        Returns:
            List of formatted response strings with thinking and tool_call tags.
        """
        history_responses = []

        # NOTE(review): self.traj_memory appears to be provided by BaseAgent
        # (not visible here) — confirm it exposes .steps and .task_goal.
        for step in self.traj_memory.steps:
            thinking = step.thought
            structured_action = step.structured_action

            if not structured_action:
                continue

            # Deep copy so the stored trajectory step is not mutated below.
            action_json = copy.deepcopy(structured_action.get("action_json", {}))

            # Convert normalized [0, 1] coordinates back to SCALE_FACTOR range
            # (the inverse of parse_action_to_structure_output).
            if "coordinate" in action_json:
                coordinates = action_json.get("coordinate", [])
                if len(coordinates) == 2:
                    point_x, point_y = coordinates
                elif len(coordinates) == 4:
                    # Bounding box: collapse to its center point.
                    x1, y1, x2, y2 = coordinates
                    point_x = (x1 + x2) / 2
                    point_y = (y1 + y2) / 2
                else:
                    # Malformed coordinate — drop this step from history.
                    continue
                action_json["coordinate"] = [
                    int(point_x * SCALE_FACTOR),
                    int(point_y * SCALE_FACTOR),
                ]

            tool_call_dict = {
                "name": "mobile_use",
                "arguments": action_json,
            }
            # Compact separators match the format the model is prompted to emit.
            tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
            history_responses.append(
                f"<thinking>\n{thinking}\n</thinking>\n<tool_call>\n{tool_call_json}\n</tool_call>"
            )

        return history_responses

    def _prepare_images(self, screenshot_bytes: bytes) -> List[Image.Image]:
        """
        Prepare image list including history and current screenshot.

        At most ``history_n - 1`` history images are kept, so together with
        the current screenshot the model sees ``history_n`` images.

        Args:
            screenshot_bytes: Current screenshot as bytes.

        Returns:
            List of RGB PIL Images (history + current, current last).
        """
        # Calculate how many history images to include.
        # NOTE(review): self.history_images appears to come from BaseAgent;
        # nothing in this class appends to it — confirm who populates it.
        if len(self.history_images) > 0:
            max_history = min(len(self.history_images), self.history_n - 1)
            recent_history = (
                self.history_images[-max_history:] if max_history > 0 else []
            )
        else:
            recent_history = []

        # Add current image bytes
        recent_history.append(screenshot_bytes)

        # Normalize input type.
        # NOTE(review): recent_history is always a list at this point, so the
        # bytes/ndarray branches below look like defensive dead code — confirm.
        if isinstance(recent_history, bytes):
            recent_history = [recent_history]
        elif isinstance(recent_history, np.ndarray):
            recent_history = list(recent_history)
        elif not isinstance(recent_history, list):
            raise TypeError(f"Unidentified images type: {type(recent_history)}")

        # Convert all images to PIL format
        images = []
        for image in recent_history:
            if isinstance(image, bytes):
                image = Image.open(BytesIO(image))
            elif isinstance(image, Image.Image):
                pass
            else:
                raise TypeError(f"Expected bytes or PIL Image, got {type(image)}")

            # The model expects RGB; drop alpha / palette modes.
            if image.mode != "RGB":
                image = image.convert("RGB")

            images.append(image)

        return images

    def _build_messages(
        self,
        instruction: str,
        images: List[Image.Image],
    ) -> List[Dict[str, Any]]:
        """
        Build the message list for the LLM API call.

        Layout: system prompt, user instruction, then alternating
        (user image, assistant history response) pairs for the recent turns,
        ending with the current screenshot as a user message.

        Args:
            instruction: Task instruction from user.
            images: List of prepared images (history first, current last).

        Returns:
            List of message dictionaries for the API.
        """
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": instruction}],
            },
        ]

        image_num = 0
        history_responses = self.history_responses

        if len(history_responses) > 0:
            for history_idx, history_response in enumerate(history_responses):
                # Only include images for recent history (last history_n responses);
                # older turns are replayed as text-only assistant messages.
                if history_idx + self.history_n >= len(history_responses):
                    # Add image before the assistant response
                    if image_num < len(images) - 1:
                        cur_image = images[image_num]
                        encoded_string = pil_to_base64(cur_image)
                        messages.append(
                            {
                                "role": "user",
                                "content": [
                                    {
                                        "type": "image_url",
                                        "image_url": {
                                            "url": f"data:image/png;base64,{encoded_string}"
                                        },
                                    }
                                ],
                            }
                        )
                        image_num += 1

                messages.append(
                    {
                        "role": "assistant",
                        "content": [{"type": "text", "text": history_response}],
                    }
                )

            # Add current image (last one in images list)
            if image_num < len(images):
                cur_image = images[image_num]
                encoded_string = pil_to_base64(cur_image)
                messages.append(
                    {
                        "role": "user",
                        "content": [
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/png;base64,{encoded_string}"
                                },
                            }
                        ],
                    }
                )
        else:
            # No history, just add the current image
            cur_image = images[0]
            encoded_string = pil_to_base64(cur_image)
            messages.append(
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/png;base64,{encoded_string}"
                            },
                        }
                    ],
                }
            )

        return messages

    def predict(
        self,
        instruction: str,
        obs: Dict[str, Any],
        **kwargs: Any,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Predict the next action based on the current observation.

        Args:
            instruction: Task instruction/goal.
            obs: Current observation containing:
                - screenshot: PIL Image or bytes of current screen
                - accessibility_tree: Optional accessibility tree data
            **kwargs: Additional arguments including:
                - extra_info: Optional extra context string
                  (NOTE(review): not read in this method — confirm it is
                  intentionally unused here).

        Returns:
            Tuple of (prediction_text, action_dict) where:
            - prediction_text: Raw model response, or "llm client error" if
              every retry failed
            - action_dict: Parsed action dictionary, or {"action": None} on
              failure
        """
        # Set task goal if not already set
        if not self.traj_memory.task_goal:
            self.traj_memory.task_goal = instruction

        # Process screenshot
        screenshot_pil = obs["screenshot"]
        screenshot_bytes = safe_pil_to_bytes(screenshot_pil)

        # Prepare images
        images = self._prepare_images(screenshot_bytes)

        # Build messages
        messages = self._build_messages(instruction, images)

        # Make API call with retry logic
        max_retries = 3
        prediction = None
        action_json = None

        for attempt in range(max_retries):
            try:
                # Mask base64 image payloads before logging the request.
                messages_print = mask_image_urls_for_logging(messages)
                print(f"Messages (attempt {attempt + 1}):\n{messages_print}")

                # seed=42 + temperature 0.0 (default) aims for determinism;
                # actual reproducibility depends on the serving backend.
                response = self.llm.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    extra_body={"repetition_penalty": 1.0, "top_k": self.top_k},
                    seed=42,
                )
                prediction = response.choices[0].message.content.strip()
                print(f"Raw response:\n{prediction}")

                # Parse response
                parsed_response = parse_action_to_structure_output(prediction)
                thinking = parsed_response["thinking"]
                action_json = parsed_response["action_json"]
                print(f"Parsed response:\n{parsed_response}")
                break

            except Exception as e:
                # Both API and parse failures fall through to a retry with
                # the same messages.
                print(f"Error on attempt {attempt + 1}: {e}")
                traceback.print_exc()
                prediction = None
                action_json = None

        # Return error if all retries failed
        if prediction is None or action_json is None:
            print("Max retry attempts reached, returning error flag.")
            return "llm client error", {"action": None}

        # Create and store trajectory step.  ``thinking`` is only bound when
        # parsing succeeded, which the early return above guarantees here.
        traj_step = TrajStep(
            screenshot=screenshot_pil,
            accessibility_tree=obs.get("accessibility_tree"),
            prediction=prediction,
            action=action_json,
            conclusion="",
            thought=thinking,
            step_index=len(self.traj_memory.steps),
            agent_type="MAIMobileAgent",
            model_name=self.model_name,
            screenshot_bytes=screenshot_bytes,
            structured_action={"action_json": action_json},
        )
        self.traj_memory.steps.append(traj_step)

        return prediction, action_json

    def reset(self, runtime_logger: Any = None) -> None:
        """
        Reset the trajectory memory for a new task.

        Args:
            runtime_logger: Optional logger (unused, kept for API compatibility).
        """
        super().reset()
mai_agent/prompt.py ADDED
@@ -0,0 +1,148 @@
1
+ # Copyright (c) 2025, Alibaba Cloud and its affiliates;
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ """System prompts for MAI Mobile Agent."""
15
+
16
+ from jinja2 import Template
17
+
18
# Default system prompt: the model must emit a <thinking> block followed by
# exactly one <tool_call> JSON object (parsed by the agent's tagged-text parser).
MAI_MOBILE_SYS_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:
```
<thinking>
...
</thinking>
<tool_call>
{"name": "mobile_use", "arguments": <args-json-object>}
</tool_call>
```

## Action Space

{"action": "click", "coordinate": [x, y]}
{"action": "long_press", "coordinate": [x, y]}
{"action": "type", "text": ""}
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
{"action": "open", "text": "app_name"}
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
{"action": "wait"}
{"action": "terminate", "status": "success or fail"}
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.


## Note
- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part.
- Available Apps: `["Camera","Chrome","Clock","Contacts","Dialer","Files","Settings","Markor","Tasks","Simple Draw Pro","Simple Gallery Pro","Simple SMS Messenger","Audio Recorder","Pro Expense","Broccoli APP","OSMand","VLC","Joplin","Retro Music","OpenTracks","Simple Calendar Pro"]`.
You should use the `open` action to open the app as possible as you can, because it is the fast way to open the app.
- You must follow the Action Space strictly, and return the correct json object within <thinking> </thinking> and <tool_call></tool_call> XML tags.
""".strip()


# Variant without the <thinking> block in the required output format.
# NOTE(review): the final Note bullet still mentions <thinking> </thinking>
# tags even though this variant omits them — confirm whether that wording is
# intentional before changing the prompt text.
MAI_MOBILE_SYS_PROMPT_NO_THINKING = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
```
<tool_call>
{"name": "mobile_use", "arguments": <args-json-object>}
</tool_call>
```

## Action Space

{"action": "click", "coordinate": [x, y]}
{"action": "long_press", "coordinate": [x, y]}
{"action": "type", "text": ""}
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
{"action": "open", "text": "app_name"}
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
{"action": "wait"}
{"action": "terminate", "status": "success or fail"}
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.


## Note
- Available Apps: `["Camera","Chrome","Clock","Contacts","Dialer","Files","Settings","Markor","Tasks","Simple Draw Pro","Simple Gallery Pro","Simple SMS Messenger","Audio Recorder","Pro Expense","Broccoli APP","OSMand","VLC","Joplin","Retro Music","OpenTracks","Simple Calendar Pro"]`.
You should use the `open` action to open the app as possible as you can, because it is the fast way to open the app.
- You must follow the Action Space strictly, and return the correct json object within <thinking> </thinking> and <tool_call></tool_call> XML tags.
""".strip()


# Jinja2 template variant: adds ask_user / double_click actions and renders an
# MCP tools section when ``tools`` is provided.  This is the prompt selected by
# the agent's system_prompt property whenever MCP tools are configured.
MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP = Template(
    """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:
```
<thinking>
...
</thinking>
<tool_call>
{"name": "mobile_use", "arguments": <args-json-object>}
</tool_call>
```

## Action Space

{"action": "click", "coordinate": [x, y]}
{"action": "long_press", "coordinate": [x, y]}
{"action": "type", "text": ""}
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
{"action": "open", "text": "app_name"}
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
{"action": "wait"}
{"action": "terminate", "status": "success or fail"}
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.
{"action": "ask_user", "text": "xxx"} # you can ask user for more information to complete the task.
{"action": "double_click", "coordinate": [x, y]}

{% if tools -%}
## MCP Tools
You are also provided with MCP tools, you can use them to complete the task.
{{ tools }}

If you want to use MCP tools, you must output as the following format:
```
<thinking>
...
</thinking>
<tool_call>
{"name": <function-name>, "arguments": <args-json-object>}
</tool_call>
```
{% endif -%}


## Note
- Available Apps: `["Contacts", "Settings", "Clock", "Maps", "Chrome", "Calendar", "files", "Gallery", "Taodian", "Mattermost", "Mastodon", "Mail", "SMS", "Camera"]`.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part.
""".strip()
)

# Grounding-only prompt: given an instruction, locate one UI element and
# return its coordinate inside <answer> tags (with reasoning in
# <grounding_think> tags).
MAI_MOBILE_SYS_PROMPT_GROUNDING = """
You are a GUI grounding agent.
## Task
Given a screenshot and the user's grounding instruction. Your task is to accurately locate a UI element based on the user's instructions.
First, you should carefully examine the screenshot and analyze the user's instructions, translate the user's instruction into a effective reasoning process, and then provide the final coordinate.
## Output Format
Return a json object with a reasoning process in <grounding_think></grounding_think> tags, a [x,y] format coordinate within <answer></answer> XML tags:
<grounding_think>...</grounding_think>
<answer>
{"coordinate": [x,y]}
</answer>
""".strip()