autoglm-gui 1.5.0__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AutoGLM_GUI/agents/glm/agent.py +6 -1
- AutoGLM_GUI/agents/mai/agent.py +3 -0
- AutoGLM_GUI/agents/stream_runner.py +7 -2
- AutoGLM_GUI/api/agents.py +26 -1
- AutoGLM_GUI/api/history.py +27 -1
- AutoGLM_GUI/models/history.py +45 -1
- AutoGLM_GUI/scheduler_manager.py +52 -6
- AutoGLM_GUI/schemas.py +12 -0
- AutoGLM_GUI/static/assets/{about-BQm96DAl.js → about-CfwX1Cmc.js} +1 -1
- AutoGLM_GUI/static/assets/{alert-dialog-B42XxGPR.js → alert-dialog-CtGlN2IJ.js} +1 -1
- AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
- AutoGLM_GUI/static/assets/{circle-alert-D4rSJh37.js → circle-alert-t08bEMPO.js} +1 -1
- AutoGLM_GUI/static/assets/{dialog-DZ78cEcj.js → dialog-FNwZJFwk.js} +1 -1
- AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
- AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
- AutoGLM_GUI/static/assets/{index-CmZSnDqc.js → index-BaLMSqd3.js} +1 -1
- AutoGLM_GUI/static/assets/{index-CssG-3TH.js → index-CTHbFvKl.js} +5 -5
- AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
- AutoGLM_GUI/static/assets/{label-BCUzE_nm.js → label-DJFevVmr.js} +1 -1
- AutoGLM_GUI/static/assets/{logs-eoFxn5of.js → logs-RW09DyYY.js} +1 -1
- AutoGLM_GUI/static/assets/{popover-DLsuV5Sx.js → popover--JTJrE5v.js} +1 -1
- AutoGLM_GUI/static/assets/{scheduled-tasks-MyqGJvy_.js → scheduled-tasks-DTRKsQXF.js} +1 -1
- AutoGLM_GUI/static/assets/{square-pen-zGWYrdfj.js → square-pen-CPK_K680.js} +1 -1
- AutoGLM_GUI/static/assets/{textarea-BX6y7uM5.js → textarea-PRmVnWq5.js} +1 -1
- AutoGLM_GUI/static/assets/{workflows-CYFs6ssC.js → workflows-CdcsAoaT.js} +1 -1
- AutoGLM_GUI/static/index.html +2 -2
- {autoglm_gui-1.5.0.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +49 -7
- {autoglm_gui-1.5.0.dist-info → autoglm_gui-1.5.1.dist-info}/RECORD +31 -70
- AutoGLM_GUI/device_adapter.py +0 -263
- AutoGLM_GUI/static/assets/chat-C0L2gQYG.js +0 -129
- AutoGLM_GUI/static/assets/history-DFBv7TGc.js +0 -1
- AutoGLM_GUI/static/assets/index-Bzyv2yQ2.css +0 -1
- mai_agent/base.py +0 -137
- mai_agent/mai_grounding_agent.py +0 -263
- mai_agent/mai_naivigation_agent.py +0 -526
- mai_agent/prompt.py +0 -148
- mai_agent/unified_memory.py +0 -67
- mai_agent/utils.py +0 -73
- phone_agent/__init__.py +0 -12
- phone_agent/actions/__init__.py +0 -5
- phone_agent/actions/handler.py +0 -400
- phone_agent/actions/handler_ios.py +0 -278
- phone_agent/adb/__init__.py +0 -51
- phone_agent/adb/connection.py +0 -358
- phone_agent/adb/device.py +0 -253
- phone_agent/adb/input.py +0 -108
- phone_agent/adb/screenshot.py +0 -108
- phone_agent/agent.py +0 -253
- phone_agent/agent_ios.py +0 -277
- phone_agent/config/__init__.py +0 -53
- phone_agent/config/apps.py +0 -227
- phone_agent/config/apps_harmonyos.py +0 -256
- phone_agent/config/apps_ios.py +0 -339
- phone_agent/config/i18n.py +0 -81
- phone_agent/config/prompts.py +0 -80
- phone_agent/config/prompts_en.py +0 -79
- phone_agent/config/prompts_zh.py +0 -82
- phone_agent/config/timing.py +0 -167
- phone_agent/device_factory.py +0 -166
- phone_agent/hdc/__init__.py +0 -53
- phone_agent/hdc/connection.py +0 -384
- phone_agent/hdc/device.py +0 -269
- phone_agent/hdc/input.py +0 -145
- phone_agent/hdc/screenshot.py +0 -127
- phone_agent/model/__init__.py +0 -5
- phone_agent/model/client.py +0 -290
- phone_agent/xctest/__init__.py +0 -47
- phone_agent/xctest/connection.py +0 -379
- phone_agent/xctest/device.py +0 -472
- phone_agent/xctest/input.py +0 -311
- phone_agent/xctest/screenshot.py +0 -226
- {autoglm_gui-1.5.0.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
- {autoglm_gui-1.5.0.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
- {autoglm_gui-1.5.0.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
|
@@ -1,526 +0,0 @@
|
|
|
1
|
-
# Copyright (c) 2025, Alibaba Cloud and its affiliates;
|
|
2
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
-
# you may not use this file except in compliance with the License.
|
|
4
|
-
# You may obtain a copy of the License at
|
|
5
|
-
#
|
|
6
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
-
#
|
|
8
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
-
# See the License for the specific language governing permissions and
|
|
12
|
-
# limitations under the License.
|
|
13
|
-
|
|
14
|
-
"""
|
|
15
|
-
MAI Mobile Agent - A GUI automation agent for mobile devices.
|
|
16
|
-
|
|
17
|
-
This module provides the MAIUINaivigationAgent class that uses vision-language models
|
|
18
|
-
to interact with mobile device interfaces based on natural language instructions.
|
|
19
|
-
"""
|
|
20
|
-
|
|
21
|
-
import copy
|
|
22
|
-
import json
|
|
23
|
-
import re
|
|
24
|
-
import traceback
|
|
25
|
-
from io import BytesIO
|
|
26
|
-
from typing import Any, Dict, List, Optional, Tuple
|
|
27
|
-
|
|
28
|
-
import numpy as np
|
|
29
|
-
from openai import OpenAI
|
|
30
|
-
from PIL import Image
|
|
31
|
-
|
|
32
|
-
from base import BaseAgent
|
|
33
|
-
from prompt import MAI_MOBILE_SYS_PROMPT, MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP
|
|
34
|
-
from unified_memory import TrajStep
|
|
35
|
-
from utils import pil_to_base64, safe_pil_to_bytes
|
|
36
|
-
|
|
37
|
-
# Constants
|
|
38
|
-
SCALE_FACTOR = 999
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
def mask_image_urls_for_logging(messages: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
    """
    Create a copy of messages with image URLs masked for logging.

    The input list is never modified; masking is applied to a deep copy.
    Both shapes of an ``image_url`` content part are handled: the object
    form ``{"image_url": {"url": ...}}`` and a bare string
    ``{"image_url": "..."}``.

    Args:
        messages: List of message dictionaries that may contain image URLs.

    Returns:
        Deep copy of messages with image URLs replaced by "[IMAGE_DATA]".
    """
    messages_masked = copy.deepcopy(messages)
    for message in messages_masked:
        content = message.get("content", [])
        # Plain-string (or missing/empty) content carries no image parts.
        if not isinstance(content, list):
            continue
        for item in content:
            if not (isinstance(item, dict) and "image_url" in item):
                continue
            image_url = item["image_url"]
            if isinstance(image_url, dict):
                image_url["url"] = "[IMAGE_DATA]"
            elif isinstance(image_url, str):
                # A bare string cannot be item-assigned; replace it whole so
                # the payload still never reaches the log.
                item["image_url"] = "[IMAGE_DATA]"
    return messages_masked
|
|
59
|
-
|
|
60
|
-
|
|
61
|
-
def parse_tagged_text(text: str) -> Dict[str, Any]:
    """
    Parse text containing XML-style tags to extract thinking and tool_call content.

    Thinking-model output that closes its reasoning with ``</think>`` is
    normalised to the ``<thinking>...</thinking>`` form first. An opening
    ``<think>`` tag is rewritten as well, and ``<thinking>`` is only prepended
    when the text has no opening tag at all.

    Args:
        text: Text containing <thinking> and <tool_call> tags.

    Returns:
        Dictionary with keys:
            - "thinking": Content inside <thinking> tags (str or None)
            - "tool_call": Parsed JSON content inside <tool_call> tags (dict or None)
        Both values are None when the thinking/tool_call pair is not found.

    Raises:
        ValueError: If tool_call content is not valid JSON. The original
            json.JSONDecodeError is attached as ``__cause__``.
    """
    # Handle thinking model output format (uses </think> instead of </thinking>)
    if "</think>" in text and "</thinking>" not in text:
        text = text.replace("</think>", "</thinking>")
        text = text.replace("<think>", "<thinking>")
        # Some models emit only the closing tag; synthesise the opening one.
        if "<thinking>" not in text:
            text = "<thinking>" + text

    # Non-greedy so the first thinking block pairs with the first tool_call.
    pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"

    result: Dict[str, Any] = {
        "thinking": None,
        "tool_call": None,
    }

    # re.DOTALL lets "." span the newlines inside and between the tags.
    match = re.search(pattern, text, re.DOTALL)
    if match:
        result["thinking"] = match.group(1).strip().strip('"')
        result["tool_call"] = match.group(2).strip().strip('"')

    # Parse tool_call as JSON (an empty tool_call body is left as-is).
    if result["tool_call"]:
        try:
            result["tool_call"] = json.loads(result["tool_call"])
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in tool_call: {e}") from e

    return result
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
def parse_action_to_structure_output(text: str) -> Dict[str, Any]:
    """
    Parse model output text into structured action format.

    Args:
        text: Raw model output containing thinking and tool_call tags.

    Returns:
        Dictionary with keys:
            - "thinking": The model's reasoning process
            - "action_json": Parsed action with normalized coordinates

    Raises:
        ValueError: If no tool call with an "arguments" entry can be extracted,
            if the tool_call JSON is invalid, or if "coordinate" does not have
            2 or 4 values.

    Note:
        Coordinates are normalized to [0, 1] range by dividing by SCALE_FACTOR.
        A 4-value coordinate is treated as a box (x1, y1, x2, y2) and reduced
        to its centre point.
    """
    results = parse_tagged_text(text.strip())
    thinking = results["thinking"]
    tool_call = results["tool_call"]

    # parse_tagged_text yields None when the tags are missing; fail with a
    # clear message instead of an opaque TypeError/KeyError on the lookup.
    if not isinstance(tool_call, dict) or "arguments" not in tool_call:
        raise ValueError(
            "Model output does not contain a <tool_call> object with 'arguments'"
        )
    action = tool_call["arguments"]

    # Normalize coordinates from SCALE_FACTOR range to [0, 1]
    if "coordinate" in action:
        coordinates = action["coordinate"]
        if len(coordinates) == 2:
            point_x, point_y = coordinates
        elif len(coordinates) == 4:
            x1, y1, x2, y2 = coordinates
            point_x = (x1 + x2) / 2
            point_y = (y1 + y2) / 2
        else:
            raise ValueError(
                f"Invalid coordinate format: expected 2 or 4 values, got {len(coordinates)}"
            )
        action["coordinate"] = [point_x / SCALE_FACTOR, point_y / SCALE_FACTOR]

    return {
        "thinking": thinking,
        "action_json": action,
    }
|
|
150
|
-
|
|
151
|
-
|
|
152
|
-
class MAIUINaivigationAgent(BaseAgent):
    """
    Mobile automation agent using vision-language models.

    This agent processes screenshots and natural language instructions to
    generate GUI actions for mobile device automation.

    Attributes:
        llm_base_url: Base URL for the LLM API endpoint.
        model_name: Name of the model to use for predictions.
        runtime_conf: Configuration dictionary for runtime parameters.
        history_n: Number of history steps to include in context.
        tools: MCP tool definitions (empty list when none are configured).
    """

    def __init__(
        self,
        llm_base_url: str,
        model_name: str,
        runtime_conf: Optional[Dict[str, Any]] = None,
        tools: Optional[List[Dict[str, Any]]] = None,
    ):
        """
        Initialize the agent and its OpenAI-compatible client.

        Args:
            llm_base_url: Base URL for the LLM API endpoint.
            model_name: Name of the model to use.
            runtime_conf: Optional configuration dictionary with keys:
                - history_n: Number of history images to include (default: 3)
                - max_pixels: Maximum pixels for image processing
                  (stored, but not read anywhere in this class)
                - min_pixels: Minimum pixels for image processing
                  (stored, but not read anywhere in this class)
                - temperature: Sampling temperature (default: 0.0)
                - top_k: Top-k sampling parameter (default: -1)
                - top_p: Top-p sampling parameter (default: 1.0)
                - max_tokens: Maximum tokens in response (default: 2048)
            tools: Optional list of MCP tool definitions. Each tool should be a dict
                with 'name', 'description', and 'parameters' keys.
        """
        super().__init__()

        # Store MCP tools
        self.tools = tools or []

        # Caller-supplied values override the defaults key by key.
        default_conf = {
            "history_n": 3,
            "temperature": 0.0,
            "top_k": -1,
            "top_p": 1.0,
            "max_tokens": 2048,
        }
        self.runtime_conf = {**default_conf, **(runtime_conf or {})}

        self.llm_base_url = llm_base_url
        self.model_name = model_name
        self.llm = OpenAI(
            base_url=self.llm_base_url,
            api_key="empty",
        )

        # Extract frequently used config values
        self.temperature = self.runtime_conf["temperature"]
        self.top_k = self.runtime_conf["top_k"]
        self.top_p = self.runtime_conf["top_p"]
        self.max_tokens = self.runtime_conf["max_tokens"]
        self.history_n = self.runtime_conf["history_n"]

    @property
    def system_prompt(self) -> str:
        """
        Generate the system prompt based on available MCP tools.

        Returns:
            System prompt string, with MCP tools section if tools are configured.
        """
        if not self.tools:
            return MAI_MOBILE_SYS_PROMPT
        # One JSON document per line, one line per tool.
        tools_str = "\n".join(
            json.dumps(tool, ensure_ascii=False) for tool in self.tools
        )
        return MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP.render(tools=tools_str)

    @property
    def history_responses(self) -> List[str]:
        """
        Generate formatted history responses for context.

        Steps without a structured action, or whose coordinate does not have
        2 or 4 values, are skipped.

        Returns:
            List of formatted response strings with thinking and tool_call tags.
        """
        history_responses = []

        # traj_memory is expected to be provided by BaseAgent.
        for step in self.traj_memory.steps:
            structured_action = step.structured_action
            if not structured_action:
                continue

            # Work on a copy so the stored trajectory keeps normalized values.
            action_json = copy.deepcopy(structured_action.get("action_json", {}))

            # Convert normalized coordinates back to SCALE_FACTOR range
            if "coordinate" in action_json:
                coordinates = action_json.get("coordinate", [])
                if len(coordinates) == 2:
                    point_x, point_y = coordinates
                elif len(coordinates) == 4:
                    x1, y1, x2, y2 = coordinates
                    point_x = (x1 + x2) / 2
                    point_y = (y1 + y2) / 2
                else:
                    continue
                # round(), not int(): (v / SCALE_FACTOR) * SCALE_FACTOR can land
                # just below v in floating point, and truncation would then
                # report a coordinate one unit lower than the model produced.
                action_json["coordinate"] = [
                    round(point_x * SCALE_FACTOR),
                    round(point_y * SCALE_FACTOR),
                ]

            tool_call_dict = {
                "name": "mobile_use",
                "arguments": action_json,
            }
            tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
            history_responses.append(
                f"<thinking>\n{step.thought}\n</thinking>\n<tool_call>\n{tool_call_json}\n</tool_call>"
            )

        return history_responses

    @staticmethod
    def _image_message(image: Image.Image) -> Dict[str, Any]:
        """Wrap one image as a user message holding a single image_url part."""
        encoded_string = pil_to_base64(image)
        return {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/png;base64,{encoded_string}"
                    },
                }
            ],
        }

    def _prepare_images(self, screenshot_bytes: bytes) -> List[Image.Image]:
        """
        Prepare image list including history and current screenshot.

        At most ``history_n - 1`` of the most recent history images are kept,
        so that together with the current screenshot no more than
        ``history_n`` images are returned (for ``history_n >= 1``).

        Args:
            screenshot_bytes: Current screenshot as bytes.

        Returns:
            List of PIL Images (history + current), all in RGB mode.

        Raises:
            TypeError: If a history entry is neither bytes nor a PIL Image.
        """
        # history_images is expected to be provided by BaseAgent.
        max_history = min(len(self.history_images), self.history_n - 1)
        # Guard max_history <= 0: a [-0:] slice would return everything.
        recent_history = (
            list(self.history_images[-max_history:]) if max_history > 0 else []
        )

        # The current screenshot always goes last.
        recent_history.append(screenshot_bytes)

        # Convert all images to PIL format
        images = []
        for image in recent_history:
            if isinstance(image, bytes):
                image = Image.open(BytesIO(image))
            elif not isinstance(image, Image.Image):
                raise TypeError(f"Expected bytes or PIL Image, got {type(image)}")

            if image.mode != "RGB":
                image = image.convert("RGB")

            images.append(image)

        return images

    def _build_messages(
        self,
        instruction: str,
        images: List[Image.Image],
    ) -> List[Dict[str, Any]]:
        """
        Build the message list for the LLM API call.

        Layout: system prompt, user instruction, then every history response
        as an assistant message (recent ones preceded by a history image), and
        finally a user message with the next unused image.

        Args:
            instruction: Task instruction from user.
            images: List of prepared images.

        Returns:
            List of message dictionaries for the API.
        """
        messages = [
            {
                "role": "system",
                "content": [{"type": "text", "text": self.system_prompt}],
            },
            {
                "role": "user",
                "content": [{"type": "text", "text": instruction}],
            },
        ]

        history_responses = self.history_responses
        if not history_responses:
            # No history, just add the current image
            messages.append(self._image_message(images[0]))
            return messages

        image_num = 0
        total_responses = len(history_responses)
        for history_idx, history_response in enumerate(history_responses):
            # Only the last history_n responses are eligible for an image, and
            # the final entry of `images` is held back for the trailing slot.
            # NOTE(review): with `>=`, history_n responses are eligible while
            # _prepare_images keeps at most history_n - 1 history images, so
            # images may pair with the response one step earlier than their
            # own — confirm against BaseAgent.history_images before changing.
            is_recent = history_idx + self.history_n >= total_responses
            if is_recent and image_num < len(images) - 1:
                messages.append(self._image_message(images[image_num]))
                image_num += 1

            messages.append(
                {
                    "role": "assistant",
                    "content": [{"type": "text", "text": history_response}],
                }
            )

        # Add the next unused image, expected to be the current screenshot.
        if image_num < len(images):
            messages.append(self._image_message(images[image_num]))

        return messages

    def predict(
        self,
        instruction: str,
        obs: Dict[str, Any],
        **kwargs: Any,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Predict the next action based on the current observation.

        The LLM call and response parsing are attempted up to three times; any
        exception is printed and triggers the next attempt. A trajectory step
        is recorded only when an attempt succeeds.

        Args:
            instruction: Task instruction/goal.
            obs: Current observation containing:
                - screenshot: PIL Image or bytes of current screen
                - accessibility_tree: Optional accessibility tree data
            **kwargs: Additional arguments (not read by this method).

        Returns:
            Tuple of (prediction_text, action_dict) where:
                - prediction_text: Raw model response or error message
                - action_dict: Parsed action dictionary
            When every attempt fails, returns
            ("llm client error", {"action": None}).
        """
        # Set task goal if not already set
        if not self.traj_memory.task_goal:
            self.traj_memory.task_goal = instruction

        # Process screenshot
        screenshot_pil = obs["screenshot"]
        screenshot_bytes = safe_pil_to_bytes(screenshot_pil)

        images = self._prepare_images(screenshot_bytes)
        messages = self._build_messages(instruction, images)

        # Make API call with retry logic
        max_retries = 3
        prediction = None
        action_json = None
        thinking = None

        for attempt in range(max_retries):
            try:
                messages_print = mask_image_urls_for_logging(messages)
                print(f"Messages (attempt {attempt + 1}):\n{messages_print}")

                response = self.llm.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    extra_body={"repetition_penalty": 1.0, "top_k": self.top_k},
                    seed=42,
                )
                prediction = response.choices[0].message.content.strip()
                print(f"Raw response:\n{prediction}")

                # Parse response
                parsed_response = parse_action_to_structure_output(prediction)
                thinking = parsed_response["thinking"]
                action_json = parsed_response["action_json"]
                print(f"Parsed response:\n{parsed_response}")
                break

            except Exception as e:
                # Deliberately broad: API failures and unparsable responses
                # are both handled by retrying.
                print(f"Error on attempt {attempt + 1}: {e}")
                traceback.print_exc()
                # Discard a prediction that was received but failed to parse.
                prediction = None
                action_json = None

        # Return error if all retries failed
        if prediction is None or action_json is None:
            print("Max retry attempts reached, returning error flag.")
            return "llm client error", {"action": None}

        # Create and store trajectory step
        traj_step = TrajStep(
            screenshot=screenshot_pil,
            accessibility_tree=obs.get("accessibility_tree"),
            prediction=prediction,
            action=action_json,
            conclusion="",
            thought=thinking,
            step_index=len(self.traj_memory.steps),
            agent_type="MAIMobileAgent",
            model_name=self.model_name,
            screenshot_bytes=screenshot_bytes,
            structured_action={"action_json": action_json},
        )
        self.traj_memory.steps.append(traj_step)

        return prediction, action_json

    def reset(self, runtime_logger: Any = None) -> None:
        """
        Reset the agent for a new task by delegating to BaseAgent.reset().

        Args:
            runtime_logger: Optional logger (unused, kept for API compatibility).
        """
        super().reset()
|
mai_agent/prompt.py
DELETED
|
@@ -1,148 +0,0 @@
|
|
|
1
|
-
# Copyright (c) 2025, Alibaba Cloud and its affiliates;
|
|
2
|
-
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
3
|
-
# you may not use this file except in compliance with the License.
|
|
4
|
-
# You may obtain a copy of the License at
|
|
5
|
-
#
|
|
6
|
-
# http://www.apache.org/licenses/LICENSE-2.0
|
|
7
|
-
#
|
|
8
|
-
# Unless required by applicable law or agreed to in writing, software
|
|
9
|
-
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
10
|
-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
11
|
-
# See the License for the specific language governing permissions and
|
|
12
|
-
# limitations under the License.
|
|
13
|
-
|
|
14
|
-
"""System prompts for MAI Mobile Agent."""
|
|
15
|
-
|
|
16
|
-
from jinja2 import Template
|
|
17
|
-
|
|
18
|
-
MAI_MOBILE_SYS_PROMPT = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
19
|
-
|
|
20
|
-
## Output Format
|
|
21
|
-
For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
|
22
|
-
```
|
|
23
|
-
<thinking>
|
|
24
|
-
...
|
|
25
|
-
</thinking>
|
|
26
|
-
<tool_call>
|
|
27
|
-
{"name": "mobile_use", "arguments": <args-json-object>}
|
|
28
|
-
</tool_call>
|
|
29
|
-
```
|
|
30
|
-
|
|
31
|
-
## Action Space
|
|
32
|
-
|
|
33
|
-
{"action": "click", "coordinate": [x, y]}
|
|
34
|
-
{"action": "long_press", "coordinate": [x, y]}
|
|
35
|
-
{"action": "type", "text": ""}
|
|
36
|
-
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
|
|
37
|
-
{"action": "open", "text": "app_name"}
|
|
38
|
-
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
|
|
39
|
-
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
|
|
40
|
-
{"action": "wait"}
|
|
41
|
-
{"action": "terminate", "status": "success or fail"}
|
|
42
|
-
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.
|
|
43
|
-
|
|
44
|
-
|
|
45
|
-
## Note
|
|
46
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part.
|
|
47
|
-
- Available Apps: `["Camera","Chrome","Clock","Contacts","Dialer","Files","Settings","Markor","Tasks","Simple Draw Pro","Simple Gallery Pro","Simple SMS Messenger","Audio Recorder","Pro Expense","Broccoli APP","OSMand","VLC","Joplin","Retro Music","OpenTracks","Simple Calendar Pro"]`.
|
|
48
|
-
You should use the `open` action to open the app as possible as you can, because it is the fast way to open the app.
|
|
49
|
-
- You must follow the Action Space strictly, and return the correct json object within <thinking> </thinking> and <tool_call></tool_call> XML tags.
|
|
50
|
-
""".strip()
|
|
51
|
-
|
|
52
|
-
|
|
53
|
-
MAI_MOBILE_SYS_PROMPT_NO_THINKING = """You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
54
|
-
|
|
55
|
-
## Output Format
|
|
56
|
-
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
|
57
|
-
```
|
|
58
|
-
<tool_call>
|
|
59
|
-
{"name": "mobile_use", "arguments": <args-json-object>}
|
|
60
|
-
</tool_call>
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
## Action Space
|
|
64
|
-
|
|
65
|
-
{"action": "click", "coordinate": [x, y]}
|
|
66
|
-
{"action": "long_press", "coordinate": [x, y]}
|
|
67
|
-
{"action": "type", "text": ""}
|
|
68
|
-
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
|
|
69
|
-
{"action": "open", "text": "app_name"}
|
|
70
|
-
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
|
|
71
|
-
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
|
|
72
|
-
{"action": "wait"}
|
|
73
|
-
{"action": "terminate", "status": "success or fail"}
|
|
74
|
-
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
## Note
|
|
78
|
-
- Available Apps: `["Camera","Chrome","Clock","Contacts","Dialer","Files","Settings","Markor","Tasks","Simple Draw Pro","Simple Gallery Pro","Simple SMS Messenger","Audio Recorder","Pro Expense","Broccoli APP","OSMand","VLC","Joplin","Retro Music","OpenTracks","Simple Calendar Pro"]`.
|
|
79
|
-
You should use the `open` action to open the app as possible as you can, because it is the fast way to open the app.
|
|
80
|
-
- You must follow the Action Space strictly, and return the correct json object within <thinking> </thinking> and <tool_call></tool_call> XML tags.
|
|
81
|
-
""".strip()
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
# Jinja2 template variant: adds ask_user/double_click actions and an optional MCP tools section
|
|
85
|
-
MAI_MOBILE_SYS_PROMPT_ASK_USER_MCP = Template(
|
|
86
|
-
"""You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
87
|
-
|
|
88
|
-
## Output Format
|
|
89
|
-
For each function call, return the thinking process in <thinking> </thinking> tags, and a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
|
90
|
-
```
|
|
91
|
-
<thinking>
|
|
92
|
-
...
|
|
93
|
-
</thinking>
|
|
94
|
-
<tool_call>
|
|
95
|
-
{"name": "mobile_use", "arguments": <args-json-object>}
|
|
96
|
-
</tool_call>
|
|
97
|
-
```
|
|
98
|
-
|
|
99
|
-
## Action Space
|
|
100
|
-
|
|
101
|
-
{"action": "click", "coordinate": [x, y]}
|
|
102
|
-
{"action": "long_press", "coordinate": [x, y]}
|
|
103
|
-
{"action": "type", "text": ""}
|
|
104
|
-
{"action": "swipe", "direction": "up or down or left or right", "coordinate": [x, y]} # "coordinate" is optional. Use the "coordinate" if you want to swipe a specific UI element.
|
|
105
|
-
{"action": "open", "text": "app_name"}
|
|
106
|
-
{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}
|
|
107
|
-
{"action": "system_button", "button": "button_name"} # Options: back, home, menu, enter
|
|
108
|
-
{"action": "wait"}
|
|
109
|
-
{"action": "terminate", "status": "success or fail"}
|
|
110
|
-
{"action": "answer", "text": "xxx"} # Use escape characters \\', \\", and \\n in text part to ensure we can parse the text in normal python string format.
|
|
111
|
-
{"action": "ask_user", "text": "xxx"} # you can ask user for more information to complete the task.
|
|
112
|
-
{"action": "double_click", "coordinate": [x, y]}
|
|
113
|
-
|
|
114
|
-
{% if tools -%}
|
|
115
|
-
## MCP Tools
|
|
116
|
-
You are also provided with MCP tools, you can use them to complete the task.
|
|
117
|
-
{{ tools }}
|
|
118
|
-
|
|
119
|
-
If you want to use MCP tools, you must output as the following format:
|
|
120
|
-
```
|
|
121
|
-
<thinking>
|
|
122
|
-
...
|
|
123
|
-
</thinking>
|
|
124
|
-
<tool_call>
|
|
125
|
-
{"name": <function-name>, "arguments": <args-json-object>}
|
|
126
|
-
</tool_call>
|
|
127
|
-
```
|
|
128
|
-
{% endif -%}
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
## Note
|
|
132
|
-
- Available Apps: `["Contacts", "Settings", "Clock", "Maps", "Chrome", "Calendar", "files", "Gallery", "Taodian", "Mattermost", "Mastodon", "Mail", "SMS", "Camera"]`.
|
|
133
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in <thinking></thinking> part.
|
|
134
|
-
""".strip()
|
|
135
|
-
)
|
|
136
|
-
|
|
137
|
-
MAI_MOBILE_SYS_PROMPT_GROUNDING = """
|
|
138
|
-
You are a GUI grounding agent.
|
|
139
|
-
## Task
|
|
140
|
-
Given a screenshot and the user's grounding instruction. Your task is to accurately locate a UI element based on the user's instructions.
|
|
141
|
-
First, you should carefully examine the screenshot and analyze the user's instructions, translate the user's instruction into a effective reasoning process, and then provide the final coordinate.
|
|
142
|
-
## Output Format
|
|
143
|
-
Return a json object with a reasoning process in <grounding_think></grounding_think> tags, a [x,y] format coordinate within <answer></answer> XML tags:
|
|
144
|
-
<grounding_think>...</grounding_think>
|
|
145
|
-
<answer>
|
|
146
|
-
{"coordinate": [x,y]}
|
|
147
|
-
</answer>
|
|
148
|
-
""".strip()
|