autoglm-gui 1.3.1__py3-none-any.whl → 1.4.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AutoGLM_GUI/__main__.py +0 -4
- AutoGLM_GUI/adb_plus/qr_pair.py +8 -8
- AutoGLM_GUI/agents/__init__.py +20 -0
- AutoGLM_GUI/agents/factory.py +160 -0
- AutoGLM_GUI/agents/mai_adapter.py +627 -0
- AutoGLM_GUI/agents/protocols.py +23 -0
- AutoGLM_GUI/api/__init__.py +50 -7
- AutoGLM_GUI/api/agents.py +61 -19
- AutoGLM_GUI/api/devices.py +12 -18
- AutoGLM_GUI/api/dual_model.py +24 -17
- AutoGLM_GUI/api/health.py +13 -0
- AutoGLM_GUI/api/layered_agent.py +659 -0
- AutoGLM_GUI/api/mcp.py +11 -10
- AutoGLM_GUI/api/version.py +23 -10
- AutoGLM_GUI/api/workflows.py +2 -1
- AutoGLM_GUI/config_manager.py +56 -24
- AutoGLM_GUI/device_adapter.py +263 -0
- AutoGLM_GUI/device_protocol.py +266 -0
- AutoGLM_GUI/devices/__init__.py +49 -0
- AutoGLM_GUI/devices/adb_device.py +205 -0
- AutoGLM_GUI/devices/mock_device.py +183 -0
- AutoGLM_GUI/devices/remote_device.py +172 -0
- AutoGLM_GUI/dual_model/decision_model.py +4 -4
- AutoGLM_GUI/dual_model/protocols.py +3 -3
- AutoGLM_GUI/exceptions.py +3 -3
- AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +291 -0
- AutoGLM_GUI/metrics.py +13 -20
- AutoGLM_GUI/phone_agent_manager.py +219 -134
- AutoGLM_GUI/phone_agent_patches.py +2 -1
- AutoGLM_GUI/platform_utils.py +5 -2
- AutoGLM_GUI/prompts.py +6 -1
- AutoGLM_GUI/schemas.py +45 -14
- AutoGLM_GUI/scrcpy_stream.py +17 -13
- AutoGLM_GUI/server.py +3 -1
- AutoGLM_GUI/socketio_server.py +16 -4
- AutoGLM_GUI/state.py +10 -30
- AutoGLM_GUI/static/assets/{about-Cj6QXqMf.js → about-_XNhzQZX.js} +1 -1
- AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +126 -0
- AutoGLM_GUI/static/assets/{dialog-CxJlnjzH.js → dialog-B3uW4T8V.js} +3 -3
- AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +1 -0
- AutoGLM_GUI/static/assets/{index-C_B-Arvf.js → index-Cy8TmmHV.js} +1 -1
- AutoGLM_GUI/static/assets/{index-CxJQuE4y.js → index-UYYauTly.js} +6 -6
- AutoGLM_GUI/static/assets/{workflows-BTiGCNI0.js → workflows-Du_de-dt.js} +1 -1
- AutoGLM_GUI/static/index.html +2 -2
- AutoGLM_GUI/types.py +125 -0
- {autoglm_gui-1.3.1.dist-info → autoglm_gui-1.4.1.dist-info}/METADATA +147 -65
- {autoglm_gui-1.3.1.dist-info → autoglm_gui-1.4.1.dist-info}/RECORD +58 -39
- mai_agent/base.py +137 -0
- mai_agent/mai_grounding_agent.py +263 -0
- mai_agent/mai_naivigation_agent.py +526 -0
- mai_agent/prompt.py +148 -0
- mai_agent/unified_memory.py +67 -0
- mai_agent/utils.py +73 -0
- phone_agent/config/prompts.py +6 -1
- phone_agent/config/prompts_zh.py +6 -1
- AutoGLM_GUI/config.py +0 -23
- AutoGLM_GUI/static/assets/chat-BJeomZgh.js +0 -124
- AutoGLM_GUI/static/assets/index-Z0uYCPOO.css +0 -1
- {autoglm_gui-1.3.1.dist-info → autoglm_gui-1.4.1.dist-info}/WHEEL +0 -0
- {autoglm_gui-1.3.1.dist-info → autoglm_gui-1.4.1.dist-info}/entry_points.txt +0 -0
- {autoglm_gui-1.3.1.dist-info → autoglm_gui-1.4.1.dist-info}/licenses/LICENSE +0 -0
mai_agent/base.py
ADDED

```diff
@@ -0,0 +1,137 @@
+# Copyright (c) 2025, Alibaba Cloud and its affiliates;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Base agent class for mobile GUI automation agents."""
+
+from abc import ABC, abstractmethod
+from typing import Any, Dict, List, Tuple
+
+from unified_memory import TrajMemory
+
+
+class BaseAgent(ABC):
+    """
+    Abstract base class for all GUI automation agents.
+
+    This class provides common functionality for trajectory management
+    and defines the interface that all agents must implement.
+    """
+
+    def __init__(self) -> None:
+        """Initialize the base agent with empty trajectory memory."""
+        self.traj_memory = TrajMemory(
+            task_goal="",
+            task_id="",
+            steps=[],
+        )
+
+    @property
+    def thoughts(self) -> List[str]:
+        """Return list of thoughts from trajectory memory."""
+        return [step.thought if step.thought else "" for step in self.traj_memory.steps]
+
+    @property
+    def actions(self) -> List[Dict[str, Any]]:
+        """Return list of actions from trajectory memory."""
+        return [step.action for step in self.traj_memory.steps]
+
+    @property
+    def conclusions(self) -> List[str]:
+        """Return list of conclusions from trajectory memory."""
+        return [step.conclusion for step in self.traj_memory.steps]
+
+    @property
+    def observations(self) -> List[Dict[str, Any]]:
+        """Return list of observations from trajectory memory."""
+        return [
+            {
+                "screenshot": step.screenshot_bytes,
+                "accessibility_tree": step.accessibility_tree,
+            }
+            for step in self.traj_memory.steps
+        ]
+
+    @property
+    def history_images(self) -> List[bytes]:
+        """Return list of screenshot bytes from trajectory memory."""
+        return [step.screenshot_bytes for step in self.traj_memory.steps]
+
+    @property
+    def history_responses(self) -> List[str]:
+        """Return list of predictions from trajectory memory."""
+        return [step.prediction for step in self.traj_memory.steps]
+
+    @abstractmethod
+    def predict(
+        self,
+        instruction: str,
+        obs: Dict[str, Any],
+        **kwargs: Any,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Predict the next action based on the current observation.
+
+        Args:
+            instruction: Task instruction/goal.
+            obs: Current observation containing screenshot and optional accessibility tree.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            Tuple of (prediction_text, action_dict).
+        """
+        pass
+
+    def reset(self) -> None:
+        """Reset the trajectory memory for a new task."""
+        self.traj_memory = TrajMemory(
+            task_goal="",
+            task_id="",
+            steps=[],
+        )
+
+    def load_traj(self, traj_memory: TrajMemory) -> None:
+        """
+        Load trajectory from existing TrajMemory object.
+
+        Args:
+            traj_memory: TrajMemory object containing trajectory data.
+        """
+        self.traj_memory = traj_memory
+
+    def save_traj(self) -> Dict[str, Any]:
+        """
+        Save current trajectory to a dictionary format.
+
+        Returns:
+            Dictionary containing the trajectory data that can be serialized.
+        """
+        steps_data = []
+        for step in self.traj_memory.steps:
+            step_dict = {
+                "screenshot_bytes": step.screenshot_bytes,
+                "accessibility_tree": step.accessibility_tree,
+                "prediction": step.prediction,
+                "action": step.action,
+                "conclusion": step.conclusion,
+                "thought": step.thought,
+                "step_index": step.step_index,
+                "agent_type": step.agent_type,
+                "model_name": step.model_name,
+            }
+            steps_data.append(step_dict)
+
+        return {
+            "task_goal": self.traj_memory.task_goal,
+            "task_id": self.traj_memory.task_id,
+            "steps": steps_data,
+        }
```
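The `TrajMemory` type above comes from `mai_agent/unified_memory.py`, which this release also adds (+67 lines) but which is not rendered in this diff. The sketch below is illustrative only: the `Step` and `TrajMemory` stand-ins are hypothetical reconstructions inferred from the fields `base.py` reads, and `ClickCenterAgent` simply mirrors the `predict()` contract defined above without importing the package.

```python
# Minimal sketch, NOT the package's code: Step and TrajMemory are hypothetical
# stand-ins for mai_agent/unified_memory.py (added in this release but not
# shown here), with fields inferred from what base.py accesses.
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Tuple


@dataclass
class Step:
    action: Dict[str, Any]
    prediction: str = ""
    thought: Optional[str] = None
    conclusion: str = ""
    screenshot_bytes: bytes = b""
    accessibility_tree: Optional[str] = None
    step_index: int = 0
    agent_type: str = "demo"
    model_name: str = "demo-model"


@dataclass
class TrajMemory:
    task_goal: str
    task_id: str
    steps: List[Step] = field(default_factory=list)


class ClickCenterAgent:
    """Toy agent following the BaseAgent contract: predict() returns
    (prediction_text, action_dict) and records each step in traj_memory."""

    def __init__(self) -> None:
        self.traj_memory = TrajMemory(task_goal="", task_id="")

    def predict(
        self, instruction: str, obs: Dict[str, Any], **kwargs: Any
    ) -> Tuple[str, Dict[str, Any]]:
        action = {"type": "click", "coordinate": [0.5, 0.5]}
        self.traj_memory.steps.append(
            Step(
                action=action,
                prediction="click the center of the screen",
                screenshot_bytes=obs.get("screenshot", b""),
                step_index=len(self.traj_memory.steps),
            )
        )
        return "click the center of the screen", action


agent = ClickCenterAgent()
agent.predict("open the first app", {"screenshot": b""})
print([s.action for s in agent.traj_memory.steps])
# -> [{'type': 'click', 'coordinate': [0.5, 0.5]}]
```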
mai_agent/mai_grounding_agent.py
ADDED

```diff
@@ -0,0 +1,263 @@
+# Copyright (c) 2025, Alibaba Cloud and its affiliates;
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+MAI Grounding Agent - A GUI grounding agent for locating UI elements.
+
+This module provides the MAIGroundingAgent class that uses vision-language models
+to locate UI elements based on natural language instructions.
+"""
+
+import json
+import re
+from io import BytesIO
+from typing import Any, Dict, Optional, Tuple, Union
+
+from openai import OpenAI
+from PIL import Image
+
+from prompt import MAI_MOBILE_SYS_PROMPT_GROUNDING
+from utils import pil_to_base64
+
+
+# Constants
+SCALE_FACTOR = 999
+
+
+def parse_grounding_response(text: str) -> Dict[str, Any]:
+    """
+    Parse model output text containing grounding_think and answer tags.
+
+    Args:
+        text: Raw model output containing <grounding_think> and <answer> tags.
+
+    Returns:
+        Dictionary with keys:
+        - "thinking": The model's reasoning process
+        - "coordinate": Normalized [x, y] coordinate
+
+    Raises:
+        ValueError: If parsing fails or JSON is invalid.
+    """
+    text = text.strip()
+
+    result: Dict[str, Any] = {
+        "thinking": None,
+        "coordinate": None,
+    }
+
+    # Extract thinking content
+    think_pattern = r"<grounding_think>(.*?)</grounding_think>"
+    think_match = re.search(think_pattern, text, re.DOTALL)
+    if think_match:
+        result["thinking"] = think_match.group(1).strip()
+
+    # Extract answer content
+    answer_pattern = r"<answer>(.*?)</answer>"
+    answer_match = re.search(answer_pattern, text, re.DOTALL)
+    if answer_match:
+        answer_text = answer_match.group(1).strip()
+        try:
+            answer_json = json.loads(answer_text)
+            coordinates = answer_json.get("coordinate", [])
+            if len(coordinates) == 2:
+                # Normalize coordinates from SCALE_FACTOR range to [0, 1]
+                point_x = coordinates[0] / SCALE_FACTOR
+                point_y = coordinates[1] / SCALE_FACTOR
+                result["coordinate"] = [point_x, point_y]
+            else:
+                raise ValueError(
+                    f"Invalid coordinate format: expected 2 values, got {len(coordinates)}"
+                )
+        except json.JSONDecodeError as e:
+            raise ValueError(f"Invalid JSON in answer: {e}")
+
+    return result
+
+
+class MAIGroundingAgent:
+    """
+    GUI grounding agent using vision-language models.
+
+    This agent processes a screenshot and natural language instruction to
+    locate a specific UI element and return its coordinates.
+
+    Attributes:
+        llm_base_url: Base URL for the LLM API endpoint.
+        model_name: Name of the model to use for predictions.
+        runtime_conf: Configuration dictionary for runtime parameters.
+    """
+
+    def __init__(
+        self,
+        llm_base_url: str,
+        model_name: str,
+        runtime_conf: Optional[Dict[str, Any]] = None,
+    ):
+        """
+        Initialize the MAIGroundingAgent.
+
+        Args:
+            llm_base_url: Base URL for the LLM API endpoint.
+            model_name: Name of the model to use.
+            runtime_conf: Optional configuration dictionary with keys:
+                - max_pixels: Maximum pixels for image processing
+                - min_pixels: Minimum pixels for image processing
+                - temperature: Sampling temperature (default: 0.0)
+                - top_k: Top-k sampling parameter (default: -1)
+                - top_p: Top-p sampling parameter (default: 1.0)
+                - max_tokens: Maximum tokens in response (default: 2048)
+        """
+        # Set default configuration
+        default_conf = {
+            "temperature": 0.0,
+            "top_k": -1,
+            "top_p": 1.0,
+            "max_tokens": 2048,
+        }
+        self.runtime_conf = {**default_conf, **(runtime_conf or {})}
+
+        self.llm_base_url = llm_base_url
+        self.model_name = model_name
+        self.llm = OpenAI(
+            base_url=self.llm_base_url,
+            api_key="empty",
+        )
+
+        # Extract frequently used config values
+        self.temperature = self.runtime_conf["temperature"]
+        self.top_k = self.runtime_conf["top_k"]
+        self.top_p = self.runtime_conf["top_p"]
+        self.max_tokens = self.runtime_conf["max_tokens"]
+
+    @property
+    def system_prompt(self) -> str:
+        """Return the system prompt for grounding tasks."""
+        return MAI_MOBILE_SYS_PROMPT_GROUNDING
+
+    def _build_messages(
+        self,
+        instruction: str,
+        image: Image.Image,
+    ) -> list:
+        """
+        Build the message list for the LLM API call.
+
+        Args:
+            instruction: Grounding instruction from user.
+            image: PIL Image of the screenshot.
+            magic_prompt: Whether to use the magic prompt format.
+
+        Returns:
+            List of message dictionaries for the API.
+        """
+        encoded_string = pil_to_base64(image)
+
+        messages = [
+            {
+                "role": "system",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": self.system_prompt,
+                    }
+                ],
+            }
+        ]
+
+        messages.append(
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": instruction + "\n",
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": f"data:image/png;base64,{encoded_string}"},
+                    },
+                ],
+            }
+        )
+
+        return messages
+
+    def predict(
+        self,
+        instruction: str,
+        image: Union[Image.Image, bytes],
+        **kwargs: Any,
+    ) -> Tuple[str, Dict[str, Any]]:
+        """
+        Predict the coordinate of the UI element based on the instruction.
+
+        Args:
+            instruction: Grounding instruction describing the UI element to locate.
+            image: PIL Image or bytes of the screenshot.
+            **kwargs: Additional arguments (unused).
+
+        Returns:
+            Tuple of (prediction_text, result_dict) where:
+            - prediction_text: Raw model response or error message
+            - result_dict: Dictionary containing:
+                - "thinking": Model's reasoning process
+                - "coordinate": Normalized [x, y] coordinate
+        """
+        # Convert bytes to PIL Image if necessary
+        if isinstance(image, bytes):
+            image = Image.open(BytesIO(image))
+
+        if image.mode != "RGB":
+            image = image.convert("RGB")
+
+        # Build messages
+        messages = self._build_messages(instruction, image)
+
+        # Make API call with retry logic
+        max_retries = 3
+        prediction = None
+        result = None
+
+        for attempt in range(max_retries):
+            try:
+                response = self.llm.chat.completions.create(
+                    model=self.model_name,
+                    messages=messages,
+                    max_tokens=self.max_tokens,
+                    temperature=self.temperature,
+                    top_p=self.top_p,
+                    frequency_penalty=0.0,
+                    presence_penalty=0.0,
+                    extra_body={"repetition_penalty": 1.0, "top_k": self.top_k},
+                    seed=42,
+                )
+                prediction = response.choices[0].message.content.strip()
+                print(f"Raw response:\n{prediction}")
+
+                # Parse response
+                result = parse_grounding_response(prediction)
+                print(f"Parsed result:\n{result}")
+                break
+
+            except Exception as e:
+                print(f"Error on attempt {attempt + 1}: {e}")
+                prediction = None
+                result = None
+
+        # Return error if all retries failed
+        if prediction is None or result is None:
+            print("Max retry attempts reached, returning error flag.")
+            return "llm client error", {"thinking": None, "coordinate": None}

+        return prediction, result
```
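The detail worth noting here is the coordinate protocol: the model answers with integer coordinates on a 0-999 grid inside an `<answer>` JSON payload, and `parse_grounding_response` divides by `SCALE_FACTOR` to yield resolution-independent `[0, 1]` values. A self-contained sketch of that wire format (the sample model output below is invented for illustration; the tag names and scale come from the code above):

```python
# Sketch of the response format parse_grounding_response expects.
# The <grounding_think>/<answer> tags and the 0-999 grid are taken from the
# diff above; the raw model output string is an invented example.
import json
import re

SCALE_FACTOR = 999

raw = (
    "<grounding_think>The search icon sits in the top-right corner.</grounding_think>\n"
    '<answer>{"coordinate": [850, 42]}</answer>'
)

answer_text = re.search(r"<answer>(.*?)</answer>", raw, re.DOTALL).group(1)
x, y = json.loads(answer_text)["coordinate"]
print(round(x / SCALE_FACTOR, 3), round(y / SCALE_FACTOR, 3))  # -> 0.851 0.042
```

Constructing the agent itself only requires an OpenAI-compatible endpoint (note `api_key="empty"`, typical of self-hosted vLLM-style servers), e.g. `MAIGroundingAgent(llm_base_url="http://localhost:8000/v1", model_name="...")`; that URL is a placeholder, not a value this diff specifies.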