autoglm-gui 1.4.0__py3-none-any.whl → 1.4.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. AutoGLM_GUI/__main__.py +0 -4
  2. AutoGLM_GUI/adb_plus/qr_pair.py +8 -8
  3. AutoGLM_GUI/agents/__init__.py +20 -0
  4. AutoGLM_GUI/agents/factory.py +160 -0
  5. AutoGLM_GUI/agents/mai_adapter.py +627 -0
  6. AutoGLM_GUI/agents/protocols.py +23 -0
  7. AutoGLM_GUI/api/__init__.py +48 -7
  8. AutoGLM_GUI/api/agents.py +61 -17
  9. AutoGLM_GUI/api/devices.py +12 -18
  10. AutoGLM_GUI/api/dual_model.py +15 -9
  11. AutoGLM_GUI/api/health.py +13 -0
  12. AutoGLM_GUI/api/layered_agent.py +239 -166
  13. AutoGLM_GUI/api/mcp.py +11 -10
  14. AutoGLM_GUI/api/version.py +23 -10
  15. AutoGLM_GUI/api/workflows.py +2 -1
  16. AutoGLM_GUI/config_manager.py +55 -1
  17. AutoGLM_GUI/device_adapter.py +263 -0
  18. AutoGLM_GUI/device_protocol.py +266 -0
  19. AutoGLM_GUI/devices/__init__.py +49 -0
  20. AutoGLM_GUI/devices/adb_device.py +205 -0
  21. AutoGLM_GUI/devices/mock_device.py +183 -0
  22. AutoGLM_GUI/devices/remote_device.py +172 -0
  23. AutoGLM_GUI/dual_model/decision_model.py +4 -4
  24. AutoGLM_GUI/exceptions.py +3 -3
  25. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +2 -2
  26. AutoGLM_GUI/metrics.py +13 -20
  27. AutoGLM_GUI/phone_agent_manager.py +219 -134
  28. AutoGLM_GUI/phone_agent_patches.py +2 -1
  29. AutoGLM_GUI/platform_utils.py +5 -2
  30. AutoGLM_GUI/schemas.py +47 -0
  31. AutoGLM_GUI/scrcpy_stream.py +17 -13
  32. AutoGLM_GUI/server.py +3 -1
  33. AutoGLM_GUI/socketio_server.py +16 -4
  34. AutoGLM_GUI/state.py +10 -30
  35. AutoGLM_GUI/static/assets/{about-DeclntHg.js → about-_XNhzQZX.js} +1 -1
  36. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +126 -0
  37. AutoGLM_GUI/static/assets/{dialog-BfdcBs1x.js → dialog-B3uW4T8V.js} +3 -3
  38. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +1 -0
  39. AutoGLM_GUI/static/assets/{index-zQ4KKDHt.js → index-Cy8TmmHV.js} +1 -1
  40. AutoGLM_GUI/static/assets/{index-DHF1NZh0.js → index-UYYauTly.js} +6 -6
  41. AutoGLM_GUI/static/assets/{workflows-xiplap-r.js → workflows-Du_de-dt.js} +1 -1
  42. AutoGLM_GUI/static/index.html +2 -2
  43. AutoGLM_GUI/types.py +125 -0
  44. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.4.1.dist-info}/METADATA +83 -4
  45. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.4.1.dist-info}/RECORD +54 -37
  46. mai_agent/base.py +137 -0
  47. mai_agent/mai_grounding_agent.py +263 -0
  48. mai_agent/mai_naivigation_agent.py +526 -0
  49. mai_agent/prompt.py +148 -0
  50. mai_agent/unified_memory.py +67 -0
  51. mai_agent/utils.py +73 -0
  52. AutoGLM_GUI/config.py +0 -23
  53. AutoGLM_GUI/static/assets/chat-Iut2yhSw.js +0 -125
  54. AutoGLM_GUI/static/assets/index-5hCCwHA7.css +0 -1
  55. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.4.1.dist-info}/WHEEL +0 -0
  56. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.4.1.dist-info}/entry_points.txt +0 -0
  57. {autoglm_gui-1.4.0.dist-info → autoglm_gui-1.4.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,263 @@
1
+ # Copyright (c) 2025, Alibaba Cloud and its affiliates;
2
+ # Licensed under the Apache License, Version 2.0 (the "License");
3
+ # you may not use this file except in compliance with the License.
4
+ # You may obtain a copy of the License at
5
+ #
6
+ # http://www.apache.org/licenses/LICENSE-2.0
7
+ #
8
+ # Unless required by applicable law or agreed to in writing, software
9
+ # distributed under the License is distributed on an "AS IS" BASIS,
10
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11
+ # See the License for the specific language governing permissions and
12
+ # limitations under the License.
13
+
14
+ """
15
+ MAI Grounding Agent - A GUI grounding agent for locating UI elements.
16
+
17
+ This module provides the MAIGroundingAgent class that uses vision-language models
18
+ to locate UI elements based on natural language instructions.
19
+ """
20
+
21
+ import json
22
+ import re
23
+ from io import BytesIO
24
+ from typing import Any, Dict, Optional, Tuple, Union
25
+
26
+ from openai import OpenAI
27
+ from PIL import Image
28
+
29
+ from prompt import MAI_MOBILE_SYS_PROMPT_GROUNDING
30
+ from utils import pil_to_base64
31
+
32
+
33
# Constants
SCALE_FACTOR = 999  # model emits integer coordinates in [0, SCALE_FACTOR]


def parse_grounding_response(text: str) -> Dict[str, Any]:
    """
    Parse model output text containing grounding_think and answer tags.

    Args:
        text: Raw model output containing <grounding_think> and <answer> tags.

    Returns:
        Dictionary with keys:
        - "thinking": The model's reasoning process (or None if absent)
        - "coordinate": Normalized [x, y] coordinate in [0, 1] (or None if absent)

    Raises:
        ValueError: If the <answer> payload is not valid JSON, or if its
            "coordinate" entry does not contain exactly two values.
    """
    text = text.strip()

    result: Dict[str, Any] = {
        "thinking": None,
        "coordinate": None,
    }

    # Extract thinking content; DOTALL lets the reasoning span multiple lines.
    think_match = re.search(
        r"<grounding_think>(.*?)</grounding_think>", text, re.DOTALL
    )
    if think_match:
        result["thinking"] = think_match.group(1).strip()

    # Extract answer content (a JSON object with a "coordinate" key).
    answer_match = re.search(r"<answer>(.*?)</answer>", text, re.DOTALL)
    if answer_match:
        answer_text = answer_match.group(1).strip()
        # Keep the try body minimal: only the JSON decode can raise
        # JSONDecodeError; the coordinate-shape check below raises its own
        # ValueError and must not be conflated with a decode failure.
        try:
            answer_json = json.loads(answer_text)
        except json.JSONDecodeError as e:
            # Chain the cause so the original decode error stays visible.
            raise ValueError(f"Invalid JSON in answer: {e}") from e

        coordinates = answer_json.get("coordinate", [])
        if len(coordinates) != 2:
            raise ValueError(
                f"Invalid coordinate format: expected 2 values, got {len(coordinates)}"
            )

        # Normalize coordinates from the model's [0, SCALE_FACTOR] range to [0, 1].
        result["coordinate"] = [
            coordinates[0] / SCALE_FACTOR,
            coordinates[1] / SCALE_FACTOR,
        ]

    return result
+
87
+
88
class MAIGroundingAgent:
    """
    GUI grounding agent using vision-language models.

    This agent processes a screenshot and natural language instruction to
    locate a specific UI element and return its coordinates.

    Attributes:
        llm_base_url: Base URL for the OpenAI-compatible LLM API endpoint.
        model_name: Name of the model to use for predictions.
        runtime_conf: Merged configuration dictionary (defaults + overrides).
    """

    def __init__(
        self,
        llm_base_url: str,
        model_name: str,
        runtime_conf: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the MAIGroundingAgent.

        Args:
            llm_base_url: Base URL for the LLM API endpoint.
            model_name: Name of the model to use.
            runtime_conf: Optional configuration overrides with keys:
                - temperature: Sampling temperature (default: 0.0)
                - top_k: Top-k sampling parameter (default: -1)
                - top_p: Top-p sampling parameter (default: 1.0)
                - max_tokens: Maximum tokens in response (default: 2048)

        NOTE(review): upstream docs also mention max_pixels/min_pixels keys,
        but nothing in this class reads them — confirm before relying on them.
        """
        default_conf = {
            "temperature": 0.0,
            "top_k": -1,
            "top_p": 1.0,
            "max_tokens": 2048,
        }
        # Caller-supplied values take precedence over the defaults.
        self.runtime_conf = {**default_conf, **(runtime_conf or {})}

        self.llm_base_url = llm_base_url
        self.model_name = model_name
        # api_key="empty": the endpoint is presumably an unauthenticated
        # OpenAI-compatible server (e.g. a local vLLM) — verify in deployment.
        self.llm = OpenAI(
            base_url=self.llm_base_url,
            api_key="empty",
        )

        # Extract frequently used config values.
        self.temperature = self.runtime_conf["temperature"]
        self.top_k = self.runtime_conf["top_k"]
        self.top_p = self.runtime_conf["top_p"]
        self.max_tokens = self.runtime_conf["max_tokens"]

    @property
    def system_prompt(self) -> str:
        """Return the system prompt for grounding tasks."""
        return MAI_MOBILE_SYS_PROMPT_GROUNDING

    def _build_messages(
        self,
        instruction: str,
        image: Image.Image,
    ) -> list:
        """
        Build the message list for the LLM API call.

        Args:
            instruction: Grounding instruction from user.
            image: PIL Image of the screenshot.

        Returns:
            List of message dictionaries in OpenAI chat format: one system
            message with the grounding prompt, one user message carrying the
            instruction text plus the screenshot as a base64 data URL.
        """
        encoded_string = pil_to_base64(image)

        return [
            {
                "role": "system",
                "content": [
                    {
                        "type": "text",
                        "text": self.system_prompt,
                    }
                ],
            },
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": instruction + "\n",
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": f"data:image/png;base64,{encoded_string}"
                        },
                    },
                ],
            },
        ]

    def predict(
        self,
        instruction: str,
        image: Union[Image.Image, bytes],
        **kwargs: Any,
    ) -> Tuple[str, Dict[str, Any]]:
        """
        Predict the coordinate of the UI element based on the instruction.

        Args:
            instruction: Grounding instruction describing the UI element to locate.
            image: PIL Image or raw encoded bytes of the screenshot.
            **kwargs: Additional arguments (unused).

        Returns:
            Tuple of (prediction_text, result_dict) where:
            - prediction_text: Raw model response, or "llm client error" if
              all retry attempts failed.
            - result_dict: Dictionary containing:
                - "thinking": Model's reasoning process (None on failure)
                - "coordinate": Normalized [x, y] coordinate (None on failure)
        """
        # Accept raw PNG/JPEG bytes as well as a PIL Image.
        if isinstance(image, bytes):
            image = Image.open(BytesIO(image))

        # The model expects RGB input; drop alpha / palette modes.
        if image.mode != "RGB":
            image = image.convert("RGB")

        messages = self._build_messages(instruction, image)

        # Make the API call with simple retry logic (no backoff).
        max_retries = 3
        prediction = None
        result = None

        for attempt in range(max_retries):
            try:
                response = self.llm.chat.completions.create(
                    model=self.model_name,
                    messages=messages,
                    max_tokens=self.max_tokens,
                    temperature=self.temperature,
                    top_p=self.top_p,
                    frequency_penalty=0.0,
                    presence_penalty=0.0,
                    # top_k / repetition_penalty are server-side extensions
                    # (not part of the OpenAI schema), hence extra_body.
                    extra_body={"repetition_penalty": 1.0, "top_k": self.top_k},
                    seed=42,  # fixed seed for reproducibility
                )
                prediction = response.choices[0].message.content.strip()
                print(f"Raw response:\n{prediction}")

                # Parse response; a malformed answer raises and triggers a retry.
                result = parse_grounding_response(prediction)
                print(f"Parsed result:\n{result}")
                break

            except Exception as e:
                # Best-effort retry: both API failures and parse failures land
                # here. Reset state so a partial success from this attempt is
                # never returned as if it were complete.
                print(f"Error on attempt {attempt + 1}: {e}")
                prediction = None
                result = None

        # Return the error sentinel if all retries failed.
        if prediction is None or result is None:
            print("Max retry attempts reached, returning error flag.")
            return "llm client error", {"thinking": None, "coordinate": None}

        return prediction, result