autoglm-gui 0.1.10__py3-none-any.whl → 0.2.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: autoglm-gui
3
- Version: 0.1.10
3
+ Version: 0.2.2
4
4
  Summary: Web GUI for AutoGLM Phone Agent - AI-powered Android automation
5
5
  Project-URL: Homepage, https://github.com/suyiiyii/AutoGLM-GUI
6
6
  Project-URL: Repository, https://github.com/suyiiyii/AutoGLM-GUI
@@ -22,7 +22,6 @@ Classifier: Topic :: Software Development :: Libraries :: Python Modules
22
22
  Requires-Python: >=3.10
23
23
  Requires-Dist: fastapi>=0.124.0
24
24
  Requires-Dist: openai>=2.9.0
25
- Requires-Dist: phone-agent
26
25
  Requires-Dist: pillow>=11.3.0
27
26
  Requires-Dist: uvicorn[standard]>=0.38.0
28
27
  Description-Content-Type: text/markdown
@@ -58,7 +57,7 @@ AutoGLM-GUI 只需要一个 OpenAI 兼容的模型服务。你可以:
58
57
  - 使用官方已托管的第三方服务
59
58
  - 智谱 BigModel:`--base-url https://open.bigmodel.cn/api/paas/v4`,`--model autoglm-phone`,`--apikey <你的 API Key>`
60
59
  - ModelScope:`--base-url https://api-inference.modelscope.cn/v1`,`--model ZhipuAI/AutoGLM-Phone-9B`,`--apikey <你的 API Key>`
61
- - 或自建服务:参考上游项目的部署文档(`/Users/suyiiyii/Documents/git/Open-AutoGLM/README.md` 或仓库 README) 用 vLLM/SGLang 部署 `zai-org/AutoGLM-Phone-9B`,启动 OpenAI 兼容端口后将 `--base-url` 指向你的服务。
60
+ - 或自建服务:参考上游项目的[部署文档](https://github.com/zai-org/Open-AutoGLM/blob/main/README.md)用 vLLM/SGLang 部署 `zai-org/AutoGLM-Phone-9B`,启动 OpenAI 兼容端口后将 `--base-url` 指向你的服务。
62
61
 
63
62
  示例:
64
63
 
@@ -0,0 +1,34 @@
1
+ AutoGLM_GUI/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
2
+ AutoGLM_GUI/__main__.py,sha256=jinPYge6MvGaDZPdTZa4rtc7vUx39WZg_x6NXMjOuSs,4917
3
+ AutoGLM_GUI/scrcpy_stream.py,sha256=A6SPQzCWPg2mqVUeLB5WGNnKitYAC7ozEwoojEbJE5o,19800
4
+ AutoGLM_GUI/server.py,sha256=1UN5SN5vsxhi1tbBPHvojzYoIRqRH2rLbn9ZgCMTFMY,14965
5
+ AutoGLM_GUI/adb_plus/__init__.py,sha256=GVRx9G0QG_TxbEQc1FCx8eA2UCPvZeIC4kC3amoQzYk,180
6
+ AutoGLM_GUI/adb_plus/screenshot.py,sha256=TR0ortVd48LZ_evbBUkdnFmS1Us12hbpvUEwkNlUov8,3245
7
+ AutoGLM_GUI/static/index.html,sha256=wrqe9LW08fslYpYXAy2FMV3aGDV3Z8vvKSK1woDgwiE,395
8
+ AutoGLM_GUI/static/assets/about-uuv-AkSr.js,sha256=XGaj-c2munlVOcJCBXPthoRucW4l6BhVkVF3Op7tvuA,155
9
+ AutoGLM_GUI/static/assets/chat-Bl1mU48-.js,sha256=FgFMNOfR29xKS0QDuEaP8Q3XnfY2AxpvPz9ANRglswo,60776
10
+ AutoGLM_GUI/static/assets/index-B6TfcGH7.js,sha256=JO0sAp0nipAYEL2SfWyAqi72RvE0MVIT2p9cLf5_MEA,228
11
+ AutoGLM_GUI/static/assets/index-BCzw2xc6.css,sha256=Zi8sI_VW-X8ZfqbPj7_lOb_9cN7ujq5TnCc5c2VT29M,18094
12
+ AutoGLM_GUI/static/assets/index-BhEqSAe_.js,sha256=mVUo9OKOTSDNuXRBe4RmMoqzu9a51vnFGh4EBPCz1GM,277184
13
+ phone_agent/__init__.py,sha256=6_AwafPDiJsEVstiMcfj-HbvpuxWTC2di6MVbQzHK6s,287
14
+ phone_agent/agent.py,sha256=mvw_nDupyqjCjS8WekI9v11G6IMAOdK8Tx1vbi80Gdk,8124
15
+ phone_agent/actions/__init__.py,sha256=YiAMeBMVhRfmDOf0hQ_1FEBSV5ApcnHPQ05gHjA_weQ,160
16
+ phone_agent/actions/handler.py,sha256=kIz6SxRwxzoGoFgxMzASM-a-qDZsFE6TcieIMtE-yC4,10744
17
+ phone_agent/adb/__init__.py,sha256=w3CBSGRv-cQPME-Q-Bx2tbL_WJkpb9RqmT2olNK9TZw,950
18
+ phone_agent/adb/connection.py,sha256=f5V4IxREo0CedXAmadqCxYJXTskSISXdpjlHXeR9H28,10083
19
+ phone_agent/adb/device.py,sha256=kSkGM0B3QLyZXlec1QfmmLwy85S1kPRp3Yldq93pxPk,5655
20
+ phone_agent/adb/input.py,sha256=d__mISZd7xDCWZLIcWjfJ4EUrpnVKAWNgKo1q7k7n48,2885
21
+ phone_agent/adb/screenshot.py,sha256=9MFOvoAKV0U3H5zjn8mQyqgLWqQwIStxlr_4itFO-cc,3186
22
+ phone_agent/config/__init__.py,sha256=3rrDQhBQUpgAlJe9bJfpuGA3ugzZsLerq06eMVtJIV0,861
23
+ phone_agent/config/apps.py,sha256=3QCp29T73_jFkAubAY6CRW_vlAT_EY7qJ66FFKqCVQo,8773
24
+ phone_agent/config/i18n.py,sha256=2xs1x05ouGRVILGQSgL3UMzi6wu6iqYy3scagAObRXA,2096
25
+ phone_agent/config/prompts.py,sha256=547DMu849nKQKL_PO0TWMgvSaW7Qo_-DV8Hsx2fUWQ4,8091
26
+ phone_agent/config/prompts_en.py,sha256=t8C9YLailtg8kVlFtYvq3t9EdtMGE8HastTfDh5BAUM,2630
27
+ phone_agent/config/prompts_zh.py,sha256=vSWo5_qlWwNHVYUxrePBiL2FmJFnQG0Jpb1NsVsVJos,8256
28
+ phone_agent/model/__init__.py,sha256=w6yFkla9VoUvX5vZkTWa7Z5Bg5_lfA9AajrLjZsdL24,149
29
+ phone_agent/model/client.py,sha256=6EM7BIdj9UvqPG9YmgwSGWVxensH5HDfCSAgHDeBgeo,5842
30
+ autoglm_gui-0.2.2.dist-info/METADATA,sha256=uEeINImNfq60cA7qXSpJkITLU1RYobyOJbs097Y3GU8,4621
31
+ autoglm_gui-0.2.2.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
32
+ autoglm_gui-0.2.2.dist-info/entry_points.txt,sha256=sz4rBO_kgrYmOiT0QnhCCv0b9QqBdWyCjugJgY8AEOI,58
33
+ autoglm_gui-0.2.2.dist-info/licenses/LICENSE,sha256=0IkSHDewdtmXnmYzTNq4U47EJYjTuhjQNpT0bZKuqWc,11342
34
+ autoglm_gui-0.2.2.dist-info/RECORD,,
@@ -0,0 +1,11 @@
1
+ """
2
+ Phone Agent - An AI-powered phone automation framework.
3
+
4
+ This package provides tools for automating Android phone interactions
5
+ using AI models for visual understanding and decision making.
6
+ """
7
+
8
+ from phone_agent.agent import PhoneAgent
9
+
10
+ __version__ = "0.1.0"
11
+ __all__ = ["PhoneAgent"]
@@ -0,0 +1,5 @@
1
+ """Action handling module for Phone Agent."""
2
+
3
+ from phone_agent.actions.handler import ActionHandler, ActionResult
4
+
5
+ __all__ = ["ActionHandler", "ActionResult"]
@@ -0,0 +1,307 @@
1
+ """Action handler for processing AI model outputs."""
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from typing import Any, Callable
6
+
7
+ from phone_agent.adb import (
8
+ back,
9
+ clear_text,
10
+ detect_and_set_adb_keyboard,
11
+ double_tap,
12
+ home,
13
+ launch_app,
14
+ long_press,
15
+ restore_keyboard,
16
+ swipe,
17
+ tap,
18
+ type_text,
19
+ )
20
+
21
+
22
+ @dataclass
23
+ class ActionResult:
24
+ """Result of an action execution."""
25
+
26
+ success: bool
27
+ should_finish: bool
28
+ message: str | None = None
29
+ requires_confirmation: bool = False
30
+
31
+
32
class ActionHandler:
    """
    Handles execution of actions from AI model output.

    An action is a dict whose ``_metadata`` key is ``"do"`` or ``"finish"``.
    For ``"do"`` actions the ``action`` key names the concrete operation
    (``"Tap"``, ``"Swipe"``, ...), which is dispatched to a ``_handle_*``
    method.

    Args:
        device_id: Optional ADB device ID for multi-device setups.
        confirmation_callback: Optional callback for sensitive action
            confirmation. Should return True to proceed, False to cancel.
            Defaults to a console prompt.
        takeover_callback: Optional callback for takeover requests
            (login, captcha). Defaults to a console prompt.
    """

    # Sleep used by the Wait action when no usable duration is supplied.
    _DEFAULT_WAIT_SECONDS = 1.0

    def __init__(
        self,
        device_id: str | None = None,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
    ):
        self.device_id = device_id
        self.confirmation_callback = confirmation_callback or self._default_confirmation
        self.takeover_callback = takeover_callback or self._default_takeover

    def execute(
        self, action: dict[str, Any], screen_width: int, screen_height: int
    ) -> ActionResult:
        """
        Execute an action from the AI model.

        Args:
            action: The action dictionary from the model.
            screen_width: Current screen width in pixels.
            screen_height: Current screen height in pixels.

        Returns:
            ActionResult indicating success and whether to finish. Handler
            exceptions are not propagated; they are reported as a failed,
            non-finishing result.
        """
        action_type = action.get("_metadata")

        if action_type == "finish":
            return ActionResult(
                success=True, should_finish=True, message=action.get("message")
            )

        if action_type != "do":
            return ActionResult(
                success=False,
                should_finish=True,
                message=f"Unknown action type: {action_type}",
            )

        action_name = action.get("action")
        handler_method = self._get_handler(action_name)

        if handler_method is None:
            return ActionResult(
                success=False,
                should_finish=False,
                message=f"Unknown action: {action_name}",
            )

        # Boundary between model-driven actions and device I/O: any handler
        # failure becomes a result object so the caller can carry on.
        try:
            return handler_method(action, screen_width, screen_height)
        except Exception as e:
            return ActionResult(
                success=False, should_finish=False, message=f"Action failed: {e}"
            )

    def _get_handler(self, action_name: str) -> Callable | None:
        """Return the bound handler for ``action_name``, or None if unknown."""
        handlers = {
            "Launch": self._handle_launch,
            "Tap": self._handle_tap,
            "Type": self._handle_type,
            "Type_Name": self._handle_type,
            "Swipe": self._handle_swipe,
            "Back": self._handle_back,
            "Home": self._handle_home,
            "Double Tap": self._handle_double_tap,
            "Long Press": self._handle_long_press,
            "Wait": self._handle_wait,
            "Take_over": self._handle_takeover,
            "Note": self._handle_note,
            "Call_API": self._handle_call_api,
            "Interact": self._handle_interact,
        }
        return handlers.get(action_name)

    def _convert_relative_to_absolute(
        self, element: list[int], screen_width: int, screen_height: int
    ) -> tuple[int, int]:
        """Convert relative coordinates (0-1000) to absolute pixels."""
        x = int(element[0] / 1000 * screen_width)
        y = int(element[1] / 1000 * screen_height)
        return x, y

    @staticmethod
    def _parse_wait_seconds(raw: Any) -> float:
        """
        Turn a Wait action's ``duration`` value into a sleepable float.

        Accepts strings such as ``"2 seconds"`` as well as bare numbers.
        Unparseable, NaN or infinite values fall back to the default;
        negative values are clamped to 0 because ``time.sleep`` rejects them.
        """
        try:
            seconds = float(str(raw).replace("seconds", "").strip())
        except ValueError:
            return ActionHandler._DEFAULT_WAIT_SECONDS
        if not seconds < float("inf"):  # True for NaN and +inf
            return ActionHandler._DEFAULT_WAIT_SECONDS
        return max(seconds, 0.0)

    def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle app launch action."""
        app_name = action.get("app")
        if not app_name:
            return ActionResult(False, False, "No app name specified")

        success = launch_app(app_name, self.device_id)
        if success:
            return ActionResult(True, False)
        return ActionResult(False, False, f"App not found: {app_name}")

    def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle tap action, asking for confirmation when a message is attached."""
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)

        # A "message" on a tap marks it as a sensitive operation that the
        # user must approve first.
        if "message" in action:
            if not self.confirmation_callback(action["message"]):
                return ActionResult(
                    success=False,
                    should_finish=True,
                    message="User cancelled sensitive operation",
                )

        tap(x, y, self.device_id)
        return ActionResult(True, False)

    def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
        """
        Handle text input action.

        Switches to the ADB keyboard, clears the current field, types the
        text and restores the previous keyboard, pausing 1 second after each
        step. The keyboard is restored even if clearing or typing fails.
        """
        text = action.get("text", "")

        # Switch to ADB keyboard
        original_ime = detect_and_set_adb_keyboard(self.device_id)
        try:
            time.sleep(1.0)

            # Clear existing text and type new text
            clear_text(self.device_id)
            time.sleep(1.0)

            type_text(text, self.device_id)
            time.sleep(1.0)
        finally:
            # Restore original keyboard; in `finally` so a failed clear/type
            # does not leave the device on the ADB keyboard.
            restore_keyboard(original_ime, self.device_id)
            time.sleep(1.0)

        return ActionResult(True, False)

    def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle swipe action."""
        start = action.get("start")
        end = action.get("end")

        if not start or not end:
            return ActionResult(False, False, "Missing swipe coordinates")

        start_x, start_y = self._convert_relative_to_absolute(start, width, height)
        end_x, end_y = self._convert_relative_to_absolute(end, width, height)

        swipe(start_x, start_y, end_x, end_y, device_id=self.device_id)
        return ActionResult(True, False)

    def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle back button action."""
        back(self.device_id)
        return ActionResult(True, False)

    def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle home button action."""
        home(self.device_id)
        return ActionResult(True, False)

    def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle double tap action."""
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)
        double_tap(x, y, self.device_id)
        return ActionResult(True, False)

    def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle long press action."""
        element = action.get("element")
        if not element:
            return ActionResult(False, False, "No element coordinates")

        x, y = self._convert_relative_to_absolute(element, width, height)
        long_press(x, y, device_id=self.device_id)
        return ActionResult(True, False)

    def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle wait action by sleeping for the requested duration."""
        duration = self._parse_wait_seconds(action.get("duration", "1 seconds"))
        time.sleep(duration)
        return ActionResult(True, False)

    def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle takeover request (login, captcha, etc.)."""
        message = action.get("message", "User intervention required")
        self.takeover_callback(message)
        return ActionResult(True, False)

    def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle note action (placeholder for content recording)."""
        # This action is typically used for recording page content
        # Implementation depends on specific requirements
        return ActionResult(True, False)

    def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle API call action (placeholder for summarization)."""
        # This action is typically used for content summarization
        # Implementation depends on specific requirements
        return ActionResult(True, False)

    def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
        """Handle interaction request (user choice needed)."""
        # This action signals that user input is needed
        return ActionResult(True, False, message="User interaction required")

    @staticmethod
    def _default_confirmation(message: str) -> bool:
        """Default confirmation callback using console input."""
        response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
        # Tolerate surrounding whitespace so "y " still counts as a yes.
        return response.strip().upper() == "Y"

    @staticmethod
    def _default_takeover(message: str) -> None:
        """Default takeover callback using console input."""
        input(f"{message}\nPress Enter after completing manual operation...")
266
+
267
+
268
def parse_action(response: str) -> dict[str, Any]:
    """
    Parse action from model response.

    After surrounding whitespace is stripped, two forms are recognised:

    * ``do(key=value, ...)`` -- a call with keyword arguments only, whose
      values must be Python literals (strings, numbers, lists, dicts, ...).
      The result is those keyword arguments plus ``"_metadata": "do"``.
    * ``finish(message="...")`` -- the result is
      ``{"_metadata": "finish", "message": <text between the quotes>}``.

    Args:
        response: Raw response string from the model.

    Returns:
        Parsed action dictionary.

    Raises:
        ValueError: If the response cannot be parsed.
    """
    # Function-scope import: the module does not otherwise import ``ast``.
    import ast

    if not isinstance(response, str):
        raise ValueError(
            f"Failed to parse action: expected str, got {type(response).__name__}"
        )

    text = response.strip()

    if text.startswith("finish"):
        # Kept as plain text slicing rather than real parsing: the slice
        # drops the opening quote and the trailing `")`, which tolerates
        # unescaped quotes inside the message.
        return {
            "_metadata": "finish",
            "message": text.replace("finish(message=", "")[1:-2],
        }

    if not text.startswith("do"):
        raise ValueError(f"Failed to parse action: {text}")

    # SECURITY: this used to be ``eval(text)``. The text comes straight from
    # the model, so evaluating it would let a crafted response run arbitrary
    # code. Only a literal-valued ``do(...)`` call is accepted now.
    try:
        call = ast.parse(text, mode="eval").body
        if not (
            isinstance(call, ast.Call)
            and isinstance(call.func, ast.Name)
            and call.func.id == "do"
        ):
            raise ValueError("expected a single do(...) call")
        if call.args or any(kw.arg is None for kw in call.keywords):
            raise ValueError("do(...) accepts keyword arguments only")
        action = {kw.arg: ast.literal_eval(kw.value) for kw in call.keywords}
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError) as exc:
        raise ValueError(f"Failed to parse action: {exc}") from exc

    # Set last so it overrides any "_metadata" supplied in the call itself.
    action["_metadata"] = "do"
    return action
296
+
297
+
298
def do(**kwargs) -> dict[str, Any]:
    """Build a 'do' action: the keyword arguments tagged with ``_metadata="do"``."""
    return {**kwargs, "_metadata": "do"}
302
+
303
+
304
def finish(**kwargs) -> dict[str, Any]:
    """Build a 'finish' action: the keyword arguments tagged with ``_metadata="finish"``."""
    return {**kwargs, "_metadata": "finish"}
@@ -0,0 +1,51 @@
1
+ """ADB utilities for Android device interaction."""
2
+
3
+ from phone_agent.adb.connection import (
4
+ ADBConnection,
5
+ ConnectionType,
6
+ DeviceInfo,
7
+ list_devices,
8
+ quick_connect,
9
+ )
10
+ from phone_agent.adb.device import (
11
+ back,
12
+ double_tap,
13
+ get_current_app,
14
+ home,
15
+ launch_app,
16
+ long_press,
17
+ swipe,
18
+ tap,
19
+ )
20
+ from phone_agent.adb.input import (
21
+ clear_text,
22
+ detect_and_set_adb_keyboard,
23
+ restore_keyboard,
24
+ type_text,
25
+ )
26
+ from phone_agent.adb.screenshot import get_screenshot
27
+
28
+ __all__ = [
29
+ # Screenshot
30
+ "get_screenshot",
31
+ # Input
32
+ "type_text",
33
+ "clear_text",
34
+ "detect_and_set_adb_keyboard",
35
+ "restore_keyboard",
36
+ # Device control
37
+ "get_current_app",
38
+ "tap",
39
+ "swipe",
40
+ "back",
41
+ "home",
42
+ "double_tap",
43
+ "long_press",
44
+ "launch_app",
45
+ # Connection management
46
+ "ADBConnection",
47
+ "DeviceInfo",
48
+ "ConnectionType",
49
+ "quick_connect",
50
+ "list_devices",
51
+ ]