autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
  7. {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
  8. {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
  9. {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
  10. AutoGLM_GUI/adb/screenshot.py +11 -0
  11. {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
  12. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  13. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  14. AutoGLM_GUI/adb_plus/serial.py +38 -20
  15. AutoGLM_GUI/adb_plus/touch.py +4 -9
  16. AutoGLM_GUI/agents/__init__.py +43 -12
  17. AutoGLM_GUI/agents/events.py +19 -0
  18. AutoGLM_GUI/agents/factory.py +31 -38
  19. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  20. AutoGLM_GUI/agents/glm/agent.py +297 -0
  21. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  22. AutoGLM_GUI/agents/glm/parser.py +110 -0
  23. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
  24. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
  25. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  26. AutoGLM_GUI/agents/mai/agent.py +408 -0
  27. AutoGLM_GUI/agents/mai/parser.py +254 -0
  28. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  29. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  30. AutoGLM_GUI/agents/protocols.py +12 -8
  31. AutoGLM_GUI/agents/stream_runner.py +193 -0
  32. AutoGLM_GUI/api/__init__.py +40 -21
  33. AutoGLM_GUI/api/agents.py +181 -239
  34. AutoGLM_GUI/api/control.py +9 -6
  35. AutoGLM_GUI/api/devices.py +102 -12
  36. AutoGLM_GUI/api/history.py +104 -0
  37. AutoGLM_GUI/api/layered_agent.py +67 -15
  38. AutoGLM_GUI/api/media.py +64 -1
  39. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  40. AutoGLM_GUI/config.py +81 -0
  41. AutoGLM_GUI/config_manager.py +68 -51
  42. AutoGLM_GUI/device_manager.py +248 -29
  43. AutoGLM_GUI/device_protocol.py +1 -1
  44. AutoGLM_GUI/devices/adb_device.py +5 -10
  45. AutoGLM_GUI/devices/mock_device.py +4 -2
  46. AutoGLM_GUI/devices/remote_device.py +8 -3
  47. AutoGLM_GUI/history_manager.py +164 -0
  48. AutoGLM_GUI/model/__init__.py +5 -0
  49. AutoGLM_GUI/model/message_builder.py +69 -0
  50. AutoGLM_GUI/model/types.py +24 -0
  51. AutoGLM_GUI/models/__init__.py +10 -0
  52. AutoGLM_GUI/models/history.py +140 -0
  53. AutoGLM_GUI/models/scheduled_task.py +71 -0
  54. AutoGLM_GUI/parsers/__init__.py +22 -0
  55. AutoGLM_GUI/parsers/base.py +50 -0
  56. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  57. AutoGLM_GUI/phone_agent_manager.py +62 -396
  58. AutoGLM_GUI/platform_utils.py +26 -0
  59. AutoGLM_GUI/prompt_config.py +15 -0
  60. AutoGLM_GUI/prompts/__init__.py +32 -0
  61. AutoGLM_GUI/scheduler_manager.py +350 -0
  62. AutoGLM_GUI/schemas.py +246 -72
  63. AutoGLM_GUI/scrcpy_stream.py +142 -24
  64. AutoGLM_GUI/socketio_server.py +100 -27
  65. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
  66. AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
  67. AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
  68. AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
  69. AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
  70. AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
  71. AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
  72. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
  73. AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
  74. AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
  75. AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
  76. AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
  77. AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
  78. AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
  79. AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
  80. AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
  81. AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
  82. AutoGLM_GUI/static/index.html +2 -2
  83. AutoGLM_GUI/types.py +17 -0
  84. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
  85. autoglm_gui-1.5.1.dist-info/RECORD +118 -0
  86. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  87. AutoGLM_GUI/api/dual_model.py +0 -317
  88. AutoGLM_GUI/device_adapter.py +0 -263
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. mai_agent/base.py +0 -137
  103. mai_agent/mai_grounding_agent.py +0 -263
  104. mai_agent/mai_naivigation_agent.py +0 -526
  105. mai_agent/prompt.py +0 -148
  106. mai_agent/unified_memory.py +0 -67
  107. mai_agent/utils.py +0 -73
  108. phone_agent/__init__.py +0 -12
  109. phone_agent/actions/__init__.py +0 -5
  110. phone_agent/actions/handler.py +0 -400
  111. phone_agent/adb/screenshot.py +0 -108
  112. phone_agent/agent.py +0 -253
  113. phone_agent/agent_ios.py +0 -277
  114. phone_agent/config/__init__.py +0 -53
  115. phone_agent/config/apps_harmonyos.py +0 -256
  116. phone_agent/config/apps_ios.py +0 -339
  117. phone_agent/config/prompts.py +0 -80
  118. phone_agent/device_factory.py +0 -166
  119. phone_agent/hdc/__init__.py +0 -53
  120. phone_agent/hdc/connection.py +0 -384
  121. phone_agent/hdc/device.py +0 -269
  122. phone_agent/hdc/input.py +0 -145
  123. phone_agent/hdc/screenshot.py +0 -127
  124. phone_agent/model/__init__.py +0 -5
  125. phone_agent/model/client.py +0 -290
  126. phone_agent/xctest/__init__.py +0 -47
  127. phone_agent/xctest/connection.py +0 -379
  128. phone_agent/xctest/device.py +0 -472
  129. phone_agent/xctest/input.py +0 -311
  130. phone_agent/xctest/screenshot.py +0 -226
  131. {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
  132. {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
  133. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
  134. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
  135. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,442 +0,0 @@
1
- """
2
- 视觉小模型适配器
3
-
4
- 适配 autoglm-phone 等视觉模型,提供屏幕识别和动作执行能力
5
- """
6
-
7
- from dataclasses import dataclass
8
- from typing import Callable, Optional
9
-
10
- from phone_agent.model.client import ModelClient, ModelConfig, MessageBuilder
11
- from phone_agent.actions.handler import ActionHandler, parse_action
12
- from phone_agent.device_factory import get_device_factory
13
-
14
- from AutoGLM_GUI.logger import logger
15
- from .protocols import VISION_DESCRIBE_PROMPT
16
-
17
-
@dataclass
class ScreenDescription:
    """Result of a screen-description request."""

    description: str  # textual description of the screen
    current_app: str  # current application
    elements: list[str]  # main elements recognised on the screen
    raw_response: str = ""  # raw model reply; empty when not supplied
26
-
27
-
@dataclass
class ExecutionResult:
    """Result of executing an action."""

    success: bool
    action_type: str  # type of the action that was executed
    target: str  # description of the target
    position: Optional[tuple[int, int]] = None  # tap position, if any
    message: str = ""
    finished: bool = False
38
-
39
-
class VisionModel:
    """
    Small vision model: screen recognition and action execution.

    Uses a vision model such as autoglm-phone to recognise what is on the
    screen and to carry out concrete operations. In the dual-model setup it
    plays the role of the "eyes" and the "hands".
    """

    # Decision action -> handler action, for actions that must first be
    # located on screen.
    # NOTE(review): "swipe" is dispatched as a plain "Tap", which is what the
    # previous if/elif chain did through its final else branch. Confirm
    # whether a real swipe gesture was intended.
    _POSITIONAL_ACTIONS = {
        "tap": "Tap",
        "double_tap": "Double Tap",
        "long_press": "Long Press",
        "swipe": "Tap",
    }

    # Scroll direction -> (start, end) in normalised 0-1000 coordinates.
    _SCROLL_VECTORS = {
        "up": ((500, 700), (500, 300)),
        "down": ((500, 300), (500, 700)),
        "left": ((700, 500), (300, 500)),
        "right": ((300, 500), (700, 500)),
    }

    # Decision action -> (handler action, target label reported back).
    _KEY_ACTIONS = {
        "back": ("Back", "返回"),
        "home": ("Home", "主页"),
    }

    # Keywords that mark a description line as naming a UI element.
    _ELEMENT_KEYWORDS = (
        "按钮",
        "图标",
        "文本",
        "输入框",
        "搜索",
        "导航",
        "菜单",
        "列表",
    )

    def __init__(
        self,
        model_config: ModelConfig,
        device_id: str,
        confirmation_callback: Optional[Callable[[str], bool]] = None,
        takeover_callback: Optional[Callable[[str], None]] = None,
    ):
        """
        Create the model client and action handler for one device.

        Args:
            model_config: Configuration passed to ``ModelClient``.
            device_id: Identifier of the device to observe and control.
            confirmation_callback: Forwarded to ``ActionHandler``.
            takeover_callback: Forwarded to ``ActionHandler``.
        """
        self.model_config = model_config
        self.device_id = device_id
        self.model_client = ModelClient(model_config)
        self.action_handler = ActionHandler(
            device_id=device_id,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )
        self.device_factory = get_device_factory()

        logger.info(f"视觉小模型初始化: {model_config.model_name}, 设备: {device_id}")

    def capture_screenshot(self) -> tuple[str, int, int]:
        """
        Capture the current screen.

        Returns:
            (base64_string, width, height)
        """
        logger.debug("正在截取屏幕...")

        screenshot = self.device_factory.get_screenshot(self.device_id)

        logger.debug(f"截图完成: {screenshot.width}x{screenshot.height}")
        return (
            screenshot.base64_data,
            screenshot.width,
            screenshot.height,
        )

    def describe_screen(
        self,
        screenshot_base64: Optional[str] = None,
        on_thinking: Optional[Callable[[str], None]] = None,
    ) -> ScreenDescription:
        """
        Recognise and describe the screen content.

        Asks the vision model for a textual description of the current
        screen, for use by the decision model.

        Args:
            screenshot_base64: Optional base64 screenshot; captured
                automatically when omitted.
            on_thinking: Thinking-process callback. Accepted for interface
                compatibility; this method does not invoke it.

        Returns:
            ScreenDescription: The description. If the model request or the
            parsing raises, a fallback description carrying the error text
            and an empty element list is returned instead.
        """
        logger.info("正在识别屏幕内容...")

        # Only the image is needed here; the dimensions are not used.
        if screenshot_base64 is None:
            screenshot_base64, _, _ = self.capture_screenshot()

        current_app = self.device_factory.get_current_app(self.device_id)

        # Ask the model to describe the screen.
        messages = [
            MessageBuilder.create_system_message(
                "你是一个屏幕内容识别助手。请详细描述屏幕内容,帮助另一个AI做出操作决策。"
            ),
            MessageBuilder.create_user_message(
                text=f"""请描述这个屏幕的内容。

当前应用: {current_app}

{VISION_DESCRIBE_PROMPT}

请以结构化的方式描述屏幕内容。""",
                image_base64=screenshot_base64,
            ),
        ]

        try:
            response = self.model_client.request(messages)

            # Prefer the model's "thinking" text; fall back to raw content.
            description = response.thinking or response.raw_content

            # Simple keyword-based element extraction.
            elements = self._extract_elements(description)

            result = ScreenDescription(
                description=description,
                current_app=current_app,
                elements=elements,
                raw_response=response.raw_content,
            )

            logger.info(f"屏幕识别完成: {current_app}, 识别到 {len(elements)} 个元素")
            return result

        except Exception as e:
            # Best-effort: recognition failure is reported in the description
            # rather than propagated to the caller.
            logger.error(f"屏幕识别失败: {e}")
            return ScreenDescription(
                description=f"当前应用: {current_app},屏幕识别失败: {e}",
                current_app=current_app,
                elements=[],
            )

    def execute_decision(
        self,
        decision: dict,
        screenshot_base64: Optional[str] = None,
        on_thinking: Optional[Callable[[str], None]] = None,
    ) -> ExecutionResult:
        """
        Execute an operation according to the decision model's decision.

        Translates a high-level decision into a concrete screen operation.
        Decisions that need no device access ("finish", an unknown action, or
        "type" with empty content) are resolved before any screenshot is
        taken.

        Args:
            decision: The decision; reads the keys "action", "target",
                "content", and, depending on the action, "reasoning",
                "direction" and "app".
            screenshot_base64: Current screenshot, used to locate elements.
                When given, a fresh screenshot is still requested, but only
                to obtain the screen dimensions.
            on_thinking: Thinking-process callback, forwarded to the element
                locator.

        Returns:
            ExecutionResult: Outcome of the operation.
        """
        action_type = decision.get("action", "")
        target = decision.get("target", "")
        content = decision.get("content")

        logger.info(f"执行决策: {action_type} -> {target}")

        # --- Decisions that can be resolved without touching the device ---
        if action_type == "finish":
            return ExecutionResult(
                success=True,
                action_type="finish",
                target="",
                message=decision.get("reasoning", "任务完成"),
                finished=True,
            )

        is_supported = isinstance(action_type, str) and (
            action_type in self._POSITIONAL_ACTIONS
            or action_type in self._KEY_ACTIONS
            or action_type in ("type", "scroll", "launch")
        )
        if not is_supported:
            logger.warning(f"未知的动作类型: {action_type}")
            return ExecutionResult(
                success=False,
                action_type=action_type,
                target=target,
                message=f"未知的动作类型: {action_type}",
            )

        if action_type == "type" and not content:
            return ExecutionResult(
                success=False,
                action_type="type",
                target=target,
                message="输入内容为空",
            )

        # --- Screenshot and screen dimensions ---
        if screenshot_base64 is None:
            screenshot_base64, width, height = self.capture_screenshot()
        else:
            screenshot = self.device_factory.get_screenshot(self.device_id)
            width, height = screenshot.width, screenshot.height

        # --- Actions that must first be located on screen ---
        if action_type in self._POSITIONAL_ACTIONS:
            position = self._find_element_position(
                target, screenshot_base64, width, height, on_thinking
            )

            if position is None:
                return ExecutionResult(
                    success=False,
                    action_type=action_type,
                    target=target,
                    message=f"无法定位元素: {target}",
                )

            action_dict = {
                "_metadata": "do",
                "action": self._POSITIONAL_ACTIONS[action_type],
                "element": list(position),
            }
            result = self.action_handler.execute(action_dict, width, height)

            return ExecutionResult(
                success=result.success,
                action_type=action_type,
                target=target,
                # Convert normalised 0-1000 coordinates to pixels.
                position=(
                    int(position[0] * width / 1000),
                    int(position[1] * height / 1000),
                ),
                message=result.message or "",
                finished=result.should_finish,
            )

        # --- Text input ---
        if action_type == "type":
            action_dict = {
                "_metadata": "do",
                "action": "Type",
                "text": content,
            }
            result = self.action_handler.execute(action_dict, width, height)

            return ExecutionResult(
                success=result.success,
                action_type="type",
                target=target,
                # Echo at most the first 50 characters of the input.
                message=f"输入: {content[:50]}..."
                if len(content) > 50
                else f"输入: {content}",
            )

        # --- Scrolling ---
        if action_type == "scroll":
            direction = decision.get("direction", "up")
            # Any unrecognised direction is treated as "right".
            vector = (
                self._SCROLL_VECTORS.get(direction)
                if isinstance(direction, str)
                else None
            )
            start, end = vector or self._SCROLL_VECTORS["right"]

            action_dict = {
                "_metadata": "do",
                "action": "Swipe",
                "start": list(start),
                "end": list(end),
            }
            result = self.action_handler.execute(action_dict, width, height)

            return ExecutionResult(
                success=result.success,
                action_type="scroll",
                target=f"滚动{direction}",
            )

        # --- Back / Home ---
        if action_type in self._KEY_ACTIONS:
            handler_action, label = self._KEY_ACTIONS[action_type]
            action_dict = {"_metadata": "do", "action": handler_action}
            result = self.action_handler.execute(action_dict, width, height)

            return ExecutionResult(
                success=result.success,
                action_type=action_type,
                target=label,
            )

        # --- Launch an application (the only remaining supported action) ---
        app_name = target or decision.get("app", "")
        action_dict = {
            "_metadata": "do",
            "action": "Launch",
            "app": app_name,
        }
        result = self.action_handler.execute(action_dict, width, height)

        return ExecutionResult(
            success=result.success,
            action_type="launch",
            target=app_name,
            message=result.message or "",
        )

    def _find_element_position(
        self,
        target_description: str,
        screenshot_base64: str,
        _width: int,
        _height: int,
        _on_thinking: Optional[Callable[[str], None]] = None,
    ) -> Optional[tuple[int, int]]:
        """
        Locate an element with the vision model.

        Args:
            target_description: Description of the target element.
            screenshot_base64: Base64 screenshot.
            _width: Screen width (currently unused).
            _height: Screen height (currently unused).
            _on_thinking: Thinking-process callback (currently unused).

        Returns:
            (x, y) in normalised coordinates (0-1000), or None when the
            request fails or no coordinates can be parsed from the reply.
        """
        logger.debug(f"正在定位元素: {target_description}")

        # Build the locating request.
        messages = [
            MessageBuilder.create_system_message(
                """你是一个屏幕元素定位助手。根据用户描述的目标元素,找到它在屏幕上的位置。

请以以下格式返回:
do(action="Tap", element=[x, y])

其中 x 和 y 是 0-1000 范围的归一化坐标。
- x=0 表示最左边,x=1000 表示最右边
- y=0 表示最上边,y=1000 表示最下边

只返回坐标,不要其他解释。"""
            ),
            MessageBuilder.create_user_message(
                text=f"请找到并点击: {target_description}",
                image_base64=screenshot_base64,
            ),
        ]

        try:
            response = self.model_client.request(messages)

            # Parse the coordinates out of the model's action.
            action = parse_action(response.action)

            if action.get("_metadata") == "do" and "element" in action:
                element = action["element"]
                if isinstance(element, list) and len(element) >= 2:
                    x, y = int(element[0]), int(element[1])
                    logger.info(f"元素定位成功: ({x}, {y})")
                    return (x, y)

            logger.warning(f"无法从响应中解析坐标: {response.action}")
            return None

        except Exception as e:
            # Best-effort: any request/parsing error means "not found".
            logger.error(f"元素定位失败: {e}")
            return None

    def _extract_elements(self, description: str) -> list[str]:
        """
        Extract the main element lines from a description.

        A line is kept when, after stripping, it contains one of the element
        keywords and is shorter than 100 characters.

        Args:
            description: Screen description text.

        Returns:
            At most the first 10 matching lines, in their original order.
        """
        stripped_lines = (line.strip() for line in description.split("\n"))
        elements = [
            line
            for line in stripped_lines
            # Skip overly long lines: they are descriptions, not elements.
            if len(line) < 100 and any(kw in line for kw in self._ELEMENT_KEYWORDS)
        ]
        return elements[:10]

    def get_current_app(self) -> str:
        """Return the device factory's current app for this device."""
        return self.device_factory.get_current_app(self.device_id)