autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
  7. {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
  8. {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
  9. {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
  10. AutoGLM_GUI/adb/screenshot.py +11 -0
  11. {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
  12. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  13. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  14. AutoGLM_GUI/adb_plus/serial.py +38 -20
  15. AutoGLM_GUI/adb_plus/touch.py +4 -9
  16. AutoGLM_GUI/agents/__init__.py +43 -12
  17. AutoGLM_GUI/agents/events.py +19 -0
  18. AutoGLM_GUI/agents/factory.py +31 -38
  19. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  20. AutoGLM_GUI/agents/glm/agent.py +297 -0
  21. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  22. AutoGLM_GUI/agents/glm/parser.py +110 -0
  23. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
  24. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
  25. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  26. AutoGLM_GUI/agents/mai/agent.py +408 -0
  27. AutoGLM_GUI/agents/mai/parser.py +254 -0
  28. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  29. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  30. AutoGLM_GUI/agents/protocols.py +12 -8
  31. AutoGLM_GUI/agents/stream_runner.py +193 -0
  32. AutoGLM_GUI/api/__init__.py +40 -21
  33. AutoGLM_GUI/api/agents.py +181 -239
  34. AutoGLM_GUI/api/control.py +9 -6
  35. AutoGLM_GUI/api/devices.py +102 -12
  36. AutoGLM_GUI/api/history.py +104 -0
  37. AutoGLM_GUI/api/layered_agent.py +67 -15
  38. AutoGLM_GUI/api/media.py +64 -1
  39. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  40. AutoGLM_GUI/config.py +81 -0
  41. AutoGLM_GUI/config_manager.py +68 -51
  42. AutoGLM_GUI/device_manager.py +248 -29
  43. AutoGLM_GUI/device_protocol.py +1 -1
  44. AutoGLM_GUI/devices/adb_device.py +5 -10
  45. AutoGLM_GUI/devices/mock_device.py +4 -2
  46. AutoGLM_GUI/devices/remote_device.py +8 -3
  47. AutoGLM_GUI/history_manager.py +164 -0
  48. AutoGLM_GUI/model/__init__.py +5 -0
  49. AutoGLM_GUI/model/message_builder.py +69 -0
  50. AutoGLM_GUI/model/types.py +24 -0
  51. AutoGLM_GUI/models/__init__.py +10 -0
  52. AutoGLM_GUI/models/history.py +140 -0
  53. AutoGLM_GUI/models/scheduled_task.py +71 -0
  54. AutoGLM_GUI/parsers/__init__.py +22 -0
  55. AutoGLM_GUI/parsers/base.py +50 -0
  56. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  57. AutoGLM_GUI/phone_agent_manager.py +62 -396
  58. AutoGLM_GUI/platform_utils.py +26 -0
  59. AutoGLM_GUI/prompt_config.py +15 -0
  60. AutoGLM_GUI/prompts/__init__.py +32 -0
  61. AutoGLM_GUI/scheduler_manager.py +350 -0
  62. AutoGLM_GUI/schemas.py +246 -72
  63. AutoGLM_GUI/scrcpy_stream.py +142 -24
  64. AutoGLM_GUI/socketio_server.py +100 -27
  65. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
  66. AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
  67. AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
  68. AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
  69. AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
  70. AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
  71. AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
  72. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
  73. AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
  74. AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
  75. AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
  76. AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
  77. AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
  78. AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
  79. AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
  80. AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
  81. AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
  82. AutoGLM_GUI/static/index.html +2 -2
  83. AutoGLM_GUI/types.py +17 -0
  84. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
  85. autoglm_gui-1.5.1.dist-info/RECORD +118 -0
  86. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  87. AutoGLM_GUI/api/dual_model.py +0 -317
  88. AutoGLM_GUI/device_adapter.py +0 -263
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. mai_agent/base.py +0 -137
  103. mai_agent/mai_grounding_agent.py +0 -263
  104. mai_agent/mai_naivigation_agent.py +0 -526
  105. mai_agent/prompt.py +0 -148
  106. mai_agent/unified_memory.py +0 -67
  107. mai_agent/utils.py +0 -73
  108. phone_agent/__init__.py +0 -12
  109. phone_agent/actions/__init__.py +0 -5
  110. phone_agent/actions/handler.py +0 -400
  111. phone_agent/adb/screenshot.py +0 -108
  112. phone_agent/agent.py +0 -253
  113. phone_agent/agent_ios.py +0 -277
  114. phone_agent/config/__init__.py +0 -53
  115. phone_agent/config/apps_harmonyos.py +0 -256
  116. phone_agent/config/apps_ios.py +0 -339
  117. phone_agent/config/prompts.py +0 -80
  118. phone_agent/device_factory.py +0 -166
  119. phone_agent/hdc/__init__.py +0 -53
  120. phone_agent/hdc/connection.py +0 -384
  121. phone_agent/hdc/device.py +0 -269
  122. phone_agent/hdc/input.py +0 -145
  123. phone_agent/hdc/screenshot.py +0 -127
  124. phone_agent/model/__init__.py +0 -5
  125. phone_agent/model/client.py +0 -290
  126. phone_agent/xctest/__init__.py +0 -47
  127. phone_agent/xctest/connection.py +0 -379
  128. phone_agent/xctest/device.py +0 -472
  129. phone_agent/xctest/input.py +0 -311
  130. phone_agent/xctest/screenshot.py +0 -226
  131. {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
  132. {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
  133. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
  134. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
  135. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,254 @@
1
+ """MAI Agent parser using XML tags and JSON.
2
+
3
+ 从 mai_agent 的 XML 格式中提取 thinking 和 action,并转换为
4
+ AutoGLM_GUI 的标准格式。
5
+
6
+ 迁移说明:基于原有实现增强,添加 parse_with_thinking 方法。
7
+ """
8
+
9
+ import json
10
+ import re
11
+ from typing import Any
12
+
13
+
14
+ SCALE_FACTOR = 999
15
+
16
+
17
+ class MAIParseError(ValueError):
18
+ pass
19
+
20
+
21
+ class MAIParser:
22
+ """Parse MAI Agent XML + JSON format outputs.
23
+
24
+ Handles format like:
25
+ <thinking>Reasoning process</thinking>
26
+ <tool_call>{"name": "mobile_use", "arguments": {...}}</tool_call>
27
+
28
+ Converts MAI-specific actions to standard ActionHandler format.
29
+ Coordinate scale: 0-999 (automatically converted to 0-1000)
30
+ """
31
+
32
+ @property
33
+ def coordinate_scale(self) -> int:
34
+ return 999
35
+
36
+ def parse_with_thinking(self, raw_response: str) -> dict[str, Any]:
37
+ text = raw_response.strip()
38
+
39
+ if "</think>" in text and "</thinking>" not in text:
40
+ text = text.replace("</think>", "</thinking>")
41
+ text = "<thinking>" + text
42
+
43
+ pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
44
+ match = re.search(pattern, text, re.DOTALL)
45
+
46
+ if not match:
47
+ raise MAIParseError("Failed to find <thinking> and <tool_call> tags")
48
+
49
+ thinking = match.group(1).strip().strip('"')
50
+ tool_call_str = match.group(2).strip().strip('"')
51
+
52
+ try:
53
+ tool_call = json.loads(tool_call_str)
54
+ except json.JSONDecodeError as e:
55
+ raise MAIParseError(f"Invalid JSON in tool_call: {e}") from e
56
+
57
+ mai_action = tool_call.get("arguments", {})
58
+
59
+ if "coordinate" in mai_action:
60
+ mai_action["coordinate"] = self._normalize_coordinate_to_0_1(
61
+ mai_action["coordinate"]
62
+ )
63
+
64
+ return {
65
+ "thinking": thinking,
66
+ "raw_action": mai_action,
67
+ "converted_action": self._convert_action(mai_action),
68
+ }
69
+
70
+ def _normalize_coordinate_to_0_1(
71
+ self, coordinate: list[int | float]
72
+ ) -> list[float]:
73
+ if len(coordinate) == 2:
74
+ x, y = coordinate
75
+ elif len(coordinate) == 4:
76
+ x1, y1, x2, y2 = coordinate
77
+ x = (x1 + x2) / 2
78
+ y = (y1 + y2) / 2
79
+ else:
80
+ raise MAIParseError(
81
+ f"Invalid coordinate format: expected 2 or 4 values, got {len(coordinate)}"
82
+ )
83
+
84
+ return [x / SCALE_FACTOR, y / SCALE_FACTOR]
85
+
86
+ def parse(self, raw_response: str) -> dict[str, Any]:
87
+ """Parse MAI agent XML+JSON output.
88
+
89
+ Args:
90
+ raw_response: Model output containing <thinking> and <tool_call> tags.
91
+
92
+ Returns:
93
+ Standardized action dictionary with coordinates converted to 0-1000 scale.
94
+
95
+ Raises:
96
+ ValueError: If parsing fails or content is invalid JSON.
97
+ """
98
+ text = raw_response.strip()
99
+
100
+ if "</think>" in text and "</thinking>" not in text:
101
+ text = text.replace("</think>", "</thinking>")
102
+ text = "<thinking>" + text
103
+
104
+ pattern = r"<thinking>(.*?)</thinking>.*?<tool_call>(.*?)</tool_call>"
105
+ match = re.search(pattern, text, re.DOTALL)
106
+
107
+ if not match:
108
+ raise ValueError("Failed to find <thinking> and <tool_call> tags")
109
+
110
+ tool_call_str = match.group(2).strip().strip('"')
111
+
112
+ try:
113
+ tool_call = json.loads(tool_call_str)
114
+ except json.JSONDecodeError as e:
115
+ raise ValueError(f"Invalid JSON in tool_call: {e}") from e
116
+
117
+ mai_action = tool_call.get("arguments", {})
118
+ return self._convert_action(mai_action)
119
+
120
+ def _convert_action(self, mai_action: dict[str, Any]) -> dict[str, Any]:
121
+ """Convert MAI action format to standard ActionHandler format.
122
+
123
+ MAI format: {"action": "click", "coordinate": [x, y]}
124
+ Standard format: {"_metadata": "do", "action": "Tap", "element": [x, y]}
125
+ """
126
+ action_type = mai_action.get("action")
127
+
128
+ if action_type == "terminate":
129
+ status = mai_action.get("status", "success")
130
+ return {
131
+ "_metadata": "finish",
132
+ "message": "Task completed" if status == "success" else "Task failed",
133
+ }
134
+
135
+ if action_type == "answer":
136
+ return {
137
+ "_metadata": "finish",
138
+ "message": mai_action.get("text", ""),
139
+ }
140
+
141
+ if action_type == "wait":
142
+ return {
143
+ "_metadata": "do",
144
+ "action": "Wait",
145
+ "duration": "1 seconds",
146
+ }
147
+
148
+ if action_type == "system_button":
149
+ button_name = mai_action.get("button", "")
150
+ action_map = {
151
+ "back": "Back",
152
+ "home": "Home",
153
+ "enter": "Enter",
154
+ }
155
+ return {
156
+ "_metadata": "do",
157
+ "action": action_map.get(button_name, "Back"),
158
+ }
159
+
160
+ coordinate = mai_action.get("coordinate")
161
+ if coordinate:
162
+ x = self._convert_coordinate(coordinate[0])
163
+ y = self._convert_coordinate(coordinate[1])
164
+
165
+ if action_type == "click":
166
+ return {
167
+ "_metadata": "do",
168
+ "action": "Tap",
169
+ "element": [x, y],
170
+ }
171
+ elif action_type == "long_press":
172
+ return {
173
+ "_metadata": "do",
174
+ "action": "Long Press",
175
+ "element": [x, y],
176
+ }
177
+ elif action_type == "double_click":
178
+ return {
179
+ "_metadata": "do",
180
+ "action": "Double Tap",
181
+ "element": [x, y],
182
+ }
183
+
184
+ if action_type == "swipe":
185
+ direction = mai_action.get("direction", "up")
186
+ coordinate = mai_action.get("coordinate") or [0.5, 0.5]
187
+ x = self._convert_coordinate(coordinate[0])
188
+ y = self._convert_coordinate(coordinate[1])
189
+
190
+ start, end = self._calculate_swipe_coordinates(direction, x, y)
191
+ return {
192
+ "_metadata": "do",
193
+ "action": "Swipe",
194
+ "start": start,
195
+ "end": end,
196
+ }
197
+
198
+ if action_type == "drag":
199
+ start_coord = mai_action.get("start_coordinate", [0, 0])
200
+ end_coord = mai_action.get("end_coordinate", [0, 0])
201
+
202
+ start = [
203
+ self._convert_coordinate_from_scale_factor(start_coord[0]),
204
+ self._convert_coordinate_from_scale_factor(start_coord[1]),
205
+ ]
206
+ end = [
207
+ self._convert_coordinate_from_scale_factor(end_coord[0]),
208
+ self._convert_coordinate_from_scale_factor(end_coord[1]),
209
+ ]
210
+ return {
211
+ "_metadata": "do",
212
+ "action": "Swipe",
213
+ "start": start,
214
+ "end": end,
215
+ }
216
+
217
+ if action_type == "type":
218
+ return {
219
+ "_metadata": "do",
220
+ "action": "Type",
221
+ "text": mai_action.get("text", ""),
222
+ }
223
+
224
+ if action_type == "open":
225
+ return {
226
+ "_metadata": "do",
227
+ "action": "Launch",
228
+ "app": mai_action.get("app", ""),
229
+ }
230
+
231
+ raise ValueError(f"Unknown MAI action type: {action_type}")
232
+
233
+ def _convert_coordinate(self, value: float) -> int:
234
+ """Convert MAI normalized coordinate [0, 1] to standard scale [0, 1000]."""
235
+ return int(value * 1000)
236
+
237
+ def _convert_coordinate_from_scale_factor(self, value: float) -> int:
238
+ """Convert MAI scale factor coordinate [0, 999] to standard scale [0, 1000]."""
239
+ return int((value / SCALE_FACTOR) * 1000)
240
+
241
+ def _calculate_swipe_coordinates(
242
+ self, direction: str, x: int, y: int
243
+ ) -> tuple[list[int], list[int]]:
244
+ """Calculate start and end coordinates for swipe based on direction."""
245
+ swipe_distance = 300
246
+
247
+ direction_map = {
248
+ "up": ([x, y + swipe_distance], [x, y - swipe_distance]),
249
+ "down": ([x, y - swipe_distance], [x, y + swipe_distance]),
250
+ "left": ([x + swipe_distance, y], [x - swipe_distance, y]),
251
+ "right": ([x - swipe_distance, y], [x + swipe_distance, y]),
252
+ }
253
+
254
+ return direction_map.get(direction, ([x, y], [x, y]))
@@ -0,0 +1,103 @@
1
+ """MAI Agent 系统提示模板
2
+
3
+ 基于 mai_agent/prompt.py 迁移,针对中文环境和国内应用优化。
4
+ """
5
+
6
# System prompt (written in Chinese) sent to the MAI model. It tells the model
# to answer with a <thinking>...</thinking> section followed by a
# <tool_call>...</tool_call> section holding a JSON function call, lists the
# allowed actions, and states that coordinates are integers in [0, 999].
# The literal is passed through .strip() so it has no leading/trailing blank lines.
MAI_MOBILE_SYSTEM_PROMPT = """你是一个 GUI 自动化助手。你会收到一个任务和历史操作记录(包含多张截图),你需要分析当前屏幕状态,执行下一步操作来完成任务。

## 输出格式
每次操作必须包含两部分:
1. **思考过程**:在 <thinking></thinking> 标签中详细说明你的分析和决策
2. **动作指令**:在 <tool_call></tool_call> 标签中返回 JSON 格式的函数调用

示例:
```
<thinking>
当前屏幕显示美团首页。我需要点击顶部搜索框输入"霸王茶姬"。搜索框位于屏幕上方中央,坐标大约在 [500, 100]。
下一步操作:点击搜索框。
</thinking>
<tool_call>
{"name": "mobile_use", "arguments": {"action": "click", "coordinate": [500, 100]}}
</tool_call>
```

## 动作空间(严格遵守)

### 基础操作
- **点击**:`{"action": "click", "coordinate": [x, y]}`
用于点击按钮、链接、输入框等可点击元素

- **长按**:`{"action": "long_press", "coordinate": [x, y]}`
用于触发长按菜单或特殊功能

- **输入文本**:`{"action": "type", "text": "要输入的文字"}`
必须先点击输入框聚焦,再使用此动作输入文本
注意:文本中的特殊字符需要转义(\\'、\\"、\\n)

### 滑动操作
- **滑动**:`{"action": "swipe", "direction": "up|down|left|right", "coordinate": [x, y]}`
direction 可选值:up(向上滑)、down(向下滑)、left(向左滑)、right(向右滑)
coordinate 可选:指定滑动起点坐标(用于滑动特定 UI 元素)

- **拖动**:`{"action": "drag", "start_coordinate": [x1, y1], "end_coordinate": [x2, y2]}`
用于拖拽元素到新位置

### 系统操作
- **打开应用**:`{"action": "open", "text": "应用名称"}`
推荐优先使用此方式打开应用(比手动点击更快)

- **系统按键**:`{"action": "system_button", "button": "back|home|menu|enter"}`
可选值:back(返回)、home(主页)、menu(菜单)、enter(确认)

### 任务控制
- **等待**:`{"action": "wait"}`
用于等待页面加载或动画完成(建议谨慎使用,大多数情况不需要)

- **结束任务**:`{"action": "terminate", "status": "success|fail"}`
任务完成或失败时必须调用此动作

- **回答问题**:`{"action": "answer", "text": "答案内容"}`
当用户要求你查找信息或回答问题时使用

## 坐标系统
- **范围**:x 和 y 都在 [0, 999] 之间
- **原点**:(0, 0) 是屏幕左上角
- **边界**:(999, 999) 是屏幕右下角
- **精度**:坐标是归一化的,会自动映射到实际屏幕分辨率

## 操作指南

### 思考过程建议
在 <thinking> 部分应包含:
1. **观察**:当前屏幕显示的内容和状态
2. **分析**:识别目标元素的位置和特征
3. **决策**:选择最合适的操作和参数
4. **总结**:用一句话明确说明下一步要做什么

### 常见应用操作技巧
**国内常用应用**:
- 外卖应用(美团、饿了么):优先使用顶部搜索框查找商家
- 打车应用(滴滴、高德):注意起点/终点输入框的位置区分
- 电商应用(淘宝、京东):搜索框通常在顶部,商品列表需要向下滑动浏览
- 社交应用(微信、QQ):注意顶部/底部导航栏的切换

**通用技巧**:
- 如果页面内容未完全显示,使用 swipe 滚动查看
- 输入文本前必须先 click 输入框获得焦点
- 遇到加载动画可以 wait 一次,但不要连续 wait
- 无法找到目标元素时,尝试返回上一级(system_button back)重新导航

### 常见错误避免
- ❌ 不要在未点击输入框的情况下直接 type
- ❌ 不要使用超出 [0, 999] 范围的坐标
- ❌ 不要遗漏 <thinking> 或 <tool_call> 标签
- ❌ 不要在 JSON 中使用注释或多余的字段
- ❌ 不要连续执行多个相同的无效操作

## 注意事项
- 必须严格遵循动作空间,所有动作参数必须符合上述格式
- 坐标必须是整数,范围在 [0, 999]
- 文本输入中的引号、换行等特殊字符必须转义
- 每次只返回一个动作,不要尝试批量操作
- 仔细观察截图中的 UI 元素位置,准确估算坐标
""".strip()
@@ -0,0 +1,91 @@
1
+ """轨迹记忆数据结构 - MAI Agent 内部实现
2
+
3
+ 本模块定义了 MAI Agent 的轨迹记忆系统,用于存储和管理 Agent 执行过程中的历史信息。
4
+
5
+ 设计说明:
6
+ - 从 mai_agent/unified_memory.py 迁移而来
7
+ - 适配 Python 3.10+ 类型注解
8
+ - 与 AutoGLM_GUI 架构集成
9
+ """
10
+
11
+ from dataclasses import dataclass, field
12
+ from typing import Any
13
+
14
+ from PIL import Image
15
+
16
+
17
@dataclass
class TrajStep:
    """A single step in a trajectory.

    Records the agent's full state at one step: observation, thought,
    action and result.

    Attributes:
        screenshot: Screenshot for this step (PIL Image).
        accessibility_tree: Accessibility tree data (optional, aids UI understanding).
        prediction: Raw model response text (contains <thinking> and <tool_call>).
        action: Parsed action dict (e.g. {"action": "click", "coordinate": [0.5, 0.8]}).
        conclusion: Conclusion or summary of this step.
        thought: The model's reasoning (extracted from the <thinking> tag).
        step_index: Step index (starting at 0).
        agent_type: Type of agent that produced this step (e.g. "InternalMAIAgent").
        model_name: Name of the model used (e.g. "qwen2-vl-7b").
        screenshot_bytes: Screenshot as bytes (optional, for serialization).
        structured_action: Structured action data (optional, carries extra metadata).
    """

    screenshot: Image.Image
    accessibility_tree: dict[str, Any] | None
    prediction: str
    action: dict[str, Any]
    conclusion: str
    thought: str
    step_index: int
    agent_type: str
    model_name: str
    screenshot_bytes: bytes | None = None
    structured_action: dict[str, Any] | None = None
48
+
49
+
50
@dataclass
class TrajMemory:
    """Container holding the trajectory of one complete task.

    Stores every step of a task in execution order and offers accessors for
    the recorded screenshots, thoughts and actions.

    Attributes:
        task_goal: Description of the task goal (the user's original instruction).
        task_id: Unique identifier of the task.
        steps: Steps in execution order.
    """

    task_goal: str
    task_id: str
    steps: list[TrajStep] = field(default_factory=list)

    @staticmethod
    def _tail(items: list, n: int) -> list:
        """Return the last ``n`` items when ``n`` is positive, else all of them."""
        return items[-n:] if n > 0 else items

    def add_step(self, step: TrajStep) -> None:
        """Append ``step`` to the end of the trajectory."""
        self.steps.append(step)

    def get_history_images(self, n: int = -1) -> list[bytes]:
        """Return screenshot bytes of steps that have them; last ``n`` if ``n > 0``."""
        captured = list(filter(None, (entry.screenshot_bytes for entry in self.steps)))
        return self._tail(captured, n)

    def get_history_thoughts(self, n: int = -1) -> list[str]:
        """Return the non-empty thoughts of the steps; last ``n`` if ``n > 0``."""
        reasoning = list(filter(None, (entry.thought for entry in self.steps)))
        return self._tail(reasoning, n)

    def get_history_actions(self, n: int = -1) -> list[dict[str, Any]]:
        """Return the action of every step; last ``n`` if ``n > 0``."""
        performed = [entry.action for entry in self.steps]
        return self._tail(performed, n)

    def clear(self) -> None:
        """Remove all recorded steps (the same list object is emptied in place)."""
        del self.steps[:]

    def __len__(self) -> int:
        """Return the number of recorded steps."""
        return len(self.steps)
@@ -1,23 +1,27 @@
1
1
  from __future__ import annotations
2
2
 
3
- from typing import TYPE_CHECKING, Any, Protocol
3
+ from typing import Any, Protocol
4
4
 
5
-
6
- if TYPE_CHECKING:
7
- from phone_agent.agent import AgentConfig, StepResult
8
- from phone_agent.model import ModelConfig
5
+ from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
9
6
 
10
7
 
11
8
class BaseAgent(Protocol):
    """Structural (``typing.Protocol``) interface an agent object exposes.

    Declares the configuration attributes, the control methods and the
    read-only properties listed below; it contains no implementation.
    """

    # Configuration objects imported from AutoGLM_GUI.config.
    model_config: ModelConfig
    agent_config: AgentConfig

    # presumably runs the whole task and returns a final message — confirm
    # against the concrete agent implementations.
    def run(self, task: str) -> str: ...

    # Executes one step; ``task`` is optional and defaults to None.
    def step(self, task: str | None = None) -> StepResult: ...

    def reset(self) -> None: ...

    def abort(self) -> None: ...

    @property
    def step_count(self) -> int: ...

    @property
    def context(self) -> list[dict[str, Any]]: ...

    @property
    def is_running(self) -> bool: ...
@@ -0,0 +1,193 @@
1
+ import queue
2
+ import threading
3
+ import typing
4
+ from contextlib import contextmanager
5
+ from typing import Any, Callable, Iterator, Optional
6
+
7
+ from AutoGLM_GUI.agents.events import AgentEvent, AgentEventType
8
+
9
+ if typing.TYPE_CHECKING:
10
+ from AutoGLM_GUI.agents.protocols import BaseAgent
11
+
12
+
13
class AgentStepStreamer:
    """Streaming agent executor (reusable logic extracted into one place).

    Responsibilities:
    - manage the event queue
    - coordinate the worker thread
    - turn each StepResult into events

    Iterating the instance starts a daemon worker thread on first use; the
    worker repeatedly calls ``agent.step`` and enqueues events which
    ``__next__`` hands out as ``AgentEvent`` objects.
    """

    def __init__(
        self,
        agent: "BaseAgent",
        task: str,
    ) -> None:
        """Store the agent and task; the worker is started lazily.

        Args:
            agent: Agent whose ``step`` method is driven by the worker.
            task: Task text passed to the first ``step`` call only.
        """
        self._agent = agent
        self._task = task
        # Items are (event type, payload) tuples; None marks end of stream.
        self._event_queue: queue.Queue[Optional[tuple[str, dict[str, Any]]]] = (
            queue.Queue(maxsize=100)
        )
        self._stop_event = threading.Event()
        self._worker_thread: Optional[threading.Thread] = None

    def __iter__(self) -> Iterator[AgentEvent]:
        """Return the iterator (the streamer itself)."""
        return self  # type: ignore

    def __next__(self) -> AgentEvent:
        """Return the next event from the queue.

        While the worker is alive but has produced nothing within 0.1 s, a
        placeholder STEP event with ``step == -1`` is returned so the
        consumer keeps getting control.

        Raises:
            StopIteration: When the end-of-stream marker is read, or the
                worker has exited and the queue is empty.
        """
        try:
            if self._worker_thread is None:
                self._start_worker()

            try:
                item = self._event_queue.get(timeout=0.1)
            except queue.Empty:
                if self._worker_thread and self._worker_thread.is_alive():
                    return AgentEvent(
                        type=AgentEventType.STEP.value,
                        data={
                            "step": -1,
                            "thinking": "",
                            "action": None,
                            "success": True,
                            "finished": False,
                        },
                    )
                # The worker may have enqueued its final events and exited
                # between the timeout above and the liveness check; look at
                # the queue once more so those events are not dropped.
                try:
                    item = self._event_queue.get_nowait()
                except queue.Empty:
                    raise StopIteration from None

            if item is None:
                raise StopIteration

            event_type, event_data = item
            return AgentEvent(type=event_type, data=event_data)

        except StopIteration:
            raise

        except Exception as e:
            # Any other failure stops the worker loop and is reported as an event.
            self._stop_event.set()
            return AgentEvent(type=AgentEventType.ERROR.value, data={"message": str(e)})

    def _start_worker(self) -> None:
        """Create and start the daemon worker thread."""

        def worker():
            try:
                # Nothing to do if a stop was requested before we started.
                if self._stop_event.is_set():
                    return

                # Inject a thinking callback.
                # HACK: done to keep agents unchanged ("Zero Agent Change");
                # assumes the agent has a _thinking_callback attribute.
                original_callback = getattr(self._agent, "_thinking_callback", None)

                def on_thinking(chunk: str):
                    self._event_queue.put(
                        (AgentEventType.THINKING.value, {"chunk": chunk})
                    )
                    if original_callback:
                        original_callback(chunk)

                # Monkey-patch thinking callback
                setattr(self._agent, "_thinking_callback", on_thinking)

                try:
                    # Step loop. A per-session flag (not agent.step_count)
                    # decides when to pass the task, so the first step of
                    # every new conversation receives it.
                    is_first_in_session = True
                    while not self._stop_event.is_set():
                        result = self._agent.step(
                            self._task if is_first_in_session else None
                        )
                        is_first_in_session = False

                        # Emit the step event.
                        self._event_queue.put(
                            (
                                AgentEventType.STEP.value,
                                {
                                    "step": self._agent.step_count,
                                    "thinking": result.thinking,
                                    "action": result.action,
                                    "success": result.success,
                                    "finished": result.finished,
                                },
                            )
                        )

                        # Finished: emit the done event and stop.
                        if result.finished:
                            self._event_queue.put(
                                (
                                    AgentEventType.DONE.value,
                                    {
                                        "message": result.message,
                                        "steps": self._agent.step_count,
                                        "success": result.success,
                                    },
                                )
                            )
                            break

                        # Step limit reached: emit a done event and stop.
                        if self._agent.step_count >= self._agent.agent_config.max_steps:
                            self._event_queue.put(
                                (
                                    AgentEventType.DONE.value,
                                    {
                                        "message": "Max steps reached",
                                        "steps": self._agent.step_count,
                                        "success": result.success,
                                    },
                                )
                            )
                            break
                finally:
                    # Restore the original callback.
                    setattr(self._agent, "_thinking_callback", original_callback)

            except Exception as e:
                # Emit an error event.
                self._event_queue.put((AgentEventType.ERROR.value, {"message": str(e)}))

            finally:
                # End-of-stream marker.
                self._event_queue.put(None)

        self._worker_thread = threading.Thread(target=worker, daemon=True)
        self._worker_thread.start()

    @contextmanager
    def stream_context(self) -> Iterator[Callable[[], None]]:
        """Context manager that cleans up on exit.

        Clears the stop flag on entry and yields ``self.abort``. On exit it
        sets the stop flag, waits up to 5 seconds for the worker thread and
        then discards whatever is left in the queue.
        """
        self._stop_event.clear()
        try:
            yield self.abort
        finally:
            self._stop_event.set()
            # Wait for the worker to finish.
            if self._worker_thread and self._worker_thread.is_alive():
                self._worker_thread.join(timeout=5.0)

            # Drain the queue.
            while not self._event_queue.empty():
                try:
                    self._event_queue.get_nowait()
                except queue.Empty:
                    break

    def abort(self) -> None:
        """Abort streaming: set the stop flag and call ``agent.abort`` if present."""
        self._stop_event.set()
        if hasattr(self._agent, "abort"):
            self._agent.abort()