autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135):
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
  7. {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
  8. {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
  9. {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
  10. AutoGLM_GUI/adb/screenshot.py +11 -0
  11. {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
  12. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  13. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  14. AutoGLM_GUI/adb_plus/serial.py +38 -20
  15. AutoGLM_GUI/adb_plus/touch.py +4 -9
  16. AutoGLM_GUI/agents/__init__.py +43 -12
  17. AutoGLM_GUI/agents/events.py +19 -0
  18. AutoGLM_GUI/agents/factory.py +31 -38
  19. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  20. AutoGLM_GUI/agents/glm/agent.py +297 -0
  21. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  22. AutoGLM_GUI/agents/glm/parser.py +110 -0
  23. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
  24. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
  25. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  26. AutoGLM_GUI/agents/mai/agent.py +408 -0
  27. AutoGLM_GUI/agents/mai/parser.py +254 -0
  28. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  29. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  30. AutoGLM_GUI/agents/protocols.py +12 -8
  31. AutoGLM_GUI/agents/stream_runner.py +193 -0
  32. AutoGLM_GUI/api/__init__.py +40 -21
  33. AutoGLM_GUI/api/agents.py +181 -239
  34. AutoGLM_GUI/api/control.py +9 -6
  35. AutoGLM_GUI/api/devices.py +102 -12
  36. AutoGLM_GUI/api/history.py +104 -0
  37. AutoGLM_GUI/api/layered_agent.py +67 -15
  38. AutoGLM_GUI/api/media.py +64 -1
  39. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  40. AutoGLM_GUI/config.py +81 -0
  41. AutoGLM_GUI/config_manager.py +68 -51
  42. AutoGLM_GUI/device_manager.py +248 -29
  43. AutoGLM_GUI/device_protocol.py +1 -1
  44. AutoGLM_GUI/devices/adb_device.py +5 -10
  45. AutoGLM_GUI/devices/mock_device.py +4 -2
  46. AutoGLM_GUI/devices/remote_device.py +8 -3
  47. AutoGLM_GUI/history_manager.py +164 -0
  48. AutoGLM_GUI/model/__init__.py +5 -0
  49. AutoGLM_GUI/model/message_builder.py +69 -0
  50. AutoGLM_GUI/model/types.py +24 -0
  51. AutoGLM_GUI/models/__init__.py +10 -0
  52. AutoGLM_GUI/models/history.py +140 -0
  53. AutoGLM_GUI/models/scheduled_task.py +71 -0
  54. AutoGLM_GUI/parsers/__init__.py +22 -0
  55. AutoGLM_GUI/parsers/base.py +50 -0
  56. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  57. AutoGLM_GUI/phone_agent_manager.py +62 -396
  58. AutoGLM_GUI/platform_utils.py +26 -0
  59. AutoGLM_GUI/prompt_config.py +15 -0
  60. AutoGLM_GUI/prompts/__init__.py +32 -0
  61. AutoGLM_GUI/scheduler_manager.py +350 -0
  62. AutoGLM_GUI/schemas.py +246 -72
  63. AutoGLM_GUI/scrcpy_stream.py +142 -24
  64. AutoGLM_GUI/socketio_server.py +100 -27
  65. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
  66. AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
  67. AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
  68. AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
  69. AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
  70. AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
  71. AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
  72. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
  73. AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
  74. AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
  75. AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
  76. AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
  77. AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
  78. AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
  79. AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
  80. AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
  81. AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
  82. AutoGLM_GUI/static/index.html +2 -2
  83. AutoGLM_GUI/types.py +17 -0
  84. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
  85. autoglm_gui-1.5.1.dist-info/RECORD +118 -0
  86. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  87. AutoGLM_GUI/api/dual_model.py +0 -317
  88. AutoGLM_GUI/device_adapter.py +0 -263
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. mai_agent/base.py +0 -137
  103. mai_agent/mai_grounding_agent.py +0 -263
  104. mai_agent/mai_naivigation_agent.py +0 -526
  105. mai_agent/prompt.py +0 -148
  106. mai_agent/unified_memory.py +0 -67
  107. mai_agent/utils.py +0 -73
  108. phone_agent/__init__.py +0 -12
  109. phone_agent/actions/__init__.py +0 -5
  110. phone_agent/actions/handler.py +0 -400
  111. phone_agent/adb/screenshot.py +0 -108
  112. phone_agent/agent.py +0 -253
  113. phone_agent/agent_ios.py +0 -277
  114. phone_agent/config/__init__.py +0 -53
  115. phone_agent/config/apps_harmonyos.py +0 -256
  116. phone_agent/config/apps_ios.py +0 -339
  117. phone_agent/config/prompts.py +0 -80
  118. phone_agent/device_factory.py +0 -166
  119. phone_agent/hdc/__init__.py +0 -53
  120. phone_agent/hdc/connection.py +0 -384
  121. phone_agent/hdc/device.py +0 -269
  122. phone_agent/hdc/input.py +0 -145
  123. phone_agent/hdc/screenshot.py +0 -127
  124. phone_agent/model/__init__.py +0 -5
  125. phone_agent/model/client.py +0 -290
  126. phone_agent/xctest/__init__.py +0 -47
  127. phone_agent/xctest/connection.py +0 -379
  128. phone_agent/xctest/device.py +0 -472
  129. phone_agent/xctest/input.py +0 -311
  130. phone_agent/xctest/screenshot.py +0 -226
  131. {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
  132. {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
  133. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
  134. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
  135. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -6,20 +6,14 @@ making it easy to add new agent types without modifying existing code.
6
6
 
7
7
  from __future__ import annotations
8
8
 
9
- from typing import TYPE_CHECKING, Callable, Dict
9
+ from typing import Callable, Dict
10
10
 
11
+ from AutoGLM_GUI.config import AgentConfig, ModelConfig
11
12
  from AutoGLM_GUI.logger import logger
12
13
  from AutoGLM_GUI.types import AgentSpecificConfig
13
14
 
14
15
  from .protocols import BaseAgent
15
16
 
16
- if TYPE_CHECKING:
17
- from phone_agent import PhoneAgent
18
- from phone_agent.agent import AgentConfig
19
- from phone_agent.model import ModelConfig
20
-
21
- from .mai_adapter import MAIAgentAdapter
22
-
23
17
 
24
18
  # Agent registry: agent_type -> (creator_function, config_schema)
25
19
  AGENT_REGISTRY: Dict[str, Callable] = {}
@@ -52,12 +46,13 @@ def register_agent(
52
46
 
53
47
  def create_agent(
54
48
  agent_type: str,
55
- model_config: "ModelConfig",
56
- agent_config: "AgentConfig",
49
+ model_config: ModelConfig,
50
+ agent_config: AgentConfig,
57
51
  agent_specific_config: AgentSpecificConfig,
52
+ device,
58
53
  takeover_callback: Callable | None = None,
59
54
  confirmation_callback: Callable | None = None,
60
- ) -> "BaseAgent":
55
+ ) -> BaseAgent:
61
56
  """
62
57
  Create an agent instance using the factory pattern.
63
58
 
@@ -66,6 +61,7 @@ def create_agent(
66
61
  model_config: Model configuration
67
62
  agent_config: Agent configuration
68
63
  agent_specific_config: Agent-specific configuration (e.g., MAIConfig fields)
64
+ device: DeviceProtocol instance (provided by PhoneAgentManager)
69
65
  takeover_callback: Takeover callback
70
66
  confirmation_callback: Confirmation callback
71
67
 
@@ -88,6 +84,7 @@ def create_agent(
88
84
  model_config=model_config,
89
85
  agent_config=agent_config,
90
86
  agent_specific_config=agent_specific_config,
87
+ device=device,
91
88
  takeover_callback=takeover_callback,
92
89
  confirmation_callback=confirmation_callback,
93
90
  )
@@ -111,50 +108,46 @@ def is_agent_type_registered(agent_type: str) -> bool:
111
108
  # ==================== Built-in Agent Creators ====================
112
109
 
113
110
 
114
- def _create_phone_agent(
115
- model_config: "ModelConfig",
116
- agent_config: "AgentConfig",
111
+ def _create_glm_agent_v2(
112
+ model_config: ModelConfig,
113
+ agent_config: AgentConfig,
117
114
  agent_specific_config: AgentSpecificConfig,
115
+ device,
118
116
  takeover_callback: Callable | None = None,
119
117
  confirmation_callback: Callable | None = None,
120
- ) -> "PhoneAgent":
121
- from phone_agent import PhoneAgent
118
+ ) -> BaseAgent:
119
+ from .glm.agent import GLMAgent
122
120
 
123
- return PhoneAgent(
121
+ return GLMAgent(
124
122
  model_config=model_config,
125
123
  agent_config=agent_config,
126
- takeover_callback=takeover_callback,
124
+ device=device,
127
125
  confirmation_callback=confirmation_callback,
126
+ takeover_callback=takeover_callback,
128
127
  )
129
128
 
130
129
 
131
- def _create_mai_agent(
132
- model_config: "ModelConfig",
133
- agent_config: "AgentConfig",
130
+ def _create_internal_mai_agent(
131
+ model_config: ModelConfig,
132
+ agent_config: AgentConfig,
134
133
  agent_specific_config: AgentSpecificConfig,
134
+ device,
135
135
  takeover_callback: Callable | None = None,
136
136
  confirmation_callback: Callable | None = None,
137
- ) -> "MAIAgentAdapter":
138
- from .mai_adapter import MAIAgentAdapter, MAIAgentConfig
139
-
140
- # Build MAI config from dict
141
- mai_config = MAIAgentConfig(
142
- history_n=agent_specific_config.get("history_n", 3),
143
- max_pixels=agent_specific_config.get("max_pixels"),
144
- min_pixels=agent_specific_config.get("min_pixels"),
145
- tools=agent_specific_config.get("tools"),
146
- use_mai_prompt=agent_specific_config.get("use_mai_prompt", False),
147
- )
137
+ ) -> BaseAgent:
138
+ from .mai.agent import InternalMAIAgent
139
+
140
+ history_n = agent_specific_config.get("history_n", 3)
148
141
 
149
- return MAIAgentAdapter(
142
+ return InternalMAIAgent(
150
143
  model_config=model_config,
151
144
  agent_config=agent_config,
152
- mai_config=mai_config,
153
- takeover_callback=takeover_callback,
145
+ device=device,
146
+ history_n=history_n,
154
147
  confirmation_callback=confirmation_callback,
148
+ takeover_callback=takeover_callback,
155
149
  )
156
150
 
157
151
 
158
- # Register built-in agents
159
- register_agent("glm", _create_phone_agent)
160
- register_agent("mai", _create_mai_agent)
152
+ register_agent("glm", _create_glm_agent_v2)
153
+ register_agent("mai", _create_internal_mai_agent)
@@ -0,0 +1,7 @@
1
+ from .prompts_en import SYSTEM_PROMPT as SYSTEM_PROMPT_EN
2
+ from .prompts_zh import SYSTEM_PROMPT as SYSTEM_PROMPT_ZH
3
+
4
+ __all__ = [
5
+ "SYSTEM_PROMPT_EN",
6
+ "SYSTEM_PROMPT_ZH",
7
+ ]
@@ -0,0 +1,297 @@
1
+ import json
2
+ import traceback
3
+ from typing import Any, Callable, cast
4
+
5
+ from openai import OpenAI
6
+
7
+ from AutoGLM_GUI.actions import ActionHandler, ActionResult
8
+ from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
9
+ from AutoGLM_GUI.device_protocol import DeviceProtocol
10
+ from AutoGLM_GUI.logger import logger
11
+ from AutoGLM_GUI.prompt_config import get_messages, get_system_prompt
12
+
13
+ from .message_builder import MessageBuilder
14
+ from .parser import GLMParser
15
+
16
+
17
class GLMAgent:
    """Step-wise phone agent driven by an OpenAI-compatible chat model.

    Each step captures a screenshot and the current app from ``device``,
    sends them to the model together with the conversation so far, splits the
    streamed reply into reasoning text and an action call, parses the action
    with ``GLMParser`` and executes it through ``ActionHandler``.
    """

    # Substrings that mark where the model's free-form reasoning ends and the
    # action call begins.
    _ACTION_MARKERS = ("finish(message=", "do(action=")

    def __init__(
        self,
        model_config: ModelConfig,
        agent_config: AgentConfig,
        device: DeviceProtocol,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
        thinking_callback: Callable[[str], None] | None = None,
    ):
        """Create the agent.

        Args:
            model_config: Endpoint, credentials and sampling parameters used
                for every chat-completion request.
            agent_config: Agent settings read by this class (``max_steps``,
                ``system_prompt``, ``lang``, ``verbose``).
            device: Device used for screenshots, current-app lookup and
                action execution.
            confirmation_callback: Forwarded to ``ActionHandler``.
            takeover_callback: Forwarded to ``ActionHandler``.
            thinking_callback: Receives reasoning text chunks while the model
                response is streaming. When omitted and ``verbose`` is set,
                chunks are printed to stdout instead.
        """
        self.model_config = model_config
        self.agent_config = agent_config

        self.openai_client = OpenAI(
            base_url=model_config.base_url,
            api_key=model_config.api_key,
            timeout=120,
        )
        self.parser = GLMParser()

        self.device = device
        self.action_handler = ActionHandler(
            device=self.device,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        self._context: list[dict[str, Any]] = []
        self._step_count = 0
        self._is_running = False
        self._thinking_callback = thinking_callback

    def run(self, task: str) -> str:
        """Run ``task`` from a fresh context until it finishes or is stopped.

        Returns:
            The final step's message (or "Task completed" when it has none),
            or "Max steps reached" when the loop ends without a finished
            step, which includes the case where ``abort()`` cleared the
            running flag.
        """
        self._context = []
        self._step_count = 0
        self._is_running = True

        try:
            result = self._execute_step(task, is_first=True)

            if result.finished:
                return result.message or "Task completed"

            while self._step_count < self.agent_config.max_steps and self._is_running:
                result = self._execute_step(is_first=False)

                if result.finished:
                    return result.message or "Task completed"

            return "Max steps reached"
        finally:
            # Always clear the flag, even when a step raises.
            self._is_running = False

    def step(self, task: str | None = None) -> StepResult:
        """Execute a single step.

        The step is treated as the first one when the context is empty.

        Raises:
            ValueError: If this is the first step and no task was given.
        """
        is_first = not self._context

        if is_first and not task:
            raise ValueError("Task is required for the first step")

        return self._execute_step(task, is_first)

    def reset(self) -> None:
        """Drop the conversation context and reset the counters."""
        self._context = []
        self._step_count = 0
        self._is_running = False

    def abort(self) -> None:
        """Ask the ``run`` loop to stop before starting another step."""
        self._is_running = False
        logger.info("Agent aborted by user")

    @classmethod
    def _find_action_start(cls, text: str) -> int | None:
        """Return the index of the earliest action marker in ``text``, if any."""
        hits = [pos for pos in (text.find(m) for m in cls._ACTION_MARKERS) if pos >= 0]
        return min(hits) if hits else None

    @classmethod
    def _ends_with_marker_prefix(cls, text: str) -> bool:
        """Tell whether ``text`` ends with a proper prefix of an action marker."""
        return any(
            text.endswith(marker[:size])
            for marker in cls._ACTION_MARKERS
            for size in range(1, len(marker))
        )

    def _stream_request(
        self,
        messages: list[dict[str, Any]],
        on_thinking_chunk: Callable[[str], None] | None = None,
    ) -> tuple[str, str, str]:
        """Send ``messages`` to the model and consume the streamed reply.

        Reasoning text is forwarded to ``on_thinking_chunk`` as it arrives.
        Text that might be the beginning of an action marker is held back
        until it is clear whether it is one; everything from the marker
        onwards is not forwarded.

        Returns:
            ``(thinking, action, raw_content)``.
        """
        stream = self.openai_client.chat.completions.create(
            messages=cast(Any, messages),
            model=self.model_config.model_name,
            max_tokens=self.model_config.max_tokens,
            temperature=self.model_config.temperature,
            top_p=self.model_config.top_p,
            frequency_penalty=self.model_config.frequency_penalty,
            extra_body=self.model_config.extra_body,
            stream=True,
        )

        raw_parts: list[str] = []
        buffer = ""
        in_action_phase = False

        for chunk in stream:
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is None:
                continue
            raw_parts.append(content)

            if in_action_phase:
                continue

            buffer += content

            cut = self._find_action_start(buffer)
            if cut is not None:
                # Forward only the reasoning that precedes the action call.
                if on_thinking_chunk:
                    on_thinking_chunk(buffer[:cut])
                buffer = ""
                in_action_phase = True
                continue

            if self._ends_with_marker_prefix(buffer):
                # Could be the start of a marker; wait for more text.
                continue

            if on_thinking_chunk:
                on_thinking_chunk(buffer)
            buffer = ""

        # The stream may end while text is still held back as a possible
        # marker prefix; forward it so the tail of the reasoning is not lost.
        if buffer and not in_action_phase and on_thinking_chunk:
            on_thinking_chunk(buffer)

        raw_content = "".join(raw_parts)
        thinking, action = self._parse_raw_response(raw_content)
        return thinking, action, raw_content

    def _parse_raw_response(self, content: str) -> tuple[str, str]:
        """Split a full model reply into ``(thinking, action)``.

        The reply is cut at the earliest action marker. ``<think>`` tags, a
        trailing ``<answer>`` on the thinking side and a trailing
        ``</answer>`` on the action side are removed, so the action can be
        parsed and re-wrapped in tags without duplication. Without a marker
        the ``<answer>`` tag is used as the separator; without either, the
        whole reply is returned as the action.
        """
        cut = self._find_action_start(content)
        if cut is not None:
            thinking = content[:cut].strip().removesuffix("<answer>")
            thinking = thinking.replace("<think>", "").replace("</think>", "").strip()
            action = content[cut:].strip().removesuffix("</answer>").rstrip()
            return thinking, action

        if "<answer>" in content:
            head, tail = content.split("<answer>", 1)
            thinking = head.replace("<think>", "").replace("</think>", "").strip()
            action = tail.replace("</answer>", "").strip()
            return thinking, action

        return "", content

    def _execute_step(
        self, user_prompt: str | None = None, is_first: bool = False
    ) -> StepResult:
        """Capture the screen, query the model and execute the parsed action.

        Args:
            user_prompt: Task text for the first step, or an optional new
                user message for later steps (multi-turn conversation).
            is_first: Whether the system prompt still has to be added.

        Returns:
            A ``StepResult``. A model request failure yields a finished,
            unsuccessful result whose message starts with "Model error:".
        """
        self._step_count += 1
        verbose = self.agent_config.verbose

        screenshot = self.device.get_screenshot()
        current_app = self.device.get_current_app()
        screen_info = MessageBuilder.build_screen_info(current_app)

        if is_first:
            system_prompt = self.agent_config.system_prompt
            if system_prompt is None:
                system_prompt = get_system_prompt(self.agent_config.lang)

            self._context.append(MessageBuilder.create_system_message(system_prompt))
            text_content = f"{user_prompt}\n\n{screen_info}"
        elif user_prompt:
            # A new user message arrived (multi-turn conversation): include it.
            text_content = f"{user_prompt}\n\n** Screen Info **\n\n{screen_info}"
        else:
            # Continuing the current task: only the screen info is needed.
            text_content = f"** Screen Info **\n\n{screen_info}"

        self._context.append(
            MessageBuilder.create_user_message(
                text=text_content, image_base64=screenshot.base64_data
            )
        )

        try:
            msgs = get_messages(self.agent_config.lang)
            if verbose:
                print("\n" + "=" * 50)
                print(f"💭 {msgs['thinking']}:")
                print("-" * 50)

            callback = self._thinking_callback
            if callback is None and verbose:

                def print_chunk(chunk: str) -> None:
                    print(chunk, end="", flush=True)

                callback = print_chunk

            thinking, action_str, raw_content = self._stream_request(
                self._context, on_thinking_chunk=callback
            )
        except Exception as e:
            if verbose:
                traceback.print_exc()
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking="",
                message=f"Model error: {e}",
            )

        try:
            action = self.parser.parse(action_str)
        except ValueError as e:
            if verbose:
                logger.warning(f"Failed to parse action: {e}, treating as finish")
            # An unparseable reply ends the task with the raw text as message.
            action = {"_metadata": "finish", "message": action_str}

        if verbose:
            print()
            print("-" * 50)
            print(f"🎯 {msgs['action']}:")
            # default=str: this is display-only output and must not raise on
            # parsed values that JSON cannot represent (e.g. a set literal).
            print(json.dumps(action, ensure_ascii=False, indent=2, default=str))
            print("=" * 50 + "\n")

        # The screenshot has been sent to the model; drop it from the stored
        # message so the context does not keep growing with image data.
        self._context[-1] = MessageBuilder.remove_images_from_message(self._context[-1])

        try:
            result = self.action_handler.execute(
                action, screenshot.width, screenshot.height
            )
        except Exception as e:
            if verbose:
                traceback.print_exc()
            result = ActionResult(success=False, should_finish=True, message=str(e))

        self._context.append(
            MessageBuilder.create_assistant_message(
                f"<think>{thinking}</think><answer>{action_str}</answer>"
            )
        )

        finished = action.get("_metadata") == "finish" or result.should_finish

        if finished and verbose:
            print("\n" + "🎉 " + "=" * 48)
            print(
                f"✅ {msgs['task_completed']}: {result.message or action.get('message', msgs['done'])}"
            )
            print("=" * 50 + "\n")

        return StepResult(
            success=result.success,
            finished=finished,
            action=action,
            thinking=thinking,
            message=result.message or action.get("message"),
        )

    @property
    def context(self) -> list[dict[str, Any]]:
        """Shallow copy of the conversation messages."""
        return self._context.copy()

    @property
    def step_count(self) -> int:
        """Number of steps started since the last reset or run."""
        return self._step_count

    @property
    def is_running(self) -> bool:
        """Whether ``run`` is currently in progress."""
        return self._is_running
@@ -0,0 +1,81 @@
1
+ """Message builder for GLM agent - copied from phone_agent.model.client.
2
+
3
+ This is an exact copy of the upstream MessageBuilder to ensure consistent behavior.
4
+ """
5
+
6
+ import json
7
+ from typing import Any
8
+
9
+
10
class MessageBuilder:
    """Static helpers that assemble chat messages as plain dictionaries."""

    @staticmethod
    def create_system_message(content: str) -> dict[str, Any]:
        """Wrap ``content`` in a message with the ``system`` role."""
        return dict(role="system", content=content)

    @staticmethod
    def create_user_message(
        text: str, image_base64: str | None = None
    ) -> dict[str, Any]:
        """
        Build a ``user`` message whose content is a list of parts.

        Args:
            text: Text part of the message.
            image_base64: Base64 image payload; when given (and non-empty) it
                is placed before the text as a PNG data URL.

        Returns:
            The message dictionary.
        """
        text_part = {"type": "text", "text": text}
        if not image_base64:
            return {"role": "user", "content": [text_part]}

        image_part = {
            "type": "image_url",
            "image_url": {"url": f"data:image/png;base64,{image_base64}"},
        }
        return {"role": "user", "content": [image_part, text_part]}

    @staticmethod
    def create_assistant_message(content: str) -> dict[str, Any]:
        """Wrap ``content`` in a message with the ``assistant`` role."""
        return dict(role="assistant", content=content)

    @staticmethod
    def remove_images_from_message(message: dict[str, Any]) -> dict[str, Any]:
        """
        Keep only the text parts of a list-style message content.

        The message is modified in place and also returned. Messages whose
        content is not a list are returned untouched.

        Args:
            message: Message dictionary.

        Returns:
            The same message object, without non-text parts.
        """
        parts = message.get("content")
        if not isinstance(parts, list):
            return message

        text_only = []
        for part in parts:
            if part.get("type") == "text":
                text_only.append(part)
        message["content"] = text_only
        return message

    @staticmethod
    def build_screen_info(current_app: str, **extra_info) -> str:
        """
        Serialize the screen description sent to the model.

        Args:
            current_app: Current app name.
            **extra_info: Extra key/value pairs appended after ``current_app``.

        Returns:
            JSON string (non-ASCII characters are kept as-is).
        """
        payload = {"current_app": current_app}
        payload.update(extra_info)
        return json.dumps(payload, ensure_ascii=False)
@@ -0,0 +1,110 @@
1
+ import ast
2
+ from typing import Any
3
+
4
+
5
class GLMParser:
    """Parse GLM action calls such as ``do(action="Tap", element=[x, y])``.

    The result is a dictionary whose ``_metadata`` entry is ``"do"`` or
    ``"finish"`` and whose remaining entries are the keyword arguments of the
    call, converted to Python values where they are valid literals.
    """

    @property
    def coordinate_scale(self) -> int:
        """Coordinate scale reported by this parser (always 1000)."""
        return 1000

    def parse(self, raw_response: str) -> dict[str, Any]:
        """Parse one action string.

        Args:
            raw_response: Text starting (after whitespace) with ``do(`` or
                ``finish(``. Anything after the call's closing parenthesis,
                such as a stray ``</answer>`` tag, is ignored.

        Raises:
            ValueError: If the text is neither a ``do`` nor a ``finish`` call
                or its arguments cannot be extracted.
        """
        action_str = raw_response.strip()

        if action_str.startswith("finish("):
            return self._parse_finish(action_str)
        if action_str.startswith("do("):
            return self._parse_do(action_str)
        raise ValueError(f"Unknown action format: {action_str}")

    def _parse_finish(self, action_str: str) -> dict[str, Any]:
        """Parse ``finish(message=...)``; the message defaults to "Task completed"."""
        try:
            params = self._extract_params(action_str, "finish")
            return {
                "_metadata": "finish",
                "message": params.get("message", "Task completed"),
            }
        except Exception as e:
            raise ValueError(f"Failed to parse finish action: {e}") from e

    def _parse_do(self, action_str: str) -> dict[str, Any]:
        """Parse ``do(action=..., ...)``; a missing action becomes an empty string."""
        try:
            params = self._extract_params(action_str, "do")
            result: dict[str, Any] = {
                "_metadata": "do",
                "action": params.get("action", ""),
            }
            result.update(
                (key, value) for key, value in params.items() if key != "action"
            )
            return result
        except Exception as e:
            raise ValueError(f"Failed to parse do action: {e}") from e

    def _store_param(
        self, params: dict[str, Any], key: str | None, token: list[str]
    ) -> None:
        """Record the collected value under ``key``; values without a key are dropped."""
        if key:
            params[key] = self._parse_value("".join(token).strip())

    def _extract_params(self, action_str: str, function_name: str) -> dict[str, Any]:
        """Extract the keyword arguments of ``function_name(...)``.

        The argument list is scanned character by character. Commas and ``=``
        only act as separators outside quotes and outside ``()``, ``[]`` and
        ``{}`` nesting. Scanning stops at the parenthesis that closes the
        call, so text after it does not leak into the last value, and a
        missing closing parenthesis does not cost the last value a character.

        Raises:
            ValueError: If ``action_str`` does not start with the call prefix.
        """
        prefix = f"{function_name}("
        if not action_str.startswith(prefix):
            raise ValueError(f"Action does not start with {prefix}")

        params: dict[str, Any] = {}
        key: str | None = None
        token: list[str] = []
        quote: str | None = None
        escaped = False
        depth = 0

        for char in action_str[len(prefix):]:
            if quote is not None:
                token.append(char)
                if escaped:
                    # Character after a backslash is taken literally, so an
                    # escaped backslash does not escape the closing quote.
                    escaped = False
                elif char == "\\":
                    escaped = True
                elif char == quote:
                    quote = None
                continue

            if char in ('"', "'"):
                quote = char
            elif char in ("(", "[", "{"):
                depth += 1
            elif char in (")", "]", "}"):
                if char == ")" and depth == 0:
                    # Closing parenthesis of the call itself.
                    break
                depth = max(depth - 1, 0)
            elif depth == 0 and char == "=" and key is None:
                # Only the first "=" of an argument separates key and value.
                key = "".join(token).strip()
                token.clear()
                continue
            elif depth == 0 and char == ",":
                self._store_param(params, key, token)
                key = None
                token.clear()
                continue

            token.append(char)
        else:
            # No closing parenthesis was seen. If a quote was left open, the
            # final ")" was swallowed by it; drop it so the raw value matches
            # what is inside the call.
            if quote is not None and token and token[-1] == ")":
                token.pop()

        self._store_param(params, key, token)
        return params

    def _parse_value(self, value_str: str) -> Any:
        """Convert a value to a Python literal, falling back to the raw text."""
        value_str = value_str.strip()

        if not value_str:
            return ""

        try:
            return ast.literal_eval(value_str)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid literal (e.g. placeholder names or unquoted text).
            return value_str
@@ -1,5 +1,3 @@
1
- """System prompts for the AI agent."""
2
-
3
1
  from datetime import datetime
4
2
 
5
3
  today = datetime.today()
@@ -30,44 +28,44 @@ Your output should STRICTLY follow the format:
30
28
  Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.
31
29
  **Example**:
32
30
  <answer>
33
- do(action="Tap", element=[x,y])
31
+ do(action=\"Tap\", element=[x,y])
34
32
  </answer>
35
33
  - **Type**
36
34
  Enter text into the currently focused input field.
37
35
  **Example**:
38
36
  <answer>
39
- do(action="Type", text="Hello World")
37
+ do(action=\"Type\", text=\"Hello World\")
40
38
  </answer>
41
39
  - **Swipe**
42
40
  Perform a swipe action with start point and end point.
43
41
  **Examples**:
44
42
  <answer>
45
- do(action="Swipe", start=[x1,y1], end=[x2,y2])
43
+ do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])
46
44
  </answer>
47
45
  - **Long Press**
48
46
  Perform a long press action on a specified screen area.
49
47
  You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.
50
48
  **Example**:
51
49
  <answer>
52
- do(action="Long Press", element=[x,y])
50
+ do(action=\"Long Press\", element=[x,y])
53
51
  </answer>
54
52
  - **Launch**
55
53
  Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.
56
54
  **Example**:
57
55
  <answer>
58
- do(action="Launch", app="Settings")
56
+ do(action=\"Launch\", app=\"Settings\")
59
57
  </answer>
60
58
  - **Back**
61
59
  Press the Back button to navigate to the previous screen.
62
60
  **Example**:
63
61
  <answer>
64
- do(action="Back")
62
+ do(action=\"Back\")
65
63
  </answer>
66
64
  - **Finish**
67
65
  Terminate the program and optionally print a message.
68
66
  **Example**:
69
67
  <answer>
70
- finish(message="Task completed.")
68
+ finish(message=\"Task completed.\")
71
69
  </answer>
72
70
 
73
71