autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- AutoGLM_GUI/__init__.py +11 -0
- AutoGLM_GUI/__main__.py +26 -4
- AutoGLM_GUI/actions/__init__.py +6 -0
- phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
- AutoGLM_GUI/actions/types.py +15 -0
- {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
- {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
- {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
- {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
- AutoGLM_GUI/adb/screenshot.py +11 -0
- {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
- AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
- AutoGLM_GUI/adb_plus/screenshot.py +22 -1
- AutoGLM_GUI/adb_plus/serial.py +38 -20
- AutoGLM_GUI/adb_plus/touch.py +4 -9
- AutoGLM_GUI/agents/__init__.py +43 -12
- AutoGLM_GUI/agents/events.py +19 -0
- AutoGLM_GUI/agents/factory.py +31 -38
- AutoGLM_GUI/agents/glm/__init__.py +7 -0
- AutoGLM_GUI/agents/glm/agent.py +297 -0
- AutoGLM_GUI/agents/glm/message_builder.py +81 -0
- AutoGLM_GUI/agents/glm/parser.py +110 -0
- {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
- {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
- AutoGLM_GUI/agents/mai/__init__.py +28 -0
- AutoGLM_GUI/agents/mai/agent.py +408 -0
- AutoGLM_GUI/agents/mai/parser.py +254 -0
- AutoGLM_GUI/agents/mai/prompts.py +103 -0
- AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
- AutoGLM_GUI/agents/protocols.py +12 -8
- AutoGLM_GUI/agents/stream_runner.py +193 -0
- AutoGLM_GUI/api/__init__.py +40 -21
- AutoGLM_GUI/api/agents.py +181 -239
- AutoGLM_GUI/api/control.py +9 -6
- AutoGLM_GUI/api/devices.py +102 -12
- AutoGLM_GUI/api/history.py +104 -0
- AutoGLM_GUI/api/layered_agent.py +67 -15
- AutoGLM_GUI/api/media.py +64 -1
- AutoGLM_GUI/api/scheduled_tasks.py +98 -0
- AutoGLM_GUI/config.py +81 -0
- AutoGLM_GUI/config_manager.py +68 -51
- AutoGLM_GUI/device_manager.py +248 -29
- AutoGLM_GUI/device_protocol.py +1 -1
- AutoGLM_GUI/devices/adb_device.py +5 -10
- AutoGLM_GUI/devices/mock_device.py +4 -2
- AutoGLM_GUI/devices/remote_device.py +8 -3
- AutoGLM_GUI/history_manager.py +164 -0
- AutoGLM_GUI/model/__init__.py +5 -0
- AutoGLM_GUI/model/message_builder.py +69 -0
- AutoGLM_GUI/model/types.py +24 -0
- AutoGLM_GUI/models/__init__.py +10 -0
- AutoGLM_GUI/models/history.py +140 -0
- AutoGLM_GUI/models/scheduled_task.py +71 -0
- AutoGLM_GUI/parsers/__init__.py +22 -0
- AutoGLM_GUI/parsers/base.py +50 -0
- AutoGLM_GUI/parsers/phone_parser.py +58 -0
- AutoGLM_GUI/phone_agent_manager.py +62 -396
- AutoGLM_GUI/platform_utils.py +26 -0
- AutoGLM_GUI/prompt_config.py +15 -0
- AutoGLM_GUI/prompts/__init__.py +32 -0
- AutoGLM_GUI/scheduler_manager.py +350 -0
- AutoGLM_GUI/schemas.py +246 -72
- AutoGLM_GUI/scrcpy_stream.py +142 -24
- AutoGLM_GUI/socketio_server.py +100 -27
- AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
- AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
- AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
- AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
- AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
- AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
- AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
- AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
- AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
- AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
- AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
- AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
- AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
- AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
- AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
- AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
- AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
- AutoGLM_GUI/static/index.html +2 -2
- AutoGLM_GUI/types.py +17 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
- autoglm_gui-1.5.1.dist-info/RECORD +118 -0
- AutoGLM_GUI/agents/mai_adapter.py +0 -627
- AutoGLM_GUI/api/dual_model.py +0 -317
- AutoGLM_GUI/device_adapter.py +0 -263
- AutoGLM_GUI/dual_model/__init__.py +0 -53
- AutoGLM_GUI/dual_model/decision_model.py +0 -664
- AutoGLM_GUI/dual_model/dual_agent.py +0 -917
- AutoGLM_GUI/dual_model/protocols.py +0 -354
- AutoGLM_GUI/dual_model/vision_model.py +0 -442
- AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
- AutoGLM_GUI/phone_agent_patches.py +0 -147
- AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
- AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
- AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
- AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
- AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
- autoglm_gui-1.4.1.dist-info/RECORD +0 -117
- mai_agent/base.py +0 -137
- mai_agent/mai_grounding_agent.py +0 -263
- mai_agent/mai_naivigation_agent.py +0 -526
- mai_agent/prompt.py +0 -148
- mai_agent/unified_memory.py +0 -67
- mai_agent/utils.py +0 -73
- phone_agent/__init__.py +0 -12
- phone_agent/actions/__init__.py +0 -5
- phone_agent/actions/handler.py +0 -400
- phone_agent/adb/screenshot.py +0 -108
- phone_agent/agent.py +0 -253
- phone_agent/agent_ios.py +0 -277
- phone_agent/config/__init__.py +0 -53
- phone_agent/config/apps_harmonyos.py +0 -256
- phone_agent/config/apps_ios.py +0 -339
- phone_agent/config/prompts.py +0 -80
- phone_agent/device_factory.py +0 -166
- phone_agent/hdc/__init__.py +0 -53
- phone_agent/hdc/connection.py +0 -384
- phone_agent/hdc/device.py +0 -269
- phone_agent/hdc/input.py +0 -145
- phone_agent/hdc/screenshot.py +0 -127
- phone_agent/model/__init__.py +0 -5
- phone_agent/model/client.py +0 -290
- phone_agent/xctest/__init__.py +0 -47
- phone_agent/xctest/connection.py +0 -379
- phone_agent/xctest/device.py +0 -472
- phone_agent/xctest/input.py +0 -311
- phone_agent/xctest/screenshot.py +0 -226
- {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
- {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
- {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
AutoGLM_GUI/agents/factory.py
CHANGED
@@ -6,20 +6,14 @@ making it easy to add new agent types without modifying existing code.
 
 from __future__ import annotations
 
-from typing import
+from typing import Callable, Dict
 
+from AutoGLM_GUI.config import AgentConfig, ModelConfig
 from AutoGLM_GUI.logger import logger
 from AutoGLM_GUI.types import AgentSpecificConfig
 
 from .protocols import BaseAgent
 
-if TYPE_CHECKING:
-    from phone_agent import PhoneAgent
-    from phone_agent.agent import AgentConfig
-    from phone_agent.model import ModelConfig
-
-    from .mai_adapter import MAIAgentAdapter
-
 
 # Agent registry: agent_type -> (creator_function, config_schema)
 AGENT_REGISTRY: Dict[str, Callable] = {}
@@ -52,12 +46,13 @@ def register_agent(
 
 def create_agent(
     agent_type: str,
-    model_config:
-    agent_config:
+    model_config: ModelConfig,
+    agent_config: AgentConfig,
     agent_specific_config: AgentSpecificConfig,
+    device,
     takeover_callback: Callable | None = None,
     confirmation_callback: Callable | None = None,
-) ->
+) -> BaseAgent:
     """
     Create an agent instance using the factory pattern.
 
@@ -66,6 +61,7 @@ def create_agent(
         model_config: Model configuration
         agent_config: Agent configuration
         agent_specific_config: Agent-specific configuration (e.g., MAIConfig fields)
+        device: DeviceProtocol instance (provided by PhoneAgentManager)
        takeover_callback: Takeover callback
        confirmation_callback: Confirmation callback
 
@@ -88,6 +84,7 @@ def create_agent(
         model_config=model_config,
         agent_config=agent_config,
         agent_specific_config=agent_specific_config,
+        device=device,
         takeover_callback=takeover_callback,
         confirmation_callback=confirmation_callback,
     )
@@ -111,50 +108,46 @@ def is_agent_type_registered(agent_type: str) -> bool:
 # ==================== Built-in Agent Creators ====================
 
 
-def
-    model_config:
-    agent_config:
+def _create_glm_agent_v2(
+    model_config: ModelConfig,
+    agent_config: AgentConfig,
     agent_specific_config: AgentSpecificConfig,
+    device,
     takeover_callback: Callable | None = None,
    confirmation_callback: Callable | None = None,
-) ->
-    from
+) -> BaseAgent:
+    from .glm.agent import GLMAgent
 
-    return
+    return GLMAgent(
         model_config=model_config,
         agent_config=agent_config,
-
+        device=device,
         confirmation_callback=confirmation_callback,
+        takeover_callback=takeover_callback,
     )
 
 
-def
-    model_config:
-    agent_config:
+def _create_internal_mai_agent(
+    model_config: ModelConfig,
+    agent_config: AgentConfig,
     agent_specific_config: AgentSpecificConfig,
+    device,
     takeover_callback: Callable | None = None,
     confirmation_callback: Callable | None = None,
-) ->
-    from .
-
-
-    mai_config = MAIAgentConfig(
-        history_n=agent_specific_config.get("history_n", 3),
-        max_pixels=agent_specific_config.get("max_pixels"),
-        min_pixels=agent_specific_config.get("min_pixels"),
-        tools=agent_specific_config.get("tools"),
-        use_mai_prompt=agent_specific_config.get("use_mai_prompt", False),
-    )
+) -> BaseAgent:
+    from .mai.agent import InternalMAIAgent
+
+    history_n = agent_specific_config.get("history_n", 3)
 
-    return
+    return InternalMAIAgent(
         model_config=model_config,
         agent_config=agent_config,
-
-
+        device=device,
+        history_n=history_n,
        confirmation_callback=confirmation_callback,
+        takeover_callback=takeover_callback,
     )
 
 
-
-register_agent("
-register_agent("mai", _create_mai_agent)
+register_agent("glm", _create_glm_agent_v2)
+register_agent("mai", _create_internal_mai_agent)
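For orientation, a minimal sketch of how a third-party agent type could plug into the updated factory. MyAgent, its constructor, and the "my_agent" type name are hypothetical; only the creator signature (including the new device parameter) and the register_agent call follow the built-in creators above.

from typing import Callable

from AutoGLM_GUI.agents.factory import register_agent


class MyAgent:
    """Placeholder agent; a real implementation would satisfy BaseAgent."""

    def __init__(self, model_config, agent_config, device, **callbacks):
        self.model_config = model_config
        self.agent_config = agent_config
        self.device = device


def _create_my_agent(
    model_config,
    agent_config,
    agent_specific_config,
    device,  # new in 1.5.1: the DeviceProtocol instance is injected by the caller
    takeover_callback: Callable | None = None,
    confirmation_callback: Callable | None = None,
):
    return MyAgent(
        model_config=model_config,
        agent_config=agent_config,
        device=device,
        takeover_callback=takeover_callback,
        confirmation_callback=confirmation_callback,
    )


# Looked up later via create_agent("my_agent", ...), mirroring the built-in
# "glm" and "mai" registrations at the end of factory.py.
register_agent("my_agent", _create_my_agent)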

AutoGLM_GUI/agents/glm/agent.py
ADDED
@@ -0,0 +1,297 @@
+import json
+import traceback
+from typing import Any, Callable, cast
+
+from openai import OpenAI
+
+from AutoGLM_GUI.actions import ActionHandler, ActionResult
+from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
+from AutoGLM_GUI.device_protocol import DeviceProtocol
+from AutoGLM_GUI.logger import logger
+from AutoGLM_GUI.prompt_config import get_messages, get_system_prompt
+
+from .message_builder import MessageBuilder
+from .parser import GLMParser
+
+
+class GLMAgent:
+    def __init__(
+        self,
+        model_config: ModelConfig,
+        agent_config: AgentConfig,
+        device: DeviceProtocol,
+        confirmation_callback: Callable[[str], bool] | None = None,
+        takeover_callback: Callable[[str], None] | None = None,
+        thinking_callback: Callable[[str], None] | None = None,
+    ):
+        self.model_config = model_config
+        self.agent_config = agent_config
+
+        self.openai_client = OpenAI(
+            base_url=model_config.base_url,
+            api_key=model_config.api_key,
+            timeout=120,
+        )
+        self.parser = GLMParser()
+
+        self.device = device
+        self.action_handler = ActionHandler(
+            device=self.device,
+            confirmation_callback=confirmation_callback,
+            takeover_callback=takeover_callback,
+        )
+
+        self._context: list[dict[str, Any]] = []
+        self._step_count = 0
+        self._is_running = False
+        self._thinking_callback = thinking_callback
+
+    def run(self, task: str) -> str:
+        self._context = []
+        self._step_count = 0
+        self._is_running = True
+
+        try:
+            result = self._execute_step(task, is_first=True)
+
+            if result.finished:
+                return result.message or "Task completed"
+
+            while self._step_count < self.agent_config.max_steps and self._is_running:
+                result = self._execute_step(is_first=False)
+
+                if result.finished:
+                    return result.message or "Task completed"
+
+            return "Max steps reached"
+        finally:
+            self._is_running = False
+
+    def step(self, task: str | None = None) -> StepResult:
+        is_first = len(self._context) == 0
+
+        if is_first and not task:
+            raise ValueError("Task is required for the first step")
+
+        return self._execute_step(task, is_first)
+
+    def reset(self) -> None:
+        self._context = []
+        self._step_count = 0
+        self._is_running = False
+
+    def abort(self) -> None:
+        self._is_running = False
+        logger.info("Agent aborted by user")
+
+    def _stream_request(
+        self,
+        messages: list[dict[str, Any]],
+        on_thinking_chunk: Callable[[str], None] | None = None,
+    ) -> tuple[str, str, str]:
+        stream = self.openai_client.chat.completions.create(
+            messages=cast(Any, messages),
+            model=self.model_config.model_name,
+            max_tokens=self.model_config.max_tokens,
+            temperature=self.model_config.temperature,
+            top_p=self.model_config.top_p,
+            frequency_penalty=self.model_config.frequency_penalty,
+            extra_body=self.model_config.extra_body,
+            stream=True,
+        )
+
+        raw_content = ""
+        buffer = ""
+        action_markers = ["finish(message=", "do(action="]
+        in_action_phase = False
+
+        for chunk in stream:
+            if len(chunk.choices) == 0:
+                continue
+            if chunk.choices[0].delta.content is not None:
+                content = chunk.choices[0].delta.content
+                raw_content += content
+
+                if in_action_phase:
+                    continue
+
+                buffer += content
+
+                marker_found = False
+                for marker in action_markers:
+                    if marker in buffer:
+                        thinking_part = buffer.split(marker, 1)[0]
+                        if on_thinking_chunk:
+                            on_thinking_chunk(thinking_part)
+                        in_action_phase = True
+                        marker_found = True
+                        break
+
+                if marker_found:
+                    continue
+
+                is_potential_marker = False
+                for marker in action_markers:
+                    for i in range(1, len(marker)):
+                        if buffer.endswith(marker[:i]):
+                            is_potential_marker = True
+                            break
+                    if is_potential_marker:
+                        break
+
+                if not is_potential_marker:
+                    if on_thinking_chunk:
+                        on_thinking_chunk(buffer)
+                    buffer = ""
+
+        thinking, action = self._parse_raw_response(raw_content)
+        return thinking, action, raw_content
+
+    def _parse_raw_response(self, content: str) -> tuple[str, str]:
+        if "finish(message=" in content:
+            parts = content.split("finish(message=", 1)
+            thinking = parts[0].strip()
+            action = "finish(message=" + parts[1]
+            return thinking, action
+
+        if "do(action=" in content:
+            parts = content.split("do(action=", 1)
+            thinking = parts[0].strip()
+            action = "do(action=" + parts[1]
+            return thinking, action
+
+        if "<answer>" in content:
+            parts = content.split("<answer>", 1)
+            thinking = parts[0].replace("<think>", "").replace("</think>", "").strip()
+            action = parts[1].replace("</answer>", "").strip()
+            return thinking, action
+
+        return "", content
+
+    def _execute_step(
+        self, user_prompt: str | None = None, is_first: bool = False
+    ) -> StepResult:
+        self._step_count += 1
+
+        screenshot = self.device.get_screenshot()
+        current_app = self.device.get_current_app()
+
+        if is_first:
+            system_prompt = self.agent_config.system_prompt
+            if system_prompt is None:
+                system_prompt = get_system_prompt(self.agent_config.lang)
+
+            self._context.append(MessageBuilder.create_system_message(system_prompt))
+
+            screen_info = MessageBuilder.build_screen_info(current_app)
+            text_content = f"{user_prompt}\n\n{screen_info}"
+
+            self._context.append(
+                MessageBuilder.create_user_message(
+                    text=text_content, image_base64=screenshot.base64_data
+                )
+            )
+        else:
+            screen_info = MessageBuilder.build_screen_info(current_app)
+            # If there is a new user message (multi-turn conversation), add it to the message
+            if user_prompt:
+                text_content = f"{user_prompt}\n\n** Screen Info **\n\n{screen_info}"
+            else:
+                # Continue the current task; only the screen info is needed
+                text_content = f"** Screen Info **\n\n{screen_info}"
+
+            self._context.append(
+                MessageBuilder.create_user_message(
+                    text=text_content, image_base64=screenshot.base64_data
+                )
+            )
+
+        try:
+            msgs = get_messages(self.agent_config.lang)
+            if self.agent_config.verbose:
+                print("\n" + "=" * 50)
+                print(f"💭 {msgs['thinking']}:")
+                print("-" * 50)
+
+            callback = self._thinking_callback
+            if callback is None and self.agent_config.verbose:
+
+                def print_chunk(chunk: str) -> None:
+                    print(chunk, end="", flush=True)
+
+                callback = print_chunk
+
+            thinking, action_str, raw_content = self._stream_request(
+                self._context, on_thinking_chunk=callback
+            )
+        except Exception as e:
+            if self.agent_config.verbose:
+                traceback.print_exc()
+            return StepResult(
+                success=False,
+                finished=True,
+                action=None,
+                thinking="",
+                message=f"Model error: {e}",
+            )
+
+        try:
+            action = self.parser.parse(action_str)
+        except ValueError as e:
+            if self.agent_config.verbose:
+                logger.warning(f"Failed to parse action: {e}, treating as finish")
+            action = {"_metadata": "finish", "message": action_str}
+
+        if self.agent_config.verbose:
+            print()
+            print("-" * 50)
+            print(f"🎯 {msgs['action']}:")
+            print(json.dumps(action, ensure_ascii=False, indent=2))
+            print("=" * 50 + "\n")
+
+        self._context[-1] = MessageBuilder.remove_images_from_message(self._context[-1])
+
+        try:
+            result = self.action_handler.execute(
+                action, screenshot.width, screenshot.height
+            )
+        except Exception as e:
+            if self.agent_config.verbose:
+                traceback.print_exc()
+            result = ActionResult(success=False, should_finish=True, message=str(e))
+
+        self._context.append(
+            MessageBuilder.create_assistant_message(
+                f"<think>{thinking}</think><answer>{action_str}</answer>"
+            )
+        )
+
+        finished = action.get("_metadata") == "finish" or result.should_finish
+
+        if finished and self.agent_config.verbose:
+            msgs = get_messages(self.agent_config.lang)
+            print("\n" + "🎉 " + "=" * 48)
+            print(
+                f"✅ {msgs['task_completed']}: {result.message or action.get('message', msgs['done'])}"
+            )
+            print("=" * 50 + "\n")
+
+        return StepResult(
+            success=result.success,
+            finished=finished,
+            action=action,
+            thinking=thinking,
+            message=result.message or action.get("message"),
+        )
+
+    @property
+    def context(self) -> list[dict[str, Any]]:
+        return self._context.copy()
+
+    @property
+    def step_count(self) -> int:
+        return self._step_count
+
+    @property
+    def is_running(self) -> bool:
+        return self._is_running
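The agent streams the completion and forwards thinking text to a callback until an action marker ("do(action=" or "finish(message=") appears. The snippet below is a standalone illustration of how _parse_raw_response then splits such a response; the response string is made up.

raw_response = (
    "The Wi-Fi toggle is in the top-right corner, so I will tap it.\n"
    'do(action="Tap", element=[540, 1200])'
)

# Everything before the marker is treated as thinking; the marker plus the
# remainder becomes the action string that is handed to GLMParser.
thinking, rest = raw_response.split("do(action=", 1)
action_str = "do(action=" + rest

print(thinking.strip())  # The Wi-Fi toggle is in the top-right corner, so I will tap it.
print(action_str)        # do(action="Tap", element=[540, 1200])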

AutoGLM_GUI/agents/glm/message_builder.py
ADDED
@@ -0,0 +1,81 @@
+"""Message builder for GLM agent - copied from phone_agent.model.client.
+
+This is an exact copy of the upstream MessageBuilder to ensure consistent behavior.
+"""
+
+import json
+from typing import Any
+
+
+class MessageBuilder:
+    """Helper class for building conversation messages."""
+
+    @staticmethod
+    def create_system_message(content: str) -> dict[str, Any]:
+        """Create a system message."""
+        return {"role": "system", "content": content}
+
+    @staticmethod
+    def create_user_message(
+        text: str, image_base64: str | None = None
+    ) -> dict[str, Any]:
+        """
+        Create a user message with optional image.
+
+        Args:
+            text: Text content.
+            image_base64: Optional base64-encoded image.
+
+        Returns:
+            Message dictionary.
+        """
+        content = []
+
+        if image_base64:
+            content.append(
+                {
+                    "type": "image_url",
+                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
+                }
+            )
+
+        content.append({"type": "text", "text": text})
+
+        return {"role": "user", "content": content}
+
+    @staticmethod
+    def create_assistant_message(content: str) -> dict[str, Any]:
+        """Create an assistant message."""
+        return {"role": "assistant", "content": content}
+
+    @staticmethod
+    def remove_images_from_message(message: dict[str, Any]) -> dict[str, Any]:
+        """
+        Remove image content from a message to save context space.
+
+        Args:
+            message: Message dictionary.
+
+        Returns:
+            Message with images removed.
+        """
+        if isinstance(message.get("content"), list):
+            message["content"] = [
+                item for item in message["content"] if item.get("type") == "text"
+            ]
+        return message
+
+    @staticmethod
+    def build_screen_info(current_app: str, **extra_info) -> str:
+        """
+        Build screen info string for the model.
+
+        Args:
+            current_app: Current app name.
+            **extra_info: Additional info to include.
+
+        Returns:
+            JSON string with screen info.
+        """
+        info = {"current_app": current_app, **extra_info}
+        return json.dumps(info, ensure_ascii=False)
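An illustrative use of the copied MessageBuilder (the base64 payload is a fake placeholder):

from AutoGLM_GUI.agents.glm.message_builder import MessageBuilder

msg = MessageBuilder.create_user_message("Open Settings", image_base64="iVBORw0...")
# {'role': 'user', 'content': [
#     {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,iVBORw0...'}},
#     {'type': 'text', 'text': 'Open Settings'}]}

# GLMAgent calls this after each step so screenshots do not pile up in the context:
msg = MessageBuilder.remove_images_from_message(msg)
# {'role': 'user', 'content': [{'type': 'text', 'text': 'Open Settings'}]}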

AutoGLM_GUI/agents/glm/parser.py
ADDED
@@ -0,0 +1,110 @@
+import ast
+from typing import Any
+
+
+class GLMParser:
+    @property
+    def coordinate_scale(self) -> int:
+        return 1000
+
+    def parse(self, raw_response: str) -> dict[str, Any]:
+        action_str = raw_response.strip()
+
+        if action_str.startswith("finish("):
+            return self._parse_finish(action_str)
+        if action_str.startswith("do("):
+            return self._parse_do(action_str)
+        raise ValueError(f"Unknown action format: {action_str}")
+
+    def _parse_finish(self, action_str: str) -> dict[str, Any]:
+        try:
+            params = self._extract_params(action_str, "finish")
+            return {
+                "_metadata": "finish",
+                "message": params.get("message", "Task completed"),
+            }
+        except Exception as e:
+            raise ValueError(f"Failed to parse finish action: {e}") from e
+
+    def _parse_do(self, action_str: str) -> dict[str, Any]:
+        try:
+            params = self._extract_params(action_str, "do")
+            action_name = params.get("action", "")
+
+            result = {
+                "_metadata": "do",
+                "action": action_name,
+            }
+
+            for key, value in params.items():
+                if key != "action":
+                    result[key] = value
+
+            return result
+        except Exception as e:
+            raise ValueError(f"Failed to parse do action: {e}") from e
+
+    def _extract_params(self, action_str: str, function_name: str) -> dict[str, Any]:
+        prefix = f"{function_name}("
+        if not action_str.startswith(prefix):
+            raise ValueError(f"Action does not start with {prefix}")
+
+        params_str = action_str[len(prefix) : -1]
+
+        params: dict[str, Any] = {}
+        current_key = None
+        current_value = ""
+        in_quotes = False
+        quote_char = None
+        bracket_depth = 0
+        i = 0
+
+        while i < len(params_str):
+            char = params_str[i]
+
+            if char in ('"', "'") and (i == 0 or params_str[i - 1] != "\\"):
+                if not in_quotes:
+                    in_quotes = True
+                    quote_char = char
+                elif char == quote_char:
+                    in_quotes = False
+                    quote_char = None
+
+            if not in_quotes:
+                if char in ("[", "{"):
+                    bracket_depth += 1
+                elif char in ("]", "}"):
+                    bracket_depth -= 1
+
+                if char == "=" and bracket_depth == 0:
+                    current_key = current_value.strip()
+                    current_value = ""
+                    i += 1
+                    continue
+
+                if char == "," and bracket_depth == 0:
+                    if current_key:
+                        params[current_key] = self._parse_value(current_value.strip())
+                    current_key = None
+                    current_value = ""
+                    i += 1
+                    continue
+
+            current_value += char
+            i += 1
+
+        if current_key:
+            params[current_key] = self._parse_value(current_value.strip())
+
+        return params
+
+    def _parse_value(self, value_str: str) -> Any:
+        value_str = value_str.strip()
+
+        if not value_str:
+            return ""
+
+        try:
+            return ast.literal_eval(value_str)
+        except (ValueError, SyntaxError):
+            return value_str
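Example inputs and outputs for the new parser, following the code above (the coordinate values are illustrative):

from AutoGLM_GUI.agents.glm.parser import GLMParser

parser = GLMParser()

print(parser.parse('do(action="Tap", element=[540, 1200])'))
# {'_metadata': 'do', 'action': 'Tap', 'element': [540, 1200]}

print(parser.parse('finish(message="Task completed.")'))
# {'_metadata': 'finish', 'message': 'Task completed.'}

# Anything that starts with neither do( nor finish( raises ValueError, which
# GLMAgent catches and converts into a finish action carrying the raw text.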

{phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py
CHANGED
@@ -1,5 +1,3 @@
-"""System prompts for the AI agent."""
-
 from datetime import datetime
 
 today = datetime.today()
@@ -30,44 +28,44 @@ Your output should STRICTLY follow the format:
 Perform a tap action on a specified screen area. The element is a list of 2 integers, representing the coordinates of the tap point.
 **Example**:
 <answer>
-do(action
+do(action=\"Tap\", element=[x,y])
 </answer>
 - **Type**
 Enter text into the currently focused input field.
 **Example**:
 <answer>
-do(action
+do(action=\"Type\", text=\"Hello World\")
 </answer>
 - **Swipe**
 Perform a swipe action with start point and end point.
 **Examples**:
 <answer>
-do(action
+do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])
 </answer>
 - **Long Press**
 Perform a long press action on a specified screen area.
 You can add the element to the action to specify the long press area. The element is a list of 2 integers, representing the coordinates of the long press point.
 **Example**:
 <answer>
-do(action
+do(action=\"Long Press\", element=[x,y])
 </answer>
 - **Launch**
 Launch an app. Try to use launch action when you need to launch an app. Check the instruction to choose the right app before you use this action.
 **Example**:
 <answer>
-do(action
+do(action=\"Launch\", app=\"Settings\")
 </answer>
 - **Back**
 Press the Back button to navigate to the previous screen.
 **Example**:
 <answer>
-do(action
+do(action=\"Back\")
 </answer>
 - **Finish**
 Terminate the program and optionally print a message.
 **Example**:
 <answer>
-finish(message
+finish(message=\"Task completed.\")
 </answer>
 
 