autoglm-gui 0.4.14__py3-none-any.whl → 1.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. AutoGLM_GUI/api/devices.py +49 -0
  2. AutoGLM_GUI/schemas.py +16 -0
  3. AutoGLM_GUI/static/assets/{about-29B5FDM8.js → about-BOnRPlKQ.js} +1 -1
  4. AutoGLM_GUI/static/assets/chat-CGW6uMKB.js +149 -0
  5. AutoGLM_GUI/static/assets/{index-mVNV0VwM.js → index-CRFVU0eu.js} +1 -1
  6. AutoGLM_GUI/static/assets/{index-wu8Wjf12.js → index-DH-Dl4tK.js} +5 -5
  7. AutoGLM_GUI/static/assets/index-DzUQ89YC.css +1 -0
  8. AutoGLM_GUI/static/index.html +2 -2
  9. {autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/METADATA +3 -3
  10. autoglm_gui-1.0.1.dist-info/RECORD +73 -0
  11. phone_agent/__init__.py +3 -2
  12. phone_agent/actions/handler.py +124 -31
  13. phone_agent/actions/handler_ios.py +278 -0
  14. phone_agent/adb/connection.py +14 -5
  15. phone_agent/adb/device.py +47 -16
  16. phone_agent/agent.py +8 -8
  17. phone_agent/agent_ios.py +277 -0
  18. phone_agent/config/__init__.py +18 -0
  19. phone_agent/config/apps.py +1 -1
  20. phone_agent/config/apps_harmonyos.py +256 -0
  21. phone_agent/config/apps_ios.py +339 -0
  22. phone_agent/config/i18n.py +8 -0
  23. phone_agent/config/timing.py +167 -0
  24. phone_agent/device_factory.py +166 -0
  25. phone_agent/hdc/__init__.py +53 -0
  26. phone_agent/hdc/connection.py +384 -0
  27. phone_agent/hdc/device.py +269 -0
  28. phone_agent/hdc/input.py +145 -0
  29. phone_agent/hdc/screenshot.py +127 -0
  30. phone_agent/model/client.py +104 -4
  31. phone_agent/xctest/__init__.py +47 -0
  32. phone_agent/xctest/connection.py +379 -0
  33. phone_agent/xctest/device.py +472 -0
  34. phone_agent/xctest/input.py +311 -0
  35. phone_agent/xctest/screenshot.py +226 -0
  36. AutoGLM_GUI/static/assets/chat-DTN2oKtA.js +0 -149
  37. AutoGLM_GUI/static/assets/index-Dy550Qqg.css +0 -1
  38. autoglm_gui-0.4.14.dist-info/RECORD +0 -57
  39. {autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/WHEEL +0 -0
  40. {autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/entry_points.txt +0 -0
  41. {autoglm_gui-0.4.14.dist-info → autoglm_gui-1.0.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,278 @@
1
+ """Action handler for iOS automation using WebDriverAgent."""
2
+
3
+ import time
4
+ from dataclasses import dataclass
5
+ from typing import Any, Callable
6
+
7
+ from phone_agent.xctest import (
8
+ back,
9
+ double_tap,
10
+ home,
11
+ launch_app,
12
+ long_press,
13
+ swipe,
14
+ tap,
15
+ )
16
+ from phone_agent.xctest.input import clear_text, hide_keyboard, type_text
17
+
18
+
19
+ @dataclass
20
+ class ActionResult:
21
+ """Result of an action execution."""
22
+
23
+ success: bool
24
+ should_finish: bool
25
+ message: str | None = None
26
+ requires_confirmation: bool = False
27
+
28
+
29
+ class IOSActionHandler:
30
+ """
31
+ Handles execution of actions from AI model output for iOS devices.
32
+
33
+ Args:
34
+ wda_url: WebDriverAgent URL.
35
+ session_id: Optional WDA session ID.
36
+ confirmation_callback: Optional callback for sensitive action confirmation.
37
+ Should return True to proceed, False to cancel.
38
+ takeover_callback: Optional callback for takeover requests (login, captcha).
39
+ """
40
+
41
+ def __init__(
42
+ self,
43
+ wda_url: str = "http://localhost:8100",
44
+ session_id: str | None = None,
45
+ confirmation_callback: Callable[[str], bool] | None = None,
46
+ takeover_callback: Callable[[str], None] | None = None,
47
+ ):
48
+ self.wda_url = wda_url
49
+ self.session_id = session_id
50
+ self.confirmation_callback = confirmation_callback or self._default_confirmation
51
+ self.takeover_callback = takeover_callback or self._default_takeover
52
+
53
+ def execute(
54
+ self, action: dict[str, Any], screen_width: int, screen_height: int
55
+ ) -> ActionResult:
56
+ """
57
+ Execute an action from the AI model.
58
+
59
+ Args:
60
+ action: The action dictionary from the model.
61
+ screen_width: Current screen width in pixels.
62
+ screen_height: Current screen height in pixels.
63
+
64
+ Returns:
65
+ ActionResult indicating success and whether to finish.
66
+ """
67
+ action_type = action.get("_metadata")
68
+
69
+ if action_type == "finish":
70
+ return ActionResult(
71
+ success=True, should_finish=True, message=action.get("message")
72
+ )
73
+
74
+ if action_type != "do":
75
+ return ActionResult(
76
+ success=False,
77
+ should_finish=True,
78
+ message=f"Unknown action type: {action_type}",
79
+ )
80
+
81
+ action_name = action.get("action")
82
+ handler_method = self._get_handler(action_name)
83
+
84
+ if handler_method is None:
85
+ return ActionResult(
86
+ success=False,
87
+ should_finish=False,
88
+ message=f"Unknown action: {action_name}",
89
+ )
90
+
91
+ try:
92
+ return handler_method(action, screen_width, screen_height)
93
+ except Exception as e:
94
+ return ActionResult(
95
+ success=False, should_finish=False, message=f"Action failed: {e}"
96
+ )
97
+
98
+ def _get_handler(self, action_name: str) -> Callable | None:
99
+ """Get the handler method for an action."""
100
+ handlers = {
101
+ "Launch": self._handle_launch,
102
+ "Tap": self._handle_tap,
103
+ "Type": self._handle_type,
104
+ "Type_Name": self._handle_type,
105
+ "Swipe": self._handle_swipe,
106
+ "Back": self._handle_back,
107
+ "Home": self._handle_home,
108
+ "Double Tap": self._handle_double_tap,
109
+ "Long Press": self._handle_long_press,
110
+ "Wait": self._handle_wait,
111
+ "Take_over": self._handle_takeover,
112
+ "Note": self._handle_note,
113
+ "Call_API": self._handle_call_api,
114
+ "Interact": self._handle_interact,
115
+ }
116
+ return handlers.get(action_name)
117
+
118
+ def _convert_relative_to_absolute(
119
+ self, element: list[int], screen_width: int, screen_height: int
120
+ ) -> tuple[int, int]:
121
+ """Convert relative coordinates (0-1000) to absolute pixels."""
122
+ x = int(element[0] / 1000 * screen_width)
123
+ y = int(element[1] / 1000 * screen_height)
124
+ return x, y
125
+
126
+ def _handle_launch(self, action: dict, width: int, height: int) -> ActionResult:
127
+ """Handle app launch action."""
128
+ app_name = action.get("app")
129
+ if not app_name:
130
+ return ActionResult(False, False, "No app name specified")
131
+
132
+ success = launch_app(app_name, wda_url=self.wda_url, session_id=self.session_id)
133
+ if success:
134
+ return ActionResult(True, False)
135
+ return ActionResult(False, False, f"App not found: {app_name}")
136
+
137
+ def _handle_tap(self, action: dict, width: int, height: int) -> ActionResult:
138
+ """Handle tap action."""
139
+ element = action.get("element")
140
+ if not element:
141
+ return ActionResult(False, False, "No element coordinates")
142
+
143
+ x, y = self._convert_relative_to_absolute(element, width, height)
144
+
145
+ print(f"Physically tap on ({x}, {y})")
146
+
147
+ # Check for sensitive operation
148
+ if "message" in action:
149
+ if not self.confirmation_callback(action["message"]):
150
+ return ActionResult(
151
+ success=False,
152
+ should_finish=True,
153
+ message="User cancelled sensitive operation",
154
+ )
155
+
156
+ tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
157
+ return ActionResult(True, False)
158
+
159
+ def _handle_type(self, action: dict, width: int, height: int) -> ActionResult:
160
+ """Handle text input action."""
161
+ text = action.get("text", "")
162
+
163
+ # Clear existing text and type new text
164
+ clear_text(wda_url=self.wda_url, session_id=self.session_id)
165
+ time.sleep(0.5)
166
+
167
+ type_text(text, wda_url=self.wda_url, session_id=self.session_id)
168
+ time.sleep(0.5)
169
+
170
+ # Hide keyboard after typing
171
+ hide_keyboard(wda_url=self.wda_url, session_id=self.session_id)
172
+ time.sleep(0.5)
173
+
174
+ return ActionResult(True, False)
175
+
176
+ def _handle_swipe(self, action: dict, width: int, height: int) -> ActionResult:
177
+ """Handle swipe action."""
178
+ start = action.get("start")
179
+ end = action.get("end")
180
+
181
+ if not start or not end:
182
+ return ActionResult(False, False, "Missing swipe coordinates")
183
+
184
+ start_x, start_y = self._convert_relative_to_absolute(start, width, height)
185
+ end_x, end_y = self._convert_relative_to_absolute(end, width, height)
186
+
187
+ print(f"Physically scroll from ({start_x}, {start_y}) to ({end_x}, {end_y})")
188
+
189
+ swipe(
190
+ start_x,
191
+ start_y,
192
+ end_x,
193
+ end_y,
194
+ wda_url=self.wda_url,
195
+ session_id=self.session_id,
196
+ )
197
+ return ActionResult(True, False)
198
+
199
+ def _handle_back(self, action: dict, width: int, height: int) -> ActionResult:
200
+ """Handle back gesture (swipe from left edge)."""
201
+ back(wda_url=self.wda_url, session_id=self.session_id)
202
+ return ActionResult(True, False)
203
+
204
+ def _handle_home(self, action: dict, width: int, height: int) -> ActionResult:
205
+ """Handle home button action."""
206
+ home(wda_url=self.wda_url, session_id=self.session_id)
207
+ return ActionResult(True, False)
208
+
209
+ def _handle_double_tap(self, action: dict, width: int, height: int) -> ActionResult:
210
+ """Handle double tap action."""
211
+ element = action.get("element")
212
+ if not element:
213
+ return ActionResult(False, False, "No element coordinates")
214
+
215
+ x, y = self._convert_relative_to_absolute(element, width, height)
216
+ double_tap(x, y, wda_url=self.wda_url, session_id=self.session_id)
217
+ return ActionResult(True, False)
218
+
219
+ def _handle_long_press(self, action: dict, width: int, height: int) -> ActionResult:
220
+ """Handle long press action."""
221
+ element = action.get("element")
222
+ if not element:
223
+ return ActionResult(False, False, "No element coordinates")
224
+
225
+ x, y = self._convert_relative_to_absolute(element, width, height)
226
+ long_press(
227
+ x,
228
+ y,
229
+ duration=3.0,
230
+ wda_url=self.wda_url,
231
+ session_id=self.session_id,
232
+ )
233
+ return ActionResult(True, False)
234
+
235
+ def _handle_wait(self, action: dict, width: int, height: int) -> ActionResult:
236
+ """Handle wait action."""
237
+ duration_str = action.get("duration", "1 seconds")
238
+ try:
239
+ duration = float(duration_str.replace("seconds", "").strip())
240
+ except ValueError:
241
+ duration = 1.0
242
+
243
+ time.sleep(duration)
244
+ return ActionResult(True, False)
245
+
246
+ def _handle_takeover(self, action: dict, width: int, height: int) -> ActionResult:
247
+ """Handle takeover request (login, captcha, etc.)."""
248
+ message = action.get("message", "User intervention required")
249
+ self.takeover_callback(message)
250
+ return ActionResult(True, False)
251
+
252
+ def _handle_note(self, action: dict, width: int, height: int) -> ActionResult:
253
+ """Handle note action (placeholder for content recording)."""
254
+ # This action is typically used for recording page content
255
+ # Implementation depends on specific requirements
256
+ return ActionResult(True, False)
257
+
258
+ def _handle_call_api(self, action: dict, width: int, height: int) -> ActionResult:
259
+ """Handle API call action (placeholder for summarization)."""
260
+ # This action is typically used for content summarization
261
+ # Implementation depends on specific requirements
262
+ return ActionResult(True, False)
263
+
264
+ def _handle_interact(self, action: dict, width: int, height: int) -> ActionResult:
265
+ """Handle interaction request (user choice needed)."""
266
+ # This action signals that user input is needed
267
+ return ActionResult(True, False, message="User interaction required")
268
+
269
+ @staticmethod
270
+ def _default_confirmation(message: str) -> bool:
271
+ """Default confirmation callback using console input."""
272
+ response = input(f"Sensitive operation: {message}\nConfirm? (Y/N): ")
273
+ return response.upper() == "Y"
274
+
275
+ @staticmethod
276
+ def _default_takeover(message: str) -> None:
277
+ """Default takeover callback using console input."""
278
+ input(f"{message}\nPress Enter after completing manual operation...")
@@ -5,6 +5,8 @@ import time
5
5
  from dataclasses import dataclass
6
6
  from enum import Enum
7
7
 
8
+ from phone_agent.config.timing import TIMING_CONFIG
9
+
8
10
 
9
11
  class ConnectionType(Enum):
10
12
  """Type of ADB connection."""
@@ -106,7 +108,9 @@ class ADBConnection:
106
108
  if address:
107
109
  cmd.append(address)
108
110
 
109
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
111
+ result = subprocess.run(
112
+ cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
113
+ )
110
114
 
111
115
  output = result.stdout + result.stderr
112
116
  return True, output.strip() or "Disconnected"
@@ -238,12 +242,14 @@ class ADBConnection:
238
242
  cmd.extend(["-s", device_id])
239
243
  cmd.extend(["tcpip", str(port)])
240
244
 
241
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
245
+ result = subprocess.run(
246
+ cmd, capture_output=True, text=True, encoding="utf-8", timeout=10
247
+ )
242
248
 
243
249
  output = result.stdout + result.stderr
244
250
 
245
251
  if "restarting" in output.lower() or result.returncode == 0:
246
- time.sleep(2) # Wait for ADB to restart
252
+ time.sleep(TIMING_CONFIG.connection.adb_restart_delay)
247
253
  return True, f"TCP/IP mode enabled on port {port}"
248
254
  else:
249
255
  return False, output.strip()
@@ -267,7 +273,9 @@ class ADBConnection:
267
273
  cmd.extend(["-s", device_id])
268
274
  cmd.extend(["shell", "ip", "route"])
269
275
 
270
- result = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
276
+ result = subprocess.run(
277
+ cmd, capture_output=True, text=True, encoding="utf-8", timeout=5
278
+ )
271
279
 
272
280
  # Parse IP from route output
273
281
  for line in result.stdout.split("\n"):
@@ -283,6 +291,7 @@ class ADBConnection:
283
291
  cmd[:-1] + ["shell", "ip", "addr", "show", "wlan0"],
284
292
  capture_output=True,
285
293
  text=True,
294
+ encoding="utf-8",
286
295
  timeout=5,
287
296
  )
288
297
 
@@ -311,7 +320,7 @@ class ADBConnection:
311
320
  [self.adb_path, "kill-server"], capture_output=True, timeout=5
312
321
  )
313
322
 
314
- time.sleep(1)
323
+ time.sleep(TIMING_CONFIG.connection.server_restart_delay)
315
324
 
316
325
  # Start server
317
326
  subprocess.run(
phone_agent/adb/device.py CHANGED
@@ -4,6 +4,7 @@ import subprocess
4
4
  import time
5
5
 
6
6
  from phone_agent.config.apps import APP_PACKAGES
7
+ from phone_agent.config.timing import TIMING_CONFIG
7
8
 
8
9
 
9
10
  def get_current_app(device_id: str | None = None) -> str:
@@ -19,9 +20,14 @@ def get_current_app(device_id: str | None = None) -> str:
19
20
  adb_prefix = _get_adb_prefix(device_id)
20
21
 
21
22
  result = subprocess.run(
22
- adb_prefix + ["shell", "dumpsys", "window"], capture_output=True, text=True
23
+ adb_prefix + ["shell", "dumpsys", "window"],
24
+ capture_output=True,
25
+ text=True,
26
+ encoding="utf-8",
23
27
  )
24
28
  output = result.stdout
29
+ if not output:
30
+ raise ValueError("No output from dumpsys window")
25
31
 
26
32
  # Parse window focus info
27
33
  for line in output.split("\n"):
@@ -33,7 +39,9 @@ def get_current_app(device_id: str | None = None) -> str:
33
39
  return "System Home"
34
40
 
35
41
 
36
- def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> None:
42
+ def tap(
43
+ x: int, y: int, device_id: str | None = None, delay: float | None = None
44
+ ) -> None:
37
45
  """
38
46
  Tap at the specified coordinates.
39
47
 
@@ -41,8 +49,11 @@ def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> Non
41
49
  x: X coordinate.
42
50
  y: Y coordinate.
43
51
  device_id: Optional ADB device ID.
44
- delay: Delay in seconds after tap.
52
+ delay: Delay in seconds after tap. If None, uses configured default.
45
53
  """
54
+ if delay is None:
55
+ delay = TIMING_CONFIG.device.default_tap_delay
56
+
46
57
  adb_prefix = _get_adb_prefix(device_id)
47
58
 
48
59
  subprocess.run(
@@ -52,7 +63,7 @@ def tap(x: int, y: int, device_id: str | None = None, delay: float = 1.0) -> Non
52
63
 
53
64
 
54
65
  def double_tap(
55
- x: int, y: int, device_id: str | None = None, delay: float = 1.0
66
+ x: int, y: int, device_id: str | None = None, delay: float | None = None
56
67
  ) -> None:
57
68
  """
58
69
  Double tap at the specified coordinates.
@@ -61,14 +72,17 @@ def double_tap(
61
72
  x: X coordinate.
62
73
  y: Y coordinate.
63
74
  device_id: Optional ADB device ID.
64
- delay: Delay in seconds after double tap.
75
+ delay: Delay in seconds after double tap. If None, uses configured default.
65
76
  """
77
+ if delay is None:
78
+ delay = TIMING_CONFIG.device.default_double_tap_delay
79
+
66
80
  adb_prefix = _get_adb_prefix(device_id)
67
81
 
68
82
  subprocess.run(
69
83
  adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
70
84
  )
71
- time.sleep(0.1)
85
+ time.sleep(TIMING_CONFIG.device.double_tap_interval)
72
86
  subprocess.run(
73
87
  adb_prefix + ["shell", "input", "tap", str(x), str(y)], capture_output=True
74
88
  )
@@ -80,7 +94,7 @@ def long_press(
80
94
  y: int,
81
95
  duration_ms: int = 3000,
82
96
  device_id: str | None = None,
83
- delay: float = 1.0,
97
+ delay: float | None = None,
84
98
  ) -> None:
85
99
  """
86
100
  Long press at the specified coordinates.
@@ -90,8 +104,11 @@ def long_press(
90
104
  y: Y coordinate.
91
105
  duration_ms: Duration of press in milliseconds.
92
106
  device_id: Optional ADB device ID.
93
- delay: Delay in seconds after long press.
107
+ delay: Delay in seconds after long press. If None, uses configured default.
94
108
  """
109
+ if delay is None:
110
+ delay = TIMING_CONFIG.device.default_long_press_delay
111
+
95
112
  adb_prefix = _get_adb_prefix(device_id)
96
113
 
97
114
  subprocess.run(
@@ -109,7 +126,7 @@ def swipe(
109
126
  end_y: int,
110
127
  duration_ms: int | None = None,
111
128
  device_id: str | None = None,
112
- delay: float = 1.0,
129
+ delay: float | None = None,
113
130
  ) -> None:
114
131
  """
115
132
  Swipe from start to end coordinates.
@@ -121,8 +138,11 @@ def swipe(
121
138
  end_y: Ending Y coordinate.
122
139
  duration_ms: Duration of swipe in milliseconds (auto-calculated if None).
123
140
  device_id: Optional ADB device ID.
124
- delay: Delay in seconds after swipe.
141
+ delay: Delay in seconds after swipe. If None, uses configured default.
125
142
  """
143
+ if delay is None:
144
+ delay = TIMING_CONFIG.device.default_swipe_delay
145
+
126
146
  adb_prefix = _get_adb_prefix(device_id)
127
147
 
128
148
  if duration_ms is None:
@@ -148,14 +168,17 @@ def swipe(
148
168
  time.sleep(delay)
149
169
 
150
170
 
151
- def back(device_id: str | None = None, delay: float = 1.0) -> None:
171
+ def back(device_id: str | None = None, delay: float | None = None) -> None:
152
172
  """
153
173
  Press the back button.
154
174
 
155
175
  Args:
156
176
  device_id: Optional ADB device ID.
157
- delay: Delay in seconds after pressing back.
177
+ delay: Delay in seconds after pressing back. If None, uses configured default.
158
178
  """
179
+ if delay is None:
180
+ delay = TIMING_CONFIG.device.default_back_delay
181
+
159
182
  adb_prefix = _get_adb_prefix(device_id)
160
183
 
161
184
  subprocess.run(
@@ -164,14 +187,17 @@ def back(device_id: str | None = None, delay: float = 1.0) -> None:
164
187
  time.sleep(delay)
165
188
 
166
189
 
167
- def home(device_id: str | None = None, delay: float = 1.0) -> None:
190
+ def home(device_id: str | None = None, delay: float | None = None) -> None:
168
191
  """
169
192
  Press the home button.
170
193
 
171
194
  Args:
172
195
  device_id: Optional ADB device ID.
173
- delay: Delay in seconds after pressing home.
196
+ delay: Delay in seconds after pressing home. If None, uses configured default.
174
197
  """
198
+ if delay is None:
199
+ delay = TIMING_CONFIG.device.default_home_delay
200
+
175
201
  adb_prefix = _get_adb_prefix(device_id)
176
202
 
177
203
  subprocess.run(
@@ -180,18 +206,23 @@ def home(device_id: str | None = None, delay: float = 1.0) -> None:
180
206
  time.sleep(delay)
181
207
 
182
208
 
183
- def launch_app(app_name: str, device_id: str | None = None, delay: float = 1.0) -> bool:
209
+ def launch_app(
210
+ app_name: str, device_id: str | None = None, delay: float | None = None
211
+ ) -> bool:
184
212
  """
185
213
  Launch an app by name.
186
214
 
187
215
  Args:
188
216
  app_name: The app name (must be in APP_PACKAGES).
189
217
  device_id: Optional ADB device ID.
190
- delay: Delay in seconds after launching.
218
+ delay: Delay in seconds after launching. If None, uses configured default.
191
219
 
192
220
  Returns:
193
221
  True if app was launched, False if app not found.
194
222
  """
223
+ if delay is None:
224
+ delay = TIMING_CONFIG.device.default_launch_delay
225
+
195
226
  if app_name not in APP_PACKAGES:
196
227
  return False
197
228
 
phone_agent/agent.py CHANGED
@@ -7,8 +7,8 @@ from typing import Any, Callable
7
7
 
8
8
  from phone_agent.actions import ActionHandler
9
9
  from phone_agent.actions.handler import finish, parse_action
10
- from phone_agent.adb import get_current_app, get_screenshot
11
10
  from phone_agent.config import get_messages, get_system_prompt
11
+ from phone_agent.device_factory import get_device_factory
12
12
  from phone_agent.model import ModelClient, ModelConfig
13
13
  from phone_agent.model.client import MessageBuilder
14
14
 
@@ -140,8 +140,9 @@ class PhoneAgent:
140
140
  self._step_count += 1
141
141
 
142
142
  # Capture current screen state
143
- screenshot = get_screenshot(self.agent_config.device_id)
144
- current_app = get_current_app(self.agent_config.device_id)
143
+ device_factory = get_device_factory()
144
+ screenshot = device_factory.get_screenshot(self.agent_config.device_id)
145
+ current_app = device_factory.get_current_app(self.agent_config.device_id)
145
146
 
146
147
  # Build messages
147
148
  if is_first:
@@ -169,6 +170,10 @@ class PhoneAgent:
169
170
 
170
171
  # Get model response
171
172
  try:
173
+ msgs = get_messages(self.agent_config.lang)
174
+ print("\n" + "=" * 50)
175
+ print(f"💭 {msgs['thinking']}:")
176
+ print("-" * 50)
172
177
  response = self.model_client.request(self._context)
173
178
  except Exception as e:
174
179
  if self.agent_config.verbose:
@@ -191,11 +196,6 @@ class PhoneAgent:
191
196
 
192
197
  if self.agent_config.verbose:
193
198
  # Print thinking process
194
- msgs = get_messages(self.agent_config.lang)
195
- print("\n" + "=" * 50)
196
- print(f"💭 {msgs['thinking']}:")
197
- print("-" * 50)
198
- print(response.thinking)
199
199
  print("-" * 50)
200
200
  print(f"🎯 {msgs['action']}:")
201
201
  print(json.dumps(action, ensure_ascii=False, indent=2))