autoglm-gui 1.4.1__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (104)
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. AutoGLM_GUI/actions/handler.py +196 -0
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. AutoGLM_GUI/adb/__init__.py +53 -0
  7. AutoGLM_GUI/adb/apps.py +227 -0
  8. AutoGLM_GUI/adb/connection.py +323 -0
  9. AutoGLM_GUI/adb/device.py +171 -0
  10. AutoGLM_GUI/adb/input.py +67 -0
  11. AutoGLM_GUI/adb/screenshot.py +11 -0
  12. AutoGLM_GUI/adb/timing.py +167 -0
  13. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  14. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  15. AutoGLM_GUI/adb_plus/serial.py +38 -20
  16. AutoGLM_GUI/adb_plus/touch.py +4 -9
  17. AutoGLM_GUI/agents/__init__.py +43 -12
  18. AutoGLM_GUI/agents/events.py +19 -0
  19. AutoGLM_GUI/agents/factory.py +31 -38
  20. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  21. AutoGLM_GUI/agents/glm/agent.py +292 -0
  22. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  23. AutoGLM_GUI/agents/glm/parser.py +110 -0
  24. AutoGLM_GUI/agents/glm/prompts_en.py +77 -0
  25. AutoGLM_GUI/agents/glm/prompts_zh.py +75 -0
  26. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  27. AutoGLM_GUI/agents/mai/agent.py +405 -0
  28. AutoGLM_GUI/agents/mai/parser.py +254 -0
  29. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  30. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  31. AutoGLM_GUI/agents/protocols.py +12 -8
  32. AutoGLM_GUI/agents/stream_runner.py +188 -0
  33. AutoGLM_GUI/api/__init__.py +40 -21
  34. AutoGLM_GUI/api/agents.py +157 -240
  35. AutoGLM_GUI/api/control.py +9 -6
  36. AutoGLM_GUI/api/devices.py +102 -12
  37. AutoGLM_GUI/api/history.py +78 -0
  38. AutoGLM_GUI/api/layered_agent.py +67 -15
  39. AutoGLM_GUI/api/media.py +64 -1
  40. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  41. AutoGLM_GUI/config.py +81 -0
  42. AutoGLM_GUI/config_manager.py +68 -51
  43. AutoGLM_GUI/device_manager.py +248 -29
  44. AutoGLM_GUI/device_protocol.py +1 -1
  45. AutoGLM_GUI/devices/adb_device.py +5 -10
  46. AutoGLM_GUI/devices/mock_device.py +4 -2
  47. AutoGLM_GUI/devices/remote_device.py +8 -3
  48. AutoGLM_GUI/history_manager.py +164 -0
  49. AutoGLM_GUI/i18n.py +81 -0
  50. AutoGLM_GUI/model/__init__.py +5 -0
  51. AutoGLM_GUI/model/message_builder.py +69 -0
  52. AutoGLM_GUI/model/types.py +24 -0
  53. AutoGLM_GUI/models/__init__.py +10 -0
  54. AutoGLM_GUI/models/history.py +96 -0
  55. AutoGLM_GUI/models/scheduled_task.py +71 -0
  56. AutoGLM_GUI/parsers/__init__.py +22 -0
  57. AutoGLM_GUI/parsers/base.py +50 -0
  58. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  59. AutoGLM_GUI/phone_agent_manager.py +62 -396
  60. AutoGLM_GUI/platform_utils.py +26 -0
  61. AutoGLM_GUI/prompt_config.py +15 -0
  62. AutoGLM_GUI/prompts/__init__.py +32 -0
  63. AutoGLM_GUI/scheduler_manager.py +304 -0
  64. AutoGLM_GUI/schemas.py +234 -72
  65. AutoGLM_GUI/scrcpy_stream.py +142 -24
  66. AutoGLM_GUI/socketio_server.py +100 -27
  67. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-BQm96DAl.js} +1 -1
  68. AutoGLM_GUI/static/assets/alert-dialog-B42XxGPR.js +1 -0
  69. AutoGLM_GUI/static/assets/chat-C0L2gQYG.js +129 -0
  70. AutoGLM_GUI/static/assets/circle-alert-D4rSJh37.js +1 -0
  71. AutoGLM_GUI/static/assets/dialog-DZ78cEcj.js +45 -0
  72. AutoGLM_GUI/static/assets/history-DFBv7TGc.js +1 -0
  73. AutoGLM_GUI/static/assets/index-Bzyv2yQ2.css +1 -0
  74. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-CmZSnDqc.js} +1 -1
  75. AutoGLM_GUI/static/assets/index-CssG-3TH.js +11 -0
  76. AutoGLM_GUI/static/assets/label-BCUzE_nm.js +1 -0
  77. AutoGLM_GUI/static/assets/logs-eoFxn5of.js +1 -0
  78. AutoGLM_GUI/static/assets/popover-DLsuV5Sx.js +1 -0
  79. AutoGLM_GUI/static/assets/scheduled-tasks-MyqGJvy_.js +1 -0
  80. AutoGLM_GUI/static/assets/square-pen-zGWYrdfj.js +1 -0
  81. AutoGLM_GUI/static/assets/textarea-BX6y7uM5.js +1 -0
  82. AutoGLM_GUI/static/assets/workflows-CYFs6ssC.js +1 -0
  83. AutoGLM_GUI/static/index.html +2 -2
  84. AutoGLM_GUI/types.py +17 -0
  85. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/METADATA +137 -130
  86. autoglm_gui-1.5.0.dist-info/RECORD +157 -0
  87. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  88. AutoGLM_GUI/api/dual_model.py +0 -317
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/WHEEL +0 -0
  103. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/entry_points.txt +0 -0
  104. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,627 +0,0 @@
1
- """MAI Agent adapter for AutoGLM-GUI.
2
-
3
- This module provides an adapter that wraps mai_agent.MAIUINaivigationAgent
4
- to make it compatible with the PhoneAgent interface used in AutoGLM-GUI.
5
- """
6
-
7
- from __future__ import annotations
8
-
9
- import base64
10
- import re
11
- import sys
12
- from dataclasses import dataclass
13
- from io import BytesIO
14
- from pathlib import Path
15
- from typing import TYPE_CHECKING, Any, Callable, Optional, Tuple
16
-
17
- from PIL import Image
18
-
19
- from phone_agent.actions.handler import ActionHandler
20
- from phone_agent.agent import AgentConfig, StepResult
21
- from phone_agent.device_factory import get_device_factory
22
- from phone_agent.model import ModelConfig
23
-
24
- from AutoGLM_GUI.logger import logger
25
-
26
-
27
- # Add mai_agent to sys.path for import
28
- # mai_agent uses top-level imports (e.g., "from base import BaseAgent")
29
- # which require the mai_agent directory to be in Python path
30
- def _ensure_mai_agent_importable() -> None:
31
- """Ensure mai_agent directory is in sys.path for importing.
32
-
33
- This function handles multiple environments:
34
- - Development: mai_agent is in project root
35
- - Wheel installation: mai_agent is installed as data file
36
- - PyInstaller: mai_agent is in sys._MEIPASS
37
- """
38
- # Check if already importable
39
- try:
40
- import mai_naivigation_agent # type: ignore[import-not-found] # noqa: F401
41
-
42
- return
43
- except ImportError:
44
- pass
45
-
46
- # Try to locate mai_agent directory
47
- mai_agent_paths = []
48
-
49
- # 1. PyInstaller environment: check sys._MEIPASS
50
- if getattr(sys, "frozen", False) and hasattr(sys, "_MEIPASS"):
51
- meipass = Path(getattr(sys, "_MEIPASS"))
52
- mai_agent_paths.append(meipass / "mai_agent")
53
-
54
- # 2. Wheel installation: check site-packages
55
- # Try to get the package location
56
- try:
57
- import AutoGLM_GUI
58
-
59
- pkg_root = Path(AutoGLM_GUI.__file__).parent.parent
60
- mai_agent_paths.append(pkg_root / "mai_agent")
61
- except (ImportError, AttributeError):
62
- pass
63
-
64
- # 3. Development environment: check project root relative to this file
65
- # This file is at: AutoGLM_GUI/agents/mai_adapter.py
66
- # Project root is 3 levels up
67
- current_file = Path(__file__)
68
- project_root = current_file.parent.parent.parent
69
- mai_agent_paths.append(project_root / "mai_agent")
70
-
71
- # Add first existing path to sys.path
72
- for mai_path in mai_agent_paths:
73
- if mai_path.exists() and mai_path.is_dir():
74
- mai_path_str = str(mai_path)
75
- if mai_path_str not in sys.path:
76
- sys.path.insert(0, mai_path_str)
77
- logger.debug(f"Added {mai_path_str} to sys.path for mai_agent imports")
78
- return
79
-
80
- # If we get here, mai_agent was not found
81
- logger.warning(
82
- "mai_agent directory not found. MAI Agent functionality may not work."
83
- )
84
-
85
-
86
- _ensure_mai_agent_importable()
87
-
88
- if TYPE_CHECKING:
89
- from mai_naivigation_agent import MAIUINaivigationAgent # type: ignore[import-not-found]
90
-
91
-
92
- @dataclass
93
- class MAIAgentConfig:
94
- """MAI Agent specific configuration.
95
-
96
- Attributes:
97
- history_n: Number of historical screenshots to include in context.
98
- max_pixels: Maximum pixels for image resizing (optional).
99
- min_pixels: Minimum pixels for image resizing (optional).
100
- tools: MCP tools list (optional, not implemented yet).
101
- use_mai_prompt: Whether to use MAI's native prompt format.
102
- """
103
-
104
- history_n: int = 3
105
- max_pixels: Optional[int] = None
106
- min_pixels: Optional[int] = None
107
- tools: Optional[list[dict[str, Any]]] = None
108
- use_mai_prompt: bool = False
109
-
110
-
111
- class MAIAgentAdapter:
112
- """
113
- Adapter for MAI Agent that implements PhoneAgent-compatible interface.
114
-
115
- This adapter wraps mai_agent.MAIUINaivigationAgent and provides:
116
- - Compatible run() and step() methods
117
- - Action format conversion (MAI → PhoneAgent)
118
- - Coordinate system conversion (0-999 → 0-1000)
119
- - Reuses existing ActionHandler for ADB operations
120
- - Trajectory management via MAI's TrajMemory
121
-
122
- Example:
123
- >>> adapter = MAIAgentAdapter(model_config, agent_config, mai_config)
124
- >>> result = adapter.run("Open Settings")
125
- >>> print(result)
126
- """
127
-
128
- def __init__(
129
- self,
130
- model_config: ModelConfig,
131
- agent_config: AgentConfig,
132
- mai_config: MAIAgentConfig,
133
- confirmation_callback: Optional[Callable[[str], bool]] = None,
134
- takeover_callback: Optional[Callable[[str], None]] = None,
135
- on_thinking_chunk: Optional[Callable[[str], None]] = None,
136
- ):
137
- """Initialize the MAI Agent adapter.
138
-
139
- Args:
140
- model_config: Model configuration (base_url, model_name, etc.)
141
- agent_config: Agent configuration (device_id, max_steps, etc.)
142
- mai_config: MAI-specific configuration
143
- confirmation_callback: Callback for sensitive action confirmation
144
- takeover_callback: Callback for takeover requests
145
- on_thinking_chunk: Callback for streaming thinking chunks
146
- """
147
- self.model_config = model_config
148
- self.agent_config = agent_config
149
- self.mai_config = mai_config
150
-
151
- from mai_naivigation_agent import MAIUINaivigationAgent # type: ignore[import-not-found]
152
-
153
- runtime_conf = {
154
- "history_n": mai_config.history_n,
155
- "temperature": model_config.temperature,
156
- "top_k": -1, # MAI default
157
- "top_p": model_config.top_p,
158
- "max_tokens": model_config.max_tokens,
159
- }
160
-
161
- if mai_config.max_pixels:
162
- runtime_conf["max_pixels"] = mai_config.max_pixels
163
- if mai_config.min_pixels:
164
- runtime_conf["min_pixels"] = mai_config.min_pixels
165
-
166
- self.mai_agent: MAIUINaivigationAgent = MAIUINaivigationAgent(
167
- llm_base_url=model_config.base_url,
168
- model_name=model_config.model_name,
169
- runtime_conf=runtime_conf,
170
- tools=mai_config.tools,
171
- )
172
-
173
- # Create action handler (reuse from phone_agent)
174
- self.action_handler = ActionHandler(
175
- device_id=agent_config.device_id,
176
- confirmation_callback=confirmation_callback,
177
- takeover_callback=takeover_callback,
178
- )
179
-
180
- # State management
181
- self._step_count = 0
182
- self._current_instruction = ""
183
- self._on_thinking_chunk = on_thinking_chunk
184
-
185
- logger.info(
186
- f"MAI Agent adapter initialized for device {agent_config.device_id} "
187
- f"using model {model_config.model_name}"
188
- )
189
-
190
- def run(self, task: str) -> str:
191
- """Run the agent to complete a task.
192
-
193
- This method loops through steps until the task is finished
194
- or max_steps is reached.
195
-
196
- Args:
197
- task: Natural language description of the task.
198
-
199
- Returns:
200
- Final message from the agent.
201
- """
202
- self._current_instruction = task
203
- self.mai_agent.reset()
204
- self._step_count = 0
205
-
206
- while self._step_count < self.agent_config.max_steps:
207
- result = self._execute_step(is_first=(self._step_count == 0))
208
-
209
- if result.finished:
210
- return result.message or "Task completed"
211
-
212
- self._step_count += 1
213
-
214
- return "Max steps reached"
215
-
216
- def step(self, task: Optional[str] = None) -> StepResult:
217
- """Execute a single step.
218
-
219
- Args:
220
- task: Task description (only required for the first step).
221
-
222
- Returns:
223
- StepResult containing the action and thinking.
224
- """
225
- is_first = self._step_count == 0
226
-
227
- if is_first:
228
- if not task:
229
- raise ValueError("Task is required for the first step")
230
- self._current_instruction = task
231
- if len(self.mai_agent.traj_memory.steps) == 0:
232
- self.mai_agent.reset()
233
-
234
- result = self._execute_step(is_first=is_first)
235
- self._step_count += 1
236
- return result
237
-
238
- def reset(self) -> None:
239
- """Reset the agent state."""
240
- self.mai_agent.reset()
241
- self._step_count = 0
242
- self._current_instruction = ""
243
-
244
- def _execute_step(self, is_first: bool) -> StepResult:
245
- """Execute a single step (internal method).
246
-
247
- Args:
248
- is_first: Whether this is the first step.
249
-
250
- Returns:
251
- StepResult
252
- """
253
- # 1. Get current screenshot
254
- device_factory = get_device_factory()
255
- screenshot = device_factory.get_screenshot(self.agent_config.device_id)
256
-
257
- # 2. Convert base64_data to PIL Image
258
- # The Screenshot object contains base64_data, not pil_image
259
- image_data = base64.b64decode(screenshot.base64_data)
260
- pil_image = Image.open(BytesIO(image_data))
261
-
262
- # 3. Build observation dictionary
263
- obs = {
264
- "screenshot": pil_image,
265
- "accessibility_tree": None, # Not supported yet
266
- }
267
-
268
- # 4. Call MAI agent predict
269
- # IMPORTANT: Always pass self._current_instruction, not just on the first step.
270
- # MAI agent's _build_messages uses instruction to populate the primary user message,
271
- # and does not re-inject it from history. Without the instruction in subsequent steps,
272
- # the model would lose track of the task goal as history grows.
273
- try:
274
- prediction_text, action_dict = self.mai_agent.predict(
275
- instruction=self._current_instruction,
276
- obs=obs,
277
- )
278
- except Exception as e:
279
- logger.error(f"MAI agent prediction failed: {e}")
280
- return StepResult(
281
- success=False,
282
- finished=True,
283
- action=None,
284
- thinking="",
285
- message=f"Prediction error: {e}",
286
- )
287
-
288
- # 5. Extract thinking from prediction_text
289
- # MAI Agent uses <thinking> tags
290
- thinking = self._extract_thinking(prediction_text)
291
-
292
- # 6. Convert action format
293
- converted_action = self._convert_action(action_dict)
294
-
295
- # 7. Execute action
296
- try:
297
- action_result = self.action_handler.execute(
298
- converted_action,
299
- screenshot.width,
300
- screenshot.height,
301
- )
302
- except Exception as e:
303
- logger.error(f"Action execution failed: {e}")
304
- return StepResult(
305
- success=False,
306
- finished=True,
307
- action=converted_action,
308
- thinking=thinking,
309
- message=f"Action error: {e}",
310
- )
311
-
312
- # 8. Check if finished
313
- finished = (
314
- converted_action.get("_metadata") == "finish" or action_result.should_finish
315
- )
316
-
317
- return StepResult(
318
- success=action_result.success,
319
- finished=finished,
320
- action=converted_action,
321
- thinking=thinking,
322
- message=action_result.message,
323
- )
324
-
325
- def _convert_action(self, mai_action: dict[str, Any]) -> dict[str, Any]:
326
- """Convert MAI action format to PhoneAgent format.
327
-
328
- MAI format: {"action": "click", "coordinate": [x, y]}
329
- PhoneAgent format: {"_metadata": "do", "action": "Tap", "element": [x, y]}
330
-
331
- Coordinate conversion: MAI uses 0-999, PhoneAgent uses 0-1000.
332
-
333
- Args:
334
- mai_action: Action dictionary from MAI agent.
335
-
336
- Returns:
337
- Converted action dictionary for PhoneAgent.
338
- """
339
- action_type = mai_action.get("action")
340
-
341
- # Terminate action
342
- if action_type == "terminate":
343
- status = mai_action.get("status", "success")
344
- return {
345
- "_metadata": "finish",
346
- "message": "Task completed" if status == "success" else "Task failed",
347
- }
348
-
349
- # Answer action (no operation)
350
- if action_type == "answer":
351
- return {
352
- "_metadata": "finish",
353
- "message": mai_action.get("text", ""),
354
- }
355
-
356
- # Wait action
357
- if action_type == "wait":
358
- return {
359
- "_metadata": "do",
360
- "action": "Wait",
361
- "duration": "1 seconds",
362
- }
363
-
364
- # System button
365
- if action_type == "system_button":
366
- button_name = mai_action.get("button", "")
367
-
368
- # Special handling for Enter key
369
- # ActionHandler doesn't have an "Enter" handler, so we handle it directly here
370
- if button_name == "enter":
371
- # Use platform_utils to run ADB keyevent command
372
- from AutoGLM_GUI.platform_utils import run_cmd_silently_sync
373
-
374
- adb_prefix = (
375
- ["adb", "-s", self.agent_config.device_id]
376
- if self.agent_config.device_id
377
- else ["adb"]
378
- )
379
- run_cmd_silently_sync(
380
- adb_prefix + ["shell", "input", "keyevent", "KEYCODE_ENTER"],
381
- timeout=5,
382
- )
383
- # Return a Wait action to indicate success
384
- return {
385
- "_metadata": "do",
386
- "action": "Wait",
387
- "duration": "0.5 seconds",
388
- }
389
-
390
- # Other system buttons use standard handlers
391
- action_map = {
392
- "back": "Back",
393
- "home": "Home",
394
- }
395
- return {
396
- "_metadata": "do",
397
- "action": action_map.get(button_name, "Back"),
398
- }
399
-
400
- # Click-type actions (require coordinates)
401
- coordinate = mai_action.get("coordinate")
402
- if coordinate:
403
- # Coordinate conversion: 0-999 -> 0-1000
404
- x = self._convert_coordinate(coordinate[0])
405
- y = self._convert_coordinate(coordinate[1])
406
-
407
- if action_type == "click":
408
- return {
409
- "_metadata": "do",
410
- "action": "Tap",
411
- "element": [x, y],
412
- }
413
- elif action_type == "long_press":
414
- return {
415
- "_metadata": "do",
416
- "action": "Long Press",
417
- "element": [x, y],
418
- }
419
- elif action_type == "double_click":
420
- return {
421
- "_metadata": "do",
422
- "action": "Double Tap",
423
- "element": [x, y],
424
- }
425
-
426
- # Swipe action
427
- if action_type == "swipe":
428
- direction = mai_action.get("direction", "up")
429
- # Default to normalized center [0.5, 0.5], not [500, 500]
430
- # MAI coordinates are normalized to [0, 1], so we use normalized values
431
- coordinate = mai_action.get("coordinate") or [0.5, 0.5]
432
- x = self._convert_coordinate(coordinate[0])
433
- y = self._convert_coordinate(coordinate[1])
434
-
435
- start, end = self._calculate_swipe_coordinates(direction, x, y)
436
-
437
- return {
438
- "_metadata": "do",
439
- "action": "Swipe",
440
- "start": start,
441
- "end": end,
442
- }
443
-
444
- # Drag action
445
- if action_type == "drag":
446
- start_coord = mai_action.get("start_coordinate", [0, 0])
447
- end_coord = mai_action.get("end_coordinate", [0, 0])
448
-
449
- # IMPORTANT: start_coordinate and end_coordinate are NOT normalized by MAI.
450
- # They remain in SCALE_FACTOR range [0, 999], unlike the "coordinate" field
451
- # which is normalized to [0, 1]. We must use the scale factor conversion.
452
- start = [
453
- self._convert_coordinate_from_scale_factor(start_coord[0]),
454
- self._convert_coordinate_from_scale_factor(start_coord[1]),
455
- ]
456
- end = [
457
- self._convert_coordinate_from_scale_factor(end_coord[0]),
458
- self._convert_coordinate_from_scale_factor(end_coord[1]),
459
- ]
460
-
461
- return {
462
- "_metadata": "do",
463
- "action": "Swipe",
464
- "start": start,
465
- "end": end,
466
- }
467
-
468
- # Text input
469
- if action_type == "type":
470
- return {
471
- "_metadata": "do",
472
- "action": "Type",
473
- "text": mai_action.get("text", ""),
474
- }
475
-
476
- # Open app
477
- if action_type == "open":
478
- return {
479
- "_metadata": "do",
480
- "action": "Launch",
481
- "app": mai_action.get("text", ""),
482
- }
483
-
484
- # Unknown action - treat as finish
485
- logger.warning(f"Unknown MAI action type: {action_type}")
486
- return {
487
- "_metadata": "finish",
488
- "message": f"Unknown action: {action_type}",
489
- }
490
-
491
- def _convert_coordinate(self, coord: float) -> int:
492
- """Convert coordinate from MAI scale to PhoneAgent scale.
493
-
494
- MAI agent normalizes coordinates to [0, 1] in parse_action_to_structure_output.
495
- PhoneAgent uses normalized coordinates in [0, 1000] range.
496
-
497
- Args:
498
- coord: Coordinate in MAI scale [0, 1] (normalized).
499
-
500
- Returns:
501
- Coordinate in PhoneAgent scale [0, 1000].
502
-
503
- Example:
504
- >>> _convert_coordinate(0.5) # Center of screen
505
- 500
506
- """
507
- return int(coord * 1000)
508
-
509
- def _convert_coordinate_from_scale_factor(self, coord: float) -> int:
510
- """Convert coordinate from MAI SCALE_FACTOR to PhoneAgent scale.
511
-
512
- For drag actions, MAI does NOT normalize start_coordinate/end_coordinate.
513
- These coordinates remain in the SCALE_FACTOR range [0, 999].
514
- PhoneAgent uses normalized coordinates in [0, 1000] range.
515
-
516
- Args:
517
- coord: Coordinate in MAI SCALE_FACTOR [0, 999].
518
-
519
- Returns:
520
- Coordinate in PhoneAgent scale [0, 1000].
521
-
522
- Example:
523
- >>> _convert_coordinate_from_scale_factor(500) # Center of screen
524
- 500
525
- """
526
- SCALE_FACTOR = 999
527
- return int(coord * 1000 / SCALE_FACTOR)
528
-
529
- def _calculate_swipe_coordinates(
530
- self, direction: str, x: int, y: int
531
- ) -> Tuple[list[int], list[int]]:
532
- """Calculate swipe coordinates based on direction.
533
-
534
- Args:
535
- direction: Swipe direction (up, down, left, right).
536
- x: Center X coordinate.
537
- y: Center Y coordinate.
538
-
539
- Returns:
540
- Tuple of [start_x, start_y] and [end_x, end_y].
541
- """
542
- distance = 300 # Default swipe distance
543
-
544
- if direction == "up":
545
- start = [x, y + distance // 2]
546
- end = [x, y - distance // 2]
547
- elif direction == "down":
548
- start = [x, y - distance // 2]
549
- end = [x, y + distance // 2]
550
- elif direction == "left":
551
- start = [x + distance // 2, y]
552
- end = [x - distance // 2, y]
553
- elif direction == "right":
554
- start = [x - distance // 2, y]
555
- end = [x + distance // 2, y]
556
- else:
557
- start = [x, y]
558
- end = [x, y]
559
-
560
- return start, end
561
-
562
- def _extract_thinking(self, prediction_text: str) -> str:
563
- """Extract thinking content from agent response.
564
-
565
- MAI Agent format:
566
- <thinking>reasoning process</thinking>
567
- <tool_call>...</tool_call>
568
-
569
- GLM Agent format:
570
- ```
571
- 详细的推理过程...
572
- ```
573
- <answer>action</answer>
574
-
575
- Args:
576
- prediction_text: Full prediction text from agent.
577
-
578
- Returns:
579
- Thinking content (empty string if not found or truncated).
580
- """
581
- # Try <thinking> tags first (MAI Agent format)
582
- match = re.search(r"<thinking>(.*?)</thinking>", prediction_text, re.DOTALL)
583
- if match:
584
- thinking = match.group(1).strip()
585
- # Truncate if too long (MAI Agent can produce very long reasoning)
586
- if len(thinking) > 500:
587
- thinking = thinking[:500] + "..."
588
- return thinking
589
-
590
- # Fallback to ``` tags (GLM format)
591
- match = re.search(r"```(.*?)```", prediction_text, re.DOTALL)
592
- if match:
593
- thinking = match.group(1).strip()
594
- if len(thinking) > 500:
595
- thinking = thinking[:500] + "..."
596
- return thinking
597
-
598
- return ""
599
-
600
- @property
601
- def context(self) -> list[dict[str, Any]]:
602
- """Return trajectory history in PhoneAgent format (read-only).
603
-
604
- This property converts MAI's TrajMemory to PhoneAgent's context format.
605
-
606
- Returns:
607
- List of message dictionaries.
608
- """
609
- context = []
610
-
611
- for step in self.mai_agent.traj_memory.steps:
612
- # Assistant message
613
- if step.thought:
614
- content = f"<thinking>\n{step.thought}\n</thinking>\n<answer>\n{step.action}\n</answer>"
615
- context.append(
616
- {
617
- "role": "assistant",
618
- "content": content,
619
- }
620
- )
621
-
622
- return context
623
-
624
- @property
625
- def step_count(self) -> int:
626
- """Return current step count."""
627
- return self._step_count