autoglm-gui 1.4.1__py3-none-any.whl → 1.5.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (135) hide show
  1. AutoGLM_GUI/__init__.py +11 -0
  2. AutoGLM_GUI/__main__.py +26 -4
  3. AutoGLM_GUI/actions/__init__.py +6 -0
  4. phone_agent/actions/handler_ios.py → AutoGLM_GUI/actions/handler.py +30 -112
  5. AutoGLM_GUI/actions/types.py +15 -0
  6. {phone_agent → AutoGLM_GUI}/adb/__init__.py +25 -23
  7. {phone_agent → AutoGLM_GUI}/adb/connection.py +5 -40
  8. {phone_agent → AutoGLM_GUI}/adb/device.py +12 -94
  9. {phone_agent → AutoGLM_GUI}/adb/input.py +6 -47
  10. AutoGLM_GUI/adb/screenshot.py +11 -0
  11. {phone_agent/config → AutoGLM_GUI/adb}/timing.py +1 -1
  12. AutoGLM_GUI/adb_plus/keyboard_installer.py +4 -2
  13. AutoGLM_GUI/adb_plus/screenshot.py +22 -1
  14. AutoGLM_GUI/adb_plus/serial.py +38 -20
  15. AutoGLM_GUI/adb_plus/touch.py +4 -9
  16. AutoGLM_GUI/agents/__init__.py +43 -12
  17. AutoGLM_GUI/agents/events.py +19 -0
  18. AutoGLM_GUI/agents/factory.py +31 -38
  19. AutoGLM_GUI/agents/glm/__init__.py +7 -0
  20. AutoGLM_GUI/agents/glm/agent.py +297 -0
  21. AutoGLM_GUI/agents/glm/message_builder.py +81 -0
  22. AutoGLM_GUI/agents/glm/parser.py +110 -0
  23. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_en.py +7 -9
  24. {phone_agent/config → AutoGLM_GUI/agents/glm}/prompts_zh.py +18 -25
  25. AutoGLM_GUI/agents/mai/__init__.py +28 -0
  26. AutoGLM_GUI/agents/mai/agent.py +408 -0
  27. AutoGLM_GUI/agents/mai/parser.py +254 -0
  28. AutoGLM_GUI/agents/mai/prompts.py +103 -0
  29. AutoGLM_GUI/agents/mai/traj_memory.py +91 -0
  30. AutoGLM_GUI/agents/protocols.py +12 -8
  31. AutoGLM_GUI/agents/stream_runner.py +193 -0
  32. AutoGLM_GUI/api/__init__.py +40 -21
  33. AutoGLM_GUI/api/agents.py +181 -239
  34. AutoGLM_GUI/api/control.py +9 -6
  35. AutoGLM_GUI/api/devices.py +102 -12
  36. AutoGLM_GUI/api/history.py +104 -0
  37. AutoGLM_GUI/api/layered_agent.py +67 -15
  38. AutoGLM_GUI/api/media.py +64 -1
  39. AutoGLM_GUI/api/scheduled_tasks.py +98 -0
  40. AutoGLM_GUI/config.py +81 -0
  41. AutoGLM_GUI/config_manager.py +68 -51
  42. AutoGLM_GUI/device_manager.py +248 -29
  43. AutoGLM_GUI/device_protocol.py +1 -1
  44. AutoGLM_GUI/devices/adb_device.py +5 -10
  45. AutoGLM_GUI/devices/mock_device.py +4 -2
  46. AutoGLM_GUI/devices/remote_device.py +8 -3
  47. AutoGLM_GUI/history_manager.py +164 -0
  48. AutoGLM_GUI/model/__init__.py +5 -0
  49. AutoGLM_GUI/model/message_builder.py +69 -0
  50. AutoGLM_GUI/model/types.py +24 -0
  51. AutoGLM_GUI/models/__init__.py +10 -0
  52. AutoGLM_GUI/models/history.py +140 -0
  53. AutoGLM_GUI/models/scheduled_task.py +71 -0
  54. AutoGLM_GUI/parsers/__init__.py +22 -0
  55. AutoGLM_GUI/parsers/base.py +50 -0
  56. AutoGLM_GUI/parsers/phone_parser.py +58 -0
  57. AutoGLM_GUI/phone_agent_manager.py +62 -396
  58. AutoGLM_GUI/platform_utils.py +26 -0
  59. AutoGLM_GUI/prompt_config.py +15 -0
  60. AutoGLM_GUI/prompts/__init__.py +32 -0
  61. AutoGLM_GUI/scheduler_manager.py +350 -0
  62. AutoGLM_GUI/schemas.py +246 -72
  63. AutoGLM_GUI/scrcpy_stream.py +142 -24
  64. AutoGLM_GUI/socketio_server.py +100 -27
  65. AutoGLM_GUI/static/assets/{about-_XNhzQZX.js → about-CfwX1Cmc.js} +1 -1
  66. AutoGLM_GUI/static/assets/alert-dialog-CtGlN2IJ.js +1 -0
  67. AutoGLM_GUI/static/assets/chat-BYa-foUI.js +129 -0
  68. AutoGLM_GUI/static/assets/circle-alert-t08bEMPO.js +1 -0
  69. AutoGLM_GUI/static/assets/dialog-FNwZJFwk.js +45 -0
  70. AutoGLM_GUI/static/assets/eye-D0UPWCWC.js +1 -0
  71. AutoGLM_GUI/static/assets/history-CRo95B7i.js +1 -0
  72. AutoGLM_GUI/static/assets/{index-Cy8TmmHV.js → index-BaLMSqd3.js} +1 -1
  73. AutoGLM_GUI/static/assets/index-CTHbFvKl.js +11 -0
  74. AutoGLM_GUI/static/assets/index-CV7jGxGm.css +1 -0
  75. AutoGLM_GUI/static/assets/label-DJFevVmr.js +1 -0
  76. AutoGLM_GUI/static/assets/logs-RW09DyYY.js +1 -0
  77. AutoGLM_GUI/static/assets/popover--JTJrE5v.js +1 -0
  78. AutoGLM_GUI/static/assets/scheduled-tasks-DTRKsQXF.js +1 -0
  79. AutoGLM_GUI/static/assets/square-pen-CPK_K680.js +1 -0
  80. AutoGLM_GUI/static/assets/textarea-PRmVnWq5.js +1 -0
  81. AutoGLM_GUI/static/assets/workflows-CdcsAoaT.js +1 -0
  82. AutoGLM_GUI/static/index.html +2 -2
  83. AutoGLM_GUI/types.py +17 -0
  84. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/METADATA +179 -130
  85. autoglm_gui-1.5.1.dist-info/RECORD +118 -0
  86. AutoGLM_GUI/agents/mai_adapter.py +0 -627
  87. AutoGLM_GUI/api/dual_model.py +0 -317
  88. AutoGLM_GUI/device_adapter.py +0 -263
  89. AutoGLM_GUI/dual_model/__init__.py +0 -53
  90. AutoGLM_GUI/dual_model/decision_model.py +0 -664
  91. AutoGLM_GUI/dual_model/dual_agent.py +0 -917
  92. AutoGLM_GUI/dual_model/protocols.py +0 -354
  93. AutoGLM_GUI/dual_model/vision_model.py +0 -442
  94. AutoGLM_GUI/mai_ui_adapter/agent_wrapper.py +0 -291
  95. AutoGLM_GUI/phone_agent_patches.py +0 -147
  96. AutoGLM_GUI/static/assets/chat-DwJpiAWf.js +0 -126
  97. AutoGLM_GUI/static/assets/dialog-B3uW4T8V.js +0 -45
  98. AutoGLM_GUI/static/assets/index-Cpv2gSF1.css +0 -1
  99. AutoGLM_GUI/static/assets/index-UYYauTly.js +0 -12
  100. AutoGLM_GUI/static/assets/workflows-Du_de-dt.js +0 -1
  101. autoglm_gui-1.4.1.dist-info/RECORD +0 -117
  102. mai_agent/base.py +0 -137
  103. mai_agent/mai_grounding_agent.py +0 -263
  104. mai_agent/mai_naivigation_agent.py +0 -526
  105. mai_agent/prompt.py +0 -148
  106. mai_agent/unified_memory.py +0 -67
  107. mai_agent/utils.py +0 -73
  108. phone_agent/__init__.py +0 -12
  109. phone_agent/actions/__init__.py +0 -5
  110. phone_agent/actions/handler.py +0 -400
  111. phone_agent/adb/screenshot.py +0 -108
  112. phone_agent/agent.py +0 -253
  113. phone_agent/agent_ios.py +0 -277
  114. phone_agent/config/__init__.py +0 -53
  115. phone_agent/config/apps_harmonyos.py +0 -256
  116. phone_agent/config/apps_ios.py +0 -339
  117. phone_agent/config/prompts.py +0 -80
  118. phone_agent/device_factory.py +0 -166
  119. phone_agent/hdc/__init__.py +0 -53
  120. phone_agent/hdc/connection.py +0 -384
  121. phone_agent/hdc/device.py +0 -269
  122. phone_agent/hdc/input.py +0 -145
  123. phone_agent/hdc/screenshot.py +0 -127
  124. phone_agent/model/__init__.py +0 -5
  125. phone_agent/model/client.py +0 -290
  126. phone_agent/xctest/__init__.py +0 -47
  127. phone_agent/xctest/connection.py +0 -379
  128. phone_agent/xctest/device.py +0 -472
  129. phone_agent/xctest/input.py +0 -311
  130. phone_agent/xctest/screenshot.py +0 -226
  131. {phone_agent/config → AutoGLM_GUI/adb}/apps.py +0 -0
  132. {phone_agent/config → AutoGLM_GUI}/i18n.py +0 -0
  133. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/WHEEL +0 -0
  134. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/entry_points.txt +0 -0
  135. {autoglm_gui-1.4.1.dist-info → autoglm_gui-1.5.1.dist-info}/licenses/LICENSE +0 -0
@@ -1,15 +1,8 @@
1
- """System prompts for the AI agent."""
2
-
3
1
  from datetime import datetime
4
2
 
5
3
  today = datetime.today()
6
4
  weekday_names = ["星期一", "星期二", "星期三", "星期四", "星期五", "星期六", "星期日"]
7
5
  weekday = weekday_names[today.weekday()]
8
- # NOTE: Do NOT use strftime with Chinese characters in format string!
9
- # On some Windows systems with non-UTF-8 locale (e.g., GBK/CP936),
10
- # strftime("%Y年%m月%d日") raises UnicodeEncodeError because the C library's
11
- # strftime uses locale encoding, not Python's UTF-8 mode.
12
- # Use f-string instead to avoid this issue completely.
13
6
  formatted_date = f"{today.year}年{today.month:02d}月{today.day:02d}日 {weekday}"
14
7
 
15
8
  SYSTEM_PROMPT = (
@@ -26,37 +19,37 @@ SYSTEM_PROMPT = (
26
19
  - {action} 是本次执行的具体操作指令,必须严格遵循下方定义的指令格式。
27
20
 
28
21
  操作指令及其作用如下:
29
- - do(action="Launch", app="xxx")
22
+ - do(action=\"Launch\", app=\"xxx\")
30
23
  Launch是启动目标app的操作,这比通过主屏幕导航更快。此操作完成后,您将自动收到结果状态的截图。
31
- - do(action="Tap", element=[x,y])
24
+ - do(action=\"Tap\", element=[x,y])
32
25
  Tap是点击操作,点击屏幕上的特定点。可用此操作点击按钮、选择项目、从主屏幕打开应用程序,或与任何可点击的用户界面元素进行交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。
33
- - do(action="Tap", element=[x,y], message="重要操作")
26
+ - do(action=\"Tap\", element=[x,y], message=\"重要操作\")
34
27
  基本功能同Tap,点击涉及财产、支付、隐私等敏感按钮时触发。
35
- - do(action="Type", text="xxx")
28
+ - do(action=\"Type\", text=\"xxx\")
36
29
  Type是输入操作,在当前聚焦的输入框中输入文本。使用此操作前,请确保输入框已被聚焦(先点击它)。输入的文本将像使用键盘输入一样输入。重要提示:手机可能正在使用 ADB 键盘,该键盘不会像普通键盘那样占用屏幕空间。要确认键盘已激活,请查看屏幕底部是否显示 'ADB Keyboard {ON}' 类似的文本,或者检查输入框是否处于激活/高亮状态。不要仅仅依赖视觉上的键盘显示。自动清除文本:当你使用输入操作时,输入框中现有的任何文本(包括占位符文本和实际输入)都会在输入新文本前自动清除。你无需在输入前手动清除文本——直接使用输入操作输入所需文本即可。操作完成后,你将自动收到结果状态的截图。
37
- - do(action="Type_Name", text="xxx")
30
+ - do(action=\"Type_Name\", text=\"xxx\")
38
31
  Type_Name是输入人名的操作,基本功能同Type。
39
- - do(action="Interact")
32
+ - do(action=\"Interact\")
40
33
  Interact是当有多个满足条件的选项时而触发的交互操作,询问用户如何选择。
41
- - do(action="Swipe", start=[x1,y1], end=[x2,y2])
34
+ - do(action=\"Swipe\", start=[x1,y1], end=[x2,y2])
42
35
  Swipe是滑动操作,通过从起始坐标拖动到结束坐标来执行滑动手势。可用于滚动内容、在屏幕之间导航、下拉通知栏以及项目栏或进行基于手势的导航。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。滑动持续时间会自动调整以实现自然的移动。此操作完成后,您将自动收到结果状态的截图。
43
- - do(action="Note", message="True")
36
+ - do(action=\"Note\", message=\"True\")
44
37
  记录当前页面内容以便后续总结。
45
- - do(action="Call_API", instruction="xxx")
38
+ - do(action=\"Call_API\", instruction=\"xxx\")
46
39
  总结或评论当前页面或已记录的内容。
47
- - do(action="Long Press", element=[x,y])
40
+ - do(action=\"Long Press\", element=[x,y])
48
41
  Long Pres是长按操作,在屏幕上的特定点长按指定时间。可用于触发上下文菜单、选择文本或激活长按交互。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的屏幕截图。
49
- - do(action="Double Tap", element=[x,y])
42
+ - do(action=\"Double Tap\", element=[x,y])
50
43
  Double Tap在屏幕上的特定点快速连续点按两次。使用此操作可以激活双击交互,如缩放、选择文本或打开项目。坐标系统从左上角 (0,0) 开始到右下角(999,999)结束。此操作完成后,您将自动收到结果状态的截图。
51
- - do(action="Take_over", message="xxx")
44
+ - do(action=\"Take_over\", message=\"xxx\")
52
45
  Take_over是接管操作,表示在登录和验证阶段需要用户协助。
53
- - do(action="Back")
46
+ - do(action=\"Back\")
54
47
  导航返回到上一个屏幕或关闭当前对话框。相当于按下 Android 的返回按钮。使用此操作可以从更深的屏幕返回、关闭弹出窗口或退出当前上下文。此操作完成后,您将自动收到结果状态的截图。
55
- - do(action="Home")
48
+ - do(action=\"Home\")
56
49
  Home是回到系统桌面的操作,相当于按下 Android 主屏幕按钮。使用此操作可退出当前应用并返回启动器,或从已知状态启动新任务。此操作完成后,您将自动收到结果状态的截图。
57
- - do(action="Wait", duration="x seconds")
50
+ - do(action=\"Wait\", duration=\"x seconds\")
58
51
  等待页面加载,x为需要等待多少秒。
59
- - finish(message="xxx")
52
+ - finish(message=\"xxx\")
60
53
  finish是结束任务的操作,表示准确完整完成任务,message是终止信息。
61
54
 
62
55
  必须遵循的规则:
@@ -70,13 +63,13 @@ SYSTEM_PROMPT = (
70
63
  8. 购物车全选后再点击全选可以把状态设为全不选,在做购物车任务时,如果购物车里已经有商品被选中时,你需要点击全选后再点击取消全选,再去找需要购买或者删除的商品。
71
64
  9. 在做外卖任务时,如果相应店铺购物车里已经有其他商品你需要先把购物车清空再去购买用户指定的外卖。
72
65
  10. 在做点外卖任务时,如果用户需要点多个外卖,请尽量在同一店铺进行购买,如果无法找到可以下单,并说明某个商品未找到。
73
- 11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将""字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。
66
+ 11. 请严格遵循用户意图执行任务,用户的特殊要求可以执行多次搜索,滑动查找。比如(i)用户要求点一杯咖啡,要咸的,你可以直接搜索咸咖啡,或者搜索咖啡后滑动查找咸的咖啡,比如海盐咖啡。(ii)用户要找到XX群,发一条消息,你可以先搜索XX群,找不到结果后,将\"群\"字去掉,搜索XX重试。(iii)用户要找到宠物友好的餐厅,你可以搜索餐厅,找到筛选,找到设施,选择可带宠物,或者直接搜索可带宠物,必要时可以使用AI搜索。
74
67
  12. 在选择日期时,如果原滑动方向与预期日期越来越远,请向反方向滑动查找。
75
68
  13. 执行任务过程中如果有多个可选择的项目栏,请逐个查找每个项目栏,直到完成任务,一定不要在同一项目栏多次查找,从而陷入死循环。
76
69
  14. 在执行下一步操作前请一定要检查上一步的操作是否生效,如果点击没生效,可能因为app反应较慢,请先稍微等待一下,如果还是不生效请调整一下点击位置重试,如果仍然不生效请跳过这一步继续任务,并在finish message说明点击不生效。
77
70
  15. 在执行任务中如果遇到滑动不生效的情况,请调整一下起始点位置,增大滑动距离重试,如果还是不生效,有可能是已经滑到底了,请继续向反方向滑动,直到顶部或底部,如果仍然没有符合要求的结果,请跳过这一步继续任务,并在finish message说明但没找到要求的项目。
78
71
  16. 在做游戏任务时如果在战斗页面如果有自动战斗一定要开启自动战斗,如果多轮历史状态相似要检查自动战斗是否开启。
79
- 17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message="原因")。
72
+ 17. 如果没有合适的搜索结果,可能是因为搜索页面不对,请返回到搜索页面的上一级尝试重新搜索,如果尝试三次返回上一级搜索后仍然没有符合要求的结果,执行 finish(message=\"原因\")。
80
73
  18. 在结束任务前请一定要仔细检查任务是否完整准确的完成,如果出现错选、漏选、多选的情况,请返回之前的步骤进行纠正。
81
74
  """
82
75
  )
@@ -0,0 +1,28 @@
1
+ """MAI Agent - Internal implementation.
2
+
3
+ This module co-locates all MAI-specific code:
4
+ - InternalMAIAgent: The main agent implementation
5
+ - MAIParser: XML+JSON format parser with coordinate conversion (0-999 to 0-1000)
6
+ - MAI_MOBILE_SYSTEM_PROMPT: System prompt for Chinese environments
7
+ - TrajMemory, TrajStep: Trajectory memory for multi-step tasks
8
+
9
+ Design notes:
10
+ - MAI uses 0-999 coordinate system (normalized internally)
11
+ - Supports multi-image history context (configurable via history_n)
12
+ - Chinese-optimized prompts for domestic app scenarios
13
+ - Internal implementation replacing third-party mai_agent dependency
14
+ """
15
+
16
+ from .agent import InternalMAIAgent
17
+ from .parser import MAIParser, MAIParseError
18
+ from .prompts import MAI_MOBILE_SYSTEM_PROMPT
19
+ from .traj_memory import TrajMemory, TrajStep
20
+
21
+ __all__ = [
22
+ "InternalMAIAgent",
23
+ "MAIParser",
24
+ "MAIParseError",
25
+ "MAI_MOBILE_SYSTEM_PROMPT",
26
+ "TrajMemory",
27
+ "TrajStep",
28
+ ]
@@ -0,0 +1,408 @@
1
+ """Internal MAI Agent Implementation
2
+
3
+ 完全内部化实现的 MAI Agent,替代第三方 mai_agent 依赖。
4
+
5
+ 核心特性:
6
+ - 多图像历史上下文(保留最近 N 张截图)
7
+ - XML 格式的思考过程和动作输出
8
+ - 999 坐标系统归一化
9
+ - 自动重试机制
10
+ """
11
+
12
+ import base64
13
+ import time
14
+ import traceback
15
+ from io import BytesIO
16
+ from typing import Any, Callable
17
+
18
+ from openai import OpenAI
19
+ from PIL import Image
20
+
21
+ from AutoGLM_GUI.actions import ActionHandler, ActionResult
22
+ from AutoGLM_GUI.config import AgentConfig, ModelConfig, StepResult
23
+ from AutoGLM_GUI.device_protocol import DeviceProtocol
24
+ from AutoGLM_GUI.logger import logger
25
+ from AutoGLM_GUI.model import MessageBuilder
26
+
27
+ from .traj_memory import TrajMemory, TrajStep
28
+ from .parser import MAIParseError, MAIParser
29
+ from .prompts import MAI_MOBILE_SYSTEM_PROMPT
30
+
31
+
32
class InternalMAIAgent:
    """Screenshot-driven agent backed by an OpenAI-compatible chat model.

    Each step captures a screenshot from the device, sends it together with
    the task goal and a window of previous steps to the model, parses the
    streamed reply with ``MAIParser`` and executes the resulting action via
    ``ActionHandler``.
    """

    def __init__(
        self,
        model_config: ModelConfig,
        agent_config: AgentConfig,
        device: DeviceProtocol,
        history_n: int = 3,
        confirmation_callback: Callable[[str], bool] | None = None,
        takeover_callback: Callable[[str], None] | None = None,
        thinking_callback: Callable[[str], None] | None = None,
    ):
        """Create the agent.

        Args:
            model_config: Endpoint, credentials and sampling parameters used
                for every chat-completion request.
            agent_config: Agent settings; this class reads ``max_steps``,
                ``verbose`` and ``system_prompt`` from it.
            device: Device used for screenshots, current-app lookup and
                action execution.
            history_n: Size of the screenshot window sent to the model; the
                current screenshot counts as one, so ``history_n - 1``
                previous steps are requested from the trajectory memory.
            confirmation_callback: Forwarded to ``ActionHandler``.
            takeover_callback: Forwarded to ``ActionHandler``.
            thinking_callback: Receives streamed thinking text as it arrives.
        """
        self.model_config = model_config
        self.agent_config = agent_config
        self.history_n = history_n

        self.openai_client = OpenAI(
            base_url=model_config.base_url,
            api_key=model_config.api_key,
            timeout=120,
        )
        self.parser = MAIParser()

        self.device = device
        self.action_handler = ActionHandler(
            device=self.device,
            confirmation_callback=confirmation_callback,
            takeover_callback=takeover_callback,
        )

        self.traj_memory = TrajMemory(task_goal="", task_id="", steps=[])
        self._step_count = 0
        self._is_running = False
        self._thinking_callback = thinking_callback

        # Cumulative statistics printed when a task finishes in verbose mode.
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        # NOTE(review): nothing in this class ever increments the token
        # counter, so the token line of the statistics is never printed.
        self._total_tokens = 0

    def run(self, task: str) -> str:
        """Run ``task`` until it finishes, is aborted, or hits ``max_steps``.

        Starts from a fresh trajectory memory and step counter.

        Returns:
            The finishing step's message (or "Task completed" when it has
            none), "Task aborted" when ``abort()`` stopped the loop, or
            "Max steps reached".
        """
        self.traj_memory = TrajMemory(task_goal=task, task_id="", steps=[])
        self._step_count = 0
        self._is_running = True

        try:
            result = self._execute_step(task, is_first=True)
            if result.finished:
                return result.message or "Task completed"

            while self._step_count < self.agent_config.max_steps and self._is_running:
                result = self._execute_step(is_first=False)
                if result.finished:
                    return result.message or "Task completed"

            # The loop also ends when abort() clears the running flag; do not
            # report that case as having exhausted the step budget.
            if not self._is_running:
                return "Task aborted"
            return "Max steps reached"
        finally:
            self._is_running = False

    def step(self, task: str | None = None) -> StepResult:
        """Execute a single step.

        Args:
            task: Required while no step has been recorded yet. On later
                calls a non-empty value replaces the stored task goal
                (multi-turn conversation with a new user message).

        Raises:
            ValueError: If no step has been recorded and ``task`` is empty.
        """
        is_first = not self.traj_memory.steps

        if is_first and not task:
            raise ValueError("Task is required for the first step")

        if is_first:
            self.traj_memory.task_goal = task or ""
        elif task:
            # Multi-turn conversation: a new user message updates the goal.
            self.traj_memory.task_goal = task

        return self._execute_step(task, is_first)

    def reset(self) -> None:
        """Clear the trajectory memory, counters and accumulated statistics."""
        self.traj_memory.clear()
        self._step_count = 0
        self._is_running = False
        self._total_llm_time = 0.0
        self._total_action_time = 0.0
        self._total_tokens = 0

    def abort(self) -> None:
        """Ask a running ``run()`` loop to stop before starting another step."""
        self._is_running = False
        logger.info("InternalMAIAgent aborted by user")

    def _stream_request(
        self,
        messages: list[dict[str, Any]],
        on_thinking_chunk: Callable[[str], None] | None = None,
    ) -> str:
        """Stream one chat completion and return the full raw text.

        Text that arrives before the first action marker (``</thinking>`` or
        ``<tool_call>``) is forwarded to ``on_thinking_chunk`` incrementally.
        A tail that could be the beginning of a marker is held back until the
        next chunk disambiguates it. Everything from the marker onwards is
        only accumulated into the return value.
        """
        stream = self.openai_client.chat.completions.create(
            messages=messages,  # type: ignore[arg-type]
            model=self.model_config.model_name,
            max_tokens=self.model_config.max_tokens,
            temperature=self.model_config.temperature,
            top_p=self.model_config.top_p,
            frequency_penalty=self.model_config.frequency_penalty,
            extra_body=self.model_config.extra_body,
            stream=True,
        )

        action_markers = ("</thinking>", "<tool_call>")
        raw_parts: list[str] = []
        buffer = ""
        in_action_phase = False

        for chunk in stream:
            if not chunk.choices:
                continue
            content = chunk.choices[0].delta.content
            if content is None:
                continue
            raw_parts.append(content)

            if in_action_phase:
                continue

            buffer += content

            # Cut at whichever marker appears first in the buffered text.
            marker_positions = [
                buffer.find(marker) for marker in action_markers if marker in buffer
            ]
            if marker_positions:
                thinking_part = buffer[: min(marker_positions)]
                if on_thinking_chunk and thinking_part:
                    on_thinking_chunk(thinking_part)
                in_action_phase = True
                buffer = ""
                continue

            # Hold the buffer back while its tail is a proper prefix of a
            # marker, so a marker split across chunks is never emitted.
            if any(
                buffer.endswith(marker[:i])
                for marker in action_markers
                for i in range(1, len(marker))
            ):
                continue

            if on_thinking_chunk and buffer:
                on_thinking_chunk(buffer)
            buffer = ""

        # The stream ended while text was still held back as a possible
        # marker prefix; it was plain thinking text after all, so emit it.
        if not in_action_phase and buffer and on_thinking_chunk:
            on_thinking_chunk(buffer)

        return "".join(raw_parts)

    def _execute_step(
        self, user_prompt: str | None = None, is_first: bool = False
    ) -> StepResult:
        """Capture the screen, query the model and execute the parsed action.

        The model call plus parsing is attempted up to three times. When all
        attempts fail, a finished, unsuccessful ``StepResult`` is returned
        instead of raising.
        """
        self._step_count += 1

        screenshot = self.device.get_screenshot()
        current_app = self.device.get_current_app()

        screenshot_bytes = base64.b64decode(screenshot.base64_data)
        pil_image = Image.open(BytesIO(screenshot_bytes))

        if is_first:
            instruction = user_prompt or self.traj_memory.task_goal
        else:
            instruction = self.traj_memory.task_goal

        screen_info = MessageBuilder.build_screen_info(current_app)

        messages = self._build_messages(
            instruction=instruction,
            screen_info=screen_info,
            current_screenshot_base64=screenshot.base64_data,
        )

        max_retries = 3
        raw_content = ""
        thinking = ""
        raw_action = None
        converted_action = None

        # Without an explicit callback, verbose mode echoes thinking to stdout.
        callback = self._thinking_callback
        if callback is None and self.agent_config.verbose:

            def print_chunk(chunk: str) -> None:
                print(chunk, end="", flush=True)

            callback = print_chunk

        for attempt in range(max_retries):
            try:
                if self.agent_config.verbose:
                    retry_info = (
                        f" (尝试 {attempt + 1}/{max_retries})" if attempt > 0 else ""
                    )
                    print("\n" + "=" * 50)
                    print(f"💭 步骤 {self._step_count}{retry_info} - 思考中...")
                    print("-" * 50)

                # perf_counter is monotonic, so a system clock adjustment
                # cannot produce a negative or inflated duration.
                llm_start = time.perf_counter()
                raw_content = self._stream_request(messages, on_thinking_chunk=callback)
                llm_time = time.perf_counter() - llm_start
                self._total_llm_time += llm_time

                if self.agent_config.verbose:
                    print(f"\n⏱️ LLM 耗时: {llm_time:.2f}s")

                parsed = self.parser.parse_with_thinking(raw_content)
                thinking = parsed["thinking"]
                raw_action = parsed["raw_action"]
                converted_action = parsed["converted_action"]
                break

            except MAIParseError as e:
                if self.agent_config.verbose:
                    logger.warning(f"解析失败 (尝试 {attempt + 1}/{max_retries}): {e}")
                if attempt == max_retries - 1:
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Parse error after {max_retries} retries: {e}",
                    )

            except Exception as e:
                # Boundary handler: any model/transport failure is retried and
                # finally reported through the StepResult rather than raised.
                if self.agent_config.verbose:
                    logger.warning(
                        f"模型调用失败 (尝试 {attempt + 1}/{max_retries}): {e}"
                    )
                if attempt == max_retries - 1:
                    if self.agent_config.verbose:
                        traceback.print_exc()
                    return StepResult(
                        success=False,
                        finished=True,
                        action=None,
                        thinking="",
                        message=f"Model error after {max_retries} retries: {e}",
                    )

        if not raw_content or raw_action is None or converted_action is None:
            return StepResult(
                success=False,
                finished=True,
                action=None,
                thinking=thinking,
                message="Failed to get valid response after retries",
            )

        if self.agent_config.verbose:
            print()
            print("-" * 50)
            print("🎯 动作:")
            print(f" 原始: {raw_action}")
            print(f" 转换: {converted_action}")
            print("=" * 50 + "\n")

        # Record the step before executing it, so the history contains the
        # action even when execution fails below.
        traj_step = TrajStep(
            screenshot=pil_image,
            accessibility_tree=None,
            prediction=raw_content,
            action=raw_action,
            conclusion="",
            thought=thinking,
            step_index=self._step_count - 1,
            agent_type="InternalMAIAgent",
            model_name=self.model_config.model_name,
            screenshot_bytes=screenshot_bytes,
            structured_action={"action_json": raw_action},
        )
        self.traj_memory.add_step(traj_step)

        try:
            action_start = time.perf_counter()
            result = self.action_handler.execute(
                converted_action, screenshot.width, screenshot.height
            )
            action_time = time.perf_counter() - action_start
            self._total_action_time += action_time

            if self.agent_config.verbose:
                print(f"⚡ 动作执行耗时: {action_time:.2f}s")
        except Exception as e:
            if self.agent_config.verbose:
                traceback.print_exc()
            # An execution failure ends the task and carries the error text.
            result = ActionResult(success=False, should_finish=True, message=str(e))

        finished = converted_action.get("_metadata") == "finish" or result.should_finish

        if finished and self.agent_config.verbose:
            print("\n" + "🎉 " + "=" * 48)
            print(
                f"✅ 任务完成: {result.message or converted_action.get('message', '完成')}"
            )
            print("=" * 50)
            print("\n📊 性能统计:")
            print(f" 总步数: {self._step_count}")
            print(f" 总 LLM 耗时: {self._total_llm_time:.2f}s")
            print(f" 总动作耗时: {self._total_action_time:.2f}s")
            print(
                f" 平均每步耗时: {(self._total_llm_time + self._total_action_time) / self._step_count:.2f}s"
            )
            if self._total_tokens > 0:
                print(f" 总 Token 使用: {self._total_tokens}")
            print("=" * 50 + "\n")

        return StepResult(
            success=result.success,
            finished=finished,
            action=converted_action,
            thinking=thinking,
            message=result.message or converted_action.get("message"),
        )

    def _build_messages(
        self, instruction: str, screen_info: str, current_screenshot_base64: str
    ) -> list[dict[str, Any]]:
        """Assemble the chat messages for one model request.

        Layout: system prompt, a user message with the instruction and screen
        info, then for each remembered step a user message with that step's
        screenshot followed by the assistant reply rebuilt from its thought
        and action, and finally a user message with the current screenshot.
        """
        import json

        system_prompt = self.agent_config.system_prompt or MAI_MOBILE_SYSTEM_PROMPT

        messages: list[dict[str, Any]] = [
            MessageBuilder.create_system_message(system_prompt),
            MessageBuilder.create_user_message(f"{instruction}\n\n{screen_info}"),
        ]

        # The current screenshot takes one slot of the history_n window.
        # NOTE(review): with history_n <= 0 this passes a negative count to
        # TrajMemory; its handling of that is not visible here - confirm.
        history_window = self.history_n - 1
        history_images = self.traj_memory.get_history_images(history_window)
        history_thoughts = self.traj_memory.get_history_thoughts(history_window)
        history_actions = self.traj_memory.get_history_actions(history_window)

        for img_bytes, thought, action in zip(
            history_images, history_thoughts, history_actions
        ):
            img_base64 = base64.b64encode(img_bytes).decode("utf-8")
            # NOTE(review): historical screenshots are paired with the
            # *current* screen_info text; confirm this is intended.
            messages.append(
                MessageBuilder.create_user_message(
                    text=screen_info, image_base64=img_base64
                )
            )

            tool_call_dict = {
                "name": "mobile_use",
                "arguments": action,
            }
            tool_call_json = json.dumps(tool_call_dict, separators=(",", ":"))
            assistant_content = (
                f"<thinking>\n{thought}\n</thinking>\n"
                f"<tool_call>\n{tool_call_json}\n</tool_call>"
            )
            messages.append(MessageBuilder.create_assistant_message(assistant_content))

        messages.append(
            MessageBuilder.create_user_message(
                text=screen_info, image_base64=current_screenshot_base64
            )
        )

        return messages

    @property
    def context(self) -> list[dict[str, Any]]:
        """Step index, thought and action of every recorded step."""
        return [
            {
                "step": step.step_index,
                "thought": step.thought,
                "action": step.action,
            }
            for step in self.traj_memory.steps
        ]

    @property
    def step_count(self) -> int:
        """Number of steps started since the last ``run()`` or ``reset()``."""
        return self._step_count

    @property
    def is_running(self) -> bool:
        """Whether a ``run()`` loop is currently active."""
        return self._is_running