@xagent-ai/cli 1.2.2 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (568)
  1. package/.github/ISSUE_TEMPLATE/bug_report.md +38 -38
  2. package/.github/ISSUE_TEMPLATE/feature_request.md +20 -20
  3. package/.github/workflows/ci.yml +72 -0
  4. package/.github/workflows/release.yml +109 -0
  5. package/.gitmodules +3 -3
  6. package/README.md +326 -280
  7. package/README_CN.md +325 -279
  8. package/dist/ai-client/factory.d.ts +52 -0
  9. package/dist/ai-client/factory.d.ts.map +1 -0
  10. package/dist/ai-client/factory.js +132 -0
  11. package/dist/ai-client/factory.js.map +1 -0
  12. package/dist/ai-client/index.d.ts +20 -0
  13. package/dist/ai-client/index.d.ts.map +1 -0
  14. package/dist/ai-client/index.js +49 -0
  15. package/dist/ai-client/index.js.map +1 -0
  16. package/dist/ai-client/providers/anthropic.d.ts +57 -0
  17. package/dist/ai-client/providers/anthropic.d.ts.map +1 -0
  18. package/dist/ai-client/providers/anthropic.js +400 -0
  19. package/dist/ai-client/providers/anthropic.js.map +1 -0
  20. package/dist/ai-client/providers/openai.d.ts +57 -0
  21. package/dist/ai-client/providers/openai.d.ts.map +1 -0
  22. package/dist/ai-client/providers/openai.js +286 -0
  23. package/dist/ai-client/providers/openai.js.map +1 -0
  24. package/dist/ai-client/providers/remote.d.ts +111 -0
  25. package/dist/ai-client/providers/remote.d.ts.map +1 -0
  26. package/dist/ai-client/providers/remote.js +351 -0
  27. package/dist/ai-client/providers/remote.js.map +1 -0
  28. package/dist/ai-client/registry.d.ts +51 -0
  29. package/dist/ai-client/registry.d.ts.map +1 -0
  30. package/dist/ai-client/registry.js +81 -0
  31. package/dist/ai-client/registry.js.map +1 -0
  32. package/dist/ai-client/types.d.ts +260 -0
  33. package/dist/ai-client/types.d.ts.map +1 -0
  34. package/dist/ai-client/types.js +73 -0
  35. package/dist/ai-client/types.js.map +1 -0
  36. package/dist/ai-client-factory.d.ts +62 -0
  37. package/dist/ai-client-factory.d.ts.map +1 -0
  38. package/dist/ai-client-factory.js +157 -0
  39. package/dist/ai-client-factory.js.map +1 -0
  40. package/dist/auth.d.ts +23 -1
  41. package/dist/auth.d.ts.map +1 -1
  42. package/dist/auth.js +160 -168
  43. package/dist/auth.js.map +1 -1
  44. package/dist/cancellation.d.ts +5 -4
  45. package/dist/cancellation.d.ts.map +1 -1
  46. package/dist/cancellation.js +55 -32
  47. package/dist/cancellation.js.map +1 -1
  48. package/dist/checkpoint.d.ts +1 -1
  49. package/dist/checkpoint.d.ts.map +1 -1
  50. package/dist/checkpoint.js +2 -2
  51. package/dist/checkpoint.js.map +1 -1
  52. package/dist/cli.js +626 -13
  53. package/dist/cli.js.map +1 -1
  54. package/dist/config.d.ts +10 -4
  55. package/dist/config.d.ts.map +1 -1
  56. package/dist/config.js +62 -25
  57. package/dist/config.js.map +1 -1
  58. package/dist/context-compressor.d.ts +81 -16
  59. package/dist/context-compressor.d.ts.map +1 -1
  60. package/dist/context-compressor.js +712 -153
  61. package/dist/context-compressor.js.map +1 -1
  62. package/dist/gui-subagent/action-parser/actionParser.d.ts.map +1 -1
  63. package/dist/gui-subagent/action-parser/actionParser.js +4 -2
  64. package/dist/gui-subagent/action-parser/actionParser.js.map +1 -1
  65. package/dist/gui-subagent/agent/gui-agent.d.ts +29 -2
  66. package/dist/gui-subagent/agent/gui-agent.d.ts.map +1 -1
  67. package/dist/gui-subagent/agent/gui-agent.js +87 -45
  68. package/dist/gui-subagent/agent/gui-agent.js.map +1 -1
  69. package/dist/gui-subagent/index.d.ts +16 -1
  70. package/dist/gui-subagent/index.d.ts.map +1 -1
  71. package/dist/gui-subagent/index.js +4 -0
  72. package/dist/gui-subagent/index.js.map +1 -1
  73. package/dist/gui-subagent/operator/base-operator.d.ts.map +1 -1
  74. package/dist/gui-subagent/operator/base-operator.js +0 -1
  75. package/dist/gui-subagent/operator/base-operator.js.map +1 -1
  76. package/dist/gui-subagent/operator/computer-operator.d.ts.map +1 -1
  77. package/dist/gui-subagent/operator/computer-operator.js +29 -8
  78. package/dist/gui-subagent/operator/computer-operator.js.map +1 -1
  79. package/dist/gui-subagent/types/actions.d.ts +1 -1
  80. package/dist/gui-subagent/types/actions.d.ts.map +1 -1
  81. package/dist/gui-subagent/types/actions.js +0 -1
  82. package/dist/gui-subagent/types/actions.js.map +1 -1
  83. package/dist/gui-subagent/types/operator.d.ts +1 -1
  84. package/dist/gui-subagent/types/operator.d.ts.map +1 -1
  85. package/dist/index.d.ts +1 -2
  86. package/dist/index.d.ts.map +1 -1
  87. package/dist/index.js +1 -2
  88. package/dist/index.js.map +1 -1
  89. package/dist/input-processor.d.ts.map +1 -1
  90. package/dist/input-processor.js +6 -3
  91. package/dist/input-processor.js.map +1 -1
  92. package/dist/mcp.d.ts +5 -0
  93. package/dist/mcp.d.ts.map +1 -1
  94. package/dist/mcp.js +81 -35
  95. package/dist/mcp.js.map +1 -1
  96. package/dist/ripgrep.d.ts +29 -0
  97. package/dist/ripgrep.d.ts.map +1 -0
  98. package/dist/ripgrep.js +292 -0
  99. package/dist/ripgrep.js.map +1 -0
  100. package/dist/session.d.ts +23 -7
  101. package/dist/session.d.ts.map +1 -1
  102. package/dist/session.js +624 -243
  103. package/dist/session.js.map +1 -1
  104. package/dist/shell.d.ts +33 -0
  105. package/dist/shell.d.ts.map +1 -0
  106. package/dist/shell.js +125 -0
  107. package/dist/shell.js.map +1 -0
  108. package/dist/skill-installer.d.ts +38 -0
  109. package/dist/skill-installer.d.ts.map +1 -0
  110. package/dist/skill-installer.js +447 -0
  111. package/dist/skill-installer.js.map +1 -0
  112. package/dist/skill-invoker.d.ts +7 -1
  113. package/dist/skill-invoker.d.ts.map +1 -1
  114. package/dist/skill-invoker.js +34 -13
  115. package/dist/skill-invoker.js.map +1 -1
  116. package/dist/skill-loader.d.ts +8 -3
  117. package/dist/skill-loader.d.ts.map +1 -1
  118. package/dist/skill-loader.js +46 -44
  119. package/dist/skill-loader.js.map +1 -1
  120. package/dist/skill-manager.d.ts +85 -0
  121. package/dist/skill-manager.d.ts.map +1 -0
  122. package/dist/skill-manager.js +340 -0
  123. package/dist/skill-manager.js.map +1 -0
  124. package/dist/slash-commands.d.ts +38 -1
  125. package/dist/slash-commands.d.ts.map +1 -1
  126. package/dist/slash-commands.js +912 -296
  127. package/dist/slash-commands.js.map +1 -1
  128. package/dist/smart-approval.d.ts.map +1 -1
  129. package/dist/smart-approval.js +67 -55
  130. package/dist/smart-approval.js.map +1 -1
  131. package/dist/system-prompt-generator.d.ts +6 -0
  132. package/dist/system-prompt-generator.d.ts.map +1 -1
  133. package/dist/system-prompt-generator.js +84 -34
  134. package/dist/system-prompt-generator.js.map +1 -1
  135. package/dist/terminal.d.ts +28 -0
  136. package/dist/terminal.d.ts.map +1 -0
  137. package/dist/terminal.js +82 -0
  138. package/dist/terminal.js.map +1 -0
  139. package/dist/tools.d.ts +23 -7
  140. package/dist/tools.d.ts.map +1 -1
  141. package/dist/tools.js +797 -437
  142. package/dist/tools.js.map +1 -1
  143. package/dist/truncate.d.ts +55 -0
  144. package/dist/truncate.d.ts.map +1 -0
  145. package/dist/truncate.js +130 -0
  146. package/dist/truncate.js.map +1 -0
  147. package/dist/types.d.ts +27 -9
  148. package/dist/types.d.ts.map +1 -1
  149. package/dist/update.d.ts.map +1 -1
  150. package/dist/update.js +17 -28
  151. package/dist/update.js.map +1 -1
  152. package/dist/workflow.d.ts +5 -1
  153. package/dist/workflow.d.ts.map +1 -1
  154. package/dist/workflow.js +60 -47
  155. package/dist/workflow.js.map +1 -1
  156. package/docs/architecture/mcp-integration-guide.md +304 -194
  157. package/docs/architecture/overview.md +169 -169
  158. package/docs/architecture/tool-system-design.md +134 -134
  159. package/docs/cli/commands.md +349 -238
  160. package/docs/smart-mode.md +281 -281
  161. package/docs/third-party-models.md +439 -439
  162. package/find-skills/SKILL.md +133 -0
  163. package/package.json +89 -90
  164. package/scripts/install-ripgrep.js +241 -0
  165. package/src/ai-client/factory.ts +151 -0
  166. package/src/ai-client/index.ts +61 -0
  167. package/src/ai-client/providers/anthropic.ts +466 -0
  168. package/src/ai-client/providers/openai.ts +342 -0
  169. package/src/ai-client/providers/remote.ts +436 -0
  170. package/src/ai-client/registry.ts +97 -0
  171. package/src/ai-client/types.ts +345 -0
  172. package/src/ai-client-factory.ts +204 -0
  173. package/src/auth.ts +663 -614
  174. package/src/cancellation.ts +205 -176
  175. package/src/checkpoint.ts +219 -219
  176. package/src/cli.ts +1406 -743
  177. package/src/config.ts +341 -297
  178. package/src/context-compressor.ts +982 -290
  179. package/src/conversation.ts +288 -288
  180. package/src/gui-subagent/action-parser/actionParser.ts +318 -315
  181. package/src/gui-subagent/action-parser/constants.ts +14 -14
  182. package/src/gui-subagent/action-parser/index.ts +8 -8
  183. package/src/gui-subagent/action-parser/types.ts +31 -31
  184. package/src/gui-subagent/agent/gui-agent.ts +1151 -1089
  185. package/src/gui-subagent/agent/index.ts +5 -5
  186. package/src/gui-subagent/index.ts +177 -163
  187. package/src/gui-subagent/operator/base-operator.ts +244 -245
  188. package/src/gui-subagent/operator/computer-operator.ts +540 -520
  189. package/src/gui-subagent/operator/index.ts +6 -6
  190. package/src/gui-subagent/types/actions.ts +260 -262
  191. package/src/gui-subagent/types/index.ts +6 -6
  192. package/src/gui-subagent/types/operator.ts +106 -106
  193. package/src/gui-subagent/utils.ts +51 -51
  194. package/src/index.ts +17 -18
  195. package/src/input-processor.ts +6 -3
  196. package/src/logger.ts +438 -438
  197. package/src/mcp.ts +730 -682
  198. package/src/memory.ts +344 -344
  199. package/src/ripgrep.ts +368 -0
  200. package/src/session-manager.ts +308 -308
  201. package/src/session.ts +948 -386
  202. package/src/shell.ts +133 -0
  203. package/src/skill-installer.ts +518 -0
  204. package/src/skill-invoker.ts +960 -935
  205. package/src/skill-loader.ts +501 -496
  206. package/src/skill-manager.ts +384 -0
  207. package/src/slash-commands.ts +2181 -1389
  208. package/src/smart-approval.ts +117 -73
  209. package/src/system-prompt-generator.ts +89 -34
  210. package/src/terminal.ts +96 -0
  211. package/src/theme.ts +738 -738
  212. package/src/tools.ts +1336 -773
  213. package/src/truncate.ts +173 -0
  214. package/src/types.ts +219 -198
  215. package/src/update.ts +22 -32
  216. package/src/workflow.ts +523 -508
  217. package/tsconfig.json +22 -22
  218. package/vitest.config.ts +19 -19
  219. package/dist/ai-client.d.ts +0 -86
  220. package/dist/ai-client.d.ts.map +0 -1
  221. package/dist/ai-client.js +0 -1372
  222. package/dist/ai-client.js.map +0 -1
  223. package/dist/gui-subagent/operator/browser-operator.d.ts +0 -36
  224. package/dist/gui-subagent/operator/browser-operator.d.ts.map +0 -1
  225. package/dist/gui-subagent/operator/browser-operator.js +0 -306
  226. package/dist/gui-subagent/operator/browser-operator.js.map +0 -1
  227. package/dist/gui-subagent/operator/desktop-operator.d.ts +0 -55
  228. package/dist/gui-subagent/operator/desktop-operator.d.ts.map +0 -1
  229. package/dist/gui-subagent/operator/desktop-operator.js +0 -527
  230. package/dist/gui-subagent/operator/desktop-operator.js.map +0 -1
  231. package/dist/hook.d.ts +0 -73
  232. package/dist/hook.d.ts.map +0 -1
  233. package/dist/hook.js +0 -156
  234. package/dist/hook.js.map +0 -1
  235. package/dist/input-history.d.ts +0 -24
  236. package/dist/input-history.d.ts.map +0 -1
  237. package/dist/input-history.js +0 -94
  238. package/dist/input-history.js.map +0 -1
  239. package/dist/keyboard-manager.d.ts +0 -151
  240. package/dist/keyboard-manager.d.ts.map +0 -1
  241. package/dist/keyboard-manager.js +0 -396
  242. package/dist/keyboard-manager.js.map +0 -1
  243. package/dist/print-system-prompt.d.ts +0 -2
  244. package/dist/print-system-prompt.d.ts.map +0 -1
  245. package/dist/print-system-prompt.js +0 -40
  246. package/dist/print-system-prompt.js.map +0 -1
  247. package/dist/remote-ai-client.d.ts +0 -104
  248. package/dist/remote-ai-client.d.ts.map +0 -1
  249. package/dist/remote-ai-client.js +0 -552
  250. package/dist/remote-ai-client.js.map +0 -1
  251. package/dist/sdk-output-adapter.d.ts +0 -232
  252. package/dist/sdk-output-adapter.d.ts.map +0 -1
  253. package/dist/sdk-output-adapter.js +0 -636
  254. package/dist/sdk-output-adapter.js.map +0 -1
  255. package/dist/sdk-session-v2.d.ts +0 -13
  256. package/dist/sdk-session-v2.d.ts.map +0 -1
  257. package/dist/sdk-session-v2.js +0 -46
  258. package/dist/sdk-session-v2.js.map +0 -1
  259. package/dist/sdk-session.d.ts +0 -13
  260. package/dist/sdk-session.d.ts.map +0 -1
  261. package/dist/sdk-session.js +0 -48
  262. package/dist/sdk-session.js.map +0 -1
  263. package/dist/test-boundary-conditions.d.ts.map +0 -1
  264. package/dist/test-boundary-conditions.js.map +0 -1
  265. package/dist/test-cancellation-fix.d.ts.map +0 -1
  266. package/dist/test-cancellation-fix.js.map +0 -1
  267. package/dist/test-input-history.d.ts.map +0 -1
  268. package/dist/test-input-history.js.map +0 -1
  269. package/dist/test-interaction-flow.d.ts.map +0 -1
  270. package/dist/test-interaction-flow.js.map +0 -1
  271. package/dist/test-quick.d.ts.map +0 -1
  272. package/dist/test-quick.js.map +0 -1
  273. package/dist/test-user-interaction.d.ts.map +0 -1
  274. package/dist/test-user-interaction.js.map +0 -1
  275. package/dist/tools/edit-diff.d.ts +0 -32
  276. package/dist/tools/edit-diff.d.ts.map +0 -1
  277. package/dist/tools/edit-diff.js +0 -185
  278. package/dist/tools/edit-diff.js.map +0 -1
  279. package/dist/tools/edit.d.ts +0 -11
  280. package/dist/tools/edit.d.ts.map +0 -1
  281. package/dist/tools/edit.js +0 -129
  282. package/dist/tools/edit.js.map +0 -1
  283. package/dist/unified-session.d.ts +0 -42
  284. package/dist/unified-session.d.ts.map +0 -1
  285. package/dist/unified-session.js +0 -271
  286. package/dist/unified-session.js.map +0 -1
  287. package/skills/.claude-plugin/marketplace.json +0 -45
  288. package/skills/README.md +0 -94
  289. package/skills/THIRD_PARTY_NOTICES.md +0 -405
  290. package/skills/skills/algorithmic-art/LICENSE.txt +0 -202
  291. package/skills/skills/algorithmic-art/SKILL.md +0 -405
  292. package/skills/skills/algorithmic-art/templates/generator_template.js +0 -223
  293. package/skills/skills/algorithmic-art/templates/viewer.html +0 -599
  294. package/skills/skills/brand-guidelines/LICENSE.txt +0 -202
  295. package/skills/skills/brand-guidelines/SKILL.md +0 -73
  296. package/skills/skills/canvas-design/LICENSE.txt +0 -202
  297. package/skills/skills/canvas-design/SKILL.md +0 -130
  298. package/skills/skills/canvas-design/canvas-fonts/ArsenalSC-OFL.txt +0 -93
  299. package/skills/skills/canvas-design/canvas-fonts/ArsenalSC-Regular.ttf +0 -0
  300. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-Bold.ttf +0 -0
  301. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-OFL.txt +0 -93
  302. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-Regular.ttf +0 -0
  303. package/skills/skills/canvas-design/canvas-fonts/Boldonse-OFL.txt +0 -93
  304. package/skills/skills/canvas-design/canvas-fonts/Boldonse-Regular.ttf +0 -0
  305. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-Bold.ttf +0 -0
  306. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-OFL.txt +0 -93
  307. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-Regular.ttf +0 -0
  308. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Bold.ttf +0 -0
  309. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Italic.ttf +0 -0
  310. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-OFL.txt +0 -93
  311. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Regular.ttf +0 -0
  312. package/skills/skills/canvas-design/canvas-fonts/DMMono-OFL.txt +0 -93
  313. package/skills/skills/canvas-design/canvas-fonts/DMMono-Regular.ttf +0 -0
  314. package/skills/skills/canvas-design/canvas-fonts/EricaOne-OFL.txt +0 -94
  315. package/skills/skills/canvas-design/canvas-fonts/EricaOne-Regular.ttf +0 -0
  316. package/skills/skills/canvas-design/canvas-fonts/GeistMono-Bold.ttf +0 -0
  317. package/skills/skills/canvas-design/canvas-fonts/GeistMono-OFL.txt +0 -93
  318. package/skills/skills/canvas-design/canvas-fonts/GeistMono-Regular.ttf +0 -0
  319. package/skills/skills/canvas-design/canvas-fonts/Gloock-OFL.txt +0 -93
  320. package/skills/skills/canvas-design/canvas-fonts/Gloock-Regular.ttf +0 -0
  321. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-Bold.ttf +0 -0
  322. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-OFL.txt +0 -93
  323. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-Regular.ttf +0 -0
  324. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Bold.ttf +0 -0
  325. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-BoldItalic.ttf +0 -0
  326. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Italic.ttf +0 -0
  327. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Regular.ttf +0 -0
  328. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Bold.ttf +0 -0
  329. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-BoldItalic.ttf +0 -0
  330. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Italic.ttf +0 -0
  331. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-OFL.txt +0 -93
  332. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Regular.ttf +0 -0
  333. package/skills/skills/canvas-design/canvas-fonts/InstrumentSerif-Italic.ttf +0 -0
  334. package/skills/skills/canvas-design/canvas-fonts/InstrumentSerif-Regular.ttf +0 -0
  335. package/skills/skills/canvas-design/canvas-fonts/Italiana-OFL.txt +0 -93
  336. package/skills/skills/canvas-design/canvas-fonts/Italiana-Regular.ttf +0 -0
  337. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-Bold.ttf +0 -0
  338. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-OFL.txt +0 -93
  339. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-Regular.ttf +0 -0
  340. package/skills/skills/canvas-design/canvas-fonts/Jura-Light.ttf +0 -0
  341. package/skills/skills/canvas-design/canvas-fonts/Jura-Medium.ttf +0 -0
  342. package/skills/skills/canvas-design/canvas-fonts/Jura-OFL.txt +0 -93
  343. package/skills/skills/canvas-design/canvas-fonts/LibreBaskerville-OFL.txt +0 -93
  344. package/skills/skills/canvas-design/canvas-fonts/LibreBaskerville-Regular.ttf +0 -0
  345. package/skills/skills/canvas-design/canvas-fonts/Lora-Bold.ttf +0 -0
  346. package/skills/skills/canvas-design/canvas-fonts/Lora-BoldItalic.ttf +0 -0
  347. package/skills/skills/canvas-design/canvas-fonts/Lora-Italic.ttf +0 -0
  348. package/skills/skills/canvas-design/canvas-fonts/Lora-OFL.txt +0 -93
  349. package/skills/skills/canvas-design/canvas-fonts/Lora-Regular.ttf +0 -0
  350. package/skills/skills/canvas-design/canvas-fonts/NationalPark-Bold.ttf +0 -0
  351. package/skills/skills/canvas-design/canvas-fonts/NationalPark-OFL.txt +0 -93
  352. package/skills/skills/canvas-design/canvas-fonts/NationalPark-Regular.ttf +0 -0
  353. package/skills/skills/canvas-design/canvas-fonts/NothingYouCouldDo-OFL.txt +0 -93
  354. package/skills/skills/canvas-design/canvas-fonts/NothingYouCouldDo-Regular.ttf +0 -0
  355. package/skills/skills/canvas-design/canvas-fonts/Outfit-Bold.ttf +0 -0
  356. package/skills/skills/canvas-design/canvas-fonts/Outfit-OFL.txt +0 -93
  357. package/skills/skills/canvas-design/canvas-fonts/Outfit-Regular.ttf +0 -0
  358. package/skills/skills/canvas-design/canvas-fonts/PixelifySans-Medium.ttf +0 -0
  359. package/skills/skills/canvas-design/canvas-fonts/PixelifySans-OFL.txt +0 -93
  360. package/skills/skills/canvas-design/canvas-fonts/PoiretOne-OFL.txt +0 -93
  361. package/skills/skills/canvas-design/canvas-fonts/PoiretOne-Regular.ttf +0 -0
  362. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-Bold.ttf +0 -0
  363. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-OFL.txt +0 -93
  364. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-Regular.ttf +0 -0
  365. package/skills/skills/canvas-design/canvas-fonts/Silkscreen-OFL.txt +0 -93
  366. package/skills/skills/canvas-design/canvas-fonts/Silkscreen-Regular.ttf +0 -0
  367. package/skills/skills/canvas-design/canvas-fonts/SmoochSans-Medium.ttf +0 -0
  368. package/skills/skills/canvas-design/canvas-fonts/SmoochSans-OFL.txt +0 -93
  369. package/skills/skills/canvas-design/canvas-fonts/Tektur-Medium.ttf +0 -0
  370. package/skills/skills/canvas-design/canvas-fonts/Tektur-OFL.txt +0 -93
  371. package/skills/skills/canvas-design/canvas-fonts/Tektur-Regular.ttf +0 -0
  372. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Bold.ttf +0 -0
  373. package/skills/skills/canvas-design/canvas-fonts/WorkSans-BoldItalic.ttf +0 -0
  374. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Italic.ttf +0 -0
  375. package/skills/skills/canvas-design/canvas-fonts/WorkSans-OFL.txt +0 -93
  376. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Regular.ttf +0 -0
  377. package/skills/skills/canvas-design/canvas-fonts/YoungSerif-OFL.txt +0 -93
  378. package/skills/skills/canvas-design/canvas-fonts/YoungSerif-Regular.ttf +0 -0
  379. package/skills/skills/doc-coauthoring/SKILL.md +0 -375
  380. package/skills/skills/docx/LICENSE.txt +0 -30
  381. package/skills/skills/docx/SKILL.md +0 -197
  382. package/skills/skills/docx/docx-js.md +0 -350
  383. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  384. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  385. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  386. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  387. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  388. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  389. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  390. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  391. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  392. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  393. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  394. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  395. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  396. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  397. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  398. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  399. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  400. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  401. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  402. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  403. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  404. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  405. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  406. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  407. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  408. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  409. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  410. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  411. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  412. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  413. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  414. package/skills/skills/docx/ooxml/schemas/mce/mc.xsd +0 -75
  415. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd +0 -560
  416. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd +0 -67
  417. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd +0 -14
  418. package/skills/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd +0 -20
  419. package/skills/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd +0 -13
  420. package/skills/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  421. package/skills/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd +0 -8
  422. package/skills/skills/docx/ooxml/scripts/pack.py +0 -159
  423. package/skills/skills/docx/ooxml/scripts/unpack.py +0 -29
  424. package/skills/skills/docx/ooxml/scripts/validate.py +0 -69
  425. package/skills/skills/docx/ooxml/scripts/validation/__init__.py +0 -15
  426. package/skills/skills/docx/ooxml/scripts/validation/base.py +0 -951
  427. package/skills/skills/docx/ooxml/scripts/validation/docx.py +0 -274
  428. package/skills/skills/docx/ooxml/scripts/validation/pptx.py +0 -315
  429. package/skills/skills/docx/ooxml/scripts/validation/redlining.py +0 -279
  430. package/skills/skills/docx/ooxml.md +0 -610
  431. package/skills/skills/docx/scripts/__init__.py +0 -1
  432. package/skills/skills/docx/scripts/document.py +0 -1276
  433. package/skills/skills/docx/scripts/templates/comments.xml +0 -3
  434. package/skills/skills/docx/scripts/templates/commentsExtended.xml +0 -3
  435. package/skills/skills/docx/scripts/templates/commentsExtensible.xml +0 -3
  436. package/skills/skills/docx/scripts/templates/commentsIds.xml +0 -3
  437. package/skills/skills/docx/scripts/templates/people.xml +0 -3
  438. package/skills/skills/docx/scripts/utilities.py +0 -374
  439. package/skills/skills/frontend-design/LICENSE.txt +0 -177
  440. package/skills/skills/frontend-design/SKILL.md +0 -42
  441. package/skills/skills/internal-comms/LICENSE.txt +0 -202
  442. package/skills/skills/internal-comms/SKILL.md +0 -32
  443. package/skills/skills/internal-comms/examples/3p-updates.md +0 -47
  444. package/skills/skills/internal-comms/examples/company-newsletter.md +0 -65
  445. package/skills/skills/internal-comms/examples/faq-answers.md +0 -30
  446. package/skills/skills/internal-comms/examples/general-comms.md +0 -16
  447. package/skills/skills/mcp-builder/LICENSE.txt +0 -202
  448. package/skills/skills/mcp-builder/SKILL.md +0 -236
  449. package/skills/skills/mcp-builder/reference/evaluation.md +0 -602
  450. package/skills/skills/mcp-builder/reference/mcp_best_practices.md +0 -249
  451. package/skills/skills/mcp-builder/reference/node_mcp_server.md +0 -970
  452. package/skills/skills/mcp-builder/reference/python_mcp_server.md +0 -719
  453. package/skills/skills/mcp-builder/scripts/connections.py +0 -151
  454. package/skills/skills/mcp-builder/scripts/evaluation.py +0 -373
  455. package/skills/skills/mcp-builder/scripts/example_evaluation.xml +0 -22
  456. package/skills/skills/mcp-builder/scripts/requirements.txt +0 -2
  457. package/skills/skills/pdf/LICENSE.txt +0 -30
  458. package/skills/skills/pdf/SKILL.md +0 -294
  459. package/skills/skills/pdf/forms.md +0 -205
  460. package/skills/skills/pdf/reference.md +0 -612
  461. package/skills/skills/pdf/scripts/check_bounding_boxes.py +0 -70
  462. package/skills/skills/pdf/scripts/check_bounding_boxes_test.py +0 -226
  463. package/skills/skills/pdf/scripts/check_fillable_fields.py +0 -12
  464. package/skills/skills/pdf/scripts/convert_pdf_to_images.py +0 -35
  465. package/skills/skills/pdf/scripts/create_validation_image.py +0 -41
  466. package/skills/skills/pdf/scripts/extract_form_field_info.py +0 -152
  467. package/skills/skills/pdf/scripts/fill_fillable_fields.py +0 -114
  468. package/skills/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -108
  469. package/skills/skills/pptx/LICENSE.txt +0 -30
  470. package/skills/skills/pptx/SKILL.md +0 -484
  471. package/skills/skills/pptx/html2pptx.md +0 -625
  472. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  473. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  474. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  475. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  476. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  477. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  478. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  479. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  480. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  481. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  482. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  483. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  484. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  485. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  486. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  487. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  488. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  489. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  490. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  491. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  492. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  493. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  494. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  495. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  496. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  497. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  498. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  499. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  500. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  501. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  502. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  503. package/skills/skills/pptx/ooxml/schemas/mce/mc.xsd +0 -75
  504. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd +0 -560
  505. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd +0 -67
  506. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd +0 -14
  507. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd +0 -20
  508. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd +0 -13
  509. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  510. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd +0 -8
  511. package/skills/skills/pptx/ooxml/scripts/pack.py +0 -159
  512. package/skills/skills/pptx/ooxml/scripts/unpack.py +0 -29
  513. package/skills/skills/pptx/ooxml/scripts/validate.py +0 -69
  514. package/skills/skills/pptx/ooxml/scripts/validation/__init__.py +0 -15
  515. package/skills/skills/pptx/ooxml/scripts/validation/base.py +0 -951
  516. package/skills/skills/pptx/ooxml/scripts/validation/docx.py +0 -274
  517. package/skills/skills/pptx/ooxml/scripts/validation/pptx.py +0 -315
  518. package/skills/skills/pptx/ooxml/scripts/validation/redlining.py +0 -279
  519. package/skills/skills/pptx/ooxml.md +0 -427
  520. package/skills/skills/pptx/scripts/html2pptx.js +0 -979
  521. package/skills/skills/pptx/scripts/inventory.py +0 -1020
  522. package/skills/skills/pptx/scripts/rearrange.py +0 -231
  523. package/skills/skills/pptx/scripts/replace.py +0 -385
  524. package/skills/skills/pptx/scripts/thumbnail.py +0 -450
  525. package/skills/skills/skill-creator/LICENSE.txt +0 -202
  526. package/skills/skills/skill-creator/SKILL.md +0 -356
  527. package/skills/skills/skill-creator/references/output-patterns.md +0 -82
  528. package/skills/skills/skill-creator/references/workflows.md +0 -28
  529. package/skills/skills/skill-creator/scripts/init_skill.py +0 -303
  530. package/skills/skills/skill-creator/scripts/package_skill.py +0 -110
  531. package/skills/skills/skill-creator/scripts/quick_validate.py +0 -95
  532. package/skills/skills/slack-gif-creator/LICENSE.txt +0 -202
  533. package/skills/skills/slack-gif-creator/SKILL.md +0 -254
  534. package/skills/skills/slack-gif-creator/core/easing.py +0 -234
  535. package/skills/skills/slack-gif-creator/core/frame_composer.py +0 -176
  536. package/skills/skills/slack-gif-creator/core/gif_builder.py +0 -269
  537. package/skills/skills/slack-gif-creator/core/validators.py +0 -136
  538. package/skills/skills/slack-gif-creator/requirements.txt +0 -4
  539. package/skills/skills/theme-factory/LICENSE.txt +0 -202
  540. package/skills/skills/theme-factory/SKILL.md +0 -59
  541. package/skills/skills/theme-factory/theme-showcase.pdf +0 -0
  542. package/skills/skills/theme-factory/themes/arctic-frost.md +0 -19
  543. package/skills/skills/theme-factory/themes/botanical-garden.md +0 -19
  544. package/skills/skills/theme-factory/themes/desert-rose.md +0 -19
  545. package/skills/skills/theme-factory/themes/forest-canopy.md +0 -19
  546. package/skills/skills/theme-factory/themes/golden-hour.md +0 -19
  547. package/skills/skills/theme-factory/themes/midnight-galaxy.md +0 -19
  548. package/skills/skills/theme-factory/themes/modern-minimalist.md +0 -19
  549. package/skills/skills/theme-factory/themes/ocean-depths.md +0 -19
  550. package/skills/skills/theme-factory/themes/sunset-boulevard.md +0 -19
  551. package/skills/skills/theme-factory/themes/tech-innovation.md +0 -19
  552. package/skills/skills/web-artifacts-builder/LICENSE.txt +0 -202
  553. package/skills/skills/web-artifacts-builder/SKILL.md +0 -74
  554. package/skills/skills/web-artifacts-builder/scripts/bundle-artifact.sh +0 -54
  555. package/skills/skills/web-artifacts-builder/scripts/init-artifact.sh +0 -322
  556. package/skills/skills/webapp-testing/LICENSE.txt +0 -202
  557. package/skills/skills/webapp-testing/SKILL.md +0 -96
  558. package/skills/skills/webapp-testing/examples/console_logging.py +0 -35
  559. package/skills/skills/webapp-testing/examples/element_discovery.py +0 -40
  560. package/skills/skills/webapp-testing/examples/static_html_automation.py +0 -33
  561. package/skills/skills/webapp-testing/scripts/with_server.py +0 -106
  562. package/skills/skills/xlsx/LICENSE.txt +0 -30
  563. package/skills/skills/xlsx/SKILL.md +0 -289
  564. package/skills/skills/xlsx/recalc.py +0 -178
  565. package/skills/spec/agent-skills-spec.md +0 -3
  566. package/skills/template/SKILL.md +0 -6
  567. package/src/ai-client.ts +0 -1560
  568. package/src/remote-ai-client.ts +0 -664
@@ -1,1089 +1,1151 @@
1
- /**
2
- * GUI Agent for xagent
3
- * Orchestrates desktop automation with AI-powered action execution
4
- * Based on UI-TARS architecture with computer control only
5
- *
6
- * This implementation is aligned with packages/ui-tars/sdk/src/GUIAgent.ts
7
- */
8
-
9
- import type {
10
- ScreenContext,
11
- ScreenshotOutput,
12
- ExecuteParams,
13
- ExecuteOutput,
14
- PredictionParsed,
15
- } from '../types/operator.js';
16
- import type { Operator } from '../operator/base-operator.js';
17
- import { sleep, asyncRetry } from '../utils.js';
18
- import { actionParser } from '../action-parser/index.js';
19
- import { colors, icons, renderMarkdown } from '../../theme.js';
20
- import { getLogger } from '../../logger.js';
21
-
22
- /**
23
- * Helper function to truncate long text
24
- */
25
- function truncateText(text: string, maxLength: number = 200): string {
26
- if (!text) return '';
27
- return text.length > maxLength ? text.substring(0, maxLength) + '...' : text;
28
- }
29
-
30
- /**
31
- * Helper function to indent multiline text
32
- */
33
- function indentMultiline(text: string, indent: string): string {
34
- return text.split('\n').map(line => indent + line).join('\n');
35
- }
36
-
37
- const guiLogger = getLogger();
38
-
39
- // UI-TARS Status Enum
40
- export enum GUIAgentStatus {
41
- INIT = 'init',
42
- RUNNING = 'running',
43
- PAUSE = 'paused',
44
- END = 'end',
45
- ERROR = 'error',
46
- USER_STOPPED = 'user_stopped',
47
- CALL_USER = 'call_user',
48
- }
49
-
50
- /**
51
- * Remote VLM Caller callback function type
52
- * Inject this function externally to handle VLM calls, GUI Agent doesn't need to know VLM implementation details
53
- * Receives full messages array (same as local mode) for consistent behavior
54
- */
55
- export type RemoteVlmCaller = (messages: any[], systemPrompt: string) => Promise<string>;
56
-
57
- export interface GUIAgentConfig<T extends Operator> {
58
- operator: T;
59
- model?: string;
60
- modelBaseUrl?: string;
61
- modelApiKey?: string;
62
- /**
63
- * Externally injected VLM caller function
64
- * If this function is provided, GUI Agent will use it to call VLM
65
- * instead of directly calling modelBaseUrl/modelApiKey
66
- * This allows GUI Agent to work with remote services without exposing any configuration
67
- */
68
- remoteVlmCaller?: RemoteVlmCaller;
69
- /**
70
- * Whether to use local mode
71
- * If true, use model/modelBaseUrl/modelApiKey for VLM calls
72
- * If false, use remoteVlmCaller for remote VLM calls
73
- */
74
- isLocalMode: boolean;
75
- systemPrompt?: string;
76
- loopIntervalInMs?: number;
77
- maxLoopCount?: number;
78
- logger?: any;
79
- signal?: AbortSignal;
80
- onData?: (data: GUIAgentData) => void;
81
- onError?: (error: Error) => void;
82
- showAIDebugInfo?: boolean;
83
- retry?: {
84
- screenshot?: {
85
- maxRetries?: number;
86
- onRetry?: (e: Error) => void;
87
- };
88
- model?: {
89
- maxRetries?: number;
90
- onRetry?: (e: Error) => void;
91
- };
92
- execute?: {
93
- maxRetries?: number;
94
- onRetry?: (e: Error) => void;
95
- };
96
- };
97
- }
98
-
99
- export interface GUIAgentData {
100
- status: GUIAgentStatus;
101
- conversations: Conversation[];
102
- error?: string;
103
- systemPrompt?: string;
104
- }
105
-
106
- export interface Conversation {
107
- from: 'human' | 'assistant';
108
- value: string;
109
- screenshotBase64?: string;
110
- screenshotContext?: {
111
- size: { width: number; height: number };
112
- mime?: string;
113
- scaleFactor: number;
114
- };
115
- actionType?: string;
116
- actionInputs?: Record<string, any>;
117
- timing?: {
118
- start: number;
119
- end: number;
120
- cost: number;
121
- };
122
- predictionParsed?: PredictionParsed[];
123
- }
124
-
125
- // UI-TARS constants (aligned with @ui-tars/shared/constants)
126
- const MAX_LOOP_COUNT = 100;
127
- const MAX_SNAPSHOT_ERR_CNT = 5;
128
- const MAX_STEP_RETRIES = 3; // Max retries for a single action step before giving up
129
- const IMAGE_PLACEHOLDER = '{{IMG_PLACEHOLDER_0}}';
130
-
131
- export class GUIAgent<T extends Operator> {
132
- private operator: T;
133
- private readonly model: string;
134
- private readonly modelBaseUrl: string;
135
- private readonly modelApiKey: string;
136
- private readonly remoteVlmCaller?: RemoteVlmCaller;
137
- private readonly isLocalMode: boolean;
138
- private readonly systemPrompt: string;
139
- private readonly loopIntervalInMs: number;
140
- private readonly maxLoopCount: number;
141
- private readonly logger: Console;
142
- private readonly signal?: AbortSignal;
143
- private readonly onData?: (data: GUIAgentData) => void;
144
- private readonly onError?: (error: Error) => void;
145
- private readonly showAIDebugInfo: boolean;
146
- private readonly retry?: GUIAgentConfig<T>['retry'];
147
-
148
- private isPaused = false;
149
- private resumePromise: Promise<void> | null = null;
150
- private resolveResume: (() => void) | null = null;
151
- private isStopped = false;
152
-
153
- constructor(config: GUIAgentConfig<T>) {
154
- this.operator = config.operator;
155
- this.model = config.model || '';
156
- this.modelBaseUrl = config.modelBaseUrl || '';
157
- this.modelApiKey = config.modelApiKey || '';
158
- this.remoteVlmCaller = config.remoteVlmCaller;
159
- this.isLocalMode = config.isLocalMode;
160
- this.loopIntervalInMs = config.loopIntervalInMs || 0;
161
- this.maxLoopCount = config.maxLoopCount || MAX_LOOP_COUNT;
162
- this.logger = config.logger || guiLogger;
163
- this.signal = config.signal;
164
- this.onData = config.onData;
165
- this.onError = config.onError;
166
- this.showAIDebugInfo = config.showAIDebugInfo ?? false;
167
- this.retry = config.retry;
168
-
169
- this.systemPrompt = config.systemPrompt || this.buildSystemPrompt();
170
- }
171
-
172
- /**
173
- * Display conversation results with formatting similar to session.ts (simplified)
174
- */
175
- private displayConversationResult(conversation: Conversation, iteration: number, indentLevel: number = 1): void {
176
- const indent = ' '.repeat(indentLevel);
177
- const innerIndent = ' '.repeat(indentLevel + 1);
178
- const maxWidth = process.stdout.columns || 80;
179
-
180
- if (conversation.from === 'assistant') {
181
- // Display assistant response (action)
182
- const content = conversation.value || '';
183
- const timing = conversation.timing;
184
-
185
- // Simplified: show step number and action
186
- const actionSummary = content.replace(/Thought:[\s\S]*?Action:\s*/i, '').trim();
187
- const actionType = conversation.predictionParsed?.[0]?.action_type || 'action';
188
-
189
- console.log(`${indent}${colors.primaryBright(`[${iteration}]`)} ${colors.textMuted(actionType)}${timing ? colors.textDim(` (${timing.cost}ms)`) : ''}`);
190
-
191
- // Optionally show action details on next line if verbose
192
- if (this.showAIDebugInfo && actionSummary) {
193
- const truncatedSummary = actionSummary.length > 60 ? actionSummary.substring(0, 60) + '...' : actionSummary;
194
- console.log(`${innerIndent}${colors.textMuted(truncatedSummary)}`);
195
- }
196
- } else if (conversation.from === 'human' && conversation.screenshotBase64) {
197
- // Show minimal indicator for screenshot
198
- if (this.showAIDebugInfo) {
199
- const timing = conversation.timing;
200
- console.log(`${indent}${colors.textMuted(`${icons.loading} screenshot${timing ? ` (${timing.cost}ms)` : ''}`)}`);
201
- }
202
- }
203
- }
204
-
205
- /**
206
- * Display status message
207
- */
208
- private displayStatus(data: GUIAgentData, iteration: number, indentLevel: number = 1): void {
209
- const indent = ' '.repeat(indentLevel);
210
- const status = data.status;
211
-
212
- switch (status) {
213
- case GUIAgentStatus.RUNNING:
214
- console.log(`${indent}${colors.info(`${icons.loading} Step ${iteration}: Running...`)}`);
215
- break;
216
- case GUIAgentStatus.END:
217
- // Handled by caller
218
- break;
219
- case GUIAgentStatus.ERROR:
220
- if (data.error) {
221
- console.log(`${indent}${colors.error(`${icons.cross} ${data.error}`)}`);
222
- }
223
- break;
224
- case GUIAgentStatus.CALL_USER:
225
- console.log(`${indent}${colors.warning(`${icons.warning} Needs user input`)}`);
226
- break;
227
- case GUIAgentStatus.USER_STOPPED:
228
- console.log(`${indent}${colors.warning(`${icons.warning} Stopped`)}`);
229
- break;
230
- default:
231
- break;
232
- }
233
- }
234
-
235
- private buildSystemPrompt(): string {
236
- return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
237
-
238
- ## Output Format
239
- \`
240
- Thought: ...
241
- Action: ...
242
- \`
243
-
244
- ## Action Space
245
- click(point='<point>x1 y1</point>')
246
- left_double(point='<point>x1 y1</point>')
247
- right_single(point='<point>x1 y1</point>')
248
- drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
249
- hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
250
- type(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \n at the end of content.
251
- scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
252
- open_url(url='https://xxx') # Open URL in browser
253
- wait() #Sleep for 5s and take a screenshot to check for any changes.
254
- finished(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format.
255
-
256
-
257
-
258
-
259
- ## Note
260
- - Use {language} in \`Thought\` part.
261
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
262
-
263
- `;
264
- }
265
-
266
-
267
-
268
- async initialize(): Promise<void> {
269
- await this.operator.doInitialize();
270
- }
271
-
272
- /**
273
- * Run the GUI agent with a single instruction (UI-TARS style)
274
- * All operations are determined by the GUI model
275
- */
276
- async run(instruction: string): Promise<GUIAgentData> {
277
- const data: GUIAgentData = {
278
- status: GUIAgentStatus.INIT,
279
- conversations: [
280
- {
281
- from: 'human',
282
- value: instruction,
283
- timing: {
284
- start: Date.now(),
285
- end: Date.now(),
286
- cost: 0,
287
- },
288
- },
289
- ],
290
- };
291
-
292
- // Initialize operator for initial screenshot
293
- try {
294
- await this.operator.doInitialize();
295
- } catch (initError) {
296
- const errorMsg = initError instanceof Error ? initError.message : 'Unknown error';
297
- this.logger.error(`[GUIAgent] Failed to initialize operator: ${errorMsg}`);
298
-
299
- // Check if it's an RDP-related issue
300
- if (errorMsg.includes('screen') || errorMsg.includes('capture') || errorMsg.includes('display')) {
301
- data.status = GUIAgentStatus.ERROR;
302
- data.error = 'Failed to initialize screen capture. This may be caused by:\n' +
303
- ' 1. Remote Desktop session disconnected or minimized\n' +
304
- ' 2. Display driver issues\n' +
305
- 'Suggestion: Ensure your display is active and try again.';
306
- } else {
307
- data.status = GUIAgentStatus.ERROR;
308
- data.error = `Failed to initialize operator: ${errorMsg}`;
309
- }
310
- return data;
311
- }
312
-
313
- const currentTime = Date.now();
314
-
315
- if (this.showAIDebugInfo) {
316
- this.logger.debug('[GUIAgent] run:', {
317
- systemPrompt: this.systemPrompt,
318
- model: this.model,
319
- maxLoopCount: this.maxLoopCount,
320
- });
321
- }
322
-
323
- let loopCnt = 0;
324
- let snapshotErrCnt = 0;
325
-
326
- // Start running agent
327
- data.status = GUIAgentStatus.RUNNING;
328
- data.systemPrompt = this.systemPrompt;
329
- console.log(`${colors.primaryBright(`${icons.rocket} GUI Agent started`)}`);
330
- console.log('');
331
- await this.onData?.({ ...data, conversations: [] });
332
-
333
- try {
334
- // eslint-disable-next-line no-constant-condition
335
- while (true) {
336
- if (this.showAIDebugInfo) {
337
- this.logger.debug('[GUIAgent] loopCnt:', loopCnt);
338
- }
339
-
340
- // Check pause status
341
- if (this.isPaused && this.resumePromise) {
342
- data.status = GUIAgentStatus.PAUSE;
343
- await this.onData?.({ ...data, conversations: [] });
344
- await this.resumePromise;
345
- data.status = GUIAgentStatus.RUNNING;
346
- await this.onData?.({ ...data, conversations: [] });
347
- }
348
-
349
- // Check stop or aborted status
350
- if (
351
- this.isStopped ||
352
- data.status !== GUIAgentStatus.RUNNING ||
353
- this.signal?.aborted
354
- ) {
355
- if (this.signal?.aborted) {
356
- data.status = GUIAgentStatus.USER_STOPPED;
357
- }
358
- break;
359
- }
360
-
361
- // Check loop limit
362
- if (loopCnt >= this.maxLoopCount) {
363
- data.status = GUIAgentStatus.ERROR;
364
- data.error = `Has reached max loop count: ${loopCnt}`;
365
- break;
366
- }
367
-
368
- // Check screenshot error limit
369
- if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
370
- data.status = GUIAgentStatus.ERROR;
371
- data.error = 'Screenshot failed too many times. Stopping task.';
372
- break;
373
- }
374
-
375
- loopCnt += 1;
376
- const start = Date.now();
377
-
378
- // Take screenshot (single attempt - no retry to avoid infinite loops)
379
- let snapshot: ScreenshotOutput;
380
- try {
381
- snapshot = await this.operator.doScreenshot();
382
- } catch (screenshotError) {
383
- const errorMsg = screenshotError instanceof Error ? screenshotError.message : 'Unknown error';
384
- this.logger.warn(`[GUIAgent] Screenshot exception: ${errorMsg}`);
385
- snapshotErrCnt += 1;
386
- data.status = GUIAgentStatus.ERROR;
387
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
388
- this.logger.error(`[GUIAgent] ${data.error}`);
389
- await sleep(1000);
390
- break;
391
- }
392
-
393
- // Check if screenshot returned failure status
394
- if (snapshot.status === 'failed') {
395
- const errorMsg = snapshot.errorMessage || 'Unknown error';
396
- this.logger.warn(`[GUIAgent] Screenshot failed: ${errorMsg}`);
397
- snapshotErrCnt += 1;
398
- data.status = GUIAgentStatus.ERROR;
399
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
400
- this.logger.error(`[GUIAgent] ${data.error}`);
401
- await sleep(1000);
402
- break;
403
- }
404
-
405
- // Check abort immediately after screenshot
406
- if (this.signal?.aborted) {
407
- data.status = GUIAgentStatus.USER_STOPPED;
408
- break;
409
- }
410
-
411
- // Validate screenshot
412
- const isValidImage = !!(snapshot?.base64);
413
- if (!isValidImage) {
414
- snapshotErrCnt += 1;
415
- data.status = GUIAgentStatus.ERROR;
416
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
417
- this.logger.error(`[GUIAgent] ${data.error}`);
418
- await sleep(1000);
419
- break;
420
- }
421
-
422
- // Reset error counter on successful screenshot
423
- snapshotErrCnt = 0;
424
-
425
- const end = Date.now();
426
-
427
- // Get screen context
428
- const screenContext = await this.operator.getScreenContext();
429
-
430
- // Add screenshot to conversation
431
- data.conversations.push({
432
- from: 'human',
433
- value: IMAGE_PLACEHOLDER,
434
- screenshotBase64: snapshot.base64,
435
- screenshotContext: {
436
- size: {
437
- width: screenContext.width,
438
- height: screenContext.height,
439
- },
440
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
441
- },
442
- timing: {
443
- start,
444
- end,
445
- cost: end - start,
446
- },
447
- });
448
-
449
- await this.onData?.({
450
- ...data,
451
- conversations: data.conversations.slice(-1),
452
- });
453
-
454
- // Display screenshot notification
455
- const latestScreenshot = data.conversations[data.conversations.length - 1];
456
- if (latestScreenshot && latestScreenshot.from === 'human' && latestScreenshot.screenshotBase64) {
457
- this.displayConversationResult(latestScreenshot, loopCnt);
458
- }
459
-
460
- // Build messages for model
461
- const messages = this.buildModelMessages(data.conversations, data.systemPrompt);
462
-
463
- // Check abort before model call
464
- if (this.signal?.aborted) {
465
- data.status = GUIAgentStatus.USER_STOPPED;
466
- break;
467
- }
468
-
469
- // Invoke model with retry
470
- let prediction: string;
471
- let parsedPredictions: PredictionParsed[];
472
- try {
473
- const modelResult: { prediction: string; parsedPredictions: PredictionParsed[] } = await asyncRetry(
474
- async (bail) => {
475
- try {
476
- const result = await this.callModelAPI(messages, screenContext, this.remoteVlmCaller!);
477
- return result;
478
- } catch (error: unknown) {
479
- if (
480
- error instanceof Error &&
481
- (error.name === 'AbortError' ||
482
- error.message?.includes('aborted'))
483
- ) {
484
- bail(error as Error);
485
- return { prediction: '', parsedPredictions: [] };
486
- }
487
- throw error;
488
- }
489
- },
490
- {
491
- retries: this.retry?.model?.maxRetries ?? 0,
492
- minTimeout: 1000 * 30,
493
- onRetry: this.retry?.model?.onRetry,
494
- }
495
- );
496
- prediction = modelResult.prediction;
497
- parsedPredictions = modelResult.parsedPredictions;
498
- } catch (modelError) {
499
- // Handle multimodal model API errors with specific error messages
500
- data.status = GUIAgentStatus.ERROR;
501
- const errorMsg = modelError instanceof Error ? modelError.message : String(modelError);
502
-
503
- // Provide specific error message based on error type
504
- if (errorMsg.includes('401') || errorMsg.includes('authentication') || errorMsg.includes('API key') || errorMsg.includes('api_key') || errorMsg.includes('Unauthorized') || errorMsg.includes('invalid_api_key')) {
505
- data.error = '[Multimodal Model Authentication Failed] The guiSubagentApiKey configuration is invalid.\n' +
506
- 'Error details: HTTP 401 - API key is invalid or expired\n' +
507
- 'Suggested action: Please check the guiSubagentApiKey configuration in ~/.xagent/settings.json and ensure a valid API key is set';
508
- } else if (errorMsg.includes('429') || errorMsg.includes('rate limit') || errorMsg.includes('too many requests')) {
509
- data.error = '[Multimodal Model Rate Limit Exceeded] API requests exceed rate limit.\n' +
510
- 'Error details: HTTP 429 - Too Many Requests\n' +
511
- 'Suggested action: Please retry later, or check your API account quota settings. Wait a few minutes before retrying';
512
- } else if (errorMsg.includes('network') || errorMsg.includes('fetch') || errorMsg.includes('connection') || errorMsg.includes('ECONNREFUSED')) {
513
- data.error = '[Multimodal Model Network Error] Cannot connect to API service.\n' +
514
- 'Error details: Network connection failed. Possible causes:\n' +
515
- ' 1. Network connection is lost\n' +
516
- ' 2. The guiSubagentBaseUrl configuration is incorrect\n' +
517
- ' 3. API service endpoint is unreachable\n' +
518
- 'Suggested action: Please check the guiSubagentBaseUrl configuration in ~/.xagent/settings.json and ensure network connectivity';
519
- } else if (errorMsg.includes('404') || errorMsg.includes('not found') || errorMsg.includes('model not found') || errorMsg.includes('InvalidEndpointOrModel.NotFound')) {
520
- // Extract model name
521
- const modelMatch = errorMsg.match(/model[:\s]+([^\s,"]+)|"model[:"]+([^",}]+)/i);
522
- const modelName = modelMatch ? (modelMatch[1] || modelMatch[2]) : 'Unknown';
523
- data.error = '[Multimodal Model Configuration Error] The model specified in guiSubagentModel does not exist or is not accessible.\n' +
524
- 'Error details: HTTP 404 - Model or Endpoint not found\n' +
525
- 'Configured model name: ' + modelName + '\n' +
526
- 'Suggested action: Please check the guiSubagentModel configuration in ~/.xagent/settings.json, remove or replace with a valid model name';
527
- } else {
528
- data.error = '[Multimodal Model API Call Failed]\n' +
529
- 'Error details: ' + errorMsg + '\n' +
530
- 'Please check the following configuration items:\n' +
531
- ' - guiSubagentApiKey: API key\n' +
532
- ' - guiSubagentBaseUrl: API service URL\n' +
533
- ' - guiSubagentModel: Model name\n' +
534
- 'Config file location: ~/.xagent/settings.json';
535
- }
536
- break;
537
- }
538
-
539
- // Check abort immediately after model call
540
- if (this.signal?.aborted) {
541
- data.status = GUIAgentStatus.USER_STOPPED;
542
- break;
543
- }
544
-
545
- if (!prediction) {
546
- this.logger.warn('[GUIAgent] Warning: Empty response from model, retrying...');
547
- continue;
548
- }
549
-
550
- if (this.showAIDebugInfo) {
551
- this.logger.debug('[GUIAgent] Response:', prediction);
552
- this.logger.debug('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
553
- }
554
-
555
- const predictionSummary = this.getSummary(prediction);
556
-
557
- data.conversations.push({
558
- from: 'assistant',
559
- value: predictionSummary,
560
- timing: {
561
- start,
562
- end: Date.now(),
563
- cost: Date.now() - start,
564
- },
565
- screenshotContext: {
566
- size: {
567
- width: screenContext.width,
568
- height: screenContext.height,
569
- },
570
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
571
- },
572
- predictionParsed: parsedPredictions,
573
- });
574
-
575
- await this.onData?.({
576
- ...data,
577
- conversations: data.conversations.slice(-1),
578
- });
579
-
580
- // Display assistant response
581
- const latestAssistant = data.conversations[data.conversations.length - 1];
582
- if (latestAssistant && latestAssistant.from === 'assistant') {
583
- this.displayConversationResult(latestAssistant, loopCnt);
584
- }
585
-
586
- // Check if we need to switch operator based on first action
587
- // Execute actions
588
- for (const parsedPrediction of parsedPredictions) {
589
- const actionType = parsedPrediction.action_type;
590
-
591
- if (this.showAIDebugInfo) {
592
- this.logger.debug('[GUIAgent] Action:', actionType);
593
- }
594
-
595
- // Handle internal action spaces
596
- if (actionType === 'error_env') {
597
- data.status = GUIAgentStatus.ERROR;
598
- data.error = 'Environment error';
599
- break;
600
- } else if (actionType === 'max_loop') {
601
- data.status = GUIAgentStatus.ERROR;
602
- data.error = 'Reached max loop';
603
- break;
604
- }
605
-
606
- // Execute action with retry
607
- if (!this.signal?.aborted && !this.isStopped) {
608
- let stepRetryCount = 0;
609
- let stepSuccess = false;
610
- let lastErrorMsg = '';
611
-
612
- this.logger.debug(`[GUIAgent] Executing action: ${actionType}, loopCnt: ${loopCnt}`);
613
-
614
- while (stepRetryCount < MAX_STEP_RETRIES && !stepSuccess) {
615
- try {
616
- const executeResult = await this.operator.doExecute({
617
- prediction,
618
- parsedPrediction,
619
- screenWidth: screenContext.width,
620
- screenHeight: screenContext.height,
621
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
622
- factors: [1000, 1000], // Default factors
623
- });
624
-
625
- if (executeResult.status === 'end') {
626
- // 'finished' action or explicit end
627
- stepSuccess = true;
628
- break;
629
- }
630
-
631
- // Any other status (success, failed, etc.) is considered success
632
- stepSuccess = true;
633
- break;
634
- } catch (executeError) {
635
- stepRetryCount++;
636
- lastErrorMsg = executeError instanceof Error ? executeError.message : 'Unknown error';
637
- this.logger.warn(`[GUIAgent] Action failed ${stepRetryCount}/${MAX_STEP_RETRIES}: ${lastErrorMsg}`);
638
-
639
- if (stepRetryCount < MAX_STEP_RETRIES) {
640
- await sleep(1000);
641
- // Take new screenshot for retry
642
- const retrySnapshot = await this.operator.doScreenshot();
643
- if (retrySnapshot?.base64) {
644
- data.conversations.push({
645
- from: 'human',
646
- value: IMAGE_PLACEHOLDER,
647
- screenshotBase64: retrySnapshot.base64,
648
- screenshotContext: {
649
- size: {
650
- width: screenContext.width,
651
- height: screenContext.height,
652
- },
653
- scaleFactor: retrySnapshot.scaleFactor ?? screenContext.scaleFactor,
654
- },
655
- });
656
- }
657
- }
658
- }
659
- }
660
-
661
- if (!stepSuccess) {
662
- // All retries exhausted
663
- this.logger.error(`[GUIAgent] Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`);
664
- data.status = GUIAgentStatus.ERROR;
665
- data.error = `Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`;
666
- break;
667
- }
668
- }
669
-
670
- // Check abort immediately after action execution
671
- if (this.signal?.aborted) {
672
- data.status = GUIAgentStatus.USER_STOPPED;
673
- break;
674
- }
675
-
676
- // Handle special action types
677
- if (actionType === 'call_user') {
678
- data.status = GUIAgentStatus.CALL_USER;
679
- break;
680
- } else if (actionType === 'finished') {
681
- data.status = GUIAgentStatus.END;
682
- break;
683
- }
684
- }
685
-
686
- // Check abort after action loop
687
- if (this.signal?.aborted) {
688
- data.status = GUIAgentStatus.USER_STOPPED;
689
- break;
690
- }
691
-
692
- // Wait between iterations
693
- if (this.loopIntervalInMs > 0) {
694
- await sleep(this.loopIntervalInMs);
695
- }
696
- }
697
- } catch (error) {
698
- this.logger.error('[GUIAgent] Catch error', error);
699
- if (
700
- error instanceof Error &&
701
- (error.name === 'AbortError' || error.message?.includes('aborted'))
702
- ) {
703
- data.status = GUIAgentStatus.USER_STOPPED;
704
- } else {
705
- data.status = GUIAgentStatus.ERROR;
706
- data.error = error instanceof Error ? error.message : 'Unknown error';
707
- }
708
- } finally {
709
- // Save final status
710
- const finalStatus = data.status;
711
- const finalError = data.error;
712
-
713
- // Output error immediately if task failed
714
- if (finalStatus === GUIAgentStatus.ERROR && finalError) {
715
- console.log(`\n${colors.error('✖')} ${finalError}\n`);
716
- }
717
-
718
- // Call onData callback if set
719
- // Note: Use Promise.resolve().then() to avoid modifying data in callback
720
- const onDataCallback = this.onData;
721
- if (onDataCallback) {
722
- Promise.resolve().then(() => onDataCallback({ ...data, conversations: [] }));
723
- }
724
-
725
- // Call onError callback if status is error
726
- if (finalStatus === GUIAgentStatus.ERROR && this.onError) {
727
- this.onError(new Error(finalError || 'Unknown error occurred'));
728
- }
729
-
730
- if (this.showAIDebugInfo) {
731
- this.logger.debug('[GUIAgent] Final status:', {
732
- status: finalStatus,
733
- loopCnt,
734
- totalConversations: data.conversations.length,
735
- });
736
- }
737
-
738
- // Ensure the returned status is correct (reassign)
739
- this.logger.debug(`[GUIAgent] Finally: finalStatus=${finalStatus}, finalError=${finalError}, data.status=${data.status}, data.error=${data.error}`);
740
-
741
- // Log final status (only visible when showAIDebugInfo is enabled)
742
- this.logger.debug(`[GUIAgent] Final status: ${finalStatus}${finalError ? `, Error: ${finalError}` : ''}, Steps: ${loopCnt}`);
743
-
744
- data.status = finalStatus;
745
- data.error = finalError;
746
- }
747
-
748
- return data;
749
- }
750
-
751
/**
 * Convert the conversation history into a chat-completions style message list.
 *
 * The system prompt is always the first entry. A human turn that carries a
 * screenshot becomes a multi-part user message (text + PNG data URL), an
 * assistant turn becomes an `assistant` message, and every other turn is sent
 * as a plain-text user message.
 *
 * @param conversations - Conversation turns in chronological order.
 * @param systemPrompt - Text placed in the leading `system` message.
 * @returns The message array to send to the model.
 */
private buildModelMessages(conversations: Conversation[], systemPrompt: string): any[] {
  const history = conversations.map((turn) => {
    if (turn.from === 'human' && turn.screenshotBase64) {
      return {
        role: 'user',
        content: [
          { type: 'text', text: turn.value },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/png;base64,${turn.screenshotBase64}`,
              detail: 'high',
            },
          },
        ],
      };
    }

    const role = turn.from === 'assistant' ? 'assistant' : 'user';
    return { role, content: turn.value };
  });

  return [{ role: 'system', content: systemPrompt }, ...history];
}
794
-
795
/**
 * Pull the image payload and text prompt out of the last message.
 *
 * Only the final entry of `messages` is inspected, and only when its content
 * is an array of blocks. A `data:image...` URL is reduced to the part after
 * the first comma; any other URL is returned unchanged.
 *
 * @param messages - Message list whose last entry is examined.
 * @returns `image` and `prompt`, each an empty string when not found.
 */
private extractImageAndPrompt(messages: any[]): { image: string; prompt: string } {
  const tail = messages[messages.length - 1];
  if (!tail || !Array.isArray(tail.content)) {
    return { image: '', prompt: '' };
  }

  const imagePart = tail.content.find((part: any) => part.type === 'image_url');
  const textPart = tail.content.find((part: any) => part.type === 'text');

  let image = '';
  if (imagePart) {
    const url = imagePart.image_url?.url || '';
    image = url.startsWith('data:image') ? url.split(',')[1] || '' : url;
  }

  return { image, prompt: textPart?.text || '' };
}
820
-
821
/**
 * Print a boxed, truncated dump of an outgoing model request to the console.
 *
 * Shows the model and base URL (taken from the remote caller's `info` when a
 * caller is supplied), the message count, the system prompt (first 15 lines)
 * and every non-system message (first 20 lines of string content, or the
 * block count / image size / first 10 text lines of multi-part content).
 * Each printed line is clipped to 62 characters.
 *
 * @param messages - Messages about to be sent.
 * @param remoteVlmCaller - Remote caller, when the request goes through one.
 */
private debugRequest(messages: any[], remoteVlmCaller?: RemoteVlmCaller): void {
  const BOX_TOP = '┌─────────────────────────────────────────────────────────────┐';
  const BOX_SPLIT = '├─────────────────────────────────────────────────────────────┤';
  const BOX_BOTTOM = '└─────────────────────────────────────────────────────────────┘';

  // Prints the first `maxLines` lines of `text` and reports the total line count.
  const printClipped = (text: string, maxLines: number): number => {
    const rows = text.split('\n');
    for (const row of rows.slice(0, maxLines)) {
      console.log('│ ' + row.slice(0, 62));
    }
    return rows.length;
  };

  const remoteInfo = remoteVlmCaller ? (remoteVlmCaller as any).info : undefined;
  const modelLabel = remoteVlmCaller ? remoteInfo?.model || 'remote' : this.model;
  const baseUrlLabel = remoteVlmCaller
    ? remoteInfo?.baseUrl || 'remote'
    : this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1';

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ GUI MODEL REQUEST DEBUG ║');
  console.log('╚══════════════════════════════════════════════════════════╝');
  console.log(`📦 Model: ${modelLabel}`);
  console.log(`🌐 Base URL: ${baseUrlLabel}`);
  console.log(`💬 Messages: ${messages.length}`);

  // System prompt, if any
  const systemMsg = messages.find((m: any) => m.role === 'system');
  if (systemMsg) {
    console.log('\n' + BOX_TOP);
    console.log('│ 🟫 SYSTEM │');
    console.log(BOX_SPLIT);
    const systemText =
      typeof systemMsg.content === 'string' ? systemMsg.content : JSON.stringify(systemMsg.content);
    if (printClipped(systemText, 15) > 15) {
      console.log('│ ... (truncated)');
    }
    console.log(BOX_BOTTOM);
  }

  // Remaining conversation messages
  const roleLabels: Record<string, string> = {
    user: '👤 USER',
    assistant: '🤖 ASSISTANT',
  };

  messages.forEach((msg: any, index: number) => {
    if (msg.role === 'system') {
      return;
    }

    const roleLabel = roleLabels[msg.role] || `● ${msg.role.toUpperCase()}`;
    console.log(`\n${BOX_TOP}`);
    console.log(`│ ${roleLabel} (${index + 1}) │`);
    console.log(BOX_SPLIT);

    if (typeof msg.content === 'string') {
      if (printClipped(msg.content, 20) > 20) {
        console.log('│ ... (truncated)');
      }
    } else if (Array.isArray(msg.content)) {
      console.log('│ 📎 Content blocks: ' + msg.content.length);
      const imagePart = msg.content.find((c: any) => c.type === 'image_url');
      if (imagePart) {
        const imageSize = imagePart?.image_url?.url?.length || 0;
        console.log('│ 🖼️ Image size: ' + (imageSize / 1024).toFixed(2) + ' KB');
      }
      const textPart = msg.content.find((c: any) => c.type === 'text');
      if (textPart?.text) {
        printClipped(textPart.text, 10);
      }
    }
    console.log(BOX_BOTTOM);
  });

  console.log('\n📤 Sending request to model API...\n');
}
895
-
896
/**
 * Print a boxed, truncated dump of a model response to the console.
 *
 * Token usage is shown when `usage` is provided. At most the first 30 lines
 * of the content are printed, each clipped to 62 characters, followed by a
 * count of the lines that were left out.
 *
 * @param content - Raw text returned by the model.
 * @param usage - Optional usage object with `prompt_tokens`,
 *   `completion_tokens` and `total_tokens`.
 */
private debugResponse(content: string, usage?: any): void {
  const MAX_LINES = 30;
  const INNER_RULE = '│ ───────────────────────────────────────────────────────────';

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ GUI MODEL RESPONSE DEBUG ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  if (usage) {
    console.log(`📊 Tokens: ${usage.prompt_tokens} (prompt) + ${usage.completion_tokens} (completion) = ${usage.total_tokens} (total)`);
  }

  console.log('\n┌─────────────────────────────────────────────────────────────┐');
  console.log('🤖 ASSISTANT');
  console.log('├─────────────────────────────────────────────────────────────┤');
  console.log('│ 💬 CONTENT:');
  console.log(INNER_RULE);

  const rows = content.split('\n');
  rows.slice(0, MAX_LINES).forEach((row) => {
    console.log('│ ' + row.slice(0, 62));
  });
  if (rows.length > MAX_LINES) {
    console.log(`│ ... (${rows.length - MAX_LINES} more lines)`);
  }
  console.log(INNER_RULE);
  console.log('└─────────────────────────────────────────────────────────────┘');

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ RESPONSE ENDED ║');
  console.log('╚══════════════════════════════════════════════════════════╝\n');
}
928
-
929
/**
 * Call the locally configured VLM through its `/chat/completions` endpoint.
 *
 * The base URL and API key come from the agent configuration, falling back to
 * the `MODEL_BASE_URL` / `MODEL_API_KEY` environment variables.
 *
 * @param messages - Chat messages to send.
 * @param screenContext - Screen dimensions handed to the action parser.
 * @returns The raw model text and the actions parsed from it.
 * @throws Error when the endpoint answers with a non-2xx status; the message
 *   includes the HTTP status so callers can distinguish 401/404/429 failures.
 *   Rejections from `fetch` itself (network failure, abort) propagate unchanged.
 */
private async callLocalVLM(
  messages: any[],
  screenContext: ScreenContext
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  const baseUrl = this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1';
  const apiKey = this.modelApiKey || process.env.MODEL_API_KEY || '';

  const requestBody = {
    model: this.model,
    messages,
    max_tokens: 1024,
    temperature: 0.1,
  };

  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages);
  }

  // Strip trailing slashes so a base URL such as ".../v1/" does not yield "//chat/completions".
  const endpoint = `${baseUrl.replace(/\/+$/, '')}/chat/completions`;

  // No try/catch here: fetch errors (including AbortError) are meant to reach the caller as-is.
  const response = await fetch(endpoint, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(requestBody),
    signal: this.signal,
  });

  // Handle non-2xx responses
  if (!response.ok) {
    // Reading the body is best-effort; the status line alone is still useful.
    const errorText = await response.text().catch(() => '');
    const statusLine = response.statusText
      ? `${response.status} ${response.statusText}`
      : `${response.status}`;
    throw new Error(`Model API error (HTTP ${statusLine}): ${errorText}`);
  }

  const result = await response.json() as { choices?: Array<{ message?: { content?: string } }>; usage?: any };
  const content = result.choices?.[0]?.message?.content || '';

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(content, result.usage);
  }

  const { parsed: parsedPredictions } = actionParser({
    prediction: content,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });

  return {
    prediction: content,
    parsedPredictions,
  };
}
994
-
995
/**
 * Dispatch one model call and parse the resulting actions.
 *
 * Local mode delegates to `callLocalVLM`. Remote mode passes the full message
 * list and the system prompt to the injected `remoteVlmCaller`, then parses
 * the returned text with the same action parser settings as local mode.
 *
 * @param messages - Chat messages to send.
 * @param screenContext - Screen dimensions handed to the action parser.
 * @param remoteVlmCaller - Injected caller used when not in local mode.
 * @returns The raw model text and the actions parsed from it.
 */
private async callModelAPI(
  messages: any[],
  screenContext: ScreenContext,
  remoteVlmCaller: RemoteVlmCaller
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  // === LOCAL mode ===
  if (this.isLocalMode) {
    return this.callLocalVLM(messages, screenContext);
  }

  // === REMOTE mode ===
  const debugEnabled = this.showAIDebugInfo;
  if (debugEnabled) {
    this.debugRequest(messages, remoteVlmCaller);
  }

  // The injected caller receives the same full message list local mode would send.
  const prediction = await remoteVlmCaller(messages, this.systemPrompt);

  if (this.showAIDebugInfo) {
    this.debugResponse(prediction);
  }

  const { width, height } = screenContext;
  const parsedResult = actionParser({
    prediction,
    factor: [1000, 1000],
    screenContext: { width, height },
  });

  return { prediction, parsedPredictions: parsedResult.parsed };
}
1040
-
1041
/**
 * Derive a short summary from a model prediction.
 *
 * @param prediction - Raw model output.
 * @returns The trimmed text following the first `Action:` marker
 *   (case-insensitive); when no marker is present, the first 200 characters
 *   of the prediction.
 */
private getSummary(prediction: string): string {
  const found = /Action[::]\s*([\s\S]+)$/i.exec(prediction);
  return found ? found[1].trim() : prediction.slice(0, 200);
}
1052
-
1053
/**
 * Request that the run loop pause at its next pause check.
 *
 * Creates the promise that `resume()` later resolves. Calling `pause()` again
 * while already paused is a no-op: replacing the pending promise would leave
 * a loop that is awaiting the earlier one blocked forever, because `resume()`
 * can only resolve the most recent promise.
 */
pause(): void {
  if (this.isPaused && this.resumePromise) {
    return;
  }
  this.isPaused = true;
  this.resumePromise = new Promise<void>((resolve) => {
    this.resolveResume = resolve;
  });
}
1059
-
1060
/**
 * Release a pending pause, if any, and clear the paused flag.
 *
 * Resolves the promise created by `pause()` and drops both the promise and
 * its resolver. Safe to call when the agent is not paused.
 */
resume(): void {
  const release = this.resolveResume;
  if (release) {
    release();
    this.resumePromise = null;
    this.resolveResume = null;
  }
  this.isPaused = false;
}
1068
-
1069
/**
 * Flag the agent as stopped. This method only sets the flag.
 */
stop(): void {
  this.isStopped = true;
}
1072
-
1073
/**
 * Release operator resources and detach the cancellation listener.
 *
 * The listener is removed in a `finally` block so that a failing
 * `operator.cleanup()` cannot leave the `'cancelled'` handler attached to the
 * cancellation manager. Any error from the operator still propagates.
 */
async cleanup(): Promise<void> {
  this.logger.debug('Cleaning up GUI Agent...');
  try {
    await this.operator.cleanup();
  } finally {
    // Both values are stored on the instance under untyped names, hence the `any` view.
    const self = this as any;
    const cancelHandler = self._cancelHandler;
    const cancellationManager = self._cancellationManager;
    if (cancelHandler && cancellationManager) {
      cancellationManager.off('cancelled', cancelHandler);
      self._cancelHandler = undefined;
      self._cancellationManager = undefined;
    }
  }
}
1086
- }
1087
-
1088
- export { GUIAgentStatus as StatusEnum };
1089
-
1
+ /**
2
+ * GUI Agent for xagent
3
+ * Orchestrates desktop automation with AI-powered action execution
4
+ * Based on UI-TARS architecture with computer control only
5
+ *
6
+ * This implementation is aligned with packages/ui-tars/sdk/src/GUIAgent.ts
7
+ */
8
+
9
+ import type {
10
+ ScreenContext,
11
+ ScreenshotOutput,
12
+ ExecuteParams,
13
+ ExecuteOutput,
14
+ PredictionParsed,
15
+ } from '../types/operator.js';
16
+ import type { Operator } from '../operator/base-operator.js';
17
+ import { sleep, asyncRetry } from '../utils.js';
18
+ import { actionParser } from '../action-parser/index.js';
19
+ import { colors, icons, renderMarkdown } from '../../theme.js';
20
+ import { getLogger } from '../../logger.js';
21
+
22
/**
 * Shorten text to at most `maxLength` characters, appending `...` when cut.
 *
 * @param text - Text to shorten; a falsy value yields an empty string.
 * @param maxLength - Maximum number of characters kept before the ellipsis.
 * @returns The original text, or its first `maxLength` characters plus `...`.
 */
function truncateText(text: string, maxLength: number = 200): string {
  if (!text) {
    return '';
  }
  if (text.length > maxLength) {
    return `${text.substring(0, maxLength)}...`;
  }
  return text;
}
29
+
30
/**
 * Prefix every line of a multi-line string with the given indent.
 *
 * @param text - Text whose lines are separated by `\n`.
 * @param indent - String prepended to each line, including empty ones.
 * @returns The indented text, joined with `\n`.
 */
function indentMultiline(text: string, indent: string): string {
  const indented: string[] = [];
  for (const row of text.split('\n')) {
    indented.push(indent + row);
  }
  return indented.join('\n');
}
36
+
37
+ const guiLogger = getLogger();
38
+
39
/**
 * Lifecycle states reported by the GUI agent (UI-TARS status values).
 */
export enum GUIAgentStatus {
  /** Initial state, set when the run data is created. */
  INIT = 'init',
  /** The agent loop is running. */
  RUNNING = 'running',
  /** The loop is paused. */
  PAUSE = 'paused',
  END = 'end',
  /** The run stopped on a failure. */
  ERROR = 'error',
  /** The run was stopped on request or through the abort signal. */
  USER_STOPPED = 'user_stopped',
  CALL_LLM = 'call_llm',
}
49
+
50
/**
 * Signature of an externally injected VLM caller.
 *
 * Injecting this callback lets the GUI agent use a remote VLM without knowing
 * how the call is made. It receives the same full message array that local
 * mode sends.
 *
 * @param messages - Full messages array.
 * @param systemPrompt - System prompt (for reference).
 * @param taskId - Task identifier for backend tracking.
 * @param isFirstVlmCallRef - Mutable ref used to track and update the
 *   first-VLM-call state.
 * @returns The raw text produced by the model.
 */
export type RemoteVlmCaller = (
  messages: any[],
  systemPrompt: string,
  taskId: string,
  isFirstVlmCallRef: { current: boolean }
) => Promise<string>;
60
+
61
/** Retry settings shared by the screenshot, model and execute stages. */
interface GUIAgentStageRetry {
  maxRetries?: number;
  onRetry?: (e: Error) => void;
}

/**
 * Construction options for {@link GUIAgent}.
 */
export interface GUIAgentConfig<T extends Operator> {
  /** Operator that performs screenshots and actions. */
  operator: T;
  /** Model name used in local mode. */
  model?: string;
  /** Base URL of the model API used in local mode. */
  modelBaseUrl?: string;
  /** API key for the model API used in local mode. */
  modelApiKey?: string;
  /**
   * Task identifier for VLM state tracking (begin vs continue)
   */
  taskId?: string;
  /**
   * Shared ref object to track first VLM call across createGUISubAgent calls
   * Must be passed from outside to properly track VLM status across loop iterations
   */
  isFirstVlmCallRef?: { current: boolean };
  /**
   * Externally injected VLM caller function
   * If this function is provided, GUI Agent will use it to call VLM
   * instead of directly calling modelBaseUrl/modelApiKey
   * This allows GUI Agent to work with remote services without exposing any configuration
   */
  remoteVlmCaller?: RemoteVlmCaller;
  /**
   * Whether to use local mode
   * If true, use model/modelBaseUrl/modelApiKey for VLM calls
   * If false, use remoteVlmCaller for remote VLM calls
   */
  isLocalMode: boolean;
  /** Overrides the built-in system prompt when provided. */
  systemPrompt?: string;
  loopIntervalInMs?: number;
  /** Upper bound on loop iterations before the run ends with an error. */
  maxLoopCount?: number;
  logger?: any;
  /** Abort signal checked between steps. */
  signal?: AbortSignal;
  /** Called with status and conversation updates during a run. */
  onData?: (data: GUIAgentData) => void;
  onError?: (error: Error) => void;
  /** Enables verbose debug output. */
  showAIDebugInfo?: boolean;
  /** Indentation depth used for console output. */
  indentLevel?: number;
  /** Per-stage retry configuration. */
  retry?: {
    screenshot?: GUIAgentStageRetry;
    model?: GUIAgentStageRetry;
    execute?: GUIAgentStageRetry;
  };
}
112
+
113
/**
 * Snapshot of a GUI agent run, returned by `run()` and passed to `onData`.
 */
export interface GUIAgentData {
  /** Current lifecycle state of the run. */
  status: GUIAgentStatus;
  /** Conversation turns included in this snapshot. */
  conversations: Conversation[];
  /** Human-readable failure description, set when the run ends in error. */
  error?: string;
  /** System prompt used for the run. */
  systemPrompt?: string;
}
119
+
120
/**
 * One turn of the agent conversation: an instruction or screenshot from the
 * "human" side, or a model response from the assistant side.
 */
export interface Conversation {
  /** Which side produced this turn. */
  from: 'human' | 'assistant';
  /** Text of the turn (instruction, image placeholder, or model output). */
  value: string;
  /** Base64 screenshot data attached to a human turn. */
  screenshotBase64?: string;
  /** Screen geometry recorded alongside the screenshot. */
  screenshotContext?: {
    size: { width: number; height: number };
    mime?: string;
    scaleFactor: number;
  };
  actionType?: string;
  actionInputs?: Record<string, any>;
  /** Start/end timestamps and elapsed time (`cost`) for this turn. */
  timing?: {
    start: number;
    end: number;
    cost: number;
  };
  /** Actions parsed from an assistant turn. */
  predictionParsed?: PredictionParsed[];
}
138
+
139
// UI-TARS constants (aligned with @ui-tars/shared/constants)

/** Default cap on agent loop iterations. */
const MAX_LOOP_COUNT = 100;
/** Number of screenshot failures after which the run is aborted. */
const MAX_SNAPSHOT_ERR_CNT = 5;
/** Max retries for a single action step before giving up. */
const MAX_STEP_RETRIES = 3;
/** Text stored as the value of a screenshot conversation turn. */
const IMAGE_PLACEHOLDER = '{{IMG_PLACEHOLDER_0}}';
144
+
145
+ export class GUIAgent<T extends Operator> {
146
+ private operator: T;
147
+ private readonly model: string;
148
+ private readonly modelBaseUrl: string;
149
+ private readonly modelApiKey: string;
150
+ private readonly taskId: string;
151
+ private readonly isFirstVlmCallRef?: { current: boolean };
152
+ private readonly remoteVlmCaller?: RemoteVlmCaller;
153
+ private readonly isLocalMode: boolean;
154
+ private readonly systemPrompt: string;
155
+ private readonly loopIntervalInMs: number;
156
+ private readonly maxLoopCount: number;
157
+ private readonly logger: Console;
158
+ private readonly signal?: AbortSignal;
159
+ private readonly onData?: (data: GUIAgentData) => void;
160
+ private readonly onError?: (error: Error) => void;
161
+ private readonly showAIDebugInfo: boolean;
162
+ private readonly indentLevel: number;
163
+ private readonly retry?: GUIAgentConfig<T>['retry'];
164
+
165
+ private isPaused = false;
166
+ private resumePromise: Promise<void> | null = null;
167
+ private resolveResume: (() => void) | null = null;
168
+ private isStopped = false;
169
+ private isFirstVlmCall = true;
170
+
171
/**
 * Create an agent from its configuration, applying defaults.
 *
 * Missing model settings default to empty strings, a missing task id is
 * replaced by a random UUID, and the system prompt falls back to the built-in
 * one when none is supplied.
 *
 * @param config - Agent options; see `GUIAgentConfig`.
 */
constructor(config: GUIAgentConfig<T>) {
  const {
    operator,
    isFirstVlmCallRef,
    remoteVlmCaller,
    isLocalMode,
    signal,
    onData,
    onError,
    retry,
  } = config;

  // Collaborators and callbacks are taken over as-is.
  this.operator = operator;
  this.isFirstVlmCallRef = isFirstVlmCallRef;
  this.remoteVlmCaller = remoteVlmCaller;
  this.isLocalMode = isLocalMode;
  this.signal = signal;
  this.onData = onData;
  this.onError = onError;
  this.retry = retry;

  // Values with defaults.
  this.model = config.model || '';
  this.modelBaseUrl = config.modelBaseUrl || '';
  this.modelApiKey = config.modelApiKey || '';
  this.taskId = config.taskId || crypto.randomUUID();
  this.loopIntervalInMs = config.loopIntervalInMs || 0;
  this.maxLoopCount = config.maxLoopCount || MAX_LOOP_COUNT;
  this.logger = config.logger || guiLogger;
  this.showAIDebugInfo = config.showAIDebugInfo ?? false;
  this.indentLevel = config.indentLevel ?? 1;

  this.systemPrompt = config.systemPrompt || this.buildSystemPrompt();
}
192
+
193
/**
 * Update the flag that records whether the next VLM call is the first one.
 * Intended to be called by external code once `remoteVlmCaller` has completed
 * its first call.
 *
 * @param value - New value of the flag.
 */
public setIsFirstVlmCall(value: boolean): void {
  this.isFirstVlmCall = value;
}
200
+
201
/**
 * Print a compact console line for one conversation turn.
 *
 * Assistant turns print the step number, the first parsed action type (or
 * `action`) and the elapsed time; with debug output enabled the action text
 * (everything after `Thought: ... Action:`) follows on the next line, cut to
 * 60 characters. Human turns with a screenshot print a short marker only when
 * debug output is enabled. Other turns print nothing.
 *
 * @param conversation - Turn to display.
 * @param iteration - Step number shown in brackets.
 * @param indentLevel - Indentation depth of the output.
 */
private displayConversationResult(conversation: Conversation, iteration: number, indentLevel: number = 1): void {
  const indent = ' '.repeat(indentLevel);

  if (conversation.from === 'assistant') {
    const timing = conversation.timing;
    const actionType = conversation.predictionParsed?.[0]?.action_type || 'action';

    console.log(`${indent}${colors.primaryBright(`[${iteration}]`)} ${colors.textMuted(actionType)}${timing ? colors.textDim(` (${timing.cost}ms)`) : ''}`);

    // Action details are only shown in verbose mode, so only compute them then.
    if (this.showAIDebugInfo) {
      const actionSummary = (conversation.value || '').replace(/Thought:[\s\S]*?Action:\s*/i, '').trim();
      if (actionSummary) {
        const innerIndent = ' '.repeat(indentLevel + 1);
        console.log(`${innerIndent}${colors.textMuted(truncateText(actionSummary, 60))}`);
      }
    }
    return;
  }

  // Minimal indicator for a screenshot turn
  if (conversation.from === 'human' && conversation.screenshotBase64 && this.showAIDebugInfo) {
    const timing = conversation.timing;
    console.log(`${indent}${colors.textMuted(`${icons.loading} screenshot${timing ? ` (${timing.cost}ms)` : ''}`)}`);
  }
}
233
+
234
/**
 * Print a one-line console message for the current run status.
 *
 * RUNNING prints the step number, ERROR prints the error text when there is
 * one, USER_STOPPED prints a stop notice. END is left to the caller and all
 * other states print nothing.
 *
 * @param data - Run data whose `status` (and `error`) is displayed.
 * @param iteration - Step number shown for the RUNNING state.
 * @param indentLevel - Indentation depth of the output.
 */
private displayStatus(data: GUIAgentData, iteration: number, indentLevel: number = 1): void {
  const pad = ' '.repeat(indentLevel);
  const current = data.status;

  if (current === GUIAgentStatus.RUNNING) {
    console.log(`${pad}${colors.info(`${icons.loading} Step ${iteration}: Running...`)}`);
    return;
  }

  if (current === GUIAgentStatus.ERROR) {
    if (data.error) {
      console.log(`${pad}${colors.error(`${icons.cross} ${data.error}`)}`);
    }
    return;
  }

  if (current === GUIAgentStatus.USER_STOPPED) {
    console.log(`${pad}${colors.warning(`${icons.warning} Stopped`)}`);
  }
  // END is handled by the caller; remaining states have no output.
}
260
+
261
/**
 * Build the default system prompt describing the output format and the
 * action space available to the GUI model.
 *
 * Backslashes in the action-space notes are doubled on purpose: inside a
 * template literal `\'`, `\"` and `\n` are consumed by JavaScript, which
 * would send bare quotes and a real line break to the model instead of the
 * escape sequences the instructions talk about.
 *
 * NOTE(review): the `{language}` placeholder is returned verbatim by this
 * method; confirm that a later stage substitutes it before the prompt is sent.
 *
 * @returns The prompt text.
 */
private buildSystemPrompt(): string {
  return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.

## Output Format
\`
Thought: ...
Action: ...
\`

## Action Space
click(point='<point>x1 y1</point>')
left_double(point='<point>x1 y1</point>')
right_single(point='<point>x1 y1</point>')
drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
open_url(url='https://xxx') # Open URL in browser
wait() #Sleep for 5s and take a screenshot to check for any changes.
finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.




## Note
- Use {language} in \`Thought\` part.
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.

`;
}
291
+
292
+
293
+
294
/**
 * Initialize the underlying operator by awaiting its `doInitialize()`.
 * Any rejection from the operator propagates to the caller.
 */
async initialize(): Promise<void> {
  const { operator } = this;
  await operator.doInitialize();
}
297
+
298
+ /**
299
+ * Run the GUI agent with a single instruction (UI-TARS style)
300
+ * All operations are determined by the GUI model
301
+ */
302
+ async run(instruction: string): Promise<GUIAgentData> {
303
+ const data: GUIAgentData = {
304
+ status: GUIAgentStatus.INIT,
305
+ conversations: [
306
+ {
307
+ from: 'human',
308
+ value: instruction,
309
+ timing: {
310
+ start: Date.now(),
311
+ end: Date.now(),
312
+ cost: 0,
313
+ },
314
+ },
315
+ ],
316
+ };
317
+
318
+ // Initialize operator for initial screenshot
319
+ try {
320
+ await this.operator.doInitialize();
321
+ } catch (initError) {
322
+ const errorMsg = initError instanceof Error ? initError.message : 'Unknown error';
323
+ this.logger.error(`[GUIAgent] Failed to initialize operator: ${errorMsg}`);
324
+
325
+ // Check if it's an RDP-related issue
326
+ if (errorMsg.includes('screen') || errorMsg.includes('capture') || errorMsg.includes('display')) {
327
+ data.status = GUIAgentStatus.ERROR;
328
+ data.error = 'Failed to initialize screen capture. This may be caused by:\n' +
329
+ ' 1. Remote Desktop session disconnected or minimized\n' +
330
+ ' 2. Display driver issues\n' +
331
+ 'Suggestion: Ensure your display is active and try again.';
332
+ } else {
333
+ data.status = GUIAgentStatus.ERROR;
334
+ data.error = `Failed to initialize operator: ${errorMsg}`;
335
+ }
336
+ return data;
337
+ }
338
+
339
+ const currentTime = Date.now();
340
+
341
+ if (this.showAIDebugInfo) {
342
+ this.logger.debug('[GUIAgent] run:', {
343
+ systemPrompt: this.systemPrompt,
344
+ model: this.model,
345
+ maxLoopCount: this.maxLoopCount,
346
+ });
347
+ }
348
+
349
+ let loopCnt = 0;
350
+ let snapshotErrCnt = 0;
351
+
352
+ // Start running agent
353
+ data.status = GUIAgentStatus.RUNNING;
354
+ data.systemPrompt = this.systemPrompt;
355
+ const indent = ' '.repeat(this.indentLevel);
356
+ console.log(`${indent}${colors.primaryBright(`${icons.rocket} GUI Agent started`)}`);
357
+ console.log('');
358
+ await this.onData?.({ ...data, conversations: [] });
359
+
360
+ try {
361
+ // eslint-disable-next-line no-constant-condition
362
+ while (true) {
363
+ if (this.showAIDebugInfo) {
364
+ this.logger.debug('[GUIAgent] loopCnt:', loopCnt);
365
+ }
366
+
367
+ // Check pause status
368
+ if (this.isPaused && this.resumePromise) {
369
+ data.status = GUIAgentStatus.PAUSE;
370
+ await this.onData?.({ ...data, conversations: [] });
371
+ await this.resumePromise;
372
+ data.status = GUIAgentStatus.RUNNING;
373
+ await this.onData?.({ ...data, conversations: [] });
374
+ }
375
+
376
+ // Check stop or aborted status
377
+ if (
378
+ this.isStopped ||
379
+ data.status !== GUIAgentStatus.RUNNING ||
380
+ this.signal?.aborted
381
+ ) {
382
+ if (this.signal?.aborted) {
383
+ data.status = GUIAgentStatus.USER_STOPPED;
384
+ }
385
+ break;
386
+ }
387
+
388
+ // Check loop limit
389
+ if (loopCnt >= this.maxLoopCount) {
390
+ data.status = GUIAgentStatus.ERROR;
391
+ data.error = `Has reached max loop count: ${loopCnt}`;
392
+ break;
393
+ }
394
+
395
+ // Check screenshot error limit
396
+ if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
397
+ data.status = GUIAgentStatus.ERROR;
398
+ data.error = 'Screenshot failed too many times. Stopping task.';
399
+ break;
400
+ }
401
+
402
+ loopCnt += 1;
403
+ const start = Date.now();
404
+
405
+ // Take screenshot (single attempt - no retry to avoid infinite loops)
406
+ let snapshot: ScreenshotOutput;
407
+ try {
408
+ snapshot = await this.operator.doScreenshot();
409
+ } catch (screenshotError) {
410
+ const errorMsg = screenshotError instanceof Error ? screenshotError.message : 'Unknown error';
411
+ this.logger.warn(`[GUIAgent] Screenshot exception: ${errorMsg}`);
412
+ snapshotErrCnt += 1;
413
+ data.status = GUIAgentStatus.ERROR;
414
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
415
+ this.logger.error(`[GUIAgent] ${data.error}`);
416
+ await sleep(1000);
417
+ break;
418
+ }
419
+
420
+ // Check if screenshot returned failure status
421
+ if (snapshot.status === 'failed') {
422
+ const errorMsg = snapshot.errorMessage || 'Unknown error';
423
+ this.logger.warn(`[GUIAgent] Screenshot failed: ${errorMsg}`);
424
+ snapshotErrCnt += 1;
425
+ data.status = GUIAgentStatus.ERROR;
426
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
427
+ this.logger.error(`[GUIAgent] ${data.error}`);
428
+ await sleep(1000);
429
+ break;
430
+ }
431
+
432
+ // Check abort immediately after screenshot
433
+ if (this.signal?.aborted) {
434
+ data.status = GUIAgentStatus.USER_STOPPED;
435
+ break;
436
+ }
437
+
438
+ // Validate screenshot
439
+ const isValidImage = !!(snapshot?.base64);
440
+ if (!isValidImage) {
441
+ snapshotErrCnt += 1;
442
+ data.status = GUIAgentStatus.ERROR;
443
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
444
+ this.logger.error(`[GUIAgent] ${data.error}`);
445
+ await sleep(1000);
446
+ break;
447
+ }
448
+
449
+ // Reset error counter on successful screenshot
450
+ snapshotErrCnt = 0;
451
+
452
+ const end = Date.now();
453
+
454
+ // Get screen context
455
+ const screenContext = await this.operator.getScreenContext();
456
+
457
+ // Add screenshot to conversation
458
+ data.conversations.push({
459
+ from: 'human',
460
+ value: IMAGE_PLACEHOLDER,
461
+ screenshotBase64: snapshot.base64,
462
+ screenshotContext: {
463
+ size: {
464
+ width: screenContext.width,
465
+ height: screenContext.height,
466
+ },
467
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
468
+ },
469
+ timing: {
470
+ start,
471
+ end,
472
+ cost: end - start,
473
+ },
474
+ });
475
+
476
+ await this.onData?.({
477
+ ...data,
478
+ conversations: data.conversations.slice(-1),
479
+ });
480
+
481
+ // Display screenshot notification
482
+ const latestScreenshot = data.conversations[data.conversations.length - 1];
483
+ if (latestScreenshot && latestScreenshot.from === 'human' && latestScreenshot.screenshotBase64) {
484
+ this.displayConversationResult(latestScreenshot, loopCnt, this.indentLevel);
485
+ }
486
+
487
+ // Build messages for model
488
+ const messages = this.buildModelMessages(data.conversations, data.systemPrompt);
489
+
490
+ // Check abort before model call
491
+ if (this.signal?.aborted) {
492
+ data.status = GUIAgentStatus.USER_STOPPED;
493
+ break;
494
+ }
495
+
496
+ // Invoke model with retry
497
+ let prediction: string;
498
+ let parsedPredictions: PredictionParsed[];
499
+ try {
500
+ const modelResult: { prediction: string; parsedPredictions: PredictionParsed[] } = await asyncRetry(
501
+ async (bail) => {
502
+ try {
503
+ const result = await this.callModelAPI(messages, screenContext, this.remoteVlmCaller!);
504
+ return result;
505
+ } catch (error: unknown) {
506
+ const errorMsg = error instanceof Error ? error.message : String(error);
507
+ // 捕获各种 abort 相关的错误
508
+ if (
509
+ error instanceof Error &&
510
+ (error.name === 'AbortError' ||
511
+ errorMsg.includes('aborted') ||
512
+ errorMsg.includes('canceled') ||
513
+ errorMsg.includes('cancelled') ||
514
+ errorMsg === 'Operation was canceled' ||
515
+ errorMsg === 'The operation was canceled' ||
516
+ errorMsg === 'This operation was aborted')
517
+ ) {
518
+ bail(error as Error);
519
+ return { prediction: '', parsedPredictions: [] };
520
+ }
521
+ throw error;
522
+ }
523
+ },
524
+ {
525
+ retries: this.retry?.model?.maxRetries ?? 0,
526
+ minTimeout: 1000 * 30,
527
+ onRetry: this.retry?.model?.onRetry,
528
+ }
529
+ );
530
+ prediction = modelResult.prediction;
531
+ parsedPredictions = modelResult.parsedPredictions;
532
+ } catch (modelError) {
533
+ // 首先检查是否是取消/abort 错误
534
+ const errorMsg = modelError instanceof Error ? modelError.message : String(modelError);
535
+ const isAbortError =
536
+ modelError instanceof Error && (
537
+ modelError.name === 'AbortError' ||
538
+ errorMsg.includes('aborted') ||
539
+ errorMsg.includes('canceled') ||
540
+ errorMsg.includes('cancelled') ||
541
+ errorMsg === 'Operation was canceled' ||
542
+ errorMsg === 'The operation was canceled' ||
543
+ errorMsg === 'This operation was aborted'
544
+ );
545
+
546
+ if (isAbortError || this.signal?.aborted) {
547
+ data.status = GUIAgentStatus.USER_STOPPED;
548
+ data.conversations = data.conversations || [];
549
+ return data;
550
+ }
551
+
552
+ // Handle multimodal model API errors with specific error messages
553
+ data.status = GUIAgentStatus.ERROR;
554
+ if (errorMsg.includes('401') || errorMsg.includes('authentication') || errorMsg.includes('API key') || errorMsg.includes('api_key') || errorMsg.includes('Unauthorized') || errorMsg.includes('invalid_api_key')) {
555
+ data.error = '[Multimodal Model Authentication Failed] The guiSubagentApiKey configuration is invalid.\n' +
556
+ 'Error details: HTTP 401 - API key is invalid or expired\n' +
557
+ 'Suggested action: Please check the guiSubagentApiKey configuration in ~/.xagent/settings.json and ensure a valid API key is set';
558
+ } else if (errorMsg.includes('429') || errorMsg.includes('rate limit') || errorMsg.includes('too many requests')) {
559
+ data.error = '[Multimodal Model Rate Limit Exceeded] API requests exceed rate limit.\n' +
560
+ 'Error details: HTTP 429 - Too Many Requests\n' +
561
+ 'Suggested action: Please retry later, or check your API account quota settings. Wait a few minutes before retrying';
562
+ } else if (errorMsg.includes('network') || errorMsg.includes('fetch') || errorMsg.includes('connection') || errorMsg.includes('ECONNREFUSED')) {
563
+ data.error = '[Multimodal Model Network Error] Cannot connect to API service.\n' +
564
+ 'Error details: Network connection failed. Possible causes:\n' +
565
+ ' 1. Network connection is lost\n' +
566
+ ' 2. The guiSubagentBaseUrl configuration is incorrect\n' +
567
+ ' 3. API service endpoint is unreachable\n' +
568
+ 'Suggested action: Please check the guiSubagentBaseUrl configuration in ~/.xagent/settings.json and ensure network connectivity';
569
+ } else if (errorMsg.includes('404') || errorMsg.includes('not found') || errorMsg.includes('model not found') || errorMsg.includes('InvalidEndpointOrModel.NotFound')) {
570
+ // Extract model name
571
+ const modelMatch = errorMsg.match(/model[:\s]+([^\s,"]+)|"model[:"]+([^",}]+)/i);
572
+ const modelName = modelMatch ? (modelMatch[1] || modelMatch[2]) : 'Unknown';
573
+ data.error = '[Multimodal Model Configuration Error] The model specified in guiSubagentModel does not exist or is not accessible.\n' +
574
+ 'Error details: HTTP 404 - Model or Endpoint not found\n' +
575
+ 'Configured model name: ' + modelName + '\n' +
576
+ 'Suggested action: Please check the guiSubagentModel configuration in ~/.xagent/settings.json, remove or replace with a valid model name';
577
+ } else {
578
+ data.error = '[Multimodal Model API Call Failed]\n' +
579
+ 'Error details: ' + errorMsg + '\n' +
580
+ 'Please check the following configuration items:\n' +
581
+ ' - guiSubagentApiKey: API key\n' +
582
+ ' - guiSubagentBaseUrl: API service URL\n' +
583
+ ' - guiSubagentModel: Model name\n' +
584
+ 'Config file location: ~/.xagent/settings.json';
585
+ }
586
+ break;
587
+ }
588
+
589
+ // Check abort immediately after model call
590
+ if (this.signal?.aborted) {
591
+ data.status = GUIAgentStatus.USER_STOPPED;
592
+ break;
593
+ }
594
+
595
+ if (!prediction) {
596
+ this.logger.warn('[GUIAgent] Warning: Empty response from model, retrying...');
597
+ continue;
598
+ }
599
+
600
+ if (this.showAIDebugInfo) {
601
+ this.logger.debug('[GUIAgent] Response:', prediction);
602
+ this.logger.debug('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
603
+ }
604
+
605
+ const predictionSummary = this.getSummary(prediction);
606
+
607
+ data.conversations.push({
608
+ from: 'assistant',
609
+ value: predictionSummary,
610
+ timing: {
611
+ start,
612
+ end: Date.now(),
613
+ cost: Date.now() - start,
614
+ },
615
+ screenshotContext: {
616
+ size: {
617
+ width: screenContext.width,
618
+ height: screenContext.height,
619
+ },
620
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
621
+ },
622
+ predictionParsed: parsedPredictions,
623
+ });
624
+
625
+ await this.onData?.({
626
+ ...data,
627
+ conversations: data.conversations.slice(-1),
628
+ });
629
+
630
+ // Display assistant response
631
+ const latestAssistant = data.conversations[data.conversations.length - 1];
632
+ if (latestAssistant && latestAssistant.from === 'assistant') {
633
+ this.displayConversationResult(latestAssistant, loopCnt, this.indentLevel);
634
+ }
635
+
636
+ // Check if we need to switch operator based on first action
637
+ // Execute actions
638
+ for (const parsedPrediction of parsedPredictions) {
639
+ const actionType = parsedPrediction.action_type;
640
+
641
+ if (this.showAIDebugInfo) {
642
+ this.logger.debug('[GUIAgent] Action:', actionType);
643
+ }
644
+
645
+ // Handle internal action spaces
646
+ if (actionType === 'error_env') {
647
+ data.status = GUIAgentStatus.ERROR;
648
+ data.error = 'Environment error';
649
+ break;
650
+ } else if (actionType === 'max_loop') {
651
+ data.status = GUIAgentStatus.ERROR;
652
+ data.error = 'Reached max loop';
653
+ break;
654
+ }
655
+
656
+ // Execute action with retry
657
+ if (!this.signal?.aborted && !this.isStopped) {
658
+ let stepRetryCount = 0;
659
+ let stepSuccess = false;
660
+ let lastErrorMsg = '';
661
+
662
+ this.logger.debug(`[GUIAgent] Executing action: ${actionType}, loopCnt: ${loopCnt}`);
663
+
664
+ while (stepRetryCount < MAX_STEP_RETRIES && !stepSuccess) {
665
+ try {
666
+ const executeResult = await this.operator.doExecute({
667
+ prediction,
668
+ parsedPrediction,
669
+ screenWidth: screenContext.width,
670
+ screenHeight: screenContext.height,
671
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
672
+ factors: [1000, 1000], // Default factors
673
+ });
674
+
675
+ if (executeResult.status === 'end') {
676
+ // 'finished' action or explicit end
677
+ stepSuccess = true;
678
+ break;
679
+ } else if (executeResult.status === 'needs_input') {
680
+ // Empty action - return to main agent for re-calling LLM
681
+ this.logger.debug(`[GUIAgent] Empty action received, returning to main agent for LLM decision`);
682
+ data.status = GUIAgentStatus.CALL_LLM;
683
+ data.error = 'Empty action - main agent should re-call LLM to decide next step';
684
+ stepSuccess = true;
685
+ return data; // Return immediately with all results to main agent
686
+ }
687
+
688
+ // Any other status (success, failed, etc.) is considered success
689
+ stepSuccess = true;
690
+ break;
691
+ } catch (executeError) {
692
+ stepRetryCount++;
693
+ lastErrorMsg = executeError instanceof Error ? executeError.message : 'Unknown error';
694
+ this.logger.warn(`[GUIAgent] Action failed ${stepRetryCount}/${MAX_STEP_RETRIES}: ${lastErrorMsg}`);
695
+
696
+ if (stepRetryCount < MAX_STEP_RETRIES) {
697
+ await sleep(1000);
698
+ // Take new screenshot for retry
699
+ const retrySnapshot = await this.operator.doScreenshot();
700
+ if (retrySnapshot?.base64) {
701
+ data.conversations.push({
702
+ from: 'human',
703
+ value: IMAGE_PLACEHOLDER,
704
+ screenshotBase64: retrySnapshot.base64,
705
+ screenshotContext: {
706
+ size: {
707
+ width: screenContext.width,
708
+ height: screenContext.height,
709
+ },
710
+ scaleFactor: retrySnapshot.scaleFactor ?? screenContext.scaleFactor,
711
+ },
712
+ });
713
+ }
714
+ }
715
+ }
716
+ }
717
+
718
+ if (!stepSuccess) {
719
+ // All retries exhausted
720
+ this.logger.error(`[GUIAgent] Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`);
721
+ data.status = GUIAgentStatus.ERROR;
722
+ data.error = `Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`;
723
+ break;
724
+ }
725
+ }
726
+
727
+ // Check abort immediately after action execution
728
+ if (this.signal?.aborted) {
729
+ data.status = GUIAgentStatus.USER_STOPPED;
730
+ break;
731
+ }
732
+
733
+ // Handle special action types
734
+ if (actionType === 'finished') {
735
+ data.status = GUIAgentStatus.END;
736
+ break;
737
+ }
738
+ }
739
+
740
+ // Check abort after action loop
741
+ if (this.signal?.aborted) {
742
+ data.status = GUIAgentStatus.USER_STOPPED;
743
+ break;
744
+ }
745
+
746
+ // Wait between iterations
747
+ if (this.loopIntervalInMs > 0) {
748
+ await sleep(this.loopIntervalInMs);
749
+ }
750
+ }
751
+ } catch (error) {
752
+ this.logger.error('[GUIAgent] Catch error', error);
753
+ if (
754
+ error instanceof Error &&
755
+ (error.name === 'AbortError' || error.message?.includes('aborted'))
756
+ ) {
757
+ data.status = GUIAgentStatus.USER_STOPPED;
758
+ } else {
759
+ data.status = GUIAgentStatus.ERROR;
760
+ data.error = error instanceof Error ? error.message : 'Unknown error';
761
+ }
762
+ } finally {
763
+ // Save final status
764
+ const finalStatus = data.status;
765
+ const finalError = data.error;
766
+ const indent = ' '.repeat(this.indentLevel);
767
+
768
+ // Output error immediately if task failed
769
+ if (finalStatus === GUIAgentStatus.ERROR && finalError) {
770
+ console.log(`\n${indent}${colors.error('✖')} ${finalError}\n`);
771
+ }
772
+
773
+ // Call onData callback if set
774
+ // Note: Use Promise.resolve().then() to avoid modifying data in callback
775
+ const onDataCallback = this.onData;
776
+ if (onDataCallback) {
777
+ Promise.resolve().then(() => onDataCallback({ ...data, conversations: [] }));
778
+ }
779
+
780
+ // Call onError callback if status is error
781
+ if (finalStatus === GUIAgentStatus.ERROR && this.onError) {
782
+ this.onError(new Error(finalError || 'Unknown error occurred'));
783
+ }
784
+
785
+ if (this.showAIDebugInfo) {
786
+ this.logger.debug('[GUIAgent] Final status:', {
787
+ status: finalStatus,
788
+ loopCnt,
789
+ totalConversations: data.conversations.length,
790
+ });
791
+ }
792
+
793
+ // Ensure the returned status is correct (reassign)
794
+ this.logger.debug(`[GUIAgent] Finally: finalStatus=${finalStatus}, finalError=${finalError}, data.status=${data.status}, data.error=${data.error}`);
795
+
796
+ // Log final status (only visible when showAIDebugInfo is enabled)
797
+ this.logger.debug(`[GUIAgent] Final status: ${finalStatus}${finalError ? `, Error: ${finalError}` : ''}, Steps: ${loopCnt}`);
798
+
799
+ data.status = finalStatus;
800
+ data.error = finalError;
801
+ }
802
+
803
+ return data;
804
+ }
805
+
806
/**
 * Convert the recorded conversation into the chat-message array sent to the model.
 *
 * The system prompt is always the first message. Each conversation entry then
 * becomes one message, in order:
 *  - a `human` entry carrying a screenshot becomes a `user` message with a text
 *    part followed by an `image_url` part (PNG data URL, `detail: 'high'`);
 *  - an `assistant` entry becomes an `assistant` message with its text value;
 *  - anything else becomes a plain-text `user` message.
 *
 * @param conversations Conversation history to serialize.
 * @param systemPrompt Text placed in the leading `system` message.
 * @returns The message list, system message first.
 */
private buildModelMessages(conversations: Conversation[], systemPrompt: string): any[] {
  const history = conversations.map((turn) => {
    if (turn.from === 'assistant') {
      return { role: 'assistant', content: turn.value };
    }

    if (turn.from === 'human' && turn.screenshotBase64) {
      // Multimodal turn: text part first, then the screenshot as a data URL.
      return {
        role: 'user',
        content: [
          { type: 'text', text: turn.value },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/png;base64,${turn.screenshotBase64}`,
              detail: 'high',
            },
          },
        ],
      };
    }

    // Human turn without a screenshot, or any other sender.
    return { role: 'user', content: turn.value };
  });

  return [{ role: 'system', content: systemPrompt }, ...history];
}
849
+
850
/**
 * Pull the screenshot and the text prompt out of the last message in the list.
 *
 * Only the final message is inspected, and only when its `content` is an array
 * of parts. For a `data:image…` URL the part after the first comma (the base64
 * payload) is returned; any other URL is returned unchanged.
 *
 * @param messages Chat messages in the shape produced for the model API.
 * @returns `image` and `prompt`; each is an empty string when not found.
 */
private extractImageAndPrompt(messages: any[]): { image: string; prompt: string } {
  const tail = messages[messages.length - 1];
  if (!tail || !Array.isArray(tail.content)) {
    return { image: '', prompt: '' };
  }

  const parts: any[] = tail.content;
  const imagePart = parts.find((part: any) => part.type === 'image_url');
  const textPart = parts.find((part: any) => part.type === 'text');

  let image = '';
  if (imagePart) {
    const url = imagePart.image_url?.url || '';
    image = url.startsWith('data:image') ? (url.split(',')[1] || '') : url;
  }

  return { image, prompt: textPart?.text || '' };
}
875
+
876
/**
 * Print a human-readable dump of an outgoing model request to the console.
 *
 * Output consists of a header (model, base URL, message count), the system
 * prompt (first 15 lines), and then every non-system message. String content
 * shows its first 20 lines; array content shows the number of parts, the
 * length of the image URL in KB when an image part is present, and the first
 * 10 lines of the text part. Every printed content line is cut to 62
 * characters and carries the `│ ` box prefix.
 *
 * @param messages Messages about to be sent to the model.
 * @param remoteVlmCaller When given, model and base URL are read from an
 *   optional `info` object on the caller, falling back to the literal 'remote'.
 */
private debugRequest(messages: any[], remoteVlmCaller?: RemoteVlmCaller): void {
  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ GUI MODEL REQUEST DEBUG ║');
  console.log('╚══════════════════════════════════════════════════════════╝');
  console.log(`📦 Model: ${remoteVlmCaller ? ((remoteVlmCaller as any).info?.model || 'remote') : this.model}`);
  console.log(`🌐 Base URL: ${remoteVlmCaller ? ((remoteVlmCaller as any).info?.baseUrl || 'remote') : (this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1')}`);
  console.log(`💬 Messages: ${messages.length}`);

  // Show system prompt if present
  const systemMsg = messages.find((m: any) => m.role === 'system');
  if (systemMsg) {
    console.log('\n┌─────────────────────────────────────────────────────────────┐');
    console.log('│ 🟫 SYSTEM │');
    console.log('├─────────────────────────────────────────────────────────────┤');
    const systemContent = typeof systemMsg.content === 'string'
      ? systemMsg.content
      : JSON.stringify(systemMsg.content);
    // Split once and reuse for both the preview and the truncation check.
    const systemLines = systemContent.split('\n');
    for (const line of systemLines.slice(0, 15)) {
      console.log('│ ' + line.slice(0, 62));
    }
    if (systemLines.length > 15) {
      // Keep the box prefix so the truncation marker stays inside the frame.
      console.log('│ ... (truncated)');
    }
    console.log('└─────────────────────────────────────────────────────────────┘');
  }

  // Display labels for the known roles; other roles get a generic label below.
  const roleLabels: Record<string, string> = {
    user: '👤 USER',
    assistant: '🤖 ASSISTANT',
  };

  for (let i = 0; i < messages.length; i++) {
    const msg = messages[i];
    if (msg.role === 'system') continue;

    // A message without a role must not crash the debug dump.
    const roleLabel = roleLabels[msg.role] || `● ${String(msg.role ?? 'unknown').toUpperCase()}`;
    console.log(`\n┌─────────────────────────────────────────────────────────────┐`);
    console.log(`│ ${roleLabel} (${i + 1}) │`);
    console.log('├─────────────────────────────────────────────────────────────┤');

    if (typeof msg.content === 'string') {
      const contentLines = msg.content.split('\n');
      for (const line of contentLines.slice(0, 20)) {
        // Same prefix as every other content line in this dump.
        console.log('│ ' + line.slice(0, 62));
      }
      if (contentLines.length > 20) {
        console.log('│ ... (truncated)');
      }
    } else if (Array.isArray(msg.content)) {
      console.log('│ 📎 Content blocks: ' + msg.content.length);
      const imageBlock = msg.content.find((c: any) => c.type === 'image_url');
      if (imageBlock) {
        // Size is the length of the URL string (data URL included), in KB.
        const imageSize = imageBlock.image_url?.url?.length || 0;
        console.log('│ 🖼️ Image size: ' + (imageSize / 1024).toFixed(2) + ' KB');
      }
      const textBlock = msg.content.find((c: any) => c.type === 'text');
      if (textBlock?.text) {
        const lines = textBlock.text.split('\n').slice(0, 10);
        for (const line of lines) {
          console.log('│ ' + line.slice(0, 62));
        }
      }
    }
    console.log('└─────────────────────────────────────────────────────────────┘');
  }

  console.log('\n📤 Sending request to model API...\n');
}
950
+
951
/**
 * Print a human-readable dump of a model response to the console.
 *
 * Shows token usage when `usage` is provided, then up to the first 30 lines of
 * the content (each cut to 62 characters), followed by a count of the lines
 * that were left out.
 *
 * @param content Raw text returned by the model.
 * @param usage Optional usage object; `prompt_tokens`, `completion_tokens`
 *   and `total_tokens` are read from it.
 */
private debugResponse(content: string, usage?: any): void {
  const PREVIEW_LINES = 30;
  const allLines = content.split('\n');
  const hiddenCount = allLines.length - PREVIEW_LINES;

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ GUI MODEL RESPONSE DEBUG ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  if (usage) {
    console.log(`📊 Tokens: ${usage.prompt_tokens} (prompt) + ${usage.completion_tokens} (completion) = ${usage.total_tokens} (total)`);
  }

  console.log('\n┌─────────────────────────────────────────────────────────────┐');
  console.log('│ 🤖 ASSISTANT │');
  console.log('├─────────────────────────────────────────────────────────────┤');
  console.log('│ 💬 CONTENT:');
  console.log('│ ───────────────────────────────────────────────────────────');

  allLines.slice(0, PREVIEW_LINES).forEach((line) => {
    console.log('│ ' + line.slice(0, 62));
  });
  if (hiddenCount > 0) {
    console.log(`│ ... (${hiddenCount} more lines)`);
  }

  console.log('│ ───────────────────────────────────────────────────────────');
  console.log('└─────────────────────────────────────────────────────────────┘');

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ RESPONSE ENDED ║');
  console.log('╚══════════════════════════════════════════════════════════╝\n');
}
983
+
984
/**
 * Call the locally configured VLM through its `/chat/completions` endpoint
 * and parse the reply into actions.
 *
 * Base URL and API key come from the instance, then from `MODEL_BASE_URL` /
 * `MODEL_API_KEY`, then from built-in defaults. The request is tied to
 * `this.signal`, so aborting the signal rejects the fetch.
 *
 * @param messages Chat messages to send as the request body's `messages`.
 * @param screenContext Screen size passed to the action parser.
 * @returns The raw model text and the actions parsed from it.
 * @throws Error when the response is not OK. The message contains the HTTP
 *   status code so callers that inspect the message for `401`, `429` or `404`
 *   can recognise the failure even when the body does not repeat the code.
 *   Errors raised by `fetch` itself (network failure, abort) propagate as-is.
 */
private async callLocalVLM(
  messages: any[],
  screenContext: ScreenContext
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  // Drop trailing slashes so a configured ".../v1/" does not yield "//chat/completions".
  const baseUrl = (this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1').replace(/\/+$/, '');
  const apiKey = this.modelApiKey || process.env.MODEL_API_KEY || '';

  const requestBody = {
    model: this.model,
    messages,
    max_tokens: 1024,
    temperature: 0.1,
  };

  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages);
  }

  // No local try/catch: a catch that only rethrows adds nothing.
  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(requestBody),
    signal: this.signal,
  });

  // Handle non-200 responses
  if (!response.ok) {
    const errorText = await response.text();
    // statusText can be empty, so build the status line from the non-empty parts.
    const statusLine = [String(response.status), response.statusText].filter(Boolean).join(' ');
    throw new Error(`Model API error: HTTP ${statusLine}${errorText ? ` - ${errorText}` : ''}`);
  }

  const result = await response.json() as { choices?: Array<{ message?: { content?: string } }>; usage?: any };
  const content = result.choices?.[0]?.message?.content || '';

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(content, result.usage);
  }

  const { parsed: parsedPredictions } = actionParser({
    prediction: content,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });

  return {
    prediction: content,
    parsedPredictions,
  };
}
1049
+
1050
/**
 * Ask the model for the next prediction and parse it into actions.
 *
 * In local mode the call is delegated to `callLocalVLM`. Otherwise the full
 * message list is handed to `remoteVlmCaller` together with the system prompt,
 * the task id and a "first VLM call" ref. After the remote call returns, both
 * the instance flag and the ref are set to `false`.
 *
 * @param messages Chat messages for the model.
 * @param screenContext Screen size passed to the action parser.
 * @param remoteVlmCaller Function that performs the remote VLM call.
 * @returns The raw model text and the actions parsed from it.
 */
private async callModelAPI(
  messages: any[],
  screenContext: ScreenContext,
  remoteVlmCaller: RemoteVlmCaller
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  // === LOCAL mode ===
  if (this.isLocalMode) {
    return this.callLocalVLM(messages, screenContext);
  }

  // === REMOTE mode ===
  const debugEnabled = this.showAIDebugInfo;
  if (debugEnabled) {
    this.debugRequest(messages, remoteVlmCaller);
  }

  // Prefer the shared ref when one is set; otherwise wrap the instance flag
  // in a fresh ref used for this call only.
  const firstCallRef = this.isFirstVlmCallRef || { current: this.isFirstVlmCall };

  const prediction = await remoteVlmCaller(messages, this.systemPrompt, this.taskId, firstCallRef);

  // Later calls are no longer "first": update the instance flag and the ref.
  this.isFirstVlmCall = false;
  firstCallRef.current = false;

  if (debugEnabled) {
    this.debugResponse(prediction);
  }

  const { width, height } = screenContext;
  const { parsed: parsedPredictions } = actionParser({
    prediction,
    factor: [1000, 1000],
    screenContext: { width, height },
  });

  return { prediction, parsedPredictions };
}
1102
+
1103
/**
 * Produce a short summary of a model prediction.
 *
 * When the text contains "Action:" (case-insensitive), everything after the
 * first such marker up to the end of the text is returned, trimmed. Otherwise
 * the first 200 characters of the prediction are returned.
 *
 * @param prediction Raw model output.
 * @returns The action part, or a 200-character prefix when no marker is found.
 */
private getSummary(prediction: string): string {
  // NOTE(review): the character class lists ':' twice. Presumably one of them
  // was meant to be the full-width colon '：' — confirm against the upstream source.
  const found = prediction.match(/Action[::]\s*([\s\S]+)$/i);
  return found ? found[1].trim() : prediction.slice(0, 200);
}
1114
+
1115
/**
 * Mark the agent as paused and create the promise that `resume()` resolves.
 *
 * Calling this while already paused does nothing. Replacing the pending
 * promise would orphan it: `resume()` only resolves the most recent one, so
 * anything already awaiting the earlier promise would never continue.
 */
pause(): void {
  if (this.isPaused && this.resumePromise) {
    return;
  }

  this.isPaused = true;
  this.resumePromise = new Promise((resolve) => {
    this.resolveResume = resolve;
  });
}
1121
+
1122
/**
 * Clear the paused flag and, when a pause is pending, resolve its promise and
 * drop the stored promise and resolver.
 */
resume(): void {
  const release = this.resolveResume;
  if (!release) {
    // Nothing is waiting; only the flag needs resetting.
    this.isPaused = false;
    return;
  }

  release();
  this.resumePromise = null;
  this.resolveResume = null;
  this.isPaused = false;
}
1130
+
1131
/**
 * Set the stopped flag. The action loop checks `isStopped` before executing
 * an action, so this only takes effect at that check.
 */
stop(): void {
  this.isStopped = true;
}
1134
+
1135
/**
 * Release resources held by the agent.
 *
 * Cleans up the operator and then detaches the `'cancelled'` listener, if one
 * was stored on the instance as `_cancelHandler` / `_cancellationManager`.
 * The listener is detached in a `finally` block so it is removed even when
 * the operator cleanup rejects; that rejection still propagates to the caller.
 */
async cleanup(): Promise<void> {
  this.logger.debug('Cleaning up GUI Agent...');
  try {
    await this.operator.cleanup();
  } finally {
    // Cleanup cancellation listener if attached
    const cancelHandler = (this as any)._cancelHandler;
    const cancellationManager = (this as any)._cancellationManager;
    if (cancelHandler && cancellationManager) {
      cancellationManager.off('cancelled', cancelHandler);
      (this as any)._cancelHandler = undefined;
      (this as any)._cancellationManager = undefined;
    }
  }
}
1148
+ }
1149
+
1150
+ export { GUIAgentStatus as StatusEnum };
1151
+