@xagent-ai/cli 1.2.2 → 1.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (602):
  1. package/.github/ISSUE_TEMPLATE/bug_report.md +38 -38
  2. package/.github/ISSUE_TEMPLATE/feature_request.md +20 -20
  3. package/.github/release.yml +76 -0
  4. package/.github/workflows/ci.yml +75 -0
  5. package/.github/workflows/release.yml +103 -0
  6. package/.gitmodules +3 -3
  7. package/README.md +326 -280
  8. package/README_CN.md +325 -279
  9. package/dist/agents.d.ts.map +1 -1
  10. package/dist/agents.js +7 -3
  11. package/dist/agents.js.map +1 -1
  12. package/dist/ai-client/factory.d.ts +40 -0
  13. package/dist/ai-client/factory.d.ts.map +1 -0
  14. package/dist/ai-client/factory.js +100 -0
  15. package/dist/ai-client/factory.js.map +1 -0
  16. package/dist/ai-client/index.d.ts +20 -0
  17. package/dist/ai-client/index.d.ts.map +1 -0
  18. package/dist/ai-client/index.js +49 -0
  19. package/dist/ai-client/index.js.map +1 -0
  20. package/dist/ai-client/providers/anthropic.d.ts +57 -0
  21. package/dist/ai-client/providers/anthropic.d.ts.map +1 -0
  22. package/dist/ai-client/providers/anthropic.js +406 -0
  23. package/dist/ai-client/providers/anthropic.js.map +1 -0
  24. package/dist/ai-client/providers/openai.d.ts +57 -0
  25. package/dist/ai-client/providers/openai.d.ts.map +1 -0
  26. package/dist/ai-client/providers/openai.js +290 -0
  27. package/dist/ai-client/providers/openai.js.map +1 -0
  28. package/dist/ai-client/providers/remote.d.ts +110 -0
  29. package/dist/ai-client/providers/remote.d.ts.map +1 -0
  30. package/dist/ai-client/providers/remote.js +352 -0
  31. package/dist/ai-client/providers/remote.js.map +1 -0
  32. package/dist/ai-client/registry.d.ts +51 -0
  33. package/dist/ai-client/registry.d.ts.map +1 -0
  34. package/dist/ai-client/registry.js +81 -0
  35. package/dist/ai-client/registry.js.map +1 -0
  36. package/dist/ai-client/types.d.ts +274 -0
  37. package/dist/ai-client/types.d.ts.map +1 -0
  38. package/dist/ai-client/types.js +90 -0
  39. package/dist/ai-client/types.js.map +1 -0
  40. package/dist/ai-client-factory.d.ts +62 -0
  41. package/dist/ai-client-factory.d.ts.map +1 -0
  42. package/dist/ai-client-factory.js +157 -0
  43. package/dist/ai-client-factory.js.map +1 -0
  44. package/dist/auth.d.ts +23 -1
  45. package/dist/auth.d.ts.map +1 -1
  46. package/dist/auth.js +164 -174
  47. package/dist/auth.js.map +1 -1
  48. package/dist/cancellation.d.ts +5 -4
  49. package/dist/cancellation.d.ts.map +1 -1
  50. package/dist/cancellation.js +53 -32
  51. package/dist/cancellation.js.map +1 -1
  52. package/dist/checkpoint.d.ts +2 -1
  53. package/dist/checkpoint.d.ts.map +1 -1
  54. package/dist/checkpoint.js +39 -6
  55. package/dist/checkpoint.js.map +1 -1
  56. package/dist/cli.js +742 -29
  57. package/dist/cli.js.map +1 -1
  58. package/dist/config.d.ts +10 -4
  59. package/dist/config.d.ts.map +1 -1
  60. package/dist/config.js +62 -25
  61. package/dist/config.js.map +1 -1
  62. package/dist/context-compressor.d.ts +82 -18
  63. package/dist/context-compressor.d.ts.map +1 -1
  64. package/dist/context-compressor.js +718 -154
  65. package/dist/context-compressor.js.map +1 -1
  66. package/dist/conversation.d.ts +1 -1
  67. package/dist/conversation.d.ts.map +1 -1
  68. package/dist/conversation.js +8 -7
  69. package/dist/conversation.js.map +1 -1
  70. package/dist/gui-subagent/action-parser/actionParser.d.ts.map +1 -1
  71. package/dist/gui-subagent/action-parser/actionParser.js +6 -4
  72. package/dist/gui-subagent/action-parser/actionParser.js.map +1 -1
  73. package/dist/gui-subagent/agent/gui-agent.d.ts +39 -2
  74. package/dist/gui-subagent/agent/gui-agent.d.ts.map +1 -1
  75. package/dist/gui-subagent/agent/gui-agent.js +189 -74
  76. package/dist/gui-subagent/agent/gui-agent.js.map +1 -1
  77. package/dist/gui-subagent/index.d.ts +23 -1
  78. package/dist/gui-subagent/index.d.ts.map +1 -1
  79. package/dist/gui-subagent/index.js +6 -0
  80. package/dist/gui-subagent/index.js.map +1 -1
  81. package/dist/gui-subagent/operator/base-operator.d.ts.map +1 -1
  82. package/dist/gui-subagent/operator/base-operator.js +0 -1
  83. package/dist/gui-subagent/operator/base-operator.js.map +1 -1
  84. package/dist/gui-subagent/operator/computer-operator.d.ts.map +1 -1
  85. package/dist/gui-subagent/operator/computer-operator.js +31 -8
  86. package/dist/gui-subagent/operator/computer-operator.js.map +1 -1
  87. package/dist/gui-subagent/types/actions.d.ts +1 -1
  88. package/dist/gui-subagent/types/actions.d.ts.map +1 -1
  89. package/dist/gui-subagent/types/actions.js +0 -1
  90. package/dist/gui-subagent/types/actions.js.map +1 -1
  91. package/dist/gui-subagent/types/operator.d.ts +1 -1
  92. package/dist/gui-subagent/types/operator.d.ts.map +1 -1
  93. package/dist/index.d.ts +1 -2
  94. package/dist/index.d.ts.map +1 -1
  95. package/dist/index.js +1 -2
  96. package/dist/index.js.map +1 -1
  97. package/dist/input-processor.d.ts.map +1 -1
  98. package/dist/input-processor.js +8 -5
  99. package/dist/input-processor.js.map +1 -1
  100. package/dist/logger.d.ts.map +1 -1
  101. package/dist/logger.js +1 -1
  102. package/dist/logger.js.map +1 -1
  103. package/dist/mcp.d.ts +7 -1
  104. package/dist/mcp.d.ts.map +1 -1
  105. package/dist/mcp.js +157 -49
  106. package/dist/mcp.js.map +1 -1
  107. package/dist/memory.d.ts.map +1 -1
  108. package/dist/memory.js +3 -3
  109. package/dist/memory.js.map +1 -1
  110. package/dist/output-util.d.ts +27 -0
  111. package/dist/output-util.d.ts.map +1 -0
  112. package/dist/output-util.js +74 -0
  113. package/dist/output-util.js.map +1 -0
  114. package/dist/retry.js +1 -1
  115. package/dist/retry.js.map +1 -1
  116. package/dist/ripgrep.d.ts +29 -0
  117. package/dist/ripgrep.d.ts.map +1 -0
  118. package/dist/ripgrep.js +294 -0
  119. package/dist/ripgrep.js.map +1 -0
  120. package/dist/sdk-output-adapter.d.ts +34 -1
  121. package/dist/sdk-output-adapter.d.ts.map +1 -1
  122. package/dist/sdk-output-adapter.js +67 -2
  123. package/dist/sdk-output-adapter.js.map +1 -1
  124. package/dist/sdk-session.d.ts.map +1 -1
  125. package/dist/sdk-session.js +2 -0
  126. package/dist/sdk-session.js.map +1 -1
  127. package/dist/session-manager.js +3 -3
  128. package/dist/session-manager.js.map +1 -1
  129. package/dist/session.d.ts +116 -6
  130. package/dist/session.d.ts.map +1 -1
  131. package/dist/session.js +1416 -448
  132. package/dist/session.js.map +1 -1
  133. package/dist/shell.d.ts +33 -0
  134. package/dist/shell.d.ts.map +1 -0
  135. package/dist/shell.js +126 -0
  136. package/dist/shell.js.map +1 -0
  137. package/dist/skill-installer.d.ts +38 -0
  138. package/dist/skill-installer.d.ts.map +1 -0
  139. package/dist/skill-installer.js +447 -0
  140. package/dist/skill-installer.js.map +1 -0
  141. package/dist/skill-invoker.d.ts +8 -2
  142. package/dist/skill-invoker.d.ts.map +1 -1
  143. package/dist/skill-invoker.js +36 -15
  144. package/dist/skill-invoker.js.map +1 -1
  145. package/dist/skill-loader.d.ts +8 -3
  146. package/dist/skill-loader.d.ts.map +1 -1
  147. package/dist/skill-loader.js +51 -48
  148. package/dist/skill-loader.js.map +1 -1
  149. package/dist/skill-manager.d.ts +85 -0
  150. package/dist/skill-manager.d.ts.map +1 -0
  151. package/dist/skill-manager.js +341 -0
  152. package/dist/skill-manager.js.map +1 -0
  153. package/dist/slash-commands.d.ts +39 -2
  154. package/dist/slash-commands.d.ts.map +1 -1
  155. package/dist/slash-commands.js +934 -305
  156. package/dist/slash-commands.js.map +1 -1
  157. package/dist/smart-approval.d.ts +20 -1
  158. package/dist/smart-approval.d.ts.map +1 -1
  159. package/dist/smart-approval.js +125 -56
  160. package/dist/smart-approval.js.map +1 -1
  161. package/dist/system-prompt-generator.d.ts +6 -0
  162. package/dist/system-prompt-generator.d.ts.map +1 -1
  163. package/dist/system-prompt-generator.js +86 -36
  164. package/dist/system-prompt-generator.js.map +1 -1
  165. package/dist/terminal.d.ts +28 -0
  166. package/dist/terminal.d.ts.map +1 -0
  167. package/dist/terminal.js +82 -0
  168. package/dist/terminal.js.map +1 -0
  169. package/dist/theme.d.ts.map +1 -1
  170. package/dist/theme.js +8 -7
  171. package/dist/theme.js.map +1 -1
  172. package/dist/tools.d.ts +38 -7
  173. package/dist/tools.d.ts.map +1 -1
  174. package/dist/tools.js +1249 -617
  175. package/dist/tools.js.map +1 -1
  176. package/dist/truncate.d.ts +55 -0
  177. package/dist/truncate.d.ts.map +1 -0
  178. package/dist/truncate.js +130 -0
  179. package/dist/truncate.js.map +1 -0
  180. package/dist/types.d.ts +84 -9
  181. package/dist/types.d.ts.map +1 -1
  182. package/dist/types.js +49 -0
  183. package/dist/types.js.map +1 -1
  184. package/dist/update.d.ts.map +1 -1
  185. package/dist/update.js +28 -36
  186. package/dist/update.js.map +1 -1
  187. package/dist/workflow.d.ts +5 -1
  188. package/dist/workflow.d.ts.map +1 -1
  189. package/dist/workflow.js +61 -49
  190. package/dist/workflow.js.map +1 -1
  191. package/docs/architecture/mcp-integration-guide.md +304 -194
  192. package/docs/architecture/overview.md +169 -169
  193. package/docs/architecture/tool-system-design.md +134 -134
  194. package/docs/cli/commands.md +349 -238
  195. package/docs/smart-mode.md +281 -281
  196. package/docs/third-party-models.md +440 -439
  197. package/find-skills/SKILL.md +133 -0
  198. package/package.json +91 -90
  199. package/scripts/install-ripgrep.js +241 -0
  200. package/src/agents.ts +7 -3
  201. package/src/ai-client/factory.ts +116 -0
  202. package/src/ai-client/index.ts +61 -0
  203. package/src/ai-client/providers/anthropic.ts +475 -0
  204. package/src/ai-client/providers/openai.ts +348 -0
  205. package/src/ai-client/providers/remote.ts +439 -0
  206. package/src/ai-client/registry.ts +97 -0
  207. package/src/ai-client/types.ts +364 -0
  208. package/src/ai-client-factory.ts +204 -0
  209. package/src/auth.ts +661 -614
  210. package/src/cancellation.ts +202 -176
  211. package/src/checkpoint.ts +255 -219
  212. package/src/cli.ts +1523 -743
  213. package/src/config.ts +341 -297
  214. package/src/context-compressor.ts +987 -290
  215. package/src/conversation.ts +290 -288
  216. package/src/gui-subagent/action-parser/actionParser.ts +318 -315
  217. package/src/gui-subagent/action-parser/constants.ts +14 -14
  218. package/src/gui-subagent/action-parser/index.ts +8 -8
  219. package/src/gui-subagent/action-parser/types.ts +31 -31
  220. package/src/gui-subagent/agent/gui-agent.ts +1234 -1089
  221. package/src/gui-subagent/agent/index.ts +5 -5
  222. package/src/gui-subagent/index.ts +185 -163
  223. package/src/gui-subagent/operator/base-operator.ts +244 -245
  224. package/src/gui-subagent/operator/computer-operator.ts +541 -520
  225. package/src/gui-subagent/operator/index.ts +6 -6
  226. package/src/gui-subagent/types/actions.ts +260 -262
  227. package/src/gui-subagent/types/index.ts +6 -6
  228. package/src/gui-subagent/types/operator.ts +106 -106
  229. package/src/gui-subagent/utils.ts +51 -51
  230. package/src/index.ts +17 -18
  231. package/src/input-processor.ts +8 -5
  232. package/src/logger.ts +436 -438
  233. package/src/mcp.ts +793 -682
  234. package/src/memory.ts +343 -344
  235. package/src/output-util.ts +80 -0
  236. package/src/retry.ts +1 -1
  237. package/src/ripgrep.ts +370 -0
  238. package/src/sdk-output-adapter.ts +842 -0
  239. package/src/sdk-session.ts +62 -0
  240. package/src/session-manager.ts +308 -308
  241. package/src/session.ts +1775 -573
  242. package/src/shell.ts +134 -0
  243. package/src/skill-installer.ts +518 -0
  244. package/src/skill-invoker.ts +959 -935
  245. package/src/skill-loader.ts +501 -496
  246. package/src/skill-manager.ts +385 -0
  247. package/src/slash-commands.ts +2189 -1389
  248. package/src/smart-approval.ts +193 -74
  249. package/src/system-prompt-generator.ts +91 -36
  250. package/src/terminal.ts +96 -0
  251. package/src/theme.ts +739 -738
  252. package/src/tools.ts +1790 -931
  253. package/src/truncate.ts +173 -0
  254. package/src/types.ts +337 -198
  255. package/src/update.ts +33 -40
  256. package/src/workflow.ts +521 -508
  257. package/test/cli-launch.test.ts +279 -0
  258. package/tsconfig.json +22 -22
  259. package/vitest.config.ts +21 -19
  260. package/dist/ai-client.d.ts +0 -86
  261. package/dist/ai-client.d.ts.map +0 -1
  262. package/dist/ai-client.js +0 -1372
  263. package/dist/ai-client.js.map +0 -1
  264. package/dist/gui-subagent/operator/browser-operator.d.ts +0 -36
  265. package/dist/gui-subagent/operator/browser-operator.d.ts.map +0 -1
  266. package/dist/gui-subagent/operator/browser-operator.js +0 -306
  267. package/dist/gui-subagent/operator/browser-operator.js.map +0 -1
  268. package/dist/gui-subagent/operator/desktop-operator.d.ts +0 -55
  269. package/dist/gui-subagent/operator/desktop-operator.d.ts.map +0 -1
  270. package/dist/gui-subagent/operator/desktop-operator.js +0 -527
  271. package/dist/gui-subagent/operator/desktop-operator.js.map +0 -1
  272. package/dist/hook.d.ts +0 -73
  273. package/dist/hook.d.ts.map +0 -1
  274. package/dist/hook.js +0 -156
  275. package/dist/hook.js.map +0 -1
  276. package/dist/input-history.d.ts +0 -24
  277. package/dist/input-history.d.ts.map +0 -1
  278. package/dist/input-history.js +0 -94
  279. package/dist/input-history.js.map +0 -1
  280. package/dist/keyboard-manager.d.ts +0 -151
  281. package/dist/keyboard-manager.d.ts.map +0 -1
  282. package/dist/keyboard-manager.js +0 -396
  283. package/dist/keyboard-manager.js.map +0 -1
  284. package/dist/print-system-prompt.d.ts +0 -2
  285. package/dist/print-system-prompt.d.ts.map +0 -1
  286. package/dist/print-system-prompt.js +0 -40
  287. package/dist/print-system-prompt.js.map +0 -1
  288. package/dist/remote-ai-client.d.ts +0 -104
  289. package/dist/remote-ai-client.d.ts.map +0 -1
  290. package/dist/remote-ai-client.js +0 -552
  291. package/dist/remote-ai-client.js.map +0 -1
  292. package/dist/sdk-session-v2.d.ts +0 -13
  293. package/dist/sdk-session-v2.d.ts.map +0 -1
  294. package/dist/sdk-session-v2.js +0 -46
  295. package/dist/sdk-session-v2.js.map +0 -1
  296. package/dist/test-boundary-conditions.d.ts.map +0 -1
  297. package/dist/test-boundary-conditions.js.map +0 -1
  298. package/dist/test-cancellation-fix.d.ts.map +0 -1
  299. package/dist/test-cancellation-fix.js.map +0 -1
  300. package/dist/test-input-history.d.ts.map +0 -1
  301. package/dist/test-input-history.js.map +0 -1
  302. package/dist/test-interaction-flow.d.ts.map +0 -1
  303. package/dist/test-interaction-flow.js.map +0 -1
  304. package/dist/test-quick.d.ts.map +0 -1
  305. package/dist/test-quick.js.map +0 -1
  306. package/dist/test-user-interaction.d.ts.map +0 -1
  307. package/dist/test-user-interaction.js.map +0 -1
  308. package/dist/tools/edit-diff.d.ts +0 -32
  309. package/dist/tools/edit-diff.d.ts.map +0 -1
  310. package/dist/tools/edit-diff.js +0 -185
  311. package/dist/tools/edit-diff.js.map +0 -1
  312. package/dist/tools/edit.d.ts +0 -11
  313. package/dist/tools/edit.d.ts.map +0 -1
  314. package/dist/tools/edit.js +0 -129
  315. package/dist/tools/edit.js.map +0 -1
  316. package/dist/unified-session.d.ts +0 -42
  317. package/dist/unified-session.d.ts.map +0 -1
  318. package/dist/unified-session.js +0 -271
  319. package/dist/unified-session.js.map +0 -1
  320. package/skills/.claude-plugin/marketplace.json +0 -45
  321. package/skills/README.md +0 -94
  322. package/skills/THIRD_PARTY_NOTICES.md +0 -405
  323. package/skills/skills/algorithmic-art/LICENSE.txt +0 -202
  324. package/skills/skills/algorithmic-art/SKILL.md +0 -405
  325. package/skills/skills/algorithmic-art/templates/generator_template.js +0 -223
  326. package/skills/skills/algorithmic-art/templates/viewer.html +0 -599
  327. package/skills/skills/brand-guidelines/LICENSE.txt +0 -202
  328. package/skills/skills/brand-guidelines/SKILL.md +0 -73
  329. package/skills/skills/canvas-design/LICENSE.txt +0 -202
  330. package/skills/skills/canvas-design/SKILL.md +0 -130
  331. package/skills/skills/canvas-design/canvas-fonts/ArsenalSC-OFL.txt +0 -93
  332. package/skills/skills/canvas-design/canvas-fonts/ArsenalSC-Regular.ttf +0 -0
  333. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-Bold.ttf +0 -0
  334. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-OFL.txt +0 -93
  335. package/skills/skills/canvas-design/canvas-fonts/BigShoulders-Regular.ttf +0 -0
  336. package/skills/skills/canvas-design/canvas-fonts/Boldonse-OFL.txt +0 -93
  337. package/skills/skills/canvas-design/canvas-fonts/Boldonse-Regular.ttf +0 -0
  338. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-Bold.ttf +0 -0
  339. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-OFL.txt +0 -93
  340. package/skills/skills/canvas-design/canvas-fonts/BricolageGrotesque-Regular.ttf +0 -0
  341. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Bold.ttf +0 -0
  342. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Italic.ttf +0 -0
  343. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-OFL.txt +0 -93
  344. package/skills/skills/canvas-design/canvas-fonts/CrimsonPro-Regular.ttf +0 -0
  345. package/skills/skills/canvas-design/canvas-fonts/DMMono-OFL.txt +0 -93
  346. package/skills/skills/canvas-design/canvas-fonts/DMMono-Regular.ttf +0 -0
  347. package/skills/skills/canvas-design/canvas-fonts/EricaOne-OFL.txt +0 -94
  348. package/skills/skills/canvas-design/canvas-fonts/EricaOne-Regular.ttf +0 -0
  349. package/skills/skills/canvas-design/canvas-fonts/GeistMono-Bold.ttf +0 -0
  350. package/skills/skills/canvas-design/canvas-fonts/GeistMono-OFL.txt +0 -93
  351. package/skills/skills/canvas-design/canvas-fonts/GeistMono-Regular.ttf +0 -0
  352. package/skills/skills/canvas-design/canvas-fonts/Gloock-OFL.txt +0 -93
  353. package/skills/skills/canvas-design/canvas-fonts/Gloock-Regular.ttf +0 -0
  354. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-Bold.ttf +0 -0
  355. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-OFL.txt +0 -93
  356. package/skills/skills/canvas-design/canvas-fonts/IBMPlexMono-Regular.ttf +0 -0
  357. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Bold.ttf +0 -0
  358. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-BoldItalic.ttf +0 -0
  359. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Italic.ttf +0 -0
  360. package/skills/skills/canvas-design/canvas-fonts/IBMPlexSerif-Regular.ttf +0 -0
  361. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Bold.ttf +0 -0
  362. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-BoldItalic.ttf +0 -0
  363. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Italic.ttf +0 -0
  364. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-OFL.txt +0 -93
  365. package/skills/skills/canvas-design/canvas-fonts/InstrumentSans-Regular.ttf +0 -0
  366. package/skills/skills/canvas-design/canvas-fonts/InstrumentSerif-Italic.ttf +0 -0
  367. package/skills/skills/canvas-design/canvas-fonts/InstrumentSerif-Regular.ttf +0 -0
  368. package/skills/skills/canvas-design/canvas-fonts/Italiana-OFL.txt +0 -93
  369. package/skills/skills/canvas-design/canvas-fonts/Italiana-Regular.ttf +0 -0
  370. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-Bold.ttf +0 -0
  371. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-OFL.txt +0 -93
  372. package/skills/skills/canvas-design/canvas-fonts/JetBrainsMono-Regular.ttf +0 -0
  373. package/skills/skills/canvas-design/canvas-fonts/Jura-Light.ttf +0 -0
  374. package/skills/skills/canvas-design/canvas-fonts/Jura-Medium.ttf +0 -0
  375. package/skills/skills/canvas-design/canvas-fonts/Jura-OFL.txt +0 -93
  376. package/skills/skills/canvas-design/canvas-fonts/LibreBaskerville-OFL.txt +0 -93
  377. package/skills/skills/canvas-design/canvas-fonts/LibreBaskerville-Regular.ttf +0 -0
  378. package/skills/skills/canvas-design/canvas-fonts/Lora-Bold.ttf +0 -0
  379. package/skills/skills/canvas-design/canvas-fonts/Lora-BoldItalic.ttf +0 -0
  380. package/skills/skills/canvas-design/canvas-fonts/Lora-Italic.ttf +0 -0
  381. package/skills/skills/canvas-design/canvas-fonts/Lora-OFL.txt +0 -93
  382. package/skills/skills/canvas-design/canvas-fonts/Lora-Regular.ttf +0 -0
  383. package/skills/skills/canvas-design/canvas-fonts/NationalPark-Bold.ttf +0 -0
  384. package/skills/skills/canvas-design/canvas-fonts/NationalPark-OFL.txt +0 -93
  385. package/skills/skills/canvas-design/canvas-fonts/NationalPark-Regular.ttf +0 -0
  386. package/skills/skills/canvas-design/canvas-fonts/NothingYouCouldDo-OFL.txt +0 -93
  387. package/skills/skills/canvas-design/canvas-fonts/NothingYouCouldDo-Regular.ttf +0 -0
  388. package/skills/skills/canvas-design/canvas-fonts/Outfit-Bold.ttf +0 -0
  389. package/skills/skills/canvas-design/canvas-fonts/Outfit-OFL.txt +0 -93
  390. package/skills/skills/canvas-design/canvas-fonts/Outfit-Regular.ttf +0 -0
  391. package/skills/skills/canvas-design/canvas-fonts/PixelifySans-Medium.ttf +0 -0
  392. package/skills/skills/canvas-design/canvas-fonts/PixelifySans-OFL.txt +0 -93
  393. package/skills/skills/canvas-design/canvas-fonts/PoiretOne-OFL.txt +0 -93
  394. package/skills/skills/canvas-design/canvas-fonts/PoiretOne-Regular.ttf +0 -0
  395. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-Bold.ttf +0 -0
  396. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-OFL.txt +0 -93
  397. package/skills/skills/canvas-design/canvas-fonts/RedHatMono-Regular.ttf +0 -0
  398. package/skills/skills/canvas-design/canvas-fonts/Silkscreen-OFL.txt +0 -93
  399. package/skills/skills/canvas-design/canvas-fonts/Silkscreen-Regular.ttf +0 -0
  400. package/skills/skills/canvas-design/canvas-fonts/SmoochSans-Medium.ttf +0 -0
  401. package/skills/skills/canvas-design/canvas-fonts/SmoochSans-OFL.txt +0 -93
  402. package/skills/skills/canvas-design/canvas-fonts/Tektur-Medium.ttf +0 -0
  403. package/skills/skills/canvas-design/canvas-fonts/Tektur-OFL.txt +0 -93
  404. package/skills/skills/canvas-design/canvas-fonts/Tektur-Regular.ttf +0 -0
  405. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Bold.ttf +0 -0
  406. package/skills/skills/canvas-design/canvas-fonts/WorkSans-BoldItalic.ttf +0 -0
  407. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Italic.ttf +0 -0
  408. package/skills/skills/canvas-design/canvas-fonts/WorkSans-OFL.txt +0 -93
  409. package/skills/skills/canvas-design/canvas-fonts/WorkSans-Regular.ttf +0 -0
  410. package/skills/skills/canvas-design/canvas-fonts/YoungSerif-OFL.txt +0 -93
  411. package/skills/skills/canvas-design/canvas-fonts/YoungSerif-Regular.ttf +0 -0
  412. package/skills/skills/doc-coauthoring/SKILL.md +0 -375
  413. package/skills/skills/docx/LICENSE.txt +0 -30
  414. package/skills/skills/docx/SKILL.md +0 -197
  415. package/skills/skills/docx/docx-js.md +0 -350
  416. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  417. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  418. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  419. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  420. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  421. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  422. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  423. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  424. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  425. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  426. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  427. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  428. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  429. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  430. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  431. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  432. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  433. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  434. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  435. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  436. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  437. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  438. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  439. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  440. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  441. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  442. package/skills/skills/docx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  443. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  444. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  445. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  446. package/skills/skills/docx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  447. package/skills/skills/docx/ooxml/schemas/mce/mc.xsd +0 -75
  448. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2010.xsd +0 -560
  449. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2012.xsd +0 -67
  450. package/skills/skills/docx/ooxml/schemas/microsoft/wml-2018.xsd +0 -14
  451. package/skills/skills/docx/ooxml/schemas/microsoft/wml-cex-2018.xsd +0 -20
  452. package/skills/skills/docx/ooxml/schemas/microsoft/wml-cid-2016.xsd +0 -13
  453. package/skills/skills/docx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  454. package/skills/skills/docx/ooxml/schemas/microsoft/wml-symex-2015.xsd +0 -8
  455. package/skills/skills/docx/ooxml/scripts/pack.py +0 -159
  456. package/skills/skills/docx/ooxml/scripts/unpack.py +0 -29
  457. package/skills/skills/docx/ooxml/scripts/validate.py +0 -69
  458. package/skills/skills/docx/ooxml/scripts/validation/__init__.py +0 -15
  459. package/skills/skills/docx/ooxml/scripts/validation/base.py +0 -951
  460. package/skills/skills/docx/ooxml/scripts/validation/docx.py +0 -274
  461. package/skills/skills/docx/ooxml/scripts/validation/pptx.py +0 -315
  462. package/skills/skills/docx/ooxml/scripts/validation/redlining.py +0 -279
  463. package/skills/skills/docx/ooxml.md +0 -610
  464. package/skills/skills/docx/scripts/__init__.py +0 -1
  465. package/skills/skills/docx/scripts/document.py +0 -1276
  466. package/skills/skills/docx/scripts/templates/comments.xml +0 -3
  467. package/skills/skills/docx/scripts/templates/commentsExtended.xml +0 -3
  468. package/skills/skills/docx/scripts/templates/commentsExtensible.xml +0 -3
  469. package/skills/skills/docx/scripts/templates/commentsIds.xml +0 -3
  470. package/skills/skills/docx/scripts/templates/people.xml +0 -3
  471. package/skills/skills/docx/scripts/utilities.py +0 -374
  472. package/skills/skills/frontend-design/LICENSE.txt +0 -177
  473. package/skills/skills/frontend-design/SKILL.md +0 -42
  474. package/skills/skills/internal-comms/LICENSE.txt +0 -202
  475. package/skills/skills/internal-comms/SKILL.md +0 -32
  476. package/skills/skills/internal-comms/examples/3p-updates.md +0 -47
  477. package/skills/skills/internal-comms/examples/company-newsletter.md +0 -65
  478. package/skills/skills/internal-comms/examples/faq-answers.md +0 -30
  479. package/skills/skills/internal-comms/examples/general-comms.md +0 -16
  480. package/skills/skills/mcp-builder/LICENSE.txt +0 -202
  481. package/skills/skills/mcp-builder/SKILL.md +0 -236
  482. package/skills/skills/mcp-builder/reference/evaluation.md +0 -602
  483. package/skills/skills/mcp-builder/reference/mcp_best_practices.md +0 -249
  484. package/skills/skills/mcp-builder/reference/node_mcp_server.md +0 -970
  485. package/skills/skills/mcp-builder/reference/python_mcp_server.md +0 -719
  486. package/skills/skills/mcp-builder/scripts/connections.py +0 -151
  487. package/skills/skills/mcp-builder/scripts/evaluation.py +0 -373
  488. package/skills/skills/mcp-builder/scripts/example_evaluation.xml +0 -22
  489. package/skills/skills/mcp-builder/scripts/requirements.txt +0 -2
  490. package/skills/skills/pdf/LICENSE.txt +0 -30
  491. package/skills/skills/pdf/SKILL.md +0 -294
  492. package/skills/skills/pdf/forms.md +0 -205
  493. package/skills/skills/pdf/reference.md +0 -612
  494. package/skills/skills/pdf/scripts/check_bounding_boxes.py +0 -70
  495. package/skills/skills/pdf/scripts/check_bounding_boxes_test.py +0 -226
  496. package/skills/skills/pdf/scripts/check_fillable_fields.py +0 -12
  497. package/skills/skills/pdf/scripts/convert_pdf_to_images.py +0 -35
  498. package/skills/skills/pdf/scripts/create_validation_image.py +0 -41
  499. package/skills/skills/pdf/scripts/extract_form_field_info.py +0 -152
  500. package/skills/skills/pdf/scripts/fill_fillable_fields.py +0 -114
  501. package/skills/skills/pdf/scripts/fill_pdf_form_with_annotations.py +0 -108
  502. package/skills/skills/pptx/LICENSE.txt +0 -30
  503. package/skills/skills/pptx/SKILL.md +0 -484
  504. package/skills/skills/pptx/html2pptx.md +0 -625
  505. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chart.xsd +0 -1499
  506. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-chartDrawing.xsd +0 -146
  507. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-diagram.xsd +0 -1085
  508. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-lockedCanvas.xsd +0 -11
  509. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-main.xsd +0 -3081
  510. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-picture.xsd +0 -23
  511. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-spreadsheetDrawing.xsd +0 -185
  512. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/dml-wordprocessingDrawing.xsd +0 -287
  513. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/pml.xsd +0 -1676
  514. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-additionalCharacteristics.xsd +0 -28
  515. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-bibliography.xsd +0 -144
  516. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-commonSimpleTypes.xsd +0 -174
  517. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlDataProperties.xsd +0 -25
  518. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-customXmlSchemaProperties.xsd +0 -18
  519. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesCustom.xsd +0 -59
  520. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesExtended.xsd +0 -56
  521. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-documentPropertiesVariantTypes.xsd +0 -195
  522. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-math.xsd +0 -582
  523. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/shared-relationshipReference.xsd +0 -25
  524. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/sml.xsd +0 -4439
  525. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-main.xsd +0 -570
  526. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-officeDrawing.xsd +0 -509
  527. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-presentationDrawing.xsd +0 -12
  528. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-spreadsheetDrawing.xsd +0 -108
  529. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/vml-wordprocessingDrawing.xsd +0 -96
  530. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/wml.xsd +0 -3646
  531. package/skills/skills/pptx/ooxml/schemas/ISO-IEC29500-4_2016/xml.xsd +0 -116
  532. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-contentTypes.xsd +0 -42
  533. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-coreProperties.xsd +0 -50
  534. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-digSig.xsd +0 -49
  535. package/skills/skills/pptx/ooxml/schemas/ecma/fouth-edition/opc-relationships.xsd +0 -33
  536. package/skills/skills/pptx/ooxml/schemas/mce/mc.xsd +0 -75
  537. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2010.xsd +0 -560
  538. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2012.xsd +0 -67
  539. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-2018.xsd +0 -14
  540. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-cex-2018.xsd +0 -20
  541. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-cid-2016.xsd +0 -13
  542. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-sdtdatahash-2020.xsd +0 -4
  543. package/skills/skills/pptx/ooxml/schemas/microsoft/wml-symex-2015.xsd +0 -8
  544. package/skills/skills/pptx/ooxml/scripts/pack.py +0 -159
  545. package/skills/skills/pptx/ooxml/scripts/unpack.py +0 -29
  546. package/skills/skills/pptx/ooxml/scripts/validate.py +0 -69
  547. package/skills/skills/pptx/ooxml/scripts/validation/__init__.py +0 -15
  548. package/skills/skills/pptx/ooxml/scripts/validation/base.py +0 -951
  549. package/skills/skills/pptx/ooxml/scripts/validation/docx.py +0 -274
  550. package/skills/skills/pptx/ooxml/scripts/validation/pptx.py +0 -315
  551. package/skills/skills/pptx/ooxml/scripts/validation/redlining.py +0 -279
  552. package/skills/skills/pptx/ooxml.md +0 -427
  553. package/skills/skills/pptx/scripts/html2pptx.js +0 -979
  554. package/skills/skills/pptx/scripts/inventory.py +0 -1020
  555. package/skills/skills/pptx/scripts/rearrange.py +0 -231
  556. package/skills/skills/pptx/scripts/replace.py +0 -385
  557. package/skills/skills/pptx/scripts/thumbnail.py +0 -450
  558. package/skills/skills/skill-creator/LICENSE.txt +0 -202
  559. package/skills/skills/skill-creator/SKILL.md +0 -356
  560. package/skills/skills/skill-creator/references/output-patterns.md +0 -82
  561. package/skills/skills/skill-creator/references/workflows.md +0 -28
  562. package/skills/skills/skill-creator/scripts/init_skill.py +0 -303
  563. package/skills/skills/skill-creator/scripts/package_skill.py +0 -110
  564. package/skills/skills/skill-creator/scripts/quick_validate.py +0 -95
  565. package/skills/skills/slack-gif-creator/LICENSE.txt +0 -202
  566. package/skills/skills/slack-gif-creator/SKILL.md +0 -254
  567. package/skills/skills/slack-gif-creator/core/easing.py +0 -234
  568. package/skills/skills/slack-gif-creator/core/frame_composer.py +0 -176
  569. package/skills/skills/slack-gif-creator/core/gif_builder.py +0 -269
  570. package/skills/skills/slack-gif-creator/core/validators.py +0 -136
  571. package/skills/skills/slack-gif-creator/requirements.txt +0 -4
  572. package/skills/skills/theme-factory/LICENSE.txt +0 -202
  573. package/skills/skills/theme-factory/SKILL.md +0 -59
  574. package/skills/skills/theme-factory/theme-showcase.pdf +0 -0
  575. package/skills/skills/theme-factory/themes/arctic-frost.md +0 -19
  576. package/skills/skills/theme-factory/themes/botanical-garden.md +0 -19
  577. package/skills/skills/theme-factory/themes/desert-rose.md +0 -19
  578. package/skills/skills/theme-factory/themes/forest-canopy.md +0 -19
  579. package/skills/skills/theme-factory/themes/golden-hour.md +0 -19
  580. package/skills/skills/theme-factory/themes/midnight-galaxy.md +0 -19
  581. package/skills/skills/theme-factory/themes/modern-minimalist.md +0 -19
  582. package/skills/skills/theme-factory/themes/ocean-depths.md +0 -19
  583. package/skills/skills/theme-factory/themes/sunset-boulevard.md +0 -19
  584. package/skills/skills/theme-factory/themes/tech-innovation.md +0 -19
  585. package/skills/skills/web-artifacts-builder/LICENSE.txt +0 -202
  586. package/skills/skills/web-artifacts-builder/SKILL.md +0 -74
  587. package/skills/skills/web-artifacts-builder/scripts/bundle-artifact.sh +0 -54
  588. package/skills/skills/web-artifacts-builder/scripts/init-artifact.sh +0 -322
  589. package/skills/skills/webapp-testing/LICENSE.txt +0 -202
  590. package/skills/skills/webapp-testing/SKILL.md +0 -96
  591. package/skills/skills/webapp-testing/examples/console_logging.py +0 -35
  592. package/skills/skills/webapp-testing/examples/element_discovery.py +0 -40
  593. package/skills/skills/webapp-testing/examples/static_html_automation.py +0 -33
  594. package/skills/skills/webapp-testing/scripts/with_server.py +0 -106
  595. package/skills/skills/xlsx/LICENSE.txt +0 -30
  596. package/skills/skills/xlsx/SKILL.md +0 -289
  597. package/skills/skills/xlsx/recalc.py +0 -178
  598. package/skills/spec/agent-skills-spec.md +0 -3
  599. package/skills/template/SKILL.md +0 -6
  600. package/src/ai-client.ts +0 -1560
  601. package/src/remote-ai-client.ts +0 -664
  602. /package/{.eslintrc.js → .eslintrc.cjs} +0 -0
@@ -1,1089 +1,1234 @@
1
- /**
2
- * GUI Agent for xagent
3
- * Orchestrates desktop automation with AI-powered action execution
4
- * Based on UI-TARS architecture with computer control only
5
- *
6
- * This implementation is aligned with packages/ui-tars/sdk/src/GUIAgent.ts
7
- */
8
-
9
- import type {
10
- ScreenContext,
11
- ScreenshotOutput,
12
- ExecuteParams,
13
- ExecuteOutput,
14
- PredictionParsed,
15
- } from '../types/operator.js';
16
- import type { Operator } from '../operator/base-operator.js';
17
- import { sleep, asyncRetry } from '../utils.js';
18
- import { actionParser } from '../action-parser/index.js';
19
- import { colors, icons, renderMarkdown } from '../../theme.js';
20
- import { getLogger } from '../../logger.js';
21
-
22
- /**
23
- * Helper function to truncate long text
24
- */
25
- function truncateText(text: string, maxLength: number = 200): string {
26
- if (!text) return '';
27
- return text.length > maxLength ? text.substring(0, maxLength) + '...' : text;
28
- }
29
-
30
- /**
31
- * Helper function to indent multiline text
32
- */
33
- function indentMultiline(text: string, indent: string): string {
34
- return text.split('\n').map(line => indent + line).join('\n');
35
- }
36
-
37
- const guiLogger = getLogger();
38
-
39
- // UI-TARS Status Enum
40
- export enum GUIAgentStatus {
41
- INIT = 'init',
42
- RUNNING = 'running',
43
- PAUSE = 'paused',
44
- END = 'end',
45
- ERROR = 'error',
46
- USER_STOPPED = 'user_stopped',
47
- CALL_USER = 'call_user',
48
- }
49
-
50
- /**
51
- * Remote VLM Caller callback function type
52
- * Inject this function externally to handle VLM calls, GUI Agent doesn't need to know VLM implementation details
53
- * Receives full messages array (same as local mode) for consistent behavior
54
- */
55
- export type RemoteVlmCaller = (messages: any[], systemPrompt: string) => Promise<string>;
56
-
57
- export interface GUIAgentConfig<T extends Operator> {
58
- operator: T;
59
- model?: string;
60
- modelBaseUrl?: string;
61
- modelApiKey?: string;
62
- /**
63
- * Externally injected VLM caller function
64
- * If this function is provided, GUI Agent will use it to call VLM
65
- * instead of directly calling modelBaseUrl/modelApiKey
66
- * This allows GUI Agent to work with remote services without exposing any configuration
67
- */
68
- remoteVlmCaller?: RemoteVlmCaller;
69
- /**
70
- * Whether to use local mode
71
- * If true, use model/modelBaseUrl/modelApiKey for VLM calls
72
- * If false, use remoteVlmCaller for remote VLM calls
73
- */
74
- isLocalMode: boolean;
75
- systemPrompt?: string;
76
- loopIntervalInMs?: number;
77
- maxLoopCount?: number;
78
- logger?: any;
79
- signal?: AbortSignal;
80
- onData?: (data: GUIAgentData) => void;
81
- onError?: (error: Error) => void;
82
- showAIDebugInfo?: boolean;
83
- retry?: {
84
- screenshot?: {
85
- maxRetries?: number;
86
- onRetry?: (e: Error) => void;
87
- };
88
- model?: {
89
- maxRetries?: number;
90
- onRetry?: (e: Error) => void;
91
- };
92
- execute?: {
93
- maxRetries?: number;
94
- onRetry?: (e: Error) => void;
95
- };
96
- };
97
- }
98
-
99
- export interface GUIAgentData {
100
- status: GUIAgentStatus;
101
- conversations: Conversation[];
102
- error?: string;
103
- systemPrompt?: string;
104
- }
105
-
106
- export interface Conversation {
107
- from: 'human' | 'assistant';
108
- value: string;
109
- screenshotBase64?: string;
110
- screenshotContext?: {
111
- size: { width: number; height: number };
112
- mime?: string;
113
- scaleFactor: number;
114
- };
115
- actionType?: string;
116
- actionInputs?: Record<string, any>;
117
- timing?: {
118
- start: number;
119
- end: number;
120
- cost: number;
121
- };
122
- predictionParsed?: PredictionParsed[];
123
- }
124
-
125
- // UI-TARS constants (aligned with @ui-tars/shared/constants)
126
- const MAX_LOOP_COUNT = 100;
127
- const MAX_SNAPSHOT_ERR_CNT = 5;
128
- const MAX_STEP_RETRIES = 3; // Max retries for a single action step before giving up
129
- const IMAGE_PLACEHOLDER = '{{IMG_PLACEHOLDER_0}}';
130
-
131
- export class GUIAgent<T extends Operator> {
132
- private operator: T;
133
- private readonly model: string;
134
- private readonly modelBaseUrl: string;
135
- private readonly modelApiKey: string;
136
- private readonly remoteVlmCaller?: RemoteVlmCaller;
137
- private readonly isLocalMode: boolean;
138
- private readonly systemPrompt: string;
139
- private readonly loopIntervalInMs: number;
140
- private readonly maxLoopCount: number;
141
- private readonly logger: Console;
142
- private readonly signal?: AbortSignal;
143
- private readonly onData?: (data: GUIAgentData) => void;
144
- private readonly onError?: (error: Error) => void;
145
- private readonly showAIDebugInfo: boolean;
146
- private readonly retry?: GUIAgentConfig<T>['retry'];
147
-
148
- private isPaused = false;
149
- private resumePromise: Promise<void> | null = null;
150
- private resolveResume: (() => void) | null = null;
151
- private isStopped = false;
152
-
153
- constructor(config: GUIAgentConfig<T>) {
154
- this.operator = config.operator;
155
- this.model = config.model || '';
156
- this.modelBaseUrl = config.modelBaseUrl || '';
157
- this.modelApiKey = config.modelApiKey || '';
158
- this.remoteVlmCaller = config.remoteVlmCaller;
159
- this.isLocalMode = config.isLocalMode;
160
- this.loopIntervalInMs = config.loopIntervalInMs || 0;
161
- this.maxLoopCount = config.maxLoopCount || MAX_LOOP_COUNT;
162
- this.logger = config.logger || guiLogger;
163
- this.signal = config.signal;
164
- this.onData = config.onData;
165
- this.onError = config.onError;
166
- this.showAIDebugInfo = config.showAIDebugInfo ?? false;
167
- this.retry = config.retry;
168
-
169
- this.systemPrompt = config.systemPrompt || this.buildSystemPrompt();
170
- }
171
-
172
- /**
173
- * Display conversation results with formatting similar to session.ts (simplified)
174
- */
175
- private displayConversationResult(conversation: Conversation, iteration: number, indentLevel: number = 1): void {
176
- const indent = ' '.repeat(indentLevel);
177
- const innerIndent = ' '.repeat(indentLevel + 1);
178
- const maxWidth = process.stdout.columns || 80;
179
-
180
- if (conversation.from === 'assistant') {
181
- // Display assistant response (action)
182
- const content = conversation.value || '';
183
- const timing = conversation.timing;
184
-
185
- // Simplified: show step number and action
186
- const actionSummary = content.replace(/Thought:[\s\S]*?Action:\s*/i, '').trim();
187
- const actionType = conversation.predictionParsed?.[0]?.action_type || 'action';
188
-
189
- console.log(`${indent}${colors.primaryBright(`[${iteration}]`)} ${colors.textMuted(actionType)}${timing ? colors.textDim(` (${timing.cost}ms)`) : ''}`);
190
-
191
- // Optionally show action details on next line if verbose
192
- if (this.showAIDebugInfo && actionSummary) {
193
- const truncatedSummary = actionSummary.length > 60 ? actionSummary.substring(0, 60) + '...' : actionSummary;
194
- console.log(`${innerIndent}${colors.textMuted(truncatedSummary)}`);
195
- }
196
- } else if (conversation.from === 'human' && conversation.screenshotBase64) {
197
- // Show minimal indicator for screenshot
198
- if (this.showAIDebugInfo) {
199
- const timing = conversation.timing;
200
- console.log(`${indent}${colors.textMuted(`${icons.loading} screenshot${timing ? ` (${timing.cost}ms)` : ''}`)}`);
201
- }
202
- }
203
- }
204
-
205
- /**
206
- * Display status message
207
- */
208
- private displayStatus(data: GUIAgentData, iteration: number, indentLevel: number = 1): void {
209
- const indent = ' '.repeat(indentLevel);
210
- const status = data.status;
211
-
212
- switch (status) {
213
- case GUIAgentStatus.RUNNING:
214
- console.log(`${indent}${colors.info(`${icons.loading} Step ${iteration}: Running...`)}`);
215
- break;
216
- case GUIAgentStatus.END:
217
- // Handled by caller
218
- break;
219
- case GUIAgentStatus.ERROR:
220
- if (data.error) {
221
- console.log(`${indent}${colors.error(`${icons.cross} ${data.error}`)}`);
222
- }
223
- break;
224
- case GUIAgentStatus.CALL_USER:
225
- console.log(`${indent}${colors.warning(`${icons.warning} Needs user input`)}`);
226
- break;
227
- case GUIAgentStatus.USER_STOPPED:
228
- console.log(`${indent}${colors.warning(`${icons.warning} Stopped`)}`);
229
- break;
230
- default:
231
- break;
232
- }
233
- }
234
-
235
- private buildSystemPrompt(): string {
236
- return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
237
-
238
- ## Output Format
239
- \`
240
- Thought: ...
241
- Action: ...
242
- \`
243
-
244
- ## Action Space
245
- click(point='<point>x1 y1</point>')
246
- left_double(point='<point>x1 y1</point>')
247
- right_single(point='<point>x1 y1</point>')
248
- drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
249
- hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
250
- type(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \n at the end of content.
251
- scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
252
- open_url(url='https://xxx') # Open URL in browser
253
- wait() #Sleep for 5s and take a screenshot to check for any changes.
254
- finished(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format.
255
-
256
-
257
-
258
-
259
- ## Note
260
- - Use {language} in \`Thought\` part.
261
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
262
-
263
- `;
264
- }
265
-
266
-
267
-
268
- async initialize(): Promise<void> {
269
- await this.operator.doInitialize();
270
- }
271
-
272
- /**
273
- * Run the GUI agent with a single instruction (UI-TARS style)
274
- * All operations are determined by the GUI model
275
- */
276
- async run(instruction: string): Promise<GUIAgentData> {
277
- const data: GUIAgentData = {
278
- status: GUIAgentStatus.INIT,
279
- conversations: [
280
- {
281
- from: 'human',
282
- value: instruction,
283
- timing: {
284
- start: Date.now(),
285
- end: Date.now(),
286
- cost: 0,
287
- },
288
- },
289
- ],
290
- };
291
-
292
- // Initialize operator for initial screenshot
293
- try {
294
- await this.operator.doInitialize();
295
- } catch (initError) {
296
- const errorMsg = initError instanceof Error ? initError.message : 'Unknown error';
297
- this.logger.error(`[GUIAgent] Failed to initialize operator: ${errorMsg}`);
298
-
299
- // Check if it's an RDP-related issue
300
- if (errorMsg.includes('screen') || errorMsg.includes('capture') || errorMsg.includes('display')) {
301
- data.status = GUIAgentStatus.ERROR;
302
- data.error = 'Failed to initialize screen capture. This may be caused by:\n' +
303
- ' 1. Remote Desktop session disconnected or minimized\n' +
304
- ' 2. Display driver issues\n' +
305
- 'Suggestion: Ensure your display is active and try again.';
306
- } else {
307
- data.status = GUIAgentStatus.ERROR;
308
- data.error = `Failed to initialize operator: ${errorMsg}`;
309
- }
310
- return data;
311
- }
312
-
313
- const currentTime = Date.now();
314
-
315
- if (this.showAIDebugInfo) {
316
- this.logger.debug('[GUIAgent] run:', {
317
- systemPrompt: this.systemPrompt,
318
- model: this.model,
319
- maxLoopCount: this.maxLoopCount,
320
- });
321
- }
322
-
323
- let loopCnt = 0;
324
- let snapshotErrCnt = 0;
325
-
326
- // Start running agent
327
- data.status = GUIAgentStatus.RUNNING;
328
- data.systemPrompt = this.systemPrompt;
329
- console.log(`${colors.primaryBright(`${icons.rocket} GUI Agent started`)}`);
330
- console.log('');
331
- await this.onData?.({ ...data, conversations: [] });
332
-
333
- try {
334
- // eslint-disable-next-line no-constant-condition
335
- while (true) {
336
- if (this.showAIDebugInfo) {
337
- this.logger.debug('[GUIAgent] loopCnt:', loopCnt);
338
- }
339
-
340
- // Check pause status
341
- if (this.isPaused && this.resumePromise) {
342
- data.status = GUIAgentStatus.PAUSE;
343
- await this.onData?.({ ...data, conversations: [] });
344
- await this.resumePromise;
345
- data.status = GUIAgentStatus.RUNNING;
346
- await this.onData?.({ ...data, conversations: [] });
347
- }
348
-
349
- // Check stop or aborted status
350
- if (
351
- this.isStopped ||
352
- data.status !== GUIAgentStatus.RUNNING ||
353
- this.signal?.aborted
354
- ) {
355
- if (this.signal?.aborted) {
356
- data.status = GUIAgentStatus.USER_STOPPED;
357
- }
358
- break;
359
- }
360
-
361
- // Check loop limit
362
- if (loopCnt >= this.maxLoopCount) {
363
- data.status = GUIAgentStatus.ERROR;
364
- data.error = `Has reached max loop count: ${loopCnt}`;
365
- break;
366
- }
367
-
368
- // Check screenshot error limit
369
- if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
370
- data.status = GUIAgentStatus.ERROR;
371
- data.error = 'Screenshot failed too many times. Stopping task.';
372
- break;
373
- }
374
-
375
- loopCnt += 1;
376
- const start = Date.now();
377
-
378
- // Take screenshot (single attempt - no retry to avoid infinite loops)
379
- let snapshot: ScreenshotOutput;
380
- try {
381
- snapshot = await this.operator.doScreenshot();
382
- } catch (screenshotError) {
383
- const errorMsg = screenshotError instanceof Error ? screenshotError.message : 'Unknown error';
384
- this.logger.warn(`[GUIAgent] Screenshot exception: ${errorMsg}`);
385
- snapshotErrCnt += 1;
386
- data.status = GUIAgentStatus.ERROR;
387
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
388
- this.logger.error(`[GUIAgent] ${data.error}`);
389
- await sleep(1000);
390
- break;
391
- }
392
-
393
- // Check if screenshot returned failure status
394
- if (snapshot.status === 'failed') {
395
- const errorMsg = snapshot.errorMessage || 'Unknown error';
396
- this.logger.warn(`[GUIAgent] Screenshot failed: ${errorMsg}`);
397
- snapshotErrCnt += 1;
398
- data.status = GUIAgentStatus.ERROR;
399
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
400
- this.logger.error(`[GUIAgent] ${data.error}`);
401
- await sleep(1000);
402
- break;
403
- }
404
-
405
- // Check abort immediately after screenshot
406
- if (this.signal?.aborted) {
407
- data.status = GUIAgentStatus.USER_STOPPED;
408
- break;
409
- }
410
-
411
- // Validate screenshot
412
- const isValidImage = !!(snapshot?.base64);
413
- if (!isValidImage) {
414
- snapshotErrCnt += 1;
415
- data.status = GUIAgentStatus.ERROR;
416
- data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
417
- this.logger.error(`[GUIAgent] ${data.error}`);
418
- await sleep(1000);
419
- break;
420
- }
421
-
422
- // Reset error counter on successful screenshot
423
- snapshotErrCnt = 0;
424
-
425
- const end = Date.now();
426
-
427
- // Get screen context
428
- const screenContext = await this.operator.getScreenContext();
429
-
430
- // Add screenshot to conversation
431
- data.conversations.push({
432
- from: 'human',
433
- value: IMAGE_PLACEHOLDER,
434
- screenshotBase64: snapshot.base64,
435
- screenshotContext: {
436
- size: {
437
- width: screenContext.width,
438
- height: screenContext.height,
439
- },
440
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
441
- },
442
- timing: {
443
- start,
444
- end,
445
- cost: end - start,
446
- },
447
- });
448
-
449
- await this.onData?.({
450
- ...data,
451
- conversations: data.conversations.slice(-1),
452
- });
453
-
454
- // Display screenshot notification
455
- const latestScreenshot = data.conversations[data.conversations.length - 1];
456
- if (latestScreenshot && latestScreenshot.from === 'human' && latestScreenshot.screenshotBase64) {
457
- this.displayConversationResult(latestScreenshot, loopCnt);
458
- }
459
-
460
- // Build messages for model
461
- const messages = this.buildModelMessages(data.conversations, data.systemPrompt);
462
-
463
- // Check abort before model call
464
- if (this.signal?.aborted) {
465
- data.status = GUIAgentStatus.USER_STOPPED;
466
- break;
467
- }
468
-
469
- // Invoke model with retry
470
- let prediction: string;
471
- let parsedPredictions: PredictionParsed[];
472
- try {
473
- const modelResult: { prediction: string; parsedPredictions: PredictionParsed[] } = await asyncRetry(
474
- async (bail) => {
475
- try {
476
- const result = await this.callModelAPI(messages, screenContext, this.remoteVlmCaller!);
477
- return result;
478
- } catch (error: unknown) {
479
- if (
480
- error instanceof Error &&
481
- (error.name === 'AbortError' ||
482
- error.message?.includes('aborted'))
483
- ) {
484
- bail(error as Error);
485
- return { prediction: '', parsedPredictions: [] };
486
- }
487
- throw error;
488
- }
489
- },
490
- {
491
- retries: this.retry?.model?.maxRetries ?? 0,
492
- minTimeout: 1000 * 30,
493
- onRetry: this.retry?.model?.onRetry,
494
- }
495
- );
496
- prediction = modelResult.prediction;
497
- parsedPredictions = modelResult.parsedPredictions;
498
- } catch (modelError) {
499
- // Handle multimodal model API errors with specific error messages
500
- data.status = GUIAgentStatus.ERROR;
501
- const errorMsg = modelError instanceof Error ? modelError.message : String(modelError);
502
-
503
- // Provide specific error message based on error type
504
- if (errorMsg.includes('401') || errorMsg.includes('authentication') || errorMsg.includes('API key') || errorMsg.includes('api_key') || errorMsg.includes('Unauthorized') || errorMsg.includes('invalid_api_key')) {
505
- data.error = '[Multimodal Model Authentication Failed] The guiSubagentApiKey configuration is invalid.\n' +
506
- 'Error details: HTTP 401 - API key is invalid or expired\n' +
507
- 'Suggested action: Please check the guiSubagentApiKey configuration in ~/.xagent/settings.json and ensure a valid API key is set';
508
- } else if (errorMsg.includes('429') || errorMsg.includes('rate limit') || errorMsg.includes('too many requests')) {
509
- data.error = '[Multimodal Model Rate Limit Exceeded] API requests exceed rate limit.\n' +
510
- 'Error details: HTTP 429 - Too Many Requests\n' +
511
- 'Suggested action: Please retry later, or check your API account quota settings. Wait a few minutes before retrying';
512
- } else if (errorMsg.includes('network') || errorMsg.includes('fetch') || errorMsg.includes('connection') || errorMsg.includes('ECONNREFUSED')) {
513
- data.error = '[Multimodal Model Network Error] Cannot connect to API service.\n' +
514
- 'Error details: Network connection failed. Possible causes:\n' +
515
- ' 1. Network connection is lost\n' +
516
- ' 2. The guiSubagentBaseUrl configuration is incorrect\n' +
517
- ' 3. API service endpoint is unreachable\n' +
518
- 'Suggested action: Please check the guiSubagentBaseUrl configuration in ~/.xagent/settings.json and ensure network connectivity';
519
- } else if (errorMsg.includes('404') || errorMsg.includes('not found') || errorMsg.includes('model not found') || errorMsg.includes('InvalidEndpointOrModel.NotFound')) {
520
- // Extract model name
521
- const modelMatch = errorMsg.match(/model[:\s]+([^\s,"]+)|"model[:"]+([^",}]+)/i);
522
- const modelName = modelMatch ? (modelMatch[1] || modelMatch[2]) : 'Unknown';
523
- data.error = '[Multimodal Model Configuration Error] The model specified in guiSubagentModel does not exist or is not accessible.\n' +
524
- 'Error details: HTTP 404 - Model or Endpoint not found\n' +
525
- 'Configured model name: ' + modelName + '\n' +
526
- 'Suggested action: Please check the guiSubagentModel configuration in ~/.xagent/settings.json, remove or replace with a valid model name';
527
- } else {
528
- data.error = '[Multimodal Model API Call Failed]\n' +
529
- 'Error details: ' + errorMsg + '\n' +
530
- 'Please check the following configuration items:\n' +
531
- ' - guiSubagentApiKey: API key\n' +
532
- ' - guiSubagentBaseUrl: API service URL\n' +
533
- ' - guiSubagentModel: Model name\n' +
534
- 'Config file location: ~/.xagent/settings.json';
535
- }
536
- break;
537
- }
538
-
539
- // Check abort immediately after model call
540
- if (this.signal?.aborted) {
541
- data.status = GUIAgentStatus.USER_STOPPED;
542
- break;
543
- }
544
-
545
- if (!prediction) {
546
- this.logger.warn('[GUIAgent] Warning: Empty response from model, retrying...');
547
- continue;
548
- }
549
-
550
- if (this.showAIDebugInfo) {
551
- this.logger.debug('[GUIAgent] Response:', prediction);
552
- this.logger.debug('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
553
- }
554
-
555
- const predictionSummary = this.getSummary(prediction);
556
-
557
- data.conversations.push({
558
- from: 'assistant',
559
- value: predictionSummary,
560
- timing: {
561
- start,
562
- end: Date.now(),
563
- cost: Date.now() - start,
564
- },
565
- screenshotContext: {
566
- size: {
567
- width: screenContext.width,
568
- height: screenContext.height,
569
- },
570
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
571
- },
572
- predictionParsed: parsedPredictions,
573
- });
574
-
575
- await this.onData?.({
576
- ...data,
577
- conversations: data.conversations.slice(-1),
578
- });
579
-
580
- // Display assistant response
581
- const latestAssistant = data.conversations[data.conversations.length - 1];
582
- if (latestAssistant && latestAssistant.from === 'assistant') {
583
- this.displayConversationResult(latestAssistant, loopCnt);
584
- }
585
-
586
- // Check if we need to switch operator based on first action
587
- // Execute actions
588
- for (const parsedPrediction of parsedPredictions) {
589
- const actionType = parsedPrediction.action_type;
590
-
591
- if (this.showAIDebugInfo) {
592
- this.logger.debug('[GUIAgent] Action:', actionType);
593
- }
594
-
595
- // Handle internal action spaces
596
- if (actionType === 'error_env') {
597
- data.status = GUIAgentStatus.ERROR;
598
- data.error = 'Environment error';
599
- break;
600
- } else if (actionType === 'max_loop') {
601
- data.status = GUIAgentStatus.ERROR;
602
- data.error = 'Reached max loop';
603
- break;
604
- }
605
-
606
- // Execute action with retry
607
- if (!this.signal?.aborted && !this.isStopped) {
608
- let stepRetryCount = 0;
609
- let stepSuccess = false;
610
- let lastErrorMsg = '';
611
-
612
- this.logger.debug(`[GUIAgent] Executing action: ${actionType}, loopCnt: ${loopCnt}`);
613
-
614
- while (stepRetryCount < MAX_STEP_RETRIES && !stepSuccess) {
615
- try {
616
- const executeResult = await this.operator.doExecute({
617
- prediction,
618
- parsedPrediction,
619
- screenWidth: screenContext.width,
620
- screenHeight: screenContext.height,
621
- scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
622
- factors: [1000, 1000], // Default factors
623
- });
624
-
625
- if (executeResult.status === 'end') {
626
- // 'finished' action or explicit end
627
- stepSuccess = true;
628
- break;
629
- }
630
-
631
- // Any other status (success, failed, etc.) is considered success
632
- stepSuccess = true;
633
- break;
634
- } catch (executeError) {
635
- stepRetryCount++;
636
- lastErrorMsg = executeError instanceof Error ? executeError.message : 'Unknown error';
637
- this.logger.warn(`[GUIAgent] Action failed ${stepRetryCount}/${MAX_STEP_RETRIES}: ${lastErrorMsg}`);
638
-
639
- if (stepRetryCount < MAX_STEP_RETRIES) {
640
- await sleep(1000);
641
- // Take new screenshot for retry
642
- const retrySnapshot = await this.operator.doScreenshot();
643
- if (retrySnapshot?.base64) {
644
- data.conversations.push({
645
- from: 'human',
646
- value: IMAGE_PLACEHOLDER,
647
- screenshotBase64: retrySnapshot.base64,
648
- screenshotContext: {
649
- size: {
650
- width: screenContext.width,
651
- height: screenContext.height,
652
- },
653
- scaleFactor: retrySnapshot.scaleFactor ?? screenContext.scaleFactor,
654
- },
655
- });
656
- }
657
- }
658
- }
659
- }
660
-
661
- if (!stepSuccess) {
662
- // All retries exhausted
663
- this.logger.error(`[GUIAgent] Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`);
664
- data.status = GUIAgentStatus.ERROR;
665
- data.error = `Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`;
666
- break;
667
- }
668
- }
669
-
670
- // Check abort immediately after action execution
671
- if (this.signal?.aborted) {
672
- data.status = GUIAgentStatus.USER_STOPPED;
673
- break;
674
- }
675
-
676
- // Handle special action types
677
- if (actionType === 'call_user') {
678
- data.status = GUIAgentStatus.CALL_USER;
679
- break;
680
- } else if (actionType === 'finished') {
681
- data.status = GUIAgentStatus.END;
682
- break;
683
- }
684
- }
685
-
686
- // Check abort after action loop
687
- if (this.signal?.aborted) {
688
- data.status = GUIAgentStatus.USER_STOPPED;
689
- break;
690
- }
691
-
692
- // Wait between iterations
693
- if (this.loopIntervalInMs > 0) {
694
- await sleep(this.loopIntervalInMs);
695
- }
696
- }
697
- } catch (error) {
698
- this.logger.error('[GUIAgent] Catch error', error);
699
- if (
700
- error instanceof Error &&
701
- (error.name === 'AbortError' || error.message?.includes('aborted'))
702
- ) {
703
- data.status = GUIAgentStatus.USER_STOPPED;
704
- } else {
705
- data.status = GUIAgentStatus.ERROR;
706
- data.error = error instanceof Error ? error.message : 'Unknown error';
707
- }
708
- } finally {
709
- // Save final status
710
- const finalStatus = data.status;
711
- const finalError = data.error;
712
-
713
- // Output error immediately if task failed
714
- if (finalStatus === GUIAgentStatus.ERROR && finalError) {
715
- console.log(`\n${colors.error('✖')} ${finalError}\n`);
716
- }
717
-
718
- // Call onData callback if set
719
- // Note: Use Promise.resolve().then() to avoid modifying data in callback
720
- const onDataCallback = this.onData;
721
- if (onDataCallback) {
722
- Promise.resolve().then(() => onDataCallback({ ...data, conversations: [] }));
723
- }
724
-
725
- // Call onError callback if status is error
726
- if (finalStatus === GUIAgentStatus.ERROR && this.onError) {
727
- this.onError(new Error(finalError || 'Unknown error occurred'));
728
- }
729
-
730
- if (this.showAIDebugInfo) {
731
- this.logger.debug('[GUIAgent] Final status:', {
732
- status: finalStatus,
733
- loopCnt,
734
- totalConversations: data.conversations.length,
735
- });
736
- }
737
-
738
- // Ensure the returned status is correct (reassign)
739
- this.logger.debug(`[GUIAgent] Finally: finalStatus=${finalStatus}, finalError=${finalError}, data.status=${data.status}, data.error=${data.error}`);
740
-
741
- // Log final status (only visible when showAIDebugInfo is enabled)
742
- this.logger.debug(`[GUIAgent] Final status: ${finalStatus}${finalError ? `, Error: ${finalError}` : ''}, Steps: ${loopCnt}`);
743
-
744
- data.status = finalStatus;
745
- data.error = finalError;
746
- }
747
-
748
- return data;
749
- }
750
-
751
/**
 * Convert the recorded conversation into the chat-message array sent to the model.
 *
 * The result always starts with one `system` message carrying `systemPrompt`.
 * Each conversation turn then becomes one message, in order:
 *  - a human turn that has a screenshot becomes a `user` message with a text
 *    part and a PNG data-URL image part (detail `high`);
 *  - an assistant turn becomes an `assistant` message with its text;
 *  - anything else becomes a plain-text `user` message.
 *
 * @param conversations - Turns recorded so far.
 * @param systemPrompt - Text placed in the leading system message.
 * @returns The message array (system message first).
 */
private buildModelMessages(conversations: Conversation[], systemPrompt: string): any[] {
  const toMessage = (turn: Conversation): any => {
    if (turn.from === 'assistant') {
      return { role: 'assistant', content: turn.value };
    }

    if (turn.from === 'human' && turn.screenshotBase64) {
      return {
        role: 'user',
        content: [
          { type: 'text', text: turn.value },
          {
            type: 'image_url',
            image_url: {
              url: `data:image/png;base64,${turn.screenshotBase64}`,
              detail: 'high',
            },
          },
        ],
      };
    }

    return { role: 'user', content: turn.value };
  };

  return [{ role: 'system', content: systemPrompt }, ...conversations.map(toMessage)];
}
794
-
795
/**
 * Pull the screenshot and the text prompt out of the last message.
 *
 * Only the final entry of `messages` is inspected, and only when its content
 * is an array of parts. For a `data:image...` URL the part after the first
 * comma is returned; any other URL is returned unchanged. Missing pieces
 * come back as empty strings.
 *
 * @param messages - Chat messages; the last one is expected to be the user turn.
 * @returns The image payload and the prompt text.
 */
private extractImageAndPrompt(messages: any[]): { image: string; prompt: string } {
  const tail = messages[messages.length - 1];
  if (!tail || !Array.isArray(tail.content)) {
    return { image: '', prompt: '' };
  }

  const parts: any[] = tail.content;
  const picture = parts.find((part: any) => part.type === 'image_url');
  const caption = parts.find((part: any) => part.type === 'text');

  let image = '';
  if (picture) {
    const url = picture.image_url?.url || '';
    image = url.startsWith('data:image') ? url.split(',')[1] || '' : url;
  }

  return { image, prompt: caption?.text || '' };
}
820
-
821
- /**
822
- * Print a boxed, truncated dump of an outgoing model request to the console.
- * Shows the model name, base URL and message count, then the system prompt,
- * then every non-system message. Output goes straight to console.log.
- * @param messages - Chat messages about to be sent.
- * @param remoteVlmCaller - When given, model/base URL are read from its optional `info` object.
823
- */
824
- private debugRequest(messages: any[], remoteVlmCaller?: RemoteVlmCaller): void {
825
- console.log('\n╔══════════════════════════════════════════════════════════╗');
826
- console.log('║ GUI MODEL REQUEST DEBUG ║');
827
- console.log('╚══════════════════════════════════════════════════════════╝');
828
- console.log(`📦 Model: ${remoteVlmCaller ? ((remoteVlmCaller as any).info?.model || 'remote') : this.model}`); // falls back to the literal 'remote' when the caller exposes no info.model
829
- console.log(`🌐 Base URL: ${remoteVlmCaller ? ((remoteVlmCaller as any).info?.baseUrl || 'remote') : (this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1')}`); // local fallback chain: configured URL, MODEL_BASE_URL, OpenAI default
830
- console.log(`💬 Messages: ${messages.length}`);
831
-
832
- // Show system prompt if present
833
- const systemMsg = messages.find((m: any) => m.role === 'system'); // first system message only
834
- if (systemMsg) {
835
- console.log('\n┌─────────────────────────────────────────────────────────────┐');
836
- console.log('│ 🟫 SYSTEM │');
837
- console.log('├─────────────────────────────────────────────────────────────┤');
838
- const systemContent = typeof systemMsg.content === 'string'
839
- ? systemMsg.content
840
- : JSON.stringify(systemMsg.content);
841
- const lines = systemContent.split('\n').slice(0, 15); // at most 15 lines of the system prompt
842
- for (const line of lines) {
843
- console.log('│ ' + line.slice(0, 62)); // each printed line is cut to 62 characters
844
- }
845
- if (systemContent.split('\n').length > 15) {
846
- console.log('│ ... (truncated)');
847
- }
848
- console.log('└─────────────────────────────────────────────────────────────┘');
849
- }
850
-
851
- // Show conversation messages
852
- const roleColors: Record<string, string> = { // role -> label text (plain labels, despite the name)
853
- user: '👤 USER',
854
- assistant: '🤖 ASSISTANT',
855
- };
856
-
857
- for (let i = 0; i < messages.length; i++) {
858
- const msg = messages[i];
859
- if (msg.role === 'system') continue; // system messages were handled above
860
-
861
- const roleLabel = roleColors[msg.role] || `● ${msg.role.toUpperCase()}`;
862
- console.log(`\n┌─────────────────────────────────────────────────────────────┐`);
863
- console.log(`│ ${roleLabel} (${i + 1}) │`); // number is the 1-based position in the full array, system message included
864
- console.log('├─────────────────────────────────────────────────────────────┤');
865
-
866
- if (typeof msg.content === 'string') {
867
- const lines = msg.content.split('\n').slice(0, 20); // at most 20 lines for plain-text content
868
- for (const line of lines) {
869
- console.log('│ ' + line.slice(0, 62));
870
- }
871
- if (msg.content.split('\n').length > 20) {
872
- console.log('│ ... (truncated)');
873
- }
874
- } else if (Array.isArray(msg.content)) {
875
- const hasImage = msg.content.some((c: any) => c.type === 'image_url');
876
- console.log('│ 📎 Content blocks: ' + msg.content.length);
877
- if (hasImage) {
878
- const imageBlock = msg.content.find((c: any) => c.type === 'image_url');
879
- const imageSize = imageBlock?.image_url?.url?.length || 0; // character length of the URL string, not the decoded image size
880
- console.log('│ 🖼️ Image size: ' + (imageSize / 1024).toFixed(2) + ' KB');
881
- }
882
- const textBlock = msg.content.find((c: any) => c.type === 'text');
883
- if (textBlock?.text) {
884
- const lines = textBlock.text.split('\n').slice(0, 10); // at most 10 lines; no truncation notice is printed here
885
- for (const line of lines) {
886
- console.log('│ ' + line.slice(0, 62));
887
- }
888
- }
889
- }
890
- console.log('└─────────────────────────────────────────────────────────────┘');
891
- }
892
-
893
- console.log('\n📤 Sending request to model API...\n');
894
- }
895
-
896
- /**
897
- * Print a boxed, truncated dump of a model reply to the console.
- * @param content - Reply text; only its first 30 lines are printed.
- * @param usage - Optional token-usage object; prompt_tokens, completion_tokens and total_tokens are read from it.
898
- */
899
- private debugResponse(content: string, usage?: any): void {
900
- console.log('\n╔══════════════════════════════════════════════════════════╗');
901
- console.log('║ GUI MODEL RESPONSE DEBUG ║');
902
- console.log('╚══════════════════════════════════════════════════════════╝');
903
-
904
- if (usage) {
905
- console.log(`📊 Tokens: ${usage.prompt_tokens} (prompt) + ${usage.completion_tokens} (completion) = ${usage.total_tokens} (total)`); // a missing field prints as "undefined"
906
- }
907
-
908
- console.log('\n┌─────────────────────────────────────────────────────────────┐');
909
- console.log('│ 🤖 ASSISTANT │');
910
- console.log('├─────────────────────────────────────────────────────────────┤');
911
- console.log('│ 💬 CONTENT:');
912
- console.log(' ───────────────────────────────────────────────────────────');
913
-
914
- const lines = content.split('\n').slice(0, 30); // at most 30 lines of the reply
915
- for (const line of lines) {
916
- console.log('│ ' + line.slice(0, 62)); // each printed line is cut to 62 characters
917
- }
918
- if (content.split('\n').length > 30) {
919
- console.log(`│ ... (${content.split('\n').length - 30} more lines)`); // reports how many lines were left out
920
- }
921
- console.log('│ ───────────────────────────────────────────────────────────');
922
- console.log('└─────────────────────────────────────────────────────────────┘');
923
-
924
- console.log('\n╔══════════════════════════════════════════════════════════╗');
925
- console.log('║ RESPONSE ENDED ║');
926
- console.log('╚══════════════════════════════════════════════════════════╝\n');
927
- }
928
-
929
/**
 * Call an OpenAI-compatible `/chat/completions` endpoint directly (local mode)
 * and parse the reply into actions.
 *
 * The base URL and API key come from the agent's own settings first, then the
 * MODEL_BASE_URL / MODEL_API_KEY environment variables, then built-in defaults.
 *
 * @param messages - Chat messages sent as the request's `messages` field.
 * @param screenContext - Supplies the width/height handed to `actionParser`.
 * @returns The raw model text and the actions parsed from it.
 * @throws Error when the endpoint answers with a non-OK status. Errors raised
 *   by `fetch` itself (including aborts through `this.signal`) propagate as-is.
 */
private async callLocalVLM(
  messages: any[],
  screenContext: ScreenContext
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  const configuredBaseUrl =
    this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1';
  // Drop trailing slashes so ".../v1/" does not turn into ".../v1//chat/completions".
  const baseUrl = configuredBaseUrl.replace(/\/+$/, '');
  const apiKey = this.modelApiKey || process.env.MODEL_API_KEY || '';

  const requestBody = {
    model: this.model,
    messages,
    max_tokens: 1024,
    temperature: 0.1,
  };

  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages);
  }

  // No try/catch here: a catch that only rethrows adds nothing, so fetch
  // failures and aborts simply propagate to the caller.
  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(requestBody),
    signal: this.signal,
  });

  // Handle non-OK responses; include the status because the body is often empty.
  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(
      `Model API error (${response.status} ${response.statusText}): ${errorText}`
    );
  }

  const result = (await response.json()) as {
    choices?: Array<{ message?: { content?: string } }>;
    usage?: any;
  };
  // A reply without choices/content is treated as an empty prediction.
  const content = result.choices?.[0]?.message?.content || '';

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(content, result.usage);
  }

  const { parsed: parsedPredictions } = actionParser({
    prediction: content,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });

  return {
    prediction: content,
    parsedPredictions,
  };
}
994
-
995
/**
 * Ask the model for the next prediction and parse it into actions.
 *
 * Local mode delegates to `callLocalVLM` (model / modelBaseUrl / modelApiKey).
 * Remote mode passes the full message array and the system prompt to the
 * injected `remoteVlmCaller`, then parses the returned text with `actionParser`.
 *
 * @param messages - Chat messages for the model.
 * @param screenContext - Supplies the width/height handed to `actionParser`.
 * @param remoteVlmCaller - Remote caller; only used when not in local mode.
 * @returns The raw prediction text and the parsed actions.
 * @throws Error in remote mode when no usable `remoteVlmCaller` was supplied.
 */
private async callModelAPI(
  messages: any[],
  screenContext: ScreenContext,
  remoteVlmCaller: RemoteVlmCaller
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  // === LOCAL mode ===
  if (this.isLocalMode) {
    return this.callLocalVLM(messages, screenContext);
  }

  // === REMOTE mode ===
  // Callers may pass an unset caller through a non-null assertion; fail with a
  // clear message instead of "remoteVlmCaller is not a function".
  if (typeof remoteVlmCaller !== 'function') {
    throw new Error('[GUIAgent] Remote mode requires a remoteVlmCaller, but none was provided');
  }

  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages, remoteVlmCaller);
  }

  // Use the externally injected VLM caller with the full messages (same as local mode)
  const prediction = await remoteVlmCaller(messages, this.systemPrompt);

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(prediction);
  }

  const { parsed: parsedPredictions } = actionParser({
    prediction,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });

  return {
    prediction,
    parsedPredictions,
  };
}
1040
-
1041
/**
 * Derive a short summary from a prediction.
 *
 * Returns the trimmed text that follows the first "Action:" marker
 * (case-insensitive) through the end of the prediction. Without a marker,
 * returns the first 200 characters of the prediction.
 *
 * NOTE(review): the character class lists the same colon twice as rendered
 * here; one of them may originally have been a full-width colon — confirm.
 *
 * @param prediction - Raw model output.
 * @returns The action part, or a 200-character prefix.
 */
private getSummary(prediction: string): string {
  const found = /Action[::]\s*([\s\S]+)$/i.exec(prediction);
  return found ? found[1].trim() : prediction.slice(0, 200);
}
1052
-
1053
/**
 * Request a pause. Sets the paused flag and creates the promise that
 * `resume()` later resolves.
 *
 * Calling it again while a pause is already pending does nothing. Replacing
 * the pending promise and its resolver would strand anything already awaiting
 * the first promise, because `resume()` could then only resolve the newest one.
 */
pause(): void {
  if (this.isPaused && this.resumePromise) {
    return;
  }

  this.isPaused = true;
  this.resumePromise = new Promise<void>((resolve) => {
    this.resolveResume = resolve;
  });
}
1059
-
1060
/**
 * Leave the paused state. Clears the paused flag; when a pause promise is
 * pending, it is resolved and both the promise and its resolver are reset.
 */
resume(): void {
  this.isPaused = false;

  const release = this.resolveResume;
  if (!release) {
    return;
  }

  this.resumePromise = null;
  this.resolveResume = null;
  release();
}
1068
-
1069
/**
 * Ask the agent to stop.
 *
 * Sets the stop flag and also releases a pending pause. A loop that is
 * awaiting the pause promise cannot see the stop flag until that promise
 * settles, so stopping a paused agent would otherwise wait for `resume()`.
 */
stop(): void {
  this.isStopped = true;

  if (this.resolveResume) {
    this.resolveResume();
    this.resumePromise = null;
    this.resolveResume = null;
  }
  this.isPaused = false;
}
1072
-
1073
/**
 * Release the operator and detach the cancellation listener, if one was attached.
 *
 * The listener is detached in a `finally` block so a failing
 * `operator.cleanup()` cannot leave it registered. The operator's error
 * still propagates to the caller.
 */
async cleanup(): Promise<void> {
  this.logger.debug('Cleaning up GUI Agent...');
  try {
    await this.operator.cleanup();
  } finally {
    // These two properties are attached dynamically elsewhere (hence the casts).
    const cancelHandler = (this as any)._cancelHandler;
    const cancellationManager = (this as any)._cancellationManager;
    if (cancelHandler && cancellationManager) {
      cancellationManager.off('cancelled', cancelHandler);
      (this as any)._cancelHandler = undefined;
      (this as any)._cancellationManager = undefined;
    }
  }
}
1086
- }
1087
-
1088
- export { GUIAgentStatus as StatusEnum }; // re-export the status enum under the alias StatusEnum
1089
-
1
+ /**
2
+ * GUI Agent for xagent
3
+ * Orchestrates desktop automation with AI-powered action execution
4
+ * Based on UI-TARS architecture with computer control only
5
+ *
6
+ * This implementation is aligned with packages/ui-tars/sdk/src/GUIAgent.ts
7
+ */
8
+
9
+ import type {
10
+ ScreenContext,
11
+ ScreenshotOutput,
12
+ PredictionParsed,
13
+ } from '../types/operator.js';
14
+ import type { Operator } from '../operator/base-operator.js';
15
+ import { sleep, asyncRetry } from '../utils.js';
16
+ import { actionParser } from '../action-parser/index.js';
17
+ import { colors, icons} from '../../theme.js';
18
+ import { getLogger } from '../../logger.js';
19
+ import { SdkOutputAdapter } from '../../sdk-output-adapter.js';
20
+
21
/**
 * Shorten `text` to at most `maxLength` UTF-16 code units and append `...`
 * when anything was cut off.
 *
 * The cut never lands inside a surrogate pair. If it would, the whole
 * character is dropped so the result has no lone surrogate.
 *
 * @param text - Text to shorten; a falsy value yields an empty string.
 * @param maxLength - Maximum number of code units kept before the ellipsis.
 * @returns `text` unchanged when it fits, otherwise the shortened text plus `...`.
 */
function _truncateText(text: string, maxLength = 200): string {
  if (!text) return '';
  if (!(text.length > maxLength)) return text;

  let end = Math.max(0, Math.floor(maxLength));
  if (end > 0) {
    const before = text.charCodeAt(end - 1);
    const after = text.charCodeAt(end);
    const splitsPair =
      before >= 0xd800 && before <= 0xdbff && after >= 0xdc00 && after <= 0xdfff;
    if (splitsPair) {
      end -= 1;
    }
  }

  return text.substring(0, end) + '...';
}
28
+
29
/**
 * Prefix every line of `text` with `indent`.
 *
 * Lines are separated on `\n` only; empty lines (and an empty input) still
 * receive the prefix.
 *
 * @param text - Possibly multi-line text.
 * @param indent - Prefix placed in front of each line.
 * @returns The indented text.
 */
function _indentMultiline(text: string, indent: string): string {
  const rows = text.split('\n');
  return indent + rows.join('\n' + indent);
}
35
+
36
+ const guiLogger = getLogger(); // module-level logger; GUIAgent falls back to it when no logger is configured
37
+
38
+ // Status values an agent run reports through GUIAgentData.status (UI-TARS status enum)
39
+ export enum GUIAgentStatus {
40
+ INIT = 'init', // initial status of the data object created by run()
41
+ RUNNING = 'running', // set once the operator is initialized and the loop starts
42
+ PAUSE = 'paused', // set while run() waits for the pause to be lifted
43
+ END = 'end',
44
+ ERROR = 'error', // the visible call sites also fill GUIAgentData.error
45
+ USER_STOPPED = 'user_stopped', // set when the abort signal has fired
46
+ CALL_LLM = 'call_llm', // NOTE(review): not used in the visible part of the file; confirm its meaning
47
+ }
48
+
49
+ /**
50
+ * Callback type for remote VLM calls.
51
+ * Injected from outside so the GUI Agent needs no knowledge of how the VLM is reached.
52
+ * Receives the full messages array (same as local mode) for consistent behavior.
53
+ * @param messages - Full messages array
54
+ * @param systemPrompt - System prompt (for reference)
55
+ * @param taskId - Task identifier for backend tracking
56
+ * @param isFirstVlmCallRef - Reference object to track and update first VLM call state
+ * @returns Promise resolving to a string (presumably the model's reply text; confirm against the caller)
57
+ */
58
+ export type RemoteVlmCaller = (messages: any[], systemPrompt: string, taskId: string, isFirstVlmCallRef: { current: boolean }) => Promise<string>;
59
+
60
+ export interface GUIAgentConfig<T extends Operator> { // constructor options for GUIAgent
61
+ operator: T; // performs initialization, screenshots and screen-context lookups for the agent
62
+ model?: string; // defaults to '' in the constructor
63
+ modelBaseUrl?: string; // defaults to '' in the constructor
64
+ modelApiKey?: string; // defaults to '' in the constructor
65
+ /**
66
+ * Task identifier for VLM state tracking (begin vs continue)
+ * A random UUID is generated when omitted.
67
+ */
68
+ taskId?: string;
69
+ /**
70
+ * Shared ref object to track first VLM call across createGUISubAgent calls
71
+ * Must be passed from outside to properly track VLM status across loop iterations
72
+ */
73
+ isFirstVlmCallRef?: { current: boolean };
74
+ /**
75
+ * Externally injected VLM caller function
76
+ * If this function is provided, GUI Agent will use it to call VLM
77
+ * instead of directly calling modelBaseUrl/modelApiKey
78
+ * This allows GUI Agent to work with remote services without exposing any configuration
79
+ */
80
+ remoteVlmCaller?: RemoteVlmCaller;
81
+ /**
82
+ * Whether to use local mode
83
+ * If true, use model/modelBaseUrl/modelApiKey for VLM calls
84
+ * If false, use remoteVlmCaller for remote VLM calls
85
+ */
86
+ isLocalMode: boolean;
87
+ systemPrompt?: string; // when empty or omitted, the built-in prompt from buildSystemPrompt() is used
88
+ loopIntervalInMs?: number; // defaults to 0
89
+ maxLoopCount?: number; // defaults to MAX_LOOP_COUNT; 0 also selects the default
90
+ logger?: any; // defaults to the module-level logger
91
+ signal?: AbortSignal; // checked between steps; an aborted signal ends the run as USER_STOPPED
92
+ /**
93
+ * SDK output adapter for SDK mode output
94
+ * When provided, GUI Agent will use it to output status and progress in SDK format
95
+ */
96
+ sdkOutputAdapter?: SdkOutputAdapter | null;
97
+ onData?: (data: GUIAgentData) => void; // progress callback; run() awaits it
98
+ onError?: (error: Error) => void;
99
+ showAIDebugInfo?: boolean; // defaults to false
100
+ indentLevel?: number; // console indentation depth; defaults to 1
101
+ retry?: {
102
+ screenshot?: { // NOTE(review): not read in the visible part of run(), which takes a single screenshot attempt
103
+ maxRetries?: number;
104
+ onRetry?: (e: Error) => void;
105
+ };
106
+ model?: { // retry policy for the model call
107
+ maxRetries?: number; // defaults to 0 when not set
108
+ onRetry?: (e: Error) => void;
109
+ };
110
+ execute?: { // NOTE(review): usage not visible in this part of the file; confirm
111
+ maxRetries?: number;
112
+ onRetry?: (e: Error) => void;
113
+ };
114
+ };
115
+ }
116
+
117
+ export interface GUIAgentData { // result object built up and returned by run(), also passed to onData
118
+ status: GUIAgentStatus; // current lifecycle status of the run
119
+ conversations: Conversation[]; // starts with the instruction; each step appends a screenshot turn
120
+ error?: string; // human-readable failure message, set together with an ERROR status
121
+ systemPrompt?: string; // copy of the agent's system prompt, set when the run starts
122
+ }
123
+
124
+ export interface Conversation { // one turn of the recorded dialogue between the agent loop and the model
125
+ from: 'human' | 'assistant'; // 'human' covers both the instruction and screenshot turns
126
+ value: string; // turn text; screenshot turns carry the image placeholder here
127
+ screenshotBase64?: string; // base64 image data, present on screenshot turns
128
+ screenshotContext?: {
129
+ size: { width: number; height: number }; // filled from the operator's screen context
130
+ mime?: string;
131
+ scaleFactor: number; // the snapshot's scale factor, else the screen context's
132
+ };
133
+ actionType?: string;
134
+ actionInputs?: Record<string, any>;
135
+ timing?: {
136
+ start: number; // Date.now() timestamp taken when the step began
137
+ end: number; // Date.now() timestamp taken when the step finished
138
+ cost: number; // end - start, in milliseconds
139
+ };
140
+ predictionParsed?: PredictionParsed[]; // parsed actions; the first entry's action_type is used for display
141
+ }
142
+
143
+ // UI-TARS constants (aligned with @ui-tars/shared/constants)
144
+ const MAX_LOOP_COUNT = 100; // default for GUIAgentConfig.maxLoopCount
145
+ const MAX_SNAPSHOT_ERR_CNT = 5; // run() stops with an error once this many screenshot failures are counted
146
+ const MAX_STEP_RETRIES = 3; // Max retries for a single action step before giving up
147
+ const IMAGE_PLACEHOLDER = '{{IMG_PLACEHOLDER_0}}'; // text value stored on screenshot turns
148
+
149
+ export class GUIAgent<T extends Operator> {
150
+ private operator: T; // screen/input backend supplied by the caller
150
+ private readonly model: string; // '' when not configured
151
+ private readonly modelBaseUrl: string; // '' when not configured
152
+ private readonly modelApiKey: string; // '' when not configured
153
+ private readonly taskId: string; // configured id or a generated UUID
154
+ private readonly isFirstVlmCallRef?: { current: boolean }; // shared first-call flag handed in by the caller
155
+ private readonly remoteVlmCaller?: RemoteVlmCaller; // injected remote model caller
156
+ private readonly isLocalMode: boolean; // also reported as 'local' or 'remote' to the SDK adapter
157
+ private readonly systemPrompt: string; // configured prompt or the built-in one
158
+ private readonly loopIntervalInMs: number; // 0 when not configured
159
+ private readonly maxLoopCount: number; // loop limit enforced by run()
160
+ private readonly logger: Console; // configured logger or the module-level default
161
+ private readonly signal?: AbortSignal; // abort signal polled throughout run()
162
+ private readonly sdkOutputAdapter?: SdkOutputAdapter | null; // when set, console output is replaced by adapter calls
163
+ private readonly onData?: (data: GUIAgentData) => void; // progress callback
164
+ private readonly onError?: (error: Error) => void; // error callback
165
+ private readonly showAIDebugInfo: boolean; // enables verbose debug output
166
+ private readonly indentLevel: number; // console indentation depth
167
+ private readonly retry?: GUIAgentConfig<T>['retry']; // optional retry settings
168
+
169
+ private isPaused = false; // run() waits only while this is true and resumePromise is set
170
+ private resumePromise: Promise<void> | null = null; // awaited by run() during a pause
171
+ private resolveResume: (() => void) | null = null; // NOTE(review): presumably the resolver of resumePromise; not assigned in the visible part
172
+ private isStopped = false; // when true, run() leaves its loop at the next check
173
+ private isFirstVlmCall = true; // updated through setIsFirstVlmCall()
175
+
176
/**
 * Create an agent from `config`, applying defaults for everything left out.
 *
 * Empty strings stand in for missing model settings, a random UUID for a
 * missing task id, the module logger for a missing logger, and the built-in
 * prompt for a missing system prompt.
 *
 * @param config - Agent configuration.
 */
constructor(config: GUIAgentConfig<T>) {
  const {
    operator,
    model,
    modelBaseUrl,
    modelApiKey,
    taskId,
    isFirstVlmCallRef,
    remoteVlmCaller,
    isLocalMode,
    loopIntervalInMs,
    maxLoopCount,
    logger,
    signal,
    sdkOutputAdapter,
    onData,
    onError,
    showAIDebugInfo,
    indentLevel,
    retry,
    systemPrompt,
  } = config;

  // Values taken over exactly as supplied.
  this.operator = operator;
  this.isFirstVlmCallRef = isFirstVlmCallRef;
  this.remoteVlmCaller = remoteVlmCaller;
  this.isLocalMode = isLocalMode;
  this.signal = signal;
  this.onData = onData;
  this.onError = onError;
  this.retry = retry;

  // Values where any falsy input ('' or 0 included) selects the default.
  this.model = model || '';
  this.modelBaseUrl = modelBaseUrl || '';
  this.modelApiKey = modelApiKey || '';
  this.taskId = taskId || crypto.randomUUID();
  this.loopIntervalInMs = loopIntervalInMs || 0;
  this.maxLoopCount = maxLoopCount || MAX_LOOP_COUNT;
  this.logger = logger || guiLogger;

  // Values where only null/undefined selects the default.
  this.sdkOutputAdapter = sdkOutputAdapter ?? null;
  this.showAIDebugInfo = showAIDebugInfo ?? false;
  this.indentLevel = indentLevel ?? 1;

  this.systemPrompt = systemPrompt || this.buildSystemPrompt();
}
198
+
199
/**
 * Overwrite the agent's "first VLM call" flag.
 *
 * Intended for external code that drives the remote VLM caller, e.g. to
 * clear the flag once the first call has completed.
 *
 * @param value - New value of the flag.
 */
public setIsFirstVlmCall(value: boolean): void {
  this.isFirstVlmCall = value;
}
206
+
207
/**
 * Print a one-line console summary of a conversation turn.
 *
 * Nothing is printed in SDK mode (an SDK output adapter is set); assistant
 * actions are reported by the action execution loop there.
 *
 *  - Assistant turn: `[iteration] <action type>` plus the step duration when
 *    timing is known. With debug info enabled, the action text (with the
 *    leading "Thought: ... Action:" part removed) follows on the next line,
 *    shortened to 60 characters.
 *  - Human turn with a screenshot: a small screenshot marker, only with
 *    debug info enabled.
 *
 * @param conversation - Turn to display.
 * @param iteration - Step number shown in the output.
 * @param indentLevel - Indentation depth of the printed line.
 */
private displayConversationResult(conversation: Conversation, iteration: number, indentLevel: number = 1): void {
  // Every branch below only prints to the console, which SDK mode never does.
  if (this.sdkOutputAdapter) {
    return;
  }

  const indent = ' '.repeat(indentLevel);
  const timing = conversation.timing;

  if (conversation.from === 'assistant') {
    const actionType = conversation.predictionParsed?.[0]?.action_type || 'action';
    console.log(`${indent}${colors.primaryBright(`[${iteration}]`)} ${colors.textMuted(actionType)}${timing ? colors.textDim(` (${timing.cost}ms)`) : ''}`);

    // The action text is only needed for the verbose second line.
    if (this.showAIDebugInfo) {
      const actionSummary = (conversation.value || '').replace(/Thought:[\s\S]*?Action:\s*/i, '').trim();
      if (actionSummary) {
        const innerIndent = ' '.repeat(indentLevel + 1);
        console.log(`${innerIndent}${colors.textMuted(_truncateText(actionSummary, 60))}`);
      }
    }
    return;
  }

  if (conversation.from === 'human' && conversation.screenshotBase64 && this.showAIDebugInfo) {
    console.log(`${indent}${colors.textMuted(`${icons.loading} screenshot${timing ? ` (${timing.cost}ms)` : ''}`)}`);
  }
}
247
+
248
/**
 * Report the run's current status.
 *
 * With an SDK output adapter the message goes through the adapter;
 * otherwise it is printed to the console with an icon and indentation.
 * Only RUNNING, ERROR (when an error message exists) and USER_STOPPED
 * produce output; every other status is silent.
 *
 * @param data - Run data whose `status` (and `error`) are reported.
 * @param iteration - Step number used in the RUNNING message.
 * @param indentLevel - Indentation depth for console output.
 */
private displayStatus(data: GUIAgentData, iteration: number, indentLevel: number = 1): void {
  const pad = ' '.repeat(indentLevel);
  const adapter = this.sdkOutputAdapter;
  const { status, error } = data;

  if (status === GUIAgentStatus.RUNNING) {
    if (adapter) {
      adapter.outputInfo(`Step ${iteration}: Running...`);
    } else {
      console.log(`${pad}${colors.info(`${icons.loading} Step ${iteration}: Running...`)}`);
    }
    return;
  }

  if (status === GUIAgentStatus.ERROR) {
    if (!error) {
      return;
    }
    if (adapter) {
      adapter.outputError(error);
    } else {
      console.log(`${pad}${colors.error(`${icons.cross} ${error}`)}`);
    }
    return;
  }

  if (status === GUIAgentStatus.USER_STOPPED) {
    if (adapter) {
      adapter.outputWarning('Stopped');
    } else {
      console.log(`${pad}${colors.warning(`${icons.warning} Stopped`)}`);
    }
  }
  // END is handled by the caller; all remaining statuses print nothing.
}
286
+
287
+ private buildSystemPrompt(): string { // Built-in prompt, used when no systemPrompt is configured. NOTE(review): `{language}` is returned literally by this method — confirm it is substituted elsewhere.
288
+ /* eslint-disable no-useless-escape */
289
+ return `You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
290
+
291
+ ## Output Format
292
+ \`
293
+ Thought: ...
294
+ Action: ...
295
+ \`
296
+
297
+ ## Action Space
298
+ click(point='<point>x1 y1</point>')
299
+ left_double(point='<point>x1 y1</point>')
300
+ right_single(point='<point>x1 y1</point>')
301
+ drag(start_point='<point>x1 y1</point>', end_point='<point>x2 y2</point>')
302
+ hotkey(key='ctrl c') # Split keys with a space and use lowercase. Also, do not use more than 3 keys in one hotkey action.
303
+ type(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \n at the end of content.
304
+ scroll(point='<point>x1 y1</point>', direction='down or up or right or left') # Show more information on the \`direction\` side.
305
+ open_url(url='https://xxx') # Open URL in browser
306
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
307
+ finished(content='xxx') # Use escape characters \', \", and \n in content part to ensure we can parse the content in normal python string format.
308
+
309
+
310
+
311
+
312
+ ## Note
313
+ - Use {language} in \`Thought\` part.
314
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
315
+
316
+ `;
317
+ /* eslint-enable no-useless-escape */
318
+ }
319
+
320
+
321
+
322
/**
 * Prepare the agent for use by initializing its operator.
 * Resolves once `operator.doInitialize()` has completed; its errors propagate.
 */
async initialize(): Promise<void> {
  const { operator } = this;
  await operator.doInitialize();
}
325
+
326
+ /**
327
+ * Run the GUI agent with a single instruction (UI-TARS style)
328
+ * All operations are determined by the GUI model
329
+ */
330
+ async run(instruction: string): Promise<GUIAgentData> {
331
+ const data: GUIAgentData = {
332
+ status: GUIAgentStatus.INIT,
333
+ conversations: [
334
+ {
335
+ from: 'human',
336
+ value: instruction,
337
+ timing: {
338
+ start: Date.now(),
339
+ end: Date.now(),
340
+ cost: 0,
341
+ },
342
+ },
343
+ ],
344
+ };
345
+
346
+ // Output start via SDK adapter if available
347
+ if (this.sdkOutputAdapter) {
348
+ this.sdkOutputAdapter.outputGUIAgentStart(instruction, this.isLocalMode ? 'local' : 'remote');
349
+ }
350
+
351
+ // Initialize operator for initial screenshot
352
+ try {
353
+ await this.operator.doInitialize();
354
+ } catch (initError) {
355
+ const errorMsg = initError instanceof Error ? initError.message : 'Unknown error';
356
+ this.logger.error(`[GUIAgent] Failed to initialize operator: ${errorMsg}`);
357
+
358
+ // Check if it's an RDP-related issue
359
+ if (errorMsg.includes('screen') || errorMsg.includes('capture') || errorMsg.includes('display')) {
360
+ data.status = GUIAgentStatus.ERROR;
361
+ data.error = 'Failed to initialize screen capture. This may be caused by:\n' +
362
+ ' 1. Remote Desktop session disconnected or minimized\n' +
363
+ ' 2. Display driver issues\n' +
364
+ 'Suggestion: Ensure your display is active and try again.';
365
+ } else {
366
+ data.status = GUIAgentStatus.ERROR;
367
+ data.error = `Failed to initialize operator: ${errorMsg}`;
368
+ }
369
+
370
+ // Output error via SDK adapter if available
371
+ if (this.sdkOutputAdapter) {
372
+ this.sdkOutputAdapter.outputGUIAgentError(data.error, errorMsg);
373
+ }
374
+ return data;
375
+ }
376
+
377
+ const _currentTime = Date.now();
378
+
379
+ if (this.showAIDebugInfo) {
380
+ this.logger.debug('[GUIAgent] run:', {
381
+ systemPrompt: this.systemPrompt,
382
+ model: this.model,
383
+ maxLoopCount: this.maxLoopCount,
384
+ });
385
+ }
386
+
387
+ let loopCnt = 0;
388
+ let snapshotErrCnt = 0;
389
+
390
+ // Start running agent
391
+ data.status = GUIAgentStatus.RUNNING;
392
+ data.systemPrompt = this.systemPrompt;
393
+ const indent = ' '.repeat(this.indentLevel);
394
+
395
+ // Output start via SDK adapter if available, otherwise use console
396
+ if (this.sdkOutputAdapter) {
397
+ this.sdkOutputAdapter.outputGUIAgentStart(data.conversations[0]?.value || '', this.isLocalMode ? 'local' : 'remote');
398
+ } else {
399
+ console.log(`${indent}${colors.primaryBright(`${icons.rocket} GUI Agent started`)}`);
400
+ console.log('');
401
+ }
402
+
403
+ // Output running status via SDK adapter if available
404
+ if (this.sdkOutputAdapter) {
405
+ this.sdkOutputAdapter.outputGUIAgentStatus(GUIAgentStatus.RUNNING);
406
+ }
407
+
408
+ await this.onData?.({ ...data, conversations: [] });
409
+
410
+ try {
411
+ // eslint-disable-next-line no-constant-condition
412
+ while (true) {
413
+ if (this.showAIDebugInfo) {
414
+ this.logger.debug('[GUIAgent] loopCnt:', loopCnt);
415
+ }
416
+
417
+ // Check pause status
418
+ if (this.isPaused && this.resumePromise) {
419
+ data.status = GUIAgentStatus.PAUSE;
420
+ // Output pause status via SDK adapter if available
421
+ if (this.sdkOutputAdapter) {
422
+ this.sdkOutputAdapter.outputGUIAgentStatus(GUIAgentStatus.PAUSE, loopCnt);
423
+ }
424
+ await this.onData?.({ ...data, conversations: [] });
425
+ await this.resumePromise;
426
+ data.status = GUIAgentStatus.RUNNING;
427
+ // Output running status via SDK adapter if available
428
+ if (this.sdkOutputAdapter) {
429
+ this.sdkOutputAdapter.outputGUIAgentStatus(GUIAgentStatus.RUNNING, loopCnt);
430
+ }
431
+ await this.onData?.({ ...data, conversations: [] });
432
+ }
433
+
434
+ // Check stop or aborted status
435
+ if (
436
+ this.isStopped ||
437
+ data.status !== GUIAgentStatus.RUNNING ||
438
+ this.signal?.aborted
439
+ ) {
440
+ if (this.signal?.aborted) {
441
+ data.status = GUIAgentStatus.USER_STOPPED;
442
+ }
443
+ break;
444
+ }
445
+
446
+ // Check loop limit
447
+ if (loopCnt >= this.maxLoopCount) {
448
+ data.status = GUIAgentStatus.ERROR;
449
+ data.error = `Has reached max loop count: ${loopCnt}`;
450
+ break;
451
+ }
452
+
453
+ // Check screenshot error limit
454
+ if (snapshotErrCnt >= MAX_SNAPSHOT_ERR_CNT) {
455
+ data.status = GUIAgentStatus.ERROR;
456
+ data.error = 'Screenshot failed too many times. Stopping task.';
457
+ break;
458
+ }
459
+
460
+ loopCnt += 1;
461
+ const start = Date.now();
462
+
463
+ // Take screenshot (single attempt - no retry to avoid infinite loops)
464
+ let snapshot: ScreenshotOutput;
465
+ try {
466
+ snapshot = await this.operator.doScreenshot();
467
+ } catch (screenshotError) {
468
+ const errorMsg = screenshotError instanceof Error ? screenshotError.message : 'Unknown error';
469
+ this.logger.warn(`[GUIAgent] Screenshot exception: ${errorMsg}`);
470
+ snapshotErrCnt += 1;
471
+ data.status = GUIAgentStatus.ERROR;
472
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
473
+ this.logger.error(`[GUIAgent] ${data.error}`);
474
+ await sleep(1000);
475
+ break;
476
+ }
477
+
478
+ // Check if screenshot returned failure status
479
+ if (snapshot.status === 'failed') {
480
+ const errorMsg = snapshot.errorMessage || 'Unknown error';
481
+ this.logger.warn(`[GUIAgent] Screenshot failed: ${errorMsg}`);
482
+ snapshotErrCnt += 1;
483
+ data.status = GUIAgentStatus.ERROR;
484
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
485
+ this.logger.error(`[GUIAgent] ${data.error}`);
486
+ await sleep(1000);
487
+ break;
488
+ }
489
+
490
+ // Check abort immediately after screenshot
491
+ if (this.signal?.aborted) {
492
+ data.status = GUIAgentStatus.USER_STOPPED;
493
+ break;
494
+ }
495
+
496
+ // Validate screenshot
497
+ const isValidImage = !!(snapshot?.base64);
498
+ if (!isValidImage) {
499
+ snapshotErrCnt += 1;
500
+ data.status = GUIAgentStatus.ERROR;
501
+ data.error = `Screenshot failed ${snapshotErrCnt} times. Stopping task.`;
502
+ this.logger.error(`[GUIAgent] ${data.error}`);
503
+ await sleep(1000);
504
+ break;
505
+ }
506
+
507
+ // Reset error counter on successful screenshot
508
+ snapshotErrCnt = 0;
509
+
510
+ const end = Date.now();
511
+
512
+ // Get screen context
513
+ const screenContext = await this.operator.getScreenContext();
514
+
515
+ // Add screenshot to conversation
516
+ data.conversations.push({
517
+ from: 'human',
518
+ value: IMAGE_PLACEHOLDER,
519
+ screenshotBase64: snapshot.base64,
520
+ screenshotContext: {
521
+ size: {
522
+ width: screenContext.width,
523
+ height: screenContext.height,
524
+ },
525
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
526
+ },
527
+ timing: {
528
+ start,
529
+ end,
530
+ cost: end - start,
531
+ },
532
+ });
533
+
534
+ await this.onData?.({
535
+ ...data,
536
+ conversations: data.conversations.slice(-1),
537
+ });
538
+
539
+ // Display screenshot notification
540
+ const latestScreenshot = data.conversations[data.conversations.length - 1];
541
+ if (latestScreenshot && latestScreenshot.from === 'human' && latestScreenshot.screenshotBase64) {
542
+ this.displayConversationResult(latestScreenshot, loopCnt, this.indentLevel);
543
+ }
544
+
545
+ // Build messages for model
546
+ const messages = this.buildModelMessages(data.conversations, data.systemPrompt);
547
+
548
+ // Check abort before model call
549
+ if (this.signal?.aborted) {
550
+ data.status = GUIAgentStatus.USER_STOPPED;
551
+ break;
552
+ }
553
+
554
+ // Invoke model with retry
555
+ let prediction: string;
556
+ let parsedPredictions: PredictionParsed[];
557
+ try {
558
+ const modelResult: { prediction: string; parsedPredictions: PredictionParsed[] } = await asyncRetry(
559
+ async (bail) => {
560
+ try {
561
+ const result = await this.callModelAPI(messages, screenContext, this.remoteVlmCaller!);
562
+ return result;
563
+ } catch (error: unknown) {
564
+ const errorMsg = error instanceof Error ? error.message : String(error);
565
+ // 捕获各种 abort 相关的错误
566
+ if (
567
+ error instanceof Error &&
568
+ (error.name === 'AbortError' ||
569
+ errorMsg.includes('aborted') ||
570
+ errorMsg.includes('canceled') ||
571
+ errorMsg.includes('cancelled') ||
572
+ errorMsg === 'Operation was canceled' ||
573
+ errorMsg === 'The operation was canceled' ||
574
+ errorMsg === 'This operation was aborted')
575
+ ) {
576
+ bail(error as Error);
577
+ return { prediction: '', parsedPredictions: [] };
578
+ }
579
+ throw error;
580
+ }
581
+ },
582
+ {
583
+ retries: this.retry?.model?.maxRetries ?? 0,
584
+ minTimeout: 1000 * 30,
585
+ onRetry: this.retry?.model?.onRetry,
586
+ }
587
+ );
588
+ prediction = modelResult.prediction;
589
+ parsedPredictions = modelResult.parsedPredictions;
590
+ } catch (modelError) {
591
+ // 首先检查是否是取消/abort 错误
592
+ const errorMsg = modelError instanceof Error ? modelError.message : String(modelError);
593
+ const isAbortError =
594
+ modelError instanceof Error && (
595
+ modelError.name === 'AbortError' ||
596
+ errorMsg.includes('aborted') ||
597
+ errorMsg.includes('canceled') ||
598
+ errorMsg.includes('cancelled') ||
599
+ errorMsg === 'Operation was canceled' ||
600
+ errorMsg === 'The operation was canceled' ||
601
+ errorMsg === 'This operation was aborted'
602
+ );
603
+
604
+ if (isAbortError || this.signal?.aborted) {
605
+ data.status = GUIAgentStatus.USER_STOPPED;
606
+ data.conversations = data.conversations || [];
607
+ return data;
608
+ }
609
+
610
+ // Handle multimodal model API errors with specific error messages
611
+ data.status = GUIAgentStatus.ERROR;
612
+ if (errorMsg.includes('401') || errorMsg.includes('authentication') || errorMsg.includes('API key') || errorMsg.includes('api_key') || errorMsg.includes('Unauthorized') || errorMsg.includes('invalid_api_key')) {
613
+ data.error = '[Multimodal Model Authentication Failed] The guiSubagentApiKey configuration is invalid.\n' +
614
+ 'Error details: HTTP 401 - API key is invalid or expired\n' +
615
+ 'Suggested action: Please check the guiSubagentApiKey configuration in ~/.xagent/settings.json and ensure a valid API key is set';
616
+ } else if (errorMsg.includes('429') || errorMsg.includes('rate limit') || errorMsg.includes('too many requests')) {
617
+ data.error = '[Multimodal Model Rate Limit Exceeded] API requests exceed rate limit.\n' +
618
+ 'Error details: HTTP 429 - Too Many Requests\n' +
619
+ 'Suggested action: Please retry later, or check your API account quota settings. Wait a few minutes before retrying';
620
+ } else if (errorMsg.includes('network') || errorMsg.includes('fetch') || errorMsg.includes('connection') || errorMsg.includes('ECONNREFUSED')) {
621
+ data.error = '[Multimodal Model Network Error] Cannot connect to API service.\n' +
622
+ 'Error details: Network connection failed. Possible causes:\n' +
623
+ ' 1. Network connection is lost\n' +
624
+ ' 2. The guiSubagentBaseUrl configuration is incorrect\n' +
625
+ ' 3. API service endpoint is unreachable\n' +
626
+ 'Suggested action: Please check the guiSubagentBaseUrl configuration in ~/.xagent/settings.json and ensure network connectivity';
627
+ } else if (errorMsg.includes('404') || errorMsg.includes('not found') || errorMsg.includes('model not found') || errorMsg.includes('InvalidEndpointOrModel.NotFound')) {
628
+ // Extract model name
629
+ const modelMatch = errorMsg.match(/model[:\s]+([^\s,"]+)|"model[:"]+([^",}]+)/i);
630
+ const modelName = modelMatch ? (modelMatch[1] || modelMatch[2]) : 'Unknown';
631
+ data.error = '[Multimodal Model Configuration Error] The model specified in guiSubagentModel does not exist or is not accessible.\n' +
632
+ 'Error details: HTTP 404 - Model or Endpoint not found\n' +
633
+ 'Configured model name: ' + modelName + '\n' +
634
+ 'Suggested action: Please check the guiSubagentModel configuration in ~/.xagent/settings.json, remove or replace with a valid model name';
635
+ } else {
636
+ data.error = '[Multimodal Model API Call Failed]\n' +
637
+ 'Error details: ' + errorMsg + '\n' +
638
+ 'Please check the following configuration items:\n' +
639
+ ' - guiSubagentApiKey: API key\n' +
640
+ ' - guiSubagentBaseUrl: API service URL\n' +
641
+ ' - guiSubagentModel: Model name\n' +
642
+ 'Config file location: ~/.xagent/settings.json';
643
+ }
644
+ break;
645
+ }
646
+
647
+ // Check abort immediately after model call
648
+ if (this.signal?.aborted) {
649
+ data.status = GUIAgentStatus.USER_STOPPED;
650
+ break;
651
+ }
652
+
653
+ if (!prediction) {
654
+ this.logger.warn('[GUIAgent] Warning: Empty response from model, retrying...');
655
+ continue;
656
+ }
657
+
658
+ if (this.showAIDebugInfo) {
659
+ this.logger.debug('[GUIAgent] Response:', prediction);
660
+ this.logger.debug('[GUIAgent] Parsed Predictions:', JSON.stringify(parsedPredictions));
661
+ }
662
+
663
+ const predictionSummary = this.getSummary(prediction);
664
+
665
+ data.conversations.push({
666
+ from: 'assistant',
667
+ value: predictionSummary,
668
+ timing: {
669
+ start,
670
+ end: Date.now(),
671
+ cost: Date.now() - start,
672
+ },
673
+ screenshotContext: {
674
+ size: {
675
+ width: screenContext.width,
676
+ height: screenContext.height,
677
+ },
678
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
679
+ },
680
+ predictionParsed: parsedPredictions,
681
+ });
682
+
683
+ await this.onData?.({
684
+ ...data,
685
+ conversations: data.conversations.slice(-1),
686
+ });
687
+
688
+ // Display assistant response
689
+ const latestAssistant = data.conversations[data.conversations.length - 1];
690
+ if (latestAssistant && latestAssistant.from === 'assistant') {
691
+ this.displayConversationResult(latestAssistant, loopCnt, this.indentLevel);
692
+ }
693
+
694
+ // Check if we need to switch operator based on first action
695
+ // Execute actions
696
+ for (const parsedPrediction of parsedPredictions) {
697
+ const actionType = parsedPrediction.action_type;
698
+
699
+ if (this.showAIDebugInfo) {
700
+ this.logger.debug('[GUIAgent] Action:', actionType);
701
+ }
702
+
703
+ // Handle internal action spaces
704
+ if (actionType === 'error_env') {
705
+ data.status = GUIAgentStatus.ERROR;
706
+ data.error = 'Environment error';
707
+ break;
708
+ } else if (actionType === 'max_loop') {
709
+ data.status = GUIAgentStatus.ERROR;
710
+ data.error = 'Reached max loop';
711
+ break;
712
+ }
713
+
714
+ // Execute action with retry
715
+ if (!this.signal?.aborted && !this.isStopped) {
716
+ let stepRetryCount = 0;
717
+ let stepSuccess = false;
718
+ let lastErrorMsg = '';
719
+
720
+ this.logger.debug(`[GUIAgent] Executing action: ${actionType}, loopCnt: ${loopCnt}`);
721
+
722
+ while (stepRetryCount < MAX_STEP_RETRIES && !stepSuccess) {
723
+ try {
724
+ const executeResult = await this.operator.doExecute({
725
+ prediction,
726
+ parsedPrediction,
727
+ screenWidth: screenContext.width,
728
+ screenHeight: screenContext.height,
729
+ scaleFactor: snapshot.scaleFactor ?? screenContext.scaleFactor,
730
+ factors: [1000, 1000], // Default factors
731
+ });
732
+
733
+ if (executeResult.status === 'end') {
734
+ // 'finished' action or explicit end
735
+ stepSuccess = true;
736
+ break;
737
+ } else if (executeResult.status === 'needs_input') {
738
+ // Empty action - return to main agent for re-calling LLM
739
+ this.logger.debug(`[GUIAgent] Empty action received, returning to main agent for LLM decision`);
740
+ data.status = GUIAgentStatus.CALL_LLM;
741
+ data.error = 'Empty action - main agent should re-call LLM to decide next step';
742
+ stepSuccess = true;
743
+ return data; // Return immediately with all results to main agent
744
+ }
745
+
746
+ // Any other status (success, failed, etc.) is considered success
747
+ stepSuccess = true;
748
+
749
+ // Output action via SDK adapter if available
750
+ if (this.sdkOutputAdapter && actionType) {
751
+ const timingCost = Date.now() - start;
752
+ this.sdkOutputAdapter.outputGUIAgentAction(loopCnt, actionType, timingCost);
753
+ }
754
+ break;
755
+ } catch (executeError) {
756
+ stepRetryCount++;
757
+ lastErrorMsg = executeError instanceof Error ? executeError.message : 'Unknown error';
758
+ this.logger.warn(`[GUIAgent] Action failed ${stepRetryCount}/${MAX_STEP_RETRIES}: ${lastErrorMsg}`);
759
+
760
+ if (stepRetryCount < MAX_STEP_RETRIES) {
761
+ await sleep(1000);
762
+ // Take new screenshot for retry
763
+ const retrySnapshot = await this.operator.doScreenshot();
764
+ if (retrySnapshot?.base64) {
765
+ data.conversations.push({
766
+ from: 'human',
767
+ value: IMAGE_PLACEHOLDER,
768
+ screenshotBase64: retrySnapshot.base64,
769
+ screenshotContext: {
770
+ size: {
771
+ width: screenContext.width,
772
+ height: screenContext.height,
773
+ },
774
+ scaleFactor: retrySnapshot.scaleFactor ?? screenContext.scaleFactor,
775
+ },
776
+ });
777
+ }
778
+ }
779
+ }
780
+ }
781
+
782
+ if (!stepSuccess) {
783
+ // All retries exhausted
784
+ this.logger.error(`[GUIAgent] Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`);
785
+ data.status = GUIAgentStatus.ERROR;
786
+ data.error = `Action failed after ${MAX_STEP_RETRIES} attempts: ${lastErrorMsg}`;
787
+ break;
788
+ }
789
+ }
790
+
791
+ // Check abort immediately after action execution
792
+ if (this.signal?.aborted) {
793
+ data.status = GUIAgentStatus.USER_STOPPED;
794
+ break;
795
+ }
796
+
797
+ // Handle special action types
798
+ if (actionType === 'finished') {
799
+ data.status = GUIAgentStatus.END;
800
+ break;
801
+ }
802
+ }
803
+
804
+ // Check abort after action loop
805
+ if (this.signal?.aborted) {
806
+ data.status = GUIAgentStatus.USER_STOPPED;
807
+ break;
808
+ }
809
+
810
+ // Wait between iterations
811
+ if (this.loopIntervalInMs > 0) {
812
+ await sleep(this.loopIntervalInMs);
813
+ }
814
+ }
815
+ } catch (error) {
816
+ this.logger.error('[GUIAgent] Catch error', error);
817
+ if (
818
+ error instanceof Error &&
819
+ (error.name === 'AbortError' || error.message?.includes('aborted'))
820
+ ) {
821
+ data.status = GUIAgentStatus.USER_STOPPED;
822
+ } else {
823
+ data.status = GUIAgentStatus.ERROR;
824
+ data.error = error instanceof Error ? error.message : 'Unknown error';
825
+ }
826
+ } finally {
827
+ // Save final status
828
+ const finalStatus = data.status;
829
+ const finalError = data.error;
830
+ const indent = ' '.repeat(this.indentLevel);
831
+
832
+ // Output error immediately if task failed
833
+ if (finalStatus === GUIAgentStatus.ERROR && finalError) {
834
+ if (!this.sdkOutputAdapter) {
835
+ console.log(`\n${indent}${colors.error('✖')} ${finalError}\n`);
836
+ } else {
837
+ this.sdkOutputAdapter.outputError(finalError);
838
+ }
839
+ }
840
+
841
+ // Call onData callback if set
842
+ // Note: Use Promise.resolve().then() to avoid modifying data in callback
843
+ const onDataCallback = this.onData;
844
+ if (onDataCallback) {
845
+ Promise.resolve().then(() => onDataCallback({ ...data, conversations: [] }));
846
+ }
847
+
848
+ // Call onError callback if status is error
849
+ if (finalStatus === GUIAgentStatus.ERROR && this.onError) {
850
+ this.onError(new Error(finalError || 'Unknown error occurred'));
851
+ }
852
+
853
+ if (this.showAIDebugInfo) {
854
+ this.logger.debug('[GUIAgent] Final status:', {
855
+ status: finalStatus,
856
+ loopCnt,
857
+ totalConversations: data.conversations.length,
858
+ });
859
+ }
860
+
861
+ // Ensure the returned status is correct (reassign)
862
+ this.logger.debug(`[GUIAgent] Finally: finalStatus=${finalStatus}, finalError=${finalError}, data.status=${data.status}, data.error=${data.error}`);
863
+
864
+ // Log final status (only visible when showAIDebugInfo is enabled)
865
+ this.logger.debug(`[GUIAgent] Final status: ${finalStatus}${finalError ? `, Error: ${finalError}` : ''}, Steps: ${loopCnt}`);
866
+
867
+ // Output final status via SDK adapter if available
868
+ if (this.sdkOutputAdapter) {
869
+ switch (finalStatus) {
870
+ case GUIAgentStatus.END:
871
+ this.sdkOutputAdapter.outputGUIAgentComplete(data.conversations[0]?.value || '', loopCnt);
872
+ break;
873
+ case GUIAgentStatus.USER_STOPPED:
874
+ this.sdkOutputAdapter.outputGUIAgentCancelled(data.conversations[0]?.value || '');
875
+ break;
876
+ case GUIAgentStatus.ERROR:
877
+ this.sdkOutputAdapter.outputGUIAgentError(
878
+ data.conversations[0]?.value || 'GUI Agent error',
879
+ finalError || 'Unknown error'
880
+ );
881
+ break;
882
+ default:
883
+ this.sdkOutputAdapter.outputGUIAgentStatus(finalStatus, loopCnt, finalError);
884
+ }
885
+ }
886
+
887
+ data.status = finalStatus;
888
+ data.error = finalError;
889
+ }
890
+
891
+ return data;
892
+ }
893
+
894
/**
 * Convert the recorded conversation into the chat-message array sent to the model.
 *
 * The result always starts with a `system` message carrying `systemPrompt`,
 * followed by one message per conversation entry, in order:
 *  - a `human` entry that carries a screenshot becomes a multi-part `user`
 *    message (text part + PNG data-URL image part, `detail: 'high'`);
 *  - an `assistant` entry becomes a plain `assistant` message;
 *  - anything else becomes a plain-text `user` message.
 *
 * @param conversations Conversation history to serialise.
 * @param systemPrompt  Text placed in the leading `system` message.
 * @returns The message array (system message first).
 */
private buildModelMessages(conversations: Conversation[], systemPrompt: string): any[] {
  // Maps a single conversation entry to its chat-message representation.
  const toChatMessage = (entry: Conversation): any => {
    if (entry.from === 'assistant') {
      return {
        role: 'assistant',
        content: entry.value,
      };
    }

    if (entry.from === 'human' && entry.screenshotBase64) {
      const textPart = { type: 'text', text: entry.value };
      const imagePart = {
        type: 'image_url',
        image_url: {
          url: `data:image/png;base64,${entry.screenshotBase64}`,
          detail: 'high',
        },
      };
      return {
        role: 'user',
        content: [textPart, imagePart],
      };
    }

    // Human entries without a screenshot (and any other sender) are sent as plain user text.
    return {
      role: 'user',
      content: entry.value,
    };
  };

  const systemMessage = {
    role: 'system',
    content: systemPrompt,
  };

  return [systemMessage, ...Array.from(conversations, (entry) => toChatMessage(entry))];
}
937
+
938
/**
 * Pull the image payload and the text prompt out of the LAST message only.
 *
 * Only a last message whose `content` is an array of parts is inspected;
 * otherwise both results are empty strings. For a `data:image…` URL the part
 * after the first comma (the base64 payload) is returned; any other URL is
 * returned unchanged.
 *
 * @param messages Chat messages; only the final element is read.
 * @returns `{ image, prompt }`, each `''` when the corresponding part is absent.
 */
private extractImageAndPrompt(messages: any[]): { image: string; prompt: string } {
  const tail = messages[messages.length - 1];

  // Nothing to extract unless the final message is a multi-part message.
  if (!tail || !Array.isArray(tail.content)) {
    return { image: '', prompt: '' };
  }

  const parts: any[] = tail.content;
  const imagePart = parts.find((part: any) => part.type === 'image_url');
  const textPart = parts.find((part: any) => part.type === 'text');

  let image = '';
  if (imagePart) {
    const url = imagePart.image_url?.url || '';
    // Data URLs look like "data:image/png;base64,<payload>"; keep only the payload.
    image = url.startsWith('data:image') ? url.split(',')[1] || '' : url;
  }

  return { image, prompt: textPart?.text || '' };
}
963
+
964
/**
 * Print a human-readable dump of an outgoing model request to stdout.
 *
 * Shows the model name and base URL (taken from `remoteVlmCaller.info` when a
 * remote caller is supplied, otherwise from this instance / `MODEL_BASE_URL`),
 * the message count, the system prompt (first 15 lines) and every non-system
 * message. Each printed line is cut to 62 characters; image parts are reported
 * by the length of their URL rather than printed.
 *
 * @param messages        Messages about to be sent.
 * @param remoteVlmCaller Optional remote caller; when present its `info` is used for the header.
 */
private debugRequest(messages: any[], remoteVlmCaller?: RemoteVlmCaller): void {
  const boxTop = '\n┌─────────────────────────────────────────────────────────────┐';
  const boxDivider = '├─────────────────────────────────────────────────────────────┤';
  const boxBottom = '└─────────────────────────────────────────────────────────────┘';
  const truncatedMarker = '│ ... (truncated)';

  // Prints at most `limit` lines of `text`, each clipped to 62 characters.
  const printClipped = (allLines: string[], limit: number): void => {
    for (const line of allLines.slice(0, limit)) {
      console.log('│ ' + line.slice(0, 62));
    }
  };

  console.log('\n╔══════════════════════════════════════════════════════════╗');
  console.log('║ GUI MODEL REQUEST DEBUG ║');
  console.log('╚══════════════════════════════════════════════════════════╝');

  const modelLabel = remoteVlmCaller
    ? ((remoteVlmCaller as any).info?.model || 'remote')
    : this.model;
  console.log(`📦 Model: ${modelLabel}`);

  const baseUrlLabel = remoteVlmCaller
    ? ((remoteVlmCaller as any).info?.baseUrl || 'remote')
    : (this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1');
  console.log(`🌐 Base URL: ${baseUrlLabel}`);

  console.log(`💬 Messages: ${messages.length}`);

  // System prompt section (only the first system message is shown).
  const systemMsg = messages.find((m: any) => m.role === 'system');
  if (systemMsg) {
    console.log(boxTop);
    console.log('│ 🟫 SYSTEM │');
    console.log(boxDivider);
    const systemContent = typeof systemMsg.content === 'string'
      ? systemMsg.content
      : JSON.stringify(systemMsg.content);
    const systemLines = systemContent.split('\n');
    printClipped(systemLines, 15);
    if (systemLines.length > 15) {
      console.log(truncatedMarker);
    }
    console.log(boxBottom);
  }

  // Header labels for the known roles; other roles fall back to "● ROLE".
  const roleLabels: Record<string, string> = {
    user: '👤 USER',
    assistant: '🤖 ASSISTANT',
  };

  messages.forEach((msg: any, index: number) => {
    if (msg.role === 'system') {
      return;
    }

    const header = roleLabels[msg.role] || `● ${msg.role.toUpperCase()}`;
    console.log(boxTop);
    console.log(`│ ${header} (${index + 1}) │`);
    console.log(boxDivider);

    if (typeof msg.content === 'string') {
      const contentLines = msg.content.split('\n');
      printClipped(contentLines, 20);
      if (contentLines.length > 20) {
        console.log(truncatedMarker);
      }
    } else if (Array.isArray(msg.content)) {
      const imageBlock = msg.content.find((c: any) => c.type === 'image_url');
      console.log('│ 📎 Content blocks: ' + msg.content.length);
      if (imageBlock !== undefined) {
        // Size is the character length of the URL string, reported in units of 1024.
        const imageSize = imageBlock?.image_url?.url?.length || 0;
        console.log('│ 🖼️ Image size: ' + (imageSize / 1024).toFixed(2) + ' KB');
      }
      const textBlock = msg.content.find((c: any) => c.type === 'text');
      if (textBlock?.text) {
        printClipped(textBlock.text.split('\n'), 10);
      }
    }

    console.log(boxBottom);
  });

  console.log('\n📤 Sending request to model API...\n');
}
1038
+
1039
/**
 * Print a human-readable dump of a model response to stdout.
 *
 * Shows token usage when `usage` is provided, then up to the first 30 lines of
 * `content` (each clipped to 62 characters) followed by a count of any
 * remaining lines.
 *
 * @param content Raw text returned by the model.
 * @param usage   Optional usage object; `prompt_tokens`, `completion_tokens`
 *                and `total_tokens` are read from it.
 */
private debugResponse(content: string, usage?: any): void {
  const bannerTop = '\n╔══════════════════════════════════════════════════════════╗';
  const bannerBottom = '╚══════════════════════════════════════════════════════════╝';
  const contentRule = '│ ───────────────────────────────────────────────────────────';
  const maxShownLines = 30;

  console.log(bannerTop);
  console.log('║ GUI MODEL RESPONSE DEBUG ║');
  console.log(bannerBottom);

  if (usage) {
    console.log(`📊 Tokens: ${usage.prompt_tokens} (prompt) + ${usage.completion_tokens} (completion) = ${usage.total_tokens} (total)`);
  }

  console.log('\n┌─────────────────────────────────────────────────────────────┐');
  console.log('│ 🤖 ASSISTANT │');
  console.log('├─────────────────────────────────────────────────────────────┤');
  console.log('│ 💬 CONTENT:');
  console.log(contentRule);

  const allLines = content.split('\n');
  allLines.slice(0, maxShownLines).forEach((line) => {
    console.log('│ ' + line.slice(0, 62));
  });
  const hiddenCount = allLines.length - maxShownLines;
  if (hiddenCount > 0) {
    console.log(`│ ... (${hiddenCount} more lines)`);
  }

  console.log(contentRule);
  console.log('└─────────────────────────────────────────────────────────────┘');

  console.log(bannerTop);
  console.log('║ RESPONSE ENDED ║');
  console.log(bannerBottom + '\n');
}
1071
+
1072
/**
 * Call the locally configured VLM through an OpenAI-style `/chat/completions` endpoint.
 *
 * The base URL comes from `this.modelBaseUrl`, then `MODEL_BASE_URL`, then
 * `https://api.openai.com/v1`; the API key from `this.modelApiKey`, then
 * `MODEL_API_KEY`. The request is sent with `max_tokens: 1024`,
 * `temperature: 0.1` and `this.signal`, so aborting the signal cancels the
 * in-flight request.
 *
 * @param messages      Chat messages to send as the request `messages`.
 * @param screenContext Screen size handed to `actionParser` together with the `[1000, 1000]` factor.
 * @returns The raw model text (`prediction`, `''` when the response has no content)
 *          and the actions parsed from it.
 * @throws Error `Model API error: HTTP <status> <statusText> - <body>` when the
 *         response is not OK; fetch/JSON errors propagate unchanged.
 */
private async callLocalVLM(
  messages: any[],
  screenContext: ScreenContext
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  const configuredBaseUrl = this.modelBaseUrl || process.env.MODEL_BASE_URL || 'https://api.openai.com/v1';
  // Drop trailing slashes so a base URL such as ".../v1/" does not produce ".../v1//chat/completions".
  const baseUrl = configuredBaseUrl.replace(/\/+$/, '');
  const apiKey = this.modelApiKey || process.env.MODEL_API_KEY || '';

  const requestBody = {
    model: this.model,
    messages,
    max_tokens: 1024,
    temperature: 0.1,
  };

  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages);
  }

  const response = await fetch(`${baseUrl}/chat/completions`, {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      'Authorization': `Bearer ${apiKey}`,
    },
    body: JSON.stringify(requestBody),
    signal: this.signal,
  });

  // Handle non-2xx responses. The status code is part of the message so that
  // error handling which looks for "401" / "429" / "404" in the message can
  // classify the failure even when the response body does not repeat the code.
  if (!response.ok) {
    const errorText = await response.text();
    throw new Error(`Model API error: HTTP ${response.status} ${response.statusText} - ${errorText}`);
  }

  const result = await response.json() as { choices?: Array<{ message?: { content?: string } }>; usage?: any };
  const content = result.choices?.[0]?.message?.content || '';

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(content, result.usage);
  }

  const { parsed: parsedPredictions } = actionParser({
    prediction: content,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });

  return {
    prediction: content,
    parsedPredictions,
  };
}
1132
+
1133
/**
 * Obtain a prediction from the model and parse it into actions.
 *
 * Local mode (`this.isLocalMode`): delegates to `callLocalVLM`, which uses
 * model / modelBaseUrl / modelApiKey directly.
 * Remote mode: sends the full message list through `remoteVlmCaller`, then
 * parses the returned text with `actionParser` (factor `[1000, 1000]`).
 *
 * @param messages        Chat messages to send.
 * @param screenContext   Screen size used when parsing the prediction.
 * @param remoteVlmCaller Caller used in remote mode; not used in local mode.
 * @returns The raw prediction text and the parsed predictions.
 */
private async callModelAPI(
  messages: any[],
  screenContext: ScreenContext,
  remoteVlmCaller: RemoteVlmCaller
): Promise<{ prediction: string; parsedPredictions: PredictionParsed[] }> {
  // === LOCAL mode ===
  if (this.isLocalMode) {
    return this.callLocalVLM(messages, screenContext);
  }

  // === REMOTE mode ===
  // Debug output for model input
  if (this.showAIDebugInfo) {
    this.debugRequest(messages, remoteVlmCaller);
  }

  // Prefer the shared ref (per the original note, it tracks the first VLM call
  // across createGUISubAgent calls); without one, fall back to a ref built from
  // this instance's own flag.
  const firstCallRef = this.isFirstVlmCallRef || { current: this.isFirstVlmCall };

  // taskId and the first-call ref are passed along for status tracking.
  const prediction = await remoteVlmCaller(messages, this.systemPrompt, this.taskId, firstCallRef);

  // Every later call counts as a continuation: update both the local flag and the ref.
  this.isFirstVlmCall = false;
  firstCallRef.current = false;

  // Debug output for model response
  if (this.showAIDebugInfo) {
    this.debugResponse(prediction);
  }

  const parseResult = actionParser({
    prediction,
    factor: [1000, 1000],
    screenContext: {
      width: screenContext.width,
      height: screenContext.height,
    },
  });
  const parsedPredictions = parseResult.parsed;

  return { prediction, parsedPredictions };
}
1185
+
1186
/**
 * Derive a short summary from the model's prediction text.
 *
 * When the text contains an "Action:" marker (case-insensitive), everything
 * after the first such marker up to the end of the text is returned, trimmed.
 * Otherwise the first 200 characters of the prediction are returned.
 *
 * @param prediction Raw prediction text from the model.
 * @returns The action portion, or a 200-character prefix as a fallback.
 */
private getSummary(prediction: string): string {
  // NOTE(review): the character class lists ':' twice as it appears here;
  // presumably one of them was meant to be a full-width colon — confirm upstream.
  const actionPart = /Action[::]\s*([\s\S]+)$/i.exec(prediction);
  return actionPart ? actionPart[1].trim() : prediction.slice(0, 200);
}
1197
+
1198
/**
 * Mark the agent as paused and create the promise that `resume()` resolves.
 *
 * Calling `pause()` again while a pause is already pending is a no-op:
 * replacing the pending promise would discard the only resolver for it, so
 * anything already waiting on the earlier promise could never be released.
 */
pause(): void {
  // Keep the existing gate if one is already pending.
  if (this.isPaused && this.resumePromise) {
    return;
  }

  this.isPaused = true;
  this.resumePromise = new Promise((resolve) => {
    this.resolveResume = resolve;
  });
}
1204
+
1205
/**
 * Leave the paused state.
 *
 * If a resume resolver is stored, it is invoked (settling the promise created
 * by `pause()`), and both the promise and the resolver are cleared. The paused
 * flag is reset in every case.
 */
resume(): void {
  const release = this.resolveResume;
  if (release) {
    release();
    this.resumePromise = null;
    this.resolveResume = null;
  }
  this.isPaused = false;
}
1213
+
1214
/**
 * Request that the agent stop by raising the `isStopped` flag.
 * This only sets the flag; code that runs actions checks it before executing.
 */
stop(): void {
  this.isStopped = true;
}
1217
+
1218
/**
 * Release resources held by the agent.
 *
 * Awaits `operator.cleanup()` and then detaches the `'cancelled'` listener
 * stored on `_cancelHandler` / `_cancellationManager`, if both are set.
 * The listener is detached in a `finally` block so that it is removed even
 * when the operator's cleanup rejects; the rejection still propagates to the
 * caller.
 */
async cleanup(): Promise<void> {
  this.logger.debug('Cleaning up GUI Agent...');

  try {
    await this.operator.cleanup();
  } finally {
    // These fields are not declared on the class, hence the `any` access.
    const self = this as any;
    const cancelHandler = self._cancelHandler;
    const cancellationManager = self._cancellationManager;

    // Cleanup cancellation listener if attached
    if (cancelHandler && cancellationManager) {
      cancellationManager.off('cancelled', cancelHandler);
      self._cancelHandler = undefined;
      self._cancellationManager = undefined;
    }
  }
}
1231
+ }
1232
+
1233
+ export { GUIAgentStatus as StatusEnum };
1234
+