@donggui/core 1.5.4-donggui.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (269) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +9 -0
  3. package/dist/es/agent/agent.mjs +709 -0
  4. package/dist/es/agent/agent.mjs.map +1 -0
  5. package/dist/es/agent/common.mjs +0 -0
  6. package/dist/es/agent/execution-session.mjs +41 -0
  7. package/dist/es/agent/execution-session.mjs.map +1 -0
  8. package/dist/es/agent/index.mjs +6 -0
  9. package/dist/es/agent/task-builder.mjs +330 -0
  10. package/dist/es/agent/task-builder.mjs.map +1 -0
  11. package/dist/es/agent/task-cache.mjs +186 -0
  12. package/dist/es/agent/task-cache.mjs.map +1 -0
  13. package/dist/es/agent/tasks.mjs +422 -0
  14. package/dist/es/agent/tasks.mjs.map +1 -0
  15. package/dist/es/agent/ui-utils.mjs +91 -0
  16. package/dist/es/agent/ui-utils.mjs.map +1 -0
  17. package/dist/es/agent/utils.mjs +198 -0
  18. package/dist/es/agent/utils.mjs.map +1 -0
  19. package/dist/es/ai-model/auto-glm/actions.mjs +224 -0
  20. package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -0
  21. package/dist/es/ai-model/auto-glm/index.mjs +6 -0
  22. package/dist/es/ai-model/auto-glm/parser.mjs +239 -0
  23. package/dist/es/ai-model/auto-glm/parser.mjs.map +1 -0
  24. package/dist/es/ai-model/auto-glm/planning.mjs +71 -0
  25. package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -0
  26. package/dist/es/ai-model/auto-glm/prompt.mjs +222 -0
  27. package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -0
  28. package/dist/es/ai-model/auto-glm/util.mjs +9 -0
  29. package/dist/es/ai-model/auto-glm/util.mjs.map +1 -0
  30. package/dist/es/ai-model/conversation-history.mjs +195 -0
  31. package/dist/es/ai-model/conversation-history.mjs.map +1 -0
  32. package/dist/es/ai-model/index.mjs +11 -0
  33. package/dist/es/ai-model/inspect.mjs +386 -0
  34. package/dist/es/ai-model/inspect.mjs.map +1 -0
  35. package/dist/es/ai-model/llm-planning.mjs +233 -0
  36. package/dist/es/ai-model/llm-planning.mjs.map +1 -0
  37. package/dist/es/ai-model/prompt/common.mjs +7 -0
  38. package/dist/es/ai-model/prompt/common.mjs.map +1 -0
  39. package/dist/es/ai-model/prompt/describe.mjs +66 -0
  40. package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
  41. package/dist/es/ai-model/prompt/extraction.mjs +129 -0
  42. package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
  43. package/dist/es/ai-model/prompt/llm-locator.mjs +51 -0
  44. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
  45. package/dist/es/ai-model/prompt/llm-planning.mjs +364 -0
  46. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
  47. package/dist/es/ai-model/prompt/llm-section-locator.mjs +44 -0
  48. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
  49. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
  50. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
  51. package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
  52. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
  53. package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
  54. package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
  55. package/dist/es/ai-model/prompt/util.mjs +59 -0
  56. package/dist/es/ai-model/prompt/util.mjs.map +1 -0
  57. package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
  58. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
  59. package/dist/es/ai-model/service-caller/index.mjs +466 -0
  60. package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
  61. package/dist/es/ai-model/ui-tars-planning.mjs +249 -0
  62. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
  63. package/dist/es/common.mjs +371 -0
  64. package/dist/es/common.mjs.map +1 -0
  65. package/dist/es/device/device-options.mjs +0 -0
  66. package/dist/es/device/index.mjs +300 -0
  67. package/dist/es/device/index.mjs.map +1 -0
  68. package/dist/es/dump/html-utils.mjs +211 -0
  69. package/dist/es/dump/html-utils.mjs.map +1 -0
  70. package/dist/es/dump/image-restoration.mjs +43 -0
  71. package/dist/es/dump/image-restoration.mjs.map +1 -0
  72. package/dist/es/dump/index.mjs +3 -0
  73. package/dist/es/index.mjs +15 -0
  74. package/dist/es/index.mjs.map +1 -0
  75. package/dist/es/report-generator.mjs +134 -0
  76. package/dist/es/report-generator.mjs.map +1 -0
  77. package/dist/es/report.mjs +111 -0
  78. package/dist/es/report.mjs.map +1 -0
  79. package/dist/es/screenshot-item.mjs +105 -0
  80. package/dist/es/screenshot-item.mjs.map +1 -0
  81. package/dist/es/service/index.mjs +256 -0
  82. package/dist/es/service/index.mjs.map +1 -0
  83. package/dist/es/service/utils.mjs +15 -0
  84. package/dist/es/service/utils.mjs.map +1 -0
  85. package/dist/es/skill/index.mjs +38 -0
  86. package/dist/es/skill/index.mjs.map +1 -0
  87. package/dist/es/task-runner.mjs +258 -0
  88. package/dist/es/task-runner.mjs.map +1 -0
  89. package/dist/es/task-timing.mjs +12 -0
  90. package/dist/es/task-timing.mjs.map +1 -0
  91. package/dist/es/tree.mjs +13 -0
  92. package/dist/es/tree.mjs.map +1 -0
  93. package/dist/es/types.mjs +196 -0
  94. package/dist/es/types.mjs.map +1 -0
  95. package/dist/es/utils.mjs +218 -0
  96. package/dist/es/utils.mjs.map +1 -0
  97. package/dist/es/yaml/builder.mjs +13 -0
  98. package/dist/es/yaml/builder.mjs.map +1 -0
  99. package/dist/es/yaml/index.mjs +4 -0
  100. package/dist/es/yaml/player.mjs +418 -0
  101. package/dist/es/yaml/player.mjs.map +1 -0
  102. package/dist/es/yaml/utils.mjs +73 -0
  103. package/dist/es/yaml/utils.mjs.map +1 -0
  104. package/dist/es/yaml.mjs +0 -0
  105. package/dist/lib/agent/agent.js +757 -0
  106. package/dist/lib/agent/agent.js.map +1 -0
  107. package/dist/lib/agent/common.js +5 -0
  108. package/dist/lib/agent/execution-session.js +75 -0
  109. package/dist/lib/agent/execution-session.js.map +1 -0
  110. package/dist/lib/agent/index.js +81 -0
  111. package/dist/lib/agent/index.js.map +1 -0
  112. package/dist/lib/agent/task-builder.js +367 -0
  113. package/dist/lib/agent/task-builder.js.map +1 -0
  114. package/dist/lib/agent/task-cache.js +238 -0
  115. package/dist/lib/agent/task-cache.js.map +1 -0
  116. package/dist/lib/agent/tasks.js +465 -0
  117. package/dist/lib/agent/tasks.js.map +1 -0
  118. package/dist/lib/agent/ui-utils.js +143 -0
  119. package/dist/lib/agent/ui-utils.js.map +1 -0
  120. package/dist/lib/agent/utils.js +275 -0
  121. package/dist/lib/agent/utils.js.map +1 -0
  122. package/dist/lib/ai-model/auto-glm/actions.js +258 -0
  123. package/dist/lib/ai-model/auto-glm/actions.js.map +1 -0
  124. package/dist/lib/ai-model/auto-glm/index.js +66 -0
  125. package/dist/lib/ai-model/auto-glm/index.js.map +1 -0
  126. package/dist/lib/ai-model/auto-glm/parser.js +282 -0
  127. package/dist/lib/ai-model/auto-glm/parser.js.map +1 -0
  128. package/dist/lib/ai-model/auto-glm/planning.js +105 -0
  129. package/dist/lib/ai-model/auto-glm/planning.js.map +1 -0
  130. package/dist/lib/ai-model/auto-glm/prompt.js +259 -0
  131. package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -0
  132. package/dist/lib/ai-model/auto-glm/util.js +46 -0
  133. package/dist/lib/ai-model/auto-glm/util.js.map +1 -0
  134. package/dist/lib/ai-model/conversation-history.js +229 -0
  135. package/dist/lib/ai-model/conversation-history.js.map +1 -0
  136. package/dist/lib/ai-model/index.js +125 -0
  137. package/dist/lib/ai-model/index.js.map +1 -0
  138. package/dist/lib/ai-model/inspect.js +429 -0
  139. package/dist/lib/ai-model/inspect.js.map +1 -0
  140. package/dist/lib/ai-model/llm-planning.js +270 -0
  141. package/dist/lib/ai-model/llm-planning.js.map +1 -0
  142. package/dist/lib/ai-model/prompt/common.js +41 -0
  143. package/dist/lib/ai-model/prompt/common.js.map +1 -0
  144. package/dist/lib/ai-model/prompt/describe.js +100 -0
  145. package/dist/lib/ai-model/prompt/describe.js.map +1 -0
  146. package/dist/lib/ai-model/prompt/extraction.js +169 -0
  147. package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
  148. package/dist/lib/ai-model/prompt/llm-locator.js +88 -0
  149. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
  150. package/dist/lib/ai-model/prompt/llm-planning.js +401 -0
  151. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
  152. package/dist/lib/ai-model/prompt/llm-section-locator.js +81 -0
  153. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
  154. package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
  155. package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
  156. package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
  157. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
  158. package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
  159. package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
  160. package/dist/lib/ai-model/prompt/util.js +105 -0
  161. package/dist/lib/ai-model/prompt/util.js.map +1 -0
  162. package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
  163. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
  164. package/dist/lib/ai-model/service-caller/index.js +531 -0
  165. package/dist/lib/ai-model/service-caller/index.js.map +1 -0
  166. package/dist/lib/ai-model/ui-tars-planning.js +283 -0
  167. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
  168. package/dist/lib/common.js +480 -0
  169. package/dist/lib/common.js.map +1 -0
  170. package/dist/lib/device/device-options.js +20 -0
  171. package/dist/lib/device/device-options.js.map +1 -0
  172. package/dist/lib/device/index.js +418 -0
  173. package/dist/lib/device/index.js.map +1 -0
  174. package/dist/lib/dump/html-utils.js +281 -0
  175. package/dist/lib/dump/html-utils.js.map +1 -0
  176. package/dist/lib/dump/image-restoration.js +77 -0
  177. package/dist/lib/dump/image-restoration.js.map +1 -0
  178. package/dist/lib/dump/index.js +60 -0
  179. package/dist/lib/dump/index.js.map +1 -0
  180. package/dist/lib/index.js +146 -0
  181. package/dist/lib/index.js.map +1 -0
  182. package/dist/lib/report-generator.js +172 -0
  183. package/dist/lib/report-generator.js.map +1 -0
  184. package/dist/lib/report.js +145 -0
  185. package/dist/lib/report.js.map +1 -0
  186. package/dist/lib/screenshot-item.js +139 -0
  187. package/dist/lib/screenshot-item.js.map +1 -0
  188. package/dist/lib/service/index.js +290 -0
  189. package/dist/lib/service/index.js.map +1 -0
  190. package/dist/lib/service/utils.js +49 -0
  191. package/dist/lib/service/utils.js.map +1 -0
  192. package/dist/lib/skill/index.js +72 -0
  193. package/dist/lib/skill/index.js.map +1 -0
  194. package/dist/lib/task-runner.js +295 -0
  195. package/dist/lib/task-runner.js.map +1 -0
  196. package/dist/lib/task-timing.js +46 -0
  197. package/dist/lib/task-timing.js.map +1 -0
  198. package/dist/lib/tree.js +53 -0
  199. package/dist/lib/tree.js.map +1 -0
  200. package/dist/lib/types.js +285 -0
  201. package/dist/lib/types.js.map +1 -0
  202. package/dist/lib/utils.js +297 -0
  203. package/dist/lib/utils.js.map +1 -0
  204. package/dist/lib/yaml/builder.js +57 -0
  205. package/dist/lib/yaml/builder.js.map +1 -0
  206. package/dist/lib/yaml/index.js +81 -0
  207. package/dist/lib/yaml/index.js.map +1 -0
  208. package/dist/lib/yaml/player.js +452 -0
  209. package/dist/lib/yaml/player.js.map +1 -0
  210. package/dist/lib/yaml/utils.js +126 -0
  211. package/dist/lib/yaml/utils.js.map +1 -0
  212. package/dist/lib/yaml.js +20 -0
  213. package/dist/lib/yaml.js.map +1 -0
  214. package/dist/types/agent/agent.d.ts +190 -0
  215. package/dist/types/agent/common.d.ts +0 -0
  216. package/dist/types/agent/execution-session.d.ts +36 -0
  217. package/dist/types/agent/index.d.ts +10 -0
  218. package/dist/types/agent/task-builder.d.ts +34 -0
  219. package/dist/types/agent/task-cache.d.ts +48 -0
  220. package/dist/types/agent/tasks.d.ts +70 -0
  221. package/dist/types/agent/ui-utils.d.ts +14 -0
  222. package/dist/types/agent/utils.d.ts +29 -0
  223. package/dist/types/ai-model/auto-glm/actions.d.ts +77 -0
  224. package/dist/types/ai-model/auto-glm/index.d.ts +6 -0
  225. package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
  226. package/dist/types/ai-model/auto-glm/planning.d.ts +10 -0
  227. package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
  228. package/dist/types/ai-model/auto-glm/util.d.ts +13 -0
  229. package/dist/types/ai-model/conversation-history.d.ts +105 -0
  230. package/dist/types/ai-model/index.d.ts +14 -0
  231. package/dist/types/ai-model/inspect.d.ts +58 -0
  232. package/dist/types/ai-model/llm-planning.d.ts +19 -0
  233. package/dist/types/ai-model/prompt/common.d.ts +2 -0
  234. package/dist/types/ai-model/prompt/describe.d.ts +1 -0
  235. package/dist/types/ai-model/prompt/extraction.d.ts +7 -0
  236. package/dist/types/ai-model/prompt/llm-locator.d.ts +3 -0
  237. package/dist/types/ai-model/prompt/llm-planning.d.ts +10 -0
  238. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +3 -0
  239. package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
  240. package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
  241. package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
  242. package/dist/types/ai-model/prompt/util.d.ts +33 -0
  243. package/dist/types/ai-model/prompt/yaml-generator.d.ts +100 -0
  244. package/dist/types/ai-model/service-caller/index.d.ts +49 -0
  245. package/dist/types/ai-model/ui-tars-planning.d.ts +72 -0
  246. package/dist/types/common.d.ts +288 -0
  247. package/dist/types/device/device-options.d.ts +142 -0
  248. package/dist/types/device/index.d.ts +2315 -0
  249. package/dist/types/dump/html-utils.d.ts +52 -0
  250. package/dist/types/dump/image-restoration.d.ts +6 -0
  251. package/dist/types/dump/index.d.ts +5 -0
  252. package/dist/types/index.d.ts +17 -0
  253. package/dist/types/report-generator.d.ts +48 -0
  254. package/dist/types/report.d.ts +15 -0
  255. package/dist/types/screenshot-item.d.ts +66 -0
  256. package/dist/types/service/index.d.ts +23 -0
  257. package/dist/types/service/utils.d.ts +2 -0
  258. package/dist/types/skill/index.d.ts +25 -0
  259. package/dist/types/task-runner.d.ts +48 -0
  260. package/dist/types/task-timing.d.ts +8 -0
  261. package/dist/types/tree.d.ts +4 -0
  262. package/dist/types/types.d.ts +645 -0
  263. package/dist/types/utils.d.ts +40 -0
  264. package/dist/types/yaml/builder.d.ts +2 -0
  265. package/dist/types/yaml/index.d.ts +4 -0
  266. package/dist/types/yaml/player.d.ts +34 -0
  267. package/dist/types/yaml/utils.d.ts +9 -0
  268. package/dist/types/yaml.d.ts +203 -0
  269. package/package.json +111 -0
@@ -0,0 +1,386 @@
1
+ import { generateElementByPoint, generateElementByRect } from "@midscene/shared/extractor/dom-util";
2
+ import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl, scaleImage } from "@midscene/shared/img";
3
+ import { getDebug } from "@midscene/shared/logger";
4
+ import { assert } from "@midscene/shared/utils";
5
+ import { adaptBboxToRect, expandSearchArea, mergeRects } from "../common.mjs";
6
+ import { parseAutoGLMLocateResponse } from "./auto-glm/parser.mjs";
7
+ import { getAutoGLMLocatePrompt } from "./auto-glm/prompt.mjs";
8
+ import { isAutoGLM } from "./auto-glm/util.mjs";
9
+ import { extractDataQueryPrompt, parseXMLExtractionResponse, systemPromptToExtract } from "./prompt/extraction.mjs";
10
+ import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
11
+ import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
12
+ import { orderSensitiveJudgePrompt, systemPromptToJudgeOrderSensitive } from "./prompt/order-sensitive-judge.mjs";
13
+ import { AIResponseParseError, callAI, callAIWithObjectResponse, callAIWithStringResponse } from "./service-caller/index.mjs";
14
// Namespaced debug loggers: `ai:inspect` traces element location,
// `ai:section` traces section (search-area) location.
const debugInspect = getDebug('ai:inspect');
const debugSection = getDebug('ai:section');
16
/**
 * Extract the plain-text portion of a user prompt.
 *
 * @param {string|{prompt: string}} prompt - either a bare string or a
 *   structured prompt object carrying the text in its `prompt` field.
 * @returns {string} the prompt text.
 */
const extraTextFromUserPrompt = (prompt) => {
    // Use strict equality for the typeof check (original used loose `==`).
    if (typeof prompt === 'string') return prompt;
    return prompt.prompt;
};
20
/**
 * Build the extra chat messages that carry the user's reference images.
 *
 * For each image the model receives a text message announcing the image's
 * name followed by an `image_url` message with the (possibly converted)
 * base64 payload.
 *
 * @param {{images?: Array<{name: string, url: string}>, convertHttpImage2Base64?: boolean}} multimodalPrompt
 * @returns {Promise<Array<object>>} chat messages; empty when there are no images.
 */
const promptsToChatParam = async (multimodalPrompt) => {
    const msgs = [];
    if (multimodalPrompt?.images?.length) {
        msgs.push({
            role: 'user',
            content: [
                {
                    type: 'text',
                    text: 'Next, I will provide all the reference images.'
                }
            ]
        });
        // Preprocess every image in parallel instead of awaiting one at a
        // time; message order is preserved because the per-image messages are
        // appended from the ordered results afterwards.
        const processedUrls = await Promise.all(
            multimodalPrompt.images.map((item) =>
                preProcessImageUrl(item.url, !!multimodalPrompt.convertHttpImage2Base64)
            )
        );
        multimodalPrompt.images.forEach((item, index) => {
            msgs.push({
                role: 'user',
                content: [
                    {
                        type: 'text',
                        text: `this is the reference image named '${item.name}':`
                    }
                ]
            });
            msgs.push({
                role: 'user',
                content: [
                    {
                        type: 'image_url',
                        image_url: {
                            url: processedUrls[index],
                            detail: 'high'
                        }
                    }
                ]
            });
        });
    }
    return msgs;
};
59
/**
 * Locate a single UI element matching a natural-language description by
 * sending the screenshot (or a cropped search area) to the configured model.
 *
 * Two response protocols are handled: auto-GLM models answer in plain text
 * with coordinates normalized to a 0-999 grid, while other models answer
 * with a structured object containing a `bbox`.
 *
 * @param {object} options - context, targetElementDescription, modelConfig,
 *   optional searchConfig (pre-cropped area) and abortSignal.
 * @returns {Promise<object>} rect, parseResult {elements, errors},
 *   rawResponse, usage and reasoning_content.
 */
async function AiLocateElement(options) {
    const { context, targetElementDescription, modelConfig } = options;
    const { modelFamily } = modelConfig;
    const screenshotBase64 = context.screenshot.base64;
    assert(targetElementDescription, "cannot find the target element description");
    const descriptionText = extraTextFromUserPrompt(targetElementDescription);
    const userInstructionPrompt = findElementPrompt(descriptionText);
    const systemPrompt = isAutoGLM(modelFamily)
        ? getAutoGLMLocatePrompt(modelFamily)
        : systemPromptToLocateElement(modelFamily);

    // Pick the image sent to the model and track its dimensions. When a
    // search area is supplied we use its crop; for qwen2.5-vl the full
    // screenshot is padded to the model's block size.
    let imagePayload = screenshotBase64;
    let imageWidth = context.shotSize.width;
    let imageHeight = context.shotSize.height;
    let originalImageWidth = imageWidth;
    let originalImageHeight = imageHeight;
    if (options.searchConfig) {
        assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
        assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
        imagePayload = options.searchConfig.imageBase64;
        imageWidth = options.searchConfig.rect?.width;
        imageHeight = options.searchConfig.rect?.height;
        originalImageWidth = imageWidth;
        originalImageHeight = imageHeight;
    } else if (modelFamily === 'qwen2.5-vl') {
        const padded = await paddingToMatchBlockByBase64(imagePayload);
        imageWidth = padded.width;
        imageHeight = padded.height;
        imagePayload = padded.imageBase64;
    }

    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: [
                {
                    type: 'image_url',
                    image_url: {
                        url: imagePayload,
                        detail: 'high'
                    }
                },
                {
                    type: 'text',
                    text: isAutoGLM(modelFamily) ? `Tap: ${userInstructionPrompt}` : userInstructionPrompt
                }
            ]
        }
    ];

    // Structured prompts may carry reference images; append them as extra
    // chat messages.
    if (typeof targetElementDescription !== 'string') {
        const addOns = await promptsToChatParam({
            images: targetElementDescription.images,
            convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
        });
        msgs.push(...addOns);
    }

    if (isAutoGLM(modelFamily)) {
        // auto-GLM path: plain-text response, coordinates in a 0-999 grid.
        const { content: rawResponseContent, usage } = await callAIWithStringResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
        debugInspect('auto-glm rawResponse:', rawResponseContent);
        const parsed = parseAutoGLMLocateResponse(rawResponseContent);
        debugInspect('auto-glm thinking:', parsed.think);
        debugInspect('auto-glm coordinates:', parsed.coordinates);
        let locatedRect;
        let matchedElements = [];
        let errors = [];
        if (parsed.error || !parsed.coordinates) {
            errors = [parsed.error || 'Failed to parse auto-glm response'];
            debugInspect('auto-glm parse error:', errors[0]);
        } else {
            const { x, y } = parsed.coordinates;
            debugInspect('auto-glm coordinates [0-999]:', { x, y });
            // Scale from the model's normalized 0-999 space into image pixels.
            const pixelX = Math.round(x * imageWidth / 1000);
            const pixelY = Math.round(y * imageHeight / 1000);
            debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });
            let finalX = pixelX;
            let finalY = pixelY;
            if (options.searchConfig?.rect) {
                // Translate search-area-local coordinates back to full-screen.
                finalX += options.searchConfig.rect.left;
                finalY += options.searchConfig.rect.top;
            }
            const element = generateElementByPoint([finalX, finalY], descriptionText);
            locatedRect = element.rect;
            debugInspect('auto-glm resRect:', locatedRect);
            if (element) matchedElements = [element];
        }
        return {
            rect: locatedRect,
            parseResult: {
                elements: matchedElements,
                errors
            },
            rawResponse: rawResponseContent,
            usage,
            reasoning_content: parsed.think
        };
    }

    // Default path: structured (object) response with a bbox.
    let res;
    try {
        res = await callAIWithObjectResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
    } catch (callError) {
        const errorMessage = callError instanceof Error ? callError.message : String(callError);
        const rawResponse = callError instanceof AIResponseParseError ? callError.rawResponse : errorMessage;
        const usage = callError instanceof AIResponseParseError ? callError.usage : void 0;
        return {
            rect: void 0,
            parseResult: {
                elements: [],
                errors: [`AI call error: ${errorMessage}`]
            },
            rawResponse,
            usage,
            reasoning_content: void 0
        };
    }

    const rawResponse = JSON.stringify(res.content);
    let locatedRect;
    let matchedElements = [];
    let errors = 'errors' in res.content ? res.content.errors : [];
    try {
        if ('bbox' in res.content && Array.isArray(res.content.bbox) && res.content.bbox.length >= 1) {
            locatedRect = adaptBboxToRect(res.content.bbox, imageWidth, imageHeight, options.searchConfig?.rect?.left, options.searchConfig?.rect?.top, originalImageWidth, originalImageHeight, modelFamily, options.searchConfig?.scale);
            debugInspect('resRect', locatedRect);
            const element = generateElementByRect(locatedRect, descriptionText);
            errors = [];
            if (element) matchedElements = [element];
        }
    } catch (e) {
        const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : 'unknown error in locate';
        if (errors && errors?.length !== 0) errors.push(`(${msg})`);
        else errors = [msg];
    }
    return {
        rect: locatedRect,
        parseResult: {
            elements: matchedElements,
            errors: errors
        },
        rawResponse,
        usage: res.usage,
        reasoning_content: res.reasoning_content
    };
}
224
/**
 * Locate a screen section matching a description, then crop and upscale it
 * so a follow-up element lookup can run on a sharper, smaller image.
 *
 * The model's target bbox is merged with any reference bboxes it returns,
 * expanded into a search area, cropped from the screenshot and scaled 2x.
 *
 * @param {object} options - context, sectionDescription, modelConfig and an
 *   optional abortSignal.
 * @returns {Promise<object>} rect (scaled), imageBase64, scale, error,
 *   rawResponse and usage.
 */
async function AiLocateSection(options) {
    const { context, sectionDescription, modelConfig } = options;
    const { modelFamily } = modelConfig;
    const screenshotBase64 = context.screenshot.base64;
    const systemPrompt = systemPromptToLocateSection(modelFamily);
    const instructionText = sectionLocatorInstruction(extraTextFromUserPrompt(sectionDescription));
    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: [
                {
                    type: 'image_url',
                    image_url: {
                        url: screenshotBase64,
                        detail: 'high'
                    }
                },
                {
                    type: 'text',
                    text: instructionText
                }
            ]
        }
    ];

    // Structured prompts may carry reference images.
    if (typeof sectionDescription !== 'string') {
        const addOns = await promptsToChatParam({
            images: sectionDescription.images,
            convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
        });
        msgs.push(...addOns);
    }

    let result;
    try {
        result = await callAIWithObjectResponse(msgs, modelConfig, {
            abortSignal: options.abortSignal
        });
    } catch (callError) {
        const errorMessage = callError instanceof Error ? callError.message : String(callError);
        const rawResponse = callError instanceof AIResponseParseError ? callError.rawResponse : errorMessage;
        const usage = callError instanceof AIResponseParseError ? callError.usage : void 0;
        return {
            rect: void 0,
            imageBase64: void 0,
            error: `AI call error: ${errorMessage}`,
            rawResponse,
            usage
        };
    }

    // Merge the target bbox with any reference bboxes, then expand the
    // merged area into a search rect.
    let sectionRect;
    const sectionBbox = result.content.bbox;
    if (sectionBbox) {
        const targetRect = adaptBboxToRect(sectionBbox, context.shotSize.width, context.shotSize.height, 0, 0, context.shotSize.width, context.shotSize.height, modelFamily);
        debugSection('original targetRect %j', targetRect);
        const referenceBboxList = result.content.references_bbox || [];
        debugSection('referenceBboxList %j', referenceBboxList);
        const referenceRects = referenceBboxList
            .filter((bbox) => Array.isArray(bbox))
            .map((bbox) => adaptBboxToRect(bbox, context.shotSize.width, context.shotSize.height, 0, 0, context.shotSize.width, context.shotSize.height, modelFamily));
        debugSection('referenceRects %j', referenceRects);
        const mergedRect = mergeRects([targetRect, ...referenceRects]);
        debugSection('mergedRect %j', mergedRect);
        sectionRect = expandSearchArea(mergedRect, context.shotSize);
        debugSection('expanded sectionRect %j', sectionRect);
    }

    // Crop the section out of the screenshot and scale it up 2x; the rect's
    // width/height are updated to the scaled dimensions.
    let imageBase64 = screenshotBase64;
    let scale;
    if (sectionRect) {
        const originalWidth = sectionRect.width;
        const originalHeight = sectionRect.height;
        const cropped = await cropByRect(screenshotBase64, sectionRect, modelFamily === 'qwen2.5-vl');
        const scaleRatio = 2;
        const scaled = await scaleImage(cropped.imageBase64, scaleRatio);
        imageBase64 = scaled.imageBase64;
        scale = scaleRatio;
        sectionRect.width = scaled.width;
        sectionRect.height = scaled.height;
        debugSection('scaled sectionRect from %dx%d to %dx%d (scale=%d)', originalWidth, originalHeight, sectionRect.width, sectionRect.height, scale);
    }
    return {
        rect: sectionRect,
        imageBase64,
        scale,
        error: result.content.error,
        rawResponse: JSON.stringify(result.content),
        usage: result.usage
    };
}
316
/**
 * Run a data-extraction query against the current screen.
 *
 * Builds an extraction prompt (optionally including the screenshot and any
 * extra reference images), calls the model, and parses the XML response.
 *
 * @param {object} options - dataQuery, context, extractOption,
 *   multimodalPrompt, modelConfig, optional pageDescription.
 * @returns {Promise<object>} parseResult, rawResponse, usage, reasoning_content.
 * @throws {AIResponseParseError} when the model output is not parseable XML.
 */
async function AiExtractElementInfo(options) {
    const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } = options;
    const systemPrompt = systemPromptToExtract();
    const screenshotBase64 = context.screenshot.base64;
    const extractPromptText = extractDataQueryPrompt(options.pageDescription || '', dataQuery);

    // Screenshot is included unless explicitly disabled.
    const userContent = [];
    if (extractOption?.screenshotIncluded !== false) {
        userContent.push({
            type: 'image_url',
            image_url: {
                url: screenshotBase64,
                detail: 'high'
            }
        });
    }
    userContent.push({
        type: 'text',
        text: extractPromptText
    });

    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        {
            role: 'user',
            content: userContent
        }
    ];
    if (multimodalPrompt) {
        const addOns = await promptsToChatParam({
            images: multimodalPrompt.images,
            convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
        });
        msgs.push(...addOns);
    }

    // NOTE(review): unlike the locate functions, no abortSignal is forwarded
    // to this call — confirm whether that is intentional.
    const { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelConfig);

    let parseResult;
    try {
        parseResult = parseXMLExtractionResponse(rawResponse);
    } catch (parseError) {
        const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
        throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
    }
    return {
        parseResult,
        rawResponse,
        usage,
        reasoning_content
    };
}
365
/**
 * Ask the model whether a task description is order-sensitive.
 *
 * @param {string} description - the task description to judge.
 * @param {Function} callAIFn - injected AI caller, invoked with (msgs, modelConfig).
 * @param {object} modelConfig - model configuration forwarded to the caller.
 * @returns {Promise<{isOrderSensitive: boolean, usage: object}>} defaults to
 *   `false` when the model omits the flag.
 */
async function AiJudgeOrderSensitive(description, callAIFn, modelConfig) {
    const msgs = [
        {
            role: 'system',
            content: systemPromptToJudgeOrderSensitive()
        },
        {
            role: 'user',
            content: orderSensitiveJudgePrompt(description)
        }
    ];
    const result = await callAIFn(msgs, modelConfig);
    // `?? false` keeps an explicit `false` from the model while also
    // defaulting a missing/null flag to false.
    return {
        isOrderSensitive: result.content.isOrderSensitive ?? false,
        usage: result.usage
    };
}
384
+ export { AiExtractElementInfo, AiJudgeOrderSensitive, AiLocateElement, AiLocateSection };
385
+
386
+ //# sourceMappingURL=inspect.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n generateElementByPoint,\n generateElementByRect,\n} from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n scaleImage,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: 
TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? 
getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.shotSize.width;\n let imageHeight = context.shotSize.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? 
`Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig, {\n abortSignal: options.abortSignal,\n });\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Apply offset if searching in a cropped area\n let finalX = pixelX;\n let finalY = pixelY;\n if (options.searchConfig?.rect) {\n finalX += options.searchConfig.rect.left;\n finalY += options.searchConfig.rect.top;\n }\n\n const element: LocateResultElement = generateElementByPoint(\n [finalX, finalY],\n targetElementDescriptionText as string,\n );\n\n resRect = element.rect;\n debugInspect('auto-glm resRect:', resRect);\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n 
errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<\n ReturnType<\n typeof callAIWithObjectResponse<AIElementResponse | [number, number]>\n >\n >;\n try {\n res = await callAIWithObjectResponse<AIElementResponse | [number, number]>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n options.searchConfig?.scale,\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n scale?: number;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? 
callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n sectionRect = expandSearchArea(mergedRect, context.shotSize);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n let scale: number | undefined;\n\n if (sectionRect) {\n const originalWidth = sectionRect.width;\n const originalHeight = sectionRect.height;\n\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n\n const scaleRatio = 2;\n const scaledResult = await scaleImage(\n croppedResult.imageBase64,\n scaleRatio,\n );\n\n imageBase64 = scaledResult.imageBase64;\n scale = scaleRatio;\n sectionRect.width = scaledResult.width;\n 
sectionRect.height = scaledResult.height;\n\n debugSection(\n 'scaled sectionRect from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n sectionRect.width,\n sectionRect.height,\n scale,\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n scale,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError 
instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","modelConfig","modelFamily","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","finalX","finalY","element","generateElementByPoint","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText"
,"sectionLocatorInstruction","result","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","scale","originalWidth","originalHeight","croppedResult","cropByRect","scaleRatio","scaledResult","scaleImage","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","callAIFn","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AA6DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAMrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,WAAW,EAAE,GAAGH;IAC3D,MAAM,EAAEI,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElDK,OACEJ,0BACA;IAEF,MAAMK,+BAA+BhB,wBACnCW;IAEF,MAAMM,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,UAAUP,eAC3BQ,uBAAuBR,eACvBS,4BAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAad,QAAQ,QAAQ,CAAC,KAAK;IACvC,IAAIe,cAAcf,QAAQ,QAAQ,CAAC,MAAM;IACzC,IAAIgB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIhB,QAAQ,YAAY,EAAE;QACxBM,OACEN,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFM,OACEN,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFc,eAAed,QAAQ,YAAY,CAAC,WAAW;QAC/Ce,aAAaf,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCgB,cAAchB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCiB,qBAAqB
F;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMe,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMxB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUP,eACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAON,0BAAuC;QAChD,MAAMmB,SAAS,MAAM5B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI0B;IACf;IAEA,IAAIV,UAAUP,cAAc;QAC1B,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB7B,MAAMQ,aAAa;YAChD,aAAaH,QAAQ,WAAW;QAClC;QAEFZ,aAAa,yBAAyBkC;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1ClC,aAAa,sBAAsBqC,OAAO,KAAK;QAC/CrC,aAAa,yBAAyBqC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DrC,aAAa,yBAAyByC,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCrC,aAAa,iCAAiC;gBAAE0C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C5B,aAAa,+BAA+B;gBAAE4C;gBAAQE;YAAO;YAG7D,IAAIC,SAASH;YACb,IAAII,SAASF;YACb,IAAIlC,QAAQ,YAAY,EAAE,MAAM;gBAC9BmC,UAAUnC,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBACxCoC,UAAUpC,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YACzC;YAEA,MAAMqC,UAA+BC,uBACnC;gBAACH;gBAAQC;aAAO,EAChB7B;YAGFoB,UAAUU,QAAQ,IAAI;YACtBjD,aAAa,qBAAqBuC;YAElC,IAAIU,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMV;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIc;IAKJ,IAAI;QACFA,MAAM,MAAMC,yBACV7C,MACAQ,aACA;YAAE,aAAaH,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACAtB;YACA,mBAAmBwB;QAC
rB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIZ;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYU,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBU,MAAM,OAAO,CAACV,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAZ,UAAUuB,gBACRX,IAAI,OAAO,CAAC,IAAI,EAChBxB,YACAC,aACAhB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BiB,oBACAC,qBACAd,aACAJ,QAAQ,YAAY,EAAE;YAGxBZ,aAAa,WAAWuC;YAExB,MAAMU,UAA+Bc,sBACnCxB,SACApB;YAEFsB,SAAS,EAAE;YAEX,IAAIQ,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;IACF,EAAE,OAAOe,GAAG;QACV,MAAMC,MACJD,aAAaT,QACT,CAAC,sBAAsB,EAAES,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACvB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEwB,IAAI,CAAC,CAAC;aAFtBxB,SAAS;YAACwB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAM1B;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAgB;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAee,gBAAgBtD,OAKrC;IAQC,MAAM,EAAEC,OAAO,EAAEsD,kBAAkB,EAAEpD,WAAW,EAAE,GAAGH;IACrD,MAAM,EAAEI,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMS,eAAe8C,4BAA4BpD;IACjD,MAAMqD,gCAAgCC,0BACpCnE,wBAAwBgE;IAE1B,MAAM5D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMoD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMlC,SAAS,MAAM5B,mBAAmB;YACtC,QAAQ8D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA5D,KAAK,IAAI,IAAI0B;IACf;IAEA,IAAIsC;IAGJ,IAAI;QACFA,SAAS,MAAMnB,yBACb7C,MACAQ,aACA;YAAE,aAAaH,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACAtB;QACF;IACF;IAEA,IAAIqC;IACJ,MAAMC,cAAcF,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIE,aAAa;QACf,MAAMC,aAAaZ,gBACjBW,aACA5D,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAA
C,MAAM,EACvBG;QAEFd,aAAa,0BAA0BwE;QAEvC,MAAMC,oBAAoBJ,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DrE,aAAa,wBAAwByE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAShB,MAAM,OAAO,CAACgB,OAC/B,GAAG,CAAC,CAACA,OACGf,gBACLe,MACAhE,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBG;QAGNd,aAAa,qBAAqB0E;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7D1E,aAAa,iBAAiB4E;QAE9BN,cAAcQ,iBAAiBF,YAAYjE,QAAQ,QAAQ;QAC3DX,aAAa,2BAA2BsE;IAC1C;IAEA,IAAIS,cAAchE;IAClB,IAAIiE;IAEJ,IAAIV,aAAa;QACf,MAAMW,gBAAgBX,YAAY,KAAK;QACvC,MAAMY,iBAAiBZ,YAAY,MAAM;QAEzC,MAAMa,gBAAgB,MAAMC,WAC1BrE,kBACAuD,aACAxD,AAAgB,iBAAhBA;QAGF,MAAMuE,aAAa;QACnB,MAAMC,eAAe,MAAMC,WACzBJ,cAAc,WAAW,EACzBE;QAGFN,cAAcO,aAAa,WAAW;QACtCN,QAAQK;QACRf,YAAY,KAAK,GAAGgB,aAAa,KAAK;QACtChB,YAAY,MAAM,GAAGgB,aAAa,MAAM;QAExCtF,aACE,qDACAiF,eACAC,gBACAZ,YAAY,KAAK,EACjBA,YAAY,MAAM,EAClBU;IAEJ;IAEA,OAAO;QACL,MAAMV;QACNS;QACAC;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAemB,qBAAwB9E,OAO7C;IACC,MAAM,EAAE+E,SAAS,EAAE9E,OAAO,EAAE+E,aAAa,EAAEtF,gBAAgB,EAAES,WAAW,EAAE,GACxEH;IACF,MAAMU,eAAeuE;IACrB,MAAM5E,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMiF,wBAAwBC,uBAC5BnF,QAAQ,eAAe,IAAI,IAC3B+E;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAK/E;YACL,QAAQ;QACV;IACF;IAGF+E,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMvF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAAS0E;QACX;KACD;IAED,IAAI1F,kBAAkB;QACpB,MAAM2B,SAAS,MAAM5B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI0B;IACf;IAEA,MAAM,EACJ,SAASwB,WAAW,EACpBtB,KAAK,EACL8D,iBAAiB,EAClB,GAAG,MAAMC,OAAO3F,MAAMQ;IAGvB,IAAIoF;IACJ,IAAI;QACFA,cAAcC,2BAA8B3C;IAC9C,EAAE,OAAO4C,YAAY;QAEnB,MAAM/C,eACJ+C,sBAAsB9C,QAAQ8C,WAAW,OAAO,GAAG7C,OAAO6C;QAC5D,MAAM,IAAI3C,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAtB;IAEJ;IAEA,OAAO;QACLgE;QACA1C;QACAtB;QACA8D;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBC,QAAwE,
EACxEzF,WAAyB;IAKzB,MAAMO,eAAemF;IACrB,MAAMC,aAAaC,0BAA0BJ;IAE7C,MAAMhG,OAAe;QACnB;YAAE,MAAM;YAAU,SAASe;QAAa;QACxC;YACE,MAAM;YACN,SAASoF;QACX;KACD;IAED,MAAMnC,SAAS,MAAMiC,SAASjG,MAAMQ;IAEpC,OAAO;QACL,kBAAkBwD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -0,0 +1,233 @@
1
+ import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
2
+ import { getDebug } from "@midscene/shared/logger";
3
+ import { assert } from "@midscene/shared/utils";
4
+ import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "../common.mjs";
5
+ import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
6
+ import { extractXMLTag, parseMarkFinishedIndexes, parseSubGoalsFromXML } from "./prompt/util.mjs";
7
+ import { AIResponseParseError, callAI, safeParseJson } from "./service-caller/index.mjs";
8
// Namespaced debug logger for the planning loop; silent unless the
// 'planning' debug channel is enabled.
const debug = getDebug('planning');
// Same channel, but mirrored to the console so warnings stay visible
// even when debug output is disabled.
const warnLog = getDebug('planning', {
    console: true
});
12
/**
 * Parse the model's XML-ish planning response into a structured plan object.
 *
 * Recognized tags: <thought>, <memory>, <log>, <error>, <action-type>,
 * <action-param-json>, <complete success="...">, <update-plan-content>,
 * <mark-sub-goal-done>. Optional fields are omitted from the result when
 * absent or empty; `log` and `action` are always present.
 *
 * @param {string} xmlString - Raw model output.
 * @param {string} modelFamily - Forwarded to `safeParseJson` for
 *   model-specific JSON repair.
 * @returns {object} Parsed plan fields.
 * @throws {Error} When <action-param-json> exists but cannot be parsed.
 */
function parseXMLPlanningResponse(xmlString, modelFamily) {
    const readTag = (name) => extractXMLTag(xmlString, name);

    const thought = readTag('thought');
    const memory = readTag('memory');
    const logText = readTag('log') || '';
    const errorText = readTag('error');
    const actionType = readTag('action-type');
    const actionParamStr = readTag('action-param-json');

    // <complete success="true|false">optional message</complete> signals that
    // the model considers the overall goal finished.
    let finalizeSuccess;
    let finalizeMessage;
    const completeMatch = xmlString.match(
        /<complete\s+success="(true|false)">([\s\S]*?)<\/complete>/i
    );
    if (completeMatch) {
        finalizeSuccess = completeMatch[1] === 'true';
        // An empty/whitespace-only message is treated as no message.
        finalizeMessage = completeMatch[2]?.trim() || undefined;
    }

    // Sub-goal bookkeeping tags (only meaningful in deep-think mode).
    const planContent = readTag('update-plan-content');
    const doneContent = readTag('mark-sub-goal-done');
    const updateSubGoals = planContent ? parseSubGoalsFromXML(planContent) : undefined;
    const markFinishedIndexes = doneContent ? parseMarkFinishedIndexes(doneContent) : undefined;

    // Decode the action block; the literal string "null" means "no action".
    let action = null;
    if (actionType && actionType.toLowerCase() !== 'null') {
        let param;
        if (actionParamStr) {
            try {
                param = safeParseJson(actionParamStr, modelFamily);
            } catch (e) {
                throw new Error(`Failed to parse action-param-json: ${e}`);
            }
        }
        action = param !== undefined
            ? { type: actionType.trim(), param }
            : { type: actionType.trim() };
    }

    // Assemble the result, keeping the key order stable and leaving optional
    // fields out entirely when they carry no information.
    const parsed = {};
    if (thought) parsed.thought = thought;
    if (memory) parsed.memory = memory;
    parsed.log = logText;
    if (errorText) parsed.error = errorText;
    parsed.action = action;
    if (finalizeMessage !== undefined) parsed.finalizeMessage = finalizeMessage;
    if (finalizeSuccess !== undefined) parsed.finalizeSuccess = finalizeSuccess;
    if (updateSubGoals?.length) parsed.updateSubGoals = updateSubGoals;
    if (markFinishedIndexes?.length) parsed.markFinishedIndexes = markFinishedIndexes;
    return parsed;
}
73
/**
 * Run one planning round: build the prompt from the current screenshot and
 * conversation history, call the model, parse its XML response, and update
 * the conversation history with the outcome.
 *
 * @param {string} userInstruction - The user's goal, embedded verbatim in the
 *   <user_instruction> tag of the prompt.
 * @param {object} opts - Planning options: `context` (screenshot + shot size),
 *   `modelConfig`, `conversationHistory`, `actionSpace`, and optionally
 *   `deepThink`, `includeBbox`, `actionContext`, `imagesIncludeCount`,
 *   `abortSignal`.
 * @returns {Promise<object>} Parsed plan plus `actions`, `rawResponse`,
 *   `usage`, `reasoning_content`, `yamlFlow`, `shouldContinuePlanning`.
 * @throws {AIResponseParseError} When the response (and one retry) cannot be
 *   parsed as a planning XML document.
 */
async function plan(userInstruction, opts) {
    const { context, modelConfig, conversationHistory } = opts;
    const { shotSize } = context;
    const screenshotBase64 = context.screenshot.base64;
    const { modelFamily } = modelConfig;
    // Sub-goal tracking is only active when deepThink is explicitly `true`
    // (opts.deepThink may also be false or the string 'unset').
    const includeSubGoals = true === opts.deepThink;
    const systemPrompt = await systemPromptToTaskPlanning({
        actionSpace: opts.actionSpace,
        modelFamily,
        includeBbox: opts.includeBbox,
        includeThought: true,
        includeSubGoals
    });
    let imagePayload = screenshotBase64;
    let imageWidth = shotSize.width;
    let imageHeight = shotSize.height;
    // qwen2.5-vl expects image dimensions padded to its block size; track the
    // padded dimensions because bbox results are scaled against them below.
    if ('qwen2.5-vl' === modelFamily) {
        const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
        imageWidth = paddedResult.width;
        imageHeight = paddedResult.height;
        imagePayload = paddedResult.imageBase64;
    }
    const actionContext = opts.actionContext ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\n` : '';
    const instruction = [
        {
            role: 'user',
            content: [
                {
                    type: 'text',
                    text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`
                }
            ]
        }
    ];
    let latestFeedbackMessage;
    // In sub-goal mode the history contributes the sub-goal list; otherwise a
    // flat historical log. Either is appended to the screenshot message below.
    const subGoalsText = includeSubGoals ? conversationHistory.subGoalsToText() : conversationHistory.historicalLogsToText();
    const subGoalsSection = subGoalsText ? `\n\n${subGoalsText}` : '';
    const memoriesText = conversationHistory.memoriesToText();
    const memoriesSection = memoriesText ? `\n\n${memoriesText}` : '';
    // If the previous action produced feedback, fold it into this round's
    // screenshot message and consume it so it is only delivered once.
    if (conversationHistory.pendingFeedbackMessage) {
        latestFeedbackMessage = {
            role: 'user',
            content: [
                {
                    type: 'text',
                    text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${subGoalsSection}`
                },
                {
                    type: 'image_url',
                    image_url: {
                        url: imagePayload,
                        detail: 'high'
                    }
                }
            ]
        };
        conversationHistory.resetPendingFeedbackMessageIfExists();
    } else latestFeedbackMessage = {
        role: 'user',
        content: [
            {
                type: 'text',
                text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`
            },
            {
                type: 'image_url',
                image_url: {
                    url: imagePayload,
                    detail: 'high'
                }
            }
        ]
    };
    conversationHistory.append(latestFeedbackMessage);
    // Bound the history size before snapshotting it into the prompt.
    conversationHistory.compressHistory(50, 20);
    const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);
    const msgs = [
        {
            role: 'system',
            content: systemPrompt
        },
        ...instruction,
        ...historyLog
    ];
    // `let` bindings: the retry path below reassigns all three.
    let { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelConfig, {
        deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
        abortSignal: opts.abortSignal
    });
    let planFromAI;
    try {
        try {
            planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
        } catch {
            // First parse failed: re-ask the model once with the same prompt,
            // then let a second parse failure propagate to the outer catch.
            const retry = await callAI(msgs, modelConfig, {
                deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
                abortSignal: opts.abortSignal
            });
            rawResponse = retry.content;
            usage = retry.usage;
            reasoning_content = retry.reasoning_content;
            planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
        }
        // An action and a <complete> tag are mutually exclusive; prefer the
        // action and drop the finalize fields so planning continues.
        if (planFromAI.action && void 0 !== planFromAI.finalizeSuccess) {
            warnLog('Planning response included both an action and <complete>; ignoring <complete> output.');
            planFromAI.finalizeMessage = void 0;
            planFromAI.finalizeSuccess = void 0;
        }
        const actions = planFromAI.action ? [
            planFromAI.action
        ] : [];
        let shouldContinuePlanning = true;
        if (void 0 !== planFromAI.finalizeSuccess) {
            debug('task completed via <complete> tag, stop planning');
            shouldContinuePlanning = false;
            if (includeSubGoals) conversationHistory.markAllSubGoalsFinished();
        }
        const returnValue = {
            ...planFromAI,
            actions,
            rawResponse,
            usage,
            reasoning_content,
            yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),
            shouldContinuePlanning
        };
        assert(planFromAI, "can't get plans from AI");
        // Post-process each action: convert any locator fields from the
        // model's bbox coordinate space to pixel coordinates, in place.
        actions.forEach((action)=>{
            const type = action.type;
            const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
            debug('actionInActionSpace matched', actionInActionSpace);
            const locateFields = actionInActionSpace ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema) : [];
            debug('locateFields', locateFields);
            locateFields.forEach((field)=>{
                const locateResult = action.param[field];
                if (locateResult && void 0 !== modelFamily) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, modelFamily);
            });
        });
        // Feed the parsed results back into the conversation history so the
        // next round sees updated sub-goals / logs / memories.
        if (includeSubGoals) {
            if (planFromAI.updateSubGoals?.length) conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);
            if (planFromAI.markFinishedIndexes?.length) for (const index of planFromAI.markFinishedIndexes)conversationHistory.markSubGoalFinished(index);
            if (planFromAI.log) conversationHistory.appendSubGoalLog(planFromAI.log);
        } else if (planFromAI.log) conversationHistory.appendHistoricalLog(planFromAI.log);
        if (planFromAI.memory) conversationHistory.appendMemory(planFromAI.memory);
        conversationHistory.append({
            role: 'assistant',
            content: [
                {
                    type: 'text',
                    text: rawResponse
                }
            ]
        });
        return returnValue;
    } catch (parseError) {
        // Preserve the raw response and token usage for the caller's
        // diagnostics/accounting even when parsing fails.
        const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
        throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
    }
}
231
+ export { parseXMLPlanningResponse, plan };
232
+
233
+ //# sourceMappingURL=llm-planning.mjs.map