@aiscene/core 1.6.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (279) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +9 -0
  3. package/dist/es/agent/agent.mjs +749 -0
  4. package/dist/es/agent/agent.mjs.map +1 -0
  5. package/dist/es/agent/common.mjs +0 -0
  6. package/dist/es/agent/execution-session.mjs +41 -0
  7. package/dist/es/agent/execution-session.mjs.map +1 -0
  8. package/dist/es/agent/index.mjs +6 -0
  9. package/dist/es/agent/task-builder.mjs +332 -0
  10. package/dist/es/agent/task-builder.mjs.map +1 -0
  11. package/dist/es/agent/task-cache.mjs +214 -0
  12. package/dist/es/agent/task-cache.mjs.map +1 -0
  13. package/dist/es/agent/tasks.mjs +424 -0
  14. package/dist/es/agent/tasks.mjs.map +1 -0
  15. package/dist/es/agent/ui-utils.mjs +91 -0
  16. package/dist/es/agent/ui-utils.mjs.map +1 -0
  17. package/dist/es/agent/utils.mjs +198 -0
  18. package/dist/es/agent/utils.mjs.map +1 -0
  19. package/dist/es/ai-model/auto-glm/actions.mjs +237 -0
  20. package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -0
  21. package/dist/es/ai-model/auto-glm/index.mjs +6 -0
  22. package/dist/es/ai-model/auto-glm/parser.mjs +239 -0
  23. package/dist/es/ai-model/auto-glm/parser.mjs.map +1 -0
  24. package/dist/es/ai-model/auto-glm/planning.mjs +71 -0
  25. package/dist/es/ai-model/auto-glm/planning.mjs.map +1 -0
  26. package/dist/es/ai-model/auto-glm/prompt.mjs +222 -0
  27. package/dist/es/ai-model/auto-glm/prompt.mjs.map +1 -0
  28. package/dist/es/ai-model/auto-glm/util.mjs +9 -0
  29. package/dist/es/ai-model/auto-glm/util.mjs.map +1 -0
  30. package/dist/es/ai-model/conversation-history.mjs +195 -0
  31. package/dist/es/ai-model/conversation-history.mjs.map +1 -0
  32. package/dist/es/ai-model/index.mjs +11 -0
  33. package/dist/es/ai-model/inspect.mjs +394 -0
  34. package/dist/es/ai-model/inspect.mjs.map +1 -0
  35. package/dist/es/ai-model/llm-planning.mjs +233 -0
  36. package/dist/es/ai-model/llm-planning.mjs.map +1 -0
  37. package/dist/es/ai-model/prompt/common.mjs +7 -0
  38. package/dist/es/ai-model/prompt/common.mjs.map +1 -0
  39. package/dist/es/ai-model/prompt/describe.mjs +66 -0
  40. package/dist/es/ai-model/prompt/describe.mjs.map +1 -0
  41. package/dist/es/ai-model/prompt/extraction.mjs +169 -0
  42. package/dist/es/ai-model/prompt/extraction.mjs.map +1 -0
  43. package/dist/es/ai-model/prompt/llm-locator.mjs +51 -0
  44. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -0
  45. package/dist/es/ai-model/prompt/llm-planning.mjs +568 -0
  46. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -0
  47. package/dist/es/ai-model/prompt/llm-section-locator.mjs +44 -0
  48. package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -0
  49. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
  50. package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
  51. package/dist/es/ai-model/prompt/playwright-generator.mjs +117 -0
  52. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -0
  53. package/dist/es/ai-model/prompt/ui-tars-planning.mjs +36 -0
  54. package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -0
  55. package/dist/es/ai-model/prompt/util.mjs +59 -0
  56. package/dist/es/ai-model/prompt/util.mjs.map +1 -0
  57. package/dist/es/ai-model/prompt/yaml-generator.mjs +219 -0
  58. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -0
  59. package/dist/es/ai-model/service-caller/codex-app-server.mjs +575 -0
  60. package/dist/es/ai-model/service-caller/codex-app-server.mjs.map +1 -0
  61. package/dist/es/ai-model/service-caller/image-detail.mjs +6 -0
  62. package/dist/es/ai-model/service-caller/image-detail.mjs.map +1 -0
  63. package/dist/es/ai-model/service-caller/index.mjs +473 -0
  64. package/dist/es/ai-model/service-caller/index.mjs.map +1 -0
  65. package/dist/es/ai-model/ui-tars-planning.mjs +249 -0
  66. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -0
  67. package/dist/es/common.mjs +371 -0
  68. package/dist/es/common.mjs.map +1 -0
  69. package/dist/es/device/device-options.mjs +0 -0
  70. package/dist/es/device/index.mjs +341 -0
  71. package/dist/es/device/index.mjs.map +1 -0
  72. package/dist/es/dump/html-utils.mjs +284 -0
  73. package/dist/es/dump/html-utils.mjs.map +1 -0
  74. package/dist/es/dump/image-restoration.mjs +43 -0
  75. package/dist/es/dump/image-restoration.mjs.map +1 -0
  76. package/dist/es/dump/index.mjs +3 -0
  77. package/dist/es/index.mjs +15 -0
  78. package/dist/es/index.mjs.map +1 -0
  79. package/dist/es/report-generator.mjs +162 -0
  80. package/dist/es/report-generator.mjs.map +1 -0
  81. package/dist/es/report.mjs +137 -0
  82. package/dist/es/report.mjs.map +1 -0
  83. package/dist/es/screenshot-item.mjs +105 -0
  84. package/dist/es/screenshot-item.mjs.map +1 -0
  85. package/dist/es/service/index.mjs +274 -0
  86. package/dist/es/service/index.mjs.map +1 -0
  87. package/dist/es/service/utils.mjs +15 -0
  88. package/dist/es/service/utils.mjs.map +1 -0
  89. package/dist/es/skill/index.mjs +38 -0
  90. package/dist/es/skill/index.mjs.map +1 -0
  91. package/dist/es/task-runner.mjs +263 -0
  92. package/dist/es/task-runner.mjs.map +1 -0
  93. package/dist/es/task-timing.mjs +12 -0
  94. package/dist/es/task-timing.mjs.map +1 -0
  95. package/dist/es/tree.mjs +13 -0
  96. package/dist/es/tree.mjs.map +1 -0
  97. package/dist/es/types.mjs +199 -0
  98. package/dist/es/types.mjs.map +1 -0
  99. package/dist/es/utils.mjs +229 -0
  100. package/dist/es/utils.mjs.map +1 -0
  101. package/dist/es/yaml/builder.mjs +13 -0
  102. package/dist/es/yaml/builder.mjs.map +1 -0
  103. package/dist/es/yaml/index.mjs +4 -0
  104. package/dist/es/yaml/player.mjs +434 -0
  105. package/dist/es/yaml/player.mjs.map +1 -0
  106. package/dist/es/yaml/utils.mjs +102 -0
  107. package/dist/es/yaml/utils.mjs.map +1 -0
  108. package/dist/es/yaml.mjs +0 -0
  109. package/dist/lib/agent/agent.js +797 -0
  110. package/dist/lib/agent/agent.js.map +1 -0
  111. package/dist/lib/agent/common.js +5 -0
  112. package/dist/lib/agent/execution-session.js +75 -0
  113. package/dist/lib/agent/execution-session.js.map +1 -0
  114. package/dist/lib/agent/index.js +81 -0
  115. package/dist/lib/agent/index.js.map +1 -0
  116. package/dist/lib/agent/task-builder.js +369 -0
  117. package/dist/lib/agent/task-builder.js.map +1 -0
  118. package/dist/lib/agent/task-cache.js +266 -0
  119. package/dist/lib/agent/task-cache.js.map +1 -0
  120. package/dist/lib/agent/tasks.js +467 -0
  121. package/dist/lib/agent/tasks.js.map +1 -0
  122. package/dist/lib/agent/ui-utils.js +143 -0
  123. package/dist/lib/agent/ui-utils.js.map +1 -0
  124. package/dist/lib/agent/utils.js +275 -0
  125. package/dist/lib/agent/utils.js.map +1 -0
  126. package/dist/lib/ai-model/auto-glm/actions.js +271 -0
  127. package/dist/lib/ai-model/auto-glm/actions.js.map +1 -0
  128. package/dist/lib/ai-model/auto-glm/index.js +66 -0
  129. package/dist/lib/ai-model/auto-glm/index.js.map +1 -0
  130. package/dist/lib/ai-model/auto-glm/parser.js +282 -0
  131. package/dist/lib/ai-model/auto-glm/parser.js.map +1 -0
  132. package/dist/lib/ai-model/auto-glm/planning.js +105 -0
  133. package/dist/lib/ai-model/auto-glm/planning.js.map +1 -0
  134. package/dist/lib/ai-model/auto-glm/prompt.js +259 -0
  135. package/dist/lib/ai-model/auto-glm/prompt.js.map +1 -0
  136. package/dist/lib/ai-model/auto-glm/util.js +46 -0
  137. package/dist/lib/ai-model/auto-glm/util.js.map +1 -0
  138. package/dist/lib/ai-model/conversation-history.js +229 -0
  139. package/dist/lib/ai-model/conversation-history.js.map +1 -0
  140. package/dist/lib/ai-model/index.js +125 -0
  141. package/dist/lib/ai-model/index.js.map +1 -0
  142. package/dist/lib/ai-model/inspect.js +440 -0
  143. package/dist/lib/ai-model/inspect.js.map +1 -0
  144. package/dist/lib/ai-model/llm-planning.js +270 -0
  145. package/dist/lib/ai-model/llm-planning.js.map +1 -0
  146. package/dist/lib/ai-model/prompt/common.js +41 -0
  147. package/dist/lib/ai-model/prompt/common.js.map +1 -0
  148. package/dist/lib/ai-model/prompt/describe.js +100 -0
  149. package/dist/lib/ai-model/prompt/describe.js.map +1 -0
  150. package/dist/lib/ai-model/prompt/extraction.js +209 -0
  151. package/dist/lib/ai-model/prompt/extraction.js.map +1 -0
  152. package/dist/lib/ai-model/prompt/llm-locator.js +88 -0
  153. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -0
  154. package/dist/lib/ai-model/prompt/llm-planning.js +605 -0
  155. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -0
  156. package/dist/lib/ai-model/prompt/llm-section-locator.js +81 -0
  157. package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -0
  158. package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
  159. package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
  160. package/dist/lib/ai-model/prompt/playwright-generator.js +178 -0
  161. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -0
  162. package/dist/lib/ai-model/prompt/ui-tars-planning.js +73 -0
  163. package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -0
  164. package/dist/lib/ai-model/prompt/util.js +105 -0
  165. package/dist/lib/ai-model/prompt/util.js.map +1 -0
  166. package/dist/lib/ai-model/prompt/yaml-generator.js +280 -0
  167. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -0
  168. package/dist/lib/ai-model/service-caller/codex-app-server.js +624 -0
  169. package/dist/lib/ai-model/service-caller/codex-app-server.js.map +1 -0
  170. package/dist/lib/ai-model/service-caller/image-detail.js +40 -0
  171. package/dist/lib/ai-model/service-caller/image-detail.js.map +1 -0
  172. package/dist/lib/ai-model/service-caller/index.js +538 -0
  173. package/dist/lib/ai-model/service-caller/index.js.map +1 -0
  174. package/dist/lib/ai-model/ui-tars-planning.js +283 -0
  175. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -0
  176. package/dist/lib/common.js +480 -0
  177. package/dist/lib/common.js.map +1 -0
  178. package/dist/lib/device/device-options.js +20 -0
  179. package/dist/lib/device/device-options.js.map +1 -0
  180. package/dist/lib/device/index.js +468 -0
  181. package/dist/lib/device/index.js.map +1 -0
  182. package/dist/lib/dump/html-utils.js +357 -0
  183. package/dist/lib/dump/html-utils.js.map +1 -0
  184. package/dist/lib/dump/image-restoration.js +77 -0
  185. package/dist/lib/dump/image-restoration.js.map +1 -0
  186. package/dist/lib/dump/index.js +60 -0
  187. package/dist/lib/dump/index.js.map +1 -0
  188. package/dist/lib/index.js +146 -0
  189. package/dist/lib/index.js.map +1 -0
  190. package/dist/lib/report-generator.js +200 -0
  191. package/dist/lib/report-generator.js.map +1 -0
  192. package/dist/lib/report.js +171 -0
  193. package/dist/lib/report.js.map +1 -0
  194. package/dist/lib/screenshot-item.js +139 -0
  195. package/dist/lib/screenshot-item.js.map +1 -0
  196. package/dist/lib/service/index.js +308 -0
  197. package/dist/lib/service/index.js.map +1 -0
  198. package/dist/lib/service/utils.js +49 -0
  199. package/dist/lib/service/utils.js.map +1 -0
  200. package/dist/lib/skill/index.js +72 -0
  201. package/dist/lib/skill/index.js.map +1 -0
  202. package/dist/lib/task-runner.js +300 -0
  203. package/dist/lib/task-runner.js.map +1 -0
  204. package/dist/lib/task-timing.js +46 -0
  205. package/dist/lib/task-timing.js.map +1 -0
  206. package/dist/lib/tree.js +53 -0
  207. package/dist/lib/tree.js.map +1 -0
  208. package/dist/lib/types.js +288 -0
  209. package/dist/lib/types.js.map +1 -0
  210. package/dist/lib/utils.js +308 -0
  211. package/dist/lib/utils.js.map +1 -0
  212. package/dist/lib/yaml/builder.js +57 -0
  213. package/dist/lib/yaml/builder.js.map +1 -0
  214. package/dist/lib/yaml/index.js +81 -0
  215. package/dist/lib/yaml/index.js.map +1 -0
  216. package/dist/lib/yaml/player.js +468 -0
  217. package/dist/lib/yaml/player.js.map +1 -0
  218. package/dist/lib/yaml/utils.js +155 -0
  219. package/dist/lib/yaml/utils.js.map +1 -0
  220. package/dist/lib/yaml.js +20 -0
  221. package/dist/lib/yaml.js.map +1 -0
  222. package/dist/types/agent/agent.d.ts +205 -0
  223. package/dist/types/agent/common.d.ts +0 -0
  224. package/dist/types/agent/execution-session.d.ts +36 -0
  225. package/dist/types/agent/index.d.ts +10 -0
  226. package/dist/types/agent/task-builder.d.ts +34 -0
  227. package/dist/types/agent/task-cache.d.ts +49 -0
  228. package/dist/types/agent/tasks.d.ts +69 -0
  229. package/dist/types/agent/ui-utils.d.ts +14 -0
  230. package/dist/types/agent/utils.d.ts +31 -0
  231. package/dist/types/ai-model/auto-glm/actions.d.ts +78 -0
  232. package/dist/types/ai-model/auto-glm/index.d.ts +6 -0
  233. package/dist/types/ai-model/auto-glm/parser.d.ts +18 -0
  234. package/dist/types/ai-model/auto-glm/planning.d.ts +12 -0
  235. package/dist/types/ai-model/auto-glm/prompt.d.ts +27 -0
  236. package/dist/types/ai-model/auto-glm/util.d.ts +13 -0
  237. package/dist/types/ai-model/conversation-history.d.ts +105 -0
  238. package/dist/types/ai-model/index.d.ts +14 -0
  239. package/dist/types/ai-model/inspect.d.ts +67 -0
  240. package/dist/types/ai-model/llm-planning.d.ts +19 -0
  241. package/dist/types/ai-model/prompt/common.d.ts +2 -0
  242. package/dist/types/ai-model/prompt/describe.d.ts +1 -0
  243. package/dist/types/ai-model/prompt/extraction.d.ts +7 -0
  244. package/dist/types/ai-model/prompt/llm-locator.d.ts +3 -0
  245. package/dist/types/ai-model/prompt/llm-planning.d.ts +10 -0
  246. package/dist/types/ai-model/prompt/llm-section-locator.d.ts +3 -0
  247. package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
  248. package/dist/types/ai-model/prompt/playwright-generator.d.ts +26 -0
  249. package/dist/types/ai-model/prompt/ui-tars-planning.d.ts +2 -0
  250. package/dist/types/ai-model/prompt/util.d.ts +33 -0
  251. package/dist/types/ai-model/prompt/yaml-generator.d.ts +100 -0
  252. package/dist/types/ai-model/service-caller/codex-app-server.d.ts +42 -0
  253. package/dist/types/ai-model/service-caller/image-detail.d.ts +2 -0
  254. package/dist/types/ai-model/service-caller/index.d.ts +49 -0
  255. package/dist/types/ai-model/ui-tars-planning.d.ts +72 -0
  256. package/dist/types/common.d.ts +288 -0
  257. package/dist/types/device/device-options.d.ts +142 -0
  258. package/dist/types/device/index.d.ts +2528 -0
  259. package/dist/types/dump/html-utils.d.ts +63 -0
  260. package/dist/types/dump/image-restoration.d.ts +6 -0
  261. package/dist/types/dump/index.d.ts +5 -0
  262. package/dist/types/index.d.ts +17 -0
  263. package/dist/types/report-generator.d.ts +66 -0
  264. package/dist/types/report.d.ts +22 -0
  265. package/dist/types/screenshot-item.d.ts +66 -0
  266. package/dist/types/service/index.d.ts +24 -0
  267. package/dist/types/service/utils.d.ts +2 -0
  268. package/dist/types/skill/index.d.ts +25 -0
  269. package/dist/types/task-runner.d.ts +50 -0
  270. package/dist/types/task-timing.d.ts +8 -0
  271. package/dist/types/tree.d.ts +4 -0
  272. package/dist/types/types.d.ts +669 -0
  273. package/dist/types/utils.d.ts +40 -0
  274. package/dist/types/yaml/builder.d.ts +2 -0
  275. package/dist/types/yaml/index.d.ts +4 -0
  276. package/dist/types/yaml/player.d.ts +34 -0
  277. package/dist/types/yaml/utils.d.ts +9 -0
  278. package/dist/types/yaml.d.ts +215 -0
  279. package/package.json +111 -0
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n generateElementByPoint,\n generateElementByRect,\n} from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n scaleImage,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nexport async function 
buildSearchAreaConfig(options: {\n context: UIContext;\n baseRect: Rect;\n modelFamily: IModelConfig['modelFamily'];\n}): Promise<{ rect: Rect; imageBase64: string; scale: number }> {\n const { context, baseRect, modelFamily } = options;\n const scaleRatio = 2;\n const sectionRect = expandSearchArea(baseRect, context.shotSize);\n\n const croppedResult = await cropByRect(\n context.screenshot.base64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n\n const scaledResult = await scaleImage(croppedResult.imageBase64, scaleRatio);\n sectionRect.width = scaledResult.width;\n sectionRect.height = scaledResult.height;\n return {\n rect: sectionRect,\n imageBase64: scaledResult.imageBase64,\n scale: scaleRatio,\n };\n}\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n parseResult: {\n elements: 
LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.shotSize.width;\n let imageHeight = context.shotSize.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? 
`Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig, {\n abortSignal: options.abortSignal,\n });\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Apply offset if searching in a cropped area\n let finalX = pixelX;\n let finalY = pixelY;\n if (options.searchConfig?.rect) {\n finalX += options.searchConfig.rect.left;\n finalY += options.searchConfig.rect.top;\n }\n\n const element: LocateResultElement = generateElementByPoint(\n [finalX, finalY],\n targetElementDescriptionText as string,\n );\n\n resRect = element.rect;\n debugInspect('auto-glm resRect:', resRect);\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n 
errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<\n ReturnType<\n typeof callAIWithObjectResponse<AIElementResponse | [number, number]>\n >\n >;\n try {\n res = await callAIWithObjectResponse<AIElementResponse | [number, number]>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n options.searchConfig?.scale,\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n abortSignal?: AbortSignal;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n scale?: number;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n { abortSignal: options.abortSignal },\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? 
callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let searchAreaConfig:\n | Awaited<ReturnType<typeof buildSearchAreaConfig>>\n | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.shotSize.width,\n context.shotSize.height,\n 0,\n 0,\n context.shotSize.width,\n context.shotSize.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n const expandedRect = expandSearchArea(mergedRect, context.shotSize);\n const originalWidth = expandedRect.width;\n const originalHeight = expandedRect.height;\n debugSection('expanded sectionRect %j', expandedRect);\n\n searchAreaConfig = await buildSearchAreaConfig({\n context,\n baseRect: mergedRect,\n modelFamily,\n });\n\n debugSection(\n 'scaled sectionRect from %dx%d to %dx%d (scale=%d)',\n originalWidth,\n originalHeight,\n searchAreaConfig.rect.width,\n searchAreaConfig.rect.height,\n searchAreaConfig.scale,\n );\n }\n\n return {\n rect: searchAreaConfig?.rect,\n imageBase64: 
searchAreaConfig?.imageBase64,\n scale: searchAreaConfig?.scale,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? 
parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","buildSearchAreaConfig","options","context","baseRect","modelFamily","scaleRatio","sectionRect","expandSearchArea","croppedResult","cropByRect","scaledResult","scaleImage","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","targetElementDescription","modelConfig","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","finalX","finalY","element","generateElementByPoint","res","callAIWithObjectResponse","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","generateElemen
tByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","searchAreaConfig","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandedRect","originalWidth","originalHeight","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","callAIFn","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AA6DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAEvB,eAAeE,sBAAsBC,OAI3C;IACC,MAAM,EAAEC,OAAO,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGH;IAC3C,MAAMI,aAAa;IACnB,MAAMC,cAAcC,iBAAiBJ,UAAUD,QAAQ,QAAQ;IAE/D,MAAMM,gBAAgB,MAAMC,WAC1BP,QAAQ,UAAU,CAAC,MAAM,EACzBI,aACAF,AAAgB,iBAAhBA;IAGF,MAAMM,eAAe,MAAMC,WAAWH,cAAc,WAAW,EAAEH;IACjEC,YAAY,KAAK,GAAGI,aAAa,KAAK;IACtCJ,YAAY,MAAM,GAAGI,aAAa,MAAM;IACxC,OAAO;QACL,MAAMJ;QACN,aAAaI,aAAa,WAAW;QACrC,OAAOL;IACT;AACF;AAEA,MAAMO,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBnB,OAMrC;IAUC,MAAM,EAAEC,OAAO,EAAEmB,wBAAwB,EAAEC,WAAW,EAAE,GAAGrB;IAC3D,MAAM,EAAEG,WAAW,EAAE,GAAGkB;IACxB,MAAMC,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElDsB,OACEH,0BACA;IAEF,MAAMI,+BAA+Bb,wBACnCS;IAEF,MAAMK,wBAAwBC,k
BAAkBF;IAChD,MAAMG,eAAeC,UAAUzB,eAC3B0B,uBAAuB1B,eACvB2B,4BAA4B3B;IAEhC,IAAI4B,eAAeT;IACnB,IAAIU,aAAa/B,QAAQ,QAAQ,CAAC,KAAK;IACvC,IAAIgC,cAAchC,QAAQ,QAAQ,CAAC,MAAM;IACzC,IAAIiC,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjC,QAAQ,YAAY,EAAE;QACxBuB,OACEvB,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFuB,OACEvB,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGF+B,eAAe/B,QAAQ,YAAY,CAAC,WAAW;QAC/CgC,aAAahC,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiC,cAAcjC,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkC,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAI9B,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMiC,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMrB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUzB,eACZ,CAAC,KAAK,EAAEsB,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOL,0BAAuC;QAChD,MAAMkB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQO,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAL,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAIV,UAAUzB,cAAc;QAC1B,MAAM,EAAE,SAASoC,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB1B,MAAMM,aAAa;YAChD,aAAarB,QAAQ,WAAW;QAClC;QAEFJ,aAAa,yBAAyB2C;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1C3C,aAAa,sBAAsB8C,OAAO,KAAK;QAC/C9C,aAAa,yBAAyB8C,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9D9C,aAAa,yBAAyBkD,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnC9C,aAAa,iCAAiC;gBAAEmD;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9CrC,aAAa,+BAA+B;gBAAEqD;gBAAQE;YAAO;YAG7D,IAAIC,SAASH;YACb,IAAII,SAASF;YACb,IAAInD,QAAQ,YAAY,EAAE,MAAM;gBAC9BoD,UAAUpD,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBACxCqD,UAAUrD,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YACzC;YAEA,MAAMsD,UAA+BC,uBACnC;gBAACH;gBAAQC;aAAO,EAChB7B;YAGFoB,UAAUU,QAAQ,IAAI;YACtB1D,aAAa,qBAAqBgD;YAElC,IAAIU,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMV;YACN,aAAa;gBACX,UAAUC;gBACV
C;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIc;IAKJ,IAAI;QACFA,MAAM,MAAMC,yBACV1C,MACAM,aACA;YAAE,aAAarB,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAAO0D,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACAtB;YACA,mBAAmBwB;QACrB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACT,IAAI,OAAO;IAE9C,IAAIZ;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYU,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBU,MAAM,OAAO,CAACV,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAZ,UAAUuB,gBACRX,IAAI,OAAO,CAAC,IAAI,EAChBxB,YACAC,aACAjC,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkC,oBACAC,qBACAhC,aACAH,QAAQ,YAAY,EAAE;YAGxBJ,aAAa,WAAWgD;YAExB,MAAMU,UAA+Bc,sBACnCxB,SACApB;YAEFsB,SAAS,EAAE;YAEX,IAAIQ,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;IACF,EAAE,OAAOe,GAAG;QACV,MAAMC,MACJD,aAAaT,QACT,CAAC,sBAAsB,EAAES,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACvB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEwB,IAAI,CAAC,CAAC;aAFtBxB,SAAS;YAACwB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAM1B;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAgB;QACA,OAAON,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAee,gBAAgBvE,OAKrC;IAQC,MAAM,EAAEC,OAAO,EAAEuE,kBAAkB,EAAEnD,WAAW,EAAE,GAAGrB;IACrD,MAAM,EAAEG,WAAW,EAAE,GAAGkB;IACxB,MAAMC,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM0B,eAAe8C,4BAA4BtE;IACjD,MAAMuE,gCAAgCC,0BACpChE,wBAAwB6D;IAE1B,MAAMzD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMoD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMlC,SAAS,MAAMzB,mBAAmB;YACtC,QAAQ2D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAzD,KAAK,IAAI,IAAIuB;IACf;IAEA,IAAIsC;IAGJ,IAAI;QACFA,SAAS,MAAMnB,yBACb1C,MACAM,aACA;YAAE,aAAarB,QAAQ,WAAW;QAAC;IAEvC,EAAE,OAA
O0D,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMnB,QACJkB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACAtB;QACF;IACF;IAEA,IAAIqC;IAGJ,MAAMC,cAAcF,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIE,aAAa;QACf,MAAMC,aAAaZ,gBACjBW,aACA7E,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBE;QAEFL,aAAa,0BAA0BiF;QAEvC,MAAMC,oBAAoBJ,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D9E,aAAa,wBAAwBkF;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAShB,MAAM,OAAO,CAACgB,OAC/B,GAAG,CAAC,CAACA,OACGf,gBACLe,MACAjF,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvB,GACA,GACAA,QAAQ,QAAQ,CAAC,KAAK,EACtBA,QAAQ,QAAQ,CAAC,MAAM,EACvBE;QAGNL,aAAa,qBAAqBmF;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DnF,aAAa,iBAAiBqF;QAE9B,MAAME,eAAe/E,iBAAiB6E,YAAYlF,QAAQ,QAAQ;QAClE,MAAMqF,gBAAgBD,aAAa,KAAK;QACxC,MAAME,iBAAiBF,aAAa,MAAM;QAC1CvF,aAAa,2BAA2BuF;QAExCR,mBAAmB,MAAM9E,sBAAsB;YAC7CE;YACA,UAAUkF;YACVhF;QACF;QAEAL,aACE,qDACAwF,eACAC,gBACAV,iBAAiB,IAAI,CAAC,KAAK,EAC3BA,iBAAiB,IAAI,CAAC,MAAM,EAC5BA,iBAAiB,KAAK;IAE1B;IAEA,OAAO;QACL,MAAMA,kBAAkB;QACxB,aAAaA,kBAAkB;QAC/B,OAAOA,kBAAkB;QACzB,OAAOD,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAeY,qBAAwBxF,OAO7C;IACC,MAAM,EAAEyF,SAAS,EAAExF,OAAO,EAAEyF,aAAa,EAAE5E,gBAAgB,EAAEO,WAAW,EAAE,GACxErB;IACF,MAAM2B,eAAegE;IACrB,MAAMrE,mBAAmBrB,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM2F,wBAAwBC,uBAC5B7F,QAAQ,eAAe,IAAI,IAC3ByF;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKxE;YACL,QAAQ;QACV;IACF;IAGFwE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM7E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAASmE;QACX;KACD;IAED,IAAIhF,kBAAkB;QACpB,MAAMwB,SAAS,MAAMzB,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM,EACJ,SAASwB,WAAW,EACpBtB,KAAK,EACLuD,iBA
AiB,EAClB,GAAG,MAAMC,OAAOjF,MAAMM;IAGvB,IAAI4E;IACJ,IAAI;QACFA,cAAcC,2BAA8BpC;IAC9C,EAAE,OAAOqC,YAAY;QAEnB,MAAMxC,eACJwC,sBAAsBvC,QAAQuC,WAAW,OAAO,GAAGtC,OAAOsC;QAC5D,MAAM,IAAIpC,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAtB;IAEJ;IAEA,OAAO;QACLyD;QACAnC;QACAtB;QACAuD;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBC,QAAwE,EACxEjF,WAAyB;IAKzB,MAAMM,eAAe4E;IACrB,MAAMC,aAAaC,0BAA0BJ;IAE7C,MAAMtF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASY;QAAa;QACxC;YACE,MAAM;YACN,SAAS6E;QACX;KACD;IAED,MAAM5B,SAAS,MAAM0B,SAASvF,MAAMM;IAEpC,OAAO;QACL,kBAAkBuD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -0,0 +1,233 @@
1
+ import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
2
+ import { getDebug } from "@midscene/shared/logger";
3
+ import { assert } from "@midscene/shared/utils";
4
+ import { buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "../common.mjs";
5
+ import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
6
+ import { extractXMLTag, parseMarkFinishedIndexes, parseSubGoalsFromXML } from "./prompt/util.mjs";
7
+ import { AIResponseParseError, callAI, safeParseJson } from "./service-caller/index.mjs";
8
+ const debug = getDebug('planning');
9
+ const warnLog = getDebug('planning', {
10
+ console: true
11
+ });
12
+ function parseXMLPlanningResponse(xmlString, modelFamily) {
13
+ const thought = extractXMLTag(xmlString, 'thought');
14
+ const memory = extractXMLTag(xmlString, 'memory');
15
+ const log = extractXMLTag(xmlString, 'log') || '';
16
+ const error = extractXMLTag(xmlString, 'error');
17
+ const actionType = extractXMLTag(xmlString, 'action-type');
18
+ const actionParamStr = extractXMLTag(xmlString, 'action-param-json');
19
+ const completeGoalRegex = /<complete\s+success="(true|false)">([\s\S]*?)<\/complete>/i;
20
+ const completeGoalMatch = xmlString.match(completeGoalRegex);
21
+ let finalizeMessage;
22
+ let finalizeSuccess;
23
+ if (completeGoalMatch) {
24
+ finalizeSuccess = 'true' === completeGoalMatch[1];
25
+ finalizeMessage = completeGoalMatch[2]?.trim() || void 0;
26
+ }
27
+ const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');
28
+ const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');
29
+ const updateSubGoals = updatePlanContent ? parseSubGoalsFromXML(updatePlanContent) : void 0;
30
+ const markFinishedIndexes = markSubGoalDone ? parseMarkFinishedIndexes(markSubGoalDone) : void 0;
31
+ let action = null;
32
+ if (actionType && 'null' !== actionType.toLowerCase()) {
33
+ const type = actionType.split('<')[0].trim();
34
+ let param;
35
+ if (actionParamStr) try {
36
+ param = safeParseJson(actionParamStr, modelFamily);
37
+ } catch (e) {
38
+ throw new Error(`Failed to parse action-param-json: ${e}`);
39
+ }
40
+ action = {
41
+ type,
42
+ ...void 0 !== param ? {
43
+ param
44
+ } : {}
45
+ };
46
+ }
47
+ return {
48
+ ...thought ? {
49
+ thought
50
+ } : {},
51
+ ...memory ? {
52
+ memory
53
+ } : {},
54
+ log,
55
+ ...error ? {
56
+ error
57
+ } : {},
58
+ action,
59
+ ...void 0 !== finalizeMessage ? {
60
+ finalizeMessage
61
+ } : {},
62
+ ...void 0 !== finalizeSuccess ? {
63
+ finalizeSuccess
64
+ } : {},
65
+ ...updateSubGoals?.length ? {
66
+ updateSubGoals
67
+ } : {},
68
+ ...markFinishedIndexes?.length ? {
69
+ markFinishedIndexes
70
+ } : {}
71
+ };
72
+ }
73
+ async function plan(userInstruction, opts) {
74
+ const { context, modelConfig, conversationHistory } = opts;
75
+ const { shotSize } = context;
76
+ const screenshotBase64 = context.screenshot.base64;
77
+ const { modelFamily } = modelConfig;
78
+ const includeSubGoals = true === opts.deepThink;
79
+ const systemPrompt = await systemPromptToTaskPlanning({
80
+ actionSpace: opts.actionSpace,
81
+ modelFamily,
82
+ includeBbox: opts.includeBbox,
83
+ includeThought: true,
84
+ includeSubGoals
85
+ });
86
+ let imagePayload = screenshotBase64;
87
+ let imageWidth = shotSize.width;
88
+ let imageHeight = shotSize.height;
89
+ if ('qwen2.5-vl' === modelFamily) {
90
+ const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
91
+ imageWidth = paddedResult.width;
92
+ imageHeight = paddedResult.height;
93
+ imagePayload = paddedResult.imageBase64;
94
+ }
95
+ const actionContext = opts.actionContext ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\n` : '';
96
+ const instruction = [
97
+ {
98
+ role: 'user',
99
+ content: [
100
+ {
101
+ type: 'text',
102
+ text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`
103
+ }
104
+ ]
105
+ }
106
+ ];
107
+ let latestFeedbackMessage;
108
+ const subGoalsText = includeSubGoals ? conversationHistory.subGoalsToText() : conversationHistory.historicalLogsToText();
109
+ const subGoalsSection = subGoalsText ? `\n\n${subGoalsText}` : '';
110
+ const memoriesText = conversationHistory.memoriesToText();
111
+ const memoriesSection = memoriesText ? `\n\n${memoriesText}` : '';
112
+ if (conversationHistory.pendingFeedbackMessage) {
113
+ latestFeedbackMessage = {
114
+ role: 'user',
115
+ content: [
116
+ {
117
+ type: 'text',
118
+ text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. Please continue according to the instruction.${memoriesSection}${subGoalsSection}`
119
+ },
120
+ {
121
+ type: 'image_url',
122
+ image_url: {
123
+ url: imagePayload,
124
+ detail: 'high'
125
+ }
126
+ }
127
+ ]
128
+ };
129
+ conversationHistory.resetPendingFeedbackMessageIfExists();
130
+ } else latestFeedbackMessage = {
131
+ role: 'user',
132
+ content: [
133
+ {
134
+ type: 'text',
135
+ text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`
136
+ },
137
+ {
138
+ type: 'image_url',
139
+ image_url: {
140
+ url: imagePayload,
141
+ detail: 'high'
142
+ }
143
+ }
144
+ ]
145
+ };
146
+ conversationHistory.append(latestFeedbackMessage);
147
+ conversationHistory.compressHistory(50, 20);
148
+ const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);
149
+ const msgs = [
150
+ {
151
+ role: 'system',
152
+ content: systemPrompt
153
+ },
154
+ ...instruction,
155
+ ...historyLog
156
+ ];
157
+ let { content: rawResponse, usage, reasoning_content } = await callAI(msgs, modelConfig, {
158
+ deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
159
+ abortSignal: opts.abortSignal
160
+ });
161
+ let planFromAI;
162
+ try {
163
+ try {
164
+ planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
165
+ } catch {
166
+ const retry = await callAI(msgs, modelConfig, {
167
+ deepThink: 'unset' === opts.deepThink ? void 0 : opts.deepThink,
168
+ abortSignal: opts.abortSignal
169
+ });
170
+ rawResponse = retry.content;
171
+ usage = retry.usage;
172
+ reasoning_content = retry.reasoning_content;
173
+ planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);
174
+ }
175
+ if (planFromAI.action && void 0 !== planFromAI.finalizeSuccess) {
176
+ warnLog('Planning response included both an action and <complete>; ignoring <complete> output.');
177
+ planFromAI.finalizeMessage = void 0;
178
+ planFromAI.finalizeSuccess = void 0;
179
+ }
180
+ const actions = planFromAI.action ? [
181
+ planFromAI.action
182
+ ] : [];
183
+ let shouldContinuePlanning = true;
184
+ if (void 0 !== planFromAI.finalizeSuccess) {
185
+ debug('task completed via <complete> tag, stop planning');
186
+ shouldContinuePlanning = false;
187
+ if (includeSubGoals) conversationHistory.markAllSubGoalsFinished();
188
+ }
189
+ const returnValue = {
190
+ ...planFromAI,
191
+ actions,
192
+ rawResponse,
193
+ usage,
194
+ reasoning_content,
195
+ yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),
196
+ shouldContinuePlanning
197
+ };
198
+ assert(planFromAI, "can't get plans from AI");
199
+ actions.forEach((action)=>{
200
+ const type = action.type;
201
+ const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
202
+ debug('actionInActionSpace matched', actionInActionSpace);
203
+ const locateFields = actionInActionSpace ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema) : [];
204
+ debug('locateFields', locateFields);
205
+ locateFields.forEach((field)=>{
206
+ const locateResult = action.param[field];
207
+ if (locateResult && void 0 !== modelFamily) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, modelFamily);
208
+ });
209
+ });
210
+ if (includeSubGoals) {
211
+ if (planFromAI.updateSubGoals?.length) conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);
212
+ if (planFromAI.markFinishedIndexes?.length) for (const index of planFromAI.markFinishedIndexes)conversationHistory.markSubGoalFinished(index);
213
+ if (planFromAI.log) conversationHistory.appendSubGoalLog(planFromAI.log);
214
+ } else if (planFromAI.log) conversationHistory.appendHistoricalLog(planFromAI.log);
215
+ if (planFromAI.memory) conversationHistory.appendMemory(planFromAI.memory);
216
+ conversationHistory.append({
217
+ role: 'assistant',
218
+ content: [
219
+ {
220
+ type: 'text',
221
+ text: rawResponse
222
+ }
223
+ ]
224
+ });
225
+ return returnValue;
226
+ } catch (parseError) {
227
+ const errorMessage = parseError instanceof Error ? parseError.message : String(parseError);
228
+ throw new AIResponseParseError(`XML parse error: ${errorMessage}`, rawResponse, usage);
229
+ }
230
+ }
231
+ export { parseXMLPlanningResponse, plan };
232
+
233
+ //# sourceMappingURL=llm-planning.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\nconst warnLog = getDebug('planning', { console: true });\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const memory = extractXMLTag(xmlString, 'memory');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse <complete> tag with success attribute\n const completeGoalRegex =\n /<complete\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | 
undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n // Strip any trailing XML tags that LLM might have leaked into the action type\n // e.g. \"KeyboardPress</action-type>\\n<action-param-json>\" -> \"KeyboardPress\"\n const type = actionType.split('<')[0].trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(memory ? { memory } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? 
{ markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n abortSignal?: AbortSignal;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { shotSize } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when deepThink is true\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = shotSize.width;\n let imageHeight = shotSize.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? 
`<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message\n // In deepThink mode: show full sub-goals with logs\n // In non-deepThink mode: show historical execution logs\n const subGoalsText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : conversationHistory.historicalLogsToText();\n const subGoalsSection = subGoalsText ? `\\n\\n${subGoalsText}` : '';\n\n // Build memories text to include in the message\n const memoriesText = conversationHistory.memoriesToText();\n const memoriesSection = memoriesText ? `\\n\\n${memoriesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The previous action has been executed, here is the latest screenshot. 
Please continue according to the instruction.${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${memoriesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n let {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n abortSignal: opts.abortSignal,\n });\n\n // Parse XML response to JSON object, retry once on parse failure\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n } catch {\n const retry = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n abortSignal: opts.abortSignal,\n });\n rawResponse = retry.content;\n usage = retry.usage;\n reasoning_content = retry.reasoning_content;\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n warnLog(\n 'Planning response included both an action and <complete>; ignoring <complete> output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? 
[planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if task is completed via <complete> tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('task completed via <complete> tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history based on response (only when deepThink is enabled)\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.mergeSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n // Append the planning log to the currently running sub-goal\n if (planFromAI.log) {\n conversationHistory.appendSubGoalLog(planFromAI.log);\n }\n } else {\n // In non-deepThink mode, accumulate logs as historical 
execution steps\n if (planFromAI.log) {\n conversationHistory.appendHistoricalLog(planFromAI.log);\n }\n }\n\n // Append memory to conversation history if present\n if (planFromAI.memory) {\n conversationHistory.appendMemory(planFromAI.memory);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n}\n"],"names":["debug","getDebug","warnLog","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","memory","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","shotSize","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","subGoalsText","subGoalsSection","memoriesText","memoriesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","retry","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index","parseError","errorMessage","String","AIResponseParseError"],"mappings":";;;;;;;AA+BA,MAAMA,QAAQC,SAAS;AACvB,MAAMC,UAAUD,SAAS,YAAY;IAAE,SAAS;AAAK;AAK9C,SAASE,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UA
AUC,cAAcH,WAAW;IACzC,MAAMI,SAASD,cAAcH,WAAW;IACxC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QAGrD,MAAMc,OAAOd,WAAW,KAAK,CAAC,IAAI,CAAC,EAAE,CAAC,IAAI;QAC1C,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,cAAcf,gBAAgBP;QACxC,EAAE,OAAOuB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,SAAS;YAAEA;QAAO,IAAI,CAAC,CAAC;QAC5BC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAWC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,QAAQ,EAAE,GAAGH;IACrB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,SAAS,KAAK;IAC/B,IAAIO,cAAcP,SAAS,MAAM;IAKjC,IAAI/B,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMuC,eAAe,MAAMC,4BAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBd,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMe,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEf,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;Q
ACH;KACD;IAED,IAAIiB;IAKJ,MAAMC,eAAeX,kBACjBH,oBAAoB,cAAc,KAClCA,oBAAoB,oBAAoB;IAC5C,MAAMe,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAG/D,MAAME,eAAehB,oBAAoB,cAAc;IACvD,MAAMiB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAE/D,IAAIhB,oBAAoB,sBAAsB,EAAE;QAC9Ca,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGb,oBAAoB,sBAAsB,CAAC,qHAAqH,EAAEiB,kBAAkBF,iBAAiB;gBAChN;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEa,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,kBAAkBF,iBAAiB;YAC3E;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACa;IAG3Bb,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMkB,aAAalB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMsB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,IAAI,EACF,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMpB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;QAClE,aAAaA,KAAK,WAAW;IAC/B;IAGA,IAAI2B;IACJ,IAAI;QACF,IAAI;YACFA,aAAaxD,yBAAyBoD,aAAalD;QACrD,EAAE,OAAM;YACN,MAAMuD,QAAQ,MAAMF,OAAOJ,MAAMpB,aAAa;gBAC5C,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;gBAClE,aAAaA,KAAK,WAAW;YAC/B;YACAuB,cAAcK,MAAM,OAAO;YAC3BJ,QAAQI,MAAM,KAAK;YACnBH,oBAAoBG,MAAM,iBAAiB;YAC3CD,aAAaxD,yBAAyBoD,aAAalD;QACrD;QAEA,IAAIsD,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YACjEzD,QACE;YAEFyD,WAAW,eAAe,GAAG1C;YAC7B0C,WAAW,eAAe,GAAG1C;QAC/B;QAEA,MAAM4C,UAAUF,WAAW,MAAM,GAAG;YAACA,WAAW,MAAM;SAAC,GAAG,EAAE;QAC5D,IAAIG,yBAAyB;QAG7B,IAAIH,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;YAC5C3D,MAAM;YACN8D,yBAAyB;YAEzB,IAAIxB,iBACFH,oBAAoB,uBAAuB;QAE/C;QAEA,MAAM4B,cAAkC;YACtC,GAAGJ,UAAU;YACbE;YACAN;YACAC;YACAC;YACA,UAAUO,uBAAuBH,SAAS7B,KAAK,WAAW;YAC1D8B;QACF;QAEAG,OAAON,YAAY;QAEnBE,QAAQ,OAAO,CAAC,CAACrC;YACf,MAAMC,OAAOD,OAAO,IAAI;YACxB,MAAM0C,sBAAsBlC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;YAG9BzB,MAAM,+BAA+BkE;YACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;YAENlE,MAAM,gBAAgBmE;YAEtBA,aAAa,OAAO,CAAC,CAA
CE;gBACpB,MAAMC,eAAe9C,OAAO,KAAK,CAAC6C,MAAM;gBACxC,IAAIC,gBAAgBjE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAAC6C,MAAM,GAAGE,cACpBD,cACA5B,YACAC,aACAtC;YAGN;QACF;QAGA,IAAIiC,iBAAiB;YACnB,IAAIqB,WAAW,cAAc,EAAE,QAC7BxB,oBAAoB,aAAa,CAACwB,WAAW,cAAc;YAE7D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMa,SAASb,WAAW,mBAAmB,CAChDxB,oBAAoB,mBAAmB,CAACqC;YAI5C,IAAIb,WAAW,GAAG,EAChBxB,oBAAoB,gBAAgB,CAACwB,WAAW,GAAG;QAEvD,OAEE,IAAIA,WAAW,GAAG,EAChBxB,oBAAoB,mBAAmB,CAACwB,WAAW,GAAG;QAK1D,IAAIA,WAAW,MAAM,EACnBxB,oBAAoB,YAAY,CAACwB,WAAW,MAAM;QAGpDxB,oBAAoB,MAAM,CAAC;YACzB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAMoB;gBACR;aACD;QACH;QAEA,OAAOQ;IACT,EAAE,OAAOU,YAAY;QAEnB,MAAMC,eACJD,sBAAsB5C,QAAQ4C,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCnB,aACAC;IAEJ;AACF"}
@@ -0,0 +1,7 @@
1
/**
 * Human-readable description of the bounding-box format a model family is
 * expected to emit. Gemini reports a `box_2d` box normalized to 0-1000 in
 * [ymin, xmin, ymax, xmax] order; every other family uses a plain
 * [xmin, ymin, xmax, ymax] box.
 * @param modelFamily model family identifier, e.g. 'gemini' (may be undefined).
 * @returns prompt snippet describing the expected bbox layout.
 */
function bboxDescription(modelFamily) {
    return modelFamily === 'gemini'
        ? 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.'
        : '2d bounding box as [xmin, ymin, xmax, ymax]';
}
export { bboxDescription };

//# sourceMappingURL=common.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/prompt/common.mjs","sources":["../../../../src/ai-model/prompt/common.ts"],"sourcesContent":["import type { TModelFamily } from '@midscene/shared/env';\nexport function bboxDescription(modelFamily: TModelFamily | undefined) {\n if (modelFamily === 'gemini') {\n return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';\n }\n return '2d bounding box as [xmin, ymin, xmax, ymax]';\n}\n"],"names":["bboxDescription","modelFamily"],"mappings":"AACO,SAASA,gBAAgBC,WAAqC;IACnE,IAAIA,AAAgB,aAAhBA,aACF,OAAO;IAET,OAAO;AACT"}
@@ -0,0 +1,66 @@
1
+ import { getPreferredLanguage } from "@midscene/shared/env";
2
// Canned example element descriptions shown to the model, keyed by the
// output language. The quoted strings are emitted verbatim into the prompt.
const examplesMap = {
    Chinese: [
        '"登录表单中的"登录"按钮"',
        '"搜索输入框,placeholder 为"请输入关键词""',
        '"顶部导航栏中文字为"首页"的链接"',
        '"联系表单中的提交按钮"',
        '"aria-label 为"打开菜单"的菜单图标"'
    ],
    English: [
        '"Login button with text \'Sign In\'"',
        '"Search input with placeholder \'Enter keywords\'"',
        '"Navigation link with text \'Home\' in header"',
        '"Submit button in contact form"',
        '"Menu icon with aria-label \'Open menu\'"'
    ]
};
// Render the examples for `language` as a "- item" bullet list, one per
// line. Languages without a dedicated example set fall back to English.
const getExamples = (language)=>{
    const pool = examplesMap[language] || examplesMap.English;
    const bullets = [];
    for (const example of pool) {
        bullets.push(`- ${example}`);
    }
    return bullets.join('\n');
};
22
// Builds the system prompt asking the model to describe the element inside
// the red rectangle of a screenshot. The answer language follows the user's
// preference (getPreferredLanguage below) and the EXAMPLES section is picked
// to match via getExamples. The model is expected to answer with JSON of the
// shape { description, error? }.
// NOTE(review): this is compiled dist output rendered through a diff viewer;
// leading indentation inside the template literal may be collapsed here.
// Edit src/ai-model/prompt/describe.ts (see sourceMappingURL), not this file.
+ const elementDescriberInstruction = ()=>{
23
// Language the description must be written in (interpolated twice below).
+ const preferredLanguage = getPreferredLanguage();
24
+ return `
25
+ Describe the element in the red rectangle for precise identification.
26
+
27
+ IMPORTANT: You MUST write the description in ${preferredLanguage}.
28
+
29
+ CRITICAL REQUIREMENTS:
30
+ 1. UNIQUENESS: The description must uniquely identify this element on the current page
31
+ 2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts
32
+ 3. PRECISION: Be specific enough to distinguish from similar elements
33
+
34
+ DESCRIPTION STRUCTURE:
35
+ 1. Element type (button, input, link, div, etc.)
36
+ 2. Primary identifier (in order of preference):
37
+ - Unique text content: "with text 'Login'"
38
+ - Unique attribute: "with aria-label 'Search'"
39
+ - Unique class/ID: "with class 'primary-button'"
40
+ - Unique position: "in header navigation"
41
+ 3. Secondary identifiers (if needed for uniqueness):
42
+ - Visual features: "blue background", "with icon"
43
+ - Relative position: "below search bar", "in sidebar"
44
+ - Parent context: "in login form", "in main menu"
45
+
46
+ GUIDELINES:
47
+ - Keep description under 25 words
48
+ - Prioritize semantic identifiers over visual ones
49
+ - Use consistent terminology across similar elements
50
+ - Avoid page-specific or temporary content
51
+ - Don't mention the red rectangle or selection box
52
+ - Focus on stable, reusable characteristics
53
+ - **Write the description in ${preferredLanguage}**
54
+
55
+ EXAMPLES:
56
+ ${getExamples(preferredLanguage)}
57
+
58
+ Return JSON:
59
+ {
60
+ "description": "unique element identifier",
61
+ "error"?: "error message if any"
62
+ }`;
63
+ };
64
+ export { elementDescriberInstruction };
65
+
66
+ //# sourceMappingURL=describe.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/prompt/describe.mjs","sources":["../../../../src/ai-model/prompt/describe.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\nconst examplesMap: Record<string, string[]> = {\n Chinese: [\n '\"登录表单中的\"登录\"按钮\"',\n '\"搜索输入框,placeholder 为\"请输入关键词\"\"',\n '\"顶部导航栏中文字为\"首页\"的链接\"',\n '\"联系表单中的提交按钮\"',\n '\"aria-label 为\"打开菜单\"的菜单图标\"',\n ],\n English: [\n '\"Login button with text \\'Sign In\\'\"',\n '\"Search input with placeholder \\'Enter keywords\\'\"',\n '\"Navigation link with text \\'Home\\' in header\"',\n '\"Submit button in contact form\"',\n '\"Menu icon with aria-label \\'Open menu\\'\"',\n ],\n};\n\nconst getExamples = (language: string) => {\n const examples = examplesMap[language] || examplesMap.English;\n return examples.map((e) => `- ${e}`).join('\\n');\n};\n\nexport const elementDescriberInstruction = () => {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nDescribe the element in the red rectangle for precise identification.\n\nIMPORTANT: You MUST write the description in ${preferredLanguage}.\n\nCRITICAL REQUIREMENTS:\n1. UNIQUENESS: The description must uniquely identify this element on the current page\n2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts\n3. PRECISION: Be specific enough to distinguish from similar elements\n\nDESCRIPTION STRUCTURE:\n1. Element type (button, input, link, div, etc.)\n2. Primary identifier (in order of preference):\n - Unique text content: \"with text 'Login'\"\n - Unique attribute: \"with aria-label 'Search'\"\n - Unique class/ID: \"with class 'primary-button'\"\n - Unique position: \"in header navigation\"\n3. 
Secondary identifiers (if needed for uniqueness):\n - Visual features: \"blue background\", \"with icon\"\n - Relative position: \"below search bar\", \"in sidebar\"\n - Parent context: \"in login form\", \"in main menu\"\n\nGUIDELINES:\n- Keep description under 25 words\n- Prioritize semantic identifiers over visual ones\n- Use consistent terminology across similar elements\n- Avoid page-specific or temporary content\n- Don't mention the red rectangle or selection box\n- Focus on stable, reusable characteristics\n- **Write the description in ${preferredLanguage}**\n\nEXAMPLES:\n${getExamples(preferredLanguage)}\n\nReturn JSON:\n{\n \"description\": \"unique element identifier\",\n \"error\"?: \"error message if any\"\n}`;\n};\n"],"names":["examplesMap","getExamples","language","examples","e","elementDescriberInstruction","preferredLanguage","getPreferredLanguage"],"mappings":";AAEA,MAAMA,cAAwC;IAC5C,SAAS;QACP;QACA;QACA;QACA;QACA;KACD;IACD,SAAS;QACP;QACA;QACA;QACA;QACA;KACD;AACH;AAEA,MAAMC,cAAc,CAACC;IACnB,MAAMC,WAAWH,WAAW,CAACE,SAAS,IAAIF,YAAY,OAAO;IAC7D,OAAOG,SAAS,GAAG,CAAC,CAACC,IAAM,CAAC,EAAE,EAAEA,GAAG,EAAE,IAAI,CAAC;AAC5C;AAEO,MAAMC,8BAA8B;IACzC,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;6CAGmC,EAAED,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;6BA0BpC,EAAEA,kBAAkB;;;AAGjD,EAAEL,YAAYK,mBAAmB;;;;;;CAMhC,CAAC;AACF"}
@@ -0,0 +1,169 @@
1
+ import { getPreferredLanguage } from "@midscene/shared/env";
2
+ import { safeParseJson } from "../service-caller/index.mjs";
3
+ import { extractXMLTag } from "./util.mjs";
4
/**
 * Parse the XML-style reply produced by the extraction prompt (see
 * systemPromptToExtract) into a structured response object.
 *
 * Recognized sections: <thought> (optional free text), <data-json>
 * (required JSON payload) and <errors> (optional JSON array of messages).
 *
 * @param xmlString raw model output.
 * @returns object with keys, in order: `thought?`, `data`, `errors?`.
 * @throws Error when <data-json> is absent or cannot be parsed as JSON.
 */
function parseXMLExtractionResponse(xmlString) {
    const thought = extractXMLTag(xmlString, 'thought');
    const dataJsonStr = extractXMLTag(xmlString, 'data-json');
    const errorsStr = extractXMLTag(xmlString, 'errors');

    // <data-json> is the only mandatory section.
    if (!dataJsonStr) throw new Error('Missing required field: data-json');

    let data;
    try {
        data = safeParseJson(dataJsonStr, void 0);
    } catch (parseError) {
        throw new Error(`Failed to parse data-json: ${parseError}`);
    }

    // <errors> is best-effort: anything that is not a JSON array is ignored.
    let errors;
    if (errorsStr) {
        try {
            const maybeErrors = safeParseJson(errorsStr, void 0);
            if (Array.isArray(maybeErrors)) errors = maybeErrors;
        } catch (_ignored) {
            // malformed <errors> content is deliberately swallowed
        }
    }

    // Assemble keys in the same order as the original spread-based literal.
    return Object.assign(
        {},
        thought ? { thought } : null,
        { data },
        errors && errors.length > 0 ? { errors } : null
    );
}
30
// System prompt for screenshot-based data extraction. The model receives a
// screenshot plus a <DATA_DEMAND> block and must answer in an XML-ish format
// with <thought>, <data-json> (required) and optional <errors> sections —
// exactly the sections consumed by parseXMLExtractionResponse above. The
// user's preferred language (getPreferredLanguage) is interpolated into the
// <thought> formatting rule; examples and strict-Boolean rules are prompt text.
// NOTE(review): compiled dist output rendered through a diff viewer; template
// indentation may be collapsed here. Edit src/ai-model/prompt/extraction.ts
// (see sourceMappingURL), not this file.
+ function systemPromptToExtract() {
31
+ const preferredLanguage = getPreferredLanguage();
32
+ return `
33
+ You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
34
+
35
+ The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.
36
+
37
+ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
38
+
39
+ If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
40
+
41
+ CRITICAL RULES FOR BOOLEAN EXTRACTION:
42
+ - When extracting Boolean values, ONLY return true if you can CLEARLY and VISIBLY confirm the condition from the screenshot
43
+ - DO NOT make assumptions, guesses, or inferences based on what "should" be there or typical UI patterns
44
+ - If the requested element is not clearly visible, not present, or ambiguous in the screenshot, return false
45
+ - For queries asking if something "exists", "is present", or "appears" on the screen: be extremely strict and only return true when there is direct visual evidence
46
+ - Avoid confirmation bias: if you're not 100% certain the element exists based on the screenshot alone, return false
47
+ - Partial matches, similar-looking elements, or elements that might be "loaded" but not visible should result in false
48
+
49
+
50
+ Return in the following XML format:
51
+ <thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>
52
+ <data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>
53
+ <errors>optional error messages as JSON array, e.g., ["error1", "error2"]</errors>
54
+
55
+ # Example 1
56
+ For example, if the DATA_DEMAND is:
57
+
58
+ <DATA_DEMAND>
59
+ {
60
+ "name": "name shows on the left panel, string",
61
+ "age": "age shows on the right panel, number",
62
+ "isAdmin": "if the user is admin, boolean"
63
+ }
64
+ </DATA_DEMAND>
65
+
66
+ By viewing the screenshot and page contents, you can extract the following data:
67
+
68
+ <thought>According to the screenshot, i can see ...</thought>
69
+ <data-json>
70
+ {
71
+ "name": "John",
72
+ "age": 30,
73
+ "isAdmin": true
74
+ }
75
+ </data-json>
76
+
77
+ # Example 2
78
+ If the DATA_DEMAND is:
79
+
80
+ <DATA_DEMAND>
81
+ the todo items list, string[]
82
+ </DATA_DEMAND>
83
+
84
+ By viewing the screenshot and page contents, you can extract the following data:
85
+
86
+ <thought>According to the screenshot, i can see ...</thought>
87
+ <data-json>
88
+ ["todo 1", "todo 2", "todo 3"]
89
+ </data-json>
90
+
91
+ # Example 3
92
+ If the DATA_DEMAND is:
93
+
94
+ <DATA_DEMAND>
95
+ the page title, string
96
+ </DATA_DEMAND>
97
+
98
+ By viewing the screenshot and page contents, you can extract the following data:
99
+
100
+ <thought>According to the screenshot, i can see ...</thought>
101
+ <data-json>
102
+ "todo list"
103
+ </data-json>
104
+
105
+ # Example 4
106
+ If the DATA_DEMAND is:
107
+
108
+ <DATA_DEMAND>
109
+ {
110
+ "result": "Boolean, is it currently the SMS page?"
111
+ }
112
+ </DATA_DEMAND>
113
+
114
+ By viewing the screenshot and page contents, you can extract the following data:
115
+
116
+ <thought>According to the screenshot, i can see the SMS page title "Messages" and the list of conversations. Therefore, it is indeed the SMS page.</thought>
117
+ <data-json>
118
+ { "result": true }
119
+ </data-json>
120
+
121
+ # Example 5 - Strict Boolean Check
122
+ If the DATA_DEMAND is:
123
+
124
+ <DATA_DEMAND>
125
+ {
126
+ "hasNotification": "Boolean, is there a notification bell icon with a red badge showing unread count?"
127
+ }
128
+ </DATA_DEMAND>
129
+
130
+ By viewing the screenshot and page contents, you can extract the following data:
131
+
132
+ <thought>I can see a bell icon in the top right corner, but there is no red badge or number indicating unread notifications. The bell appears plain without any notification indicator.</thought>
133
+ <data-json>
134
+ { "hasNotification": false }
135
+ </data-json>
136
+
137
+ # Example 6 - When Element is Not Present
138
+ If the DATA_DEMAND is:
139
+
140
+ <DATA_DEMAND>
141
+ {
142
+ "isLoggedIn": "Boolean, is the user logged in?"
143
+ }
144
+ </DATA_DEMAND>
145
+
146
+ By viewing the screenshot and page contents, you can extract the following data:
147
+
148
+ <thought>The screenshot shows a login form with username and password fields, along with a "Sign In" button. There is no user profile, avatar, or logged-in state visible. The user is clearly not logged in.</thought>
149
+ <data-json>
150
+ { "isLoggedIn": false }
151
+ </data-json>
152
+ `;
153
+ }
154
// Builds the user-turn prompt for extraction: the page description and the
// caller's data demand wrapped in <PageDescription> / <DATA_DEMAND> tags
// (consumed together with systemPromptToExtract above).
// @param pageDescription textual description of the current page/screenshot.
// @param dataQuery plain string demand, or a key -> description record.
+ const extractDataQueryPrompt = (pageDescription, dataQuery)=>{
155
+ let dataQueryText = '';
156
// String demands pass through verbatim; object demands are pretty-printed
// as 2-space-indented JSON before interpolation into the template.
+ dataQueryText = 'string' == typeof dataQuery ? dataQuery : JSON.stringify(dataQuery, null, 2);
157
+ return `
158
+ <PageDescription>
159
+ ${pageDescription}
160
+ </PageDescription>
161
+
162
+ <DATA_DEMAND>
163
+ ${dataQueryText}
164
+ </DATA_DEMAND>
165
+ `;
166
+ };
167
+ export { extractDataQueryPrompt, parseXMLExtractionResponse, systemPromptToExtract };
168
+
169
+ //# sourceMappingURL=extraction.mjs.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"ai-model/prompt/extraction.mjs","sources":["../../../../src/ai-model/prompt/extraction.ts"],"sourcesContent":["import type { AIDataExtractionResponse } from '@/types';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { safeParseJson } from '../service-caller/index';\nimport { extractXMLTag } from './util';\n\n/**\n * Parse XML response from LLM and convert to AIDataExtractionResponse\n */\nexport function parseXMLExtractionResponse<T>(\n xmlString: string,\n): AIDataExtractionResponse<T> {\n const thought = extractXMLTag(xmlString, 'thought');\n const dataJsonStr = extractXMLTag(xmlString, 'data-json');\n const errorsStr = extractXMLTag(xmlString, 'errors');\n\n // Parse data-json (required)\n if (!dataJsonStr) {\n throw new Error('Missing required field: data-json');\n }\n\n let data: T;\n try {\n data = safeParseJson(dataJsonStr, undefined) as T;\n } catch (e) {\n throw new Error(`Failed to parse data-json: ${e}`);\n }\n\n // Parse errors (optional)\n let errors: string[] | undefined;\n if (errorsStr) {\n try {\n const parsedErrors = safeParseJson(errorsStr, undefined);\n if (Array.isArray(parsedErrors)) {\n errors = parsedErrors;\n }\n } catch (e) {\n // If errors parsing fails, just ignore it\n }\n }\n\n return {\n ...(thought ? { thought } : {}),\n data,\n ...(errors && errors.length > 0 ? { errors } : {}),\n };\n}\n\nexport function systemPromptToExtract() {\n const preferredLanguage = getPreferredLanguage();\n\n return `\nYou are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.\n\nThe user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. 
You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.\n\nIf a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.\n\nIf the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.\n\nCRITICAL RULES FOR BOOLEAN EXTRACTION:\n- When extracting Boolean values, ONLY return true if you can CLEARLY and VISIBLY confirm the condition from the screenshot\n- DO NOT make assumptions, guesses, or inferences based on what \"should\" be there or typical UI patterns\n- If the requested element is not clearly visible, not present, or ambiguous in the screenshot, return false\n- For queries asking if something \"exists\", \"is present\", or \"appears\" on the screen: be extremely strict and only return true when there is direct visual evidence\n- Avoid confirmation bias: if you're not 100% certain the element exists based on the screenshot alone, return false\n- Partial matches, similar-looking elements, or elements that might be \"loaded\" but not visible should result in false\n\n\nReturn in the following XML format:\n<thought>the thinking process of the extraction, less than 300 words. Use ${preferredLanguage} in this field.</thought>\n<data-json>the extracted data as JSON. Make sure both the value and scheme meet the DATA_DEMAND. 
If you want to write some description in this field, use the same language as the DATA_DEMAND.</data-json>\n<errors>optional error messages as JSON array, e.g., [\"error1\", \"error2\"]</errors>\n\n# Example 1\nFor example, if the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"name\": \"name shows on the left panel, string\",\n \"age\": \"age shows on the right panel, number\",\n \"isAdmin\": \"if the user is admin, boolean\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n{\n \"name\": \"John\",\n \"age\": 30,\n \"isAdmin\": true\n}\n</data-json>\n\n# Example 2\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe todo items list, string[]\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n[\"todo 1\", \"todo 2\", \"todo 3\"]\n</data-json>\n\n# Example 3\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe page title, string\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see ...</thought>\n<data-json>\n\"todo list\"\n</data-json>\n\n# Example 4\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"result\": \"Boolean, is it currently the SMS page?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>According to the screenshot, i can see the SMS page title \"Messages\" and the list of conversations. 
Therefore, it is indeed the SMS page.</thought>\n<data-json>\n{ \"result\": true }\n</data-json>\n\n# Example 5 - Strict Boolean Check\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"hasNotification\": \"Boolean, is there a notification bell icon with a red badge showing unread count?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>I can see a bell icon in the top right corner, but there is no red badge or number indicating unread notifications. The bell appears plain without any notification indicator.</thought>\n<data-json>\n{ \"hasNotification\": false }\n</data-json>\n\n# Example 6 - When Element is Not Present\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"isLoggedIn\": \"Boolean, is the user logged in?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n<thought>The screenshot shows a login form with username and password fields, along with a \"Sign In\" button. There is no user profile, avatar, or logged-in state visible. 
The user is clearly not logged in.</thought>\n<data-json>\n{ \"isLoggedIn\": false }\n</data-json>\n`;\n}\n\nexport const extractDataQueryPrompt = (\n pageDescription: string,\n dataQuery: string | Record<string, string>,\n) => {\n let dataQueryText = '';\n if (typeof dataQuery === 'string') {\n dataQueryText = dataQuery;\n } else {\n dataQueryText = JSON.stringify(dataQuery, null, 2);\n }\n\n return `\n<PageDescription>\n${pageDescription}\n</PageDescription>\n\n<DATA_DEMAND>\n${dataQueryText}\n</DATA_DEMAND>\n `;\n};\n"],"names":["parseXMLExtractionResponse","xmlString","thought","extractXMLTag","dataJsonStr","errorsStr","Error","data","safeParseJson","undefined","e","errors","parsedErrors","Array","systemPromptToExtract","preferredLanguage","getPreferredLanguage","extractDataQueryPrompt","pageDescription","dataQuery","dataQueryText","JSON"],"mappings":";;;AASO,SAASA,2BACdC,SAAiB;IAEjB,MAAMC,UAAUC,cAAcF,WAAW;IACzC,MAAMG,cAAcD,cAAcF,WAAW;IAC7C,MAAMI,YAAYF,cAAcF,WAAW;IAG3C,IAAI,CAACG,aACH,MAAM,IAAIE,MAAM;IAGlB,IAAIC;IACJ,IAAI;QACFA,OAAOC,cAAcJ,aAAaK;IACpC,EAAE,OAAOC,GAAG;QACV,MAAM,IAAIJ,MAAM,CAAC,2BAA2B,EAAEI,GAAG;IACnD;IAGA,IAAIC;IACJ,IAAIN,WACF,IAAI;QACF,MAAMO,eAAeJ,cAAcH,WAAWI;QAC9C,IAAII,MAAM,OAAO,CAACD,eAChBD,SAASC;IAEb,EAAE,OAAOF,GAAG,CAEZ;IAGF,OAAO;QACL,GAAIR,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9BK;QACA,GAAII,UAAUA,OAAO,MAAM,GAAG,IAAI;YAAEA;QAAO,IAAI,CAAC,CAAC;IACnD;AACF;AAEO,SAASG;IACd,MAAMC,oBAAoBC;IAE1B,OAAO,CAAC;;;;;;;;;;;;;;;;;;;0EAmBgE,EAAED,kBAAkB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAqG9F,CAAC;AACD;AAEO,MAAME,yBAAyB,CACpCC,iBACAC;IAEA,IAAIC,gBAAgB;IAElBA,gBADE,AAAqB,YAArB,OAAOD,YACOA,YAEAE,KAAK,SAAS,CAACF,WAAW,MAAM;IAGlD,OAAO,CAAC;;AAEV,EAAED,gBAAgB;;;;AAIlB,EAAEE,cAAc;;EAEd,CAAC;AACH"}