@midscene/core 0.26.7-beta-20250818081955.0 → 0.26.7-beta-20250820150415.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (65) hide show
  1. package/dist/es/ai-model/common.mjs +130 -22
  2. package/dist/es/ai-model/common.mjs.map +1 -1
  3. package/dist/es/ai-model/index.mjs +3 -3
  4. package/dist/es/ai-model/inspect.mjs +28 -16
  5. package/dist/es/ai-model/inspect.mjs.map +1 -1
  6. package/dist/es/ai-model/llm-planning.mjs +25 -23
  7. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  8. package/dist/es/ai-model/prompt/llm-planning.mjs +69 -23
  9. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
  11. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  12. package/dist/es/ai-model/prompt/util.mjs +2 -2
  13. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  14. package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
  15. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  16. package/dist/es/ai-model/service-caller/index.mjs +72 -118
  17. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  18. package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
  19. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  20. package/dist/es/index.mjs +3 -2
  21. package/dist/es/index.mjs.map +1 -1
  22. package/dist/es/insight/index.mjs +13 -61
  23. package/dist/es/insight/index.mjs.map +1 -1
  24. package/dist/es/types.mjs.map +1 -1
  25. package/dist/es/utils.mjs +5 -6
  26. package/dist/es/utils.mjs.map +1 -1
  27. package/dist/lib/ai-model/common.js +166 -28
  28. package/dist/lib/ai-model/common.js.map +1 -1
  29. package/dist/lib/ai-model/index.js +31 -10
  30. package/dist/lib/ai-model/inspect.js +27 -15
  31. package/dist/lib/ai-model/inspect.js.map +1 -1
  32. package/dist/lib/ai-model/llm-planning.js +24 -22
  33. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  34. package/dist/lib/ai-model/prompt/llm-planning.js +71 -25
  35. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  36. package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
  37. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  38. package/dist/lib/ai-model/prompt/util.js +2 -2
  39. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  40. package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
  41. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  42. package/dist/lib/ai-model/service-caller/index.js +75 -124
  43. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  44. package/dist/lib/ai-model/ui-tars-planning.js +5 -5
  45. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  46. package/dist/lib/index.js +25 -3
  47. package/dist/lib/index.js.map +1 -1
  48. package/dist/lib/insight/index.js +10 -58
  49. package/dist/lib/insight/index.js.map +1 -1
  50. package/dist/lib/types.js.map +1 -1
  51. package/dist/lib/utils.js +4 -5
  52. package/dist/lib/utils.js.map +1 -1
  53. package/dist/types/ai-model/common.d.ts +517 -8
  54. package/dist/types/ai-model/index.d.ts +2 -2
  55. package/dist/types/ai-model/inspect.d.ts +4 -1
  56. package/dist/types/ai-model/llm-planning.d.ts +1 -1
  57. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
  58. package/dist/types/ai-model/prompt/util.d.ts +2 -1
  59. package/dist/types/ai-model/service-caller/index.d.ts +6 -6
  60. package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
  61. package/dist/types/index.d.ts +3 -1
  62. package/dist/types/insight/index.d.ts +2 -4
  63. package/dist/types/types.d.ts +9 -31
  64. package/dist/types/yaml.d.ts +6 -2
  65. package/package.json +4 -3
@@ -87,20 +87,23 @@ const promptsToChatParam = async (multimodalPrompt)=>{
87
87
  async function AiLocateElement(options) {
88
88
  const { context, targetElementDescription, callAI } = options;
89
89
  const { screenshotBase64 } = context;
90
- const { description, elementById, insertElementByPosition } = await (0, util_js_namespaceObject.describeUserPage)(context);
90
+ const modelPreferences = {
91
+ intent: 'grounding'
92
+ };
93
+ const { description, elementById, insertElementByPosition } = await (0, util_js_namespaceObject.describeUserPage)(context, modelPreferences);
91
94
  (0, utils_namespaceObject.assert)(targetElementDescription, "cannot find the target element description");
92
95
  const userInstructionPrompt = await llm_locator_js_namespaceObject.findElementPrompt.format({
93
96
  pageDescription: description,
94
97
  targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
95
98
  });
96
- const systemPrompt = (0, llm_locator_js_namespaceObject.systemPromptToLocateElement)((0, env_namespaceObject.vlLocateMode)());
99
+ const systemPrompt = (0, llm_locator_js_namespaceObject.systemPromptToLocateElement)((0, env_namespaceObject.vlLocateMode)(modelPreferences));
97
100
  let imagePayload = screenshotBase64;
98
101
  if (options.searchConfig) {
99
102
  (0, utils_namespaceObject.assert)(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
100
103
  (0, utils_namespaceObject.assert)(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
101
104
  imagePayload = options.searchConfig.imageBase64;
102
- } else if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
103
- else if (!(0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, external_common_js_namespaceObject.markupImageForLLM)(screenshotBase64, context.tree, context.size);
105
+ } else if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)(modelPreferences)) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
106
+ else if (!(0, env_namespaceObject.vlLocateMode)(modelPreferences)) imagePayload = await (0, external_common_js_namespaceObject.markupImageForLLM)(screenshotBase64, context.tree, context.size);
104
107
  const msgs = [
105
108
  {
106
109
  role: 'system',
@@ -131,7 +134,9 @@ async function AiLocateElement(options) {
131
134
  msgs.push(...addOns);
132
135
  }
133
136
  const callAIFn = callAI || index_js_namespaceObject.callToGetJSONObject;
134
- const res = await callAIFn(msgs, external_common_js_namespaceObject.AIActionType.INSPECT_ELEMENT);
137
+ const res = await callAIFn(msgs, external_common_js_namespaceObject.AIActionType.INSPECT_ELEMENT, {
138
+ intent: 'grounding'
139
+ });
135
140
  const rawResponse = JSON.stringify(res.content);
136
141
  let resRect;
137
142
  let matchedElements = 'elements' in res.content ? res.content.elements : [];
@@ -139,7 +144,7 @@ async function AiLocateElement(options) {
139
144
  try {
140
145
  if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
141
146
  var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
142
- resRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
147
+ resRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, modelPreferences, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
143
148
  debugInspect('resRect', resRect);
144
149
  const rectCenter = {
145
150
  x: resRect.left + resRect.width / 2,
@@ -180,7 +185,10 @@ async function AiLocateElement(options) {
180
185
  async function AiLocateSection(options) {
181
186
  const { context, sectionDescription } = options;
182
187
  const { screenshotBase64 } = context;
183
- const systemPrompt = (0, llm_section_locator_js_namespaceObject.systemPromptToLocateSection)((0, env_namespaceObject.vlLocateMode)());
188
+ const modelPreferences = {
189
+ intent: 'grounding'
190
+ };
191
+ const systemPrompt = (0, llm_section_locator_js_namespaceObject.systemPromptToLocateSection)((0, env_namespaceObject.vlLocateMode)(modelPreferences));
184
192
  const sectionLocatorInstructionText = await llm_section_locator_js_namespaceObject.sectionLocatorInstruction.format({
185
193
  sectionDescription: extraTextFromUserPrompt(sectionDescription)
186
194
  });
@@ -213,26 +221,30 @@ async function AiLocateSection(options) {
213
221
  });
214
222
  msgs.push(...addOns);
215
223
  }
216
- const result = await (0, external_common_js_namespaceObject.callAiFn)(msgs, external_common_js_namespaceObject.AIActionType.EXTRACT_DATA);
224
+ const result = await (0, external_common_js_namespaceObject.callAiFn)(msgs, external_common_js_namespaceObject.AIActionType.EXTRACT_DATA, {
225
+ intent: 'grounding'
226
+ });
217
227
  let sectionRect;
218
228
  const sectionBbox = result.content.bbox;
219
229
  if (sectionBbox) {
220
- const targetRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(sectionBbox, context.size.width, context.size.height);
230
+ const targetRect = (0, external_common_js_namespaceObject.adaptBboxToRect)(sectionBbox, context.size.width, context.size.height, modelPreferences);
221
231
  debugSection('original targetRect %j', targetRect);
222
232
  const referenceBboxList = result.content.references_bbox || [];
223
233
  debugSection('referenceBboxList %j', referenceBboxList);
224
- const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>(0, external_common_js_namespaceObject.adaptBboxToRect)(bbox, context.size.width, context.size.height));
234
+ const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>(0, external_common_js_namespaceObject.adaptBboxToRect)(bbox, context.size.width, context.size.height, modelPreferences));
225
235
  debugSection('referenceRects %j', referenceRects);
226
236
  const mergedRect = (0, external_common_js_namespaceObject.mergeRects)([
227
237
  targetRect,
228
238
  ...referenceRects
229
239
  ]);
230
240
  debugSection('mergedRect %j', mergedRect);
231
- sectionRect = (0, external_common_js_namespaceObject.expandSearchArea)(mergedRect, context.size);
241
+ sectionRect = (0, external_common_js_namespaceObject.expandSearchArea)(mergedRect, context.size, modelPreferences);
232
242
  debugSection('expanded sectionRect %j', sectionRect);
233
243
  }
234
244
  let imageBase64 = screenshotBase64;
235
- if (sectionRect) imageBase64 = await (0, img_namespaceObject.cropByRect)(screenshotBase64, sectionRect, (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_USE_QWEN_VL));
245
+ if (sectionRect) imageBase64 = await (0, img_namespaceObject.cropByRect)(screenshotBase64, sectionRect, (0, env_namespaceObject.getIsUseQwenVl)({
246
+ intent: 'grounding'
247
+ }));
236
248
  return {
237
249
  rect: sectionRect,
238
250
  imageBase64,
@@ -243,10 +255,10 @@ async function AiLocateSection(options) {
243
255
  }
244
256
  async function AiExtractElementInfo(options) {
245
257
  var _options_extractOption;
246
- const { dataQuery, context, extractOption, multimodalPrompt } = options;
258
+ const { dataQuery, context, extractOption, multimodalPrompt, modelPreferences } = options;
247
259
  const systemPrompt = (0, extraction_js_namespaceObject.systemPromptToExtract)();
248
260
  const { screenshotBase64 } = context;
249
- const { description, elementById } = await (0, util_js_namespaceObject.describeUserPage)(context, {
261
+ const { description, elementById } = await (0, util_js_namespaceObject.describeUserPage)(context, modelPreferences, {
250
262
  truncateTextLength: 200,
251
263
  filterNonTextContent: false,
252
264
  visibleOnly: false,
@@ -286,7 +298,7 @@ async function AiExtractElementInfo(options) {
286
298
  });
287
299
  msgs.push(...addOns);
288
300
  }
289
- const result = await (0, external_common_js_namespaceObject.callAiFn)(msgs, external_common_js_namespaceObject.AIActionType.EXTRACT_DATA);
301
+ const result = await (0, external_common_js_namespaceObject.callAiFn)(msgs, external_common_js_namespaceObject.AIActionType.EXTRACT_DATA, modelPreferences);
290
302
  return {
291
303
  parseResult: result.content,
292
304
  elementById,
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/inspect.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n MIDSCENE_USE_QWEN_VL,\n getAIConfigInBoolean,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlLocateMode());\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlLocateMode());\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(bbox, context.size.width, context.size.height);\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getAIConfigInBoolean","MIDSCENE_USE_QWEN_VL","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;AC0DA,MAAMI,eAAeC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAC9B,MAAMC,eAAeD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EAAiBP;IAEzBQ,IAAAA,sBAAAA,MAAAA,AAAAA,EACEP,0BACA;IAGF,MAAMQ,wBAAwB,MAAMC,+BAAAA,iBAAAA,CAAAA,MAAwB,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0Bf,wBAAwBY;IACpD;IACA,MAAMU,eAAeC,AAAAA,IAAAA,+BAAAA,2BAAAA,AAAAA,EAA4BC,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA;IAEjD,IAAIC,eAAeX;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBS,IAAAA,sBAAAA,MAAAA,AAAAA,EACET,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFS,IAAAA,sBAAAA,MAAAA,AAAAA,EACET,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIc,AAAmB,cAAnBA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KACTC,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BD;SAC5C,IAAI,CAACD,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KACVC,eAAe,MAAME,AAAAA,IAAAA,mCAAAA,iBAAAA,AAAAA,EACnBb,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOR,0BAAuC;QAChD,MAAMgB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAMC,WACJhB,UAAUiB,yBAAAA,mBAAmBA;IAE/B,MAAMC,MAAM,MAAMF,SAASxB,MAAM2B,mCAAAA,YAAAA,CAAAA,eAA4B;IAE7D,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBACAC,6BAAAA,wBACAC,6BAAAA;YALFP,UAAUQ,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK5B,QAAQ,IAAI,CAAC,KAAK,EACvD6B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK7B,QAAQ,IAAI,CAAC,MAAM,UACzD8B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC7C,aAAa,WAAWsC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,AAAAA,IAAAA,wBAAAA,gCAAAA,AAAAA,EAAiCnC,QAAQ,IAAI,EAAEiC;YAE7D,MAAMG,mBAAmBF,UACrBG,AAAAA,IAAAA,wBAAAA,QAAAA,AAAAA,EAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,wBAAAA,iBAAiBA,EAClDJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB5C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE4C,kBAAkB,EAAE,GAAG7C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAekC,AAAAA,IAAAA,uCAAAA,2BAAAA,AAAAA,EAA4BhC,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA;IACjD,MAAMiC,gCAAgC,MAAMC,uCAAAA,yBAAAA,CAAAA,MAAgC,CAAC;QAC3E,oBAAoB1D,wBAAwBuD;IAC9C;IACA,MAAMlD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM2C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM1B,mBAAmB;YACtC,QAAQqD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAlD,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,AAAAA,IAAAA,mCAAAA,QAAAA,AAAAA,EACnBvD,MACA2B,mCAAAA,YAAAA,CAAAA,YAAyB;IAG3B,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACjBmB,aACAnD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM;QAErBZ,aAAa,0BAA0BgE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D5D,aAAa,wBAAwBiE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EAAgBuB,MAAMvD,QAAQ,IAAI,CAAC,KAAK,EAAEA,QAAQ,IAAI,CAAC,MAAM;QAExEZ,aAAa,qBAAqBkE;QAGlC,MAAME,aAAaC,AAAAA,IAAAA,mCAAAA,UAAAA,AAAAA,EAAW;YAACL;eAAeE;SAAe;QAC7DlE,aAAa,iBAAiBoE;QAG9BN,cAAcQ,AAAAA,IAAAA,mCAAAA,gBAAAA,AAAAA,EAAiBF,YAAYxD,QAAQ,IAAI;QACvDZ,aAAa,2BAA2B8D;IAC1C;IAEA,IAAIS,cAAcxD;IAClB,IAAI+C,aACFS,cAAc,MAAMC,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EAClBzD,kBACA+C,aACAW,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA,EAAqBC,oBAAAA,oBAAoBA;IAI7C,OAAO;QACL,MAAMZ;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAee,qBAGpBhE,OAKD;QA0CKiE;IAzCJ,MAAM,EAAEC,SAAS,EAAEjE,OAAO,EAAEkE,aAAa,EAAE1E,gBAAgB,EAAE,GAAGO;IAChE,MAAMY,eAAewD,AAAAA,IAAAA,8BAAAA,qBAAAA,AAAAA;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EAAiBP,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAakE,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAEA,MAAME,wBAAwB,MAAMC,AAAAA,IAAAA,8BAAAA,sBAAAA,AAAAA,EAClCjE,aACA6D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS2D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAMyB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,AAAAA,IAAAA,mCAAAA,QAAAA,AAAAA,EACnBvD,MACA2B,mCAAAA,YAAAA,CAAAA,YAAyB;IAE3B,OAAO;QACL,aAAa2B,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF"}
1
+ {"version":3,"file":"ai-model/inspect.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from './common';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context, modelPreferences);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(\n vlLocateMode(modelPreferences),\n );\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, {\n intent: 'grounding',\n });\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n modelPreferences,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const systemPrompt = systemPromptToLocateSection(\n vlLocateMode(modelPreferences),\n );\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n {\n intent: 'grounding',\n },\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelPreferences);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getIsUseQwenVl({\n intent: 'grounding',\n }),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n modelPreferences: IModelPreferences;\n}) {\n const {\n dataQuery,\n context,\n extractOption,\n multimodalPrompt,\n modelPreferences,\n } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(\n context,\n modelPreferences,\n {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n },\n );\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelPreferences,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","modelPreferences","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getIsUseQwenVl","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;ACyDA,MAAMI,eAAeC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAC9B,MAAMC,eAAeD,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAM,EAAEC,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EAAiBR,SAASI;IAElCK,IAAAA,sBAAAA,MAAAA,AAAAA,EACER,0BACA;IAGF,MAAMS,wBAAwB,MAAMC,+BAAAA,iBAAAA,CAAAA,MAAwB,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0BhB,wBAAwBY;IACpD;IACA,MAAMW,eAAeC,AAAAA,IAAAA,+BAAAA,2BAAAA,AAAAA,EACnBC,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaV;IAGf,IAAIW,eAAeZ;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBU,IAAAA,sBAAAA,MAAAA,AAAAA,EACEV,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFU,IAAAA,sBAAAA,MAAAA,AAAAA,EACEV,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFgB,eAAehB,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIe,AAAmC,cAAnCA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaV,mBACtBW,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BD;SAC5C,IAAI,CAACD,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaV,mBACvBW,eAAe,MAAME,AAAAA,IAAAA,mCAAAA,iBAAAA,AAAAA,EACnBd,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOT,0BAAuC;QAChD,MAAMiB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAMC,WACJjB,UAAUkB,yBAAAA,mBAAmBA;IAE/B,MAAMC,MAAM,MAAMF,SAASzB,MAAM4B,mCAAAA,YAAAA,CAAAA,eAA4B,EAAE;QAC7D,QAAQ;IACV;IAEA,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBAEAC,6BAAAA,wBACAC,6BAAAA;YANFP,UAAUQ,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK7B,QAAQ,IAAI,CAAC,KAAK,EACvD8B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK9B,QAAQ,IAAI,CAAC,MAAM,EACzDI,kBAAAA,QACA2B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC9C,aAAa,WAAWuC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,AAAAA,IAAAA,wBAAAA,gCAAAA,AAAAA,EAAiCpC,QAAQ,IAAI,EAAEkC;YAE7D,MAAMG,mBAAmBF,UACrBG,AAAAA,IAAAA,wBAAAA,QAAAA,AAAAA,EAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,wBAAAA,iBAAiBA,EAClDJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB7C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE6C,kBAAkB,EAAE,GAAG9C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAMQ,eAAekC,AAAAA,IAAAA,uCAAAA,2BAAAA,AAAAA,EACnBhC,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaV;IAEf,MAAM2C,gCAAgC,MAAMC,uCAAAA,yBAAAA,CAAAA,MAAgC,CAAC;QAC3E,oBAAoB3D,wBAAwBwD;IAC9C;IACA,MAAMnD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM4C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQsD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAnD,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,AAAAA,IAAAA,mCAAAA,QAAAA,AAAAA,EACnBxD,MACA4B,mCAAAA,YAAAA,CAAAA,YAAyB,EACzB;QACE,QAAQ;IACV;IAGF,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACjBmB,aACApD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFhB,aAAa,0BAA0BiE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D7D,aAAa,wBAAwBkE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,AAAAA,IAAAA,mCAAAA,eAAAA,AAAAA,EACLuB,MACAxD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNhB,aAAa,qBAAqBmE;QAGlC,MAAME,aAAaC,AAAAA,IAAAA,mCAAAA,UAAAA,AAAAA,EAAW;YAACL;eAAeE;SAAe;QAC7DnE,aAAa,iBAAiBqE;QAG9BN,cAAcQ,AAAAA,IAAAA,mCAAAA,gBAAAA,AAAAA,EAAiBF,YAAYzD,QAAQ,IAAI,EAAEI;QACzDhB,aAAa,2BAA2B+D;IAC1C;IAEA,IAAIS,cAAczD;IAClB,IAAIgD,aACFS,cAAc,MAAMC,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EAClB1D,kBACAgD,aACAW,AAAAA,IAAAA,oBAAAA,cAAAA,AAAAA,EAAe;QACb,QAAQ;IACV;IAIJ,OAAO;QACL,MAAMX;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAGpBhE,OAMD;QAoDKiE;IAnDJ,MAAM,EACJC,SAAS,EACTjE,OAAO,EACPkE,aAAa,EACb1E,gBAAgB,EAChBY,gBAAgB,EACjB,GAAGL;IACJ,MAAMa,eAAeuD,AAAAA,IAAAA,8BAAAA,qBAAAA,AAAAA;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEK,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EACzCR,SACAI,kBACA;QACE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAa8D,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAGF,MAAME,wBAAwB,MAAMC,AAAAA,IAAAA,8BAAAA,sBAAAA,AAAAA,EAClChE,aACA4D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS0D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,AAAAA,IAAAA,mCAAAA,QAAAA,AAAAA,EACnBxD,MACA4B,mCAAAA,YAAAA,CAAAA,YAAyB,EACzBlB;IAEF,OAAO;QACL,aAAa6C,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF"}
@@ -28,28 +28,33 @@ __webpack_require__.d(__webpack_exports__, {
28
28
  });
29
29
  const env_namespaceObject = require("@midscene/shared/env");
30
30
  const img_namespaceObject = require("@midscene/shared/img");
31
+ const logger_namespaceObject = require("@midscene/shared/logger");
31
32
  const utils_namespaceObject = require("@midscene/shared/utils");
32
33
  const external_common_js_namespaceObject = require("./common.js");
33
34
  const llm_planning_js_namespaceObject = require("./prompt/llm-planning.js");
34
35
  const util_js_namespaceObject = require("./prompt/util.js");
36
+ const debug = (0, logger_namespaceObject.getDebug)('planning');
35
37
  async function plan(userInstruction, opts) {
36
38
  var _planFromAI_action;
37
39
  const { callAI, context } = opts || {};
38
40
  const { screenshotBase64, size } = context;
39
- const { description: pageDescription, elementById } = await (0, util_js_namespaceObject.describeUserPage)(context);
41
+ const modelPreferences = {
42
+ intent: 'planning'
43
+ };
44
+ const { description: pageDescription, elementById } = await (0, util_js_namespaceObject.describeUserPage)(context, modelPreferences);
40
45
  const systemPrompt = await (0, llm_planning_js_namespaceObject.systemPromptToTaskPlanning)({
41
46
  actionSpace: opts.actionSpace,
42
- vlMode: (0, env_namespaceObject.vlLocateMode)()
47
+ vlMode: (0, env_namespaceObject.vlLocateMode)(modelPreferences)
43
48
  });
44
49
  const taskBackgroundContextText = (0, llm_planning_js_namespaceObject.generateTaskBackgroundContext)(userInstruction, opts.log, opts.actionContext);
45
- const userInstructionPrompt = await (0, llm_planning_js_namespaceObject.automationUserPrompt)((0, env_namespaceObject.vlLocateMode)()).format({
50
+ const userInstructionPrompt = await (0, llm_planning_js_namespaceObject.automationUserPrompt)((0, env_namespaceObject.vlLocateMode)(modelPreferences)).format({
46
51
  pageDescription,
47
52
  taskBackgroundContext: taskBackgroundContextText
48
53
  });
49
54
  let imagePayload = screenshotBase64;
50
- if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
51
- else if (!(0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, external_common_js_namespaceObject.markupImageForLLM)(screenshotBase64, context.tree, context.size);
52
- (0, external_common_js_namespaceObject.warnGPT4oSizeLimit)(size);
55
+ if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)(modelPreferences)) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
56
+ else if (!(0, env_namespaceObject.vlLocateMode)(modelPreferences)) imagePayload = await (0, external_common_js_namespaceObject.markupImageForLLM)(screenshotBase64, context.tree, context.size);
57
+ (0, external_common_js_namespaceObject.warnGPT4oSizeLimit)(size, modelPreferences);
53
58
  const msgs = [
54
59
  {
55
60
  role: 'system',
@@ -73,7 +78,7 @@ async function plan(userInstruction, opts) {
73
78
  }
74
79
  ];
75
80
  const call = callAI || external_common_js_namespaceObject.callAiFn;
76
- const { content, usage } = await call(msgs, external_common_js_namespaceObject.AIActionType.PLAN);
81
+ const { content, usage } = await call(msgs, external_common_js_namespaceObject.AIActionType.PLAN, modelPreferences);
77
82
  const rawResponse = JSON.stringify(content, void 0, 2);
78
83
  const planFromAI = content;
79
84
  const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
@@ -87,24 +92,21 @@ async function plan(userInstruction, opts) {
87
92
  yamlFlow: (0, external_common_js_namespaceObject.buildYamlFlowFromPlans)(actions, opts.actionSpace, planFromAI.sleep)
88
93
  };
89
94
  (0, utils_namespaceObject.assert)(planFromAI, "can't get plans from AI");
90
- if ((0, env_namespaceObject.vlLocateMode)()) {
91
- actions.forEach((action)=>{
92
- if (action.locate) try {
93
- action.locate = (0, external_common_js_namespaceObject.fillBboxParam)(action.locate, size.width, size.height);
94
- } catch (e) {
95
- throw new Error(`Failed to fill locate param: ${planFromAI.error} (${e instanceof Error ? e.message : 'unknown error'})`, {
96
- cause: e
97
- });
95
+ actions.forEach((action)=>{
96
+ const type = action.type;
97
+ const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
98
+ const locateFields = actionInActionSpace ? (0, external_common_js_namespaceObject.findAllMidsceneLocatorField)(actionInActionSpace.paramSchema) : [];
99
+ debug('locateFields', locateFields);
100
+ locateFields.forEach((field)=>{
101
+ const locateResult = action.param[field];
102
+ if (locateResult) if ((0, env_namespaceObject.vlLocateMode)(modelPreferences)) action.param[field] = (0, external_common_js_namespaceObject.fillBboxParam)(locateResult, size.width, size.height, modelPreferences);
103
+ else {
104
+ const element = elementById(locateResult);
105
+ if (element) action.param[field].id = element.id;
98
106
  }
99
107
  });
100
- (0, utils_namespaceObject.assert)(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
101
- } else actions.forEach((action)=>{
102
- var _action_locate;
103
- if (null == (_action_locate = action.locate) ? void 0 : _action_locate.id) {
104
- const element = elementById(action.locate.id);
105
- if (element) action.locate.id = element.id;
106
- }
107
108
  });
109
+ (0, utils_namespaceObject.assert)(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
108
110
  if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
109
111
  return returnValue;
110
112
  }
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/llm-planning.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n const { description: pageDescription, elementById } =\n await describeUserPage(context);\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(msgs, AIActionType.PLAN);\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n if (vlLocateMode()) {\n actions.forEach((action) => {\n if (action.locate) {\n try {\n action.locate = fillBboxParam(action.locate, size.width, size.height);\n } catch (e) {\n throw new Error(\n `Failed to fill locate param: ${planFromAI.error} (${\n e instanceof Error ? e.message : 'unknown error'\n })`,\n {\n cause: e,\n },\n );\n }\n }\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n } else {\n actions.forEach((action) => {\n if (action.locate?.id) {\n // The model may return indexId, need to perform a query correction to avoid exceptions\n const element = elementById(action.locate.id);\n if (element) {\n action.locate.id = element.id;\n }\n }\n });\n }\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","fillBboxParam","e","Error","_action_locate","element","console"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;ACmBO,eAAeI,KACpBC,eAAuB,EACvBC,IAOC;QA8DEC;IA5DH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IACnC,MAAM,EAAE,aAAaG,eAAe,EAAEC,WAAW,EAAE,GACjD,MAAMC,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EAAiBL;IAEzB,MAAMM,eAAe,MAAMC,AAAAA,IAAAA,gCAAAA,0BAAAA,AAAAA,EAA2B;QACpD,aAAaV,KAAK,WAAW;QAC7B,QAAQW,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA;IACV;IACA,MAAMC,4BAA4BC,AAAAA,IAAAA,gCAAAA,6BAAAA,AAAAA,EAChCd,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMc,wBAAwB,MAAMC,AAAAA,IAAAA,gCAAAA,oBAAAA,AAAAA,EAClCJ,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KACA,MAAM,CAAC;QACPL;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeZ;IACnB,IAAIO,AAAmB,cAAnBA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KACFK,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BD;SAC5C,IAAI,CAACL,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KACVK,eAAe,MAAME,AAAAA,IAAAA,mCAAAA,iBAAAA,AAAAA,EACnBd,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBgB,IAAAA,mCAAAA,kBAAAA,AAAAA,EAAmBd;IAEnB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOnB,UAAUoB,mCAAAA,QAAQA;IAC/B,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAAKD,MAAMK,mCAAAA,YAAAA,CAAAA,IAAiB;IAC7D,MAAMC,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH7B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC4B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,AAAAA,IAAAA,mCAAAA,sBAAAA,AAAAA,EACRF,SACA9B,KAAK,WAAW,EAChB6B,WAAW,KAAK;IAEpB;IAEAI,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOJ,YAAY;IAEnB,IAAIlB,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,KAAgB;QAClBmB,QAAQ,OAAO,CAAC,CAACI;YACf,IAAIA,OAAO,MAAM,EACf,IAAI;gBACFA,OAAO,MAAM,GAAGC,AAAAA,IAAAA,mCAAAA,aAAAA,AAAAA,EAAcD,OAAO,MAAM,EAAE7B,KAAK,KAAK,EAAEA,KAAK,MAAM;YACtE,EAAE,OAAO+B,GAAG;gBACV,MAAM,IAAIC,MACR,CAAC,6BAA6B,EAAER,WAAW,KAAK,CAAC,EAAE,EACjDO,aAAaC,QAAQD,EAAE,OAAO,GAAG,gBAClC,CAAC,CAAC,EACH;oBACE,OAAOA;gBACT;YAEJ;QAEJ;QAEAH,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IACzE,OACEC,QAAQ,OAAO,CAAC,CAACI;YACXI;QAAJ,IAAI,QAAAA,CAAAA,iBAAAA,OAAO,MAAM,AAAD,IAAZA,KAAAA,IAAAA,eAAe,EAAE,EAAE;YAErB,MAAMC,UAAUhC,YAAY2B,OAAO,MAAM,CAAC,EAAE;YAC5C,IAAIK,SACFL,OAAO,MAAM,CAAC,EAAE,GAAGK,QAAQ,EAAE;QAEjC;IACF;IAGF,IACET,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBS,QAAQ,IAAI,CACV,8EACAzC;IAIJ,OAAOgC;AACT"}
1
+ {"version":3,"file":"ai-model/llm-planning.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { type IModelPreferences, vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n findAllMidsceneLocatorField,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction<any>[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'planning',\n };\n const { description: pageDescription, elementById } = await describeUserPage(\n context,\n modelPreferences,\n );\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(modelPreferences),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(modelPreferences),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size, modelPreferences);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(\n msgs,\n AIActionType.PLAN,\n modelPreferences,\n );\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n // TODO: use zod.parse to parse the action.param, and then fill the bbox param.\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (vlLocateMode(modelPreferences)) {\n action.param[field] = fillBboxParam(\n locateResult,\n size.width,\n size.height,\n modelPreferences,\n );\n } else {\n const element = elementById(locateResult);\n if (element) {\n action.param[field].id = element.id;\n }\n }\n }\n });\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","modelPreferences","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","element","console"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;ACqBA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAOC;QAwEEC;IAtEH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IAEnC,MAAMG,mBAAsC;QAC1C,QAAQ;IACV;IACA,MAAM,EAAE,aAAaC,eAAe,EAAEC,WAAW,EAAE,GAAG,MAAMC,AAAAA,IAAAA,wBAAAA,gBAAAA,AAAAA,EAC1DN,SACAG;IAGF,MAAMI,eAAe,MAAMC,AAAAA,IAAAA,gCAAAA,0BAAAA,AAAAA,EAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,QAAQY,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaN;IACvB;IACA,MAAMO,4BAA4BC,AAAAA,IAAAA,gCAAAA,6BAAAA,AAAAA,EAChCf,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMe,wBAAwB,MAAMC,AAAAA,IAAAA,gCAAAA,oBAAAA,AAAAA,EAClCJ,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaN,mBACb,MAAM,CAAC;QACPC;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeb;IACnB,IAAIQ,AAAmC,cAAnCA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaN,mBACfW,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,2BAAAA,AAAAA,EAA4BD;SAC5C,IAAI,CAACL,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaN,mBACvBW,eAAe,MAAME,AAAAA,IAAAA,mCAAAA,iBAAAA,AAAAA,EACnBf,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBiB,IAAAA,mCAAAA,kBAAAA,AAAAA,EAAmBf,MAAMC;IAEzB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOpB,UAAUqB,mCAAAA,QAAQA;IAC/B,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAC/BD,MACAK,mCAAAA,YAAAA,CAAAA,IAAiB,EACjBpB;IAEF,MAAMqB,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH9B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC6B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,AAAAA,IAAAA,mCAAAA,sBAAAA,AAAAA,EACRF,SACA/B,KAAK,WAAW,EAChB8B,WAAW,KAAK;IAEpB;IAEAI,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOJ,YAAY;IAGnBC,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBrC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACmC,SAAWA,OAAO,IAAI,KAAKC;QAE9B,MAAME,eAAeD,sBACjBE,AAAAA,IAAAA,mCAAAA,2BAAAA,AAAAA,EAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENzC,MAAM,gBAAgB0C;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,cACF,IAAI7B,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaN,mBACf6B,OAAO,KAAK,CAACK,MAAM,GAAGE,AAAAA,IAAAA,mCAAAA,aAAAA,AAAAA,EACpBD,cACApC,KAAK,KAAK,EACVA,KAAK,MAAM,EACXC;iBAEG;gBACL,MAAMqC,UAAUnC,YAAYiC;gBAC5B,IAAIE,SACFR,OAAO,KAAK,CAACK,MAAM,CAAC,EAAE,GAAGG,QAAQ,EAAE;YAEvC;QAEJ;IACF;IAEAT,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IAEvE,IACEC,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACA7C;IAIJ,OAAOiC;AACT"}
@@ -42,41 +42,87 @@ __webpack_require__.d(__webpack_exports__, {
42
42
  const external_node_assert_namespaceObject = require("node:assert");
43
43
  var external_node_assert_default = /*#__PURE__*/ __webpack_require__.n(external_node_assert_namespaceObject);
44
44
  const prompts_namespaceObject = require("@langchain/core/prompts");
45
- const external_common_js_namespaceObject = require("./common.js");
45
+ const external_zod_namespaceObject = require("zod");
46
+ const external_common_js_namespaceObject = require("../common.js");
47
+ const external_common_js_namespaceObject_1 = require("./common.js");
46
48
  const vlCoTLog = '"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. ';
47
49
  const vlCurrentLog = '"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do .. first". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
48
50
  const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
49
51
  const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
50
52
  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
51
- const vlLocateParam = (required)=>`locate${required ? '' : '?'}: {bbox: [number, number, number, number], prompt: string }`;
52
- const llmLocateParam = (required)=>`locate${required ? '' : '?'}: {"id": string, "prompt": string}`;
53
- const descriptionForAction = (action, locatorScheme)=>{
53
+ const vlLocateParam = ()=>'{bbox: [number, number, number, number], prompt: string }';
54
+ const llmLocateParam = ()=>'{"id": string, "prompt": string}';
55
+ const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
54
56
  const tab = ' ';
55
- let locateParam = '';
56
- if ('required' === action.location) locateParam = locatorScheme;
57
- else if ('optional' === action.location) locateParam = `${locatorScheme} | null`;
58
- else if (false === action.location) locateParam = '';
59
- const locatorParam = locateParam ? `- ${locateParam}` : '';
60
- if (action.whatToLocate) if (locateParam) locateParam += ` // ${action.whatToLocate}`;
61
- else console.warn(`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`);
62
- let paramSchema = '';
63
- if (action.paramSchema) paramSchema = `- param: ${action.paramSchema}`;
64
- if (action.paramDescription) {
65
- external_node_assert_default()(paramSchema, `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`);
66
- paramSchema += ` // ${action.paramDescription}`;
57
+ const fields = [];
58
+ fields.push(`- type: "${action.name}"`);
59
+ if (action.paramSchema) {
60
+ external_node_assert_default()(action.paramSchema instanceof external_zod_namespaceObject.z.ZodObject, 'paramSchema must be a zod object');
61
+ const shape = action.paramSchema.shape;
62
+ const paramLines = [];
63
+ const getTypeName = (field)=>{
64
+ var _actualField__def;
65
+ const unwrapField = (f)=>{
66
+ if (!f._def) return f;
67
+ const typeName = f._def.typeName;
68
+ if ('ZodOptional' === typeName || 'ZodNullable' === typeName || 'ZodDefault' === typeName) return unwrapField(f._def.innerType);
69
+ return f;
70
+ };
71
+ const actualField = unwrapField(field);
72
+ const fieldTypeName = null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName;
73
+ if ('ZodString' === fieldTypeName) return 'string';
74
+ if ('ZodNumber' === fieldTypeName) return 'number';
75
+ if ('ZodBoolean' === fieldTypeName) return 'boolean';
76
+ if ('ZodArray' === fieldTypeName) return 'array';
77
+ if ('ZodObject' === fieldTypeName) {
78
+ if ((0, external_common_js_namespaceObject.ifMidsceneLocatorField)(actualField)) return locatorSchemaTypeDescription;
79
+ return 'object';
80
+ }
81
+ if ('ZodEnum' === fieldTypeName) {
82
+ var _actualField__def_values, _actualField__def1;
83
+ const values = (null == (_actualField__def1 = actualField._def) ? void 0 : null == (_actualField__def_values = _actualField__def1.values) ? void 0 : _actualField__def_values.map((option)=>String(`'${option}'`)).join(', ')) ?? 'enum';
84
+ return `enum(${values})`;
85
+ }
86
+ console.warn('failed to parse Zod type. This may lead to wrong params from the LLM.\n', actualField._def);
87
+ return actualField.toString();
88
+ };
89
+ const getDescription = (field)=>{
90
+ var _actualField__def;
91
+ const unwrapField = (f)=>{
92
+ if (!f._def) return f;
93
+ const typeName = f._def.typeName;
94
+ if ('ZodOptional' === typeName || 'ZodNullable' === typeName || 'ZodDefault' === typeName) return unwrapField(f._def.innerType);
95
+ return f;
96
+ };
97
+ if ("description" in field) return field.description || null;
98
+ const actualField = unwrapField(field);
99
+ if ("description" in actualField) return actualField.description || null;
100
+ if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodObject') {
101
+ if ('midscene_location_field_flag' in actualField._def.shape()) return 'Location information for the target element';
102
+ }
103
+ return null;
104
+ };
105
+ for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
106
+ const isOptional = 'function' == typeof field.isOptional && field.isOptional();
107
+ const keyWithOptional = isOptional ? `${key}?` : key;
108
+ const typeName = getTypeName(field);
109
+ const description = getDescription(field);
110
+ let paramLine = `${keyWithOptional}: ${typeName}`;
111
+ if (description) paramLine += ` // ${description}`;
112
+ paramLines.push(paramLine);
113
+ }
114
+ if (paramLines.length > 0) {
115
+ fields.push('- param:');
116
+ for (const paramLine of paramLines)fields.push(` - ${paramLine}`);
117
+ }
67
118
  }
68
- const fields = [
69
- paramSchema,
70
- locatorParam
71
- ].filter(Boolean);
72
- return `- ${action.name}, ${action.description}
73
- ${tab}- type: "${action.name}"
119
+ return `- ${action.name}, ${action.description || "No description provided"}
74
120
  ${tab}${fields.join(`\n${tab}`)}
75
121
  `.trim();
76
122
  };
77
123
  const systemTemplateOfVLPlanning = ({ actionSpace, vlMode })=>{
78
124
  const actionNameList = actionSpace.map((action)=>action.name).join(', ');
79
- const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam('required' === action.location)));
125
+ const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam()));
80
126
  const actionList = actionDescriptionList.join('\n');
81
127
  return `
82
128
  Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
@@ -85,7 +131,7 @@ Restriction:
85
131
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
86
132
  - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
87
133
  - Don't repeat actions in the previous logs.
88
- - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${(0, external_common_js_namespaceObject.bboxDescription)(vlMode)}.
134
+ - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${(0, external_common_js_namespaceObject_1.bboxDescription)(vlMode)}.
89
135
 
90
136
  Supporting actions:
91
137
  ${actionList}
@@ -126,7 +172,7 @@ this and output the JSON:
126
172
  };
127
173
  const systemTemplateOfLLM = ({ actionSpace })=>{
128
174
  const actionNameList = actionSpace.map((action)=>action.name).join(' / ');
129
- const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam('required' === action.location)));
175
+ const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam()));
130
176
  const actionList = actionDescriptionList.join('\n');
131
177
  return `
132
178
  ## Role