@midscene/core 0.26.7-beta-20250818081955.0 → 0.26.7-beta-20250820105545.0

This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
Files changed (65)
  1. package/dist/es/ai-model/common.mjs +58 -16
  2. package/dist/es/ai-model/common.mjs.map +1 -1
  3. package/dist/es/ai-model/index.mjs +3 -3
  4. package/dist/es/ai-model/inspect.mjs +28 -16
  5. package/dist/es/ai-model/inspect.mjs.map +1 -1
  6. package/dist/es/ai-model/llm-planning.mjs +26 -23
  7. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  8. package/dist/es/ai-model/prompt/llm-planning.mjs +50 -23
  9. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
  11. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  12. package/dist/es/ai-model/prompt/util.mjs +2 -2
  13. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  14. package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
  15. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  16. package/dist/es/ai-model/service-caller/index.mjs +72 -118
  17. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  18. package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
  19. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  20. package/dist/es/index.mjs +3 -2
  21. package/dist/es/index.mjs.map +1 -1
  22. package/dist/es/insight/index.mjs +13 -61
  23. package/dist/es/insight/index.mjs.map +1 -1
  24. package/dist/es/types.mjs.map +1 -1
  25. package/dist/es/utils.mjs +5 -6
  26. package/dist/es/utils.mjs.map +1 -1
  27. package/dist/lib/ai-model/common.js +80 -20
  28. package/dist/lib/ai-model/common.js.map +1 -1
  29. package/dist/lib/ai-model/index.js +14 -5
  30. package/dist/lib/ai-model/inspect.js +27 -15
  31. package/dist/lib/ai-model/inspect.js.map +1 -1
  32. package/dist/lib/ai-model/llm-planning.js +25 -22
  33. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  34. package/dist/lib/ai-model/prompt/llm-planning.js +52 -25
  35. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  36. package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
  37. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  38. package/dist/lib/ai-model/prompt/util.js +2 -2
  39. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  40. package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
  41. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  42. package/dist/lib/ai-model/service-caller/index.js +75 -124
  43. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  44. package/dist/lib/ai-model/ui-tars-planning.js +5 -5
  45. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  46. package/dist/lib/index.js +20 -4
  47. package/dist/lib/index.js.map +1 -1
  48. package/dist/lib/insight/index.js +10 -58
  49. package/dist/lib/insight/index.js.map +1 -1
  50. package/dist/lib/types.js.map +1 -1
  51. package/dist/lib/utils.js +4 -5
  52. package/dist/lib/utils.js.map +1 -1
  53. package/dist/types/ai-model/common.d.ts +160 -7
  54. package/dist/types/ai-model/index.d.ts +2 -2
  55. package/dist/types/ai-model/inspect.d.ts +2 -0
  56. package/dist/types/ai-model/llm-planning.d.ts +1 -1
  57. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
  58. package/dist/types/ai-model/prompt/util.d.ts +2 -1
  59. package/dist/types/ai-model/service-caller/index.d.ts +6 -6
  60. package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
  61. package/dist/types/index.d.ts +3 -1
  62. package/dist/types/insight/index.d.ts +1 -4
  63. package/dist/types/types.d.ts +8 -11
  64. package/dist/types/yaml.d.ts +1 -0
  65. package/package.json +4 -3
@@ -1,10 +1,11 @@
  import { assert } from "@midscene/shared/utils";
- import { callToGetJSONObject, getModelName } from "./service-caller/index.mjs";
+ import { callToGetJSONObject } from "./service-caller/index.mjs";
  import { NodeType } from "@midscene/shared/constants";
- import { vlLocateMode } from "@midscene/shared/env";
+ import { getModelName, vlLocateMode } from "@midscene/shared/env";
  import { treeToList } from "@midscene/shared/extractor";
  import { compositeElementInfoImg } from "@midscene/shared/img";
  import { getDebug } from "@midscene/shared/logger";
+ import { z } from "zod";
  var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
  AIActionType[AIActionType["ASSERT"] = 0] = "ASSERT";
  AIActionType[AIActionType["INSPECT_ELEMENT"] = 1] = "INSPECT_ELEMENT";
@@ -14,8 +15,8 @@ var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
  return AIActionType;
  }({});
  const actionSpaceTypePrefix = 'action_space_';
- async function callAiFn(msgs, AIActionTypeValue) {
- const jsonObject = await callToGetJSONObject(msgs, AIActionTypeValue);
+ async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
+ const jsonObject = await callToGetJSONObject(msgs, AIActionTypeValue, modelPreferences);
  return {
  content: jsonObject.content,
  usage: jsonObject.usage
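
Note: callers of callAiFn now pass a model-preference object as a third argument. A minimal illustrative sketch (not part of the package diff; the import path and message text are hypothetical, the { intent: 'grounding' } shape follows IModelPreferences from @midscene/shared/env as shown in the updated source map below):

    // Illustrative sketch, not part of the package diff.
    import { AIActionType, type AIArgs, callAiFn } from './common'; // dist path: ./common.mjs

    const msgs: AIArgs = [
      { role: 'system', content: 'You locate elements on a screenshot.' },
      { role: 'user', content: 'Return a JSON object with a bbox field for the login button.' },
    ];

    const { content, usage } = await callAiFn(msgs, AIActionType.INSPECT_ELEMENT, {
      intent: 'grounding', // new: per-call model preferences
    });
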
@@ -23,12 +24,12 @@ async function callAiFn(msgs, AIActionTypeValue) {
  }
  const defaultBboxSize = 20;
  const debugInspectUtils = getDebug('ai:common');
- function fillBboxParam(locate, width, height) {
+ function fillBboxParam(locate, width, height, modelPreferences) {
  if (locate.bbox_2d && !(null == locate ? void 0 : locate.bbox)) {
  locate.bbox = locate.bbox_2d;
  delete locate.bbox_2d;
  }
- if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height);
+ if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height, modelPreferences);
  return locate;
  }
  function adaptQwenBbox(bbox) {
@@ -90,9 +91,9 @@ function adaptDoubaoBbox(bbox, width, height) {
  const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
  throw new Error(msg);
  }
- function adaptBbox(bbox, width, height) {
- if ('doubao-vision' === vlLocateMode() || 'vlm-ui-tars' === vlLocateMode()) return adaptDoubaoBbox(bbox, width, height);
- if ('gemini' === vlLocateMode()) return adaptGeminiBbox(bbox, width, height);
+ function adaptBbox(bbox, width, height, modelPreferences) {
+ if ('doubao-vision' === vlLocateMode(modelPreferences) || 'vlm-ui-tars' === vlLocateMode(modelPreferences)) return adaptDoubaoBbox(bbox, width, height);
+ if ('gemini' === vlLocateMode(modelPreferences)) return adaptGeminiBbox(bbox, width, height);
  return adaptQwenBbox(bbox);
  }
  function adaptGeminiBbox(bbox, width, height) {
@@ -107,9 +108,9 @@ function adaptGeminiBbox(bbox, width, height) {
  bottom
  ];
  }
- function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
+ function adaptBboxToRect(bbox, width, height, modelPreferences, offsetX = 0, offsetY = 0) {
  debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);
- const [left, top, right, bottom] = adaptBbox(bbox, width, height);
+ const [left, top, right, bottom] = adaptBbox(bbox, width, height, modelPreferences);
  const rect = {
  left: left + offsetX,
  top: top + offsetY,
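
Note: modelPreferences is inserted before the offset arguments, so positional callers of adaptBboxToRect must pass it explicitly. A short illustrative sketch (not part of the package diff; values and import path are hypothetical):

    // Illustrative sketch, not part of the package diff.
    import { adaptBboxToRect } from './common'; // dist path: ./common.mjs

    const rect = adaptBboxToRect(
      [100, 200, 300, 400],    // bbox returned by the model
      1280,                    // image width
      720,                     // image height
      { intent: 'grounding' }, // new IModelPreferences argument
      0,                       // offsetX (as before)
      0,                       // offsetY (as before)
    );
    // rect => { left, top, width, height }
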
@@ -120,10 +121,10 @@ function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
  return rect;
  }
  let warned = false;
- function warnGPT4oSizeLimit(size) {
+ function warnGPT4oSizeLimit(size, modelPreferences) {
  var _getModelName;
  if (warned) return;
- if (null == (_getModelName = getModelName()) ? void 0 : _getModelName.toLowerCase().includes('gpt-4o')) {
+ if (null == (_getModelName = getModelName(modelPreferences)) ? void 0 : _getModelName.toLowerCase().includes('gpt-4o')) {
  const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;
  if (Math.max(size.width, size.height) > 2000 || Math.min(size.width, size.height) > 768) {
  console.warn(warningMsg);
@@ -146,8 +147,8 @@ function mergeRects(rects) {
  height: maxBottom - minTop
  };
  }
- function expandSearchArea(rect, screenSize) {
- const minEdgeSize = 'doubao-vision' === vlLocateMode() ? 500 : 300;
+ function expandSearchArea(rect, screenSize, modelPreferences) {
+ const minEdgeSize = 'doubao-vision' === vlLocateMode(modelPreferences) ? 500 : 300;
  const defaultPadding = 160;
  const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
  const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
@@ -193,6 +194,47 @@ function buildYamlFlowFromPlans(plans, actionSpace, sleep) {
  });
  return flow;
  }
- export { common_AIActionType as AIActionType, actionSpaceTypePrefix, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, callAiFn, expandSearchArea, fillBboxParam, markupImageForLLM, mergeRects, warnGPT4oSizeLimit };
+ const PointSchema = z.object({
+ left: z.number(),
+ top: z.number()
+ });
+ const SizeSchema = z.object({
+ width: z.number(),
+ height: z.number(),
+ dpr: z.number().optional()
+ });
+ const RectSchema = PointSchema.and(SizeSchema).and(z.object({
+ zoom: z.number().optional()
+ }));
+ const MidsceneLocation = z.object({
+ midscene_location_field_flag: z.literal(true),
+ prompt: z.string(),
+ center: z.tuple([
+ z.number(),
+ z.number()
+ ]),
+ rect: RectSchema
+ }).passthrough();
+ const ifMidsceneLocatorField = (field)=>{
+ var _actualField__def, _actualField__def1;
+ let actualField = field;
+ if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodOptional') actualField = actualField._def.innerType;
+ if ((null == (_actualField__def1 = actualField._def) ? void 0 : _actualField__def1.typeName) === 'ZodObject') {
+ const shape = actualField._def.shape();
+ return 'midscene_location_field_flag' in shape;
+ }
+ return false;
+ };
+ const findAllMidsceneLocatorField = (zodType)=>{
+ var _zodObject__def;
+ if (!zodType) return [];
+ const zodObject = zodType;
+ if ((null == (_zodObject__def = zodObject._def) ? void 0 : _zodObject__def.typeName) === 'ZodObject' && zodObject.shape) {
+ const keys = Object.keys(zodObject.shape);
+ return keys.filter((key)=>ifMidsceneLocatorField(zodObject.shape[key]));
+ }
+ return [];
+ };
+ export { common_AIActionType as AIActionType, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, callAiFn, expandSearchArea, fillBboxParam, findAllMidsceneLocatorField, ifMidsceneLocatorField, markupImageForLLM, mergeRects, warnGPT4oSizeLimit };

  //# sourceMappingURL=common.mjs.map
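
Note: the new Zod helpers detect which fields of an action's parameter schema are Midscene locator fields by looking for the midscene_location_field_flag key. A hedged usage sketch (not part of the package diff; tapParamSchema and its field names are hypothetical):

    // Illustrative sketch, not part of the package diff.
    import { z } from 'zod';
    import { MidsceneLocation, findAllMidsceneLocatorField } from './common'; // dist path: ./common.mjs

    const tapParamSchema = z.object({
      locate: MidsceneLocation,        // carries midscene_location_field_flag: true
      duration: z.number().optional(), // plain field, not a locator
    });

    findAllMidsceneLocatorField(tapParamSchema); // => ['locate']
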
@@ -1 +1 @@
- {"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n DeviceAction,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n Rect,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n call,\n callToGetJSONObject,\n getModelName,\n} from './service-caller/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n}\n\nexport const actionSpaceTypePrefix = 'action_space_';\n\nexport async function callAiFn<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const jsonObject = await callToGetJSONObject<T>(msgs, AIActionTypeValue);\n\n return {\n content: jsonObject.content,\n usage: jsonObject.usage,\n };\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(locate.bbox, width, height);\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? 
Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n if (vlLocateMode() === 'doubao-vision' || vlLocateMode() === 'vlm-ui-tars') {\n return adaptDoubaoBbox(bbox, width, height);\n }\n\n if (vlLocateMode() === 'gemini') {\n return adaptGeminiBbox(bbox, width, height);\n }\n\n return adaptQwenBbox(bbox);\n}\n\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: number,\n offsetX = 0,\n offsetY = 0,\n): Rect {\n debugInspectUtils('adaptBboxToRect', bbox, 
width, height, offsetX, offsetY);\n const [left, top, right, bottom] = adaptBbox(bbox, width, height);\n const rect = {\n left: left + offsetX,\n top: top + offsetY,\n width: right - left,\n height: bottom - top,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(size: Size) {\n if (warned) return;\n if (getModelName()?.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(rect: Rect, screenSize: Size) {\n const minEdgeSize = vlLocateMode() === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n rect.left = Math.max(0, rect.left - paddingSizeHorizontal);\n rect.width = Math.min(\n rect.width + paddingSizeHorizontal * 2,\n screenSize.width - rect.left,\n );\n rect.top = Math.max(0, rect.top - paddingSizeVertical);\n rect.height = Math.min(\n rect.height + paddingSizeVertical * 2,\n screenSize.height - rect.top,\n );\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n actionSpace: DeviceAction[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const verb = plan.type;\n\n const action = actionSpace.find((action) => action.name === verb);\n if (!action) {\n console.warn(\n `Cannot convert action ${verb} to yaml flow. 
Will ignore it.`,\n );\n continue;\n }\n\n const locate = plan.locate?.prompt;\n const flowKey = action.interfaceAlias || `${actionSpaceTypePrefix}${verb}`;\n\n const flowItem: MidsceneYamlFlowItem = {\n [flowKey]: locate || '',\n ...(plan.param || {}),\n };\n\n flow.push(flowItem);\n }\n\n if (sleep) {\n flow.push({\n sleep,\n });\n }\n\n return flow;\n}\n"],"names":["AIActionType","actionSpaceTypePrefix","callAiFn","msgs","AIActionTypeValue","jsonObject","callToGetJSONObject","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","vlLocateMode","adaptGeminiBbox","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rect","warned","warnGPT4oSizeLimit","size","_getModelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","actionSpace","sleep","flow","plan","_plan_locate","verb","action","flowKey","flowItem"],"mappings":";;;;;;;AAkCO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;WAAZA;;AAQL,MAAMC,wBAAwB;AAE9B,eAAeC,SACpBC,IAAY,EACZC,iBAA+B;IAE/B,MAAMC,aAAa,MAAMC,oBAAuBH,MAAMC;IAEtD,OAAO;QACL,SAASC,WAAW,OAAO;QAC3B,OAAOA,WAAW,KAAK;IACzB;AACF;AAEA,MAAME,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc;IAGd,IAAKF,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGG,UAAUH,OAAO,IAAI,EAAEC,OAAOC;IAG9C,OAAOF;AACT;AAEO,SAASI,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;QACN,YAAnB,OAAOS,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;KAC1B;IACD,OAAOa;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCJ,KAAa,EACbC,MAAc;IAEdU,OACEX,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOG,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;YAC5CQ,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;SAC7C;QAEH,MAAM,IAAIM,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAA
CF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAIH,IACEc,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;QAEhEc,KAAK,GAAG,CACNT,OACAS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACNR,QACAQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIS,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAGH,MAAMI,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,IAAIkB,AAAmB,oBAAnBA,kBAAsCA,AAAmB,kBAAnBA,gBACxC,OAAOT,gBAAgBN,MAAMJ,OAAOC;IAGtC,IAAIkB,AAAmB,aAAnBA,gBACF,OAAOC,gBAAgBhB,MAAMJ,OAAOC;IAGtC,OAAOE,cAAcC;AACvB;AAEO,SAASgB,gBACdhB,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,MAAMoB,OAAOZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC5C,MAAMsB,MAAMb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC5C,MAAMsB,QAAQd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC7C,MAAMwB,SAASf,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC/C,OAAO;QAACoB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdrB,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdyB,UAAU,CAAC,EACXC,UAAU,CAAC;IAEX/B,kBAAkB,mBAAmBQ,MAAMJ,OAAOC,QAAQyB,SAASC;IACnE,MAAM,CAACN,MAAMC,KAAKC,OAAOC,OAAO,GAAGtB,UAAUE,MAAMJ,OAAOC;IAC1D,MAAM2B,OAAO;QACX,MAAMP,OAAOK;QACb,KAAKJ,MAAMK;QACX,OAAOJ,QAAQF;QACf,QAAQG,SAASF;IACnB;IACA1B,kBAAkB,4BAA4BgC;IAC9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBAAmBC,IAAU;QAEvCC;IADJ,IAAIH,QAAQ;IACZ,IAAI,QAAAG,CAAAA,gBAAAA,cAAa,IAAbA,KAAAA,IAAAA,cAAgB,WAAW,GAAG,QAAQ,CAAC,WAAW;QACpD,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,wFAAwF,CAAC;QAEhN,IACEtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpCtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAU5B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAAS9B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAW/B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYhC,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBAAiBd,IAAU,EAAEe,UAAgB;IAC3D,MAAMC,cAAczB,AAAmB,oBAAnBA,iBAAqC,MAAM;IAC/D,MAAM0B,iBAAiB;IAEvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IACNjB,KAAK,IAAI,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,IAAI,GAAGkB;IACpClB,KAAK,KAAK,GAAGnB,KAAK,GAAG,CACnBmB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA,uBACbH,WAAW,KAAK,GAAGf,KAAK,IAAI;IAE9BA,KAAK,GAAG,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,GAAG,GAAGmB;IAClCnB,KAAK,MAAM,GAAGnB,KAAK,GAAG,CACpBmB,KAA
K,MAAM,GAAGmB,AAAsB,IAAtBA,qBACdJ,WAAW,MAAM,GAAGf,KAAK,GAAG;IAE9B,OAAOA;AACT;AAEO,eAAeoB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCnB,IAAU;IAEV,MAAMoB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtBtB;IACF;IACA,OAAOyB;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,WAA2B,EAC3BC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQJ,MAAO;YAWTK;QAVf,MAAMC,OAAOF,KAAK,IAAI;QAEtB,MAAMG,SAASN,YAAY,IAAI,CAAC,CAACM,SAAWA,OAAO,IAAI,KAAKD;QAC5D,IAAI,CAACC,QAAQ;YACXhC,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAE+B,KAAK,8BAA8B,CAAC;YAE/D;QACF;QAEA,MAAMlE,SAAS,QAAAiE,CAAAA,eAAAA,KAAK,MAAM,AAAD,IAAVA,KAAAA,IAAAA,aAAa,MAAM;QAClC,MAAMG,UAAUD,OAAO,cAAc,IAAI,GAAG7E,wBAAwB4E,MAAM;QAE1E,MAAMG,WAAiC;YACrC,CAACD,QAAQ,EAAEpE,UAAU;YACrB,GAAIgE,KAAK,KAAK,IAAI,CAAC,CAAC;QACtB;QAEAD,KAAK,IAAI,CAACM;IACZ;IAEA,IAAIP,OACFC,KAAK,IAAI,CAAC;QACRD;IACF;IAGF,OAAOC;AACT"}
+ {"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n DeviceAction,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n Rect,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport { callToGetJSONObject } from './service-caller/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport {\n type IModelPreferences,\n getModelName,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { z } from 'zod';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n}\n\nexport const actionSpaceTypePrefix = 'action_space_';\n\nexport async function callAiFn<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const jsonObject = await callToGetJSONObject<T>(\n msgs,\n AIActionTypeValue,\n modelPreferences,\n );\n\n return {\n content: jsonObject.content,\n usage: jsonObject.usage,\n };\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n modelPreferences: IModelPreferences,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(locate.bbox, width, height, modelPreferences);\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? 
Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n modelPreferences: IModelPreferences,\n): [number, number, number, number] {\n if (\n vlLocateMode(modelPreferences) === 'doubao-vision' ||\n vlLocateMode(modelPreferences) === 'vlm-ui-tars'\n ) {\n return adaptDoubaoBbox(bbox, width, height);\n }\n\n if (vlLocateMode(modelPreferences) === 'gemini') {\n return adaptGeminiBbox(bbox, width, height);\n }\n\n return adaptQwenBbox(bbox);\n}\n\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: 
number,\n modelPreferences: IModelPreferences,\n offsetX = 0,\n offsetY = 0,\n): Rect {\n debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);\n const [left, top, right, bottom] = adaptBbox(\n bbox,\n width,\n height,\n modelPreferences,\n );\n const rect = {\n left: left + offsetX,\n top: top + offsetY,\n width: right - left,\n height: bottom - top,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (warned) return;\n if (getModelName(modelPreferences)?.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(\n rect: Rect,\n screenSize: Size,\n modelPreferences: IModelPreferences,\n) {\n const minEdgeSize =\n vlLocateMode(modelPreferences) === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n rect.left = Math.max(0, rect.left - paddingSizeHorizontal);\n rect.width = Math.min(\n rect.width + paddingSizeHorizontal * 2,\n screenSize.width - rect.left,\n );\n rect.top = Math.max(0, rect.top - paddingSizeVertical);\n rect.height = Math.min(\n rect.height + paddingSizeVertical * 2,\n screenSize.height - rect.top,\n );\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n actionSpace: DeviceAction<any>[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const verb = plan.type;\n\n const action = actionSpace.find((action) => action.name === verb);\n if (!action) {\n console.warn(\n `Cannot convert action ${verb} to yaml flow. 
Will ignore it.`,\n );\n continue;\n }\n\n const locate = plan.locate?.prompt;\n const flowKey = action.interfaceAlias || `${actionSpaceTypePrefix}${verb}`;\n\n const flowItem: MidsceneYamlFlowItem = {\n [flowKey]: locate || '',\n ...(plan.param || {}),\n };\n\n flow.push(flowItem);\n }\n\n if (sleep) {\n flow.push({\n sleep,\n });\n }\n\n return flow;\n}\n\n// Zod schemas for shared types\nexport const PointSchema = z.object({\n left: z.number(),\n top: z.number(),\n});\n\nexport const SizeSchema = z.object({\n width: z.number(),\n height: z.number(),\n dpr: z.number().optional(),\n});\n\nexport const RectSchema = PointSchema.and(SizeSchema).and(\n z.object({\n zoom: z.number().optional(),\n }),\n);\n\nexport const MidsceneLocation = z\n .object({\n midscene_location_field_flag: z.literal(true),\n prompt: z.string(),\n center: z.tuple([z.number(), z.number()]),\n rect: RectSchema,\n })\n .passthrough();\n\nexport type MidsceneLocationType = z.infer<typeof MidsceneLocation>;\n\nexport const ifMidsceneLocatorField = (field: any): boolean => {\n // Handle optional fields by getting the inner type\n let actualField = field;\n if (actualField._def?.typeName === 'ZodOptional') {\n actualField = actualField._def.innerType;\n }\n\n // Check if this is a ZodObject with midscene_location_field_flag\n if (actualField._def?.typeName === 'ZodObject') {\n const shape = actualField._def.shape();\n return 'midscene_location_field_flag' in shape;\n }\n\n return false;\n};\n\nexport const findAllMidsceneLocatorField = (\n zodType?: z.ZodType<any>,\n): string[] => {\n if (!zodType) {\n return [];\n }\n\n // Check if this is a ZodObject by checking if it has a shape property\n const zodObject = zodType as any;\n if (zodObject._def?.typeName === 'ZodObject' && zodObject.shape) {\n const keys = Object.keys(zodObject.shape);\n return keys.filter((key) => ifMidsceneLocatorField(zodObject.shape[key]));\n }\n\n // For other ZodType instances, we can't extract field names\n return 
[];\n};\n"],"names":["AIActionType","actionSpaceTypePrefix","callAiFn","msgs","AIActionTypeValue","modelPreferences","jsonObject","callToGetJSONObject","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","vlLocateMode","adaptGeminiBbox","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rect","warned","warnGPT4oSizeLimit","size","_getModelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","actionSpace","sleep","flow","plan","_plan_locate","verb","action","flowKey","flowItem","PointSchema","z","SizeSchema","RectSchema","MidsceneLocation","ifMidsceneLocatorField","field","_actualField__def","_actualField__def1","actualField","shape","findAllMidsceneLocatorField","zodType","_zodObject__def","zodObject","keys","Object","key"],"mappings":";;;;;;;;AAmCO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;WAAZA;;AAQL,MAAMC,wBAAwB;AAE9B,eAAeC,SACpBC,IAAY,EACZC,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAMC,aAAa,MAAMC,oBACvBJ,MACAC,mBACAC;IAGF,OAAO;QACL,SAASC,WAAW,OAAO;QAC3B,OAAOA,WAAW,KAAK;IACzB;AACF;AAEA,MAAME,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc,EACdT,gBAAmC;IAGnC,IAAKO,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGG,UAAUH,OAAO,IAAI,EAAEC,OAAOC,QAAQT;IAGtD,OAAOO;AACT;AAEO,SAASI,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;QACN,YAAnB,OAAOS,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;KAC1B;IACD,OAAOa;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCJ,KAAa,EACbC,MAAc;IAEdU,OACEX,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOG,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;YAC5CQ,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;SAC7C;QAEH,MAAM,IAAIM,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAACF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,A
AAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAIH,IACEc,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;QAEhEc,KAAK,GAAG,CACNT,OACAS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACNR,QACAQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIS,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAGH,MAAMI,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdT,gBAAmC;IAEnC,IACE2B,AAAmC,oBAAnCA,aAAa3B,qBACb2B,AAAmC,kBAAnCA,aAAa3B,mBAEb,OAAOkB,gBAAgBN,MAAMJ,OAAOC;IAGtC,IAAIkB,AAAmC,aAAnCA,aAAa3B,mBACf,OAAO4B,gBAAgBhB,MAAMJ,OAAOC;IAGtC,OAAOE,cAAcC;AACvB;AAEO,SAASgB,gBACdhB,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,MAAMoB,OAAOZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC5C,MAAMsB,MAAMb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC5C,MAAMsB,QAAQd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC7C,MAAMwB,SAASf,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC/C,OAAO;QAACoB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdrB,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdT,gBAAmC,EACnCkC,UAAU,CAAC,EACXC,UAAU,CAAC;IAEX/B,kBAAkB,mBAAmBQ,MAAMJ,OAAOC,QAAQyB,SAASC;IACnE,MAAM,CAACN,MAAMC,KAAKC,OAAOC,OAAO,GAAGtB,UACjCE,MACAJ,OACAC,QACAT;IAEF,MAAMoC,OAAO;QACX,MAAMP,OAAOK;QACb,KAAKJ,MAAMK;QACX,OAAOJ,QAAQF;QACf,QAAQG,SAASF;IACnB;IACA1B,kBAAkB,4BAA4BgC;IAC9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBACdC,IAAU,EACVvC,gBAAmC;QAG/BwC;IADJ,IAAIH,QAAQ;IACZ,IAAI,QAAAG,CAAAA,gBAAAA,aAAaxC,iBAAgB,IAA7BwC,KAAAA,IAAAA,cAAgC,WAAW,GAAG,QAAQ,CAAC,WAAW;QACpE,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,wFAAwF,CAAC;QAEhN,IACEtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpCtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAU5B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAAS9B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAW/B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYhC,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBACdd,IAAU,EACVe,UAAgB,EAChBnD,gBAAmC;IAEnC,MAAMoD,cACJzB,AAAmC,oBAAnCA,aAAa3B,oBAAwC,MAAM;IAC7D,MAAMqD,iBAAiB;IAEvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IACNjB,KAAK,IAAI,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,IAAI,GAAGkB;IACpClB,KAAK,KAAK,GAAGnB,KAAK,GAAG,CACnBmB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA,uBACbH,WAAW,KAAK,GAAGf,KAAK,IAAI;IAE9BA,KAAK,GAAG,GAAGnB,KAAK,GAAG,CAAC,GAAGmB
,KAAK,GAAG,GAAGmB;IAClCnB,KAAK,MAAM,GAAGnB,KAAK,GAAG,CACpBmB,KAAK,MAAM,GAAGmB,AAAsB,IAAtBA,qBACdJ,WAAW,MAAM,GAAGf,KAAK,GAAG;IAE9B,OAAOA;AACT;AAEO,eAAeoB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCnB,IAAU;IAEV,MAAMoB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtBtB;IACF;IACA,OAAOyB;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,WAAgC,EAChCC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQJ,MAAO;YAWTK;QAVf,MAAMC,OAAOF,KAAK,IAAI;QAEtB,MAAMG,SAASN,YAAY,IAAI,CAAC,CAACM,SAAWA,OAAO,IAAI,KAAKD;QAC5D,IAAI,CAACC,QAAQ;YACXhC,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAE+B,KAAK,8BAA8B,CAAC;YAE/D;QACF;QAEA,MAAMlE,SAAS,QAAAiE,CAAAA,eAAAA,KAAK,MAAM,AAAD,IAAVA,KAAAA,IAAAA,aAAa,MAAM;QAClC,MAAMG,UAAUD,OAAO,cAAc,IAAI,GAAG9E,wBAAwB6E,MAAM;QAE1E,MAAMG,WAAiC;YACrC,CAACD,QAAQ,EAAEpE,UAAU;YACrB,GAAIgE,KAAK,KAAK,IAAI,CAAC,CAAC;QACtB;QAEAD,KAAK,IAAI,CAACM;IACZ;IAEA,IAAIP,OACFC,KAAK,IAAI,CAAC;QACRD;IACF;IAGF,OAAOC;AACT;AAGO,MAAMO,cAAcC,EAAE,MAAM,CAAC;IAClC,MAAMA,EAAE,MAAM;IACd,KAAKA,EAAE,MAAM;AACf;AAEO,MAAMC,aAAaD,EAAE,MAAM,CAAC;IACjC,OAAOA,EAAE,MAAM;IACf,QAAQA,EAAE,MAAM;IAChB,KAAKA,EAAE,MAAM,GAAG,QAAQ;AAC1B;AAEO,MAAME,aAAaH,YAAY,GAAG,CAACE,YAAY,GAAG,CACvDD,EAAE,MAAM,CAAC;IACP,MAAMA,EAAE,MAAM,GAAG,QAAQ;AAC3B;AAGK,MAAMG,mBAAmBH,EAAAA,MACvB,CAAC;IACN,8BAA8BA,EAAE,OAAO,CAAC;IACxC,QAAQA,EAAE,MAAM;IAChB,QAAQA,EAAE,KAAK,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG;IACxC,MAAME;AACR,GACC,WAAW;AAIP,MAAME,yBAAyB,CAACC;QAGjCC,mBAKAC;IANJ,IAAIC,cAAcH;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,eACjCE,cAAcA,YAAY,IAAI,CAAC,SAAS;IAI1C,IAAID,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa;QAC9C,MAAME,QAAQD,YAAY,IAAI,CAAC,KAAK;QACpC,OAAO,kCAAkCC;IAC3C;IAEA,OAAO;AACT;AAEO,MAAMC,8BAA8B,CACzCC;QAQIC;IANJ,IAAI,CAACD,SACH,OAAO,EAAE;IAIX,MAAME,YAAYF;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,kBAAAA,UAAU,IAAI,AAAD,IAAbA,KAAAA,IAAAA,gBAAgB,QAAQ,AAAD,MAAM,eAAeC,UAAU,KAAK,EAAE;QAC/D,MAAMC,OAAOC,OAAO,IAAI,CAACF,UAAU,KAAK;QACxC,OAAOC,KAAK,MAAM,CAAC,CAACE,MAAQZ,uBAAuBS,UAAU,KAAK,CAACG,IAAI;IACzE;IAGA,OAAO,EAAE;AACX"}
@@ -1,10 +1,10 @@
- import { call, callAiFnWithStringResponse, callToGetJSONObject, getModelName } from "./service-caller/index.mjs";
+ import { call, callAiFnWithStringResponse, callToGetJSONObject } from "./service-caller/index.mjs";
  import { systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
  import { describeUserPage, elementByPositionWithElementInfo } from "./prompt/util.mjs";
  import { generatePlaywrightTest, generatePlaywrightTestStream } from "./prompt/playwright-generator.mjs";
  import { generateYamlTest, generateYamlTestStream } from "./prompt/yaml-generator.mjs";
  import { AiExtractElementInfo, AiLocateElement, AiLocateSection } from "./inspect.mjs";
  import { plan } from "./llm-planning.mjs";
- import { AIActionType, actionSpaceTypePrefix, adaptBboxToRect, callAiFn } from "./common.mjs";
+ import { AIActionType, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBboxToRect, callAiFn } from "./common.mjs";
  import { resizeImageForUiTars, vlmPlanning } from "./ui-tars-planning.mjs";
- export { AIActionType, AiExtractElementInfo, AiLocateElement, AiLocateSection, actionSpaceTypePrefix, adaptBboxToRect, call as callAi, callAiFn, callAiFnWithStringResponse, callToGetJSONObject, describeUserPage, elementByPositionWithElementInfo, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, getModelName, plan, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
+ export { AIActionType, AiExtractElementInfo, AiLocateElement, AiLocateSection, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBboxToRect, call as callAi, callAiFn, callAiFnWithStringResponse, callToGetJSONObject, describeUserPage, elementByPositionWithElementInfo, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, plan, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
@@ -1,4 +1,4 @@
- import { MIDSCENE_USE_QWEN_VL, getAIConfigInBoolean, vlLocateMode } from "@midscene/shared/env";
+ import { getIsUseQwenVl, vlLocateMode } from "@midscene/shared/env";
  import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@midscene/shared/img";
  import { getDebug } from "@midscene/shared/logger";
  import { assert } from "@midscene/shared/utils";
@@ -57,20 +57,23 @@ const promptsToChatParam = async (multimodalPrompt)=>{
  async function AiLocateElement(options) {
  const { context, targetElementDescription, callAI } = options;
  const { screenshotBase64 } = context;
- const { description, elementById, insertElementByPosition } = await describeUserPage(context);
+ const modelPreferences = {
+ intent: 'grounding'
+ };
+ const { description, elementById, insertElementByPosition } = await describeUserPage(context, modelPreferences);
  assert(targetElementDescription, "cannot find the target element description");
  const userInstructionPrompt = await findElementPrompt.format({
  pageDescription: description,
  targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
  });
- const systemPrompt = systemPromptToLocateElement(vlLocateMode());
+ const systemPrompt = systemPromptToLocateElement(vlLocateMode(modelPreferences));
  let imagePayload = screenshotBase64;
  if (options.searchConfig) {
  assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
  assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
  imagePayload = options.searchConfig.imageBase64;
- } else if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
- else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+ } else if ('qwen-vl' === vlLocateMode(modelPreferences)) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
+ else if (!vlLocateMode(modelPreferences)) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
  const msgs = [
  {
  role: 'system',
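
Note: locate-related helpers now resolve the visual-locate mode per call from an intent-scoped preference rather than a single global read. An illustrative sketch (not part of the package diff):

    // Illustrative sketch, not part of the package diff.
    import { vlLocateMode } from '@midscene/shared/env';

    const modelPreferences = { intent: 'grounding' } as const;
    const mode = vlLocateMode(modelPreferences);
    // values compared against in this diff: 'qwen-vl', 'doubao-vision', 'gemini', 'vlm-ui-tars', or a falsy value
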
@@ -101,7 +104,9 @@ async function AiLocateElement(options) {
  msgs.push(...addOns);
  }
  const callAIFn = callAI || callToGetJSONObject;
- const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);
+ const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, {
+ intent: 'grounding'
+ });
  const rawResponse = JSON.stringify(res.content);
  let resRect;
  let matchedElements = 'elements' in res.content ? res.content.elements : [];
@@ -109,7 +114,7 @@ async function AiLocateElement(options) {
109
114
  try {
110
115
  if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
111
116
  var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
112
- resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
117
+ resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, modelPreferences, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
  debugInspect('resRect', resRect);
  const rectCenter = {
  x: resRect.left + resRect.width / 2,
@@ -150,7 +155,10 @@ async function AiLocateElement(options) {
  async function AiLocateSection(options) {
  const { context, sectionDescription } = options;
  const { screenshotBase64 } = context;
- const systemPrompt = systemPromptToLocateSection(vlLocateMode());
+ const modelPreferences = {
+ intent: 'grounding'
+ };
+ const systemPrompt = systemPromptToLocateSection(vlLocateMode(modelPreferences));
  const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
  sectionDescription: extraTextFromUserPrompt(sectionDescription)
  });
@@ -183,26 +191,30 @@ async function AiLocateSection(options) {
  });
  msgs.push(...addOns);
  }
- const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
+ const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA, {
+ intent: 'grounding'
+ });
  let sectionRect;
  const sectionBbox = result.content.bbox;
  if (sectionBbox) {
- const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height);
+ const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height, modelPreferences);
  debugSection('original targetRect %j', targetRect);
  const referenceBboxList = result.content.references_bbox || [];
  debugSection('referenceBboxList %j', referenceBboxList);
- const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height));
+ const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height, modelPreferences));
  debugSection('referenceRects %j', referenceRects);
  const mergedRect = mergeRects([
  targetRect,
  ...referenceRects
  ]);
  debugSection('mergedRect %j', mergedRect);
- sectionRect = expandSearchArea(mergedRect, context.size);
+ sectionRect = expandSearchArea(mergedRect, context.size, modelPreferences);
  debugSection('expanded sectionRect %j', sectionRect);
  }
  let imageBase64 = screenshotBase64;
- if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL));
+ if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, getIsUseQwenVl({
+ intent: 'grounding'
+ }));
  return {
  rect: sectionRect,
  imageBase64,
@@ -213,10 +225,10 @@ async function AiLocateSection(options) {
  }
  async function AiExtractElementInfo(options) {
  var _options_extractOption;
- const { dataQuery, context, extractOption, multimodalPrompt } = options;
+ const { dataQuery, context, extractOption, multimodalPrompt, modelPreferences } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
- const { description, elementById } = await describeUserPage(context, {
+ const { description, elementById } = await describeUserPage(context, modelPreferences, {
  truncateTextLength: 200,
  filterNonTextContent: false,
  visibleOnly: false,
@@ -256,7 +268,7 @@ async function AiExtractElementInfo(options) {
  });
  msgs.push(...addOns);
  }
- const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
+ const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA, modelPreferences);
  return {
  parseResult: result.content,
  elementById,
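
Taken together, the inspect.ts changes above replace implicit global configuration with an explicit model-preferences object: AiLocateElement and AiLocateSection build { intent: 'grounding' } internally, AiExtractElementInfo now requires modelPreferences in its options, and the object is threaded through describeUserPage, vlLocateMode, adaptBboxToRect, expandSearchArea and the AI call itself, while the Qwen-VL crop check moves from getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL) to getIsUseQwenVl({ intent: 'grounding' }). The sketch below only illustrates that calling pattern; it is not code from the package, and it assumes the @midscene/shared/env exports named in the new source above.

// --- illustrative sketch (not part of the published diff) ---
// Assumes the exports shown above: IModelPreferences, vlLocateMode, getIsUseQwenVl.
import {
  type IModelPreferences,
  getIsUseQwenVl,
  vlLocateMode,
} from '@midscene/shared/env';

// The grounding-related helpers (locate element/section, extract data) pass this
// intent; the planning path in llm-planning.mjs uses { intent: 'planning' } instead.
const modelPreferences: IModelPreferences = { intent: 'grounding' };

// vlLocateMode() previously took no argument; it now receives the preferences
// object, presumably so the resolved mode can differ per intent.
const vlMode = vlLocateMode(modelPreferences); // e.g. 'qwen-vl', or a falsy value on the pure-LLM path

// getIsUseQwenVl() replaces the old getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL) check.
const useQwenVlCrop = getIsUseQwenVl(modelPreferences);
// --- end sketch ---

The same object is also appended as a trailing argument to callAiFn / callToGetJSONObject, which is why the compiled output above shows an extra { intent: 'grounding' } literal at each call site.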
@@ -1 +1 @@
- {"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n MIDSCENE_USE_QWEN_VL,\n getAIConfigInBoolean,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n 
targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlLocateMode());\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? 
(res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlLocateMode());\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(bbox, context.size.width, context.size.height);\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 
'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getAIConfigInBoolean","MIDSCENE_USE_QWEN_VL","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;;;;;;;;AAgEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBP;IAEzBQ,OACEP,0BACA;IAGF,MAAMQ,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0Bf,wBAAwBY;IACpD;IACA,MAAMU,eAAeC,4BAA4BC;IAEjD,IAAIC,eAAeX;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBS,OACET,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFS,OACET,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIc,AAAmB,cAAnBA,gBACTC,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACD,gB
ACVC,eAAe,MAAME,kBACnBb,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOR,0BAAuC;QAChD,MAAMgB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAMC,WACJhB,UAAUiB;IAEZ,MAAMC,MAAM,MAAMF,SAASxB,MAAM2B,aAAa,eAAe;IAE7D,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBACAC,6BAAAA,wBACAC,6BAAAA;YALFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK5B,QAAQ,IAAI,CAAC,KAAK,EACvD6B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK7B,QAAQ,IAAI,CAAC,MAAM,UACzD8B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC7C,aAAa,WAAWsC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiCnC,QAAQ,IAAI,EAAEiC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB5C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE4C,kBAAkB,EAAE,GAAG7C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAekC,4BAA4BhC;IACjD,MAAMiC,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB1D,wBAAwBuD;IAC9C;IACA,MAAMlD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM2C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM1B,mBAAmB;YACtC,QAAQqD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAlD,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAG3B,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACAnD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM;QAErBZ,aAAa,0BAA0BgE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D5D,aAAa,wBAAwBiE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBAAgBuB,MAAMvD,QAAQ,IAAI,CAAC,KAAK,EAAEA,QAAQ,IAAI,CAAC,MAAM;QAExEZ,aAAa,qBAAqBkE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DlE,aAAa,iBAAiBoE;QAG9BN,cAAcQ,iBAAiBF,YAAYxD,QAAQ,IAAI;QACvDZ,aAAa,2BAA2B8D;IAC1C;IAEA,IAAIS,cAAcxD;IAClB,IAAI+C,aACFS,cAAc,MAAMC,WAClBzD,kBACA+C,aACAW,qBAAqBC;IAIzB,OAAO;QACL,MAAMZ
;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAee,qBAGpBhE,OAKD;QA0CKiE;IAzCJ,MAAM,EAAEC,SAAS,EAAEjE,OAAO,EAAEkE,aAAa,EAAE1E,gBAAgB,EAAE,GAAGO;IAChE,MAAMY,eAAewD;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBAAiBP,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAakE,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAEA,MAAME,wBAAwB,MAAMC,uBAClCjE,aACA6D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS2D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAMyB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAE3B,OAAO;QACL,aAAa2B,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF"}
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context, modelPreferences);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n 
const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(\n vlLocateMode(modelPreferences),\n );\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, {\n intent: 'grounding',\n });\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n modelPreferences,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? 
(res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const systemPrompt = systemPromptToLocateSection(\n vlLocateMode(modelPreferences),\n );\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n {\n intent: 'grounding',\n },\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelPreferences);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getIsUseQwenVl({\n intent: 'grounding',\n }),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n modelPreferences: IModelPreferences;\n}) {\n const {\n dataQuery,\n context,\n extractOption,\n multimodalPrompt,\n modelPreferences,\n } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(\n context,\n modelPreferences,\n {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n },\n );\n\n const 
extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelPreferences,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","modelPreferences","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getIsUseQwenVl","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;;;;;;;;AAgEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAM,EAAEC,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBR,SAASI;I
AElCK,OACER,0BACA;IAGF,MAAMS,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0BhB,wBAAwBY;IACpD;IACA,MAAMW,eAAeC,4BACnBC,aAAaV;IAGf,IAAIW,eAAeZ;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBU,OACEV,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFU,OACEV,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFgB,eAAehB,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIe,AAAmC,cAAnCA,aAAaV,mBACtBW,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACD,aAAaV,mBACvBW,eAAe,MAAME,kBACnBd,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOT,0BAAuC;QAChD,MAAMiB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAMC,WACJjB,UAAUkB;IAEZ,MAAMC,MAAM,MAAMF,SAASzB,MAAM4B,aAAa,eAAe,EAAE;QAC7D,QAAQ;IACV;IAEA,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBAEAC,6BAAAA,wBACAC,6BAAAA;YANFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK7B,QAAQ,IAAI,CAAC,KAAK,EACvD8B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK9B,QAAQ,IAAI,CAAC,MAAM,EACzDI,kBAAAA,QACA2B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC9C,aAAa,WAAWuC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiCpC,QAAQ,IAAI,EAAEkC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB7C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE6C,kBAAkB,EAAE,GAAG9C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAMQ,eAAekC,4BACnBhC,aAAaV;IAEf,MAAM2C,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB3D,wBAAwBwD;IAC9C;IACA,MAAMnD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM4C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQsD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAnD,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBxD,MACA4B,aAAa,YAAY,EACzB;QACE,QAAQ;IACV;IAGF,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACApD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFhB,aAAa,0BAA0BiE;Q
AEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D7D,aAAa,wBAAwBkE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBACLuB,MACAxD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNhB,aAAa,qBAAqBmE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DnE,aAAa,iBAAiBqE;QAG9BN,cAAcQ,iBAAiBF,YAAYzD,QAAQ,IAAI,EAAEI;QACzDhB,aAAa,2BAA2B+D;IAC1C;IAEA,IAAIS,cAAczD;IAClB,IAAIgD,aACFS,cAAc,MAAMC,WAClB1D,kBACAgD,aACAW,eAAe;QACb,QAAQ;IACV;IAIJ,OAAO;QACL,MAAMX;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAGpBhE,OAMD;QAoDKiE;IAnDJ,MAAM,EACJC,SAAS,EACTjE,OAAO,EACPkE,aAAa,EACb1E,gBAAgB,EAChBY,gBAAgB,EACjB,GAAGL;IACJ,MAAMa,eAAeuD;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEK,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBACzCR,SACAI,kBACA;QACE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAa8D,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAGF,MAAME,wBAAwB,MAAMC,uBAClChE,aACA4D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS0D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBxD,MACA4B,aAAa,YAAY,EACzBlB;IAEF,OAAO;QACL,aAAa6C,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF"}
@@ -1,27 +1,32 @@
  import { vlLocateMode } from "@midscene/shared/env";
  import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
+ import { getDebug } from "@midscene/shared/logger";
  import { assert } from "@midscene/shared/utils";
- import { AIActionType, buildYamlFlowFromPlans, callAiFn, fillBboxParam, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
+ import { AIActionType, buildYamlFlowFromPlans, callAiFn, fillBboxParam, findAllMidsceneLocatorField, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
  import { automationUserPrompt, generateTaskBackgroundContext, systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
  import { describeUserPage } from "./prompt/util.mjs";
+ const debug = getDebug('planning');
  async function plan(userInstruction, opts) {
  var _planFromAI_action;
  const { callAI, context } = opts || {};
  const { screenshotBase64, size } = context;
- const { description: pageDescription, elementById } = await describeUserPage(context);
+ const modelPreferences = {
+ intent: 'planning'
+ };
+ const { description: pageDescription, elementById } = await describeUserPage(context, modelPreferences);
  const systemPrompt = await systemPromptToTaskPlanning({
  actionSpace: opts.actionSpace,
- vlMode: vlLocateMode()
+ vlMode: vlLocateMode(modelPreferences)
  });
  const taskBackgroundContextText = generateTaskBackgroundContext(userInstruction, opts.log, opts.actionContext);
- const userInstructionPrompt = await automationUserPrompt(vlLocateMode()).format({
+ const userInstructionPrompt = await automationUserPrompt(vlLocateMode(modelPreferences)).format({
  pageDescription,
  taskBackgroundContext: taskBackgroundContextText
  });
  let imagePayload = screenshotBase64;
- if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
- else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
- warnGPT4oSizeLimit(size);
+ if ('qwen-vl' === vlLocateMode(modelPreferences)) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
+ else if (!vlLocateMode(modelPreferences)) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+ warnGPT4oSizeLimit(size, modelPreferences);
  const msgs = [
  {
  role: 'system',
@@ -45,7 +50,7 @@ async function plan(userInstruction, opts) {
  }
  ];
  const call = callAI || callAiFn;
- const { content, usage } = await call(msgs, AIActionType.PLAN);
+ const { content, usage } = await call(msgs, AIActionType.PLAN, modelPreferences);
  const rawResponse = JSON.stringify(content, void 0, 2);
  const planFromAI = content;
  const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
@@ -59,24 +64,22 @@ async function plan(userInstruction, opts) {
  yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
  };
  assert(planFromAI, "can't get plans from AI");
- if (vlLocateMode()) {
- actions.forEach((action)=>{
- if (action.locate) try {
- action.locate = fillBboxParam(action.locate, size.width, size.height);
- } catch (e) {
- throw new Error(`Failed to fill locate param: ${planFromAI.error} (${e instanceof Error ? e.message : 'unknown error'})`, {
- cause: e
- });
+ actions.forEach((action)=>{
+ const type = action.type;
+ const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
+ const locateFields = actionInActionSpace ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema) : [];
+ debug('locateFields', locateFields);
+ locateFields.forEach((field)=>{
+ const locateResult = action.param[field];
+ if (locateResult) if (vlLocateMode(modelPreferences)) action.param[field] = fillBboxParam(locateResult, size.width, size.height, modelPreferences);
+ else {
+ const element = elementById(locateResult);
+ if (element) action.param[field].id = element.id;
  }
+ action.locate = action.param[field];
  });
- assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
- } else actions.forEach((action)=>{
- var _action_locate;
- if (null == (_action_locate = action.locate) ? void 0 : _action_locate.id) {
- const element = elementById(action.locate.id);
- if (element) action.locate.id = element.id;
- }
  });
+ assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
  if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
  return returnValue;
  }
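
The planning change is more than a signature update: instead of special-casing a single action.locate field and branching on VL mode around the whole loop, plan() now looks up each planned action in the provided actionSpace, collects every locator field from its paramSchema via findAllMidsceneLocatorField, and normalizes each field in place (bbox to rect for VL models, index id to element id otherwise), with the error assertion running unconditionally afterwards. The sketch below restates that loop for readability only; it is not the shipped implementation, its types are loosened to any, and the imports simply mirror the dist paths shown in the diff.

// --- simplified restatement of the new locate-field handling (illustrative only) ---
import { vlLocateMode } from '@midscene/shared/env';
import { fillBboxParam, findAllMidsceneLocatorField } from './common.mjs';

function resolveLocateParams(
  actions: any[],
  actionSpace: any[],
  size: { width: number; height: number },
  modelPreferences: { intent: 'planning' },
  elementById: (id: string) => any,
) {
  for (const action of actions) {
    // Look up the action definition to learn which params are Midscene locator fields.
    const spec = actionSpace.find((a) => a.name === action.type);
    const locateFields: string[] = spec
      ? findAllMidsceneLocatorField(spec.paramSchema)
      : [];

    for (const field of locateFields) {
      const locateResult = action.param[field];
      if (locateResult) {
        if (vlLocateMode(modelPreferences)) {
          // VL models answer with a bbox; convert it into a rect in page pixels.
          action.param[field] = fillBboxParam(
            locateResult,
            size.width,
            size.height,
            modelPreferences,
          );
        } else {
          // Non-VL models answer with an element id (possibly an index id);
          // re-resolve it against the page description.
          const element = elementById(locateResult);
          if (element) action.param[field].id = element.id;
        }
      }
      // Mirrors the source comment: kept so web-integration code that still reads
      // action.locate keeps working.
      action.locate = action.param[field];
    }
  }
}
// --- end sketch ---

One behavioral consequence visible in the diff: assert(!planFromAI.error, ...) used to run only in the VL branch, while the non-VL branch silently remapped ids; after the change the id remapping happens per locator field and the error assertion applies in both modes.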
@@ -1 +1 @@
- {"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n const { description: pageDescription, elementById } =\n await describeUserPage(context);\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(msgs, AIActionType.PLAN);\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n if (vlLocateMode()) {\n actions.forEach((action) => {\n if (action.locate) {\n try {\n action.locate = fillBboxParam(action.locate, size.width, size.height);\n } catch (e) {\n throw new Error(\n `Failed to fill locate param: ${planFromAI.error} (${\n e instanceof Error ? e.message : 'unknown error'\n })`,\n {\n cause: e,\n },\n );\n }\n }\n });\n // in Qwen-VL, error means error. 
In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n } else {\n actions.forEach((action) => {\n if (action.locate?.id) {\n // The model may return indexId, need to perform a query correction to avoid exceptions\n const element = elementById(action.locate.id);\n if (element) {\n action.locate.id = element.id;\n }\n }\n });\n }\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","fillBboxParam","e","Error","_action_locate","element","console"],"mappings":";;;;;;AAyBO,eAAeA,KACpBC,eAAuB,EACvBC,IAOC;QA8DEC;IA5DH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IACnC,MAAM,EAAE,aAAaG,eAAe,EAAEC,WAAW,EAAE,GACjD,MAAMC,iBAAiBL;IAEzB,MAAMM,eAAe,MAAMC,2BAA2B;QACpD,aAAaV,KAAK,WAAW;QAC7B,QAAQW;IACV;IACA,MAAMC,4BAA4BC,8BAChCd,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMc,wBAAwB,MAAMC,qBAClCJ,gBACA,MAAM,CAAC;QACPL;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeZ;IACnB,IAAIO,AAAmB,cAAnBA,gBACFK,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACL,gBACVK,eAAe,MAAME,kBACnBd,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBgB,mBAAmBd;IAEnB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOnB,UAAUoB;IACvB,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAAKD,MAAMK,aAAa,IAAI;IAC7D,MAAMC,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH7B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC4B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,uBACRF,SACA9B,KAAK,WAAW,EAChB6B,WAAW,KAAK;IAEpB;IAEAI,OAAOJ,YAAY;IAEnB,IAAIlB,gBAAgB;QAClBmB,QAAQ,OAAO,CAAC,CAACI;YACf,IAAIA,OAAO,MAAM,EACf,IAAI;gBACFA,OAAO,MAAM,GAAGC,cAAcD,OAAO,MAAM,EAAE7B,KAAK,KAAK,EAAEA,KAAK,MAAM;YACtE,EAAE,OAAO+B,GAAG;gBACV,MAAM,IAAIC,MACR,CAAC,6BAA6B,EAAER,WAAW,KAAK,CAAC,EAAE,EACjDO,aAAaC,QAAQD,EAAE,OAAO,GAAG,gBAClC,CAAC,CAAC,EACH;oBACE,OAAOA;gBACT;YAEJ;QAEJ;QAEAH,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IACzE,OACEC,QAAQ,OAAO,CAAC,CAACI;YACXI;QAAJ,IAAI,QAAAA,CAAAA,iBAAAA,OAAO,MAAM,AAAD,IAAZA,KAAAA,IAAAA,eAAe,EAAE,EAAE;YAErB,MAAMC,UAAUhC,YAAY2B,OAAO,MAAM,CAAC,EAAE;YAC5C,IAAIK,SACFL,OAAO,MAAM,CAAC,EAAE,GAAGK,QAAQ,EAAE;QAEjC;IACF;IAGF,IACET,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBS,QAAQ,IAAI,CACV,8EACAzC;IAIJ,OAAOgC;AACT"}
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { type IModelPreferences, vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n findAllMidsceneLocatorField,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction<any>[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'planning',\n };\n const { description: pageDescription, elementById } = await describeUserPage(\n context,\n modelPreferences,\n );\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(modelPreferences),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(modelPreferences),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size, modelPreferences);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(\n msgs,\n AIActionType.PLAN,\n modelPreferences,\n );\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n // TODO: use zod.parse to parse the action.param, and then fill the bbox param.\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n const locateFields = actionInActionSpace\n ? 
findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (vlLocateMode(modelPreferences)) {\n action.param[field] = fillBboxParam(\n locateResult,\n size.width,\n size.height,\n modelPreferences,\n );\n } else {\n const element = elementById(locateResult);\n if (element) {\n action.param[field].id = element.id;\n }\n }\n }\n\n // to be compatible with the web-integration\n action.locate = action.param[field];\n });\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","modelPreferences","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","element","console"],"mappings":";;;;;;;AA2BA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAOC;QAwEEC;IAtEH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IAEnC,MAAMG,mBAAsC;QAC1C,QAAQ;IACV;IACA,MAAM,EAAE,aAAaC,eAAe,EAAEC,WAAW,EAAE,GAAG,MAAMC,iBAC1DN,SACAG;IAGF,MAAMI,eAAe,MAAMC,2BAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,QAAQY,aAAaN;IACvB;IACA,MAAMO,4BAA4BC,8BAChCf,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMe,wBAAwB,MAAMC,qBAClCJ,aAAaN,mBACb,MAAM,CAAC;QACPC;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeb;IACnB,IAAIQ,AAAmC,cAAnCA,aAAaN,mBACfW,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACL,aAAaN,mBACvBW,eAAe,MAAME,kBACnBf,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBiB,mBAAmBf,MAAMC;IAEzB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOpB,UAAUqB;IACvB,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAC/BD,MACAK,aAAa,IAAI,EACjBpB;IAEF,MAAMqB,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH9B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC6B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,uBACRF,SACA/B,KAAK,WAAW,EAChB8B,WAAW,KAAK;IAEpB;IAEAI,OAAOJ,YAAY;IAGnBC,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBrC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACmC,SAAWA,OAAO,IAAI,KAAKC;QAE9B,MAAME,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENzC,MAAM,gBAAgB0C;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,cACF,IAAI7B,aAAaN,mBACf6B,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACApC,KAAK,KAAK,EACVA,KAAK,MAAM,EACXC;iBAEG;gBACL,MAAMqC,UAAUnC,YAAYiC;gBAC5B,IAAIE,SACFR,OAAO,KAAK,CAACK,MAAM,CA
AC,EAAE,GAAGG,QAAQ,EAAE;YAEvC;YAIFR,OAAO,MAAM,GAAGA,OAAO,KAAK,CAACK,MAAM;QACrC;IACF;IAEAN,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IAEvE,IACEC,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACA7C;IAIJ,OAAOiC;AACT"}