@midscene/core 0.28.12-beta-20250924091555.0 → 0.29.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/utils.mjs +1 -1
- package/dist/es/ai-model/common.mjs +10 -25
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/inspect.mjs +7 -30
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +4 -29
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +2 -2
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/insight/index.mjs +1 -2
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/lib/agent/utils.js +1 -1
- package/dist/lib/ai-model/common.js +9 -27
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +7 -30
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +4 -29
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +2 -2
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/insight/index.js +1 -2
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model/common.d.ts +3 -4
- package/package.json +3 -3
package/dist/es/agent/utils.mjs
CHANGED
@@ -139,7 +139,7 @@ function trimContextByViewport(execution) {
 }) : execution.tasks
 };
 }
-const getMidsceneVersion = ()=>"0.28.12-beta-20250924091555.0";
+const getMidsceneVersion = ()=>"0.29.0";
 const parsePrompt = (prompt)=>{
 if ('string' == typeof prompt) return {
 textPrompt: prompt,

package/dist/es/ai-model/common.mjs
CHANGED
@@ -15,12 +15,12 @@ var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
 }({});
 const defaultBboxSize = 20;
 const debugInspectUtils = getDebug('ai:common');
-function fillBboxParam(locate, width, height, rightLimit, bottomLimit, vlMode) {
+function fillBboxParam(locate, width, height, vlMode) {
 if (locate.bbox_2d && !(null == locate ? void 0 : locate.bbox)) {
 locate.bbox = locate.bbox_2d;
 delete locate.bbox_2d;
 }
-if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height, rightLimit, bottomLimit, vlMode);
+if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height, vlMode);
 return locate;
 }
 function adaptQwenBbox(bbox) {
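
In readable form, the new fillBboxParam is the following TypeScript, quoted from the sourcesContent of the updated common.mjs.map further down in this diff. The rightLimit/bottomLimit parameters are gone and bbox adaptation is driven only by the model's vlMode:

export function fillBboxParam(
  locate: PlanningLocateParam,
  width: number,
  height: number,
  vlMode: TVlModeTypes | undefined,
) {
  // The Qwen model might have hallucinations of naming bbox as bbox_2d.
  if ((locate as any).bbox_2d && !locate?.bbox) {
    locate.bbox = (locate as any).bbox_2d;
    // biome-ignore lint/performance/noDelete: <explanation>
    delete (locate as any).bbox_2d;
  }

  if (locate?.bbox) {
    locate.bbox = adaptBbox(locate.bbox, width, height, vlMode);
  }

  return locate;
}
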
@@ -82,25 +82,10 @@ function adaptDoubaoBbox(bbox, width, height) {
 const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
 throw new Error(msg);
 }
-function adaptBbox(bbox, width, height, rightLimit, bottomLimit, vlMode) {
-let result = [
-0,
-0,
-0,
-0
-];
-result = 'doubao-vision' === vlMode || 'vlm-ui-tars' === vlMode ? adaptDoubaoBbox(bbox, width, height) : 'gemini' === vlMode ? adaptGeminiBbox(bbox, width, height) : 'qwen3-vl' === vlMode ? normalized01000(bbox, width, height) : adaptQwenBbox(bbox);
-result[2] = Math.min(result[2], rightLimit);
-result[3] = Math.min(result[3], bottomLimit);
-return result;
-}
-function normalized01000(bbox, width, height) {
-return [
-Math.round(bbox[0] * width / 1000),
-Math.round(bbox[1] * height / 1000),
-Math.round(bbox[2] * width / 1000),
-Math.round(bbox[3] * height / 1000)
-];
+function adaptBbox(bbox, width, height, vlMode) {
+if ('doubao-vision' === vlMode || 'vlm-ui-tars' === vlMode) return adaptDoubaoBbox(bbox, width, height);
+if ('gemini' === vlMode) return adaptGeminiBbox(bbox, width, height);
+return adaptQwenBbox(bbox);
 }
 function adaptGeminiBbox(bbox, width, height) {
 const left = Math.round(bbox[1] * width / 1000);
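
The corresponding TypeScript, again quoted from the updated source map below, shows the new shape of adaptBbox: a plain dispatch on vlMode, with the scratch [0, 0, 0, 0] array, the rightLimit/bottomLimit clamping, and the qwen3-vl/normalized01000 branch all removed:

export function adaptBbox(
  bbox: number[],
  width: number,
  height: number,
  vlMode: TVlModeTypes | undefined,
): [number, number, number, number] {
  if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {
    return adaptDoubaoBbox(bbox, width, height);
  }

  if (vlMode === 'gemini') {
    return adaptGeminiBbox(bbox, width, height);
  }

  return adaptQwenBbox(bbox);
}
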
@@ -114,9 +99,9 @@ function adaptGeminiBbox(bbox, width, height) {
 bottom
 ];
 }
-function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0, rightLimit = width, bottomLimit = height, vlMode) {
-debugInspectUtils('adaptBboxToRect', bbox, width, height, 'offset', offsetX, offsetY, 'limit', rightLimit, bottomLimit, 'vlMode', vlMode);
-const [left, top, right, bottom] = adaptBbox(bbox, width, height, rightLimit, bottomLimit, vlMode);
+function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0, vlMode) {
+debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY, vlMode);
+const [left, top, right, bottom] = adaptBbox(bbox, width, height, vlMode);
 const rectLeft = left;
 const rectTop = top;
 let rectWidth = right - left;
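
adaptBboxToRect changes in the same way: the signature drops the rightLimit/bottomLimit parameters (and the 'offset'/'limit'/'vlMode' labels in the debug call), while the clamping against the image size further down in the function is untouched. The head of the new function, quoted from the updated source map and truncated at the unchanged part:

export function adaptBboxToRect(
  bbox: number[],
  width: number,
  height: number,
  offsetX = 0,
  offsetY = 0,
  vlMode?: TVlModeTypes | undefined,
): Rect {
  debugInspectUtils(
    'adaptBboxToRect',
    bbox,
    width,
    height,
    offsetX,
    offsetY,
    vlMode,
  );
  const [left, top, right, bottom] = adaptBbox(bbox, width, height, vlMode);
  // ... the rest (clip the rect to the image, enforce a minimum size of 1, apply offsetX/offsetY) is unchanged
}

For example, with vlMode undefined the bbox is treated as absolute pixels via adaptQwenBbox, so adaptBboxToRect([100, 200, 300, 400], 1280, 720) now yields { left: 100, top: 200, width: 200, height: 200 } with no extra clamping against a caller-supplied rightLimit/bottomLimit.
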
@@ -325,6 +310,6 @@ const loadActionParam = (jsonObject, zodSchema)=>{
 }
 return result;
 };
-export { common_AIActionType as AIActionType, PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, dumpActionParam, dumpMidsceneLocatorField, expandSearchArea, fillBboxParam, findAllMidsceneLocatorField, getMidsceneLocationSchema, ifMidsceneLocatorField, loadActionParam, markupImageForLLM, mergeRects, normalized01000, warnGPT4oSizeLimit };
+export { common_AIActionType as AIActionType, PointSchema, RectSchema, SizeSchema, TMultimodalPromptSchema, TUserPromptSchema, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, dumpActionParam, dumpMidsceneLocatorField, expandSearchArea, fillBboxParam, findAllMidsceneLocatorField, getMidsceneLocationSchema, ifMidsceneLocatorField, loadActionParam, markupImageForLLM, mergeRects, warnGPT4oSizeLimit };
 
 //# sourceMappingURL=common.mjs.map
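
The only change to the export list is that normalized01000, the 0-1000-to-pixel helper used by the removed qwen3-vl branch, is no longer exported, since the function itself was deleted above. For anyone who imported it from this dist file and needs to inline it, its previous body, quoted from the old source map, was:

// x1, y1, x2, y2 -> 0-1000
export function normalized01000(
  bbox: number[],
  width: number,
  height: number,
): [number, number, number, number] {
  return [
    Math.round((bbox[0] * width) / 1000),
    Math.round((bbox[1] * height) / 1000),
    Math.round((bbox[2] * width) / 1000),
    Math.round((bbox[3] * height) / 1000),
  ];
}
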

package/dist/es/ai-model/common.mjs.map
CHANGED
@@ -1 +1 @@
-
{"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n DeviceAction,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n Rect,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { z } from 'zod';\n\nexport type AIArgs = ChatCompletionMessageParam[];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n TEXT = 5,\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n rightLimit: number,\n bottomLimit: number,\n vlMode: TVlModeTypes | undefined,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(\n locate.bbox,\n width,\n height,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? 
Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n rightLimit: number,\n bottomLimit: number,\n vlMode: TVlModeTypes | undefined,\n): [number, number, number, number] {\n let result: [number, number, number, number] = [0, 0, 0, 0];\n if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {\n result = adaptDoubaoBbox(bbox, width, height);\n } else if (vlMode === 'gemini') {\n result = adaptGeminiBbox(bbox, width, height);\n } else if (vlMode === 'qwen3-vl') {\n result = normalized01000(bbox, width, height);\n } else {\n result = adaptQwenBbox(bbox);\n }\n\n result[2] = Math.min(result[2], rightLimit);\n result[3] = Math.min(result[3], bottomLimit);\n\n return result;\n}\n\n// x1, y1, x2, y2 -> 0-1000\nexport function normalized01000(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n return [\n Math.round((bbox[0] * width) / 1000),\n 
Math.round((bbox[1] * height) / 1000),\n Math.round((bbox[2] * width) / 1000),\n Math.round((bbox[3] * height) / 1000),\n ];\n}\n\n// y1, x1, y2, x2 -> 0-1000\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: number,\n offsetX = 0,\n offsetY = 0,\n rightLimit = width,\n bottomLimit = height,\n vlMode?: TVlModeTypes | undefined,\n): Rect {\n debugInspectUtils(\n 'adaptBboxToRect',\n bbox,\n width,\n height,\n 'offset',\n offsetX,\n offsetY,\n 'limit',\n rightLimit,\n bottomLimit,\n 'vlMode',\n vlMode,\n );\n const [left, top, right, bottom] = adaptBbox(\n bbox,\n width,\n height,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n\n // Calculate initial rect dimensions\n const rectLeft = left;\n const rectTop = top;\n let rectWidth = right - left;\n let rectHeight = bottom - top;\n\n // Ensure the rect doesn't exceed image boundaries\n // If right edge exceeds width, adjust the width\n if (rectLeft + rectWidth > width) {\n rectWidth = width - rectLeft;\n }\n\n // If bottom edge exceeds height, adjust the height\n if (rectTop + rectHeight > height) {\n rectHeight = height - rectTop;\n }\n\n // Ensure minimum dimensions (width and height should be at least 1)\n rectWidth = Math.max(1, rectWidth);\n rectHeight = Math.max(1, rectHeight);\n\n const rect = {\n left: rectLeft + offsetX,\n top: rectTop + offsetY,\n width: rectWidth,\n height: rectHeight,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(size: Size, modelName: string) {\n if (warned) return;\n if (modelName.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your interface to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(\n rect: Rect,\n screenSize: Size,\n vlMode: TVlModeTypes | undefined,\n) {\n const minEdgeSize = vlMode === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n // Calculate padding needed to reach minimum edge size\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? 
Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n\n // Calculate new dimensions (ensure minimum edge size)\n let newWidth = Math.max(minEdgeSize, rect.width + paddingSizeHorizontal * 2);\n let newHeight = Math.max(minEdgeSize, rect.height + paddingSizeVertical * 2);\n\n // Calculate initial position with padding\n let newLeft = rect.left - paddingSizeHorizontal;\n let newTop = rect.top - paddingSizeVertical;\n\n // Ensure the rect doesn't exceed screen boundaries by adjusting position\n // If the rect goes beyond the right edge, shift it left\n if (newLeft + newWidth > screenSize.width) {\n newLeft = screenSize.width - newWidth;\n }\n\n // If the rect goes beyond the bottom edge, shift it up\n if (newTop + newHeight > screenSize.height) {\n newTop = screenSize.height - newHeight;\n }\n\n // Ensure the rect doesn't go beyond the left/top edges\n newLeft = Math.max(0, newLeft);\n newTop = Math.max(0, newTop);\n\n // If after position adjustment, the rect still exceeds screen boundaries,\n // clamp the dimensions to fit within screen\n if (newLeft + newWidth > screenSize.width) {\n newWidth = screenSize.width - newLeft;\n }\n if (newTop + newHeight > screenSize.height) {\n newHeight = screenSize.height - newTop;\n }\n\n rect.left = newLeft;\n rect.top = newTop;\n rect.width = newWidth;\n rect.height = newHeight;\n\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n actionSpace: DeviceAction<any>[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const verb = plan.type;\n\n const action = actionSpace.find((action) => action.name === verb);\n if (!action) {\n console.warn(\n `Cannot convert action ${verb} to yaml flow. Will ignore it.`,\n );\n continue;\n }\n\n const flowKey = action.interfaceAlias || verb;\n const flowParam = action.paramSchema\n ? 
dumpActionParam(plan.param || {}, action.paramSchema)\n : {};\n\n const flowItem: MidsceneYamlFlowItem = {\n [flowKey]: '',\n ...flowParam,\n };\n\n flow.push(flowItem);\n }\n\n if (sleep) {\n flow.push({\n sleep,\n });\n }\n\n return flow;\n}\n\n// Zod schemas for shared types\nexport const PointSchema = z.object({\n left: z.number(),\n top: z.number(),\n});\n\nexport const SizeSchema = z.object({\n width: z.number(),\n height: z.number(),\n dpr: z.number().optional(),\n});\n\nexport const RectSchema = PointSchema.and(SizeSchema).and(\n z.object({\n zoom: z.number().optional(),\n }),\n);\n\n// Zod schema for TMultimodalPrompt\nexport const TMultimodalPromptSchema = z.object({\n images: z\n .array(\n z.object({\n name: z.string(),\n url: z.string(),\n }),\n )\n .optional(),\n convertHttpImage2Base64: z.boolean().optional(),\n});\n\n// Zod schema for TUserPrompt\nexport const TUserPromptSchema = z.union([\n z.string(),\n z\n .object({\n prompt: z.string(),\n })\n .and(TMultimodalPromptSchema.partial()),\n]);\n\n// Generate TypeScript types from Zod schemas\nexport type TMultimodalPrompt = z.infer<typeof TMultimodalPromptSchema>;\nexport type TUserPrompt = z.infer<typeof TUserPromptSchema>;\n\nconst locateFieldFlagName = 'midscene_location_field_flag';\n\nconst MidsceneLocationResult = z\n .object({\n [locateFieldFlagName]: z.literal(true),\n prompt: TUserPromptSchema,\n\n // optional fields\n deepThink: z.boolean().optional(), // only available in vl model\n cacheable: z.boolean().optional(),\n xpath: z.boolean().optional(), // preset result for xpath\n\n // these two fields will only appear in the result\n center: z.tuple([z.number(), z.number()]),\n rect: RectSchema,\n })\n .passthrough();\n\nexport type MidsceneLocationResultType = z.infer<typeof MidsceneLocationResult>;\nexport const getMidsceneLocationSchema = () => {\n return MidsceneLocationResult;\n};\n\nexport const ifMidsceneLocatorField = (field: any): boolean => {\n // Handle optional fields by getting the inner type\n let actualField = field;\n if (actualField._def?.typeName === 'ZodOptional') {\n actualField = actualField._def.innerType;\n }\n\n // Check if this is a ZodUnion (the new MidsceneLocation structure)\n if (actualField._def?.typeName === 'ZodObject') {\n const shape = actualField._def.shape();\n return locateFieldFlagName in shape;\n }\n\n return false;\n};\n\nexport const dumpMidsceneLocatorField = (field: any): string => {\n assert(\n ifMidsceneLocatorField(field),\n 'field is not a midscene locator field',\n );\n\n // If field is a string, return it directly\n if (typeof field === 'string') {\n return field;\n }\n\n // If field is an object with prompt property\n if (field && typeof field === 'object' && field.prompt) {\n // If prompt is a string, return it directly\n if (typeof field.prompt === 'string') {\n return field.prompt;\n }\n // If prompt is a TUserPrompt object, extract the prompt string\n if (typeof field.prompt === 'object' && field.prompt.prompt) {\n return field.prompt.prompt; // TODO: dump images if necessary\n }\n }\n\n // Fallback: try to convert to string\n return String(field);\n};\n\nexport const findAllMidsceneLocatorField = (\n zodType?: z.ZodType<any>,\n requiredOnly?: boolean,\n): string[] => {\n if (!zodType) {\n return [];\n }\n\n // Check if this is a ZodObject by checking if it has a shape property\n const zodObject = zodType as any;\n if (zodObject._def?.typeName === 'ZodObject' && zodObject.shape) {\n const keys = Object.keys(zodObject.shape);\n return keys.filter((key) => {\n const 
field = zodObject.shape[key];\n if (!ifMidsceneLocatorField(field)) {\n return false;\n }\n\n // If requiredOnly is true, filter out optional fields\n if (requiredOnly) {\n return field._def?.typeName !== 'ZodOptional';\n }\n\n return true;\n });\n }\n\n // For other ZodType instances, we can't extract field names\n return [];\n};\n\nexport const dumpActionParam = (\n jsonObject: Record<string, any>,\n zodSchema: z.ZodType<any>,\n): Record<string, any> => {\n const locatorFields = findAllMidsceneLocatorField(zodSchema);\n const result = { ...jsonObject };\n\n for (const fieldName of locatorFields) {\n const fieldValue = result[fieldName];\n if (fieldValue) {\n // If it's already a string, keep it as is\n if (typeof fieldValue === 'string') {\n result[fieldName] = fieldValue;\n } else if (typeof fieldValue === 'object') {\n // Check if this field is actually a MidsceneLocationType object\n if (fieldValue.prompt) {\n // If prompt is a string, use it directly\n if (typeof fieldValue.prompt === 'string') {\n result[fieldName] = fieldValue.prompt;\n } else if (\n typeof fieldValue.prompt === 'object' &&\n fieldValue.prompt.prompt\n ) {\n // If prompt is a TUserPrompt object, extract the prompt string\n result[fieldName] = fieldValue.prompt.prompt;\n }\n }\n }\n }\n }\n\n return result;\n};\n\nexport const loadActionParam = (\n jsonObject: Record<string, any>,\n zodSchema: z.ZodType<any>,\n): Record<string, any> => {\n const locatorFields = findAllMidsceneLocatorField(zodSchema);\n const result = { ...jsonObject };\n\n for (const fieldName of locatorFields) {\n const fieldValue = result[fieldName];\n if (fieldValue && typeof fieldValue === 'string') {\n result[fieldName] = {\n [locateFieldFlagName]: true,\n prompt: fieldValue,\n };\n }\n }\n\n return result;\n};\n"],"names":["AIActionType","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","rightLimit","bottomLimit","vlMode","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","adaptGeminiBbox","normalized01000","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rectLeft","rectTop","rectWidth","rectHeight","rect","warned","warnGPT4oSizeLimit","size","modelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","newWidth","newHeight","newLeft","newTop","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","actionSpace","sleep","flow","plan","verb","action","flowKey","flowParam","dumpActionParam","flowItem","PointSchema","z","SizeSchema","RectSchema","TMultimodalPromptSchema","TUserPromptSchema","locateFieldFlagName","MidsceneLocationResult","getMidsceneLocationSchema","ifMidsceneLocatorField","field","_actualField__def","_actualField__def1","actualField","shape","dumpMidsceneLocatorField","String","findAllMidsceneLocatorField","zodType","requiredOnly","_zodObject__def","zodObject","keys","Object","key","_field__def","jsonObject","zodSchema","locatorFields","fieldName","fieldValue","loadActionParam"],"mappings":";;;;;;AAwBO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;;WAAZA;;AASZ,MAAMC,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc,EACdC,UAAkB,EAClBC,WAAmB,EACnBC,MAAgC
;IAGhC,IAAKL,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGM,UACZN,OAAO,IAAI,EACXC,OACAC,QACAC,YACAC,aACAC;IAIJ,OAAOL;AACT;AAEO,SAASO,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGZ;QACN,YAAnB,OAAOY,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGZ;KAC1B;IACD,OAAOgB;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCP,KAAa,EACbC,MAAc;IAEda,OACEd,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOM,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIf,QAAS;YAC3CY,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAId,SAAU;YAC5CW,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIf,QAAS;YAC3CY,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAId,SAAU;SAC7C;QAEH,MAAM,IAAIS,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAACF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS;QACnCY,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU;QACpCW,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS;QACnCY,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU;KACrC;IAIH,IACEiB,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS,QAAQL,kBAAkB;QAE/DiB,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU,QAAQN,kBAAkB;QAEhEiB,KAAK,GAAG,CACNZ,OACAY,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS,QAAQL,kBAAkB;QAE/DiB,KAAK,GAAG,CACNX,QACAW,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIY,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS;QACnCY,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU;QACpCW,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGlB,QAAS;QACnCY,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGjB,SAAU;KACrC;IAGH,MAAMO,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdP,KAAa,EACbC,MAAc,EACdC,UAAkB,EAClBC,WAAmB,EACnBC,MAAgC;IAEhC,IAAIO,SAA2C;QAAC;QAAG;QAAG;QAAG;KAAE;IAEzDA,SADEP,AAAW,oBAAXA,UAA8BA,AAAW,kBAAXA,SACvBS,gBAAgBN,MAAMP,OAAOC,UAC7BG,AAAW,aAAXA,SACAkB,gBAAgBf,MAAMP,OAAOC,UAC7BG,AAAW,eAAXA,SACAmB,gBAAgBhB,MAAMP,OAAOC,UAE7BK,cAAcC;IAGzBI,MAAM,CAAC,EAAE,GAAGC,KAAK,GAAG,CAACD,MAAM,CAAC,EAAE,EAAET;IAChCS,MAAM,CAAC,EAAE,GAAGC,KAAK,GAAG,CAACD,MAAM,CAAC,EAAE,EAAER;IAEhC,OAAOQ;AACT;AAGO,SAASY,gBACdhB,IAAc,EACdP,KAAa,EACbC,MAAc;IAEd,OAAO;QACLW,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGP,QAAS;QAC/B
Y,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGN,SAAU;QAChCW,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGP,QAAS;QAC/BY,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGN,SAAU;KACjC;AACH;AAGO,SAASqB,gBACdf,IAAc,EACdP,KAAa,EACbC,MAAc;IAEd,MAAMuB,OAAOZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGP,QAAS;IAC5C,MAAMyB,MAAMb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGN,SAAU;IAC5C,MAAMyB,QAAQd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGP,QAAS;IAC7C,MAAM2B,SAASf,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGN,SAAU;IAC/C,OAAO;QAACuB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdrB,IAAc,EACdP,KAAa,EACbC,MAAc,EACd4B,UAAU,CAAC,EACXC,UAAU,CAAC,EACX5B,aAAaF,KAAK,EAClBG,cAAcF,MAAM,EACpBG,MAAiC;IAEjCR,kBACE,mBACAW,MACAP,OACAC,QACA,UACA4B,SACAC,SACA,SACA5B,YACAC,aACA,UACAC;IAEF,MAAM,CAACoB,MAAMC,KAAKC,OAAOC,OAAO,GAAGtB,UACjCE,MACAP,OACAC,QACAC,YACAC,aACAC;IAIF,MAAM2B,WAAWP;IACjB,MAAMQ,UAAUP;IAChB,IAAIQ,YAAYP,QAAQF;IACxB,IAAIU,aAAaP,SAASF;IAI1B,IAAIM,WAAWE,YAAYjC,OACzBiC,YAAYjC,QAAQ+B;IAItB,IAAIC,UAAUE,aAAajC,QACzBiC,aAAajC,SAAS+B;IAIxBC,YAAYrB,KAAK,GAAG,CAAC,GAAGqB;IACxBC,aAAatB,KAAK,GAAG,CAAC,GAAGsB;IAEzB,MAAMC,OAAO;QACX,MAAMJ,WAAWF;QACjB,KAAKG,UAAUF;QACf,OAAOG;QACP,QAAQC;IACV;IACAtC,kBAAkB,4BAA4BuC;IAE9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBAAmBC,IAAU,EAAEC,SAAiB;IAC9D,IAAIH,QAAQ;IACZ,IAAIG,UAAU,WAAW,GAAG,QAAQ,CAAC,WAAW;QAC9C,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,6FAA6F,CAAC;QAErN,IACE1B,KAAK,GAAG,CAAC0B,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpC1B,KAAK,GAAG,CAAC0B,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAUhC,KAAK,GAAG,IAAI+B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAASlC,KAAK,GAAG,IAAI+B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAWnC,KAAK,GAAG,IAAI+B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYpC,KAAK,GAAG,IAAI+B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBACdd,IAAU,EACVe,UAAgB,EAChB9C,MAAgC;IAEhC,MAAM+C,cAAc/C,AAAW,oBAAXA,SAA6B,MAAM;IACvD,MAAMgD,iBAAiB;IAGvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTvC,KAAK,IAAI,CAAEuC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVvC,KAAK,IAAI,CAAEuC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IAGN,IAAIG,WAAW3C,KAAK,GAAG,CAACuC,aAAahB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA;IAClD,IAAIG,YAAY5C,KAAK,GAAG,CAACuC,aAAahB,KAAK,MAAM,GAAGmB,AAAsB,IAAtBA;IAGpD,IAAIG,UAAUtB,KAAK,IAAI,GAAGkB;IAC1B,IAAIK,SAASvB,KAAK,GAAG,GAAGmB;IAIxB,IAAIG,UAAUF,WAAWL,WAAW,KAAK,EACvCO,UAAUP,WAAW,KAAK,GAAGK;IAI/B,IAAIG,SAASF,YAAYN,WAAW,MAAM,EACxCQ,SAASR,WAAW,MAAM,GAAGM;IAI/BC,UAAU7C,KAAK,GAAG,CAAC,GAAG6C;IACtBC,SAAS9C,KAAK,GAAG,CAAC,GAAG8C;IAIrB,IAAID,UAAUF,WAAWL,WAAW,KAAK,EACvCK,WAAWL,WAAW,KAAK,GAAGO;IAEhC,IAAIC,SAASF,YAAYN,WAAW,MAAM,EACxCM,YAAYN,WAAW,MAAM,GAAGQ;IAGlCvB,KAAK,IAAI,GAAGsB;IACZtB,KAAK,GAAG,GAAGuB;IACXvB,KAAK,KAAK,GAAGoB;IACbpB,KAAK,MAAM,GAAGqB;IAEd,OAAOrB;AACT;AAEO,eAAewB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCvB,IAAU;IAEV,MAAMwB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtB1B;IACF;IACA,OAAO6B;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,WAAgC,EAChCC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQJ,MAAO;QACxB,MAAMK,OAAOD,KAAK,IAAI;QAEtB,MAAME,SAASL,YAAY,IAAI,CAAC,CAACK,SAAWA,OAAO,IAAI,KAAKD;QAC5D,IAAI,
CAACC,QAAQ;YACXnC,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAEkC,KAAK,8BAA8B,CAAC;YAE/D;QACF;QAEA,MAAME,UAAUD,OAAO,cAAc,IAAID;QACzC,MAAMG,YAAYF,OAAO,WAAW,GAChCG,gBAAgBL,KAAK,KAAK,IAAI,CAAC,GAAGE,OAAO,WAAW,IACpD,CAAC;QAEL,MAAMI,WAAiC;YACrC,CAACH,QAAQ,EAAE;YACX,GAAGC,SAAS;QACd;QAEAL,KAAK,IAAI,CAACO;IACZ;IAEA,IAAIR,OACFC,KAAK,IAAI,CAAC;QACRD;IACF;IAGF,OAAOC;AACT;AAGO,MAAMQ,cAAcC,EAAE,MAAM,CAAC;IAClC,MAAMA,EAAE,MAAM;IACd,KAAKA,EAAE,MAAM;AACf;AAEO,MAAMC,aAAaD,EAAE,MAAM,CAAC;IACjC,OAAOA,EAAE,MAAM;IACf,QAAQA,EAAE,MAAM;IAChB,KAAKA,EAAE,MAAM,GAAG,QAAQ;AAC1B;AAEO,MAAME,aAAaH,YAAY,GAAG,CAACE,YAAY,GAAG,CACvDD,EAAE,MAAM,CAAC;IACP,MAAMA,EAAE,MAAM,GAAG,QAAQ;AAC3B;AAIK,MAAMG,0BAA0BH,EAAE,MAAM,CAAC;IAC9C,QAAQA,EAAAA,KACA,CACJA,EAAE,MAAM,CAAC;QACP,MAAMA,EAAE,MAAM;QACd,KAAKA,EAAE,MAAM;IACf,IAED,QAAQ;IACX,yBAAyBA,EAAE,OAAO,GAAG,QAAQ;AAC/C;AAGO,MAAMI,oBAAoBJ,EAAE,KAAK,CAAC;IACvCA,EAAE,MAAM;IACRA,EAAAA,MACS,CAAC;QACN,QAAQA,EAAE,MAAM;IAClB,GACC,GAAG,CAACG,wBAAwB,OAAO;CACvC;AAMD,MAAME,sBAAsB;AAE5B,MAAMC,yBAAyBN,EAAAA,MACtB,CAAC;IACN,CAACK,oBAAoB,EAAEL,EAAE,OAAO,CAAC;IACjC,QAAQI;IAGR,WAAWJ,EAAE,OAAO,GAAG,QAAQ;IAC/B,WAAWA,EAAE,OAAO,GAAG,QAAQ;IAC/B,OAAOA,EAAE,OAAO,GAAG,QAAQ;IAG3B,QAAQA,EAAE,KAAK,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG;IACxC,MAAME;AACR,GACC,WAAW;AAGP,MAAMK,4BAA4B,IAChCD;AAGF,MAAME,yBAAyB,CAACC;QAGjCC,mBAKAC;IANJ,IAAIC,cAAcH;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,eACjCE,cAAcA,YAAY,IAAI,CAAC,SAAS;IAI1C,IAAID,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa;QAC9C,MAAME,QAAQD,YAAY,IAAI,CAAC,KAAK;QACpC,OAAOP,uBAAuBQ;IAChC;IAEA,OAAO;AACT;AAEO,MAAMC,2BAA2B,CAACL;IACvC7E,OACE4E,uBAAuBC,QACvB;IAIF,IAAI,AAAiB,YAAjB,OAAOA,OACT,OAAOA;IAIT,IAAIA,SAAS,AAAiB,YAAjB,OAAOA,SAAsBA,MAAM,MAAM,EAAE;QAEtD,IAAI,AAAwB,YAAxB,OAAOA,MAAM,MAAM,EACrB,OAAOA,MAAM,MAAM;QAGrB,IAAI,AAAwB,YAAxB,OAAOA,MAAM,MAAM,IAAiBA,MAAM,MAAM,CAAC,MAAM,EACzD,OAAOA,MAAM,MAAM,CAAC,MAAM;IAE9B;IAGA,OAAOM,OAAON;AAChB;AAEO,MAAMO,8BAA8B,CACzCC,SACAC;QAQIC;IANJ,IAAI,CAACF,SACH,OAAO,EAAE;IAIX,MAAMG,YAAYH;IAClB,IAAIE,AAAAA,SAAAA,CAAAA,kBAAAA,UAAU,IAAI,AAAD,IAAbA,KAAAA,IAAAA,gBAAgB,QAAQ,AAAD,MAAM,eAAeC,UAAU,KAAK,EAAE;QAC/D,MAAMC,OAAOC,OAAO,IAAI,CAACF,UAAU,KAAK;QACxC,OAAOC,KAAK,MAAM,CAAC,CAACE;YAClB,MAAMd,QAAQW,UAAU,KAAK,CAACG,IAAI;YAClC,IAAI,CAACf,uBAAuBC,QAC1B,OAAO;YAIT,IAAIS,cAAc;oBACTM;gBAAP,OAAOA,AAAAA,SAAAA,CAAAA,cAAAA,MAAM,IAAI,AAAD,IAATA,KAAAA,IAAAA,YAAY,QAAQ,AAAD,MAAM;YAClC;YAEA,OAAO;QACT;IACF;IAGA,OAAO,EAAE;AACX;AAEO,MAAM3B,kBAAkB,CAC7B4B,YACAC;IAEA,MAAMC,gBAAgBX,4BAA4BU;IAClD,MAAMjG,SAAS;QAAE,GAAGgG,UAAU;IAAC;IAE/B,KAAK,MAAMG,aAAaD,cAAe;QACrC,MAAME,aAAapG,MAAM,CAACmG,UAAU;QACpC,IAAIC,YAEF;YAAA,IAAI,AAAsB,YAAtB,OAAOA,YACTpG,MAAM,CAACmG,UAAU,GAAGC;iBACf,IAAI,AAAsB,YAAtB,OAAOA,YAEhB;gBAAA,IAAIA,WAAW,MAAM,EAEnB;oBAAA,IAAI,AAA6B,YAA7B,OAAOA,WAAW,MAAM,EAC1BpG,MAAM,CAACmG,UAAU,GAAGC,WAAW,MAAM;yBAChC,IACL,AAA6B,YAA7B,OAAOA,WAAW,MAAM,IACxBA,WAAW,MAAM,CAAC,MAAM,EAGxBpG,MAAM,CAACmG,UAAU,GAAGC,WAAW,MAAM,CAAC,MAAM;gBAC9C;YACF;QACF;IAEJ;IAEA,OAAOpG;AACT;AAEO,MAAMqG,kBAAkB,CAC7BL,YACAC;IAEA,MAAMC,gBAAgBX,4BAA4BU;IAClD,MAAMjG,SAAS;QAAE,GAAGgG,UAAU;IAAC;IAE/B,KAAK,MAAMG,aAAaD,cAAe;QACrC,MAAME,aAAapG,MAAM,CAACmG,UAAU;QACpC,IAAIC,cAAc,AAAsB,YAAtB,OAAOA,YACvBpG,MAAM,CAACmG,UAAU,GAAG;YAClB,CAACvB,oBAAoB,EAAE;YACvB,QAAQwB;QACV;IAEJ;IAEA,OAAOpG;AACT"}
+
{"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n DeviceAction,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n Rect,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { z } from 'zod';\n\nexport type AIArgs = ChatCompletionMessageParam[];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n TEXT = 5,\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n vlMode: TVlModeTypes | undefined,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(locate.bbox, width, height, vlMode);\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? 
Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n vlMode: TVlModeTypes | undefined,\n): [number, number, number, number] {\n if (vlMode === 'doubao-vision' || vlMode === 'vlm-ui-tars') {\n return adaptDoubaoBbox(bbox, width, height);\n }\n\n if (vlMode === 'gemini') {\n return adaptGeminiBbox(bbox, width, height);\n }\n\n return adaptQwenBbox(bbox);\n}\n\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: number,\n offsetX = 0,\n offsetY = 0,\n vlMode?: TVlModeTypes | undefined,\n): Rect {\n 
debugInspectUtils(\n 'adaptBboxToRect',\n bbox,\n width,\n height,\n offsetX,\n offsetY,\n vlMode,\n );\n const [left, top, right, bottom] = adaptBbox(bbox, width, height, vlMode);\n\n // Calculate initial rect dimensions\n const rectLeft = left;\n const rectTop = top;\n let rectWidth = right - left;\n let rectHeight = bottom - top;\n\n // Ensure the rect doesn't exceed image boundaries\n // If right edge exceeds width, adjust the width\n if (rectLeft + rectWidth > width) {\n rectWidth = width - rectLeft;\n }\n\n // If bottom edge exceeds height, adjust the height\n if (rectTop + rectHeight > height) {\n rectHeight = height - rectTop;\n }\n\n // Ensure minimum dimensions (width and height should be at least 1)\n rectWidth = Math.max(1, rectWidth);\n rectHeight = Math.max(1, rectHeight);\n\n const rect = {\n left: rectLeft + offsetX,\n top: rectTop + offsetY,\n width: rectWidth,\n height: rectHeight,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(size: Size, modelName: string) {\n if (warned) return;\n if (modelName.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your interface to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(\n rect: Rect,\n screenSize: Size,\n vlMode: TVlModeTypes | undefined,\n) {\n const minEdgeSize = vlMode === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n // Calculate padding needed to reach minimum edge size\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? 
Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n\n // Calculate new dimensions (ensure minimum edge size)\n let newWidth = Math.max(minEdgeSize, rect.width + paddingSizeHorizontal * 2);\n let newHeight = Math.max(minEdgeSize, rect.height + paddingSizeVertical * 2);\n\n // Calculate initial position with padding\n let newLeft = rect.left - paddingSizeHorizontal;\n let newTop = rect.top - paddingSizeVertical;\n\n // Ensure the rect doesn't exceed screen boundaries by adjusting position\n // If the rect goes beyond the right edge, shift it left\n if (newLeft + newWidth > screenSize.width) {\n newLeft = screenSize.width - newWidth;\n }\n\n // If the rect goes beyond the bottom edge, shift it up\n if (newTop + newHeight > screenSize.height) {\n newTop = screenSize.height - newHeight;\n }\n\n // Ensure the rect doesn't go beyond the left/top edges\n newLeft = Math.max(0, newLeft);\n newTop = Math.max(0, newTop);\n\n // If after position adjustment, the rect still exceeds screen boundaries,\n // clamp the dimensions to fit within screen\n if (newLeft + newWidth > screenSize.width) {\n newWidth = screenSize.width - newLeft;\n }\n if (newTop + newHeight > screenSize.height) {\n newHeight = screenSize.height - newTop;\n }\n\n rect.left = newLeft;\n rect.top = newTop;\n rect.width = newWidth;\n rect.height = newHeight;\n\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n actionSpace: DeviceAction<any>[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const verb = plan.type;\n\n const action = actionSpace.find((action) => action.name === verb);\n if (!action) {\n console.warn(\n `Cannot convert action ${verb} to yaml flow. Will ignore it.`,\n );\n continue;\n }\n\n const flowKey = action.interfaceAlias || verb;\n const flowParam = action.paramSchema\n ? 
dumpActionParam(plan.param || {}, action.paramSchema)\n : {};\n\n const flowItem: MidsceneYamlFlowItem = {\n [flowKey]: '',\n ...flowParam,\n };\n\n flow.push(flowItem);\n }\n\n if (sleep) {\n flow.push({\n sleep,\n });\n }\n\n return flow;\n}\n\n// Zod schemas for shared types\nexport const PointSchema = z.object({\n left: z.number(),\n top: z.number(),\n});\n\nexport const SizeSchema = z.object({\n width: z.number(),\n height: z.number(),\n dpr: z.number().optional(),\n});\n\nexport const RectSchema = PointSchema.and(SizeSchema).and(\n z.object({\n zoom: z.number().optional(),\n }),\n);\n\n// Zod schema for TMultimodalPrompt\nexport const TMultimodalPromptSchema = z.object({\n images: z\n .array(\n z.object({\n name: z.string(),\n url: z.string(),\n }),\n )\n .optional(),\n convertHttpImage2Base64: z.boolean().optional(),\n});\n\n// Zod schema for TUserPrompt\nexport const TUserPromptSchema = z.union([\n z.string(),\n z\n .object({\n prompt: z.string(),\n })\n .and(TMultimodalPromptSchema.partial()),\n]);\n\n// Generate TypeScript types from Zod schemas\nexport type TMultimodalPrompt = z.infer<typeof TMultimodalPromptSchema>;\nexport type TUserPrompt = z.infer<typeof TUserPromptSchema>;\n\nconst locateFieldFlagName = 'midscene_location_field_flag';\n\nconst MidsceneLocationResult = z\n .object({\n [locateFieldFlagName]: z.literal(true),\n prompt: TUserPromptSchema,\n\n // optional fields\n deepThink: z.boolean().optional(), // only available in vl model\n cacheable: z.boolean().optional(),\n xpath: z.boolean().optional(), // preset result for xpath\n\n // these two fields will only appear in the result\n center: z.tuple([z.number(), z.number()]),\n rect: RectSchema,\n })\n .passthrough();\n\nexport type MidsceneLocationResultType = z.infer<typeof MidsceneLocationResult>;\nexport const getMidsceneLocationSchema = () => {\n return MidsceneLocationResult;\n};\n\nexport const ifMidsceneLocatorField = (field: any): boolean => {\n // Handle optional fields by getting the inner type\n let actualField = field;\n if (actualField._def?.typeName === 'ZodOptional') {\n actualField = actualField._def.innerType;\n }\n\n // Check if this is a ZodUnion (the new MidsceneLocation structure)\n if (actualField._def?.typeName === 'ZodObject') {\n const shape = actualField._def.shape();\n return locateFieldFlagName in shape;\n }\n\n return false;\n};\n\nexport const dumpMidsceneLocatorField = (field: any): string => {\n assert(\n ifMidsceneLocatorField(field),\n 'field is not a midscene locator field',\n );\n\n // If field is a string, return it directly\n if (typeof field === 'string') {\n return field;\n }\n\n // If field is an object with prompt property\n if (field && typeof field === 'object' && field.prompt) {\n // If prompt is a string, return it directly\n if (typeof field.prompt === 'string') {\n return field.prompt;\n }\n // If prompt is a TUserPrompt object, extract the prompt string\n if (typeof field.prompt === 'object' && field.prompt.prompt) {\n return field.prompt.prompt; // TODO: dump images if necessary\n }\n }\n\n // Fallback: try to convert to string\n return String(field);\n};\n\nexport const findAllMidsceneLocatorField = (\n zodType?: z.ZodType<any>,\n requiredOnly?: boolean,\n): string[] => {\n if (!zodType) {\n return [];\n }\n\n // Check if this is a ZodObject by checking if it has a shape property\n const zodObject = zodType as any;\n if (zodObject._def?.typeName === 'ZodObject' && zodObject.shape) {\n const keys = Object.keys(zodObject.shape);\n return keys.filter((key) => {\n const 
field = zodObject.shape[key];\n if (!ifMidsceneLocatorField(field)) {\n return false;\n }\n\n // If requiredOnly is true, filter out optional fields\n if (requiredOnly) {\n return field._def?.typeName !== 'ZodOptional';\n }\n\n return true;\n });\n }\n\n // For other ZodType instances, we can't extract field names\n return [];\n};\n\nexport const dumpActionParam = (\n jsonObject: Record<string, any>,\n zodSchema: z.ZodType<any>,\n): Record<string, any> => {\n const locatorFields = findAllMidsceneLocatorField(zodSchema);\n const result = { ...jsonObject };\n\n for (const fieldName of locatorFields) {\n const fieldValue = result[fieldName];\n if (fieldValue) {\n // If it's already a string, keep it as is\n if (typeof fieldValue === 'string') {\n result[fieldName] = fieldValue;\n } else if (typeof fieldValue === 'object') {\n // Check if this field is actually a MidsceneLocationType object\n if (fieldValue.prompt) {\n // If prompt is a string, use it directly\n if (typeof fieldValue.prompt === 'string') {\n result[fieldName] = fieldValue.prompt;\n } else if (\n typeof fieldValue.prompt === 'object' &&\n fieldValue.prompt.prompt\n ) {\n // If prompt is a TUserPrompt object, extract the prompt string\n result[fieldName] = fieldValue.prompt.prompt;\n }\n }\n }\n }\n }\n\n return result;\n};\n\nexport const loadActionParam = (\n jsonObject: Record<string, any>,\n zodSchema: z.ZodType<any>,\n): Record<string, any> => {\n const locatorFields = findAllMidsceneLocatorField(zodSchema);\n const result = { ...jsonObject };\n\n for (const fieldName of locatorFields) {\n const fieldValue = result[fieldName];\n if (fieldValue && typeof fieldValue === 'string') {\n result[fieldName] = {\n [locateFieldFlagName]: true,\n prompt: fieldValue,\n };\n }\n }\n\n return result;\n};\n"],"names":["AIActionType","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","vlMode","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","adaptGeminiBbox","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rectLeft","rectTop","rectWidth","rectHeight","rect","warned","warnGPT4oSizeLimit","size","modelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","newWidth","newHeight","newLeft","newTop","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","actionSpace","sleep","flow","plan","verb","action","flowKey","flowParam","dumpActionParam","flowItem","PointSchema","z","SizeSchema","RectSchema","TMultimodalPromptSchema","TUserPromptSchema","locateFieldFlagName","MidsceneLocationResult","getMidsceneLocationSchema","ifMidsceneLocatorField","field","_actualField__def","_actualField__def1","actualField","shape","dumpMidsceneLocatorField","String","findAllMidsceneLocatorField","zodType","requiredOnly","_zodObject__def","zodObject","keys","Object","key","_field__def","jsonObject","zodSchema","locatorFields","fieldName","fieldValue","loadActionParam"],"mappings":";;;;;;AAwBO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;;WAAZA;;AASZ,MAAMC,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc,EACdC,MAAgC;IAGhC,IAAKH,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IA
AI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGI,UAAUJ,OAAO,IAAI,EAAEC,OAAOC,QAAQC;IAGtD,OAAOH;AACT;AAEO,SAASK,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGV;QACN,YAAnB,OAAOU,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGV;KAC1B;IACD,OAAOc;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCL,KAAa,EACbC,MAAc;IAEdW,OACEZ,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOI,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIb,QAAS;YAC3CU,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,SAAU;YAC5CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIb,QAAS;YAC3CU,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,SAAU;SAC7C;QAEH,MAAM,IAAIO,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAACF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS;QACnCU,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU;QACpCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS;QACnCU,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU;KACrC;IAIH,IACEe,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS,QAAQL,kBAAkB;QAE/De,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU,QAAQN,kBAAkB;QAEhEe,KAAK,GAAG,CACNV,OACAU,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS,QAAQL,kBAAkB;QAE/De,KAAK,GAAG,CACNT,QACAS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIU,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS;QACnCU,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU;QACpCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGhB,QAAS;QACnCU,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,SAAU;KACrC;IAGH,MAAMK,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdL,KAAa,EACbC,MAAc,EACdC,MAAgC;IAEhC,IAAIA,AAAW,oBAAXA,UAA8BA,AAAW,kBAAXA,QAChC,OAAOS,gBAAgBN,MAAML,OAAOC;IAGtC,IAAIC,AAAW,aAAXA,QACF,OAAOkB,gBAAgBf,MAAML,OAAOC;IAGtC,OAAOG,cAAcC;AACvB;AAEO,SAASe,gBACdf,IAAc,EACdL,KAAa,EACbC,MAAc;IAEd,MAAMoB,OAAOX,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGL,QAAS;IAC5C,MAAMsB,MAAMZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,SAAU;IAC5C,MAAMsB,QAAQb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGL,QAAS;IAC7C,MAAMwB,SAASd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,SAAU;IAC/C,OAAO;QAACoB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdpB,IAAc,EACdL,KAAa,EACbC,MAAc,EACdyB,UAAU,CAAC,EACXC,UAAU,CAAC,EACXzB,MAAiC;IAEjCN,kBACE
,mBACAS,MACAL,OACAC,QACAyB,SACAC,SACAzB;IAEF,MAAM,CAACmB,MAAMC,KAAKC,OAAOC,OAAO,GAAGrB,UAAUE,MAAML,OAAOC,QAAQC;IAGlE,MAAM0B,WAAWP;IACjB,MAAMQ,UAAUP;IAChB,IAAIQ,YAAYP,QAAQF;IACxB,IAAIU,aAAaP,SAASF;IAI1B,IAAIM,WAAWE,YAAY9B,OACzB8B,YAAY9B,QAAQ4B;IAItB,IAAIC,UAAUE,aAAa9B,QACzB8B,aAAa9B,SAAS4B;IAIxBC,YAAYpB,KAAK,GAAG,CAAC,GAAGoB;IACxBC,aAAarB,KAAK,GAAG,CAAC,GAAGqB;IAEzB,MAAMC,OAAO;QACX,MAAMJ,WAAWF;QACjB,KAAKG,UAAUF;QACf,OAAOG;QACP,QAAQC;IACV;IACAnC,kBAAkB,4BAA4BoC;IAC9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBAAmBC,IAAU,EAAEC,SAAiB;IAC9D,IAAIH,QAAQ;IACZ,IAAIG,UAAU,WAAW,GAAG,QAAQ,CAAC,WAAW;QAC9C,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,6FAA6F,CAAC;QAErN,IACEzB,KAAK,GAAG,CAACyB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpCzB,KAAK,GAAG,CAACyB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAU/B,KAAK,GAAG,IAAI8B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAASjC,KAAK,GAAG,IAAI8B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAWlC,KAAK,GAAG,IAAI8B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYnC,KAAK,GAAG,IAAI8B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBACdd,IAAU,EACVe,UAAgB,EAChB7C,MAAgC;IAEhC,MAAM8C,cAAc9C,AAAW,oBAAXA,SAA6B,MAAM;IACvD,MAAM+C,iBAAiB;IAGvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTtC,KAAK,IAAI,CAAEsC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVtC,KAAK,IAAI,CAAEsC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IAGN,IAAIG,WAAW1C,KAAK,GAAG,CAACsC,aAAahB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA;IAClD,IAAIG,YAAY3C,KAAK,GAAG,CAACsC,aAAahB,KAAK,MAAM,GAAGmB,AAAsB,IAAtBA;IAGpD,IAAIG,UAAUtB,KAAK,IAAI,GAAGkB;IAC1B,IAAIK,SAASvB,KAAK,GAAG,GAAGmB;IAIxB,IAAIG,UAAUF,WAAWL,WAAW,KAAK,EACvCO,UAAUP,WAAW,KAAK,GAAGK;IAI/B,IAAIG,SAASF,YAAYN,WAAW,MAAM,EACxCQ,SAASR,WAAW,MAAM,GAAGM;IAI/BC,UAAU5C,KAAK,GAAG,CAAC,GAAG4C;IACtBC,SAAS7C,KAAK,GAAG,CAAC,GAAG6C;IAIrB,IAAID,UAAUF,WAAWL,WAAW,KAAK,EACvCK,WAAWL,WAAW,KAAK,GAAGO;IAEhC,IAAIC,SAASF,YAAYN,WAAW,MAAM,EACxCM,YAAYN,WAAW,MAAM,GAAGQ;IAGlCvB,KAAK,IAAI,GAAGsB;IACZtB,KAAK,GAAG,GAAGuB;IACXvB,KAAK,KAAK,GAAGoB;IACbpB,KAAK,MAAM,GAAGqB;IAEd,OAAOrB;AACT;AAEO,eAAewB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCvB,IAAU;IAEV,MAAMwB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtB1B;IACF;IACA,OAAO6B;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,WAAgC,EAChCC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQJ,MAAO;QACxB,MAAMK,OAAOD,KAAK,IAAI;QAEtB,MAAME,SAASL,YAAY,IAAI,CAAC,CAACK,SAAWA,OAAO,IAAI,KAAKD;QAC5D,IAAI,CAACC,QAAQ;YACXnC,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAEkC,KAAK,8BAA8B,CAAC;YAE/D;QACF;QAEA,MAAME,UAAUD,OAAO,cAAc,IAAID;QACzC,MAAMG,YAAYF,OAAO,WAAW,GAChCG,gBAAgBL,KAAK,KAAK,IAAI,CAAC,GAAGE,OAAO,WAAW,IACpD,CAAC;QAEL,MAAMI,WAAiC;YACrC,CAACH,QAAQ,EAAE;YACX,GAAGC,SAAS;QACd;QAEAL,KAAK,IAAI,CAACO;IACZ;IAEA,IAAIR,OACFC,KAAK,IAAI,CAAC;QACRD;IACF;IAGF,OAAOC;AACT;AAGO,MAAMQ,cAAcC,EAAE,MAAM,CAAC;IAClC,MAAMA,EAAE,MAAM;IACd,KAAKA,EAAE,MAAM;AACf;AAEO,MAAMC,aAAaD,EAAE,MAAM,CAAC;IACjC,OAAOA,EAAE,MAAM;IACf,QAAQA,EAAE,MAAM;IAChB,KAAKA,EAAE,MAAM,GAAG,QAAQ;AAC1B;AAEO,MAAME,aAAaH,YAAY,GAAG,CAACE,YAAY,GAAG,CACvDD,EAAE,MAAM,CAAC;IACP,MAAMA,EAAE,MAAM,GAAG,QAAQ;AAC3B;AAIK,MAAMG,0BAA0BH,EAAE,
MAAM,CAAC;IAC9C,QAAQA,EAAAA,KACA,CACJA,EAAE,MAAM,CAAC;QACP,MAAMA,EAAE,MAAM;QACd,KAAKA,EAAE,MAAM;IACf,IAED,QAAQ;IACX,yBAAyBA,EAAE,OAAO,GAAG,QAAQ;AAC/C;AAGO,MAAMI,oBAAoBJ,EAAE,KAAK,CAAC;IACvCA,EAAE,MAAM;IACRA,EAAAA,MACS,CAAC;QACN,QAAQA,EAAE,MAAM;IAClB,GACC,GAAG,CAACG,wBAAwB,OAAO;CACvC;AAMD,MAAME,sBAAsB;AAE5B,MAAMC,yBAAyBN,EAAAA,MACtB,CAAC;IACN,CAACK,oBAAoB,EAAEL,EAAE,OAAO,CAAC;IACjC,QAAQI;IAGR,WAAWJ,EAAE,OAAO,GAAG,QAAQ;IAC/B,WAAWA,EAAE,OAAO,GAAG,QAAQ;IAC/B,OAAOA,EAAE,OAAO,GAAG,QAAQ;IAG3B,QAAQA,EAAE,KAAK,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG;IACxC,MAAME;AACR,GACC,WAAW;AAGP,MAAMK,4BAA4B,IAChCD;AAGF,MAAME,yBAAyB,CAACC;QAGjCC,mBAKAC;IANJ,IAAIC,cAAcH;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,eACjCE,cAAcA,YAAY,IAAI,CAAC,SAAS;IAI1C,IAAID,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa;QAC9C,MAAME,QAAQD,YAAY,IAAI,CAAC,KAAK;QACpC,OAAOP,uBAAuBQ;IAChC;IAEA,OAAO;AACT;AAEO,MAAMC,2BAA2B,CAACL;IACvC5E,OACE2E,uBAAuBC,QACvB;IAIF,IAAI,AAAiB,YAAjB,OAAOA,OACT,OAAOA;IAIT,IAAIA,SAAS,AAAiB,YAAjB,OAAOA,SAAsBA,MAAM,MAAM,EAAE;QAEtD,IAAI,AAAwB,YAAxB,OAAOA,MAAM,MAAM,EACrB,OAAOA,MAAM,MAAM;QAGrB,IAAI,AAAwB,YAAxB,OAAOA,MAAM,MAAM,IAAiBA,MAAM,MAAM,CAAC,MAAM,EACzD,OAAOA,MAAM,MAAM,CAAC,MAAM;IAE9B;IAGA,OAAOM,OAAON;AAChB;AAEO,MAAMO,8BAA8B,CACzCC,SACAC;QAQIC;IANJ,IAAI,CAACF,SACH,OAAO,EAAE;IAIX,MAAMG,YAAYH;IAClB,IAAIE,AAAAA,SAAAA,CAAAA,kBAAAA,UAAU,IAAI,AAAD,IAAbA,KAAAA,IAAAA,gBAAgB,QAAQ,AAAD,MAAM,eAAeC,UAAU,KAAK,EAAE;QAC/D,MAAMC,OAAOC,OAAO,IAAI,CAACF,UAAU,KAAK;QACxC,OAAOC,KAAK,MAAM,CAAC,CAACE;YAClB,MAAMd,QAAQW,UAAU,KAAK,CAACG,IAAI;YAClC,IAAI,CAACf,uBAAuBC,QAC1B,OAAO;YAIT,IAAIS,cAAc;oBACTM;gBAAP,OAAOA,AAAAA,SAAAA,CAAAA,cAAAA,MAAM,IAAI,AAAD,IAATA,KAAAA,IAAAA,YAAY,QAAQ,AAAD,MAAM;YAClC;YAEA,OAAO;QACT;IACF;IAGA,OAAO,EAAE;AACX;AAEO,MAAM3B,kBAAkB,CAC7B4B,YACAC;IAEA,MAAMC,gBAAgBX,4BAA4BU;IAClD,MAAMhG,SAAS;QAAE,GAAG+F,UAAU;IAAC;IAE/B,KAAK,MAAMG,aAAaD,cAAe;QACrC,MAAME,aAAanG,MAAM,CAACkG,UAAU;QACpC,IAAIC,YAEF;YAAA,IAAI,AAAsB,YAAtB,OAAOA,YACTnG,MAAM,CAACkG,UAAU,GAAGC;iBACf,IAAI,AAAsB,YAAtB,OAAOA,YAEhB;gBAAA,IAAIA,WAAW,MAAM,EAEnB;oBAAA,IAAI,AAA6B,YAA7B,OAAOA,WAAW,MAAM,EAC1BnG,MAAM,CAACkG,UAAU,GAAGC,WAAW,MAAM;yBAChC,IACL,AAA6B,YAA7B,OAAOA,WAAW,MAAM,IACxBA,WAAW,MAAM,CAAC,MAAM,EAGxBnG,MAAM,CAACkG,UAAU,GAAGC,WAAW,MAAM,CAAC,MAAM;gBAC9C;YACF;QACF;IAEJ;IAEA,OAAOnG;AACT;AAEO,MAAMoG,kBAAkB,CAC7BL,YACAC;IAEA,MAAMC,gBAAgBX,4BAA4BU;IAClD,MAAMhG,SAAS;QAAE,GAAG+F,UAAU;IAAC;IAE/B,KAAK,MAAMG,aAAaD,cAAe;QACrC,MAAME,aAAanG,MAAM,CAACkG,UAAU;QACpC,IAAIC,cAAc,AAAsB,YAAtB,OAAOA,YACvBnG,MAAM,CAACkG,UAAU,GAAG;YAClB,CAACvB,oBAAoB,EAAE;YACvB,QAAQwB;QACV;IAEJ;IAEA,OAAOnG;AACT"}

package/dist/es/ai-model/inspect.mjs
CHANGED
@@ -67,30 +67,12 @@ async function AiLocateElement(options) {
  67   67 |   });
  68   68 |   const systemPrompt = systemPromptToLocateElement(vlMode);
  69   69 |   let imagePayload = screenshotBase64;
  70       | - let imageWidth = context.size.width;
  71       | - let imageHeight = context.size.height;
  72       | - let originalImageWidth = imageWidth;
  73       | - let originalImageHeight = imageHeight;
  74   70 |   if (options.searchConfig) {
  75       | - var _options_searchConfig_rect, _options_searchConfig_rect1;
  76   71 |   assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
  77   72 |   assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
  78   73 |   imagePayload = options.searchConfig.imageBase64;
  79       | -
  80       | -
  81       | - originalImageWidth = imageWidth;
  82       | - originalImageHeight = imageHeight;
  83       | - } else if ('qwen-vl' === vlMode) {
  84       | - const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
  85       | - imageWidth = paddedResult.width;
  86       | - imageHeight = paddedResult.height;
  87       | - imagePayload = paddedResult.imageBase64;
  88       | - } else if ('qwen3-vl' === vlMode) {
  89       | - const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
  90       | - imageWidth = paddedResult.width;
  91       | - imageHeight = paddedResult.height;
  92       | - imagePayload = paddedResult.imageBase64;
  93       | - } else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
       74 | + } else if ('qwen-vl' === vlMode) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
       75 | + else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
  94   76 |   const msgs = [
  95   77 |   {
  96   78 |   role: 'system',

@@ -127,8 +109,8 @@ async function AiLocateElement(options) {
 127  109 |   let errors = 'errors' in res.content ? res.content.errors : [];
 128  110 |   try {
 129  111 |   if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
 130       | - var
 131       | - resRect = adaptBboxToRect(res.content.bbox,
      112 | + var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
      113 | + resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top, vlMode);
 132  114 |   debugInspect('resRect', resRect);
 133  115 |   const rectCenter = {
 134  116 |   x: resRect.left + resRect.width / 2,

@@ -207,11 +189,11 @@ async function AiLocateSection(options) {
 207  189 |   let sectionRect;
 208  190 |   const sectionBbox = result.content.bbox;
 209  191 |   if (sectionBbox) {
 210       | - const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height, 0, 0,
      192 | + const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height, 0, 0, vlMode);
 211  193 |   debugSection('original targetRect %j', targetRect);
 212  194 |   const referenceBboxList = result.content.references_bbox || [];
 213  195 |   debugSection('referenceBboxList %j', referenceBboxList);
 214       | - const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height, 0, 0,
      196 | + const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height, 0, 0, vlMode));
 215  197 |   debugSection('referenceRects %j', referenceRects);
 216  198 |   const mergedRect = mergeRects([
 217  199 |   targetRect,

@@ -222,12 +204,7 @@ async function AiLocateSection(options) {
 222  204 |   debugSection('expanded sectionRect %j', sectionRect);
 223  205 |   }
 224  206 |   let imageBase64 = screenshotBase64;
 225       | - if (sectionRect)
 226       | - const croppedResult = await cropByRect(screenshotBase64, sectionRect, 'qwen-vl' === vlMode);
 227       | - imageBase64 = croppedResult.imageBase64;
 228       | - sectionRect.width = croppedResult.width;
 229       | - sectionRect.height = croppedResult.height;
 230       | - }
      207 | + if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, 'qwen-vl' === vlMode);
 231  208 |   return {
 232  209 |   rect: sectionRect,
 233  210 |   imageBase64,

package/dist/es/ai-model/inspect.mjs.map
CHANGED
@@ -1 +1 @@
   1      | -
{"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from './common';\nimport {\n AIActionType,\n adaptBboxToRect,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context, { vlMode });\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const 
userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (vlMode === 'qwen-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n } else if (vlMode === 'qwen3-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n } else if (!vlMode) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n vlMode,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? 
`Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: 
UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const { vlMode } = modelConfig;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n vlMode,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","_options_searchConfig_rect","_options_searchConfig_rect1","paddedResult","paddingToMatchBlockByBase64","markupImageForLLM","addOns","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;;;;;;;AA0DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAA
K,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OASD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAM,EAAEM,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBT,SAAS;QAAEI;IAAO;IAE3CM,OACET,0BACA;IAEF,MAAMU,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0BjB,wBAAwBY;IACpD;IACA,MAAMY,eAAeC,4BAA4BV;IAEjD,IAAIW,eAAeV;IACnB,IAAIW,aAAahB,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIiB,cAAcjB,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIkB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIlB,QAAQ,YAAY,EAAE;YAWXqB,4BACCC;QAXdX,OACEX,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFW,OACEX,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFgB,eAAehB,QAAQ,YAAY,CAAC,WAAW;QAC/CiB,aAAa,QAAAI,CAAAA,6BAAAA,QAAQ,YAAY,CAAC,IAAI,AAAD,IAAxBA,KAAAA,IAAAA,2BAA2B,KAAK;QAC7CH,cAAc,QAAAI,CAAAA,8BAAAA,QAAQ,YAAY,CAAC,IAAI,AAAD,IAAxBA,KAAAA,IAAAA,4BAA2B,MAAM;QAC/CH,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIb,AAAW,cAAXA,QAAsB;QAC/B,MAAMkB,eAAe,MAAMC,4BAA4BR;QACvDC,aAAaM,aAAa,KAAK;QAC/BL,cAAcK,aAAa,MAAM;QACjCP,eAAeO,aAAa,WAAW;IACzC,OAAO,IAAIlB,AAAW,eAAXA,QAAuB;QAChC,MAAMkB,eAAe,MAAMC,4BAA4BR,cAAc;QACrEC,aAAaM,aAAa,KAAK;QAC/BL,cAAcK,aAAa,MAAM;QACjCP,eAAeO,aAAa,WAAW;IACzC,OAAO,IAAI,CAAClB,QACVW,eAAe,MAAMS,kBACnBnB,kBACAL,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOV,0BAAuC;QAChD,MAAMwB,SAAS,MAAMlC,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI+B;IACf;IAEA,MAAMC,MAAM,MAAMxB,SAASR,MAAMiC,aAAa,eAAe,EAAExB;IAE/D,MAAMyB,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAK1DQ,6BAAAA,uBACAC,6BAAAA;YALFL,UAAUM,gBACRV,IAAI,OAAO,CAAC,IAAI,EAChBV,YACAC,aAAAA,QACAiB,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG,EAC/BjB,oBACAC,qBACAf;YAGFlB,aAAa,WAAW4C;YAExB,MAAMO,aAAa;gBACjB,GAAGP,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIQ,UAAUC,iCAAiCvC,QAAQ,IAAI,EAAEqC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU9B,wBAAwB6B;YAGpC,IAAIC,SAAS;gBACXP,kBAAkB;oBAACO;iBAAQ;gBAC3BN,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOW,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACX,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEY,IAAI,CAAC,CAAC;aAFtBZ,SAAS;YAACY;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMd;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACArB;QACA,OAAOmB,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCoB;IACR;AACF;AAEO,eAAeC,gBAAgBhD,OAIrC;IAOC,MAAM,E
AAEC,OAAO,EAAEgD,kBAAkB,EAAE7C,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAMa,eAAeoC,4BAA4B7C;IACjD,MAAM8C,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB9D,wBAAwB2D;IAC9C;IACA,MAAMtD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM6C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMvB,SAAS,MAAMlC,mBAAmB;YACtC,QAAQyD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAtD,KAAK,IAAI,IAAI+B;IACf;IAEA,MAAM2B,SAAS,MAAMC,yBACnB3D,MACAiC,aAAa,YAAY,EACzBxB;IAGF,IAAImD;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACAvD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFhB,aAAa,0BAA0BoE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DhE,aAAa,wBAAwBqE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS1B,MAAM,OAAO,CAAC0B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBACLuB,MACA3D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNhB,aAAa,qBAAqBsE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DtE,aAAa,iBAAiBwE;QAG9BN,cAAcQ,iBAAiBF,YAAY5D,QAAQ,IAAI,EAAEI;QACzDhB,aAAa,2BAA2BkE;IAC1C;IAEA,IAAIS,cAAc1D;IAClB,IAAIiD,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1B5D,kBACAiD,aACAlD,AAAW,cAAXA;QAEF2D,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAavB,KAAK,SAAS,CAACuB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAGpBnE,OAMD;QA8CKoE;IA7CJ,MAAM,EAAEC,SAAS,EAAEpE,OAAO,EAAEqE,aAAa,EAAE7E,gBAAgB,EAAEW,WAAW,EAAE,GACxEJ;IACF,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAMU,eAAeyD;IAErB,MAAM,EAAEjE,gBAAgB,EAAE,GAAGL;IAE7B,MAAM,EAAEM,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBAAiBT,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAaqE,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;QACvCjE;IACF;IAEA,MAAMmE,wBAAwB,MAAMC,uBAClClE,aACA8D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKpE;YACL,QAAQ;QACV;IACF;IAGFoE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM7E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAAS4D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCzE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAMiC,SAAS,MAAMlC,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAI+B;IACf;IAEA,MAAM2B,SAAS,MAAMC,yBACnB3D,MACAiC,aAAa,YAAY,EACzBxB;IAEF,OAAO;QACL,aAAaiD,OAAO,OAAO;QAC3B7C;QACA,OAAO6C,OAAO,KAAK;IACrB;AACF"}

        1 | +
{"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from './common';\nimport {\n AIActionType,\n adaptBboxToRect,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context, { vlMode });\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const 
userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlMode);\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlMode === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlMode) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, modelConfig);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n vlMode,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? 
(res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { vlMode } = modelConfig;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlMode);\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n vlMode,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n vlMode,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, vlMode);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n vlMode === 'qwen-vl',\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const { vlMode } = modelConfig;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n vlMode,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: 
ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAIWithObjectResponse<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelConfig,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","vlMode","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;;;;;;;AA0DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OASD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAM,EAAEM,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBT,SAAS;QAAEI;IAAO;IAE3CM,OACET,0BACA;IAEF,MAAMU,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0BjB,wBAAwBY;IACpD;IACA,MAAMY,eAAeC,4BAA
4BV;IAEjD,IAAIW,eAAeV;IAEnB,IAAIN,QAAQ,YAAY,EAAE;QACxBW,OACEX,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFW,OACEX,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFgB,eAAehB,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIK,AAAW,cAAXA,QACTW,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACX,QACVW,eAAe,MAAME,kBACnBZ,kBACAL,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKE;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMJ;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOV,0BAAuC;QAChD,MAAMiB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAMC,MAAM,MAAMjB,SAASR,MAAM0B,aAAa,eAAe,EAAEjB;IAE/D,MAAMkB,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBACAC,6BAAAA,wBACAC,6BAAAA;YALFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK3B,QAAQ,IAAI,CAAC,KAAK,EACvD4B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK5B,QAAQ,IAAI,CAAC,MAAM,UACzD6B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG,EAC/B1B;YAEFlB,aAAa,WAAWqC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiClC,QAAQ,IAAI,EAAEgC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAUzB,wBAAwBwB;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAd;QACA,OAAOY,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB3C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE2C,kBAAkB,EAAExC,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAM,EAAEE,gBAAgB,EAAE,GAAGL;IAE7B,MAAMa,eAAe+B,4BAA4BxC;IACjD,MAAMyC,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoBzD,wBAAwBsD;IAC9C;IACA,MAAMjD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMwC;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMzB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQoD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAjD,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM6B,SAAS,MAAMC,yBACnBtD,MACA0B,aAAa,YAAY,EACzBjB;IAGF,IAAI8C;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACAlD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAI;QAEFhB,aAAa,0BAA0B+D;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D3D,aAAa,wBAAwBgE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBACLuB,MACAtD,QAAQ,
IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAI;QAGNhB,aAAa,qBAAqBiE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DjE,aAAa,iBAAiBmE;QAG9BN,cAAcQ,iBAAiBF,YAAYvD,QAAQ,IAAI,EAAEI;QACzDhB,aAAa,2BAA2B6D;IAC1C;IAEA,IAAIS,cAAcrD;IAClB,IAAI4C,aACFS,cAAc,MAAMC,WAClBtD,kBACA4C,aACA7C,AAAW,cAAXA;IAIJ,OAAO;QACL,MAAM6C;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAea,qBAGpB7D,OAMD;QA8CK8D;IA7CJ,MAAM,EAAEC,SAAS,EAAE9D,OAAO,EAAE+D,aAAa,EAAEvE,gBAAgB,EAAEW,WAAW,EAAE,GACxEJ;IACF,MAAM,EAAEK,MAAM,EAAE,GAAGD;IACnB,MAAMU,eAAemD;IAErB,MAAM,EAAE3D,gBAAgB,EAAE,GAAGL;IAE7B,MAAM,EAAEM,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBAAiBT,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAa+D,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;QACvC3D;IACF;IAEA,MAAM6D,wBAAwB,MAAMC,uBAClC5D,aACAwD;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAK9D;YACL,QAAQ;QACV;IACF;IAGF8D,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMvE,OAAe;QACnB;YAAE,MAAM;YAAU,SAASmB;QAAa;QACxC;YACE,MAAM;YACN,SAASsD;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCnE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM6B,SAAS,MAAMC,yBACnBtD,MACA0B,aAAa,YAAY,EACzBjB;IAEF,OAAO;QACL,aAAa4C,OAAO,OAAO;QAC3BxC;QACA,OAAOwC,OAAO,KAAK;IACrB;AACF"}

package/dist/es/ai-model/llm-planning.mjs
CHANGED
@@ -19,26 +19,10 @@ async function plan(userInstruction, opts) {
  19   19 |   vlMode: vlMode
  20   20 |   });
  21   21 |   let imagePayload = screenshotBase64;
  22       | -
  23       | -
  24       | - const rightLimit = imageWidth;
  25       | - const bottomLimit = imageHeight;
  26       | - if ('qwen-vl' === vlMode) {
  27       | - const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
  28       | - imageWidth = paddedResult.width;
  29       | - imageHeight = paddedResult.height;
  30       | - imagePayload = paddedResult.imageBase64;
  31       | - } else if ('qwen3-vl' === vlMode) {
  32       | - const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);
  33       | - imageWidth = paddedResult.width;
  34       | - imageHeight = paddedResult.height;
  35       | - imagePayload = paddedResult.imageBase64;
  36       | - } else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
  37       | - width: imageWidth,
  38       | - height: imageHeight
  39       | - });
       22 | + if ('qwen-vl' === vlMode) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
       23 | + else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
  40   24 |   warnGPT4oSizeLimit(size, modelName);
  41       | - const historyLog = (null == (_opts_conversationHistory = opts.conversationHistory) ? void 0 : _opts_conversationHistory.snapshot()) || [];
       25 | + const historyLog = (null == (_opts_conversationHistory = opts.conversationHistory) ? void 0 : _opts_conversationHistory.snapshot().filter((item)=>'assistant' === item.role)) || [];
  42   26 |   const knowledgeContext = opts.actionContext ? [
  43   27 |   {
  44   28 |   role: 'user',

@@ -110,7 +94,7 @@ async function plan(userInstruction, opts) {
 110   94 |   debug('locateFields', locateFields);
 111   95 |   locateFields.forEach((field)=>{
 112   96 |   const locateResult = action.param[field];
 113       | - if (locateResult) if (vlMode) action.param[field] = fillBboxParam(locateResult,
       97 | + if (locateResult) if (vlMode) action.param[field] = fillBboxParam(locateResult, size.width, size.height, vlMode);
 114   98 |   else {
 115   99 |   const element = elementById(locateResult);
 116  100 |   if (element) action.param[field].id = element.id;

@@ -128,15 +112,6 @@ async function plan(userInstruction, opts) {
 128  112 |   }
 129  113 |   ]
 130  114 |   });
 131       | - null == conversationHistory || conversationHistory.append({
 132       | - role: 'user',
 133       | - content: [
 134       | - {
 135       | - type: 'text',
 136       | - text: 'I have finished the task'
 137       | - }
 138       | - ]
 139       | - });
 140  115 |   return returnValue;
 141  116 |   }
 142  117 |   export { plan };

package/dist/es/ai-model/llm-planning.mjs.map
CHANGED
@@ -1 +1 @@
   1      | -
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionContentPart,\n ChatCompletionMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory?: ConversationHistory;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { modelName, vlMode } = modelConfig;\n\n const { description: pageDescription, elementById } = await describeUserPage(\n context,\n { vlMode },\n );\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlMode,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n if (vlMode === 'qwen-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n } else if (vlMode === 'qwen3-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload, 32);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n } else if (!vlMode) {\n imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {\n width: imageWidth,\n height: imageHeight,\n });\n }\n\n warnGPT4oSizeLimit(size, modelName);\n\n const historyLog = opts.conversationHistory?.snapshot() || [];\n // .filter((item) => item.role === 'assistant') || [];\n\n const knowledgeContext: ChatCompletionMessageParam[] = opts.actionContext\n ? [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>`,\n },\n ],\n },\n ]\n : [];\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...knowledgeContext,\n ...instruction,\n ...historyLog,\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ...(vlMode\n ? 
[]\n : ([\n {\n type: 'text',\n text: pageDescription,\n },\n ] as ChatCompletionContentPart[])),\n ],\n },\n ];\n\n const { content, usage } = await callAIWithObjectResponse<PlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n );\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n // TODO: use zod.parse to parse the action.param, and then fill the bbox param.\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (vlMode) {\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n } else {\n const element = elementById(locateResult);\n if (element) {\n action.param[field].id = element.id;\n }\n }\n }\n });\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory?.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n conversationHistory?.append({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'I have finished the task',\n },\n ],\n });\n\n return 
returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","_opts_conversationHistory","_planFromAI_action","context","modelConfig","conversationHistory","screenshotBase64","size","modelName","vlMode","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","historyLog","knowledgeContext","instruction","msgs","content","usage","callAIWithObjectResponse","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","element","console"],"mappings":";;;;;;;AA2BA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAOC;QAwCkBC,2BAiEhBC;IAvGH,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGL;IACtD,MAAM,EAAEM,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,SAAS,EAAEC,MAAM,EAAE,GAAGL;IAE9B,MAAM,EAAE,aAAaM,eAAe,EAAEC,WAAW,EAAE,GAAG,MAAMC,iBAC1DT,SACA;QAAEM;IAAO;IAEX,MAAMI,eAAe,MAAMC,2BAA2B;QACpD,aAAad,KAAK,WAAW;QAC7B,QAAQS;IACV;IAEA,IAAIM,eAAeT;IACnB,IAAIU,aAAaT,KAAK,KAAK;IAC3B,IAAIU,cAAcV,KAAK,MAAM;IAC7B,MAAMW,aAAaF;IACnB,MAAMG,cAAcF;IACpB,IAAIR,AAAW,cAAXA,QAAsB;QACxB,MAAMW,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC,OAAO,IAAIX,AAAW,eAAXA,QAAuB;QAChC,MAAMW,eAAe,MAAMC,4BAA4BN,cAAc;QACrEC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC,OAAO,IAAI,CAACX,QACVM,eAAe,MAAMO,kBAAkBhB,kBAAkBH,QAAQ,IAAI,EAAE;QACrE,OAAOa;QACP,QAAQC;IACV;IAGFM,mBAAmBhB,MAAMC;IAEzB,MAAMgB,aAAavB,AAAAA,SAAAA,CAAAA,4BAAAA,KAAK,mBAAmB,AAAD,IAAvBA,KAAAA,IAAAA,0BAA0B,QAAQ,EAAC,KAAK,EAAE;IAG7D,MAAMwB,mBAAiDzB,KAAK,aAAa,GACrE;QACE;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,0BAA0B,CAAC;gBAClF;aACD;QACH;KACD,GACD,EAAE;IAEN,MAAM0B,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,CAAC,kBAAkB,EAAE3B,gBAAgB,mBAAmB,CAAC;gBACjE;aACD;QACH;KACD;IAED,MAAM4B,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASd;QAAa;WACrCY;WACAC;WACAF;QACH;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;mBACIN,SACA,EAAE,GACD;oBACC;wBACE,MAAM;wBACN,MAAMC;oBACR;iBACD;aACN;QACH;KACD;IAED,MAAM,EAAEkB,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMC,yBAC/BH,MACAI,aAAa,IAAI,EACjB3B;IAEF,MAAM4B,cAAcC,KAAK,SAAS,CAACL,SAASM,QAAW;IACvD,MAAMC,aAAaP;IAEnB,MAAMQ,UACHlC,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAACiC,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAH;QACA,UAAUS,uBACRF,SACApC,KAAK,WAAW,EAChBmC,WAAW,KAAK;IAEpB;IAEAI,OAAOJ,YAAY;IAGnBC,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsB1C,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACwC,SAAWA,OAAO,IAAI,KAAKC;QAG9B7C,MAAM,+BAA+B8C;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAEN9C,MAAM,gBAAgB+C;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,cACF,IAAIrC,QACF+B,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACA9B,YACAC,aACAC,YACAC,aACAV;iBAEG;gBACL,MAAMuC,UAAUrC,YAAYmC;gBAC5B,IAAIE,SACFR,OAAO,KAAK,CAACK,MAAM,CAAC,EAAE,GAAGG,QAAQ,EAAE;YAEvC;QAEJ;IACF;IAEAT,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IAEvE,IACEC,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACAlD;IAIJM,QAAAA,uBAAAA,oBAAqB,MAAM,CAAC;QAC1B,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM2B;YA
CR;SACD;IACH;IACA3B,QAAAA,uBAAAA,oBAAqB,MAAM,CAAC;QAC1B,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;SACD;IACH;IAEA,OAAOgC;AACT"}
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionContentPart,\n ChatCompletionMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory?: ConversationHistory;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { modelName, vlMode } = modelConfig;\n\n const { description: pageDescription, elementById } = await describeUserPage(\n context,\n { vlMode },\n );\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlMode,\n });\n\n let imagePayload = screenshotBase64;\n if (vlMode === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlMode) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size, modelName);\n\n const historyLog =\n opts.conversationHistory\n ?.snapshot()\n .filter((item) => item.role === 'assistant') || [];\n\n const knowledgeContext: ChatCompletionMessageParam[] = opts.actionContext\n ? [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>`,\n },\n ],\n },\n ]\n : [];\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...knowledgeContext,\n ...instruction,\n ...historyLog,\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ...(vlMode\n ? []\n : ([\n {\n type: 'text',\n text: pageDescription,\n },\n ] as ChatCompletionContentPart[])),\n ],\n },\n ];\n\n const { content, usage } = await callAIWithObjectResponse<PlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n );\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? 
[planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n // TODO: use zod.parse to parse the action.param, and then fill the bbox param.\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (vlMode) {\n action.param[field] = fillBboxParam(\n locateResult,\n size.width,\n size.height,\n vlMode,\n );\n } else {\n const element = elementById(locateResult);\n if (element) {\n action.param[field].id = element.id;\n }\n }\n }\n });\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory?.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","_opts_conversationHistory","_planFromAI_action","context","modelConfig","conversationHistory","screenshotBase64","size","modelName","vlMode","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","historyLog","item","knowledgeContext","instruction","msgs","content","usage","callAIWithObjectResponse","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","element","console"],"mappings":";;;;;;;AA4BA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAOC;QA8BCC,2BAkECC;IA9FH,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGL;IACtD,MAAM,EAAEM,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,SAAS,EAAEC,MAAM,EAAE,GAAGL;IAE9B,MAAM,EAAE,aAAaM,eAAe,EAAEC,WAAW,EAAE,GAAG,MAAMC,iBAC1DT,SACA;QAAEM;IAAO;IAEX,MAAMI,eAAe,MAAMC,2BAA2B;QACpD,aAAad,KAAK,WAAW;QAC7B,QAAQS;IACV;IAEA,IAAIM,eAAeT;IACnB,IAAIG,AAAW,cAAXA,QACFM,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACN,QACVM,eAAe,MAAME,kBACnBX,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBe,mBAAmBX,MAAMC;IAEzB,MAAMW,aACJlB,AAAAA,SAAAA,CAAAA,4BAAAA,KAAK,mBAAmB,AAAD,IAAvBA,KAAAA,IAAAA,0BACI,QAAQ,GACT,MAAM,CAAC,CAACmB,OAASA,AAAc,gBAAdA,KAAK,IAAI,CAAgB,KAAK,EAAE;IAEtD,MAAMC,mBAAiDrB,KAAK,aAAa,GACrE;QACE;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,0BAA0B,CAAC;gBAClF;aACD;QACH;KACD,GACD,EAAE;IAEN,MAAMsB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,CAAC,kBAAkB,EAAEvB,gBAAgB,mBAAmB,CAAC;gBACjE;aACD;QACH;KACD;IAED,MAAMwB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASV;QAAa;WACrCQ;WACAC;WACAH;QACH;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKJ;wB
ACL,QAAQ;oBACV;gBACF;mBACIN,SACA,EAAE,GACD;oBACC;wBACE,MAAM;wBACN,MAAMC;oBACR;iBACD;aACN;QACH;KACD;IAED,MAAM,EAAEc,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMC,yBAC/BH,MACAI,aAAa,IAAI,EACjBvB;IAEF,MAAMwB,cAAcC,KAAK,SAAS,CAACL,SAASM,QAAW;IACvD,MAAMC,aAAaP;IAEnB,MAAMQ,UACH9B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC6B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAH;QACA,UAAUS,uBACRF,SACAhC,KAAK,WAAW,EAChB+B,WAAW,KAAK;IAEpB;IAEAI,OAAOJ,YAAY;IAGnBC,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBtC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACoC,SAAWA,OAAO,IAAI,KAAKC;QAG9BzC,MAAM,+BAA+B0C;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAEN1C,MAAM,gBAAgB2C;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,cACF,IAAIjC,QACF2B,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACAnC,KAAK,KAAK,EACVA,KAAK,MAAM,EACXE;iBAEG;gBACL,MAAMmC,UAAUjC,YAAY+B;gBAC5B,IAAIE,SACFR,OAAO,KAAK,CAACK,MAAM,CAAC,EAAE,GAAGG,QAAQ,EAAE;YAEvC;QAEJ;IACF;IAEAT,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IAEvE,IACEC,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACA9C;IAIJM,QAAAA,uBAAAA,oBAAqB,MAAM,CAAC;QAC1B,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMuB;YACR;SACD;IACH;IAEA,OAAOK;AACT"}
|
|
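The two source maps above carry the full before/after source of `plan()` in their `sourcesContent`, and the visible differences sit in screenshot preprocessing and locator post-processing: the earlier build padded the screenshot for both `qwen-vl` and `qwen3-vl`, tracked the padded `imageWidth`/`imageHeight`, and threaded explicit `rightLimit`/`bottomLimit` clamps into `fillBboxParam`, while this build pads only for `qwen-vl`, calls `fillBboxParam(locateResult, size.width, size.height, vlMode)` with the raw screenshot size, filters the replayed conversation history down to assistant turns, and no longer appends the trailing "I have finished the task" user message. The sketch below re-creates just the locator post-processing loop from the embedded source to show the new call shape; the types and helper bodies are simplified stand-ins for illustration, not the package's actual implementations.

```ts
// Sketch of the locator post-processing inside plan(), as it reads in the
// embedded source above. Types and helper bodies are illustrative stand-ins.
type Bbox = [number, number, number, number];
interface LocateResult {
  bbox?: Bbox;
  id?: string;
}
interface PlannedAction {
  type: string;
  param: Record<string, LocateResult | undefined>;
}

// Stand-in for the package's fillBboxParam(locate, width, height, vlMode);
// the real helper adapts model-specific bbox formats, this one only clamps.
function fillBboxParam(
  locate: LocateResult,
  width: number,
  height: number,
  vlMode?: string,
): LocateResult {
  if (locate.bbox) {
    const [x1, y1, x2, y2] = locate.bbox;
    locate.bbox = [x1, y1, Math.min(x2, width), Math.min(y2, height)];
  }
  return locate;
}

// Stand-in for elementById() produced by describeUserPage().
function elementById(locate: LocateResult): { id: string } | undefined {
  return locate.id ? { id: locate.id } : undefined;
}

function fillLocateFields(
  action: PlannedAction,
  locateFields: string[],
  size: { width: number; height: number },
  vlMode?: string,
): void {
  for (const field of locateFields) {
    const locateResult = action.param[field];
    if (!locateResult) continue;
    if (vlMode) {
      // VL models answer with a bbox; adapt it against the raw screenshot size.
      action.param[field] = fillBboxParam(locateResult, size.width, size.height, vlMode);
    } else {
      // Non-VL models reference an element id from the page description instead.
      const element = elementById(locateResult);
      if (element) locateResult.id = element.id;
    }
  }
}

// Example: clamp a bbox that overflows a 1280x800 screenshot.
const action: PlannedAction = {
  type: 'Tap',
  param: { locate: { bbox: [100, 200, 1400, 900] } },
};
fillLocateFields(action, ['locate'], { width: 1280, height: 800 }, 'qwen-vl');
console.log(action.param.locate?.bbox); // [100, 200, 1280, 800]
```

The hunks that follow appear to belong to the planning prompt module (`./prompt/llm-planning`, imported by the source above), whose log-instruction strings changed in this release.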
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import { ifMidsceneLocatorField } from "../common.mjs";
|
|
2
2
|
import { bboxDescription } from "./common.mjs";
|
|
3
|
-
const vlCurrentLog = '"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs,
|
|
3
|
+
const vlCurrentLog = '"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "The user wants to do ... . According to the instruction and the previous logs, i think the next action should be ....". If no action should be done, log the reason. Use the same language as the user\'s instruction.';
|
|
4
4
|
const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
|
|
5
5
|
const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
|
|
6
6
|
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
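The `vlCurrentLog`, `llmCurrentLog`, and `commonOutputFields` fragments above are spliced into the planning system prompt and spell out the JSON reply the model must produce: a `log` string in the user's language, an optional `error`, a `more_actions_needed_by_instruction` flag, and either a single `action` (the VL variant asks for ONLY ONE action per turn) or an `actions` array (the embedded `plan()` source above accepts both). The sketch below is a rough TypeScript shape of that contract, inferred only from these fragments and the worked "Tap" example further down; it is not the package's exported type.

```ts
// Approximate shape of the planner's JSON reply, inferred from the prompt
// fragments in this diff and the "Tap" example below. Not an exported type.
interface PlannedAction {
  type: string;                    // e.g. "Tap"
  param: Record<string, unknown>;  // action-specific parameters (locator, value, ...)
}

interface PlanningReply {
  // Reasoning for the next step, written in the same language as the instruction.
  log: string;
  // Only set for situations the instruction could not have foreseen.
  error?: string;
  // true when more action(s) remain after the one described in `log`.
  more_actions_needed_by_instruction: boolean;
  // VL prompt variant: exactly one action per turn...
  action?: PlannedAction;
  // ...while plan() in the source maps above also falls back to an `actions` array.
  actions?: PlannedAction[];
}

// Instance mirroring the example the prompt builds up below:
const reply: PlanningReply = {
  log: "The user wants to click 'Confirm' and then 'Yes' in the popup; the next action is to click 'Yes'.",
  more_actions_needed_by_instruction: false,
  action: { type: 'Tap', param: {} },
};
console.log(JSON.stringify(reply, null, 2));
```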
@@ -110,7 +110,7 @@ For example, when the instruction is "click 'Confirm' button, and click 'Yes' in
|
|
|
110
110
|
this and output the JSON:
|
|
111
111
|
|
|
112
112
|
{
|
|
113
|
-
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs,
|
|
113
|
+
"log": "The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, i think the next action should be click 'Yes' in popup.",
|
|
114
114
|
"action": {
|
|
115
115
|
"type": "Tap",
|
|
116
116
|
"param": {
|