@midscene/core 0.30.5-beta-20251021035431.0 → 1.0.1-beta-20251021060907.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +41 -33
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/task-builder.mjs +303 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +68 -391
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/ui-utils.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +6 -6
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/common.mjs +1 -15
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/inspect.mjs +2 -3
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +6 -24
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +3 -204
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +101 -231
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/index.mjs +3 -2
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +18 -19
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/insight/utils.mjs +3 -3
- package/dist/es/insight/utils.mjs.map +1 -1
- package/dist/es/report.mjs.map +1 -1
- package/dist/es/{ai-model/action-executor.mjs → task-runner.mjs} +69 -10
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/types.mjs +18 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +18 -14
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +41 -33
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/task-builder.js +340 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/tasks.js +68 -391
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/ui-utils.js.map +1 -1
- package/dist/lib/agent/utils.js +6 -6
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/common.js +2 -19
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +1 -2
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +5 -23
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +2 -206
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +236 -384
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/index.js +9 -5
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +17 -18
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/insight/utils.js +5 -5
- package/dist/lib/insight/utils.js.map +1 -1
- package/dist/lib/report.js.map +1 -1
- package/dist/lib/{ai-model/action-executor.js → task-runner.js} +71 -12
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/types.js +22 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +18 -14
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +16 -0
- package/dist/types/agent/execution-session.d.ts +27 -0
- package/dist/types/agent/task-builder.d.ts +24 -0
- package/dist/types/agent/tasks.d.ts +8 -11
- package/dist/types/agent/ui-utils.d.ts +2 -2
- package/dist/types/agent/utils.d.ts +5 -2
- package/dist/types/ai-model/common.d.ts +0 -1
- package/dist/types/ai-model/prompt/llm-locator.d.ts +0 -2
- package/dist/types/index.d.ts +4 -3
- package/dist/types/insight/index.d.ts +5 -10
- package/dist/types/insight/utils.d.ts +2 -2
- package/dist/types/{ai-model/action-executor.d.ts → task-runner.d.ts} +14 -3
- package/dist/types/types.d.ts +47 -4
- package/dist/types/yaml.d.ts +3 -1
- package/package.json +4 -7
- package/dist/es/ai-model/action-executor.mjs.map +0 -1
- package/dist/lib/ai-model/action-executor.js.map +0 -1
package/dist/es/ai-model/inspect.mjs.map
@@ -1 +1 @@
- [one-line minified sourcemap; its embedded sourcesContent still imports markupImageForLLM and carries the qwen3-vl and non-VL branches of AiLocateElement]
+ [one-line minified sourcemap, regenerated; the embedded sourcesContent keeps only the searchConfig and qwen-vl image handling in AiLocateElement]
package/dist/es/ai-model/llm-planning.mjs
@@ -1,19 +1,16 @@
 import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
 import { getDebug } from "@midscene/shared/logger";
 import { assert } from "@midscene/shared/utils";
-import { AIActionType, buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField
+import { AIActionType, buildYamlFlowFromPlans, fillBboxParam, findAllMidsceneLocatorField } from "./common.mjs";
 import { systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
-import { describeUserPage } from "./prompt/util.mjs";
 import { callAIWithObjectResponse } from "./service-caller/index.mjs";
 const debug = getDebug('planning');
 async function plan(userInstruction, opts) {
     var _opts_conversationHistory, _planFromAI_action;
     const { context, modelConfig, conversationHistory } = opts;
     const { screenshotBase64, size } = context;
-    const {
-
-        vlMode
-    });
+    const { vlMode } = modelConfig;
+    assert(vlMode, 'Planning requires vlMode to be configured.');
     const systemPrompt = await systemPromptToTaskPlanning({
         actionSpace: opts.actionSpace,
         vlMode: vlMode
@@ -28,12 +25,7 @@ async function plan(userInstruction, opts) {
         imageWidth = paddedResult.width;
         imageHeight = paddedResult.height;
         imagePayload = paddedResult.imageBase64;
-    }
-    else if (!vlMode) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, {
-        width: imageWidth,
-        height: imageHeight
-    });
-    warnGPT4oSizeLimit(size, modelName);
+    }
     const historyLog = (null == (_opts_conversationHistory = opts.conversationHistory) ? void 0 : _opts_conversationHistory.snapshot()) || [];
     const knowledgeContext = opts.actionContext ? [
         {
@@ -74,13 +66,7 @@ async function plan(userInstruction, opts) {
                         url: imagePayload,
                         detail: 'high'
                     }
-                }
-                ...vlMode ? [] : [
-                    {
-                        type: 'text',
-                        text: pageDescription
-                    }
-                ]
+                }
             ]
         }
     ];
@@ -106,11 +92,7 @@ async function plan(userInstruction, opts) {
         debug('locateFields', locateFields);
         locateFields.forEach((field)=>{
             const locateResult = action.param[field];
-            if (locateResult)
-            else {
-                const element = elementById(locateResult);
-                if (element) action.param[field].id = element.id;
-            }
+            if (locateResult) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, vlMode);
         });
     });
     assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
package/dist/es/ai-model/llm-planning.mjs.map
@@ -1 +1 @@
- [one-line minified sourcemap; truncated in the extracted diff]
+ [one-line minified sourcemap, regenerated; the embedded sourcesContent now asserts that vlMode is configured, drops the describeUserPage/markupImageForLLM path, and always fills bbox params via fillBboxParam]
package/dist/es/ai-model/prompt/llm-locator.mjs
@@ -1,9 +1,8 @@
 import { PromptTemplate } from "@langchain/core/prompts";
 import { bboxDescription } from "./common.mjs";
 function systemPromptToLocateElement(vlMode) {
-
-
-    return `
+    const bboxComment = bboxDescription(vlMode);
+    return `
 ## Role:
 You are an expert in software testing.
 
@@ -55,207 +54,7 @@ When no element is found and the description is not order-sensitive:
 }
 \`\`\`
 `;
-}
-return `
-## Role:
-You are an expert in software page image (2D) and page element text analysis.
-
-## Objective:
-- Identify elements in screenshots and text that match the user's description.
-- Return JSON data containing the selection reason and element ID.
-- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
-
-## Skills:
-- Image analysis and recognition
-- Multilingual text understanding
-- Software UI design and testing
-
-## Workflow:
-1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
-2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
-3. Found the required number of elements
-4. Return JSON data containing the selection reason and element ID.
-5. Judge whether the user's description is order-sensitive (see below for definition and examples).
-
-## Constraints:
-- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
-- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
-- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
-- If no elements are found, the "elements" array should be empty.
-- The returned data must conform to the specified JSON format.
-- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
-
-## Order-Sensitive Definition:
-- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
-- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
-
-## Output Format:
-
-Please return the result in JSON format as follows:
-
-\`\`\`json
-{
-  "elements": [
-    // If no matching elements are found, return an empty array []
-    {
-      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
-      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
-    }
-    // More elements...
-  ],
-  "isOrderSensitive": true, // or false, depending on the user's description
-  "errors": [] // Array of strings containing any error messages
-}
-\`\`\`
-
-## Example:
-Example 1:
-Input Example:
-\`\`\`json
-// Description: "Shopping cart icon in the upper right corner"
-{
-  "description": "PLACEHOLDER", // Description of the target element
-  "screenshot": "path/screenshot.png",
-  "text": '{
-    "pageSize": {
-      "width": 400, // Width of the page
-      "height": 905 // Height of the page
-    },
-    "elementInfos": [
-      {
-        "id": "1231", // ID of the element
-        "indexId": "0", // Index of the element，The image is labeled to the left of the element
-        "attributes": { // Attributes of the element
-          "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
-          "src": "https://ap-southeast-3.m",
-          "class": ".img"
-        },
-        "content": "", // Text content of the element
-        "rect": {
-          "left": 280, // Distance from the left side of the page
-          "top": 8, // Distance from the top of the page
-          "width": 44, // Width of the element
|
|
139
|
-
"height": 44 // Height of the element
|
|
140
|
-
}
|
|
141
|
-
},
|
|
142
|
-
{
|
|
143
|
-
"id": "66551", // ID of the element
|
|
144
|
-
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
145
|
-
"attributes": { // Attributes of the element
|
|
146
|
-
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
147
|
-
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
|
|
148
|
-
"class": ".icon"
|
|
149
|
-
},
|
|
150
|
-
"content": "", // Text content of the element
|
|
151
|
-
"rect": {
|
|
152
|
-
"left": 350, // Distance from the left side of the page
|
|
153
|
-
"top": 16, // Distance from the top of the page
|
|
154
|
-
"width": 25, // Width of the element
|
|
155
|
-
"height": 25 // Height of the element
|
|
156
|
-
}
|
|
157
|
-
},
|
|
158
|
-
...
|
|
159
|
-
{
|
|
160
|
-
"id": "12344",
|
|
161
|
-
"indexId": "2", // Index of the element\u{FF0C}The image is labeled to the left of the element
|
|
162
|
-
"attributes": {
|
|
163
|
-
"nodeType": "TEXT Node",
|
|
164
|
-
"class": ".product-name"
|
|
165
|
-
},
|
|
166
|
-
"center": [
|
|
167
|
-
288,
|
|
168
|
-
834
|
|
169
|
-
],
|
|
170
|
-
"content": "Mango Drink",
|
|
171
|
-
"rect": {
|
|
172
|
-
"left": 188,
|
|
173
|
-
"top": 827,
|
|
174
|
-
"width": 199,
|
|
175
|
-
"height": 13
|
|
176
|
-
}
|
|
177
|
-
},
|
|
178
|
-
...
|
|
179
|
-
]
|
|
180
|
-
}
|
|
181
|
-
'
|
|
182
|
-
}
|
|
183
|
-
\`\`\`
|
|
184
|
-
Output Example:
|
|
185
|
-
\`\`\`json
|
|
186
|
-
{
|
|
187
|
-
"elements": [
|
|
188
|
-
{
|
|
189
|
-
// Describe the reason for finding this element, replace with actual value in practice
|
|
190
|
-
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
191
|
-
"text": "",
|
|
192
|
-
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
193
|
-
"id": "1231"
|
|
194
|
-
}
|
|
195
|
-
],
|
|
196
|
-
"isOrderSensitive": true,
|
|
197
|
-
"errors": []
|
|
198
|
-
}
|
|
199
|
-
\`\`\`
|
|
200
|
-
|
|
201
|
-
`;
|
|
202
57
|
}
|
|
203
|
-
const locatorSchema = {
|
|
204
|
-
type: 'json_schema',
|
|
205
|
-
json_schema: {
|
|
206
|
-
name: 'find_elements',
|
|
207
|
-
strict: true,
|
|
208
|
-
schema: {
|
|
209
|
-
type: 'object',
|
|
210
|
-
properties: {
|
|
211
|
-
elements: {
|
|
212
|
-
type: 'array',
|
|
213
|
-
items: {
|
|
214
|
-
type: 'object',
|
|
215
|
-
properties: {
|
|
216
|
-
reason: {
|
|
217
|
-
type: 'string',
|
|
218
|
-
description: 'Reason for finding this element'
|
|
219
|
-
},
|
|
220
|
-
text: {
|
|
221
|
-
type: 'string',
|
|
222
|
-
description: 'Text content of the element'
|
|
223
|
-
},
|
|
224
|
-
id: {
|
|
225
|
-
type: 'string',
|
|
226
|
-
description: 'ID of this element'
|
|
227
|
-
}
|
|
228
|
-
},
|
|
229
|
-
required: [
|
|
230
|
-
'reason',
|
|
231
|
-
'text',
|
|
232
|
-
'id'
|
|
233
|
-
],
|
|
234
|
-
additionalProperties: false
|
|
235
|
-
},
|
|
236
|
-
description: 'List of found elements'
|
|
237
|
-
},
|
|
238
|
-
isOrderSensitive: {
|
|
239
|
-
type: 'boolean',
|
|
240
|
-
description: "Whether the targetElementDescription is order-sensitive (true/false)"
|
|
241
|
-
},
|
|
242
|
-
errors: {
|
|
243
|
-
type: 'array',
|
|
244
|
-
items: {
|
|
245
|
-
type: 'string'
|
|
246
|
-
},
|
|
247
|
-
description: 'List of error messages, if any'
|
|
248
|
-
}
|
|
249
|
-
},
|
|
250
|
-
required: [
|
|
251
|
-
'elements',
|
|
252
|
-
'isOrderSensitive',
|
|
253
|
-
'errors'
|
|
254
|
-
],
|
|
255
|
-
additionalProperties: false
|
|
256
|
-
}
|
|
257
|
-
}
|
|
258
|
-
};
|
|
259
58
|
const findElementPrompt = new PromptTemplate({
|
|
260
59
|
template: `
|
|
261
60
|
Here is the item user want to find:
|
|
@@ -270,6 +69,6 @@ Here is the item user want to find:
 "targetElementDescription"
 ]
 });
-export { findElementPrompt, locatorSchema, systemPromptToLocateElement };
+export { findElementPrompt, systemPromptToLocateElement };
 
 //# sourceMappingURL=llm-locator.mjs.map

package/dist/es/ai-model/prompt/llm-locator.mjs.map

@@ -1 +1 @@
-{"version":3,"file":"ai-model/prompt/llm-locator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["import { PromptTemplate } from '@langchain/core/prompts';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {\n if (vlMode) {\n const bboxComment = bboxDescription(vlMode);\n return `\n## Role:\nYou are an expert in software testing.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Give the coordinates of the element that matches the user's description best in the screenshot.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[],\n \"isOrderSensitive\": boolean // Whether the targetElementDescription is order-sensitive (true/false)\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description best in the screenshot\n* \\`isOrderSensitive\\` is a boolean indicating whether the user's description is order-sensitive (true/false)\n* \\`errors\\` is an optional array of error messages (if any)\n\nOrder-sensitive means the description contains phrases like:\n- \"the third item in the list\"\n- \"the last button\"\n- \"the first input box\"\n- \"the second row\"\n\nNot order-sensitive means the description is like:\n- \"confirm button\"\n- \"search box\"\n- \"password input\"\n\nFor example, when an element is found and the description is order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found and the description is not order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"isOrderSensitive\": false,\n \"errors\": [\"I can see ..., but {some element} is not found\"]\n}\n\\`\\`\\`\n`;\n }\n\n return `\n## Role:\nYou are an expert in software page image (2D) and page element text analysis.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Return JSON data containing the selection reason and element ID.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Skills:\n- Image analysis and recognition\n- Multilingual text understanding\n- Software UI design and testing\n\n## Workflow:\n1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.\n2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.\n3. Found the required number of elements\n4. Return JSON data containing the selection reason and element ID.\n5. Judge whether the user's description is order-sensitive (see below for definition and examples).\n\n## Constraints:\n- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.\n- Elements in the image with NodeType other than \"TEXT Node\" have been highlighted to identify the element among multiple non-text elements.\n- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.\n- If no elements are found, the \"elements\" array should be empty.\n- The returned data must conform to the specified JSON format.\n- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)\n\n## Order-Sensitive Definition:\n- If the description contains phrases like \"the third item in the list\", \"the last button\", \"the first input box\", \"the second row\", etc., it is order-sensitive (isOrderSensitive = true).\n- If the description is like \"confirm button\", \"search box\", \"password input\", etc., it is not order-sensitive (isOrderSensitive = false).\n\n## Output Format:\n\nPlease return the result in JSON format as follows:\n\n\\`\\`\\`json\n{\n \"elements\": [\n // If no matching elements are found, return an empty array []\n {\n \"reason\": \"PLACEHOLDER\", // The thought process for finding the element, replace PLACEHOLDER with your thought process\n \"text\": \"PLACEHOLDER\", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty\n \"id\": \"PLACEHOLDER\" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo\n }\n // More elements...\n ],\n \"isOrderSensitive\": true, // or false, depending on the user's description\n \"errors\": [] // Array of strings containing any error messages\n}\n\\`\\`\\`\n\n## Example:\nExample 1:\nInput Example:\n\\`\\`\\`json\n// Description: \"Shopping cart icon in the upper right corner\"\n{\n \"description\": \"PLACEHOLDER\", // Description of the target element\n \"screenshot\": \"path/screenshot.png\",\n \"text\": '{\n \"pageSize\": {\n \"width\": 400, // Width of the page\n \"height\": 905 // Height of the page\n },\n \"elementInfos\": [\n {\n \"id\": \"1231\", // ID of the element\n \"indexId\": \"0\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"https://ap-southeast-3.m\",\n \"class\": \".img\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 280, // Distance from the left side of the page\n \"top\": 8, // Distance from the top of the page\n \"width\": 44, // Width of the element\n \"height\": 44 // Height of the element\n }\n },\n {\n \"id\": \"66551\", // ID of the element\n \"indexId\": \"1\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"data:image/png;base64,iVBORw0KGgoAAAANSU...\",\n \"class\": \".icon\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 350, // Distance from the left side of the page\n \"top\": 16, // Distance from the top of the page\n \"width\": 25, // Width of the element\n \"height\": 25 // Height of the element\n }\n },\n ...\n {\n \"id\": \"12344\",\n \"indexId\": \"2\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": {\n \"nodeType\": \"TEXT Node\",\n \"class\": \".product-name\"\n },\n \"center\": [\n 288,\n 834\n ],\n \"content\": \"Mango Drink\",\n \"rect\": {\n \"left\": 188,\n \"top\": 827,\n \"width\": 199,\n \"height\": 13\n }\n },\n ...\n ]\n }\n '\n}\n\\`\\`\\`\nOutput Example:\n\\`\\`\\`json\n{\n \"elements\": [\n {\n // Describe the reason for finding this element, replace with actual value in practice\n \"reason\": \"Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button\",\n \"text\": \"\",\n // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**\n \"id\": \"1231\"\n }\n ],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n \n `;\n}\n\nexport const locatorSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'find_elements',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n elements: {\n type: 'array',\n items: {\n type: 'object',\n properties: {\n reason: {\n type: 'string',\n description: 'Reason for finding this element',\n },\n text: {\n type: 'string',\n description: 'Text content of the element',\n },\n id: {\n type: 'string',\n description: 'ID of this element',\n },\n },\n required: ['reason', 'text', 'id'],\n additionalProperties: false,\n },\n description: 'List of found elements',\n },\n isOrderSensitive: {\n type: 'boolean',\n description:\n 'Whether the targetElementDescription is order-sensitive (true/false)',\n },\n errors: {\n type: 'array',\n items: {\n type: 'string',\n },\n description: 'List of error messages, if any',\n },\n },\n required: ['elements', 'isOrderSensitive', 'errors'],\n additionalProperties: false,\n },\n },\n};\n\nexport const findElementPrompt = new PromptTemplate({\n template: `\nHere is the item user want to find:\n=====================================\n{targetElementDescription}\n=====================================\n\n{pageDescription}\n `,\n inputVariables: ['pageDescription', 'targetElementDescription'],\n});\n"],"names":["systemPromptToLocateElement","vlMode","bboxComment","bboxDescription","locatorSchema","findElementPrompt","PromptTemplate"],"mappings":";;AAIO,SAASA,4BAA4BC,MAAgC;IAC1E,IAAIA,QAAQ;QACV,MAAMC,cAAcC,gBAAgBF;QACpC,OAAO,CAAC;;;;;;;;;;;;gDAYoC,EAAEC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuC9D,CAAC;IACC;IAEA,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA8IR,CAAC;AACH;AAEO,MAAME,gBAA0C;IACrD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,UAAU;oBACR,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,YAAY;4BACV,QAAQ;gCACN,MAAM;gCACN,aAAa;4BACf;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,IAAI;gCACF,MAAM;gCACN,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAU;4BAAQ;yBAAK;wBAClC,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,kBAAkB;oBAChB,MAAM;oBACN,aACE;gBACJ;gBACA,QAAQ;oBACN,MAAM;oBACN,OAAO;wBACL,MAAM;oBACR;oBACA,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAY;gBAAoB;aAAS;YACpD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,oBAAoB,IAAIC,eAAe;IAClD,UAAU,CAAC;;;;;;;EAOX,CAAC;IACD,gBAAgB;QAAC;QAAmB;KAA2B;AACjE"}
+{"version":3,"file":"ai-model/prompt/llm-locator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["import { PromptTemplate } from '@langchain/core/prompts';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {\n const bboxComment = bboxDescription(vlMode);\n return `\n## Role:\nYou are an expert in software testing.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Give the coordinates of the element that matches the user's description best in the screenshot.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[],\n \"isOrderSensitive\": boolean // Whether the targetElementDescription is order-sensitive (true/false)\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description best in the screenshot\n* \\`isOrderSensitive\\` is a boolean indicating whether the user's description is order-sensitive (true/false)\n* \\`errors\\` is an optional array of error messages (if any)\n\nOrder-sensitive means the description contains phrases like:\n- \"the third item in the list\"\n- \"the last button\"\n- \"the first input box\"\n- \"the second row\"\n\nNot order-sensitive means the description is like:\n- \"confirm button\"\n- \"search box\"\n- \"password input\"\n\nFor example, when an element is found and the description is order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found and the description is not order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"isOrderSensitive\": false,\n \"errors\": [\"I can see ..., but {some element} is not found\"]\n}\n\\`\\`\\`\n`;\n}\n\nexport const findElementPrompt = new PromptTemplate({\n template: `\nHere is the item user want to find:\n=====================================\n{targetElementDescription}\n=====================================\n\n{pageDescription}\n `,\n inputVariables: ['pageDescription', 'targetElementDescription'],\n});\n"],"names":["systemPromptToLocateElement","vlMode","bboxComment","bboxDescription","findElementPrompt","PromptTemplate"],"mappings":";;AAGO,SAASA,4BAA4BC,MAAgC;IAC1E,MAAMC,cAAcC,gBAAgBF;IACpC,OAAO,CAAC;;;;;;;;;;;;gDAYsC,EAAEC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuC9D,CAAC;AACD;AAEO,MAAME,oBAAoB,IAAIC,eAAe;IAClD,UAAU,CAAC;;;;;;;EAOX,CAAC;IACD,gBAAgB;QAAC;QAAmB;KAA2B;AACjE"}
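Taken together, the hunks above make the locator prompt VL-only: `systemPromptToLocateElement` no longer falls back to the element-ID flow, and the strict OpenAI response format `locatorSchema` is removed from the module, so any consumer importing `locatorSchema` from this path will now fail to resolve. The surviving prompt asks the model for a bare JSON object, leaving validation to the caller. Below is a sketch of that shape and a guard, inferred from the prompt's "Output Format" section; the interface and helper are illustrative assumptions, not part of the package's exports:

```ts
// Inferred from the prompt text above; not a type exported by @midscene/core.
interface LocateElementResponse {
  bbox: [number, number, number, number] | []; // empty array when nothing matched
  isOrderSensitive: boolean;
  errors?: string[];
}

// Hypothetical validation a caller might add now that no json_schema
// (the removed `locatorSchema`) constrains the model output.
function parseLocateResponse(raw: string): LocateElementResponse {
  const data = JSON.parse(raw) as Partial<LocateElementResponse>;
  if (!Array.isArray(data.bbox) || typeof data.isOrderSensitive !== 'boolean') {
    throw new Error('unexpected locator response shape');
  }
  return {
    bbox: data.bbox,
    isOrderSensitive: data.isOrderSensitive,
    errors: data.errors ?? [],
  };
}
```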