@midscene/core 0.26.7-beta-20250818035341.0 → 0.26.7-beta-20250820105545.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model/action-executor.mjs +0 -8
- package/dist/es/ai-model/action-executor.mjs.map +1 -1
- package/dist/es/ai-model/common.mjs +73 -52
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/index.mjs +3 -3
- package/dist/es/ai-model/inspect.mjs +29 -66
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +27 -24
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/assertion.mjs +1 -25
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +50 -23
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/util.mjs +2 -2
- package/dist/es/ai-model/prompt/util.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +75 -118
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/index.mjs +3 -2
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +14 -97
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/insight/utils.mjs +1 -3
- package/dist/es/insight/utils.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +5 -6
- package/dist/es/utils.mjs.map +1 -1
- package/dist/lib/ai-model/action-executor.js +0 -8
- package/dist/lib/ai-model/action-executor.js.map +1 -1
- package/dist/lib/ai-model/common.js +97 -55
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/index.js +16 -4
- package/dist/lib/ai-model/inspect.js +29 -69
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +26 -23
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/assertion.js +2 -29
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +52 -25
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/util.js +2 -2
- package/dist/lib/ai-model/prompt/util.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +78 -124
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +5 -5
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/index.js +20 -7
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +10 -93
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/insight/utils.js +1 -3
- package/dist/lib/insight/utils.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +4 -5
- package/dist/lib/utils.js.map +1 -1
- package/dist/types/ai-model/common.d.ts +162 -8
- package/dist/types/ai-model/index.d.ts +2 -1
- package/dist/types/ai-model/inspect.d.ts +3 -8
- package/dist/types/ai-model/llm-planning.d.ts +1 -1
- package/dist/types/ai-model/prompt/assertion.d.ts +0 -3
- package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
- package/dist/types/ai-model/prompt/util.d.ts +2 -1
- package/dist/types/ai-model/service-caller/index.d.ts +6 -6
- package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
- package/dist/types/index.d.ts +3 -1
- package/dist/types/insight/index.d.ts +1 -5
- package/dist/types/types.d.ts +11 -12
- package/dist/types/yaml.d.ts +7 -6
- package/package.json +4 -3
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIAssertionResponse,\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n MIDSCENE_USE_QWEN_VL,\n MIDSCENE_USE_VLM_UI_TARS,\n getAIConfigInBoolean,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport { systemPromptToAssert } from './prompt/assertion';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(vlLocateMode());\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode() === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode()) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToLocateSection(vlLocateMode());\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(bbox, context.size.width, context.size.height);\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(context, {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n });\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n\nexport async function AiAssert<\n ElementType extends BaseElement = BaseElement,\n>(options: { assertion: TUserPrompt; context: UIContext<ElementType> }) {\n const { assertion, context } = options;\n\n assert(assertion, 'assertion should not be empty');\n\n const { screenshotBase64 } = context;\n\n const systemPrompt = systemPromptToAssert({\n isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS),\n });\n\n const assertionText = extraTextFromUserPrompt(assertion);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: `\nHere is the assertion. Please tell whether it is truthy according to the screenshot.\n=====================================\n${assertionText}\n=====================================\n `,\n },\n ],\n },\n ];\n\n if (typeof assertion !== 'string') {\n const addOns = await promptsToChatParam({\n images: assertion.images,\n convertHttpImage2Base64: assertion.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const { content: assertResult, usage } = await callAiFn<AIAssertionResponse>(\n msgs,\n AIActionType.ASSERT,\n );\n return {\n content: assertResult,\n usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getAIConfigInBoolean","MIDSCENE_USE_QWEN_VL","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","AiAssert","assertion","systemPromptToAssert","MIDSCENE_USE_VLM_UI_TARS","assertionText","assertResult","usage"],"mappings":";;;;;;;;;;;AAmEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBP;IAEzBQ,OACEP,0BACA;IAGF,MAAMQ,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0Bf,wBAAwBY;IACpD;IACA,MAAMU,eAAeC,4BAA4BC;IAEjD,IAAIC,eAAeX;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBS,OACET,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFS,OACET,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIc,AAAmB,cAAnBA,gBACTC,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACD,gBACVC,eAAe,MAAME,kBACnBb,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOR,0BAAuC;QAChD,MAAMgB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAMC,WACJhB,UAAUiB;IAEZ,MAAMC,MAAM,MAAMF,SAASxB,MAAM2B,aAAa,eAAe;IAE7D,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBACAC,6BAAAA,wBACAC,6BAAAA;YALFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK5B,QAAQ,IAAI,CAAC,KAAK,EACvD6B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK7B,QAAQ,IAAI,CAAC,MAAM,UACzD8B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC7C,aAAa,WAAWsC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiCnC,QAAQ,IAAI,EAAEiC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB5C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE4C,kBAAkB,EAAE,GAAG7C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAekC,4BAA4BhC;IACjD,MAAMiC,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB1D,wBAAwBuD;IAC9C;IACA,MAAMlD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM2C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM1B,mBAAmB;YACtC,QAAQqD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAlD,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAG3B,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACAnD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM;QAErBZ,aAAa,0BAA0BgE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D5D,aAAa,wBAAwBiE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBAAgBuB,MAAMvD,QAAQ,IAAI,CAAC,KAAK,EAAEA,QAAQ,IAAI,CAAC,MAAM;QAExEZ,aAAa,qBAAqBkE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DlE,aAAa,iBAAiBoE;QAG9BN,cAAcQ,iBAAiBF,YAAYxD,QAAQ,IAAI;QACvDZ,aAAa,2BAA2B8D;IAC1C;IAEA,IAAIS,cAAcxD;IAClB,IAAI+C,aACFS,cAAc,MAAMC,WAClBzD,kBACA+C,aACAW,qBAAqBC;IAIzB,OAAO;QACL,MAAMZ;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAee,qBAGpBhE,OAKD;QA0CKiE;IAzCJ,MAAM,EAAEC,SAAS,EAAEjE,OAAO,EAAEkE,aAAa,EAAE1E,gBAAgB,EAAE,GAAGO;IAChE,MAAMY,eAAewD;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEI,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBAAiBP,SAAS;QACnE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAakE,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAEA,MAAME,wBAAwB,MAAMC,uBAClCjE,aACA6D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS2D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAMyB,SAAS,MAAM1B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBvD,MACA2B,aAAa,YAAY;IAE3B,OAAO;QACL,aAAa2B,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF;AAEO,eAAeuB,SAEpBxE,OAAoE;IACpE,MAAM,EAAEyE,SAAS,EAAExE,OAAO,EAAE,GAAGD;IAE/BS,OAAOgE,WAAW;IAElB,MAAM,EAAErE,gBAAgB,EAAE,GAAGH;IAE7B,MAAMW,eAAe8D,qBAAqB;QACxC,UAAUZ,qBAAqBa;IACjC;IAEA,MAAMC,gBAAgBtF,wBAAwBmF;IAE9C,MAAM9E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASiB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKR;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM,CAAC;;;AAGjB,EAAEwE,cAAc;;EAEd,CAAC;gBACK;aACD;QACH;KACD;IAED,IAAI,AAAqB,YAArB,OAAOH,WAAwB;QACjC,MAAMvD,SAAS,MAAM1B,mBAAmB;YACtC,QAAQiF,UAAU,MAAM;YACxB,yBAAyBA,UAAU,uBAAuB;QAC5D;QACA9E,KAAK,IAAI,IAAIuB;IACf;IAEA,MAAM,EAAE,SAAS2D,YAAY,EAAEC,KAAK,EAAE,GAAG,MAAM5B,SAC7CvD,MACA2B,aAAa,MAAM;IAErB,OAAO;QACL,SAASuD;QACTC;IACF;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/inspect.mjs","sources":["webpack://@midscene/core/./src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementLocatorResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n BaseElement,\n ElementById,\n InsightExtractOption,\n Rect,\n ReferenceImage,\n TMultimodalPrompt,\n TUserPrompt,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n AIActionType,\n adaptBboxToRect,\n callAiFn,\n expandSearchArea,\n markupImageForLLM,\n mergeRects,\n} from './common';\nimport {\n extractDataQueryPrompt,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n describeUserPage,\n distance,\n distanceThreshold,\n elementByPositionWithElementInfo,\n} from './prompt/util';\nimport { callToGetJSONObject } from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `reference image ${item.name}:`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement<\n ElementType extends BaseElement = BaseElement,\n>(options: {\n context: UIContext<ElementType>;\n targetElementDescription: TUserPrompt;\n referenceImage?: ReferenceImage;\n callAI?: typeof callAiFn<AIElementResponse | [number, number]>;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n}): Promise<{\n parseResult: AIElementLocatorResponse;\n rect?: Rect;\n rawResponse: string;\n elementById: ElementById;\n usage?: AIUsageInfo;\n isOrderSensitive?: boolean;\n}> {\n const { context, targetElementDescription, callAI } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const { description, elementById, insertElementByPosition } =\n await describeUserPage(context, modelPreferences);\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n\n const userInstructionPrompt = await findElementPrompt.format({\n pageDescription: description,\n targetElementDescription: extraTextFromUserPrompt(targetElementDescription),\n });\n const systemPrompt = systemPromptToLocateElement(\n vlLocateMode(modelPreferences),\n );\n\n let imagePayload = screenshotBase64;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n } else if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const callAIFn =\n callAI || callToGetJSONObject<AIElementResponse | [number, number]>;\n\n const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, {\n intent: 'grounding',\n });\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: AIElementLocatorResponse['elements'] =\n 'elements' in res.content ? res.content.elements : [];\n let errors: AIElementLocatorResponse['errors'] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if ('bbox' in res.content && Array.isArray(res.content.bbox)) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n options.searchConfig?.rect?.width || context.size.width,\n options.searchConfig?.rect?.height || context.size.height,\n modelPreferences,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n );\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n let element = elementByPositionWithElementInfo(context.tree, rectCenter);\n\n const distanceToCenter = element\n ? distance({ x: element.center[0], y: element.center[1] }, rectCenter)\n : 0;\n\n if (!element || distanceToCenter > distanceThreshold) {\n element = insertElementByPosition(rectCenter);\n }\n\n if (element) {\n matchedElements = [element];\n errors = [];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse,\n elementById,\n usage: res.usage,\n isOrderSensitive:\n typeof res.content === 'object' &&\n res.content !== null &&\n 'isOrderSensitive' in res.content\n ? (res.content as any).isOrderSensitive\n : undefined,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext<BaseElement>;\n sectionDescription: TUserPrompt;\n callAI?: typeof callAiFn<AISectionLocatorResponse>;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription } = options;\n const { screenshotBase64 } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n const systemPrompt = systemPromptToLocateSection(\n vlLocateMode(modelPreferences),\n );\n const sectionLocatorInstructionText = await sectionLocatorInstruction.format({\n sectionDescription: extraTextFromUserPrompt(sectionDescription),\n });\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AISectionLocatorResponse>(\n msgs,\n AIActionType.EXTRACT_DATA,\n {\n intent: 'grounding',\n },\n );\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n modelPreferences,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelPreferences);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n imageBase64 = await cropByRect(\n screenshotBase64,\n sectionRect,\n getIsUseQwenVl({\n intent: 'grounding',\n }),\n );\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<\n T,\n ElementType extends BaseElement = BaseElement,\n>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext<ElementType>;\n extractOption?: InsightExtractOption;\n modelPreferences: IModelPreferences;\n}) {\n const {\n dataQuery,\n context,\n extractOption,\n multimodalPrompt,\n modelPreferences,\n } = options;\n const systemPrompt = systemPromptToExtract();\n\n const { screenshotBase64 } = context;\n const { description, elementById } = await describeUserPage(\n context,\n modelPreferences,\n {\n truncateTextLength: 200,\n filterNonTextContent: false,\n visibleOnly: false,\n domIncluded: extractOption?.domIncluded,\n },\n );\n\n const extractDataPromptText = await extractDataQueryPrompt(\n description,\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (options.extractOption?.returnThought) {\n msgs.push({\n role: 'user',\n content: 'Please provide reasons.',\n });\n }\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const result = await callAiFn<AIDataExtractionResponse<T>>(\n msgs,\n AIActionType.EXTRACT_DATA,\n modelPreferences,\n );\n return {\n parseResult: result.content,\n elementById,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","_multimodalPrompt_images","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAI","screenshotBase64","modelPreferences","description","elementById","insertElementByPosition","describeUserPage","assert","userInstructionPrompt","findElementPrompt","systemPrompt","systemPromptToLocateElement","vlLocateMode","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","addOns","callAIFn","callToGetJSONObject","res","AIActionType","rawResponse","JSON","resRect","matchedElements","errors","Array","_options_searchConfig_rect","_options_searchConfig_rect1","_options_searchConfig_rect2","_options_searchConfig_rect3","adaptBboxToRect","rectCenter","element","elementByPositionWithElementInfo","distanceToCenter","distance","distanceThreshold","e","msg","Error","undefined","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAiFn","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","cropByRect","getIsUseQwenVl","AiExtractElementInfo","_options_extractOption","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent"],"mappings":";;;;;;;;;;AAgEA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;QAGIC;IADJ,MAAMC,OAAyC,EAAE;IACjD,IAAID,QAAAA,mBAAAA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,iBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBAA0B,MAAM,EAAE;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQH,iBAAiB,MAAM,CAAE;YAC1C,MAAMI,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACH,iBAAiB,uBAAuB;YAG5CE,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,gBAAgB,EAAEC,KAAK,IAAI,CAAC,CAAC,CAAC;oBACvC;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAEpBC,OAMD;IAQC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,MAAM,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAM,EAAEC,WAAW,EAAEC,WAAW,EAAEC,uBAAuB,EAAE,GACzD,MAAMC,iBAAiBR,SAASI;IAElCK,OACER,0BACA;IAGF,MAAMS,wBAAwB,MAAMC,kBAAkB,MAAM,CAAC;QAC3D,iBAAiBN;QACjB,0BAA0BhB,wBAAwBY;IACpD;IACA,MAAMW,eAAeC,4BACnBC,aAAaV;IAGf,IAAIW,eAAeZ;IAEnB,IAAIJ,QAAQ,YAAY,EAAE;QACxBU,OACEV,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFU,OACEV,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFgB,eAAehB,QAAQ,YAAY,CAAC,WAAW;IACjD,OAAO,IAAIe,AAAmC,cAAnCA,aAAaV,mBACtBW,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACD,aAAaV,mBACvBW,eAAe,MAAME,kBACnBd,kBACAH,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhB,MAAMN,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKG;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAML;gBACR;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOT,0BAAuC;QAChD,MAAMiB,SAAS,MAAM3B,mBAAmB;YACtC,QAAQU,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAMC,WACJjB,UAAUkB;IAEZ,MAAMC,MAAM,MAAMF,SAASzB,MAAM4B,aAAa,eAAe,EAAE;QAC7D,QAAQ;IACV;IAEA,MAAMC,cAAcC,KAAK,SAAS,CAACH,IAAI,OAAO;IAE9C,IAAII;IACJ,IAAIC,kBACF,cAAcL,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,QAAQ,GAAG,EAAE;IACvD,IAAIM,SACF,YAAYN,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IAAI,UAAUA,IAAI,OAAO,IAAIO,MAAM,OAAO,CAACP,IAAI,OAAO,CAAC,IAAI,GAAG;gBAG1DQ,4BAAAA,uBACAC,6BAAAA,wBAEAC,6BAAAA,wBACAC,6BAAAA;YANFP,UAAUQ,gBACRZ,IAAI,OAAO,CAAC,IAAI,EAChBQ,AAAAA,SAAAA,CAAAA,wBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,6BAAAA,sBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,2BAA4B,KAAK,AAAD,KAAK7B,QAAQ,IAAI,CAAC,KAAK,EACvD8B,AAAAA,SAAAA,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,MAAM,AAAD,KAAK9B,QAAQ,IAAI,CAAC,MAAM,EACzDI,kBAAAA,QACA2B,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,IAAI,UAChCC,CAAAA,yBAAAA,QAAQ,YAAY,AAAD,IAAnBA,KAAAA,IAAAA,QAAAA,CAAAA,8BAAAA,uBAAsB,IAAI,AAAD,IAAzBA,KAAAA,IAAAA,4BAA4B,GAAG;YAEjC9C,aAAa,WAAWuC;YAExB,MAAMS,aAAa;gBACjB,GAAGT,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YACA,IAAIU,UAAUC,iCAAiCpC,QAAQ,IAAI,EAAEkC;YAE7D,MAAMG,mBAAmBF,UACrBG,SAAS;gBAAE,GAAGH,QAAQ,MAAM,CAAC,EAAE;gBAAE,GAAGA,QAAQ,MAAM,CAAC,EAAE;YAAC,GAAGD,cACzD;YAEJ,IAAI,CAACC,WAAWE,mBAAmBE,mBACjCJ,UAAU5B,wBAAwB2B;YAGpC,IAAIC,SAAS;gBACXT,kBAAkB;oBAACS;iBAAQ;gBAC3BR,SAAS,EAAE;YACb;QACF;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaE,QACT,CAAC,sBAAsB,EAAEF,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACb,UAAUA,AAAAA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,MAAM,AAAD,MAAM,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEc,IAAI,CAAC,CAAC;aAFtBd,SAAS;YAACc;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMhB;QACN,aAAa;YACX,UAAUC;YACVC;QACF;QACAJ;QACAjB;QACA,OAAOe,IAAI,KAAK;QAChB,kBACE,AAAuB,YAAvB,OAAOA,IAAI,OAAO,IAClBA,AAAgB,SAAhBA,IAAI,OAAO,IACX,sBAAsBA,IAAI,OAAO,GAC5BA,IAAI,OAAO,CAAS,gBAAgB,GACrCsB;IACR;AACF;AAEO,eAAeC,gBAAgB7C,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAE6C,kBAAkB,EAAE,GAAG9C;IACxC,MAAM,EAAEI,gBAAgB,EAAE,GAAGH;IAE7B,MAAMI,mBAAsC;QAC1C,QAAQ;IACV;IAEA,MAAMQ,eAAekC,4BACnBhC,aAAaV;IAEf,MAAM2C,gCAAgC,MAAMC,0BAA0B,MAAM,CAAC;QAC3E,oBAAoB3D,wBAAwBwD;IAC9C;IACA,MAAMnD,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAM4C;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAM3B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQsD,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACAnD,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBxD,MACA4B,aAAa,YAAY,EACzB;QACE,QAAQ;IACV;IAGF,IAAI6B;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAapB,gBACjBmB,aACApD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFhB,aAAa,0BAA0BiE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9D7D,aAAa,wBAAwBkE;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAS5B,MAAM,OAAO,CAAC4B,OAC/B,GAAG,CAAC,CAACA,OACGvB,gBACLuB,MACAxD,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNhB,aAAa,qBAAqBmE;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7DnE,aAAa,iBAAiBqE;QAG9BN,cAAcQ,iBAAiBF,YAAYzD,QAAQ,IAAI,EAAEI;QACzDhB,aAAa,2BAA2B+D;IAC1C;IAEA,IAAIS,cAAczD;IAClB,IAAIgD,aACFS,cAAc,MAAMC,WAClB1D,kBACAgD,aACAW,eAAe;QACb,QAAQ;IACV;IAIJ,OAAO;QACL,MAAMX;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAazB,KAAK,SAAS,CAACyB,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAGpBhE,OAMD;QAoDKiE;IAnDJ,MAAM,EACJC,SAAS,EACTjE,OAAO,EACPkE,aAAa,EACb1E,gBAAgB,EAChBY,gBAAgB,EACjB,GAAGL;IACJ,MAAMa,eAAeuD;IAErB,MAAM,EAAEhE,gBAAgB,EAAE,GAAGH;IAC7B,MAAM,EAAEK,WAAW,EAAEC,WAAW,EAAE,GAAG,MAAME,iBACzCR,SACAI,kBACA;QACE,oBAAoB;QACpB,sBAAsB;QACtB,aAAa;QACb,aAAa8D,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,WAAW;IACzC;IAGF,MAAME,wBAAwB,MAAMC,uBAClChE,aACA4D;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,AAAAA,CAAAA,QAAAA,gBAAAA,KAAAA,IAAAA,cAAe,kBAAkB,AAAD,MAAM,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKnE;YACL,QAAQ;QACV;IACF;IAGFmE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAM1E,OAAe;QACnB;YAAE,MAAM;YAAU,SAASkB;QAAa;QACxC;YACE,MAAM;YACN,SAAS0D;QACX;KACD;IAED,IAAI,QAAAN,CAAAA,yBAAAA,QAAQ,aAAa,AAAD,IAApBA,KAAAA,IAAAA,uBAAuB,aAAa,EACtCtE,KAAK,IAAI,CAAC;QACR,MAAM;QACN,SAAS;IACX;IAGF,IAAIF,kBAAkB;QACpB,MAAM0B,SAAS,MAAM3B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAE,KAAK,IAAI,IAAIwB;IACf;IAEA,MAAM+B,SAAS,MAAMC,SACnBxD,MACA4B,aAAa,YAAY,EACzBlB;IAEF,OAAO;QACL,aAAa6C,OAAO,OAAO;QAC3B3C;QACA,OAAO2C,OAAO,KAAK;IACrB;AACF"}
|
|
@@ -1,27 +1,32 @@
|
|
|
1
1
|
import { vlLocateMode } from "@midscene/shared/env";
|
|
2
2
|
import { paddingToMatchBlockByBase64 } from "@midscene/shared/img";
|
|
3
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
3
4
|
import { assert } from "@midscene/shared/utils";
|
|
4
|
-
import { AIActionType, buildYamlFlowFromPlans, callAiFn, fillBboxParam, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
|
|
5
|
+
import { AIActionType, buildYamlFlowFromPlans, callAiFn, fillBboxParam, findAllMidsceneLocatorField, markupImageForLLM, warnGPT4oSizeLimit } from "./common.mjs";
|
|
5
6
|
import { automationUserPrompt, generateTaskBackgroundContext, systemPromptToTaskPlanning } from "./prompt/llm-planning.mjs";
|
|
6
7
|
import { describeUserPage } from "./prompt/util.mjs";
|
|
8
|
+
const debug = getDebug('planning');
|
|
7
9
|
async function plan(userInstruction, opts) {
|
|
8
10
|
var _planFromAI_action;
|
|
9
11
|
const { callAI, context } = opts || {};
|
|
10
12
|
const { screenshotBase64, size } = context;
|
|
11
|
-
const
|
|
13
|
+
const modelPreferences = {
|
|
14
|
+
intent: 'planning'
|
|
15
|
+
};
|
|
16
|
+
const { description: pageDescription, elementById } = await describeUserPage(context, modelPreferences);
|
|
12
17
|
const systemPrompt = await systemPromptToTaskPlanning({
|
|
13
18
|
actionSpace: opts.actionSpace,
|
|
14
|
-
vlMode: vlLocateMode()
|
|
19
|
+
vlMode: vlLocateMode(modelPreferences)
|
|
15
20
|
});
|
|
16
21
|
const taskBackgroundContextText = generateTaskBackgroundContext(userInstruction, opts.log, opts.actionContext);
|
|
17
|
-
const userInstructionPrompt = await automationUserPrompt(vlLocateMode()).format({
|
|
22
|
+
const userInstructionPrompt = await automationUserPrompt(vlLocateMode(modelPreferences)).format({
|
|
18
23
|
pageDescription,
|
|
19
24
|
taskBackgroundContext: taskBackgroundContextText
|
|
20
25
|
});
|
|
21
26
|
let imagePayload = screenshotBase64;
|
|
22
|
-
if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
|
|
23
|
-
else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
|
|
24
|
-
warnGPT4oSizeLimit(size);
|
|
27
|
+
if ('qwen-vl' === vlLocateMode(modelPreferences)) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
|
|
28
|
+
else if (!vlLocateMode(modelPreferences)) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
|
|
29
|
+
warnGPT4oSizeLimit(size, modelPreferences);
|
|
25
30
|
const msgs = [
|
|
26
31
|
{
|
|
27
32
|
role: 'system',
|
|
@@ -45,7 +50,7 @@ async function plan(userInstruction, opts) {
|
|
|
45
50
|
}
|
|
46
51
|
];
|
|
47
52
|
const call = callAI || callAiFn;
|
|
48
|
-
const { content, usage } = await call(msgs, AIActionType.PLAN);
|
|
53
|
+
const { content, usage } = await call(msgs, AIActionType.PLAN, modelPreferences);
|
|
49
54
|
const rawResponse = JSON.stringify(content, void 0, 2);
|
|
50
55
|
const planFromAI = content;
|
|
51
56
|
const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
|
|
@@ -56,27 +61,25 @@ async function plan(userInstruction, opts) {
|
|
|
56
61
|
actions,
|
|
57
62
|
rawResponse,
|
|
58
63
|
usage,
|
|
59
|
-
yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
|
|
64
|
+
yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace, planFromAI.sleep)
|
|
60
65
|
};
|
|
61
66
|
assert(planFromAI, "can't get plans from AI");
|
|
62
|
-
|
|
63
|
-
|
|
64
|
-
|
|
65
|
-
|
|
66
|
-
|
|
67
|
-
|
|
68
|
-
|
|
69
|
-
|
|
67
|
+
actions.forEach((action)=>{
|
|
68
|
+
const type = action.type;
|
|
69
|
+
const actionInActionSpace = opts.actionSpace.find((action)=>action.name === type);
|
|
70
|
+
const locateFields = actionInActionSpace ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema) : [];
|
|
71
|
+
debug('locateFields', locateFields);
|
|
72
|
+
locateFields.forEach((field)=>{
|
|
73
|
+
const locateResult = action.param[field];
|
|
74
|
+
if (locateResult) if (vlLocateMode(modelPreferences)) action.param[field] = fillBboxParam(locateResult, size.width, size.height, modelPreferences);
|
|
75
|
+
else {
|
|
76
|
+
const element = elementById(locateResult);
|
|
77
|
+
if (element) action.param[field].id = element.id;
|
|
70
78
|
}
|
|
79
|
+
action.locate = action.param[field];
|
|
71
80
|
});
|
|
72
|
-
assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
73
|
-
} else actions.forEach((action)=>{
|
|
74
|
-
var _action_locate;
|
|
75
|
-
if (null == (_action_locate = action.locate) ? void 0 : _action_locate.id) {
|
|
76
|
-
const element = elementById(action.locate.id);
|
|
77
|
-
if (element) action.locate.id = element.id;
|
|
78
|
-
}
|
|
79
81
|
});
|
|
82
|
+
assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
80
83
|
if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
|
|
81
84
|
return returnValue;
|
|
82
85
|
}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n const { description: pageDescription, elementById }
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n PageType,\n PlanningAIResponse,\n UIContext,\n} from '@/types';\nimport { type IModelPreferences, vlLocateMode } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport {\n AIActionType,\n type AIArgs,\n buildYamlFlowFromPlans,\n callAiFn,\n fillBboxParam,\n findAllMidsceneLocatorField,\n markupImageForLLM,\n warnGPT4oSizeLimit,\n} from './common';\nimport {\n automationUserPrompt,\n generateTaskBackgroundContext,\n systemPromptToTaskPlanning,\n} from './prompt/llm-planning';\nimport { describeUserPage } from './prompt/util';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n pageType: PageType;\n actionSpace: DeviceAction<any>[];\n callAI?: typeof callAiFn<PlanningAIResponse>;\n log?: string;\n actionContext?: string;\n },\n): Promise<PlanningAIResponse> {\n const { callAI, context } = opts || {};\n const { screenshotBase64, size } = context;\n\n const modelPreferences: IModelPreferences = {\n intent: 'planning',\n };\n const { description: pageDescription, elementById } = await describeUserPage(\n context,\n modelPreferences,\n );\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode: vlLocateMode(modelPreferences),\n });\n const taskBackgroundContextText = generateTaskBackgroundContext(\n userInstruction,\n opts.log,\n opts.actionContext,\n );\n const userInstructionPrompt = await automationUserPrompt(\n vlLocateMode(modelPreferences),\n ).format({\n pageDescription,\n taskBackgroundContext: taskBackgroundContextText,\n });\n\n let imagePayload = screenshotBase64;\n if (vlLocateMode(modelPreferences) === 'qwen-vl') {\n imagePayload = await paddingToMatchBlockByBase64(imagePayload);\n } else if (!vlLocateMode(modelPreferences)) {\n imagePayload = await markupImageForLLM(\n screenshotBase64,\n context.tree,\n context.size,\n );\n }\n\n warnGPT4oSizeLimit(size, modelPreferences);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: userInstructionPrompt,\n },\n ],\n },\n ];\n\n const call = callAI || callAiFn;\n const { content, usage } = await call(\n msgs,\n AIActionType.PLAN,\n modelPreferences,\n );\n const rawResponse = JSON.stringify(content, undefined, 2);\n const planFromAI = content;\n\n const actions =\n (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n // TODO: use zod.parse to parse the action.param, and then fill the bbox param.\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult) {\n if (vlLocateMode(modelPreferences)) {\n action.param[field] = fillBboxParam(\n locateResult,\n size.width,\n size.height,\n modelPreferences,\n );\n } else {\n const element = elementById(locateResult);\n if (element) {\n action.param[field].id = element.id;\n }\n }\n }\n\n // to be compatible with the web-integration\n action.locate = action.param[field];\n });\n });\n // in Qwen-VL, error means error. In GPT-4o, error may mean more actions are needed.\n assert(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","_planFromAI_action","callAI","context","screenshotBase64","size","modelPreferences","pageDescription","elementById","describeUserPage","systemPrompt","systemPromptToTaskPlanning","vlLocateMode","taskBackgroundContextText","generateTaskBackgroundContext","userInstructionPrompt","automationUserPrompt","imagePayload","paddingToMatchBlockByBase64","markupImageForLLM","warnGPT4oSizeLimit","msgs","call","callAiFn","content","usage","AIActionType","rawResponse","JSON","undefined","planFromAI","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","element","console"],"mappings":";;;;;;;AA2BA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IAOC;QAwEEC;IAtEH,MAAM,EAAEC,MAAM,EAAEC,OAAO,EAAE,GAAGH,QAAQ,CAAC;IACrC,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGF;IAEnC,MAAMG,mBAAsC;QAC1C,QAAQ;IACV;IACA,MAAM,EAAE,aAAaC,eAAe,EAAEC,WAAW,EAAE,GAAG,MAAMC,iBAC1DN,SACAG;IAGF,MAAMI,eAAe,MAAMC,2BAA2B;QACpD,aAAaX,KAAK,WAAW;QAC7B,QAAQY,aAAaN;IACvB;IACA,MAAMO,4BAA4BC,8BAChCf,iBACAC,KAAK,GAAG,EACRA,KAAK,aAAa;IAEpB,MAAMe,wBAAwB,MAAMC,qBAClCJ,aAAaN,mBACb,MAAM,CAAC;QACPC;QACA,uBAAuBM;IACzB;IAEA,IAAII,eAAeb;IACnB,IAAIQ,AAAmC,cAAnCA,aAAaN,mBACfW,eAAe,MAAMC,4BAA4BD;SAC5C,IAAI,CAACL,aAAaN,mBACvBW,eAAe,MAAME,kBACnBf,kBACAD,QAAQ,IAAI,EACZA,QAAQ,IAAI;IAIhBiB,mBAAmBf,MAAMC;IAEzB,MAAMe,OAAe;QACnB;YAAE,MAAM;YAAU,SAASX;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKO;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMF;gBACR;aACD;QACH;KACD;IAED,MAAMO,OAAOpB,UAAUqB;IACvB,MAAM,EAAEC,OAAO,EAAEC,KAAK,EAAE,GAAG,MAAMH,KAC/BD,MACAK,aAAa,IAAI,EACjBpB;IAEF,MAAMqB,cAAcC,KAAK,SAAS,CAACJ,SAASK,QAAW;IACvD,MAAMC,aAAaN;IAEnB,MAAMO,UACH9B,AAAAA,CAAAA,SAAAA,CAAAA,qBAAAA,WAAW,MAAM,AAAD,IAAhBA,KAAAA,IAAAA,mBAAmB,IAAI,AAAD,IAAI;QAAC6B,WAAW,MAAM;KAAC,GAAGA,WAAW,OAAM,KAAM,EAAE;IAC5E,MAAME,cAAkC;QACtC,GAAGF,UAAU;QACbC;QACAJ;QACAF;QACA,UAAUQ,uBACRF,SACA/B,KAAK,WAAW,EAChB8B,WAAW,KAAK;IAEpB;IAEAI,OAAOJ,YAAY;IAGnBC,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBrC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACmC,SAAWA,OAAO,IAAI,KAAKC;QAE9B,MAAME,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENzC,MAAM,gBAAgB0C;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,cACF,IAAI7B,aAAaN,mBACf6B,OAAO,KAAK,CAACK,MAAM,GAAGE,cACpBD,cACApC,KAAK,KAAK,EACVA,KAAK,MAAM,EACXC;iBAEG;gBACL,MAAMqC,UAAUnC,YAAYiC;gBAC5B,IAAIE,SACFR,OAAO,KAAK,CAACK,MAAM,CAAC,EAAE,GAAGG,QAAQ,EAAE;YAEvC;YAIFR,OAAO,MAAM,GAAGA,OAAO,KAAK,CAACK,MAAM;QACrC;IACF;IAEAN,OAAO,CAACJ,WAAW,KAAK,EAAE,CAAC,wBAAwB,EAAEA,WAAW,KAAK,EAAE;IAEvE,IACEC,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACA7C;IAIJ,OAAOiC;AACT"}
|
|
@@ -1,27 +1,3 @@
|
|
|
1
|
-
import { getPreferredLanguage } from "@midscene/shared/env";
|
|
2
|
-
const defaultAssertionPrompt = 'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
|
|
3
|
-
const defaultAssertionResponseJsonFormat = `Return in the following JSON format:
|
|
4
|
-
{
|
|
5
|
-
pass: boolean, // whether the assertion is truthy
|
|
6
|
-
thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
|
|
7
|
-
}`;
|
|
8
|
-
const getUiTarsAssertionResponseJsonFormat = ()=>`## Output Json String Format
|
|
9
|
-
\`\`\`
|
|
10
|
-
"{
|
|
11
|
-
"pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
|
|
12
|
-
"thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
|
|
13
|
-
}"
|
|
14
|
-
\`\`\`
|
|
15
|
-
|
|
16
|
-
## Rules **MUST** follow
|
|
17
|
-
- Make sure to return **only** the JSON, with **no additional** text or explanations.
|
|
18
|
-
- Use ${getPreferredLanguage()} in \`thought\` part.
|
|
19
|
-
- You **MUST** strictly follow up the **Output Json String Format**.`;
|
|
20
|
-
function systemPromptToAssert(model) {
|
|
21
|
-
return `${defaultAssertionPrompt}
|
|
22
|
-
|
|
23
|
-
${model.isUITars ? getUiTarsAssertionResponseJsonFormat() : defaultAssertionResponseJsonFormat}`;
|
|
24
|
-
}
|
|
25
1
|
const assertSchema = {
|
|
26
2
|
type: 'json_schema',
|
|
27
3
|
json_schema: {
|
|
@@ -50,6 +26,6 @@ const assertSchema = {
|
|
|
50
26
|
}
|
|
51
27
|
}
|
|
52
28
|
};
|
|
53
|
-
export { assertSchema
|
|
29
|
+
export { assertSchema };
|
|
54
30
|
|
|
55
31
|
//# sourceMappingURL=assertion.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/assertion.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/assertion.ts"],"sourcesContent":["import
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/assertion.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/assertion.ts"],"sourcesContent":["import type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nexport const assertSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'assert',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n pass: {\n type: 'boolean',\n description: 'Whether the assertion passed or failed',\n },\n thought: {\n type: ['string', 'null'],\n description: 'The thought process behind the assertion',\n },\n },\n required: ['pass', 'thought'],\n additionalProperties: false,\n },\n },\n};\n"],"names":["assertSchema"],"mappings":"AAEO,MAAMA,eAAyC;IACpD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,MAAM;oBACJ,MAAM;oBACN,aAAa;gBACf;gBACA,SAAS;oBACP,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAQ;aAAU;YAC7B,sBAAsB;QACxB;IACF;AACF"}
|
|
@@ -1,40 +1,67 @@
|
|
|
1
1
|
import node_assert from "node:assert";
|
|
2
2
|
import { PromptTemplate } from "@langchain/core/prompts";
|
|
3
|
+
import { z } from "zod";
|
|
4
|
+
import { ifMidsceneLocatorField } from "../common.mjs";
|
|
3
5
|
import { bboxDescription } from "./common.mjs";
|
|
4
6
|
const vlCoTLog = '"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. ';
|
|
5
7
|
const vlCurrentLog = '"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do .. first". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
|
|
6
8
|
const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
|
|
7
9
|
const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
|
|
8
10
|
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
9
|
-
const vlLocateParam = (
|
|
10
|
-
const llmLocateParam = (
|
|
11
|
-
const descriptionForAction = (action,
|
|
11
|
+
const vlLocateParam = ()=>'{bbox: [number, number, number, number], prompt: string }';
|
|
12
|
+
const llmLocateParam = ()=>'{"id": string, "prompt": string}';
|
|
13
|
+
const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
|
|
12
14
|
const tab = ' ';
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
15
|
+
const fields = [];
|
|
16
|
+
fields.push(`- type: "${action.name}"`);
|
|
17
|
+
if (action.paramSchema) {
|
|
18
|
+
node_assert(action.paramSchema instanceof z.ZodObject, 'paramSchema must be a zod object');
|
|
19
|
+
const shape = action.paramSchema.shape;
|
|
20
|
+
const paramLines = [];
|
|
21
|
+
const getTypeName = (field)=>{
|
|
22
|
+
var _field__def, _actualField__def, _actualField__def1, _actualField__def2, _actualField__def3, _actualField__def4;
|
|
23
|
+
const actualField = (null == (_field__def = field._def) ? void 0 : _field__def.innerType) || field;
|
|
24
|
+
if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodString') return 'string';
|
|
25
|
+
if ((null == (_actualField__def1 = actualField._def) ? void 0 : _actualField__def1.typeName) === 'ZodNumber') return 'number';
|
|
26
|
+
if ((null == (_actualField__def2 = actualField._def) ? void 0 : _actualField__def2.typeName) === 'ZodBoolean') return 'boolean';
|
|
27
|
+
if ((null == (_actualField__def3 = actualField._def) ? void 0 : _actualField__def3.typeName) === 'ZodArray') return 'array';
|
|
28
|
+
if ((null == (_actualField__def4 = actualField._def) ? void 0 : _actualField__def4.typeName) === 'ZodObject') {
|
|
29
|
+
if (ifMidsceneLocatorField(actualField)) return locatorSchemaTypeDescription;
|
|
30
|
+
return 'object';
|
|
31
|
+
}
|
|
32
|
+
console.warn('unknown type: ', actualField._def);
|
|
33
|
+
return 'type';
|
|
34
|
+
};
|
|
35
|
+
const getDescription = (field)=>{
|
|
36
|
+
var _field__def, _actualField__def;
|
|
37
|
+
const actualField = (null == (_field__def = field._def) ? void 0 : _field__def.innerType) || field;
|
|
38
|
+
if ("description" in field) return field.description || null;
|
|
39
|
+
if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodObject') {
|
|
40
|
+
if ('midscene_location_field_flag' in actualField._def.shape()) return 'Location information for the target element';
|
|
41
|
+
}
|
|
42
|
+
return null;
|
|
43
|
+
};
|
|
44
|
+
for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
|
|
45
|
+
const isOptional = 'function' == typeof field.isOptional && field.isOptional();
|
|
46
|
+
const keyWithOptional = isOptional ? `${key}?` : key;
|
|
47
|
+
const typeName = getTypeName(field);
|
|
48
|
+
const description = getDescription(field);
|
|
49
|
+
let paramLine = `${keyWithOptional}: ${typeName}`;
|
|
50
|
+
if (description) paramLine += ` // ${description}`;
|
|
51
|
+
paramLines.push(paramLine);
|
|
52
|
+
}
|
|
53
|
+
if (paramLines.length > 0) {
|
|
54
|
+
fields.push('- param:');
|
|
55
|
+
for (const paramLine of paramLines)fields.push(` - ${paramLine}`);
|
|
56
|
+
}
|
|
25
57
|
}
|
|
26
|
-
|
|
27
|
-
paramSchema,
|
|
28
|
-
locatorParam
|
|
29
|
-
].filter(Boolean);
|
|
30
|
-
return `- ${action.name}, ${action.description}
|
|
31
|
-
${tab}- type: "${action.name}"
|
|
58
|
+
return `- ${action.name}, ${action.description || "No description provided"}
|
|
32
59
|
${tab}${fields.join(`\n${tab}`)}
|
|
33
60
|
`.trim();
|
|
34
61
|
};
|
|
35
62
|
const systemTemplateOfVLPlanning = ({ actionSpace, vlMode })=>{
|
|
36
63
|
const actionNameList = actionSpace.map((action)=>action.name).join(', ');
|
|
37
|
-
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam(
|
|
64
|
+
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam()));
|
|
38
65
|
const actionList = actionDescriptionList.join('\n');
|
|
39
66
|
return `
|
|
40
67
|
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
@@ -84,7 +111,7 @@ this and output the JSON:
|
|
|
84
111
|
};
|
|
85
112
|
const systemTemplateOfLLM = ({ actionSpace })=>{
|
|
86
113
|
const actionNameList = actionSpace.map((action)=>action.name).join(' / ');
|
|
87
|
-
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam(
|
|
114
|
+
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam()));
|
|
88
115
|
const actionList = actionDescriptionList.join('\n');
|
|
89
116
|
return `
|
|
90
117
|
## Role
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import assert from 'node:assert';\nimport type { DeviceAction } from '@/types';\nimport { PromptTemplate } from '@langchain/core/prompts';\nimport type { vlLocateMode } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\nconst vlCoTLog = `\"what_the_user_wants_to_do_next_by_instruction\": string, // What the user wants to do according to the instruction and previous logs. `;\nconst vlCurrentLog = `\"log\": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do .. first\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\nconst llmCurrentLog = `\"log\": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do ..\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\nconst vlLocateParam = (required: boolean) =>\n `locate${required ? '' : '?'}: {bbox: [number, number, number, number], prompt: string }`;\nconst llmLocateParam = (required: boolean) =>\n `locate${required ? '' : '?'}: {\"id\": string, \"prompt\": string}`;\n\nexport const descriptionForAction = (\n action: DeviceAction,\n locatorScheme: string,\n) => {\n const tab = ' ';\n let locateParam = '';\n if (action.location === 'required') {\n locateParam = locatorScheme;\n } else if (action.location === 'optional') {\n locateParam = `${locatorScheme} | null`;\n } else if (action.location === false) {\n locateParam = '';\n }\n const locatorParam = locateParam ? `- ${locateParam}` : '';\n\n if (action.whatToLocate) {\n if (!locateParam) {\n console.warn(\n `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`,\n );\n } else {\n locateParam += ` // ${action.whatToLocate}`;\n }\n }\n\n let paramSchema = '';\n if (action.paramSchema) {\n paramSchema = `- param: ${action.paramSchema}`;\n }\n if (action.paramDescription) {\n assert(\n paramSchema,\n `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`,\n );\n paramSchema += ` // ${action.paramDescription}`;\n }\n\n const fields = [paramSchema, locatorParam].filter(Boolean);\n\n return `- ${action.name}, ${action.description}\n${tab}- type: \"${action.name}\"\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nconst systemTemplateOfVLPlanning = ({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) => {\n const actionNameList = actionSpace.map((action) => action.name).join(', ');\n const actionDescriptionList = actionSpace.map((action) =>\n descriptionForAction(action, vlLocateParam(action.location === 'required')),\n );\n const actionList = actionDescriptionList.join('\\n');\n\n return `\nTarget: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\nRestriction:\n- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.\n- Always give ONLY ONE action in \\`log\\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.\n- Don't repeat actions in the previous logs.\n- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.\n\nSupporting actions:\n${actionList}\n\nField description:\n* The \\`prompt\\` field inside the \\`locate\\` field is a short description that could be used to locate the element.\n\nReturn in JSON format:\n{\n ${vlCoTLog}\n ${vlCurrentLog}\n ${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, when the instruction is \"click 'Confirm' button, and click 'Yes' in popup\" and the log is \"I will use action Tap to click 'Confirm' button\", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.\n\nthis and output the JSON:\n\n{\n \"what_the_user_wants_to_do_next_by_instruction\": \"We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup\",\n \"log\": \"I will use action Tap to click 'Yes' in popup\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox\": [100, 100, 200, 200],\n \"prompt\": \"The 'Yes' button in popup\"\n }\n }\n}\n`;\n};\n\nconst systemTemplateOfLLM = ({\n actionSpace,\n}: { actionSpace: DeviceAction[] }) => {\n const actionNameList = actionSpace.map((action) => action.name).join(' / ');\n const actionDescriptionList = actionSpace.map((action) =>\n descriptionForAction(\n action,\n llmLocateParam(action.location === 'required'),\n ),\n );\n const actionList = actionDescriptionList.join('\\n');\n\n return `\n## Role\n\nYou are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.\n\n## Objective\n\n- Decompose the instruction user asked into a series of actions\n- Locate the target element if possible\n- If the instruction cannot be accomplished, give a further plan.\n\n## Workflow\n\n1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.\n2. Decompose the user's task into a sequence of feasible actions, and place it in the \\`actions\\` field. There are different types of actions (${actionNameList}). The \"About the action\" section below will give you more details.\n3. Consider whether the user's instruction will be accomplished after the actions you composed.\n- If the instruction is accomplished, set \\`more_actions_needed_by_instruction\\` to false.\n- If more actions are needed, set \\`more_actions_needed_by_instruction\\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \\`log\\` field, he or she will continue the task according to your logs.\n4. If the task is not feasible on this page, set \\`error\\` field to the reason.\n\n## Constraints\n\n- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.\n- Trust the \"What have been done\" field about the task (if any), don't repeat actions in it.\n- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \\`\\`\\`json\\`\\`\\`.\n- If the screenshot and the instruction are totally irrelevant, set reason in the \\`error\\` field.\n\n## About the \\`actions\\` field\n\nThe \\`locate\\` param is commonly used in the \\`param\\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:\n\ntype LocateParam = {\n \"id\": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.\n \"prompt\"?: string // the description of the element to find. It can only be omitted when locate is null.\n} | null // If it's not on the page, the LocateParam should be null\n\n## Supported actions\n\nEach action has a \\`type\\` and corresponding \\`param\\`. To be detailed:\n${actionList}\n\n`.trim();\n};\n\nconst outputTemplate = `\n## Output JSON Format:\n\nThe JSON format is as follows:\n\n{\n \"actions\": [\n // ... some actions\n ],\n ${llmCurrentLog}\n ${commonOutputFields}\n}\n\n## Examples\n\n### Example: Decompose a task\n\nWhen you received the following information:\n\n* Instruction: 'Click the language switch button, wait 1s, click \"English\"'\n* Logs: null\n* Page Context (screenshot and description) shows: There is a language switch button, and the \"English\" option is not shown in the screenshot now.\n\nBy viewing the page screenshot and description, you should consider this and output the JSON:\n\n* The user intent is: tap the switch button, sleep, and tap the 'English' option\n* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.\n* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.\n* The \"English\" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.\n* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.\n* The task cannot be accomplished (because the last tapping action is not finished yet), so the \\`more_actions_needed_by_instruction\\` field is true. The \\`error\\` field is null.\n\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": { id: \"c81c4e9a33\", prompt: \"The language switch button\" }},\n },\n {\n \"thought\": \"Wait for 1 second to ensure the language options are displayed.\",\n \"type\": \"Sleep\",\n \"param\": { \"timeMs\": 1000 },\n }\n ],\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"log\": \"Click the language switch button to open the language options. Wait for 1 second\",\n}\n\n### Example: What NOT to do\nWrong output:\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\",\n \"param\": null,\n \"locate\": {\n { \"id\": \"c81c4e9a33\" }, // WRONG: prompt is missing, this is not a valid LocateParam\n }\n },\n {\n \"thought\": \"Click the English option\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished\n }\n ],\n \"more_actions_needed_by_instruction\": false, // WRONG: should be true\n \"log\": \"Click the language switch button to open the language options\",\n}\n`;\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) {\n if (vlMode) {\n return systemTemplateOfVLPlanning({ actionSpace, vlMode });\n }\n\n return `${systemTemplateOfLLM({ actionSpace })}\\n\\n${outputTemplate}`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n\nexport const generateTaskBackgroundContext = (\n userInstruction: string,\n log?: string,\n userActionContext?: string,\n) => {\n if (log) {\n return `\nHere is the user's instruction:\n\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n\nThese are the logs from previous executions, which indicate what was done in the previous actions.\nDo NOT repeat these actions.\n<previous_logs>\n${log}\n</previous_logs>\n`;\n }\n\n return `\nHere is the user's instruction:\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n`;\n};\n\nexport const automationUserPrompt = (\n vlMode: ReturnType<typeof vlLocateMode>,\n) => {\n if (vlMode) {\n return new PromptTemplate({\n template: '{taskBackgroundContext}',\n inputVariables: ['taskBackgroundContext'],\n });\n }\n\n return new PromptTemplate({\n template: `\npageDescription:\n=====================================\n{pageDescription}\n=====================================\n\n{taskBackgroundContext}`,\n inputVariables: ['pageDescription', 'taskBackgroundContext'],\n });\n};\n"],"names":["vlCoTLog","vlCurrentLog","llmCurrentLog","commonOutputFields","vlLocateParam","required","llmLocateParam","descriptionForAction","action","locatorScheme","tab","locateParam","locatorParam","console","paramSchema","assert","fields","Boolean","systemTemplateOfVLPlanning","actionSpace","vlMode","actionNameList","actionDescriptionList","actionList","bboxDescription","systemTemplateOfLLM","outputTemplate","systemPromptToTaskPlanning","planSchema","generateTaskBackgroundContext","userInstruction","log","userActionContext","automationUserPrompt","PromptTemplate"],"mappings":";;;AAQA,MAAMA,WAAW;AACjB,MAAMC,eAAe;AACrB,MAAMC,gBAAgB;AAEtB,MAAMC,qBAAqB,CAAC;+NACmM,CAAC;AAChO,MAAMC,gBAAgB,CAACC,WACrB,CAAC,MAAM,EAAEA,WAAW,KAAK,IAAI,2DAA2D,CAAC;AAC3F,MAAMC,iBAAiB,CAACD,WACtB,CAAC,MAAM,EAAEA,WAAW,KAAK,IAAI,kCAAkC,CAAC;AAE3D,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,IAAIC,cAAc;IAClB,IAAIH,AAAoB,eAApBA,OAAO,QAAQ,EACjBG,cAAcF;SACT,IAAID,AAAoB,eAApBA,OAAO,QAAQ,EACxBG,cAAc,GAAGF,cAAc,OAAO,CAAC;SAClC,IAAID,AAAoB,UAApBA,OAAO,QAAQ,EACxBG,cAAc;IAEhB,MAAMC,eAAeD,cAAc,CAAC,EAAE,EAAEA,aAAa,GAAG;IAExD,IAAIH,OAAO,YAAY,EACrB,IAAKG,aAKHA,eAAe,CAAC,IAAI,EAAEH,OAAO,YAAY,EAAE;SAJ3CK,QAAQ,IAAI,CACV,CAAC,oCAAoC,EAAEL,OAAO,IAAI,CAAC,6EAA6E,CAAC;IAOvI,IAAIM,cAAc;IAClB,IAAIN,OAAO,WAAW,EACpBM,cAAc,CAAC,SAAS,EAAEN,OAAO,WAAW,EAAE;IAEhD,IAAIA,OAAO,gBAAgB,EAAE;QAC3BO,YACED,aACA,CAAC,qEAAqE,EAAEN,OAAO,IAAI,CAAC,UAAU,EAAEA,OAAO,WAAW,EAAE;QAEtHM,eAAe,CAAC,IAAI,EAAEN,OAAO,gBAAgB,EAAE;IACjD;IAEA,MAAMQ,SAAS;QAACF;QAAaF;KAAa,CAAC,MAAM,CAACK;IAElD,OAAO,CAAC,EAAE,EAAET,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,CAAC;AACjD,EAAEE,IAAI,SAAS,EAAEF,OAAO,IAAI,CAAC;AAC7B,EAAEE,MAAMM,OAAO,IAAI,CAAC,CAAC,EAAE,EAAEN,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEA,MAAMQ,6BAA6B,CAAC,EAClCC,WAAW,EACXC,MAAM,EAIP;IACC,MAAMC,iBAAiBF,YAAY,GAAG,CAAC,CAACX,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMc,wBAAwBH,YAAY,GAAG,CAAC,CAACX,SAC7CD,qBAAqBC,QAAQJ,cAAcI,AAAoB,eAApBA,OAAO,QAAQ;IAE5D,MAAMe,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;yIAK+H,EAAED,eAAe;;kGAExD,EAAEG,gBAAgBJ,QAAQ;;;AAG5H,EAAEG,WAAW;;;;;;;EAOX,EAAEvB,SAAS;EACX,EAAEC,aAAa;EACf,EAAEE,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;AAyBvB,CAAC;AACD;AAEA,MAAMsB,sBAAsB,CAAC,EAC3BN,WAAW,EACqB;IAChC,MAAME,iBAAiBF,YAAY,GAAG,CAAC,CAACX,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMc,wBAAwBH,YAAY,GAAG,CAAC,CAACX,SAC7CD,qBACEC,QACAF,eAAeE,AAAoB,eAApBA,OAAO,QAAQ;IAGlC,MAAMe,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;;;;;;;;;;+IAcqI,EAAED,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;AAyBhK,EAAEE,WAAW;;AAEb,CAAC,CAAC,IAAI;AACN;AAEA,MAAMG,iBAAiB,CAAC;;;;;;;;;EAStB,EAAExB,cAAc;EAChB,EAAEC,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA+DvB,CAAC;AAEM,eAAewB,2BAA2B,EAC/CR,WAAW,EACXC,MAAM,EAIP;IACC,IAAIA,QACF,OAAOF,2BAA2B;QAAEC;QAAaC;IAAO;IAG1D,OAAO,GAAGK,oBAAoB;QAAEN;IAAY,GAAG,IAAI,EAAEO,gBAAgB;AACvE;AAEO,MAAME,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,gCAAgC,CAC3CC,iBACAC,KACAC;IAEA,IAAID,KACF,OAAO,CAAC;;;;;IAKR,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;;;;;AAMpB,EAAEC,IAAI;;AAEN,CAAC;IAGC,OAAO,CAAC;;;;IAIN,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;AAEpB,CAAC;AACD;AAEO,MAAMG,uBAAuB,CAClCb;IAEA,IAAIA,QACF,OAAO,IAAIc,eAAe;QACxB,UAAU;QACV,gBAAgB;YAAC;SAAwB;IAC3C;IAGF,OAAO,IAAIA,eAAe;QACxB,UAAU,CAAC;;;;;;uBAMQ,CAAC;QACpB,gBAAgB;YAAC;YAAmB;SAAwB;IAC9D;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import assert from 'node:assert';\nimport type { DeviceAction } from '@/types';\nimport { PromptTemplate } from '@langchain/core/prompts';\nimport type { vlLocateMode } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { z } from 'zod';\nimport { ifMidsceneLocatorField } from '../common';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\nconst vlCoTLog = `\"what_the_user_wants_to_do_next_by_instruction\": string, // What the user wants to do according to the instruction and previous logs. `;\nconst vlCurrentLog = `\"log\": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do .. first\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\nconst llmCurrentLog = `\"log\": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do ..\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\nconst vlLocateParam = () =>\n '{bbox: [number, number, number, number], prompt: string }';\nconst llmLocateParam = () => '{\"id\": string, \"prompt\": string}';\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n assert(\n action.paramSchema instanceof z.ZodObject,\n 'paramSchema must be a zod object',\n );\n // Try to extract parameter information from the zod schema\n // For zod object schemas, extract type information and descriptions\n const shape = action.paramSchema.shape;\n const paramLines: string[] = [];\n\n // Helper function to get type name from zod schema\n const getTypeName = (field: any): string => {\n // Handle unwrapped optional fields\n const actualField = field._def?.innerType || field;\n\n if (actualField._def?.typeName === 'ZodString') return 'string';\n if (actualField._def?.typeName === 'ZodNumber') return 'number';\n if (actualField._def?.typeName === 'ZodBoolean') return 'boolean';\n if (actualField._def?.typeName === 'ZodArray') return 'array';\n if (actualField._def?.typeName === 'ZodObject') {\n // Check if this is a passthrough object (like MidsceneLocation)\n if (ifMidsceneLocatorField(actualField)) {\n return locatorSchemaTypeDescription;\n }\n return 'object';\n }\n\n console.warn('unknown type: ', actualField._def);\n return 'type';\n };\n\n // Helper function to get description from zod schema\n const getDescription = (field: z.ZodTypeAny): string | null => {\n // Handle unwrapped optional fields\n const actualField = field._def?.innerType || field;\n\n // Check for direct description\n if ('description' in field) {\n return field.description || null;\n }\n\n // Check for MidsceneLocation fields and add description\n if (actualField._def?.typeName === 'ZodObject') {\n if ('midscene_location_field_flag' in actualField._def.shape()) {\n return 'Location information for the target element';\n }\n }\n\n return null;\n };\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as any).isOptional === 'function' &&\n (field as any).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name\n const typeName = getTypeName(field);\n\n // Get description\n const description = getDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n if (paramLines.length > 0) {\n fields.push('- param:');\n for (const paramLine of paramLines) {\n fields.push(` - ${paramLine}`);\n }\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nconst systemTemplateOfVLPlanning = ({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) => {\n const actionNameList = actionSpace.map((action) => action.name).join(', ');\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(action, vlLocateParam());\n });\n const actionList = actionDescriptionList.join('\\n');\n\n return `\nTarget: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\nRestriction:\n- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.\n- Always give ONLY ONE action in \\`log\\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.\n- Don't repeat actions in the previous logs.\n- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.\n\nSupporting actions:\n${actionList}\n\nField description:\n* The \\`prompt\\` field inside the \\`locate\\` field is a short description that could be used to locate the element.\n\nReturn in JSON format:\n{\n ${vlCoTLog}\n ${vlCurrentLog}\n ${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, when the instruction is \"click 'Confirm' button, and click 'Yes' in popup\" and the log is \"I will use action Tap to click 'Confirm' button\", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.\n\nthis and output the JSON:\n\n{\n \"what_the_user_wants_to_do_next_by_instruction\": \"We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup\",\n \"log\": \"I will use action Tap to click 'Yes' in popup\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"locate\": {\n \"bbox\": [100, 100, 200, 200],\n \"prompt\": \"The 'Yes' button in popup\"\n }\n }\n}\n`;\n};\n\nconst systemTemplateOfLLM = ({\n actionSpace,\n}: { actionSpace: DeviceAction<any>[] }) => {\n const actionNameList = actionSpace.map((action) => action.name).join(' / ');\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(action, llmLocateParam());\n });\n const actionList = actionDescriptionList.join('\\n');\n\n return `\n## Role\n\nYou are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.\n\n## Objective\n\n- Decompose the instruction user asked into a series of actions\n- Locate the target element if possible\n- If the instruction cannot be accomplished, give a further plan.\n\n## Workflow\n\n1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.\n2. Decompose the user's task into a sequence of feasible actions, and place it in the \\`actions\\` field. There are different types of actions (${actionNameList}). The \"About the action\" section below will give you more details.\n3. Consider whether the user's instruction will be accomplished after the actions you composed.\n- If the instruction is accomplished, set \\`more_actions_needed_by_instruction\\` to false.\n- If more actions are needed, set \\`more_actions_needed_by_instruction\\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \\`log\\` field, he or she will continue the task according to your logs.\n4. If the task is not feasible on this page, set \\`error\\` field to the reason.\n\n## Constraints\n\n- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.\n- Trust the \"What have been done\" field about the task (if any), don't repeat actions in it.\n- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \\`\\`\\`json\\`\\`\\`.\n- If the screenshot and the instruction are totally irrelevant, set reason in the \\`error\\` field.\n\n## About the \\`actions\\` field\n\nThe \\`locate\\` param is commonly used in the \\`param\\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:\n\ntype LocateParam = {\n \"id\": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.\n \"prompt\"?: string // the description of the element to find. It can only be omitted when locate is null.\n} | null // If it's not on the page, the LocateParam should be null\n\n## Supported actions\n\nEach action has a \\`type\\` and corresponding \\`param\\`. To be detailed:\n${actionList}\n\n`.trim();\n};\n\nconst outputTemplate = `\n## Output JSON Format:\n\nThe JSON format is as follows:\n\n{\n \"actions\": [\n // ... some actions\n ],\n ${llmCurrentLog}\n ${commonOutputFields}\n}\n\n## Examples\n\n### Example: Decompose a task\n\nWhen you received the following information:\n\n* Instruction: 'Click the language switch button, wait 1s, click \"English\"'\n* Logs: null\n* Page Context (screenshot and description) shows: There is a language switch button, and the \"English\" option is not shown in the screenshot now.\n\nBy viewing the page screenshot and description, you should consider this and output the JSON:\n\n* The user intent is: tap the switch button, sleep, and tap the 'English' option\n* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.\n* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.\n* The \"English\" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.\n* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.\n* The task cannot be accomplished (because the last tapping action is not finished yet), so the \\`more_actions_needed_by_instruction\\` field is true. The \\`error\\` field is null.\n\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": { id: \"c81c4e9a33\", prompt: \"The language switch button\" }},\n },\n {\n \"thought\": \"Wait for 1 second to ensure the language options are displayed.\",\n \"type\": \"Sleep\",\n \"param\": { \"timeMs\": 1000 },\n }\n ],\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"log\": \"Click the language switch button to open the language options. Wait for 1 second\",\n}\n\n### Example: What NOT to do\nWrong output:\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\",\n \"param\": null,\n \"locate\": {\n { \"id\": \"c81c4e9a33\" }, // WRONG: prompt is missing, this is not a valid LocateParam\n }\n },\n {\n \"thought\": \"Click the English option\",\n \"type\": \"Tap\", \n \"param\": null,\n \"locate\": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished\n }\n ],\n \"more_actions_needed_by_instruction\": false, // WRONG: should be true\n \"log\": \"Click the language switch button to open the language options\",\n}\n`;\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: ReturnType<typeof vlLocateMode>;\n}) {\n if (vlMode) {\n return systemTemplateOfVLPlanning({ actionSpace, vlMode });\n }\n\n return `${systemTemplateOfLLM({ actionSpace })}\\n\\n${outputTemplate}`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n\nexport const generateTaskBackgroundContext = (\n userInstruction: string,\n log?: string,\n userActionContext?: string,\n) => {\n if (log) {\n return `\nHere is the user's instruction:\n\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n\nThese are the logs from previous executions, which indicate what was done in the previous actions.\nDo NOT repeat these actions.\n<previous_logs>\n${log}\n</previous_logs>\n`;\n }\n\n return `\nHere is the user's instruction:\n<instruction>\n <high_priority_knowledge>\n ${userActionContext}\n </high_priority_knowledge>\n\n ${userInstruction}\n</instruction>\n`;\n};\n\nexport const automationUserPrompt = (\n vlMode: ReturnType<typeof vlLocateMode>,\n) => {\n if (vlMode) {\n return new PromptTemplate({\n template: '{taskBackgroundContext}',\n inputVariables: ['taskBackgroundContext'],\n });\n }\n\n return new PromptTemplate({\n template: `\npageDescription:\n=====================================\n{pageDescription}\n=====================================\n\n{taskBackgroundContext}`,\n inputVariables: ['pageDescription', 'taskBackgroundContext'],\n });\n};\n"],"names":["vlCoTLog","vlCurrentLog","llmCurrentLog","commonOutputFields","vlLocateParam","llmLocateParam","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","assert","z","shape","paramLines","getTypeName","field","_field__def","_actualField__def","_actualField__def1","_actualField__def2","_actualField__def3","_actualField__def4","actualField","ifMidsceneLocatorField","console","getDescription","key","Object","isOptional","keyWithOptional","typeName","description","paramLine","systemTemplateOfVLPlanning","actionSpace","vlMode","actionNameList","actionDescriptionList","actionList","bboxDescription","systemTemplateOfLLM","outputTemplate","systemPromptToTaskPlanning","planSchema","generateTaskBackgroundContext","userInstruction","log","userActionContext","automationUserPrompt","PromptTemplate"],"mappings":";;;;;AAUA,MAAMA,WAAW;AACjB,MAAMC,eAAe;AACrB,MAAMC,gBAAgB;AAEtB,MAAMC,qBAAqB,CAAC;+NACmM,CAAC;AAChO,MAAMC,gBAAgB,IACpB;AACF,MAAMC,iBAAiB,IAAM;AAEtB,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtBI,YACEJ,OAAO,WAAW,YAAYK,EAAE,SAAS,EACzC;QAIF,MAAMC,QAAQN,OAAO,WAAW,CAAC,KAAK;QACtC,MAAMO,aAAuB,EAAE;QAG/B,MAAMC,cAAc,CAACC;gBAECC,aAEhBC,mBACAC,oBACAC,oBACAC,oBACAC;YANJ,MAAMC,cAAcN,AAAAA,SAAAA,CAAAA,cAAAA,MAAM,IAAI,AAAD,IAATA,KAAAA,IAAAA,YAAY,SAAS,AAAD,KAAKD;YAE7C,IAAIE,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,aAAa,OAAO;YACvD,IAAIC,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa,OAAO;YACvD,IAAIC,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,cAAc,OAAO;YACxD,IAAIC,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,YAAY,OAAO;YACtD,IAAIC,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa;gBAE9C,IAAIE,uBAAuBD,cACzB,OAAOf;gBAET,OAAO;YACT;YAEAiB,QAAQ,IAAI,CAAC,kBAAkBF,YAAY,IAAI;YAC/C,OAAO;QACT;QAGA,MAAMG,iBAAiB,CAACV;gBAEFC,aAQhBC;YARJ,MAAMK,cAAcN,AAAAA,SAAAA,CAAAA,cAAAA,MAAM,IAAI,AAAD,IAATA,KAAAA,IAAAA,YAAY,SAAS,AAAD,KAAKD;YAG7C,IAAI,iBAAiBA,OACnB,OAAOA,MAAM,WAAW,IAAI;YAI9B,IAAIE,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,aACjC;gBAAA,IAAI,kCAAkCK,YAAY,IAAI,CAAC,KAAK,IAC1D,OAAO;YACT;YAGF,OAAO;QACT;QAEA,KAAK,MAAM,CAACI,KAAKX,MAAM,IAAIY,OAAO,OAAO,CAACf,OACxC,IAAIG,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;YAEtC,MAAMa,aACJ,AAAqC,cAArC,OAAQb,MAAc,UAAU,IAC/BA,MAAc,UAAU;YAC3B,MAAMc,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;YAGjD,MAAMI,WAAWhB,YAAYC;YAG7B,MAAMgB,cAAcN,eAAeV;YAGnC,IAAIiB,YAAY,GAAGH,gBAAgB,EAAE,EAAEC,UAAU;YACjD,IAAIC,aACFC,aAAa,CAAC,IAAI,EAAED,aAAa;YAGnClB,WAAW,IAAI,CAACmB;QAClB;QAGF,IAAInB,WAAW,MAAM,GAAG,GAAG;YACzBJ,OAAO,IAAI,CAAC;YACZ,KAAK,MAAMuB,aAAanB,WACtBJ,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEuB,WAAW;QAElC;IACF;IAEA,OAAO,CAAC,EAAE,EAAE1B,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEA,MAAMyB,6BAA6B,CAAC,EAClCC,WAAW,EACXC,MAAM,EAIP;IACC,MAAMC,iBAAiBF,YAAY,GAAG,CAAC,CAAC5B,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAM+B,wBAAwBH,YAAY,GAAG,CAAC,CAAC5B,SACtCD,qBAAqBC,QAAQH;IAEtC,MAAMmC,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;yIAK+H,EAAED,eAAe;;kGAExD,EAAEG,gBAAgBJ,QAAQ;;;AAG5H,EAAEG,WAAW;;;;;;;EAOX,EAAEvC,SAAS;EACX,EAAEC,aAAa;EACf,EAAEE,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;AAyBvB,CAAC;AACD;AAEA,MAAMsC,sBAAsB,CAAC,EAC3BN,WAAW,EAC0B;IACrC,MAAME,iBAAiBF,YAAY,GAAG,CAAC,CAAC5B,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAM+B,wBAAwBH,YAAY,GAAG,CAAC,CAAC5B,SACtCD,qBAAqBC,QAAQF;IAEtC,MAAMkC,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;;;;;;;;;;+IAcqI,EAAED,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;AAyBhK,EAAEE,WAAW;;AAEb,CAAC,CAAC,IAAI;AACN;AAEA,MAAMG,iBAAiB,CAAC;;;;;;;;;EAStB,EAAExC,cAAc;EAChB,EAAEC,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AA+DvB,CAAC;AAEM,eAAewC,2BAA2B,EAC/CR,WAAW,EACXC,MAAM,EAIP;IACC,IAAIA,QACF,OAAOF,2BAA2B;QAAEC;QAAaC;IAAO;IAG1D,OAAO,GAAGK,oBAAoB;QAAEN;IAAY,GAAG,IAAI,EAAEO,gBAAgB;AACvE;AAEO,MAAME,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,gCAAgC,CAC3CC,iBACAC,KACAC;IAEA,IAAID,KACF,OAAO,CAAC;;;;;IAKR,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;;;;;AAMpB,EAAEC,IAAI;;AAEN,CAAC;IAGC,OAAO,CAAC;;;;IAIN,EAAEC,kBAAkB;;;EAGtB,EAAEF,gBAAgB;;AAEpB,CAAC;AACD;AAEO,MAAMG,uBAAuB,CAClCb;IAEA,IAAIA,QACF,OAAO,IAAIc,eAAe;QACxB,UAAU;QACV,gBAAgB;YAAC;SAAwB;IAC3C;IAGF,OAAO,IAAIA,eAAe;QACxB,UAAU,CAAC;;;;;;uBAMQ,CAAC;QACpB,gBAAgB;YAAC;YAAmB;SAAwB;IAC9D;AACF"}
|
|
@@ -47,7 +47,9 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
|
|
|
47
47
|
content: messageContent
|
|
48
48
|
}
|
|
49
49
|
];
|
|
50
|
-
const response = await callAi(prompt, AIActionType.EXTRACT_DATA
|
|
50
|
+
const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {
|
|
51
|
+
intent: 'default'
|
|
52
|
+
});
|
|
51
53
|
if ((null == response ? void 0 : response.content) && 'string' == typeof response.content) return response.content;
|
|
52
54
|
throw new Error('Failed to generate Playwright test code');
|
|
53
55
|
};
|
|
@@ -98,12 +100,16 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
|
|
|
98
100
|
content: messageContent
|
|
99
101
|
}
|
|
100
102
|
];
|
|
101
|
-
if (options.stream && options.onChunk) return await callAi(prompt, AIActionType.EXTRACT_DATA,
|
|
103
|
+
if (options.stream && options.onChunk) return await callAi(prompt, AIActionType.EXTRACT_DATA, {
|
|
104
|
+
intent: 'default'
|
|
105
|
+
}, {
|
|
102
106
|
stream: true,
|
|
103
107
|
onChunk: options.onChunk
|
|
104
108
|
});
|
|
105
109
|
{
|
|
106
|
-
const response = await callAi(prompt, AIActionType.EXTRACT_DATA
|
|
110
|
+
const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {
|
|
111
|
+
intent: 'default'
|
|
112
|
+
});
|
|
107
113
|
if ((null == response ? void 0 : response.content) && 'string' == typeof response.content) return {
|
|
108
114
|
content: response.content,
|
|
109
115
|
usage: response.usage,
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/playwright-generator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/playwright-generator.ts"],"sourcesContent":["import type {\n StreamingAIResponse,\n StreamingCodeGenerationOptions,\n} from '@/types';\nimport { PLAYWRIGHT_EXAMPLE_CODE } from '@midscene/shared/constants';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType, callAi } from '../index';\n\n// Import shared utilities and types from yaml-generator\nimport {\n type ChromeRecordedEvent,\n type EventCounts,\n type EventSummary,\n type InputDescription,\n type ProcessedEvent,\n createEventCounts,\n createMessageContent,\n extractInputDescriptions,\n filterEventsByType,\n getScreenshotsForLLM,\n prepareEventSummary,\n processEventsForLLM,\n validateEvents,\n} from './yaml-generator';\n\n// Playwright-specific interfaces\nexport interface PlaywrightGenerationOptions {\n testName?: string;\n includeScreenshots?: boolean;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: string;\n viewportSize?: { width: number; height: number };\n waitForNetworkIdle?: boolean;\n waitForNetworkIdleTimeout?: number;\n}\n\n// Re-export shared types for backward compatibility\nexport type {\n ChromeRecordedEvent,\n EventCounts,\n InputDescription,\n ProcessedEvent,\n EventSummary,\n};\n\n// Re-export shared utilities for backward compatibility\nexport {\n getScreenshotsForLLM,\n filterEventsByType,\n createEventCounts,\n extractInputDescriptions,\n processEventsForLLM,\n prepareEventSummary,\n createMessageContent,\n validateEvents,\n};\n\n/**\n * Generates Playwright test code from recorded events\n */\nexport const generatePlaywrightTest = async (\n events: ChromeRecordedEvent[],\n options: PlaywrightGenerationOptions = {},\n): Promise<string> => {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add Playwright-specific options to summary\n const playwrightSummary = {\n ...summary,\n waitForNetworkIdle: options.waitForNetworkIdle !== false,\n waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2000,\n viewportSize: options.viewportSize || { width: 1280, height: 800 },\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);\n\n // Create prompt text\n const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.\n\nEvent Summary:\n${JSON.stringify(playwrightSummary, null, 2)}\n\nGenerated code should:\n1. Import required dependencies\n2. Set up the test with proper configuration\n3. Include a beforeEach hook to navigate to the starting URL\n4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)\n5. Include appropriate assertions and validations\n6. Follow best practices for Playwright tests\n7. Be ready to execute without further modification\n\nRespond ONLY with the complete Playwright test code, no explanations.`;\n\n // Create message content with screenshots\n const messageContent = createMessageContent(\n promptText,\n screenshots,\n options.includeScreenshots !== false,\n );\n\n // Create system prompt\n const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene. \nYour task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.\n\n${PLAYWRIGHT_EXAMPLE_CODE}`;\n\n // Use LLM to generate the Playwright test code\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: systemPrompt,\n },\n {\n role: 'user',\n content: messageContent,\n },\n ];\n\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA);\n\n if (response?.content && typeof response.content === 'string') {\n return response.content;\n }\n\n throw new Error('Failed to generate Playwright test code');\n};\n\n/**\n * Generates Playwright test code from recorded events with streaming support\n */\nexport const generatePlaywrightTestStream = async (\n events: ChromeRecordedEvent[],\n options: PlaywrightGenerationOptions & StreamingCodeGenerationOptions = {},\n): Promise<StreamingAIResponse> => {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add Playwright-specific options to summary\n const playwrightSummary = {\n ...summary,\n waitForNetworkIdle: options.waitForNetworkIdle !== false,\n waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2000,\n viewportSize: options.viewportSize || { width: 1280, height: 800 },\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);\n\n // Create prompt text\n const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.\n\nEvent Summary:\n${JSON.stringify(playwrightSummary, null, 2)}\n\nGenerated code should:\n1. Import required dependencies\n2. Set up the test with proper configuration\n3. Include a beforeEach hook to navigate to the starting URL\n4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)\n5. Include appropriate assertions and validations\n6. Follow best practices for Playwright tests\n7. Be ready to execute without further modification\n8. can't wrap this test code in markdown code block\n\nRespond ONLY with the complete Playwright test code, no explanations.`;\n\n // Create message content with screenshots\n const messageContent = createMessageContent(\n promptText,\n screenshots,\n options.includeScreenshots !== false,\n );\n\n // Create system prompt\n const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene. \nYour task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.\n\n${PLAYWRIGHT_EXAMPLE_CODE}`;\n\n // Use LLM to generate the Playwright test code with streaming\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: systemPrompt,\n },\n {\n role: 'user',\n content: messageContent,\n },\n ];\n\n if (options.stream && options.onChunk) {\n // Use streaming\n return await callAi(prompt, AIActionType.EXTRACT_DATA, undefined, {\n stream: true,\n onChunk: options.onChunk,\n });\n } else {\n // Fallback to non-streaming\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA);\n\n if (response?.content && typeof response.content === 'string') {\n return {\n content: response.content,\n usage: response.usage,\n isStreamed: false,\n };\n }\n\n throw new Error('Failed to generate Playwright test code');\n }\n};\n"],"names":["generatePlaywrightTest","events","options","validateEvents","summary","prepareEventSummary","playwrightSummary","screenshots","getScreenshotsForLLM","promptText","JSON","messageContent","createMessageContent","systemPrompt","PLAYWRIGHT_EXAMPLE_CODE","prompt","response","callAi","AIActionType","Error","generatePlaywrightTestStream","undefined"],"mappings":";;;AA6DO,MAAMA,yBAAyB,OACpCC,QACAC,UAAuC,CAAC,CAAC;IAGzCC,eAAeF;IAGf,MAAMG,UAAUC,oBAAoBJ,QAAQ;QAC1C,UAAUC,QAAQ,QAAQ;QAC1B,gBAAgBA,QAAQ,cAAc,IAAI;IAC5C;IAGA,MAAMI,oBAAoB;QACxB,GAAGF,OAAO;QACV,oBAAoBF,AAA+B,UAA/BA,QAAQ,kBAAkB;QAC9C,2BAA2BA,QAAQ,yBAAyB,IAAI;QAChE,cAAcA,QAAQ,YAAY,IAAI;YAAE,OAAO;YAAM,QAAQ;QAAI;IACnE;IAGA,MAAMK,cAAcC,qBAAqBP,QAAQC,QAAQ,cAAc,IAAI;IAG3E,MAAMO,aAAa,CAAC;;;AAGtB,EAAEC,KAAK,SAAS,CAACJ,mBAAmB,MAAM,GAAG;;;;;;;;;;;qEAWwB,CAAC;IAGpE,MAAMK,iBAAiBC,qBACrBH,YACAF,aACAL,AAA+B,UAA/BA,QAAQ,kBAAkB;IAI5B,MAAMW,eAAe,CAAC;;;AAGxB,EAAEC,yBAAyB;IAGzB,MAAMC,SAAuC;QAC3C;YACE,MAAM;YACN,SAASF;QACX;QACA;YACE,MAAM;YACN,SAASF;QACX;KACD;IAED,MAAMK,WAAW,MAAMC,OAAOF,QAAQG,aAAa,YAAY;IAE/D,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAOA,SAAS,OAAO;IAGzB,MAAM,IAAIG,MAAM;AAClB;AAKO,MAAMC,+BAA+B,OAC1CnB,QACAC,UAAwE,CAAC,CAAC;IAG1EC,eAAeF;IAGf,MAAMG,UAAUC,oBAAoBJ,QAAQ;QAC1C,UAAUC,QAAQ,QAAQ;QAC1B,gBAAgBA,QAAQ,cAAc,IAAI;IAC5C;IAGA,MAAMI,oBAAoB;QACxB,GAAGF,OAAO;QACV,oBAAoBF,AAA+B,UAA/BA,QAAQ,kBAAkB;QAC9C,2BAA2BA,QAAQ,yBAAyB,IAAI;QAChE,cAAcA,QAAQ,YAAY,IAAI;YAAE,OAAO;YAAM,QAAQ;QAAI;IACnE;IAGA,MAAMK,cAAcC,qBAAqBP,QAAQC,QAAQ,cAAc,IAAI;IAG3E,MAAMO,aAAa,CAAC;;;AAGtB,EAAEC,KAAK,SAAS,CAACJ,mBAAmB,MAAM,GAAG;;;;;;;;;;;;qEAYwB,CAAC;IAGpE,MAAMK,iBAAiBC,qBACrBH,YACAF,aACAL,AAA+B,UAA/BA,QAAQ,kBAAkB;IAI5B,MAAMW,eAAe,CAAC;;;AAGxB,EAAEC,yBAAyB;IAGzB,MAAMC,SAAuC;QAC3C;YACE,MAAM;YACN,SAASF;QACX;QACA;YACE,MAAM;YACN,SAASF;QACX;KACD;IAED,IAAIT,QAAQ,MAAM,IAAIA,QAAQ,OAAO,EAEnC,OAAO,MAAMe,OAAOF,QAAQG,aAAa,YAAY,EAAEG,QAAW;QAChE,QAAQ;QACR,SAASnB,QAAQ,OAAO;IAC1B;IACK;QAEL,MAAMc,WAAW,MAAMC,OAAOF,QAAQG,aAAa,YAAY;QAE/D,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAO;YACL,SAASA,SAAS,OAAO;YACzB,OAAOA,SAAS,KAAK;YACrB,YAAY;QACd;QAGF,MAAM,IAAIG,MAAM;IAClB;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/playwright-generator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/playwright-generator.ts"],"sourcesContent":["import type {\n StreamingAIResponse,\n StreamingCodeGenerationOptions,\n} from '@/types';\nimport { PLAYWRIGHT_EXAMPLE_CODE } from '@midscene/shared/constants';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType, callAi } from '../index';\n\n// Import shared utilities and types from yaml-generator\nimport {\n type ChromeRecordedEvent,\n type EventCounts,\n type EventSummary,\n type InputDescription,\n type ProcessedEvent,\n createEventCounts,\n createMessageContent,\n extractInputDescriptions,\n filterEventsByType,\n getScreenshotsForLLM,\n prepareEventSummary,\n processEventsForLLM,\n validateEvents,\n} from './yaml-generator';\n\n// Playwright-specific interfaces\nexport interface PlaywrightGenerationOptions {\n testName?: string;\n includeScreenshots?: boolean;\n includeTimestamps?: boolean;\n maxScreenshots?: number;\n description?: string;\n viewportSize?: { width: number; height: number };\n waitForNetworkIdle?: boolean;\n waitForNetworkIdleTimeout?: number;\n}\n\n// Re-export shared types for backward compatibility\nexport type {\n ChromeRecordedEvent,\n EventCounts,\n InputDescription,\n ProcessedEvent,\n EventSummary,\n};\n\n// Re-export shared utilities for backward compatibility\nexport {\n getScreenshotsForLLM,\n filterEventsByType,\n createEventCounts,\n extractInputDescriptions,\n processEventsForLLM,\n prepareEventSummary,\n createMessageContent,\n validateEvents,\n};\n\n/**\n * Generates Playwright test code from recorded events\n */\nexport const generatePlaywrightTest = async (\n events: ChromeRecordedEvent[],\n options: PlaywrightGenerationOptions = {},\n): Promise<string> => {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add Playwright-specific options to summary\n const playwrightSummary = {\n ...summary,\n waitForNetworkIdle: options.waitForNetworkIdle !== false,\n waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2000,\n viewportSize: options.viewportSize || { width: 1280, height: 800 },\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);\n\n // Create prompt text\n const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.\n\nEvent Summary:\n${JSON.stringify(playwrightSummary, null, 2)}\n\nGenerated code should:\n1. Import required dependencies\n2. Set up the test with proper configuration\n3. Include a beforeEach hook to navigate to the starting URL\n4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)\n5. Include appropriate assertions and validations\n6. Follow best practices for Playwright tests\n7. Be ready to execute without further modification\n\nRespond ONLY with the complete Playwright test code, no explanations.`;\n\n // Create message content with screenshots\n const messageContent = createMessageContent(\n promptText,\n screenshots,\n options.includeScreenshots !== false,\n );\n\n // Create system prompt\n const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene. \nYour task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.\n\n${PLAYWRIGHT_EXAMPLE_CODE}`;\n\n // Use LLM to generate the Playwright test code\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: systemPrompt,\n },\n {\n role: 'user',\n content: messageContent,\n },\n ];\n\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {\n intent: 'default',\n });\n\n if (response?.content && typeof response.content === 'string') {\n return response.content;\n }\n\n throw new Error('Failed to generate Playwright test code');\n};\n\n/**\n * Generates Playwright test code from recorded events with streaming support\n */\nexport const generatePlaywrightTestStream = async (\n events: ChromeRecordedEvent[],\n options: PlaywrightGenerationOptions & StreamingCodeGenerationOptions = {},\n): Promise<StreamingAIResponse> => {\n // Validate input\n validateEvents(events);\n\n // Prepare event summary using shared utilities\n const summary = prepareEventSummary(events, {\n testName: options.testName,\n maxScreenshots: options.maxScreenshots || 3,\n });\n\n // Add Playwright-specific options to summary\n const playwrightSummary = {\n ...summary,\n waitForNetworkIdle: options.waitForNetworkIdle !== false,\n waitForNetworkIdleTimeout: options.waitForNetworkIdleTimeout || 2000,\n viewportSize: options.viewportSize || { width: 1280, height: 800 },\n };\n\n // Get screenshots for visual context\n const screenshots = getScreenshotsForLLM(events, options.maxScreenshots || 3);\n\n // Create prompt text\n const promptText = `Generate a Playwright test using @midscene/web/playwright that reproduces this recorded browser session. The test should be based on the following events and follow the structure of the example provided. Make the test descriptive with appropriate assertions and validations.\n\nEvent Summary:\n${JSON.stringify(playwrightSummary, null, 2)}\n\nGenerated code should:\n1. Import required dependencies\n2. Set up the test with proper configuration\n3. Include a beforeEach hook to navigate to the starting URL\n4. Implement a test that uses Midscene AI methods (aiTap, aiInput, aiAssert, etc.)\n5. Include appropriate assertions and validations\n6. Follow best practices for Playwright tests\n7. Be ready to execute without further modification\n8. can't wrap this test code in markdown code block\n\nRespond ONLY with the complete Playwright test code, no explanations.`;\n\n // Create message content with screenshots\n const messageContent = createMessageContent(\n promptText,\n screenshots,\n options.includeScreenshots !== false,\n );\n\n // Create system prompt\n const systemPrompt = `You are an expert test automation engineer specializing in Playwright and Midscene. \nYour task is to generate a complete, executable Playwright test using @midscene/web/playwright that reproduces a recorded browser session.\n\n${PLAYWRIGHT_EXAMPLE_CODE}`;\n\n // Use LLM to generate the Playwright test code with streaming\n const prompt: ChatCompletionMessageParam[] = [\n {\n role: 'system',\n content: systemPrompt,\n },\n {\n role: 'user',\n content: messageContent,\n },\n ];\n\n if (options.stream && options.onChunk) {\n // Use streaming\n return await callAi(\n prompt,\n AIActionType.EXTRACT_DATA,\n {\n intent: 'default',\n },\n {\n stream: true,\n onChunk: options.onChunk,\n },\n );\n } else {\n // Fallback to non-streaming\n const response = await callAi(prompt, AIActionType.EXTRACT_DATA, {\n intent: 'default',\n });\n\n if (response?.content && typeof response.content === 'string') {\n return {\n content: response.content,\n usage: response.usage,\n isStreamed: false,\n };\n }\n\n throw new Error('Failed to generate Playwright test code');\n }\n};\n"],"names":["generatePlaywrightTest","events","options","validateEvents","summary","prepareEventSummary","playwrightSummary","screenshots","getScreenshotsForLLM","promptText","JSON","messageContent","createMessageContent","systemPrompt","PLAYWRIGHT_EXAMPLE_CODE","prompt","response","callAi","AIActionType","Error","generatePlaywrightTestStream"],"mappings":";;;AA6DO,MAAMA,yBAAyB,OACpCC,QACAC,UAAuC,CAAC,CAAC;IAGzCC,eAAeF;IAGf,MAAMG,UAAUC,oBAAoBJ,QAAQ;QAC1C,UAAUC,QAAQ,QAAQ;QAC1B,gBAAgBA,QAAQ,cAAc,IAAI;IAC5C;IAGA,MAAMI,oBAAoB;QACxB,GAAGF,OAAO;QACV,oBAAoBF,AAA+B,UAA/BA,QAAQ,kBAAkB;QAC9C,2BAA2BA,QAAQ,yBAAyB,IAAI;QAChE,cAAcA,QAAQ,YAAY,IAAI;YAAE,OAAO;YAAM,QAAQ;QAAI;IACnE;IAGA,MAAMK,cAAcC,qBAAqBP,QAAQC,QAAQ,cAAc,IAAI;IAG3E,MAAMO,aAAa,CAAC;;;AAGtB,EAAEC,KAAK,SAAS,CAACJ,mBAAmB,MAAM,GAAG;;;;;;;;;;;qEAWwB,CAAC;IAGpE,MAAMK,iBAAiBC,qBACrBH,YACAF,aACAL,AAA+B,UAA/BA,QAAQ,kBAAkB;IAI5B,MAAMW,eAAe,CAAC;;;AAGxB,EAAEC,yBAAyB;IAGzB,MAAMC,SAAuC;QAC3C;YACE,MAAM;YACN,SAASF;QACX;QACA;YACE,MAAM;YACN,SAASF;QACX;KACD;IAED,MAAMK,WAAW,MAAMC,OAAOF,QAAQG,aAAa,YAAY,EAAE;QAC/D,QAAQ;IACV;IAEA,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAOA,SAAS,OAAO;IAGzB,MAAM,IAAIG,MAAM;AAClB;AAKO,MAAMC,+BAA+B,OAC1CnB,QACAC,UAAwE,CAAC,CAAC;IAG1EC,eAAeF;IAGf,MAAMG,UAAUC,oBAAoBJ,QAAQ;QAC1C,UAAUC,QAAQ,QAAQ;QAC1B,gBAAgBA,QAAQ,cAAc,IAAI;IAC5C;IAGA,MAAMI,oBAAoB;QACxB,GAAGF,OAAO;QACV,oBAAoBF,AAA+B,UAA/BA,QAAQ,kBAAkB;QAC9C,2BAA2BA,QAAQ,yBAAyB,IAAI;QAChE,cAAcA,QAAQ,YAAY,IAAI;YAAE,OAAO;YAAM,QAAQ;QAAI;IACnE;IAGA,MAAMK,cAAcC,qBAAqBP,QAAQC,QAAQ,cAAc,IAAI;IAG3E,MAAMO,aAAa,CAAC;;;AAGtB,EAAEC,KAAK,SAAS,CAACJ,mBAAmB,MAAM,GAAG;;;;;;;;;;;;qEAYwB,CAAC;IAGpE,MAAMK,iBAAiBC,qBACrBH,YACAF,aACAL,AAA+B,UAA/BA,QAAQ,kBAAkB;IAI5B,MAAMW,eAAe,CAAC;;;AAGxB,EAAEC,yBAAyB;IAGzB,MAAMC,SAAuC;QAC3C;YACE,MAAM;YACN,SAASF;QACX;QACA;YACE,MAAM;YACN,SAASF;QACX;KACD;IAED,IAAIT,QAAQ,MAAM,IAAIA,QAAQ,OAAO,EAEnC,OAAO,MAAMe,OACXF,QACAG,aAAa,YAAY,EACzB;QACE,QAAQ;IACV,GACA;QACE,QAAQ;QACR,SAAShB,QAAQ,OAAO;IAC1B;IAEG;QAEL,MAAMc,WAAW,MAAMC,OAAOF,QAAQG,aAAa,YAAY,EAAE;YAC/D,QAAQ;QACV;QAEA,IAAIF,AAAAA,CAAAA,QAAAA,WAAAA,KAAAA,IAAAA,SAAU,OAAO,AAAD,KAAK,AAA4B,YAA5B,OAAOA,SAAS,OAAO,EAC9C,OAAO;YACL,SAASA,SAAS,OAAO;YACzB,OAAOA,SAAS,KAAK;YACrB,YAAY;QACd;QAGF,MAAM,IAAIG,MAAM;IAClB;AACF"}
|