@midscene/core 0.30.10 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +233 -144
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/index.mjs +3 -3
- package/dist/es/agent/task-builder.mjs +319 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/task-cache.mjs +4 -4
- package/dist/es/agent/task-cache.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +197 -504
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/ui-utils.mjs +54 -35
- package/dist/es/agent/ui-utils.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +16 -58
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/conversation-history.mjs +25 -13
- package/dist/es/ai-model/conversation-history.mjs.map +1 -1
- package/dist/es/ai-model/index.mjs +4 -4
- package/dist/es/ai-model/inspect.mjs +45 -54
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +47 -65
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
- package/dist/es/ai-model/prompt/common.mjs.map +1 -1
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +11 -235
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +76 -322
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -14
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +2 -2
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/util.mjs +3 -88
- package/dist/es/ai-model/prompt/util.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +10 -10
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +182 -274
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +69 -8
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/{ai-model/common.mjs → common.mjs} +18 -30
- package/dist/es/common.mjs.map +1 -0
- package/dist/es/device/device-options.mjs +0 -0
- package/dist/es/device/index.mjs +29 -12
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/index.mjs +5 -4
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/report.mjs.map +1 -1
- package/dist/es/{insight → service}/index.mjs +38 -51
- package/dist/es/service/index.mjs.map +1 -0
- package/dist/es/{insight → service}/utils.mjs +3 -3
- package/dist/es/service/utils.mjs.map +1 -0
- package/dist/es/task-runner.mjs +264 -0
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/tree.mjs +13 -2
- package/dist/es/tree.mjs.map +1 -0
- package/dist/es/types.mjs +18 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +6 -7
- package/dist/es/utils.mjs.map +1 -1
- package/dist/es/yaml/builder.mjs.map +1 -1
- package/dist/es/yaml/player.mjs +121 -98
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/es/yaml/utils.mjs +1 -1
- package/dist/es/yaml/utils.mjs.map +1 -1
- package/dist/lib/agent/agent.js +231 -142
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/common.js +1 -1
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/index.js +14 -14
- package/dist/lib/agent/index.js.map +1 -1
- package/dist/lib/agent/task-builder.js +356 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/task-cache.js +8 -8
- package/dist/lib/agent/task-cache.js.map +1 -1
- package/dist/lib/agent/tasks.js +202 -506
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/ui-utils.js +58 -36
- package/dist/lib/agent/ui-utils.js.map +1 -1
- package/dist/lib/agent/utils.js +26 -68
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/conversation-history.js +27 -15
- package/dist/lib/ai-model/conversation-history.js.map +1 -1
- package/dist/lib/ai-model/index.js +27 -27
- package/dist/lib/ai-model/index.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +51 -57
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +49 -67
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/assertion.js +2 -2
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
- package/dist/lib/ai-model/prompt/common.js +2 -2
- package/dist/lib/ai-model/prompt/common.js.map +1 -1
- package/dist/lib/ai-model/prompt/describe.js +2 -2
- package/dist/lib/ai-model/prompt/describe.js.map +1 -1
- package/dist/lib/ai-model/prompt/extraction.js +2 -2
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +14 -241
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +79 -328
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +17 -16
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +11 -11
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/ui-tars-locator.js +2 -2
- package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +2 -2
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/util.js +7 -95
- package/dist/lib/ai-model/prompt/util.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +18 -18
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +288 -401
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +71 -10
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/{ai-model/common.js → common.js} +40 -55
- package/dist/lib/common.js.map +1 -0
- package/dist/lib/device/device-options.js +20 -0
- package/dist/lib/device/device-options.js.map +1 -0
- package/dist/lib/device/index.js +63 -40
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/image/index.js +5 -5
- package/dist/lib/image/index.js.map +1 -1
- package/dist/lib/index.js +24 -20
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/report.js +2 -2
- package/dist/lib/report.js.map +1 -1
- package/dist/lib/{insight → service}/index.js +41 -54
- package/dist/lib/service/index.js.map +1 -0
- package/dist/lib/{insight → service}/utils.js +7 -7
- package/dist/lib/service/utils.js.map +1 -0
- package/dist/lib/task-runner.js +301 -0
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/tree.js +13 -4
- package/dist/lib/tree.js.map +1 -1
- package/dist/lib/types.js +31 -12
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +16 -17
- package/dist/lib/utils.js.map +1 -1
- package/dist/lib/yaml/builder.js +2 -2
- package/dist/lib/yaml/builder.js.map +1 -1
- package/dist/lib/yaml/index.js +16 -22
- package/dist/lib/yaml/index.js.map +1 -1
- package/dist/lib/yaml/player.js +123 -100
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/lib/yaml/utils.js +6 -6
- package/dist/lib/yaml/utils.js.map +1 -1
- package/dist/lib/yaml.js +1 -1
- package/dist/lib/yaml.js.map +1 -1
- package/dist/types/agent/agent.d.ts +62 -17
- package/dist/types/agent/execution-session.d.ts +36 -0
- package/dist/types/agent/index.d.ts +3 -2
- package/dist/types/agent/task-builder.d.ts +35 -0
- package/dist/types/agent/tasks.d.ts +32 -23
- package/dist/types/agent/ui-utils.d.ts +9 -2
- package/dist/types/agent/utils.d.ts +9 -35
- package/dist/types/ai-model/conversation-history.d.ts +8 -4
- package/dist/types/ai-model/index.d.ts +5 -5
- package/dist/types/ai-model/inspect.d.ts +20 -12
- package/dist/types/ai-model/llm-planning.d.ts +3 -1
- package/dist/types/ai-model/prompt/llm-locator.d.ts +1 -6
- package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +1 -3
- package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +2 -34
- package/dist/types/ai-model/service-caller/index.d.ts +2 -3
- package/dist/types/ai-model/ui-tars-planning.d.ts +15 -2
- package/dist/types/{ai-model/common.d.ts → common.d.ts} +6 -6
- package/dist/types/device/device-options.d.ts +57 -0
- package/dist/types/device/index.d.ts +55 -39
- package/dist/types/index.d.ts +7 -6
- package/dist/types/service/index.d.ts +26 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/task-runner.d.ts +49 -0
- package/dist/types/tree.d.ts +4 -1
- package/dist/types/types.d.ts +103 -66
- package/dist/types/yaml/utils.d.ts +1 -1
- package/dist/types/yaml.d.ts +68 -43
- package/package.json +9 -12
- package/dist/es/ai-model/action-executor.mjs +0 -129
- package/dist/es/ai-model/action-executor.mjs.map +0 -1
- package/dist/es/ai-model/common.mjs.map +0 -1
- package/dist/es/insight/index.mjs.map +0 -1
- package/dist/es/insight/utils.mjs.map +0 -1
- package/dist/lib/ai-model/action-executor.js +0 -163
- package/dist/lib/ai-model/action-executor.js.map +0 -1
- package/dist/lib/ai-model/common.js.map +0 -1
- package/dist/lib/insight/index.js.map +0 -1
- package/dist/lib/insight/utils.js.map +0 -1
- package/dist/types/ai-model/action-executor.d.ts +0 -19
- package/dist/types/insight/index.d.ts +0 -31
- package/dist/types/insight/utils.d.ts +0 -2
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n AIActionType,\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport { callAIWithObjectResponse } from './service-caller/index';\n\nconst debug = getDebug('planning');\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { screenshotBase64, size } = context;\n\n const { vlMode } = modelConfig;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n vlMode,\n includeBbox: opts.includeBbox,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (vlMode === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'this is the latest screenshot',\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: planFromAI,\n contentString: rawResponse,\n usage,\n } = await callAIWithObjectResponse<RawResponsePlanningAIResponse>(\n msgs,\n AIActionType.PLAN,\n modelConfig,\n );\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n yamlFlow: buildYamlFlowFromPlans(\n actions,\n opts.actionSpace,\n planFromAI.sleep,\n ),\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && vlMode !== undefined) {\n // Always use VL mode to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n vlMode,\n );\n }\n });\n });\n\n if (\n actions.length === 0 &&\n returnValue.more_actions_needed_by_instruction &&\n !returnValue.sleep\n ) {\n console.warn(\n 'No actions planned for the prompt, but model said more actions are needed:',\n userInstruction,\n );\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","plan","userInstruction","opts","context","modelConfig","conversationHistory","screenshotBase64","size","vlMode","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","historyLog","msgs","planFromAI","rawResponse","usage","callAIWithObjectResponse","AIActionType","actions","returnValue","buildYamlFlowFromPlans","assert","action","type","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","undefined","fillBboxParam","console"],"mappings":";;;;;;AAsBA,MAAMA,QAAQC,SAAS;AAEhB,eAAeC,KACpBC,eAAuB,EACvBC,IASC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,gBAAgB,EAAEC,IAAI,EAAE,GAAGJ;IAEnC,MAAM,EAAEK,MAAM,EAAE,GAAGJ;IAEnB,MAAMK,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7BM;QACA,aAAaN,KAAK,WAAW;IAC/B;IAEA,IAAIS,eAAeL;IACnB,IAAIM,aAAaL,KAAK,KAAK;IAC3B,IAAIM,cAAcN,KAAK,MAAM;IAC7B,MAAMO,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAIL,AAAW,iBAAXA,QAAyB;QAC3B,MAAMQ,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAEJ,IAAIf,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,CAAC;gBACvI;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKM;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM;YACR;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAC3B,MAAMC,aAAahB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMoB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASb;QAAa;WACrCU;WACAE;KACJ;IAED,MAAM,EACJ,SAASE,UAAU,EACnB,eAAeC,WAAW,EAC1BC,KAAK,EACN,GAAG,MAAMC,yBACRJ,MACAK,aAAa,IAAI,EACjBvB;IAGF,MAAMwB,UAAUL,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,MAAMM,cAAkC;QACtC,GAAGN,UAAU;QACbK;QACAJ;QACAC;QACA,UAAUK,uBACRF,SACA1B,KAAK,WAAW,EAChBqB,WAAW,KAAK;IAEpB;IAEAQ,OAAOR,YAAY;IAEnBK,QAAQ,OAAO,CAAC,CAACI;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAME,sBAAsBhC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAAC8B,SAAWA,OAAO,IAAI,KAAKC;QAG9BnC,MAAM,+BAA+BoC;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENpC,MAAM,gBAAgBqC;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAeN,OAAO,KAAK,CAACK,MAAM;YACxC,IAAIC,gBAAgB9B,AAAW+B,WAAX/B,QAElBwB,OAAO,KAAK,CAACK,MAAM,GAAGG,cACpBF,cACA1B,YACAC,aACAC,YACAC,aACAP;QAGN;IACF;IAEA,IACEoB,AAAmB,MAAnBA,QAAQ,MAAM,IACdC,YAAY,kCAAkC,IAC9C,CAACA,YAAY,KAAK,EAElBY,QAAQ,IAAI,CACV,8EACAxC;IAIJI,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMmB;YACR;SACD;IACH;IAEA,OAAOK;AACT"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/assertion.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/assertion.mjs","sources":["../../../../src/ai-model/prompt/assertion.ts"],"sourcesContent":["import type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nexport const assertSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'assert',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n pass: {\n type: 'boolean',\n description: 'Whether the assertion passed or failed',\n },\n thought: {\n type: ['string', 'null'],\n description: 'The thought process behind the assertion',\n },\n },\n required: ['pass', 'thought'],\n additionalProperties: false,\n },\n },\n};\n"],"names":["assertSchema"],"mappings":"AAEO,MAAMA,eAAyC;IACpD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,MAAM;oBACJ,MAAM;oBACN,aAAa;gBACf;gBACA,SAAS;oBACP,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAQ;aAAU;YAC7B,sBAAsB;QACxB;IACF;AACF"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/common.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/common.mjs","sources":["../../../../src/ai-model/prompt/common.ts"],"sourcesContent":["import type { TVlModeTypes } from '@midscene/shared/env';\nexport function bboxDescription(vlMode: TVlModeTypes | undefined) {\n if (vlMode === 'gemini') {\n return 'box_2d bounding box for the target element, should be [ymin, xmin, ymax, xmax] normalized to 0-1000.';\n }\n return '2d bounding box as [xmin, ymin, xmax, ymax]';\n}\n"],"names":["bboxDescription","vlMode"],"mappings":"AACO,SAASA,gBAAgBC,MAAgC;IAC9D,IAAIA,AAAW,aAAXA,QACF,OAAO;IAET,OAAO;AACT"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/describe.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/describe.mjs","sources":["../../../../src/ai-model/prompt/describe.ts"],"sourcesContent":["import { getPreferredLanguage } from '@midscene/shared/env';\n\nexport const elementDescriberInstruction = () => {\n return `\nDescribe the element in the red rectangle for precise identification. Use ${getPreferredLanguage()}.\n\nCRITICAL REQUIREMENTS:\n1. UNIQUENESS: The description must uniquely identify this element on the current page\n2. UNIVERSALITY: Use generic, reusable selectors that work across different contexts\n3. PRECISION: Be specific enough to distinguish from similar elements\n\nDESCRIPTION STRUCTURE:\n1. Element type (button, input, link, div, etc.)\n2. Primary identifier (in order of preference):\n - Unique text content: \"with text 'Login'\"\n - Unique attribute: \"with aria-label 'Search'\"\n - Unique class/ID: \"with class 'primary-button'\"\n - Unique position: \"in header navigation\"\n3. Secondary identifiers (if needed for uniqueness):\n - Visual features: \"blue background\", \"with icon\"\n - Relative position: \"below search bar\", \"in sidebar\"\n - Parent context: \"in login form\", \"in main menu\"\n\nGUIDELINES:\n- Keep description under 25 words\n- Prioritize semantic identifiers over visual ones\n- Use consistent terminology across similar elements\n- Avoid page-specific or temporary content\n- Don't mention the red rectangle or selection box\n- Focus on stable, reusable characteristics\n\nEXAMPLES:\n- \"Login button with text 'Sign In'\"\n- \"Search input with placeholder 'Enter keywords'\"\n- \"Navigation link with text 'Home' in header\"\n- \"Submit button in contact form\"\n- \"Menu icon with aria-label 'Open menu'\"\n\nReturn JSON:\n{\n \"description\": \"unique element identifier\",\n \"error\"?: \"error message if any\"\n}`;\n};\n"],"names":["elementDescriberInstruction","getPreferredLanguage"],"mappings":";AAEO,MAAMA,8BAA8B,IAClC,CAAC;0EACgE,EAAEC,uBAAuB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAsClG,CAAC"}
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/extraction.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/extraction.mjs","sources":["../../../../src/ai-model/prompt/extraction.ts"],"sourcesContent":["import type { ResponseFormatJSONSchema } from 'openai/resources/index';\n\nexport function systemPromptToExtract() {\n return `\nYou are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.\n\nThe user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to understand the user's requirements and extract the data satisfying the <DATA_DEMAND>.\n\nIf a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.\n\nIf the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.\n\n\nReturn in the following JSON format:\n{\n thought: string, // the thinking process of the extraction, less then 300 words\n data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.\n errors: [], // string[], error message if any\n}\n\n# Example 1\nFor example, if the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"name\": \"name shows on the left panel, string\",\n \"age\": \"age shows on the right panel, number\",\n \"isAdmin\": \"if the user is admin, boolean\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n{\n thought: \"According to the screenshot, i can see ...\",\n data: {\n name: \"John\",\n age: 30,\n isAdmin: true\n },\n}\n\n# Example 2\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe todo items list, string[]\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n{\n thought: \"According to the screenshot, i can see ...\",\n data: [\"todo 1\", \"todo 2\", \"todo 3\"],\n}\n\n# Example 3\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\nthe page title, string\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n{\n thought: \"According to the screenshot, i can see ...\",\n data: \"todo list\",\n}\n\n# Example 4\nIf the DATA_DEMAND is:\n\n<DATA_DEMAND>\n{\n \"result\": \"Boolean, is it currently the SMS page?\"\n}\n</DATA_DEMAND>\n\nBy viewing the screenshot and page contents, you can extract the following data:\n\n{\n thought: \"According to the screenshot, i can see ...\",\n data: { result: true },\n}\n`;\n}\n\nexport const extractDataQueryPrompt = (\n pageDescription: string,\n dataQuery: string | Record<string, string>,\n) => {\n let dataQueryText = '';\n if (typeof dataQuery === 'string') {\n dataQueryText = dataQuery;\n } else {\n dataQueryText = JSON.stringify(dataQuery, null, 2);\n }\n\n return `\n<PageDescription>\n${pageDescription}\n</PageDescription>\n\n<DATA_DEMAND>\n${dataQueryText}\n</DATA_DEMAND>\n `;\n};\n\nexport const extractDataSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'extract_data',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n data: {\n type: 'object',\n description: 'The extracted data',\n },\n errors: {\n type: 'array',\n items: {\n type: 'string',\n },\n description: 'Error messages, if any',\n },\n },\n required: ['data', 'errors'],\n additionalProperties: false,\n },\n },\n};\n"],"names":["systemPromptToExtract","extractDataQueryPrompt","pageDescription","dataQuery","dataQueryText","JSON","extractDataSchema"],"mappings":"AAEO,SAASA;IACd,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAkFV,CAAC;AACD;AAEO,MAAMC,yBAAyB,CACpCC,iBACAC;IAEA,IAAIC,gBAAgB;IAElBA,gBADE,AAAqB,YAArB,OAAOD,YACOA,YAEAE,KAAK,SAAS,CAACF,WAAW,MAAM;IAGlD,OAAO,CAAC;;AAEV,EAAED,gBAAgB;;;;AAIlB,EAAEE,cAAc;;EAEd,CAAC;AACH;AAEO,MAAME,oBAA8C;IACzD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,MAAM;oBACJ,MAAM;oBACN,aAAa;gBACf;gBACA,QAAQ;oBACN,MAAM;oBACN,OAAO;wBACL,MAAM;oBACR;oBACA,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAQ;aAAS;YAC5B,sBAAsB;QACxB;IACF;AACF"}
|
|
@@ -1,268 +1,44 @@
|
|
|
1
1
|
import { bboxDescription } from "./common.mjs";
|
|
2
2
|
function systemPromptToLocateElement(vlMode) {
|
|
3
|
-
|
|
4
|
-
|
|
5
|
-
return `
|
|
3
|
+
const bboxComment = bboxDescription(vlMode);
|
|
4
|
+
return `
|
|
6
5
|
## Role:
|
|
7
|
-
You are an
|
|
6
|
+
You are an AI assistant that helps identify UI elements.
|
|
8
7
|
|
|
9
8
|
## Objective:
|
|
10
|
-
- Identify elements in screenshots
|
|
11
|
-
-
|
|
12
|
-
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
|
|
9
|
+
- Identify elements in screenshots that match the user's description.
|
|
10
|
+
- Provide the coordinates of the element that matches the user's description.
|
|
13
11
|
|
|
14
12
|
## Output Format:
|
|
15
13
|
\`\`\`json
|
|
16
14
|
{
|
|
17
15
|
"bbox": [number, number, number, number], // ${bboxComment}
|
|
18
|
-
"errors"?: string[]
|
|
19
|
-
"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
|
|
16
|
+
"errors"?: string[]
|
|
20
17
|
}
|
|
21
18
|
\`\`\`
|
|
22
19
|
|
|
23
20
|
Fields:
|
|
24
|
-
* \`bbox\` is the bounding box of the element that matches the user's description
|
|
25
|
-
* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
|
|
21
|
+
* \`bbox\` is the bounding box of the element that matches the user's description
|
|
26
22
|
* \`errors\` is an optional array of error messages (if any)
|
|
27
23
|
|
|
28
|
-
|
|
29
|
-
- "the third item in the list"
|
|
30
|
-
- "the last button"
|
|
31
|
-
- "the first input box"
|
|
32
|
-
- "the second row"
|
|
33
|
-
|
|
34
|
-
Not order-sensitive means the description is like:
|
|
35
|
-
- "confirm button"
|
|
36
|
-
- "search box"
|
|
37
|
-
- "password input"
|
|
38
|
-
|
|
39
|
-
For example, when an element is found and the description is order-sensitive:
|
|
24
|
+
For example, when an element is found:
|
|
40
25
|
\`\`\`json
|
|
41
26
|
{
|
|
42
27
|
"bbox": [100, 100, 200, 200],
|
|
43
|
-
"isOrderSensitive": true,
|
|
44
28
|
"errors": []
|
|
45
29
|
}
|
|
46
30
|
\`\`\`
|
|
47
31
|
|
|
48
|
-
When no element is found
|
|
32
|
+
When no element is found:
|
|
49
33
|
\`\`\`json
|
|
50
34
|
{
|
|
51
35
|
"bbox": [],
|
|
52
|
-
"isOrderSensitive": false,
|
|
53
36
|
"errors": ["I can see ..., but {some element} is not found"]
|
|
54
37
|
}
|
|
55
38
|
\`\`\`
|
|
56
39
|
`;
|
|
57
|
-
}
|
|
58
|
-
return `
|
|
59
|
-
## Role:
|
|
60
|
-
You are an expert in software page image (2D) and page element text analysis.
|
|
61
|
-
|
|
62
|
-
## Objective:
|
|
63
|
-
- Identify elements in screenshots and text that match the user's description.
|
|
64
|
-
- Return JSON data containing the selection reason and element ID.
|
|
65
|
-
- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
|
|
66
|
-
|
|
67
|
-
## Skills:
|
|
68
|
-
- Image analysis and recognition
|
|
69
|
-
- Multilingual text understanding
|
|
70
|
-
- Software UI design and testing
|
|
71
|
-
|
|
72
|
-
## Workflow:
|
|
73
|
-
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
|
|
74
|
-
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
|
|
75
|
-
3. Found the required number of elements
|
|
76
|
-
4. Return JSON data containing the selection reason and element ID.
|
|
77
|
-
5. Judge whether the user's description is order-sensitive (see below for definition and examples).
|
|
78
|
-
|
|
79
|
-
## Constraints:
|
|
80
|
-
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
|
|
81
|
-
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
|
|
82
|
-
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
|
|
83
|
-
- If no elements are found, the "elements" array should be empty.
|
|
84
|
-
- The returned data must conform to the specified JSON format.
|
|
85
|
-
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
|
|
86
|
-
|
|
87
|
-
## Order-Sensitive Definition:
|
|
88
|
-
- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
|
|
89
|
-
- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
|
|
90
|
-
|
|
91
|
-
## Output Format:
|
|
92
|
-
|
|
93
|
-
Please return the result in JSON format as follows:
|
|
94
|
-
|
|
95
|
-
\`\`\`json
|
|
96
|
-
{
|
|
97
|
-
"elements": [
|
|
98
|
-
// If no matching elements are found, return an empty array []
|
|
99
|
-
{
|
|
100
|
-
"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
|
|
101
|
-
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
102
|
-
"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
|
|
103
|
-
}
|
|
104
|
-
// More elements...
|
|
105
|
-
],
|
|
106
|
-
"isOrderSensitive": true, // or false, depending on the user's description
|
|
107
|
-
"errors": [] // Array of strings containing any error messages
|
|
108
|
-
}
|
|
109
|
-
\`\`\`
|
|
110
|
-
|
|
111
|
-
## Example:
|
|
112
|
-
Example 1:
|
|
113
|
-
Input Example:
|
|
114
|
-
\`\`\`json
|
|
115
|
-
// Description: "Shopping cart icon in the upper right corner"
|
|
116
|
-
{
|
|
117
|
-
"description": "PLACEHOLDER", // Description of the target element
|
|
118
|
-
"screenshot": "path/screenshot.png",
|
|
119
|
-
"text": '{
|
|
120
|
-
"pageSize": {
|
|
121
|
-
"width": 400, // Width of the page
|
|
122
|
-
"height": 905 // Height of the page
|
|
123
|
-
},
|
|
124
|
-
"elementInfos": [
|
|
125
|
-
{
|
|
126
|
-
"id": "1231", // ID of the element
|
|
127
|
-
"indexId": "0", // Index of the element\u{FF0C}The image is labeled to the left of the element
|
|
128
|
-
"attributes": { // Attributes of the element
|
|
129
|
-
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
130
|
-
"src": "https://ap-southeast-3.m",
|
|
131
|
-
"class": ".img"
|
|
132
|
-
},
|
|
133
|
-
"content": "", // Text content of the element
|
|
134
|
-
"rect": {
|
|
135
|
-
"left": 280, // Distance from the left side of the page
|
|
136
|
-
"top": 8, // Distance from the top of the page
|
|
137
|
-
"width": 44, // Width of the element
|
|
138
|
-
"height": 44 // Height of the element
|
|
139
|
-
}
|
|
140
|
-
},
|
|
141
|
-
{
|
|
142
|
-
"id": "66551", // ID of the element
|
|
143
|
-
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
144
|
-
"attributes": { // Attributes of the element
|
|
145
|
-
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
146
|
-
"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
|
|
147
|
-
"class": ".icon"
|
|
148
|
-
},
|
|
149
|
-
"content": "", // Text content of the element
|
|
150
|
-
"rect": {
|
|
151
|
-
"left": 350, // Distance from the left side of the page
|
|
152
|
-
"top": 16, // Distance from the top of the page
|
|
153
|
-
"width": 25, // Width of the element
|
|
154
|
-
"height": 25 // Height of the element
|
|
155
|
-
}
|
|
156
|
-
},
|
|
157
|
-
...
|
|
158
|
-
{
|
|
159
|
-
"id": "12344",
|
|
160
|
-
"indexId": "2", // Index of the element\u{FF0C}The image is labeled to the left of the element
|
|
161
|
-
"attributes": {
|
|
162
|
-
"nodeType": "TEXT Node",
|
|
163
|
-
"class": ".product-name"
|
|
164
|
-
},
|
|
165
|
-
"center": [
|
|
166
|
-
288,
|
|
167
|
-
834
|
|
168
|
-
],
|
|
169
|
-
"content": "Mango Drink",
|
|
170
|
-
"rect": {
|
|
171
|
-
"left": 188,
|
|
172
|
-
"top": 827,
|
|
173
|
-
"width": 199,
|
|
174
|
-
"height": 13
|
|
175
|
-
}
|
|
176
|
-
},
|
|
177
|
-
...
|
|
178
|
-
]
|
|
179
|
-
}
|
|
180
|
-
'
|
|
181
40
|
}
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
\`\`\`json
|
|
185
|
-
{
|
|
186
|
-
"elements": [
|
|
187
|
-
{
|
|
188
|
-
// Describe the reason for finding this element, replace with actual value in practice
|
|
189
|
-
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
190
|
-
"text": "",
|
|
191
|
-
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
192
|
-
"id": "1231"
|
|
193
|
-
}
|
|
194
|
-
],
|
|
195
|
-
"isOrderSensitive": true,
|
|
196
|
-
"errors": []
|
|
197
|
-
}
|
|
198
|
-
\`\`\`
|
|
199
|
-
|
|
200
|
-
`;
|
|
201
|
-
}
|
|
202
|
-
const locatorSchema = {
|
|
203
|
-
type: 'json_schema',
|
|
204
|
-
json_schema: {
|
|
205
|
-
name: 'find_elements',
|
|
206
|
-
strict: true,
|
|
207
|
-
schema: {
|
|
208
|
-
type: 'object',
|
|
209
|
-
properties: {
|
|
210
|
-
elements: {
|
|
211
|
-
type: 'array',
|
|
212
|
-
items: {
|
|
213
|
-
type: 'object',
|
|
214
|
-
properties: {
|
|
215
|
-
reason: {
|
|
216
|
-
type: 'string',
|
|
217
|
-
description: 'Reason for finding this element'
|
|
218
|
-
},
|
|
219
|
-
text: {
|
|
220
|
-
type: 'string',
|
|
221
|
-
description: 'Text content of the element'
|
|
222
|
-
},
|
|
223
|
-
id: {
|
|
224
|
-
type: 'string',
|
|
225
|
-
description: 'ID of this element'
|
|
226
|
-
}
|
|
227
|
-
},
|
|
228
|
-
required: [
|
|
229
|
-
'reason',
|
|
230
|
-
'text',
|
|
231
|
-
'id'
|
|
232
|
-
],
|
|
233
|
-
additionalProperties: false
|
|
234
|
-
},
|
|
235
|
-
description: 'List of found elements'
|
|
236
|
-
},
|
|
237
|
-
isOrderSensitive: {
|
|
238
|
-
type: 'boolean',
|
|
239
|
-
description: "Whether the targetElementDescription is order-sensitive (true/false)"
|
|
240
|
-
},
|
|
241
|
-
errors: {
|
|
242
|
-
type: 'array',
|
|
243
|
-
items: {
|
|
244
|
-
type: 'string'
|
|
245
|
-
},
|
|
246
|
-
description: 'List of error messages, if any'
|
|
247
|
-
}
|
|
248
|
-
},
|
|
249
|
-
required: [
|
|
250
|
-
'elements',
|
|
251
|
-
'isOrderSensitive',
|
|
252
|
-
'errors'
|
|
253
|
-
],
|
|
254
|
-
additionalProperties: false
|
|
255
|
-
}
|
|
256
|
-
}
|
|
257
|
-
};
|
|
258
|
-
const findElementPrompt = ({ pageDescription, targetElementDescription })=>`
|
|
259
|
-
Here is the item user want to find:
|
|
260
|
-
=====================================
|
|
261
|
-
${targetElementDescription}
|
|
262
|
-
=====================================
|
|
263
|
-
|
|
264
|
-
${pageDescription}
|
|
265
|
-
`;
|
|
266
|
-
export { findElementPrompt, locatorSchema, systemPromptToLocateElement };
|
|
41
|
+
const findElementPrompt = (targetElementDescription)=>`Find: ${targetElementDescription}`;
|
|
42
|
+
export { findElementPrompt, systemPromptToLocateElement };
|
|
267
43
|
|
|
268
44
|
//# sourceMappingURL=llm-locator.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-locator.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["import type { TVlModeTypes } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {\n if (vlMode) {\n const bboxComment = bboxDescription(vlMode);\n return `\n## Role:\nYou are an expert in software testing.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Give the coordinates of the element that matches the user's description best in the screenshot.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[],\n \"isOrderSensitive\": boolean // Whether the targetElementDescription is order-sensitive (true/false)\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description best in the screenshot\n* \\`isOrderSensitive\\` is a boolean indicating whether the user's description is order-sensitive (true/false)\n* \\`errors\\` is an optional array of error messages (if any)\n\nOrder-sensitive means the description contains phrases like:\n- \"the third item in the list\"\n- \"the last button\"\n- \"the first input box\"\n- \"the second row\"\n\nNot order-sensitive means the description is like:\n- \"confirm button\"\n- \"search box\"\n- \"password input\"\n\nFor example, when an element is found and the description is order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found and the description is not order-sensitive:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"isOrderSensitive\": false,\n \"errors\": [\"I can see ..., but {some element} is not found\"]\n}\n\\`\\`\\`\n`;\n }\n\n return `\n## Role:\nYou are an expert in software page image (2D) and page element text analysis.\n\n## Objective:\n- Identify elements in screenshots and text that match the user's description.\n- Return JSON data containing the selection reason and element ID.\n- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).\n\n## Skills:\n- Image analysis and recognition\n- Multilingual text understanding\n- Software UI design and testing\n\n## Workflow:\n1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.\n2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.\n3. Found the required number of elements\n4. Return JSON data containing the selection reason and element ID.\n5. Judge whether the user's description is order-sensitive (see below for definition and examples).\n\n## Constraints:\n- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.\n- Elements in the image with NodeType other than \"TEXT Node\" have been highlighted to identify the element among multiple non-text elements.\n- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.\n- If no elements are found, the \"elements\" array should be empty.\n- The returned data must conform to the specified JSON format.\n- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)\n\n## Order-Sensitive Definition:\n- If the description contains phrases like \"the third item in the list\", \"the last button\", \"the first input box\", \"the second row\", etc., it is order-sensitive (isOrderSensitive = true).\n- If the description is like \"confirm button\", \"search box\", \"password input\", etc., it is not order-sensitive (isOrderSensitive = false).\n\n## Output Format:\n\nPlease return the result in JSON format as follows:\n\n\\`\\`\\`json\n{\n \"elements\": [\n // If no matching elements are found, return an empty array []\n {\n \"reason\": \"PLACEHOLDER\", // The thought process for finding the element, replace PLACEHOLDER with your thought process\n \"text\": \"PLACEHOLDER\", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty\n \"id\": \"PLACEHOLDER\" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo\n }\n // More elements...\n ],\n \"isOrderSensitive\": true, // or false, depending on the user's description\n \"errors\": [] // Array of strings containing any error messages\n}\n\\`\\`\\`\n\n## Example:\nExample 1:\nInput Example:\n\\`\\`\\`json\n// Description: \"Shopping cart icon in the upper right corner\"\n{\n \"description\": \"PLACEHOLDER\", // Description of the target element\n \"screenshot\": \"path/screenshot.png\",\n \"text\": '{\n \"pageSize\": {\n \"width\": 400, // Width of the page\n \"height\": 905 // Height of the page\n },\n \"elementInfos\": [\n {\n \"id\": \"1231\", // ID of the element\n \"indexId\": \"0\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"https://ap-southeast-3.m\",\n \"class\": \".img\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 280, // Distance from the left side of the page\n \"top\": 8, // Distance from the top of the page\n \"width\": 44, // Width of the element\n \"height\": 44 // Height of the element\n }\n },\n {\n \"id\": \"66551\", // ID of the element\n \"indexId\": \"1\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": { // Attributes of the element\n \"nodeType\": \"IMG Node\", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node\n \"src\": \"data:image/png;base64,iVBORw0KGgoAAAANSU...\",\n \"class\": \".icon\"\n },\n \"content\": \"\", // Text content of the element\n \"rect\": {\n \"left\": 350, // Distance from the left side of the page\n \"top\": 16, // Distance from the top of the page\n \"width\": 25, // Width of the element\n \"height\": 25 // Height of the element\n }\n },\n ...\n {\n \"id\": \"12344\",\n \"indexId\": \"2\", // Index of the element,The image is labeled to the left of the element\n \"attributes\": {\n \"nodeType\": \"TEXT Node\",\n \"class\": \".product-name\"\n },\n \"center\": [\n 288,\n 834\n ],\n \"content\": \"Mango Drink\",\n \"rect\": {\n \"left\": 188,\n \"top\": 827,\n \"width\": 199,\n \"height\": 13\n }\n },\n ...\n ]\n }\n '\n}\n\\`\\`\\`\nOutput Example:\n\\`\\`\\`json\n{\n \"elements\": [\n {\n // Describe the reason for finding this element, replace with actual value in practice\n \"reason\": \"Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button\",\n \"text\": \"\",\n // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**\n \"id\": \"1231\"\n }\n ],\n \"isOrderSensitive\": true,\n \"errors\": []\n}\n\\`\\`\\`\n \n `;\n}\n\nexport const locatorSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'find_elements',\n strict: true,\n schema: {\n type: 'object',\n properties: {\n elements: {\n type: 'array',\n items: {\n type: 'object',\n properties: {\n reason: {\n type: 'string',\n description: 'Reason for finding this element',\n },\n text: {\n type: 'string',\n description: 'Text content of the element',\n },\n id: {\n type: 'string',\n description: 'ID of this element',\n },\n },\n required: ['reason', 'text', 'id'],\n additionalProperties: false,\n },\n description: 'List of found elements',\n },\n isOrderSensitive: {\n type: 'boolean',\n description:\n 'Whether the targetElementDescription is order-sensitive (true/false)',\n },\n errors: {\n type: 'array',\n items: {\n type: 'string',\n },\n description: 'List of error messages, if any',\n },\n },\n required: ['elements', 'isOrderSensitive', 'errors'],\n additionalProperties: false,\n },\n },\n};\n\nexport const findElementPrompt = ({\n pageDescription,\n targetElementDescription,\n}: {\n pageDescription: string;\n targetElementDescription: string;\n}) => `\nHere is the item user want to find:\n=====================================\n${targetElementDescription}\n=====================================\n\n${pageDescription}\n `;\n"],"names":["systemPromptToLocateElement","vlMode","bboxComment","bboxDescription","locatorSchema","findElementPrompt","pageDescription","targetElementDescription"],"mappings":";AAGO,SAASA,4BAA4BC,MAAgC;IAC1E,IAAIA,QAAQ;QACV,MAAMC,cAAcC,gBAAgBF;QACpC,OAAO,CAAC;;;;;;;;;;;;gDAYoC,EAAEC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAuC9D,CAAC;IACC;IAEA,OAAO,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EA8IR,CAAC;AACH;AAEO,MAAME,gBAA0C;IACrD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,YAAY;gBACV,UAAU;oBACR,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,YAAY;4BACV,QAAQ;gCACN,MAAM;gCACN,aAAa;4BACf;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,IAAI;gCACF,MAAM;gCACN,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAU;4BAAQ;yBAAK;wBAClC,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,kBAAkB;oBAChB,MAAM;oBACN,aACE;gBACJ;gBACA,QAAQ;oBACN,MAAM;oBACN,OAAO;wBACL,MAAM;oBACR;oBACA,aAAa;gBACf;YACF;YACA,UAAU;gBAAC;gBAAY;gBAAoB;aAAS;YACpD,sBAAsB;QACxB;IACF;AACF;AAEO,MAAMC,oBAAoB,CAAC,EAChCC,eAAe,EACfC,wBAAwB,EAIzB,GAAK,CAAC;;;AAGP,EAAEA,yBAAyB;;;AAG3B,EAAED,gBAAgB;EAChB,CAAC"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-locator.mjs","sources":["../../../../src/ai-model/prompt/llm-locator.ts"],"sourcesContent":["import type { TVlModeTypes } from '@midscene/shared/env';\nimport { bboxDescription } from './common';\nexport function systemPromptToLocateElement(vlMode: TVlModeTypes | undefined) {\n const bboxComment = bboxDescription(vlMode);\n return `\n## Role:\nYou are an AI assistant that helps identify UI elements.\n\n## Objective:\n- Identify elements in screenshots that match the user's description.\n- Provide the coordinates of the element that matches the user's description.\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxComment}\n \"errors\"?: string[]\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` is the bounding box of the element that matches the user's description\n* \\`errors\\` is an optional array of error messages (if any)\n\nFor example, when an element is found:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"errors\": []\n}\n\\`\\`\\`\n\nWhen no element is found:\n\\`\\`\\`json\n{\n \"bbox\": [],\n \"errors\": [\"I can see ..., but {some element} is not found\"]\n}\n\\`\\`\\`\n`;\n}\n\nexport const findElementPrompt = (targetElementDescription: string) =>\n `Find: ${targetElementDescription}`;\n"],"names":["systemPromptToLocateElement","vlMode","bboxComment","bboxDescription","findElementPrompt","targetElementDescription"],"mappings":";AAEO,SAASA,4BAA4BC,MAAgC;IAC1E,MAAMC,cAAcC,gBAAgBF;IACpC,OAAO,CAAC;;;;;;;;;;;gDAWsC,EAAEC,YAAY;;;;;;;;;;;;;;;;;;;;;;;;AAwB9D,CAAC;AACD;AAEO,MAAME,oBAAoB,CAACC,2BAChC,CAAC,MAAM,EAAEA,0BAA0B"}
|