@midscene/core 1.3.0 → 1.3.1-beta-20260128032156.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41) hide show
  1. package/dist/es/agent/agent.mjs +1 -2
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/task-builder.mjs +6 -9
  4. package/dist/es/agent/task-builder.mjs.map +1 -1
  5. package/dist/es/agent/utils.mjs +8 -6
  6. package/dist/es/agent/utils.mjs.map +1 -1
  7. package/dist/es/ai-model/auto-glm/actions.mjs +1 -1
  8. package/dist/es/ai-model/auto-glm/actions.mjs.map +1 -1
  9. package/dist/es/ai-model/inspect.mjs +11 -24
  10. package/dist/es/ai-model/inspect.mjs.map +1 -1
  11. package/dist/es/ai-model/llm-planning.mjs +1 -3
  12. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  13. package/dist/es/ai-model/prompt/llm-planning.mjs +3 -2
  14. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  15. package/dist/es/common.mjs +10 -14
  16. package/dist/es/common.mjs.map +1 -1
  17. package/dist/es/device/index.mjs +1 -16
  18. package/dist/es/device/index.mjs.map +1 -1
  19. package/dist/es/utils.mjs +2 -2
  20. package/dist/lib/agent/agent.js +0 -1
  21. package/dist/lib/agent/agent.js.map +1 -1
  22. package/dist/lib/agent/task-builder.js +5 -8
  23. package/dist/lib/agent/task-builder.js.map +1 -1
  24. package/dist/lib/agent/utils.js +7 -5
  25. package/dist/lib/agent/utils.js.map +1 -1
  26. package/dist/lib/ai-model/auto-glm/actions.js +1 -1
  27. package/dist/lib/ai-model/auto-glm/actions.js.map +1 -1
  28. package/dist/lib/ai-model/inspect.js +10 -23
  29. package/dist/lib/ai-model/inspect.js.map +1 -1
  30. package/dist/lib/ai-model/llm-planning.js +1 -3
  31. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  32. package/dist/lib/ai-model/prompt/llm-planning.js +3 -2
  33. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  34. package/dist/lib/common.js +10 -14
  35. package/dist/lib/common.js.map +1 -1
  36. package/dist/lib/device/index.js +2 -23
  37. package/dist/lib/device/index.js.map +1 -1
  38. package/dist/lib/utils.js +2 -2
  39. package/dist/types/common.d.ts +2 -2
  40. package/dist/types/device/index.d.ts +1 -20
  41. package/package.json +2 -2
@@ -1 +1 @@
1
- {"version":3,"file":"agent/utils.mjs","sources":["../../../src/agent/utils.ts"],"sourcesContent":["import type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport { ScreenshotItem } from '@/screenshot-item';\nimport type {\n ElementCacheFeature,\n LocateResultElement,\n PlanningLocateParam,\n UIContext,\n} from '@/types';\nimport { uploadTestInfoToServer } from '@/utils';\nimport {\n MIDSCENE_REPORT_QUIET,\n MIDSCENE_REPORT_TAG_NAME,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { assert, logMsg, uuid } from '@midscene/shared/utils';\nimport dayjs from 'dayjs';\nimport type { TaskCache } from './task-cache';\nimport { debug as cacheDebug } from './task-cache';\n\nconst debugProfile = getDebug('web:tool:profile');\n\nexport async function commonContextParser(\n interfaceInstance: AbstractInterface,\n _opt: { uploadServerUrl?: string },\n): Promise<UIContext> {\n assert(interfaceInstance, 'interfaceInstance is required');\n\n debugProfile('Getting interface description');\n const description = interfaceInstance.describe?.() || '';\n debugProfile('Interface description end');\n\n debugProfile('Uploading test info to server');\n uploadTestInfoToServer({\n testUrl: description,\n serverUrl: _opt.uploadServerUrl,\n });\n debugProfile('UploadTestInfoToServer end');\n\n const screenshotBase64 = await interfaceInstance.screenshotBase64();\n assert(screenshotBase64!, 'screenshotBase64 is required');\n\n debugProfile('will get size');\n const size = await interfaceInstance.size();\n debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);\n\n const screenshot = ScreenshotItem.create(screenshotBase64!);\n\n return {\n size,\n screenshot,\n };\n}\n\nexport function getReportFileName(tag = 'web') {\n const reportTagName = globalConfigManager.getEnvConfigValue(\n MIDSCENE_REPORT_TAG_NAME,\n );\n const dateTimeInFileName = dayjs().format('YYYY-MM-DD_HH-mm-ss');\n // ensure uniqueness at the same time\n const uniqueId = uuid().substring(0, 8);\n return `${reportTagName || tag}-${dateTimeInFileName}-${uniqueId}`;\n}\n\nexport function printReportMsg(filepath: string) {\n if (globalConfigManager.getEnvConfigInBoolean(MIDSCENE_REPORT_QUIET)) {\n return;\n }\n logMsg(`Midscene - report file updated: ${filepath}`);\n}\n\n/**\n * Get the current execution file name\n * @returns The name of the current execution file\n */\nexport function getCurrentExecutionFile(trace?: string): string | false {\n const error = new Error();\n const stackTrace = trace || error.stack;\n const pkgDir = process.cwd() || '';\n if (stackTrace) {\n const stackLines = stackTrace.split('\\n');\n for (const line of stackLines) {\n if (\n line.includes('.spec.') ||\n line.includes('.test.') ||\n line.includes('.ts') ||\n line.includes('.js')\n ) {\n const match = line.match(/(?:at\\s+)?(.*?\\.(?:spec|test)\\.[jt]s)/);\n if (match?.[1]) {\n const targetFileName = match[1]\n .replace(pkgDir, '')\n .trim()\n .replace('at ', '');\n return targetFileName;\n }\n }\n }\n }\n return false;\n}\n\nconst testFileIndex = new Map<string, number>();\n\nexport function generateCacheId(fileName?: string): string {\n let taskFile = fileName || getCurrentExecutionFile();\n if (!taskFile) {\n taskFile = uuid();\n console.warn(\n 'Midscene - using random UUID for cache id. Cache may be invalid.',\n );\n }\n\n if (testFileIndex.has(taskFile)) {\n const currentIndex = testFileIndex.get(taskFile);\n if (currentIndex !== undefined) {\n testFileIndex.set(taskFile, currentIndex + 1);\n }\n } else {\n testFileIndex.set(taskFile, 1);\n }\n return `${taskFile}-${testFileIndex.get(taskFile)}`;\n}\n\nexport function ifPlanLocateParamIsBbox(\n planLocateParam: PlanningLocateParam,\n): boolean {\n return !!(\n planLocateParam.bbox &&\n Array.isArray(planLocateParam.bbox) &&\n planLocateParam.bbox.length === 4\n );\n}\n\nexport function matchElementFromPlan(\n planLocateParam: PlanningLocateParam,\n): LocateResultElement | undefined {\n if (!planLocateParam) {\n return undefined;\n }\n\n if (planLocateParam.bbox) {\n const centerPosition = {\n x: Math.floor((planLocateParam.bbox[0] + planLocateParam.bbox[2]) / 2),\n y: Math.floor((planLocateParam.bbox[1] + planLocateParam.bbox[3]) / 2),\n };\n\n const element = generateElementByPosition(\n centerPosition,\n typeof planLocateParam.prompt === 'string'\n ? planLocateParam.prompt\n : planLocateParam.prompt?.prompt || '',\n );\n return element;\n }\n\n return undefined;\n}\n\nexport async function matchElementFromCache(\n context: {\n taskCache?: TaskCache;\n interfaceInstance: AbstractInterface;\n },\n cacheEntry: ElementCacheFeature | undefined,\n cachePrompt: TUserPrompt,\n cacheable: boolean | undefined,\n): Promise<LocateResultElement | undefined> {\n if (!cacheEntry) {\n return undefined;\n }\n\n if (cacheable === false) {\n cacheDebug('cache disabled for prompt: %s', cachePrompt);\n return undefined;\n }\n\n if (!context.taskCache?.isCacheResultUsed) {\n return undefined;\n }\n\n if (!context.interfaceInstance.rectMatchesCacheFeature) {\n cacheDebug(\n 'interface does not implement rectMatchesCacheFeature, skip cache',\n );\n return undefined;\n }\n\n try {\n const rect =\n await context.interfaceInstance.rectMatchesCacheFeature(cacheEntry);\n const element: LocateResultElement = {\n center: [\n Math.round(rect.left + rect.width / 2),\n Math.round(rect.top + rect.height / 2),\n ],\n rect,\n description:\n typeof cachePrompt === 'string'\n ? cachePrompt\n : cachePrompt.prompt || '',\n };\n\n cacheDebug('cache hit, prompt: %s', cachePrompt);\n return element;\n } catch (error) {\n cacheDebug('rectMatchesCacheFeature error: %s', error);\n return undefined;\n }\n}\n\ndeclare const __VERSION__: string | undefined;\n\nexport const getMidsceneVersion = (): string => {\n if (typeof __VERSION__ !== 'undefined') {\n return __VERSION__;\n } else if (\n process.env.__VERSION__ &&\n process.env.__VERSION__ !== 'undefined'\n ) {\n return process.env.__VERSION__;\n }\n throw new Error('__VERSION__ inject failed during build');\n};\n\nexport const parsePrompt = (\n prompt: TUserPrompt,\n): {\n textPrompt: string;\n multimodalPrompt?: TMultimodalPrompt;\n} => {\n if (typeof prompt === 'string') {\n return {\n textPrompt: prompt,\n multimodalPrompt: undefined,\n };\n }\n return {\n textPrompt: prompt.prompt,\n multimodalPrompt: prompt.images\n ? {\n images: prompt.images,\n convertHttpImage2Base64: !!prompt.convertHttpImage2Base64,\n }\n : undefined,\n };\n};\n"],"names":["debugProfile","getDebug","commonContextParser","interfaceInstance","_opt","assert","description","uploadTestInfoToServer","screenshotBase64","size","screenshot","ScreenshotItem","getReportFileName","tag","reportTagName","globalConfigManager","MIDSCENE_REPORT_TAG_NAME","dateTimeInFileName","dayjs","uniqueId","uuid","printReportMsg","filepath","MIDSCENE_REPORT_QUIET","logMsg","getCurrentExecutionFile","trace","error","Error","stackTrace","pkgDir","process","stackLines","line","match","targetFileName","testFileIndex","Map","generateCacheId","fileName","taskFile","console","currentIndex","undefined","ifPlanLocateParamIsBbox","planLocateParam","Array","matchElementFromPlan","centerPosition","Math","element","generateElementByPosition","matchElementFromCache","context","cacheEntry","cachePrompt","cacheable","cacheDebug","rect","getMidsceneVersion","__VERSION__","parsePrompt","prompt"],"mappings":";;;;;;;;AAuBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,oBACpBC,iBAAoC,EACpCC,IAAkC;IAElCC,OAAOF,mBAAmB;IAE1BH,aAAa;IACb,MAAMM,cAAcH,kBAAkB,QAAQ,QAAQ;IACtDH,aAAa;IAEbA,aAAa;IACbO,uBAAuB;QACrB,SAASD;QACT,WAAWF,KAAK,eAAe;IACjC;IACAJ,aAAa;IAEb,MAAMQ,mBAAmB,MAAML,kBAAkB,gBAAgB;IACjEE,OAAOG,kBAAmB;IAE1BR,aAAa;IACb,MAAMS,OAAO,MAAMN,kBAAkB,IAAI;IACzCH,aAAa,CAAC,MAAM,EAAES,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,MAAM,EAAEA,KAAK,GAAG,EAAE;IAElE,MAAMC,aAAaC,eAAe,MAAM,CAACH;IAEzC,OAAO;QACLC;QACAC;IACF;AACF;AAEO,SAASE,kBAAkBC,MAAM,KAAK;IAC3C,MAAMC,gBAAgBC,oBAAoB,iBAAiB,CACzDC;IAEF,MAAMC,qBAAqBC,QAAQ,MAAM,CAAC;IAE1C,MAAMC,WAAWC,OAAO,SAAS,CAAC,GAAG;IACrC,OAAO,GAAGN,iBAAiBD,IAAI,CAAC,EAAEI,mBAAmB,CAAC,EAAEE,UAAU;AACpE;AAEO,SAASE,eAAeC,QAAgB;IAC7C,IAAIP,oBAAoB,qBAAqB,CAACQ,wBAC5C;IAEFC,OAAO,CAAC,gCAAgC,EAAEF,UAAU;AACtD;AAMO,SAASG,wBAAwBC,KAAc;IACpD,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,aAAaH,SAASC,MAAM,KAAK;IACvC,MAAMG,SAASC,QAAQ,GAAG,MAAM;IAChC,IAAIF,YAAY;QACd,MAAMG,aAAaH,WAAW,KAAK,CAAC;QACpC,KAAK,MAAMI,QAAQD,WACjB,IACEC,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,UACdA,KAAK,QAAQ,CAAC,QACd;YACA,MAAMC,QAAQD,KAAK,KAAK,CAAC;YACzB,IAAIC,OAAO,CAAC,EAAE,EAAE;gBACd,MAAMC,iBAAiBD,KAAK,CAAC,EAAE,CAC5B,OAAO,CAACJ,QAAQ,IAChB,IAAI,GACJ,OAAO,CAAC,OAAO;gBAClB,OAAOK;YACT;QACF;IAEJ;IACA,OAAO;AACT;AAEA,MAAMC,gBAAgB,IAAIC;AAEnB,SAASC,gBAAgBC,QAAiB;IAC/C,IAAIC,WAAWD,YAAYd;IAC3B,IAAI,CAACe,UAAU;QACbA,WAAWpB;QACXqB,QAAQ,IAAI,CACV;IAEJ;IAEA,IAAIL,cAAc,GAAG,CAACI,WAAW;QAC/B,MAAME,eAAeN,cAAc,GAAG,CAACI;QACvC,IAAIE,AAAiBC,WAAjBD,cACFN,cAAc,GAAG,CAACI,UAAUE,eAAe;IAE/C,OACEN,cAAc,GAAG,CAACI,UAAU;IAE9B,OAAO,GAAGA,SAAS,CAAC,EAAEJ,cAAc,GAAG,CAACI,WAAW;AACrD;AAEO,SAASI,wBACdC,eAAoC;IAEpC,OAAO,CAAC,CACNA,CAAAA,gBAAgB,IAAI,IACpBC,MAAM,OAAO,CAACD,gBAAgB,IAAI,KAClCA,AAAgC,MAAhCA,gBAAgB,IAAI,CAAC,MAAM,AAAK;AAEpC;AAEO,SAASE,qBACdF,eAAoC;IAEpC,IAAI,CAACA,iBACH;IAGF,IAAIA,gBAAgB,IAAI,EAAE;QACxB,MAAMG,iBAAiB;YACrB,GAAGC,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;YACpE,GAAGI,KAAK,KAAK,CAAEJ,AAAAA,CAAAA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAC,IAAK;QACtE;QAEA,MAAMK,UAAUC,0BACdH,gBACA,AAAkC,YAAlC,OAAOH,gBAAgB,MAAM,GACzBA,gBAAgB,MAAM,GACtBA,gBAAgB,MAAM,EAAE,UAAU;QAExC,OAAOK;IACT;AAGF;AAEO,eAAeE,sBACpBC,OAGC,EACDC,UAA2C,EAC3CC,WAAwB,EACxBC,SAA8B;IAE9B,IAAI,CAACF,YACH;IAGF,IAAIE,AAAc,UAAdA,WAAqB,YACvBC,MAAW,iCAAiCF;IAI9C,IAAI,CAACF,QAAQ,SAAS,EAAE,mBACtB;IAGF,IAAI,CAACA,QAAQ,iBAAiB,CAAC,uBAAuB,EAAE,YACtDI,MACE;IAKJ,IAAI;QACF,MAAMC,OACJ,MAAML,QAAQ,iBAAiB,CAAC,uBAAuB,CAACC;QAC1D,MAAMJ,UAA+B;YACnC,QAAQ;gBACND,KAAK,KAAK,CAACS,KAAK,IAAI,GAAGA,KAAK,KAAK,GAAG;gBACpCT,KAAK,KAAK,CAACS,KAAK,GAAG,GAAGA,KAAK,MAAM,GAAG;aACrC;YACDA;YACA,aACE,AAAuB,YAAvB,OAAOH,cACHA,cACAA,YAAY,MAAM,IAAI;QAC9B;QAEAE,MAAW,yBAAyBF;QACpC,OAAOL;IACT,EAAE,OAAOvB,OAAO;QACd8B,MAAW,qCAAqC9B;QAChD;IACF;AACF;AAIO,MAAMgC,qBAAqB,IAEvBC;AAUJ,MAAMC,cAAc,CACzBC;IAKA,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAO;QACL,YAAYA;QACZ,kBAAkBnB;IACpB;IAEF,OAAO;QACL,YAAYmB,OAAO,MAAM;QACzB,kBAAkBA,OAAO,MAAM,GAC3B;YACE,QAAQA,OAAO,MAAM;YACrB,yBAAyB,CAAC,CAACA,OAAO,uBAAuB;QAC3D,IACAnB;IACN;AACF"}
1
+ {"version":3,"file":"agent/utils.mjs","sources":["../../../src/agent/utils.ts"],"sourcesContent":["import type { TMultimodalPrompt, TUserPrompt } from '@/common';\nimport type { AbstractInterface } from '@/device';\nimport { ScreenshotItem } from '@/screenshot-item';\nimport type {\n ElementCacheFeature,\n LocateResultElement,\n PlanningLocateParam,\n UIContext,\n} from '@/types';\nimport { uploadTestInfoToServer } from '@/utils';\nimport {\n MIDSCENE_REPORT_QUIET,\n MIDSCENE_REPORT_TAG_NAME,\n globalConfigManager,\n} from '@midscene/shared/env';\nimport { generateElementByRect } from '@midscene/shared/extractor';\nimport { getDebug } from '@midscene/shared/logger';\nimport { _keyDefinitions } from '@midscene/shared/us-keyboard-layout';\nimport { assert, logMsg, uuid } from '@midscene/shared/utils';\nimport dayjs from 'dayjs';\nimport type { TaskCache } from './task-cache';\nimport { debug as cacheDebug } from './task-cache';\n\nconst debugProfile = getDebug('web:tool:profile');\n\nexport async function commonContextParser(\n interfaceInstance: AbstractInterface,\n _opt: { uploadServerUrl?: string },\n): Promise<UIContext> {\n assert(interfaceInstance, 'interfaceInstance is required');\n\n debugProfile('Getting interface description');\n const description = interfaceInstance.describe?.() || '';\n debugProfile('Interface description end');\n\n debugProfile('Uploading test info to server');\n uploadTestInfoToServer({\n testUrl: description,\n serverUrl: _opt.uploadServerUrl,\n });\n debugProfile('UploadTestInfoToServer end');\n\n const screenshotBase64 = await interfaceInstance.screenshotBase64();\n assert(screenshotBase64!, 'screenshotBase64 is required');\n\n debugProfile('will get size');\n const size = await interfaceInstance.size();\n debugProfile(`size: ${size.width}x${size.height} dpr: ${size.dpr}`);\n\n const screenshot = ScreenshotItem.create(screenshotBase64!);\n\n return {\n size,\n screenshot,\n };\n}\n\nexport function getReportFileName(tag = 'web') {\n const reportTagName = globalConfigManager.getEnvConfigValue(\n MIDSCENE_REPORT_TAG_NAME,\n );\n const dateTimeInFileName = dayjs().format('YYYY-MM-DD_HH-mm-ss');\n // ensure uniqueness at the same time\n const uniqueId = uuid().substring(0, 8);\n return `${reportTagName || tag}-${dateTimeInFileName}-${uniqueId}`;\n}\n\nexport function printReportMsg(filepath: string) {\n if (globalConfigManager.getEnvConfigInBoolean(MIDSCENE_REPORT_QUIET)) {\n return;\n }\n logMsg(`Midscene - report file updated: ${filepath}`);\n}\n\n/**\n * Get the current execution file name\n * @returns The name of the current execution file\n */\nexport function getCurrentExecutionFile(trace?: string): string | false {\n const error = new Error();\n const stackTrace = trace || error.stack;\n const pkgDir = process.cwd() || '';\n if (stackTrace) {\n const stackLines = stackTrace.split('\\n');\n for (const line of stackLines) {\n if (\n line.includes('.spec.') ||\n line.includes('.test.') ||\n line.includes('.ts') ||\n line.includes('.js')\n ) {\n const match = line.match(/(?:at\\s+)?(.*?\\.(?:spec|test)\\.[jt]s)/);\n if (match?.[1]) {\n const targetFileName = match[1]\n .replace(pkgDir, '')\n .trim()\n .replace('at ', '');\n return targetFileName;\n }\n }\n }\n }\n return false;\n}\n\nconst testFileIndex = new Map<string, number>();\n\nexport function generateCacheId(fileName?: string): string {\n let taskFile = fileName || getCurrentExecutionFile();\n if (!taskFile) {\n taskFile = uuid();\n console.warn(\n 'Midscene - using random UUID for cache id. Cache may be invalid.',\n );\n }\n\n if (testFileIndex.has(taskFile)) {\n const currentIndex = testFileIndex.get(taskFile);\n if (currentIndex !== undefined) {\n testFileIndex.set(taskFile, currentIndex + 1);\n }\n } else {\n testFileIndex.set(taskFile, 1);\n }\n return `${taskFile}-${testFileIndex.get(taskFile)}`;\n}\n\nexport function ifPlanLocateParamIsBbox(\n planLocateParam: PlanningLocateParam,\n): boolean {\n return !!(\n planLocateParam.bbox &&\n Array.isArray(planLocateParam.bbox) &&\n planLocateParam.bbox.length === 4\n );\n}\n\nexport function matchElementFromPlan(\n planLocateParam: PlanningLocateParam,\n): LocateResultElement | undefined {\n if (!planLocateParam) {\n return undefined;\n }\n\n if (planLocateParam.bbox) {\n // Convert bbox [x1, y1, x2, y2] to rect {left, top, width, height}\n const rect = {\n left: planLocateParam.bbox[0],\n top: planLocateParam.bbox[1],\n width: planLocateParam.bbox[2] - planLocateParam.bbox[0] + 1,\n height: planLocateParam.bbox[3] - planLocateParam.bbox[1] + 1,\n };\n\n const element = generateElementByRect(\n rect,\n typeof planLocateParam.prompt === 'string'\n ? planLocateParam.prompt\n : planLocateParam.prompt?.prompt || '',\n );\n return element;\n }\n\n return undefined;\n}\n\nexport async function matchElementFromCache(\n context: {\n taskCache?: TaskCache;\n interfaceInstance: AbstractInterface;\n },\n cacheEntry: ElementCacheFeature | undefined,\n cachePrompt: TUserPrompt,\n cacheable: boolean | undefined,\n): Promise<LocateResultElement | undefined> {\n if (!cacheEntry) {\n return undefined;\n }\n\n if (cacheable === false) {\n cacheDebug('cache disabled for prompt: %s', cachePrompt);\n return undefined;\n }\n\n if (!context.taskCache?.isCacheResultUsed) {\n return undefined;\n }\n\n if (!context.interfaceInstance.rectMatchesCacheFeature) {\n cacheDebug(\n 'interface does not implement rectMatchesCacheFeature, skip cache',\n );\n return undefined;\n }\n\n try {\n const rect =\n await context.interfaceInstance.rectMatchesCacheFeature(cacheEntry);\n const element: LocateResultElement = {\n center: [\n Math.round(rect.left + rect.width / 2),\n Math.round(rect.top + rect.height / 2),\n ],\n rect,\n description:\n typeof cachePrompt === 'string'\n ? cachePrompt\n : cachePrompt.prompt || '',\n };\n\n cacheDebug('cache hit, prompt: %s', cachePrompt);\n return element;\n } catch (error) {\n cacheDebug('rectMatchesCacheFeature error: %s', error);\n return undefined;\n }\n}\n\ndeclare const __VERSION__: string | undefined;\n\nexport const getMidsceneVersion = (): string => {\n if (typeof __VERSION__ !== 'undefined') {\n return __VERSION__;\n } else if (\n process.env.__VERSION__ &&\n process.env.__VERSION__ !== 'undefined'\n ) {\n return process.env.__VERSION__;\n }\n throw new Error('__VERSION__ inject failed during build');\n};\n\nexport const parsePrompt = (\n prompt: TUserPrompt,\n): {\n textPrompt: string;\n multimodalPrompt?: TMultimodalPrompt;\n} => {\n if (typeof prompt === 'string') {\n return {\n textPrompt: prompt,\n multimodalPrompt: undefined,\n };\n }\n return {\n textPrompt: prompt.prompt,\n multimodalPrompt: prompt.images\n ? {\n images: prompt.images,\n convertHttpImage2Base64: !!prompt.convertHttpImage2Base64,\n }\n : undefined,\n };\n};\n"],"names":["debugProfile","getDebug","commonContextParser","interfaceInstance","_opt","assert","description","uploadTestInfoToServer","screenshotBase64","size","screenshot","ScreenshotItem","getReportFileName","tag","reportTagName","globalConfigManager","MIDSCENE_REPORT_TAG_NAME","dateTimeInFileName","dayjs","uniqueId","uuid","printReportMsg","filepath","MIDSCENE_REPORT_QUIET","logMsg","getCurrentExecutionFile","trace","error","Error","stackTrace","pkgDir","process","stackLines","line","match","targetFileName","testFileIndex","Map","generateCacheId","fileName","taskFile","console","currentIndex","undefined","ifPlanLocateParamIsBbox","planLocateParam","Array","matchElementFromPlan","rect","element","generateElementByRect","matchElementFromCache","context","cacheEntry","cachePrompt","cacheable","cacheDebug","Math","getMidsceneVersion","__VERSION__","parsePrompt","prompt"],"mappings":";;;;;;;;AAuBA,MAAMA,eAAeC,SAAS;AAEvB,eAAeC,oBACpBC,iBAAoC,EACpCC,IAAkC;IAElCC,OAAOF,mBAAmB;IAE1BH,aAAa;IACb,MAAMM,cAAcH,kBAAkB,QAAQ,QAAQ;IACtDH,aAAa;IAEbA,aAAa;IACbO,uBAAuB;QACrB,SAASD;QACT,WAAWF,KAAK,eAAe;IACjC;IACAJ,aAAa;IAEb,MAAMQ,mBAAmB,MAAML,kBAAkB,gBAAgB;IACjEE,OAAOG,kBAAmB;IAE1BR,aAAa;IACb,MAAMS,OAAO,MAAMN,kBAAkB,IAAI;IACzCH,aAAa,CAAC,MAAM,EAAES,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,MAAM,EAAEA,KAAK,GAAG,EAAE;IAElE,MAAMC,aAAaC,eAAe,MAAM,CAACH;IAEzC,OAAO;QACLC;QACAC;IACF;AACF;AAEO,SAASE,kBAAkBC,MAAM,KAAK;IAC3C,MAAMC,gBAAgBC,oBAAoB,iBAAiB,CACzDC;IAEF,MAAMC,qBAAqBC,QAAQ,MAAM,CAAC;IAE1C,MAAMC,WAAWC,OAAO,SAAS,CAAC,GAAG;IACrC,OAAO,GAAGN,iBAAiBD,IAAI,CAAC,EAAEI,mBAAmB,CAAC,EAAEE,UAAU;AACpE;AAEO,SAASE,eAAeC,QAAgB;IAC7C,IAAIP,oBAAoB,qBAAqB,CAACQ,wBAC5C;IAEFC,OAAO,CAAC,gCAAgC,EAAEF,UAAU;AACtD;AAMO,SAASG,wBAAwBC,KAAc;IACpD,MAAMC,QAAQ,IAAIC;IAClB,MAAMC,aAAaH,SAASC,MAAM,KAAK;IACvC,MAAMG,SAASC,QAAQ,GAAG,MAAM;IAChC,IAAIF,YAAY;QACd,MAAMG,aAAaH,WAAW,KAAK,CAAC;QACpC,KAAK,MAAMI,QAAQD,WACjB,IACEC,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,aACdA,KAAK,QAAQ,CAAC,UACdA,KAAK,QAAQ,CAAC,QACd;YACA,MAAMC,QAAQD,KAAK,KAAK,CAAC;YACzB,IAAIC,OAAO,CAAC,EAAE,EAAE;gBACd,MAAMC,iBAAiBD,KAAK,CAAC,EAAE,CAC5B,OAAO,CAACJ,QAAQ,IAChB,IAAI,GACJ,OAAO,CAAC,OAAO;gBAClB,OAAOK;YACT;QACF;IAEJ;IACA,OAAO;AACT;AAEA,MAAMC,gBAAgB,IAAIC;AAEnB,SAASC,gBAAgBC,QAAiB;IAC/C,IAAIC,WAAWD,YAAYd;IAC3B,IAAI,CAACe,UAAU;QACbA,WAAWpB;QACXqB,QAAQ,IAAI,CACV;IAEJ;IAEA,IAAIL,cAAc,GAAG,CAACI,WAAW;QAC/B,MAAME,eAAeN,cAAc,GAAG,CAACI;QACvC,IAAIE,AAAiBC,WAAjBD,cACFN,cAAc,GAAG,CAACI,UAAUE,eAAe;IAE/C,OACEN,cAAc,GAAG,CAACI,UAAU;IAE9B,OAAO,GAAGA,SAAS,CAAC,EAAEJ,cAAc,GAAG,CAACI,WAAW;AACrD;AAEO,SAASI,wBACdC,eAAoC;IAEpC,OAAO,CAAC,CACNA,CAAAA,gBAAgB,IAAI,IACpBC,MAAM,OAAO,CAACD,gBAAgB,IAAI,KAClCA,AAAgC,MAAhCA,gBAAgB,IAAI,CAAC,MAAM,AAAK;AAEpC;AAEO,SAASE,qBACdF,eAAoC;IAEpC,IAAI,CAACA,iBACH;IAGF,IAAIA,gBAAgB,IAAI,EAAE;QAExB,MAAMG,OAAO;YACX,MAAMH,gBAAgB,IAAI,CAAC,EAAE;YAC7B,KAAKA,gBAAgB,IAAI,CAAC,EAAE;YAC5B,OAAOA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAE,GAAG;YAC3D,QAAQA,gBAAgB,IAAI,CAAC,EAAE,GAAGA,gBAAgB,IAAI,CAAC,EAAE,GAAG;QAC9D;QAEA,MAAMI,UAAUC,sBACdF,MACA,AAAkC,YAAlC,OAAOH,gBAAgB,MAAM,GACzBA,gBAAgB,MAAM,GACtBA,gBAAgB,MAAM,EAAE,UAAU;QAExC,OAAOI;IACT;AAGF;AAEO,eAAeE,sBACpBC,OAGC,EACDC,UAA2C,EAC3CC,WAAwB,EACxBC,SAA8B;IAE9B,IAAI,CAACF,YACH;IAGF,IAAIE,AAAc,UAAdA,WAAqB,YACvBC,MAAW,iCAAiCF;IAI9C,IAAI,CAACF,QAAQ,SAAS,EAAE,mBACtB;IAGF,IAAI,CAACA,QAAQ,iBAAiB,CAAC,uBAAuB,EAAE,YACtDI,MACE;IAKJ,IAAI;QACF,MAAMR,OACJ,MAAMI,QAAQ,iBAAiB,CAAC,uBAAuB,CAACC;QAC1D,MAAMJ,UAA+B;YACnC,QAAQ;gBACNQ,KAAK,KAAK,CAACT,KAAK,IAAI,GAAGA,KAAK,KAAK,GAAG;gBACpCS,KAAK,KAAK,CAACT,KAAK,GAAG,GAAGA,KAAK,MAAM,GAAG;aACrC;YACDA;YACA,aACE,AAAuB,YAAvB,OAAOM,cACHA,cACAA,YAAY,MAAM,IAAI;QAC9B;QAEAE,MAAW,yBAAyBF;QACpC,OAAOL;IACT,EAAE,OAAOtB,OAAO;QACd6B,MAAW,qCAAqC7B;QAChD;IACF;AACF;AAIO,MAAM+B,qBAAqB,IAEvBC;AAUJ,MAAMC,cAAc,CACzBC;IAKA,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAO;QACL,YAAYA;QACZ,kBAAkBlB;IACpB;IAEF,OAAO;QACL,YAAYkB,OAAO,MAAM;QACzB,kBAAkBA,OAAO,MAAM,GAC3B;YACE,QAAQA,OAAO,MAAM;YACrB,yBAAyB,CAAC,CAACA,OAAO,uBAAuB;QAC3D,IACAlB;IACN;AACF"}
@@ -4,7 +4,7 @@ const debug = getDebug('auto-glm-actions');
4
4
  const AUTO_GLM_COORDINATE_MAX = 1000;
5
5
  function autoGLMCoordinateToBbox(x, y, width, height) {
6
6
  const bbox = pointToBbox(x, y, 10);
7
- return adaptBbox(bbox, width, height, width, height, 'auto-glm');
7
+ return adaptBbox(bbox, width, height, 'auto-glm');
8
8
  }
9
9
  function transformAutoGLMAction(action, size) {
10
10
  try {
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/auto-glm/actions.mjs","sources":["../../../../src/ai-model/auto-glm/actions.ts"],"sourcesContent":["import { adaptBbox, pointToBbox } from '@/common';\nimport type { PlanningAction } from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\n\nconst debug = getDebug('auto-glm-actions');\n\n/**\n * Auto-GLM coordinate system range: [0, AUTO_GLM_COORDINATE_MAX]\n */\nconst AUTO_GLM_COORDINATE_MAX = 1000;\n\n/**\n * Convert auto-glm coordinate [0,1000] to bbox in pixel coordinates\n */\nfunction autoGLMCoordinateToBbox(\n x: number,\n y: number,\n width: number,\n height: number,\n): [number, number, number, number] {\n const bbox = pointToBbox(x, y, 10);\n return adaptBbox(bbox, width, height, width, height, 'auto-glm');\n}\n\nexport interface BaseAction {\n _metadata: string;\n think?: string;\n}\n\nexport interface TapAction extends BaseAction {\n _metadata: 'do';\n action: 'Tap';\n element: [number, number];\n}\n\nexport interface DoubleTapAction extends BaseAction {\n _metadata: 'do';\n action: 'Double Tap';\n element: [number, number];\n}\n\nexport interface TypeAction extends BaseAction {\n _metadata: 'do';\n action: 'Type';\n text: string;\n}\n\nexport interface SwipeAction extends BaseAction {\n _metadata: 'do';\n action: 'Swipe';\n start: [number, number];\n end: [number, number];\n}\n\nexport interface LongPressAction extends BaseAction {\n _metadata: 'do';\n action: 'Long Press';\n element: [number, number];\n}\n\nexport interface LaunchAction extends BaseAction {\n _metadata: 'do';\n action: 'Launch';\n app: string;\n}\n\nexport interface BackAction extends BaseAction {\n _metadata: 'do';\n action: 'Back';\n}\n\nexport interface HomeAction extends BaseAction {\n _metadata: 'do';\n action: 'Home';\n}\n\nexport interface WaitAction extends BaseAction {\n _metadata: 'do';\n action: 'Wait';\n durationMs: number;\n}\n\nexport interface InteractAction extends BaseAction {\n _metadata: 'do';\n action: 'Interact';\n}\n\nexport interface CallAPIAction extends BaseAction {\n _metadata: 'do';\n action: 'Call_API';\n instruction: string;\n}\n\nexport interface TakeoverAction extends BaseAction {\n _metadata: 'do';\n action: 'Take_over';\n message: string;\n}\n\nexport interface NoteAction extends BaseAction {\n _metadata: 'do';\n action: 'Note';\n message: string;\n}\n\nexport interface FinishAction extends BaseAction {\n _metadata: 'finish';\n message: string;\n}\n\nexport type ParsedAction =\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction\n | FinishAction;\n\nexport function transformAutoGLMAction(\n action: ParsedAction,\n size: { width: number; height: number },\n): PlanningAction[] {\n try {\n switch (action._metadata) {\n case 'finish': {\n const finishAction = action as FinishAction;\n debug('Transform finish action:', finishAction);\n return [\n {\n type: 'Finished',\n param: {},\n thought: finishAction.message,\n },\n ];\n }\n case 'do': {\n const doAction = action as\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction;\n\n switch ((doAction as any).action) {\n case 'Tap': {\n const tapAction = doAction as TapAction;\n debug('Transform Tap action:', tapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n tapAction.element[0],\n tapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'Tap',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Double Tap': {\n const doubleTapAction = doAction as DoubleTapAction;\n debug('Transform Double Tap action:', doubleTapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n doubleTapAction.element[0],\n doubleTapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'DoubleClick',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Type': {\n const typeAction = doAction as TypeAction;\n debug('Transform Type action:', typeAction);\n\n return [\n {\n type: 'Input',\n param: {\n value: typeAction.text,\n },\n },\n ];\n }\n case 'Swipe': {\n const swipeAction = doAction as SwipeAction;\n debug('Transform Swipe action:', swipeAction);\n\n // Calculate locate using start coordinate\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n swipeAction.start[0],\n swipeAction.start[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n // Calculate horizontal and vertical delta in [0,AUTO_GLM_COORDINATE_MAX] coordinate system\n const deltaX = swipeAction.end[0] - swipeAction.start[0];\n const deltaY = swipeAction.end[1] - swipeAction.start[1];\n\n // Determine direction and distance\n let direction: 'up' | 'down' | 'left' | 'right';\n let distance: number;\n\n const absDeltaX = Math.abs(deltaX);\n const absDeltaY = Math.abs(deltaY);\n\n if (absDeltaY > absDeltaX) {\n // Vertical scroll\n distance = Math.round(\n (absDeltaY * size.height) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaY > 0 ? 'up' : 'down';\n } else {\n // Horizontal scroll\n distance = Math.round(\n (absDeltaX * size.width) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaX > 0 ? 'left' : 'right';\n }\n\n debug(\n `Calculate swipe direction: ${direction}, distance: ${distance}`,\n );\n\n return [\n {\n type: 'Scroll',\n param: {\n locate,\n // The scrolling direction here all refers to which direction of the page's content will appear on the screen.\n distance,\n direction,\n },\n thought: swipeAction.think || '',\n },\n ];\n }\n case 'Long Press': {\n const longPressAction = doAction as LongPressAction;\n debug('Transform Long Press action:', longPressAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n longPressAction.element[0],\n longPressAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'LongPress',\n param: {\n locate,\n },\n thought: longPressAction.think || '',\n },\n ];\n }\n case 'Back': {\n const backAction = doAction as BackAction;\n debug('Transform Back action:', backAction);\n return [\n {\n type: 'AndroidBackButton',\n param: {},\n thought: backAction.think || '',\n },\n ];\n }\n case 'Home': {\n const homeAction = doAction as HomeAction;\n debug('Transform Home action:', homeAction);\n return [\n {\n type: 'AndroidHomeButton',\n param: {},\n thought: homeAction.think || '',\n },\n ];\n }\n case 'Wait': {\n const waitAction = doAction as WaitAction;\n debug('Transform Wait action:', waitAction);\n return [\n {\n type: 'Sleep',\n param: {\n timeMs: waitAction.durationMs,\n },\n thought: waitAction.think || '',\n },\n ];\n }\n case 'Launch': {\n const launchAction = doAction as LaunchAction;\n debug('Transform Launch action:', launchAction);\n return [\n {\n type: 'Launch',\n param: launchAction.app,\n thought: launchAction.think || '',\n },\n ];\n }\n case 'Interact': {\n throw new Error(\n `Action \"Interact\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Call_API': {\n throw new Error(\n `Action \"Call_API\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Take_over': {\n throw new Error(\n `Action \"Take_over\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Note': {\n throw new Error(\n `Action \"Note\" from auto-glm is not supported in the current implementation.`,\n );\n }\n default:\n throw new Error(\n `Unknown do() action type: ${(doAction as any).action}`,\n );\n }\n }\n default:\n throw new Error(\n `Unknown action metadata: ${(action as any)._metadata}`,\n );\n }\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n debug('Transform error:', errorMessage);\n throw new Error(`Failed to transform action: ${errorMessage}`);\n }\n}\n"],"names":["debug","getDebug","AUTO_GLM_COORDINATE_MAX","autoGLMCoordinateToBbox","x","y","width","height","bbox","pointToBbox","adaptBbox","transformAutoGLMAction","action","size","finishAction","doAction","tapAction","x1","y1","x2","y2","locate","doubleTapAction","typeAction","swipeAction","deltaX","deltaY","direction","distance","absDeltaX","Math","absDeltaY","longPressAction","backAction","homeAction","waitAction","launchAction","Error","error","errorMessage","String"],"mappings":";;AAIA,MAAMA,QAAQC,SAAS;AAKvB,MAAMC,0BAA0B;AAKhC,SAASC,wBACPC,CAAS,EACTC,CAAS,EACTC,KAAa,EACbC,MAAc;IAEd,MAAMC,OAAOC,YAAYL,GAAGC,GAAG;IAC/B,OAAOK,UAAUF,MAAMF,OAAOC,QAAQD,OAAOC,QAAQ;AACvD;AAwGO,SAASI,uBACdC,MAAoB,EACpBC,IAAuC;IAEvC,IAAI;QACF,OAAQD,OAAO,SAAS;YACtB,KAAK;gBAAU;oBACb,MAAME,eAAeF;oBACrBZ,MAAM,4BAA4Bc;oBAClC,OAAO;wBACL;4BACE,MAAM;4BACN,OAAO,CAAC;4BACR,SAASA,aAAa,OAAO;wBAC/B;qBACD;gBACH;YACA,KAAK;gBAAM;oBACT,MAAMC,WAAWH;oBAejB,OAASG,SAAiB,MAAM;wBAC9B,KAAK;4BAAO;gCACV,MAAMC,YAAYD;gCAClBf,MAAM,yBAAyBgB;gCAC/B,MAAM,CAACC,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBa,UAAU,OAAO,CAAC,EAAE,EACpBA,UAAU,OAAO,CAAC,EAAE,EACpBH,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMC,kBAAkBP;gCACxBf,MAAM,gCAAgCsB;gCACtC,MAAM,CAACL,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBmB,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BT,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAME,aAAaR;gCACnBf,MAAM,0BAA0BuB;gCAEhC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,OAAOA,WAAW,IAAI;wCACxB;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAS;gCACZ,MAAMC,cAAcT;gCACpBf,MAAM,2BAA2BwB;gCAGjC,MAAM,CAACP,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBqB,YAAY,KAAK,CAAC,EAAE,EACpBA,YAAY,KAAK,CAAC,EAAE,EACpBX,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAGA,MAAMK,SAASD,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCACxD,MAAME,SAASF,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCAGxD,IAAIG;gCACJ,IAAIC;gCAEJ,MAAMC,YAAYC,KAAK,GAAG,CAACL;gCAC3B,MAAMM,YAAYD,KAAK,GAAG,CAACJ;gCAE3B,IAAIK,YAAYF,WAAW;oCAEzBD,WAAWE,KAAK,KAAK,CAClBC,YAAYlB,KAAK,MAAM,GAAIX;oCAE9ByB,YAAYD,SAAS,IAAI,OAAO;gCAClC,OAAO;oCAELE,WAAWE,KAAK,KAAK,CAClBD,YAAYhB,KAAK,KAAK,GAAIX;oCAE7ByB,YAAYF,SAAS,IAAI,SAAS;gCACpC;gCAEAzB,MACE,CAAC,2BAA2B,EAAE2B,UAAU,YAAY,EAAEC,UAAU;gCAGlE,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLP;4CAEAO;4CACAD;wCACF;wCACA,SAASH,YAAY,KAAK,IAAI;oCAChC;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMQ,kBAAkBjB;gCACxBf,MAAM,gCAAgCgC;gCACtC,MAAM,CAACf,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvB6B,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BnB,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;wCACA,SAASW,gBAAgB,KAAK,IAAI;oCACpC;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAalB;gCACnBf,MAAM,0BAA0BiC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO,CAAC;wCACR,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAanB;gCACnBf,MAAM,0BAA0BkC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO,CAAC;wCACR,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAapB;gCACnBf,MAAM,0BAA0BmC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,QAAQA,WAAW,UAAU;wCAC/B;wCACA,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAU;gCACb,MAAMC,eAAerB;gCACrBf,MAAM,4BAA4BoC;gCAClC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAOA,aAAa,GAAG;wCACvB,SAASA,aAAa,KAAK,IAAI;oCACjC;iCACD;4BACH;wBACA,KAAK;4BACH,MAAM,IAAIC,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ;4BACE,MAAM,IAAIA,MACR,CAAC,0BAA0B,EAAGtB,SAAiB,MAAM,EAAE;oBAE7D;gBACF;YACA;gBACE,MAAM,IAAIsB,MACR,CAAC,yBAAyB,EAAGzB,OAAe,SAAS,EAAE;QAE7D;IACF,EAAE,OAAO0B,OAAO;QACd,MAAMC,eAAeD,iBAAiBD,QAAQC,MAAM,OAAO,GAAGE,OAAOF;QACrEtC,MAAM,oBAAoBuC;QAC1B,MAAM,IAAIF,MAAM,CAAC,4BAA4B,EAAEE,cAAc;IAC/D;AACF"}
1
+ {"version":3,"file":"ai-model/auto-glm/actions.mjs","sources":["../../../../src/ai-model/auto-glm/actions.ts"],"sourcesContent":["import { adaptBbox, pointToBbox } from '@/common';\nimport type { PlanningAction } from '@/types';\nimport { getDebug } from '@midscene/shared/logger';\n\nconst debug = getDebug('auto-glm-actions');\n\n/**\n * Auto-GLM coordinate system range: [0, AUTO_GLM_COORDINATE_MAX]\n */\nconst AUTO_GLM_COORDINATE_MAX = 1000;\n\n/**\n * Convert auto-glm coordinate [0,1000] to bbox in pixel coordinates\n */\nfunction autoGLMCoordinateToBbox(\n x: number,\n y: number,\n width: number,\n height: number,\n): [number, number, number, number] {\n const bbox = pointToBbox(x, y, 10);\n return adaptBbox(bbox, width, height, 'auto-glm');\n}\n\nexport interface BaseAction {\n _metadata: string;\n think?: string;\n}\n\nexport interface TapAction extends BaseAction {\n _metadata: 'do';\n action: 'Tap';\n element: [number, number];\n}\n\nexport interface DoubleTapAction extends BaseAction {\n _metadata: 'do';\n action: 'Double Tap';\n element: [number, number];\n}\n\nexport interface TypeAction extends BaseAction {\n _metadata: 'do';\n action: 'Type';\n text: string;\n}\n\nexport interface SwipeAction extends BaseAction {\n _metadata: 'do';\n action: 'Swipe';\n start: [number, number];\n end: [number, number];\n}\n\nexport interface LongPressAction extends BaseAction {\n _metadata: 'do';\n action: 'Long Press';\n element: [number, number];\n}\n\nexport interface LaunchAction extends BaseAction {\n _metadata: 'do';\n action: 'Launch';\n app: string;\n}\n\nexport interface BackAction extends BaseAction {\n _metadata: 'do';\n action: 'Back';\n}\n\nexport interface HomeAction extends BaseAction {\n _metadata: 'do';\n action: 'Home';\n}\n\nexport interface WaitAction extends BaseAction {\n _metadata: 'do';\n action: 'Wait';\n durationMs: number;\n}\n\nexport interface InteractAction extends BaseAction {\n _metadata: 'do';\n action: 'Interact';\n}\n\nexport interface CallAPIAction extends BaseAction {\n _metadata: 'do';\n action: 'Call_API';\n instruction: string;\n}\n\nexport interface TakeoverAction extends BaseAction {\n _metadata: 'do';\n action: 'Take_over';\n message: string;\n}\n\nexport interface NoteAction extends BaseAction {\n _metadata: 'do';\n action: 'Note';\n message: string;\n}\n\nexport interface FinishAction extends BaseAction {\n _metadata: 'finish';\n message: string;\n}\n\nexport type ParsedAction =\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction\n | FinishAction;\n\nexport function transformAutoGLMAction(\n action: ParsedAction,\n size: { width: number; height: number },\n): PlanningAction[] {\n try {\n switch (action._metadata) {\n case 'finish': {\n const finishAction = action as FinishAction;\n debug('Transform finish action:', finishAction);\n return [\n {\n type: 'Finished',\n param: {},\n thought: finishAction.message,\n },\n ];\n }\n case 'do': {\n const doAction = action as\n | TapAction\n | DoubleTapAction\n | TypeAction\n | SwipeAction\n | LongPressAction\n | LaunchAction\n | BackAction\n | HomeAction\n | WaitAction\n | InteractAction\n | CallAPIAction\n | TakeoverAction\n | NoteAction;\n\n switch ((doAction as any).action) {\n case 'Tap': {\n const tapAction = doAction as TapAction;\n debug('Transform Tap action:', tapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n tapAction.element[0],\n tapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'Tap',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Double Tap': {\n const doubleTapAction = doAction as DoubleTapAction;\n debug('Transform Double Tap action:', doubleTapAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n doubleTapAction.element[0],\n doubleTapAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'DoubleClick',\n param: {\n locate,\n },\n },\n ];\n }\n case 'Type': {\n const typeAction = doAction as TypeAction;\n debug('Transform Type action:', typeAction);\n\n return [\n {\n type: 'Input',\n param: {\n value: typeAction.text,\n },\n },\n ];\n }\n case 'Swipe': {\n const swipeAction = doAction as SwipeAction;\n debug('Transform Swipe action:', swipeAction);\n\n // Calculate locate using start coordinate\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n swipeAction.start[0],\n swipeAction.start[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n // Calculate horizontal and vertical delta in [0,AUTO_GLM_COORDINATE_MAX] coordinate system\n const deltaX = swipeAction.end[0] - swipeAction.start[0];\n const deltaY = swipeAction.end[1] - swipeAction.start[1];\n\n // Determine direction and distance\n let direction: 'up' | 'down' | 'left' | 'right';\n let distance: number;\n\n const absDeltaX = Math.abs(deltaX);\n const absDeltaY = Math.abs(deltaY);\n\n if (absDeltaY > absDeltaX) {\n // Vertical scroll\n distance = Math.round(\n (absDeltaY * size.height) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaY > 0 ? 'up' : 'down';\n } else {\n // Horizontal scroll\n distance = Math.round(\n (absDeltaX * size.width) / AUTO_GLM_COORDINATE_MAX,\n );\n direction = deltaX > 0 ? 'left' : 'right';\n }\n\n debug(\n `Calculate swipe direction: ${direction}, distance: ${distance}`,\n );\n\n return [\n {\n type: 'Scroll',\n param: {\n locate,\n // The scrolling direction here all refers to which direction of the page's content will appear on the screen.\n distance,\n direction,\n },\n thought: swipeAction.think || '',\n },\n ];\n }\n case 'Long Press': {\n const longPressAction = doAction as LongPressAction;\n debug('Transform Long Press action:', longPressAction);\n const [x1, y1, x2, y2] = autoGLMCoordinateToBbox(\n longPressAction.element[0],\n longPressAction.element[1],\n size.width,\n size.height,\n );\n\n const locate: {\n prompt: string;\n bbox: [number, number, number, number];\n } = {\n prompt: '',\n bbox: [x1, y1, x2, y2],\n };\n\n return [\n {\n type: 'LongPress',\n param: {\n locate,\n },\n thought: longPressAction.think || '',\n },\n ];\n }\n case 'Back': {\n const backAction = doAction as BackAction;\n debug('Transform Back action:', backAction);\n return [\n {\n type: 'AndroidBackButton',\n param: {},\n thought: backAction.think || '',\n },\n ];\n }\n case 'Home': {\n const homeAction = doAction as HomeAction;\n debug('Transform Home action:', homeAction);\n return [\n {\n type: 'AndroidHomeButton',\n param: {},\n thought: homeAction.think || '',\n },\n ];\n }\n case 'Wait': {\n const waitAction = doAction as WaitAction;\n debug('Transform Wait action:', waitAction);\n return [\n {\n type: 'Sleep',\n param: {\n timeMs: waitAction.durationMs,\n },\n thought: waitAction.think || '',\n },\n ];\n }\n case 'Launch': {\n const launchAction = doAction as LaunchAction;\n debug('Transform Launch action:', launchAction);\n return [\n {\n type: 'Launch',\n param: launchAction.app,\n thought: launchAction.think || '',\n },\n ];\n }\n case 'Interact': {\n throw new Error(\n `Action \"Interact\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Call_API': {\n throw new Error(\n `Action \"Call_API\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Take_over': {\n throw new Error(\n `Action \"Take_over\" from auto-glm is not supported in the current implementation.`,\n );\n }\n case 'Note': {\n throw new Error(\n `Action \"Note\" from auto-glm is not supported in the current implementation.`,\n );\n }\n default:\n throw new Error(\n `Unknown do() action type: ${(doAction as any).action}`,\n );\n }\n }\n default:\n throw new Error(\n `Unknown action metadata: ${(action as any)._metadata}`,\n );\n }\n } catch (error) {\n const errorMessage = error instanceof Error ? error.message : String(error);\n debug('Transform error:', errorMessage);\n throw new Error(`Failed to transform action: ${errorMessage}`);\n }\n}\n"],"names":["debug","getDebug","AUTO_GLM_COORDINATE_MAX","autoGLMCoordinateToBbox","x","y","width","height","bbox","pointToBbox","adaptBbox","transformAutoGLMAction","action","size","finishAction","doAction","tapAction","x1","y1","x2","y2","locate","doubleTapAction","typeAction","swipeAction","deltaX","deltaY","direction","distance","absDeltaX","Math","absDeltaY","longPressAction","backAction","homeAction","waitAction","launchAction","Error","error","errorMessage","String"],"mappings":";;AAIA,MAAMA,QAAQC,SAAS;AAKvB,MAAMC,0BAA0B;AAKhC,SAASC,wBACPC,CAAS,EACTC,CAAS,EACTC,KAAa,EACbC,MAAc;IAEd,MAAMC,OAAOC,YAAYL,GAAGC,GAAG;IAC/B,OAAOK,UAAUF,MAAMF,OAAOC,QAAQ;AACxC;AAwGO,SAASI,uBACdC,MAAoB,EACpBC,IAAuC;IAEvC,IAAI;QACF,OAAQD,OAAO,SAAS;YACtB,KAAK;gBAAU;oBACb,MAAME,eAAeF;oBACrBZ,MAAM,4BAA4Bc;oBAClC,OAAO;wBACL;4BACE,MAAM;4BACN,OAAO,CAAC;4BACR,SAASA,aAAa,OAAO;wBAC/B;qBACD;gBACH;YACA,KAAK;gBAAM;oBACT,MAAMC,WAAWH;oBAejB,OAASG,SAAiB,MAAM;wBAC9B,KAAK;4BAAO;gCACV,MAAMC,YAAYD;gCAClBf,MAAM,yBAAyBgB;gCAC/B,MAAM,CAACC,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBa,UAAU,OAAO,CAAC,EAAE,EACpBA,UAAU,OAAO,CAAC,EAAE,EACpBH,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMC,kBAAkBP;gCACxBf,MAAM,gCAAgCsB;gCACtC,MAAM,CAACL,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBmB,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BT,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAME,aAAaR;gCACnBf,MAAM,0BAA0BuB;gCAEhC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,OAAOA,WAAW,IAAI;wCACxB;oCACF;iCACD;4BACH;wBACA,KAAK;4BAAS;gCACZ,MAAMC,cAAcT;gCACpBf,MAAM,2BAA2BwB;gCAGjC,MAAM,CAACP,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvBqB,YAAY,KAAK,CAAC,EAAE,EACpBA,YAAY,KAAK,CAAC,EAAE,EACpBX,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAGA,MAAMK,SAASD,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCACxD,MAAME,SAASF,YAAY,GAAG,CAAC,EAAE,GAAGA,YAAY,KAAK,CAAC,EAAE;gCAGxD,IAAIG;gCACJ,IAAIC;gCAEJ,MAAMC,YAAYC,KAAK,GAAG,CAACL;gCAC3B,MAAMM,YAAYD,KAAK,GAAG,CAACJ;gCAE3B,IAAIK,YAAYF,WAAW;oCAEzBD,WAAWE,KAAK,KAAK,CAClBC,YAAYlB,KAAK,MAAM,GAAIX;oCAE9ByB,YAAYD,SAAS,IAAI,OAAO;gCAClC,OAAO;oCAELE,WAAWE,KAAK,KAAK,CAClBD,YAAYhB,KAAK,KAAK,GAAIX;oCAE7ByB,YAAYF,SAAS,IAAI,SAAS;gCACpC;gCAEAzB,MACE,CAAC,2BAA2B,EAAE2B,UAAU,YAAY,EAAEC,UAAU;gCAGlE,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLP;4CAEAO;4CACAD;wCACF;wCACA,SAASH,YAAY,KAAK,IAAI;oCAChC;iCACD;4BACH;wBACA,KAAK;4BAAc;gCACjB,MAAMQ,kBAAkBjB;gCACxBf,MAAM,gCAAgCgC;gCACtC,MAAM,CAACf,IAAIC,IAAIC,IAAIC,GAAG,GAAGjB,wBACvB6B,gBAAgB,OAAO,CAAC,EAAE,EAC1BA,gBAAgB,OAAO,CAAC,EAAE,EAC1BnB,KAAK,KAAK,EACVA,KAAK,MAAM;gCAGb,MAAMQ,SAGF;oCACF,QAAQ;oCACR,MAAM;wCAACJ;wCAAIC;wCAAIC;wCAAIC;qCAAG;gCACxB;gCAEA,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACLC;wCACF;wCACA,SAASW,gBAAgB,KAAK,IAAI;oCACpC;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAalB;gCACnBf,MAAM,0BAA0BiC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO,CAAC;wCACR,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAanB;gCACnBf,MAAM,0BAA0BkC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO,CAAC;wCACR,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAQ;gCACX,MAAMC,aAAapB;gCACnBf,MAAM,0BAA0BmC;gCAChC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAO;4CACL,QAAQA,WAAW,UAAU;wCAC/B;wCACA,SAASA,WAAW,KAAK,IAAI;oCAC/B;iCACD;4BACH;wBACA,KAAK;4BAAU;gCACb,MAAMC,eAAerB;gCACrBf,MAAM,4BAA4BoC;gCAClC,OAAO;oCACL;wCACE,MAAM;wCACN,OAAOA,aAAa,GAAG;wCACvB,SAASA,aAAa,KAAK,IAAI;oCACjC;iCACD;4BACH;wBACA,KAAK;4BACH,MAAM,IAAIC,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ,KAAK;4BACH,MAAM,IAAIA,MACR;wBAGJ;4BACE,MAAM,IAAIA,MACR,CAAC,0BAA0B,EAAGtB,SAAiB,MAAM,EAAE;oBAE7D;gBACF;YACA;gBACE,MAAM,IAAIsB,MACR,CAAC,yBAAyB,EAAGzB,OAAe,SAAS,EAAE;QAE7D;IACF,EAAE,OAAO0B,OAAO;QACd,MAAMC,eAAeD,iBAAiBD,QAAQC,MAAM,OAAO,GAAGE,OAAOF;QACrEtC,MAAM,oBAAoBuC;QAC1B,MAAM,IAAIF,MAAM,CAAC,4BAA4B,EAAEE,cAAc;IAC/D;AACF"}
@@ -1,4 +1,4 @@
1
- import { generateElementByPosition } from "@midscene/shared/extractor/dom-util";
1
+ import { generateElementByPoint, generateElementByRect } from "@midscene/shared/extractor/dom-util";
2
2
  import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@midscene/shared/img";
3
3
  import { getDebug } from "@midscene/shared/logger";
4
4
  import { assert } from "@midscene/shared/utils";
@@ -138,27 +138,18 @@ async function AiLocateElement(options) {
138
138
  pixelX,
139
139
  pixelY
140
140
  });
141
- const bboxSize = 10;
142
- const x1 = Math.max(pixelX - bboxSize / 2, 0);
143
- const y1 = Math.max(pixelY - bboxSize / 2, 0);
144
- const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);
145
- const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);
146
- resRect = {
147
- left: x1,
148
- top: y1,
149
- width: x2 - x1,
150
- height: y2 - y1
151
- };
141
+ let finalX = pixelX;
142
+ let finalY = pixelY;
152
143
  if (options.searchConfig?.rect) {
153
- resRect.left += options.searchConfig.rect.left;
154
- resRect.top += options.searchConfig.rect.top;
144
+ finalX += options.searchConfig.rect.left;
145
+ finalY += options.searchConfig.rect.top;
155
146
  }
147
+ const element = generateElementByPoint([
148
+ finalX,
149
+ finalY
150
+ ], targetElementDescriptionText);
151
+ resRect = element.rect;
156
152
  debugInspect('auto-glm resRect:', resRect);
157
- const rectCenter = {
158
- x: resRect.left + resRect.width / 2,
159
- y: resRect.top + resRect.height / 2
160
- };
161
- const element = generateElementByPosition(rectCenter, targetElementDescriptionText);
162
153
  if (element) matchedElements = [
163
154
  element
164
155
  ];
@@ -202,11 +193,7 @@ async function AiLocateElement(options) {
202
193
  if ('bbox' in res.content && Array.isArray(res.content.bbox) && res.content.bbox.length >= 1) {
203
194
  resRect = adaptBboxToRect(res.content.bbox, imageWidth, imageHeight, options.searchConfig?.rect?.left, options.searchConfig?.rect?.top, originalImageWidth, originalImageHeight, modelFamily);
204
195
  debugInspect('resRect', resRect);
205
- const rectCenter = {
206
- x: resRect.left + resRect.width / 2,
207
- y: resRect.top + resRect.height / 2
208
- };
209
- const element = generateElementByPosition(rectCenter, targetElementDescriptionText);
196
+ const element = generateElementByRect(resRect, targetElementDescriptionText);
210
197
  errors = [];
211
198
  if (element) matchedElements = [
212
199
  element
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport { generateElementByPosition } from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? `Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig);\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Create a small bbox around the point\n const bboxSize = 10;\n const x1 = Math.max(pixelX - bboxSize / 2, 0);\n const y1 = Math.max(pixelY - bboxSize / 2, 0);\n const x2 = Math.min(pixelX + bboxSize / 2, imageWidth);\n const y2 = Math.min(pixelY + bboxSize / 2, imageHeight);\n\n // Convert to Rect format\n resRect = {\n left: x1,\n top: y1,\n width: x2 - x1,\n height: y2 - y1,\n };\n\n // Apply offset if searching in a cropped area\n if (options.searchConfig?.rect) {\n resRect.left += options.searchConfig.rect.left;\n resRect.top += options.searchConfig.rect.top;\n }\n\n debugInspect('auto-glm resRect:', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<ReturnType<typeof callAIFn>>;\n try {\n res = await callAIFn(msgs, modelConfig);\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n );\n\n debugInspect('resRect', resRect);\n\n const rectCenter = {\n x: resRect.left + resRect.width / 2,\n y: resRect.top + resRect.height / 2,\n };\n\n const element: LocateResultElement = generateElementByPosition(\n rectCenter,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelFamily);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","modelFamily","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","bboxSize","x1","y1","x2","y2","rectCenter","element","generateElementByPosition","res","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AAyDA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAQrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElDM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,UAAUP,eAC3BQ,uBAAuBR,eACvBS,4BAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAaf,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIgB,cAAchB,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIiB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjB,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;QAC/CgB,aAAahB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiB,cAAcjB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMe,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMzB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUP,eACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMoB,SAAS,MAAM7B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIV,UAAUP,cAAc;QAC1B,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB9B,MAAMS;QAEvChB,aAAa,yBAAyBmC;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1CnC,aAAa,sBAAsBsC,OAAO,KAAK;QAC/CtC,aAAa,yBAAyBsC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DtC,aAAa,yBAAyB0C,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCtC,aAAa,iCAAiC;gBAAE2C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C7B,aAAa,+BAA+B;gBAAE6C;gBAAQE;YAAO;YAG7D,MAAMC,WAAW;YACjB,MAAMC,KAAKH,KAAK,GAAG,CAACD,SAASG,WAAW,GAAG;YAC3C,MAAME,KAAKJ,KAAK,GAAG,CAACC,SAASC,WAAW,GAAG;YAC3C,MAAMG,KAAKL,KAAK,GAAG,CAACD,SAASG,WAAW,GAAGpB;YAC3C,MAAMwB,KAAKN,KAAK,GAAG,CAACC,SAASC,WAAW,GAAGnB;YAG3CW,UAAU;gBACR,MAAMS;gBACN,KAAKC;gBACL,OAAOC,KAAKF;gBACZ,QAAQG,KAAKF;YACf;YAGA,IAAItC,QAAQ,YAAY,EAAE,MAAM;gBAC9B4B,QAAQ,IAAI,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBAC9C4B,QAAQ,GAAG,IAAI5B,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YAC9C;YAEAZ,aAAa,qBAAqBwC;YAElC,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,0BACnCF,YACAjC;YAGF,IAAIkC,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMd;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIkB;IACJ,IAAI;QACFA,MAAM,MAAMzC,SAASR,MAAMS;IAC7B,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMtB,QACJqB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACAzB;YACA,mBAAmB2B;QACrB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACR,IAAI,OAAO;IAE9C,IAAIhB;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYc,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBS,MAAM,OAAO,CAACT,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAhB,UAAU0B,gBACRV,IAAI,OAAO,CAAC,IAAI,EAChB5B,YACAC,aACAjB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkB,oBACAC,qBACAd;YAGFjB,aAAa,WAAWwC;YAExB,MAAMa,aAAa;gBACjB,GAAGb,QAAQ,IAAI,GAAGA,QAAQ,KAAK,GAAG;gBAClC,GAAGA,QAAQ,GAAG,GAAGA,QAAQ,MAAM,GAAG;YACpC;YAEA,MAAMc,UAA+BC,0BACnCF,YACAjC;YAEFsB,SAAS,EAAE;YAEX,IAAIY,SACFb,kBAAkB;gBAACa;aAAQ;QAE/B;IACF,EAAE,OAAOa,GAAG;QACV,MAAMC,MACJD,aAAaR,QACT,CAAC,sBAAsB,EAAEQ,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACzB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAE0B,IAAI,CAAC,CAAC;aAFtB1B,SAAS;YAAC0B;SAAI;IAIlB;IAEA,OAAO;QACL,MAAM5B;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAmB;QACA,OAAOL,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAea,gBAAgBzD,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEyD,kBAAkB,EAAEtD,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMU,eAAegD,4BAA4BtD;IACjD,MAAMuD,gCAAgCC,0BACpCtE,wBAAwBmE;IAE1B,MAAM/D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMsD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMpC,SAAS,MAAM7B,mBAAmB;YACtC,QAAQiE,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA/D,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIwC;IAGJ,IAAI;QACFA,SAAS,MAAMC,yBACbpE,MACAS;IAEJ,EAAE,OAAOyC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMtB,QACJqB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACAzB;QACF;IACF;IAEA,IAAIwC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAaZ,gBACjBW,aACAhE,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0B4E;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DxE,aAAa,wBAAwB6E;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAAShB,MAAM,OAAO,CAACgB,OAC/B,GAAG,CAAC,CAACA,OACGf,gBACLe,MACApE,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqB8E;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7D9E,aAAa,iBAAiBgF;QAG9BN,cAAcQ,iBAAiBF,YAAYrE,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2B0E;IAC1C;IAEA,IAAIS,cAAcnE;IAClB,IAAI0D,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1BrE,kBACA0D,aACA3D,AAAgB,iBAAhBA;QAEFoE,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaV,KAAK,SAAS,CAACU,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwB5E,OAO7C;IACC,MAAM,EAAE6E,SAAS,EAAE5E,OAAO,EAAE6E,aAAa,EAAEpF,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAeoE;IACrB,MAAMzE,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM+E,wBAAwBC,uBAC5BjF,QAAQ,eAAe,IAAI,IAC3B6E;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAK5E;YACL,QAAQ;QACV;IACF;IAGF4E,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMrF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASuE;QACX;KACD;IAED,IAAIxF,kBAAkB;QACpB,MAAM4B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAM,EACJ,SAAS2B,WAAW,EACpBzB,KAAK,EACL2D,iBAAiB,EAClB,GAAG,MAAMC,OAAOzF,MAAMS;IAGvB,IAAIiF;IACJ,IAAI;QACFA,cAAcC,2BAA8BrC;IAC9C,EAAE,OAAOsC,YAAY;QAEnB,MAAMzC,eACJyC,sBAAsBxC,QAAQwC,WAAW,OAAO,GAAGvC,OAAOuC;QAC5D,MAAM,IAAIrC,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACAzB;IAEJ;IAEA,OAAO;QACL6D;QACApC;QACAzB;QACA2D;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBtF,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAe+E;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAM9F,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASgF;QACX;KACD;IAED,MAAM7B,SAAS,MAAM3D,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkB0D,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
1
+ {"version":3,"file":"ai-model/inspect.mjs","sources":["../../../src/ai-model/inspect.ts"],"sourcesContent":["import type {\n AIDataExtractionResponse,\n AIElementResponse,\n AISectionLocatorResponse,\n AIUsageInfo,\n Rect,\n ServiceExtractOption,\n UIContext,\n} from '@/types';\nimport type { IModelConfig } from '@midscene/shared/env';\nimport {\n generateElementByPoint,\n generateElementByRect,\n} from '@midscene/shared/extractor/dom-util';\nimport {\n cropByRect,\n paddingToMatchBlockByBase64,\n preProcessImageUrl,\n} from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport type { LocateResultElement } from '@midscene/shared/types';\nimport { assert } from '@midscene/shared/utils';\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport type { TMultimodalPrompt, TUserPrompt } from '../common';\nimport { adaptBboxToRect, expandSearchArea, mergeRects } from '../common';\nimport { parseAutoGLMLocateResponse } from './auto-glm/parser';\nimport { getAutoGLMLocatePrompt } from './auto-glm/prompt';\nimport { isAutoGLM } from './auto-glm/util';\nimport {\n extractDataQueryPrompt,\n parseXMLExtractionResponse,\n systemPromptToExtract,\n} from './prompt/extraction';\nimport {\n findElementPrompt,\n systemPromptToLocateElement,\n} from './prompt/llm-locator';\nimport {\n sectionLocatorInstruction,\n systemPromptToLocateSection,\n} from './prompt/llm-section-locator';\nimport {\n orderSensitiveJudgePrompt,\n systemPromptToJudgeOrderSensitive,\n} from './prompt/order-sensitive-judge';\nimport {\n AIResponseParseError,\n callAI,\n callAIWithObjectResponse,\n callAIWithStringResponse,\n} from './service-caller/index';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nconst debugInspect = getDebug('ai:inspect');\nconst debugSection = getDebug('ai:section');\n\nconst extraTextFromUserPrompt = (prompt: TUserPrompt): string => {\n if (typeof prompt === 'string') {\n return prompt;\n } else {\n return prompt.prompt;\n }\n};\n\nconst promptsToChatParam = async (\n multimodalPrompt: TMultimodalPrompt,\n): Promise<ChatCompletionUserMessageParam[]> => {\n const msgs: ChatCompletionUserMessageParam[] = [];\n if (multimodalPrompt?.images?.length) {\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: 'Next, I will provide all the reference images.',\n },\n ],\n });\n\n for (const item of multimodalPrompt.images) {\n const base64 = await preProcessImageUrl(\n item.url,\n !!multimodalPrompt.convertHttpImage2Base64,\n );\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the reference image named '${item.name}':`,\n },\n ],\n });\n\n msgs.push({\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: base64,\n detail: 'high',\n },\n },\n ],\n });\n }\n }\n return msgs;\n};\n\nexport async function AiLocateElement(options: {\n context: UIContext;\n targetElementDescription: TUserPrompt;\n callAIFn: typeof callAIWithObjectResponse<\n AIElementResponse | [number, number]\n >;\n searchConfig?: Awaited<ReturnType<typeof AiLocateSection>>;\n modelConfig: IModelConfig;\n}): Promise<{\n parseResult: {\n elements: LocateResultElement[];\n errors?: string[];\n };\n rect?: Rect;\n rawResponse: string;\n usage?: AIUsageInfo;\n reasoning_content?: string;\n}> {\n const { context, targetElementDescription, callAIFn, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n assert(\n targetElementDescription,\n 'cannot find the target element description',\n );\n const targetElementDescriptionText = extraTextFromUserPrompt(\n targetElementDescription,\n );\n const userInstructionPrompt = findElementPrompt(targetElementDescriptionText);\n const systemPrompt = isAutoGLM(modelFamily)\n ? getAutoGLMLocatePrompt(modelFamily)\n : systemPromptToLocateElement(modelFamily);\n\n let imagePayload = screenshotBase64;\n let imageWidth = context.size.width;\n let imageHeight = context.size.height;\n let originalImageWidth = imageWidth;\n let originalImageHeight = imageHeight;\n\n if (options.searchConfig) {\n assert(\n options.searchConfig.rect,\n 'searchArea is provided but its rect cannot be found. Failed to locate element',\n );\n assert(\n options.searchConfig.imageBase64,\n 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element',\n );\n\n imagePayload = options.searchConfig.imageBase64;\n imageWidth = options.searchConfig.rect?.width;\n imageHeight = options.searchConfig.rect?.height;\n originalImageWidth = imageWidth;\n originalImageHeight = imageHeight;\n } else if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: isAutoGLM(modelFamily)\n ? `Tap: ${userInstructionPrompt}`\n : userInstructionPrompt,\n },\n ],\n },\n ];\n\n if (typeof targetElementDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: targetElementDescription.images,\n convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n if (isAutoGLM(modelFamily)) {\n const { content: rawResponseContent, usage } =\n await callAIWithStringResponse(msgs, modelConfig);\n\n debugInspect('auto-glm rawResponse:', rawResponseContent);\n\n const parsed = parseAutoGLMLocateResponse(rawResponseContent);\n\n debugInspect('auto-glm thinking:', parsed.think);\n debugInspect('auto-glm coordinates:', parsed.coordinates);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] = [];\n\n if (parsed.error || !parsed.coordinates) {\n errors = [parsed.error || 'Failed to parse auto-glm response'];\n debugInspect('auto-glm parse error:', errors[0]);\n } else {\n const { x, y } = parsed.coordinates;\n\n debugInspect('auto-glm coordinates [0-999]:', { x, y });\n\n // Convert auto-glm coordinates [0,999] to pixel bbox\n // Map from [0,999] to pixel coordinates\n const pixelX = Math.round((x * imageWidth) / 1000);\n const pixelY = Math.round((y * imageHeight) / 1000);\n\n debugInspect('auto-glm pixel coordinates:', { pixelX, pixelY });\n\n // Apply offset if searching in a cropped area\n let finalX = pixelX;\n let finalY = pixelY;\n if (options.searchConfig?.rect) {\n finalX += options.searchConfig.rect.left;\n finalY += options.searchConfig.rect.top;\n }\n\n const element: LocateResultElement = generateElementByPoint(\n [finalX, finalY],\n targetElementDescriptionText as string,\n );\n\n resRect = element.rect;\n debugInspect('auto-glm resRect:', resRect);\n\n if (element) {\n matchedElements = [element];\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements,\n errors,\n },\n rawResponse: rawResponseContent,\n usage,\n reasoning_content: parsed.think,\n };\n }\n\n let res: Awaited<ReturnType<typeof callAIFn>>;\n try {\n res = await callAIFn(msgs, modelConfig);\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n parseResult: {\n elements: [],\n errors: [`AI call error: ${errorMessage}`],\n },\n rawResponse,\n usage,\n reasoning_content: undefined,\n };\n }\n\n const rawResponse = JSON.stringify(res.content);\n\n let resRect: Rect | undefined;\n let matchedElements: LocateResultElement[] = [];\n let errors: string[] | undefined =\n 'errors' in res.content ? res.content.errors : [];\n try {\n if (\n 'bbox' in res.content &&\n Array.isArray(res.content.bbox) &&\n res.content.bbox.length >= 1\n ) {\n resRect = adaptBboxToRect(\n res.content.bbox,\n imageWidth,\n imageHeight,\n options.searchConfig?.rect?.left,\n options.searchConfig?.rect?.top,\n originalImageWidth,\n originalImageHeight,\n modelFamily,\n );\n\n debugInspect('resRect', resRect);\n\n const element: LocateResultElement = generateElementByRect(\n resRect,\n targetElementDescriptionText as string,\n );\n errors = [];\n\n if (element) {\n matchedElements = [element];\n }\n }\n } catch (e) {\n const msg =\n e instanceof Error\n ? `Failed to parse bbox: ${e.message}`\n : 'unknown error in locate';\n if (!errors || errors?.length === 0) {\n errors = [msg];\n } else {\n errors.push(`(${msg})`);\n }\n }\n\n return {\n rect: resRect,\n parseResult: {\n elements: matchedElements as LocateResultElement[],\n errors: errors as string[],\n },\n rawResponse,\n usage: res.usage,\n reasoning_content: res.reasoning_content,\n };\n}\n\nexport async function AiLocateSection(options: {\n context: UIContext;\n sectionDescription: TUserPrompt;\n modelConfig: IModelConfig;\n}): Promise<{\n rect?: Rect;\n imageBase64?: string;\n error?: string;\n rawResponse: string;\n usage?: AIUsageInfo;\n}> {\n const { context, sectionDescription, modelConfig } = options;\n const { modelFamily } = modelConfig;\n const screenshotBase64 = context.screenshot.base64;\n\n const systemPrompt = systemPromptToLocateSection(modelFamily);\n const sectionLocatorInstructionText = sectionLocatorInstruction(\n extraTextFromUserPrompt(sectionDescription),\n );\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n },\n {\n type: 'text',\n text: sectionLocatorInstructionText,\n },\n ],\n },\n ];\n\n if (typeof sectionDescription !== 'string') {\n const addOns = await promptsToChatParam({\n images: sectionDescription.images,\n convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n let result: Awaited<\n ReturnType<typeof callAIWithObjectResponse<AISectionLocatorResponse>>\n >;\n try {\n result = await callAIWithObjectResponse<AISectionLocatorResponse>(\n msgs,\n modelConfig,\n );\n } catch (callError) {\n // Return error with usage and rawResponse if available\n const errorMessage =\n callError instanceof Error ? callError.message : String(callError);\n const rawResponse =\n callError instanceof AIResponseParseError\n ? callError.rawResponse\n : errorMessage;\n const usage =\n callError instanceof AIResponseParseError ? callError.usage : undefined;\n return {\n rect: undefined,\n imageBase64: undefined,\n error: `AI call error: ${errorMessage}`,\n rawResponse,\n usage,\n };\n }\n\n let sectionRect: Rect | undefined;\n const sectionBbox = result.content.bbox;\n if (sectionBbox) {\n const targetRect = adaptBboxToRect(\n sectionBbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n debugSection('original targetRect %j', targetRect);\n\n const referenceBboxList = result.content.references_bbox || [];\n debugSection('referenceBboxList %j', referenceBboxList);\n\n const referenceRects = referenceBboxList\n .filter((bbox) => Array.isArray(bbox))\n .map((bbox) => {\n return adaptBboxToRect(\n bbox,\n context.size.width,\n context.size.height,\n 0,\n 0,\n context.size.width,\n context.size.height,\n modelFamily,\n );\n });\n debugSection('referenceRects %j', referenceRects);\n\n // merge the sectionRect and referenceRects\n const mergedRect = mergeRects([targetRect, ...referenceRects]);\n debugSection('mergedRect %j', mergedRect);\n\n // expand search area to at least 200 x 200\n sectionRect = expandSearchArea(mergedRect, context.size, modelFamily);\n debugSection('expanded sectionRect %j', sectionRect);\n }\n\n let imageBase64 = screenshotBase64;\n if (sectionRect) {\n const croppedResult = await cropByRect(\n screenshotBase64,\n sectionRect,\n modelFamily === 'qwen2.5-vl',\n );\n imageBase64 = croppedResult.imageBase64;\n sectionRect.width = croppedResult.width;\n sectionRect.height = croppedResult.height;\n }\n\n return {\n rect: sectionRect,\n imageBase64,\n error: result.content.error,\n rawResponse: JSON.stringify(result.content),\n usage: result.usage,\n };\n}\n\nexport async function AiExtractElementInfo<T>(options: {\n dataQuery: string | Record<string, string>;\n multimodalPrompt?: TMultimodalPrompt;\n context: UIContext;\n pageDescription?: string;\n extractOption?: ServiceExtractOption;\n modelConfig: IModelConfig;\n}) {\n const { dataQuery, context, extractOption, multimodalPrompt, modelConfig } =\n options;\n const systemPrompt = systemPromptToExtract();\n const screenshotBase64 = context.screenshot.base64;\n\n const extractDataPromptText = extractDataQueryPrompt(\n options.pageDescription || '',\n dataQuery,\n );\n\n const userContent: ChatCompletionUserMessageParam['content'] = [];\n\n if (extractOption?.screenshotIncluded !== false) {\n userContent.push({\n type: 'image_url',\n image_url: {\n url: screenshotBase64,\n detail: 'high',\n },\n });\n }\n\n userContent.push({\n type: 'text',\n text: extractDataPromptText,\n });\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userContent,\n },\n ];\n\n if (multimodalPrompt) {\n const addOns = await promptsToChatParam({\n images: multimodalPrompt.images,\n convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64,\n });\n msgs.push(...addOns);\n }\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig);\n\n // Parse XML response to JSON object\n let parseResult: AIDataExtractionResponse<T>;\n try {\n parseResult = parseXMLExtractionResponse<T>(rawResponse);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n return {\n parseResult,\n rawResponse,\n usage,\n reasoning_content,\n };\n}\n\nexport async function AiJudgeOrderSensitive(\n description: string,\n callAIFn: typeof callAIWithObjectResponse<{ isOrderSensitive: boolean }>,\n modelConfig: IModelConfig,\n): Promise<{\n isOrderSensitive: boolean;\n usage?: AIUsageInfo;\n}> {\n const systemPrompt = systemPromptToJudgeOrderSensitive();\n const userPrompt = orderSensitiveJudgePrompt(description);\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: userPrompt,\n },\n ];\n\n const result = await callAIFn(msgs, modelConfig);\n\n return {\n isOrderSensitive: result.content.isOrderSensitive ?? false,\n usage: result.usage,\n };\n}\n"],"names":["debugInspect","getDebug","debugSection","extraTextFromUserPrompt","prompt","promptsToChatParam","multimodalPrompt","msgs","item","base64","preProcessImageUrl","AiLocateElement","options","context","targetElementDescription","callAIFn","modelConfig","modelFamily","screenshotBase64","assert","targetElementDescriptionText","userInstructionPrompt","findElementPrompt","systemPrompt","isAutoGLM","getAutoGLMLocatePrompt","systemPromptToLocateElement","imagePayload","imageWidth","imageHeight","originalImageWidth","originalImageHeight","paddedResult","paddingToMatchBlockByBase64","addOns","rawResponseContent","usage","callAIWithStringResponse","parsed","parseAutoGLMLocateResponse","resRect","matchedElements","errors","x","y","pixelX","Math","pixelY","finalX","finalY","element","generateElementByPoint","res","callError","errorMessage","Error","String","rawResponse","AIResponseParseError","undefined","JSON","Array","adaptBboxToRect","generateElementByRect","e","msg","AiLocateSection","sectionDescription","systemPromptToLocateSection","sectionLocatorInstructionText","sectionLocatorInstruction","result","callAIWithObjectResponse","sectionRect","sectionBbox","targetRect","referenceBboxList","referenceRects","bbox","mergedRect","mergeRects","expandSearchArea","imageBase64","croppedResult","cropByRect","AiExtractElementInfo","dataQuery","extractOption","systemPromptToExtract","extractDataPromptText","extractDataQueryPrompt","userContent","reasoning_content","callAI","parseResult","parseXMLExtractionResponse","parseError","AiJudgeOrderSensitive","description","systemPromptToJudgeOrderSensitive","userPrompt","orderSensitiveJudgePrompt"],"mappings":";;;;;;;;;;;;;AA4DA,MAAMA,eAAeC,SAAS;AAC9B,MAAMC,eAAeD,SAAS;AAE9B,MAAME,0BAA0B,CAACC;IAC/B,IAAI,AAAkB,YAAlB,OAAOA,QACT,OAAOA;IAEP,OAAOA,OAAO,MAAM;AAExB;AAEA,MAAMC,qBAAqB,OACzBC;IAEA,MAAMC,OAAyC,EAAE;IACjD,IAAID,kBAAkB,QAAQ,QAAQ;QACpCC,KAAK,IAAI,CAAC;YACR,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM;gBACR;aACD;QACH;QAEA,KAAK,MAAMC,QAAQF,iBAAiB,MAAM,CAAE;YAC1C,MAAMG,SAAS,MAAMC,mBACnBF,KAAK,GAAG,EACR,CAAC,CAACF,iBAAiB,uBAAuB;YAG5CC,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,MAAM,CAAC,mCAAmC,EAAEC,KAAK,IAAI,CAAC,EAAE,CAAC;oBAC3D;iBACD;YACH;YAEAD,KAAK,IAAI,CAAC;gBACR,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKE;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;QACF;IACF;IACA,OAAOF;AACT;AAEO,eAAeI,gBAAgBC,OAQrC;IAUC,MAAM,EAAEC,OAAO,EAAEC,wBAAwB,EAAEC,QAAQ,EAAEC,WAAW,EAAE,GAAGJ;IACrE,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElDM,OACEL,0BACA;IAEF,MAAMM,+BAA+BjB,wBACnCW;IAEF,MAAMO,wBAAwBC,kBAAkBF;IAChD,MAAMG,eAAeC,UAAUP,eAC3BQ,uBAAuBR,eACvBS,4BAA4BT;IAEhC,IAAIU,eAAeT;IACnB,IAAIU,aAAaf,QAAQ,IAAI,CAAC,KAAK;IACnC,IAAIgB,cAAchB,QAAQ,IAAI,CAAC,MAAM;IACrC,IAAIiB,qBAAqBF;IACzB,IAAIG,sBAAsBF;IAE1B,IAAIjB,QAAQ,YAAY,EAAE;QACxBO,OACEP,QAAQ,YAAY,CAAC,IAAI,EACzB;QAEFO,OACEP,QAAQ,YAAY,CAAC,WAAW,EAChC;QAGFe,eAAef,QAAQ,YAAY,CAAC,WAAW;QAC/CgB,aAAahB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACxCiB,cAAcjB,QAAQ,YAAY,CAAC,IAAI,EAAE;QACzCkB,qBAAqBF;QACrBG,sBAAsBF;IACxB,OAAO,IAAIZ,AAAgB,iBAAhBA,aAA8B;QACvC,MAAMe,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAMzB,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKI;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMH,UAAUP,eACZ,CAAC,KAAK,EAAEI,uBAAuB,GAC/BA;gBACN;aACD;QACH;KACD;IAED,IAAI,AAAoC,YAApC,OAAOP,0BAAuC;QAChD,MAAMoB,SAAS,MAAM7B,mBAAmB;YACtC,QAAQS,yBAAyB,MAAM;YACvC,yBAAyBA,yBAAyB,uBAAuB;QAC3E;QACAP,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIV,UAAUP,cAAc;QAC1B,MAAM,EAAE,SAASkB,kBAAkB,EAAEC,KAAK,EAAE,GAC1C,MAAMC,yBAAyB9B,MAAMS;QAEvChB,aAAa,yBAAyBmC;QAEtC,MAAMG,SAASC,2BAA2BJ;QAE1CnC,aAAa,sBAAsBsC,OAAO,KAAK;QAC/CtC,aAAa,yBAAyBsC,OAAO,WAAW;QAExD,IAAIE;QACJ,IAAIC,kBAAyC,EAAE;QAC/C,IAAIC,SAAmB,EAAE;QAEzB,IAAIJ,OAAO,KAAK,IAAI,CAACA,OAAO,WAAW,EAAE;YACvCI,SAAS;gBAACJ,OAAO,KAAK,IAAI;aAAoC;YAC9DtC,aAAa,yBAAyB0C,MAAM,CAAC,EAAE;QACjD,OAAO;YACL,MAAM,EAAEC,CAAC,EAAEC,CAAC,EAAE,GAAGN,OAAO,WAAW;YAEnCtC,aAAa,iCAAiC;gBAAE2C;gBAAGC;YAAE;YAIrD,MAAMC,SAASC,KAAK,KAAK,CAAEH,IAAIf,aAAc;YAC7C,MAAMmB,SAASD,KAAK,KAAK,CAAEF,IAAIf,cAAe;YAE9C7B,aAAa,+BAA+B;gBAAE6C;gBAAQE;YAAO;YAG7D,IAAIC,SAASH;YACb,IAAII,SAASF;YACb,IAAInC,QAAQ,YAAY,EAAE,MAAM;gBAC9BoC,UAAUpC,QAAQ,YAAY,CAAC,IAAI,CAAC,IAAI;gBACxCqC,UAAUrC,QAAQ,YAAY,CAAC,IAAI,CAAC,GAAG;YACzC;YAEA,MAAMsC,UAA+BC,uBACnC;gBAACH;gBAAQC;aAAO,EAChB7B;YAGFoB,UAAUU,QAAQ,IAAI;YACtBlD,aAAa,qBAAqBwC;YAElC,IAAIU,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;QAEA,OAAO;YACL,MAAMV;YACN,aAAa;gBACX,UAAUC;gBACVC;YACF;YACA,aAAaP;YACbC;YACA,mBAAmBE,OAAO,KAAK;QACjC;IACF;IAEA,IAAIc;IACJ,IAAI;QACFA,MAAM,MAAMrC,SAASR,MAAMS;IAC7B,EAAE,OAAOqC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMlB,QACJiB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAa;gBACX,UAAU,EAAE;gBACZ,QAAQ;oBAAC,CAAC,eAAe,EAAEL,cAAc;iBAAC;YAC5C;YACAG;YACArB;YACA,mBAAmBuB;QACrB;IACF;IAEA,MAAMF,cAAcG,KAAK,SAAS,CAACR,IAAI,OAAO;IAE9C,IAAIZ;IACJ,IAAIC,kBAAyC,EAAE;IAC/C,IAAIC,SACF,YAAYU,IAAI,OAAO,GAAGA,IAAI,OAAO,CAAC,MAAM,GAAG,EAAE;IACnD,IAAI;QACF,IACE,UAAUA,IAAI,OAAO,IACrBS,MAAM,OAAO,CAACT,IAAI,OAAO,CAAC,IAAI,KAC9BA,IAAI,OAAO,CAAC,IAAI,CAAC,MAAM,IAAI,GAC3B;YACAZ,UAAUsB,gBACRV,IAAI,OAAO,CAAC,IAAI,EAChBxB,YACAC,aACAjB,QAAQ,YAAY,EAAE,MAAM,MAC5BA,QAAQ,YAAY,EAAE,MAAM,KAC5BkB,oBACAC,qBACAd;YAGFjB,aAAa,WAAWwC;YAExB,MAAMU,UAA+Ba,sBACnCvB,SACApB;YAEFsB,SAAS,EAAE;YAEX,IAAIQ,SACFT,kBAAkB;gBAACS;aAAQ;QAE/B;IACF,EAAE,OAAOc,GAAG;QACV,MAAMC,MACJD,aAAaT,QACT,CAAC,sBAAsB,EAAES,EAAE,OAAO,EAAE,GACpC;QACN,IAAI,AAACtB,UAAUA,QAAQ,WAAW,GAGhCA,OAAO,IAAI,CAAC,CAAC,CAAC,EAAEuB,IAAI,CAAC,CAAC;aAFtBvB,SAAS;YAACuB;SAAI;IAIlB;IAEA,OAAO;QACL,MAAMzB;QACN,aAAa;YACX,UAAUC;YACV,QAAQC;QACV;QACAe;QACA,OAAOL,IAAI,KAAK;QAChB,mBAAmBA,IAAI,iBAAiB;IAC1C;AACF;AAEO,eAAec,gBAAgBtD,OAIrC;IAOC,MAAM,EAAEC,OAAO,EAAEsD,kBAAkB,EAAEnD,WAAW,EAAE,GAAGJ;IACrD,MAAM,EAAEK,WAAW,EAAE,GAAGD;IACxB,MAAME,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAMU,eAAe6C,4BAA4BnD;IACjD,MAAMoD,gCAAgCC,0BACpCnE,wBAAwBgE;IAE1B,MAAM5D,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKL;wBACL,QAAQ;oBACV;gBACF;gBACA;oBACE,MAAM;oBACN,MAAMmD;gBACR;aACD;QACH;KACD;IAED,IAAI,AAA8B,YAA9B,OAAOF,oBAAiC;QAC1C,MAAMjC,SAAS,MAAM7B,mBAAmB;YACtC,QAAQ8D,mBAAmB,MAAM;YACjC,yBAAyBA,mBAAmB,uBAAuB;QACrE;QACA5D,KAAK,IAAI,IAAI2B;IACf;IAEA,IAAIqC;IAGJ,IAAI;QACFA,SAAS,MAAMC,yBACbjE,MACAS;IAEJ,EAAE,OAAOqC,WAAW;QAElB,MAAMC,eACJD,qBAAqBE,QAAQF,UAAU,OAAO,GAAGG,OAAOH;QAC1D,MAAMI,cACJJ,qBAAqBK,uBACjBL,UAAU,WAAW,GACrBC;QACN,MAAMlB,QACJiB,qBAAqBK,uBAAuBL,UAAU,KAAK,GAAGM;QAChE,OAAO;YACL,MAAMA;YACN,aAAaA;YACb,OAAO,CAAC,eAAe,EAAEL,cAAc;YACvCG;YACArB;QACF;IACF;IAEA,IAAIqC;IACJ,MAAMC,cAAcH,OAAO,OAAO,CAAC,IAAI;IACvC,IAAIG,aAAa;QACf,MAAMC,aAAab,gBACjBY,aACA7D,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAEFf,aAAa,0BAA0ByE;QAEvC,MAAMC,oBAAoBL,OAAO,OAAO,CAAC,eAAe,IAAI,EAAE;QAC9DrE,aAAa,wBAAwB0E;QAErC,MAAMC,iBAAiBD,kBACpB,MAAM,CAAC,CAACE,OAASjB,MAAM,OAAO,CAACiB,OAC/B,GAAG,CAAC,CAACA,OACGhB,gBACLgB,MACAjE,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnB,GACA,GACAA,QAAQ,IAAI,CAAC,KAAK,EAClBA,QAAQ,IAAI,CAAC,MAAM,EACnBI;QAGNf,aAAa,qBAAqB2E;QAGlC,MAAME,aAAaC,WAAW;YAACL;eAAeE;SAAe;QAC7D3E,aAAa,iBAAiB6E;QAG9BN,cAAcQ,iBAAiBF,YAAYlE,QAAQ,IAAI,EAAEI;QACzDf,aAAa,2BAA2BuE;IAC1C;IAEA,IAAIS,cAAchE;IAClB,IAAIuD,aAAa;QACf,MAAMU,gBAAgB,MAAMC,WAC1BlE,kBACAuD,aACAxD,AAAgB,iBAAhBA;QAEFiE,cAAcC,cAAc,WAAW;QACvCV,YAAY,KAAK,GAAGU,cAAc,KAAK;QACvCV,YAAY,MAAM,GAAGU,cAAc,MAAM;IAC3C;IAEA,OAAO;QACL,MAAMV;QACNS;QACA,OAAOX,OAAO,OAAO,CAAC,KAAK;QAC3B,aAAaX,KAAK,SAAS,CAACW,OAAO,OAAO;QAC1C,OAAOA,OAAO,KAAK;IACrB;AACF;AAEO,eAAec,qBAAwBzE,OAO7C;IACC,MAAM,EAAE0E,SAAS,EAAEzE,OAAO,EAAE0E,aAAa,EAAEjF,gBAAgB,EAAEU,WAAW,EAAE,GACxEJ;IACF,MAAMW,eAAeiE;IACrB,MAAMtE,mBAAmBL,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM4E,wBAAwBC,uBAC5B9E,QAAQ,eAAe,IAAI,IAC3B0E;IAGF,MAAMK,cAAyD,EAAE;IAEjE,IAAIJ,eAAe,uBAAuB,OACxCI,YAAY,IAAI,CAAC;QACf,MAAM;QACN,WAAW;YACT,KAAKzE;YACL,QAAQ;QACV;IACF;IAGFyE,YAAY,IAAI,CAAC;QACf,MAAM;QACN,MAAMF;IACR;IAEA,MAAMlF,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAASoE;QACX;KACD;IAED,IAAIrF,kBAAkB;QACpB,MAAM4B,SAAS,MAAM7B,mBAAmB;YACtC,QAAQC,iBAAiB,MAAM;YAC/B,yBAAyBA,iBAAiB,uBAAuB;QACnE;QACAC,KAAK,IAAI,IAAI2B;IACf;IAEA,MAAM,EACJ,SAASuB,WAAW,EACpBrB,KAAK,EACLwD,iBAAiB,EAClB,GAAG,MAAMC,OAAOtF,MAAMS;IAGvB,IAAI8E;IACJ,IAAI;QACFA,cAAcC,2BAA8BtC;IAC9C,EAAE,OAAOuC,YAAY;QAEnB,MAAM1C,eACJ0C,sBAAsBzC,QAAQyC,WAAW,OAAO,GAAGxC,OAAOwC;QAC5D,MAAM,IAAItC,qBACR,CAAC,iBAAiB,EAAEJ,cAAc,EAClCG,aACArB;IAEJ;IAEA,OAAO;QACL0D;QACArC;QACArB;QACAwD;IACF;AACF;AAEO,eAAeK,sBACpBC,WAAmB,EACnBnF,QAAwE,EACxEC,WAAyB;IAKzB,MAAMO,eAAe4E;IACrB,MAAMC,aAAaC,0BAA0BH;IAE7C,MAAM3F,OAAe;QACnB;YAAE,MAAM;YAAU,SAASgB;QAAa;QACxC;YACE,MAAM;YACN,SAAS6E;QACX;KACD;IAED,MAAM7B,SAAS,MAAMxD,SAASR,MAAMS;IAEpC,OAAO;QACL,kBAAkBuD,OAAO,OAAO,CAAC,gBAAgB,IAAI;QACrD,OAAOA,OAAO,KAAK;IACrB;AACF"}
@@ -83,8 +83,6 @@ async function plan(userInstruction, opts) {
83
83
  let imagePayload = screenshotBase64;
84
84
  let imageWidth = size.width;
85
85
  let imageHeight = size.height;
86
- const rightLimit = imageWidth;
87
- const bottomLimit = imageHeight;
88
86
  if ('qwen2.5-vl' === modelFamily) {
89
87
  const paddedResult = await paddingToMatchBlockByBase64(imagePayload);
90
88
  imageWidth = paddedResult.width;
@@ -195,7 +193,7 @@ async function plan(userInstruction, opts) {
195
193
  debug('locateFields', locateFields);
196
194
  locateFields.forEach((field)=>{
197
195
  const locateResult = action.param[field];
198
- if (locateResult && void 0 !== modelFamily) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, rightLimit, bottomLimit, modelFamily);
196
+ if (locateResult && void 0 !== modelFamily) action.param[field] = fillBboxParam(locateResult, imageWidth, imageHeight, modelFamily);
199
197
  });
200
198
  });
201
199
  if (includeSubGoals) {
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const note = extractXMLTag(xmlString, 'note');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse complete-goal tag with success attribute\n const completeGoalRegex =\n /<complete-goal\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete-goal>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n const type = actionType.trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(note ? { note } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { size } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when deepThink is true\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message (only when deepThink is enabled)\n const subGoalsText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : '';\n const subGoalsSection = subGoalsText ? `\\n\\n${subGoalsText}` : '';\n\n // Build notes text to include in the message\n const notesText = conversationHistory.notesToText();\n const notesSection = notesText ? `\\n\\n${notesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.${notesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${notesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n });\n\n // Parse XML response to JSON object, capture parsing errors\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n console.warn(\n 'Planning response included both an action and complete-goal; ignoring complete-goal output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if goal is completed via complete-goal tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('goal completed via complete-goal tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n rightLimit,\n bottomLimit,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history based on response (only when deepThink is enabled)\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.setSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n }\n\n // Append note to conversation history if present\n if (planFromAI.note) {\n conversationHistory.appendNote(planFromAI.note);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","note","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","size","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","rightLimit","bottomLimit","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","subGoalsText","subGoalsSection","notesText","notesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","parseError","errorMessage","String","AIResponseParseError","console","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index"],"mappings":";;;;;;;AA+BA,MAAMA,QAAQC,SAAS;AAKhB,SAASC,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,OAAOD,cAAcH,WAAW;IACtC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QACrD,MAAMc,OAAOd,WAAW,IAAI;QAC5B,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,cAAcf,gBAAgBP;QACxC,EAAE,OAAOuB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,OAAO;YAAEA;QAAK,IAAI,CAAC,CAAC;QACxBC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,IAAI,EAAE,GAAGH;IACjB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,KAAK,KAAK;IAC3B,IAAIO,cAAcP,KAAK,MAAM;IAC7B,MAAMQ,aAAaF;IACnB,MAAMG,cAAcF;IAGpB,IAAItC,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMyC,eAAe,MAAMC,4BAA4BN;QACvDC,aAAaI,aAAa,KAAK;QAC/BH,cAAcG,aAAa,MAAM;QACjCL,eAAeK,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBhB,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMiB,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEjB,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAImB;IAGJ,MAAMC,eAAeb,kBACjBH,oBAAoB,cAAc,KAClC;IACJ,MAAMiB,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAG/D,MAAME,YAAYlB,oBAAoB,WAAW;IACjD,MAAMmB,eAAeD,YAAY,CAAC,IAAI,EAAEA,WAAW,GAAG;IAEtD,IAAIlB,oBAAoB,sBAAsB,EAAE;QAC9Ce,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGf,oBAAoB,sBAAsB,CAAC,gFAAgF,EAAEmB,eAAeF,iBAAiB;gBACxK;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKX;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEe,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,eAAeF,iBAAiB;YACxE;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKX;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACe;IAG3Bf,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMoB,aAAapB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMwB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASjB;QAAa;WACrCU;WACAM;KACJ;IAED,MAAM,EACJ,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMtB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;IACpE;IAGA,IAAI6B;IACJ,IAAI;QACFA,aAAa1D,yBAAyBsD,aAAapD;IACrD,EAAE,OAAOyD,YAAY;QAEnB,MAAMC,eACJD,sBAAsBjC,QAAQiC,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCN,aACAC;IAEJ;IAEA,IAAIG,WAAW,MAAM,IAAIA,AAA+B5C,WAA/B4C,WAAW,eAAe,EAAgB;QACjEK,QAAQ,IAAI,CACV;QAEFL,WAAW,eAAe,GAAG5C;QAC7B4C,WAAW,eAAe,GAAG5C;IAC/B;IAEA,MAAMkD,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,IAAIO,yBAAyB;IAG7B,IAAIP,AAA+B5C,WAA/B4C,WAAW,eAAe,EAAgB;QAC5C5D,MAAM;QACNmE,yBAAyB;QAEzB,IAAI9B,iBACFH,oBAAoB,uBAAuB;IAE/C;IAEA,MAAMkC,cAAkC;QACtC,GAAGR,UAAU;QACbM;QACAV;QACAC;QACAC;QACA,UAAUW,uBAAuBH,SAASnC,KAAK,WAAW;QAC1DoC;IACF;IAEAG,OAAOV,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAAC3C;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAMgD,sBAAsBxC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;QAG9BxB,MAAM,+BAA+BuE;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENvE,MAAM,gBAAgBwE;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAepD,OAAO,KAAK,CAACmD,MAAM;YACxC,IAAIC,gBAAgBvE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAACmD,MAAM,GAAGE,cACpBD,cACAlC,YACAC,aACAC,YACAC,aACAxC;QAGN;IACF;IAGA,IAAIiC,iBAAiB;QACnB,IAAIuB,WAAW,cAAc,EAAE,QAC7B1B,oBAAoB,WAAW,CAAC0B,WAAW,cAAc;QAE3D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMiB,SAASjB,WAAW,mBAAmB,CAChD1B,oBAAoB,mBAAmB,CAAC2C;IAG9C;IAGA,IAAIjB,WAAW,IAAI,EACjB1B,oBAAoB,UAAU,CAAC0B,WAAW,IAAI;IAGhD1B,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMsB;YACR;SACD;IACH;IAEA,OAAOY;AACT"}
1
+ {"version":3,"file":"ai-model/llm-planning.mjs","sources":["../../../src/ai-model/llm-planning.ts"],"sourcesContent":["import type {\n DeepThinkOption,\n DeviceAction,\n InterfaceType,\n PlanningAIResponse,\n RawResponsePlanningAIResponse,\n UIContext,\n} from '@/types';\nimport type { IModelConfig, TModelFamily } from '@midscene/shared/env';\nimport { paddingToMatchBlockByBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport {\n buildYamlFlowFromPlans,\n fillBboxParam,\n findAllMidsceneLocatorField,\n} from '../common';\nimport type { ConversationHistory } from './conversation-history';\nimport { systemPromptToTaskPlanning } from './prompt/llm-planning';\nimport {\n extractXMLTag,\n parseMarkFinishedIndexes,\n parseSubGoalsFromXML,\n} from './prompt/util';\nimport {\n AIResponseParseError,\n callAI,\n safeParseJson,\n} from './service-caller/index';\n\nconst debug = getDebug('planning');\n\n/**\n * Parse XML response from LLM and convert to RawResponsePlanningAIResponse\n */\nexport function parseXMLPlanningResponse(\n xmlString: string,\n modelFamily: TModelFamily | undefined,\n): RawResponsePlanningAIResponse {\n const thought = extractXMLTag(xmlString, 'thought');\n const note = extractXMLTag(xmlString, 'note');\n const log = extractXMLTag(xmlString, 'log') || '';\n const error = extractXMLTag(xmlString, 'error');\n const actionType = extractXMLTag(xmlString, 'action-type');\n const actionParamStr = extractXMLTag(xmlString, 'action-param-json');\n\n // Parse complete-goal tag with success attribute\n const completeGoalRegex =\n /<complete-goal\\s+success=\"(true|false)\">([\\s\\S]*?)<\\/complete-goal>/i;\n const completeGoalMatch = xmlString.match(completeGoalRegex);\n let finalizeMessage: string | undefined;\n let finalizeSuccess: boolean | undefined;\n\n if (completeGoalMatch) {\n finalizeSuccess = completeGoalMatch[1] === 'true';\n finalizeMessage = completeGoalMatch[2]?.trim() || undefined;\n }\n\n // Parse sub-goal related tags\n const updatePlanContent = extractXMLTag(xmlString, 'update-plan-content');\n const markSubGoalDone = extractXMLTag(xmlString, 'mark-sub-goal-done');\n\n const updateSubGoals = updatePlanContent\n ? parseSubGoalsFromXML(updatePlanContent)\n : undefined;\n const markFinishedIndexes = markSubGoalDone\n ? parseMarkFinishedIndexes(markSubGoalDone)\n : undefined;\n\n // Parse action\n let action: any = null;\n if (actionType && actionType.toLowerCase() !== 'null') {\n const type = actionType.trim();\n let param: any = undefined;\n\n if (actionParamStr) {\n try {\n // Parse the JSON string in action-param-json\n param = safeParseJson(actionParamStr, modelFamily);\n } catch (e) {\n throw new Error(`Failed to parse action-param-json: ${e}`);\n }\n }\n\n action = {\n type,\n ...(param !== undefined ? { param } : {}),\n };\n }\n\n return {\n ...(thought ? { thought } : {}),\n ...(note ? { note } : {}),\n log,\n ...(error ? { error } : {}),\n action,\n ...(finalizeMessage !== undefined ? { finalizeMessage } : {}),\n ...(finalizeSuccess !== undefined ? { finalizeSuccess } : {}),\n ...(updateSubGoals?.length ? { updateSubGoals } : {}),\n ...(markFinishedIndexes?.length ? { markFinishedIndexes } : {}),\n };\n}\n\nexport async function plan(\n userInstruction: string,\n opts: {\n context: UIContext;\n interfaceType: InterfaceType;\n actionSpace: DeviceAction<any>[];\n actionContext?: string;\n modelConfig: IModelConfig;\n conversationHistory: ConversationHistory;\n includeBbox: boolean;\n imagesIncludeCount?: number;\n deepThink?: DeepThinkOption;\n },\n): Promise<PlanningAIResponse> {\n const { context, modelConfig, conversationHistory } = opts;\n const { size } = context;\n const screenshotBase64 = context.screenshot.base64;\n\n const { modelFamily } = modelConfig;\n\n // Only enable sub-goals when deepThink is true\n const includeSubGoals = opts.deepThink === true;\n\n const systemPrompt = await systemPromptToTaskPlanning({\n actionSpace: opts.actionSpace,\n modelFamily,\n includeBbox: opts.includeBbox,\n includeThought: true, // always include thought\n includeSubGoals,\n });\n\n let imagePayload = screenshotBase64;\n let imageWidth = size.width;\n let imageHeight = size.height;\n const rightLimit = imageWidth;\n const bottomLimit = imageHeight;\n\n // Process image based on VL mode requirements\n if (modelFamily === 'qwen2.5-vl') {\n const paddedResult = await paddingToMatchBlockByBase64(imagePayload);\n imageWidth = paddedResult.width;\n imageHeight = paddedResult.height;\n imagePayload = paddedResult.imageBase64;\n }\n\n const actionContext = opts.actionContext\n ? `<high_priority_knowledge>${opts.actionContext}</high_priority_knowledge>\\n`\n : '';\n\n const instruction: ChatCompletionMessageParam[] = [\n {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${actionContext}<user_instruction>${userInstruction}</user_instruction>`,\n },\n ],\n },\n ];\n\n let latestFeedbackMessage: ChatCompletionMessageParam;\n\n // Build sub-goal status text to include in the message (only when deepThink is enabled)\n const subGoalsText = includeSubGoals\n ? conversationHistory.subGoalsToText()\n : '';\n const subGoalsSection = subGoalsText ? `\\n\\n${subGoalsText}` : '';\n\n // Build notes text to include in the message\n const notesText = conversationHistory.notesToText();\n const notesSection = notesText ? `\\n\\n${notesText}` : '';\n\n if (conversationHistory.pendingFeedbackMessage) {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `${conversationHistory.pendingFeedbackMessage}. The last screenshot is attached. Please going on according to the instruction.${notesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n\n conversationHistory.resetPendingFeedbackMessageIfExists();\n } else {\n latestFeedbackMessage = {\n role: 'user',\n content: [\n {\n type: 'text',\n text: `this is the latest screenshot${notesSection}${subGoalsSection}`,\n },\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n };\n }\n conversationHistory.append(latestFeedbackMessage);\n\n // Compress history if it exceeds the threshold to avoid context overflow\n conversationHistory.compressHistory(50, 20);\n\n const historyLog = conversationHistory.snapshot(opts.imagesIncludeCount);\n\n const msgs: ChatCompletionMessageParam[] = [\n { role: 'system', content: systemPrompt },\n ...instruction,\n ...historyLog,\n ];\n\n const {\n content: rawResponse,\n usage,\n reasoning_content,\n } = await callAI(msgs, modelConfig, {\n deepThink: opts.deepThink === 'unset' ? undefined : opts.deepThink,\n });\n\n // Parse XML response to JSON object, capture parsing errors\n let planFromAI: RawResponsePlanningAIResponse;\n try {\n planFromAI = parseXMLPlanningResponse(rawResponse, modelFamily);\n } catch (parseError) {\n // Throw AIResponseParseError with usage and rawResponse preserved\n const errorMessage =\n parseError instanceof Error ? parseError.message : String(parseError);\n throw new AIResponseParseError(\n `XML parse error: ${errorMessage}`,\n rawResponse,\n usage,\n );\n }\n\n if (planFromAI.action && planFromAI.finalizeSuccess !== undefined) {\n console.warn(\n 'Planning response included both an action and complete-goal; ignoring complete-goal output.',\n );\n planFromAI.finalizeMessage = undefined;\n planFromAI.finalizeSuccess = undefined;\n }\n\n const actions = planFromAI.action ? [planFromAI.action] : [];\n let shouldContinuePlanning = true;\n\n // Check if goal is completed via complete-goal tag\n if (planFromAI.finalizeSuccess !== undefined) {\n debug('goal completed via complete-goal tag, stop planning');\n shouldContinuePlanning = false;\n // Mark all sub-goals as finished when goal is completed (only when deepThink is enabled)\n if (includeSubGoals) {\n conversationHistory.markAllSubGoalsFinished();\n }\n }\n\n const returnValue: PlanningAIResponse = {\n ...planFromAI,\n actions,\n rawResponse,\n usage,\n reasoning_content,\n yamlFlow: buildYamlFlowFromPlans(actions, opts.actionSpace),\n shouldContinuePlanning,\n };\n\n assert(planFromAI, \"can't get plans from AI\");\n\n actions.forEach((action) => {\n const type = action.type;\n const actionInActionSpace = opts.actionSpace.find(\n (action) => action.name === type,\n );\n\n debug('actionInActionSpace matched', actionInActionSpace);\n const locateFields = actionInActionSpace\n ? findAllMidsceneLocatorField(actionInActionSpace.paramSchema)\n : [];\n\n debug('locateFields', locateFields);\n\n locateFields.forEach((field) => {\n const locateResult = action.param[field];\n if (locateResult && modelFamily !== undefined) {\n // Always use model family to fill bbox parameters\n action.param[field] = fillBboxParam(\n locateResult,\n imageWidth,\n imageHeight,\n modelFamily,\n );\n }\n });\n });\n\n // Update sub-goals in conversation history based on response (only when deepThink is enabled)\n if (includeSubGoals) {\n if (planFromAI.updateSubGoals?.length) {\n conversationHistory.setSubGoals(planFromAI.updateSubGoals);\n }\n if (planFromAI.markFinishedIndexes?.length) {\n for (const index of planFromAI.markFinishedIndexes) {\n conversationHistory.markSubGoalFinished(index);\n }\n }\n }\n\n // Append note to conversation history if present\n if (planFromAI.note) {\n conversationHistory.appendNote(planFromAI.note);\n }\n\n conversationHistory.append({\n role: 'assistant',\n content: [\n {\n type: 'text',\n text: rawResponse,\n },\n ],\n });\n\n return returnValue;\n}\n"],"names":["debug","getDebug","parseXMLPlanningResponse","xmlString","modelFamily","thought","extractXMLTag","note","log","error","actionType","actionParamStr","completeGoalRegex","completeGoalMatch","finalizeMessage","finalizeSuccess","undefined","updatePlanContent","markSubGoalDone","updateSubGoals","parseSubGoalsFromXML","markFinishedIndexes","parseMarkFinishedIndexes","action","type","param","safeParseJson","e","Error","plan","userInstruction","opts","context","modelConfig","conversationHistory","size","screenshotBase64","includeSubGoals","systemPrompt","systemPromptToTaskPlanning","imagePayload","imageWidth","imageHeight","paddedResult","paddingToMatchBlockByBase64","actionContext","instruction","latestFeedbackMessage","subGoalsText","subGoalsSection","notesText","notesSection","historyLog","msgs","rawResponse","usage","reasoning_content","callAI","planFromAI","parseError","errorMessage","String","AIResponseParseError","console","actions","shouldContinuePlanning","returnValue","buildYamlFlowFromPlans","assert","actionInActionSpace","locateFields","findAllMidsceneLocatorField","field","locateResult","fillBboxParam","index"],"mappings":";;;;;;;AA+BA,MAAMA,QAAQC,SAAS;AAKhB,SAASC,yBACdC,SAAiB,EACjBC,WAAqC;IAErC,MAAMC,UAAUC,cAAcH,WAAW;IACzC,MAAMI,OAAOD,cAAcH,WAAW;IACtC,MAAMK,MAAMF,cAAcH,WAAW,UAAU;IAC/C,MAAMM,QAAQH,cAAcH,WAAW;IACvC,MAAMO,aAAaJ,cAAcH,WAAW;IAC5C,MAAMQ,iBAAiBL,cAAcH,WAAW;IAGhD,MAAMS,oBACJ;IACF,MAAMC,oBAAoBV,UAAU,KAAK,CAACS;IAC1C,IAAIE;IACJ,IAAIC;IAEJ,IAAIF,mBAAmB;QACrBE,kBAAkBF,AAAyB,WAAzBA,iBAAiB,CAAC,EAAE;QACtCC,kBAAkBD,iBAAiB,CAAC,EAAE,EAAE,UAAUG;IACpD;IAGA,MAAMC,oBAAoBX,cAAcH,WAAW;IACnD,MAAMe,kBAAkBZ,cAAcH,WAAW;IAEjD,MAAMgB,iBAAiBF,oBACnBG,qBAAqBH,qBACrBD;IACJ,MAAMK,sBAAsBH,kBACxBI,yBAAyBJ,mBACzBF;IAGJ,IAAIO,SAAc;IAClB,IAAIb,cAAcA,AAA6B,WAA7BA,WAAW,WAAW,IAAe;QACrD,MAAMc,OAAOd,WAAW,IAAI;QAC5B,IAAIe;QAEJ,IAAId,gBACF,IAAI;YAEFc,QAAQC,cAAcf,gBAAgBP;QACxC,EAAE,OAAOuB,GAAG;YACV,MAAM,IAAIC,MAAM,CAAC,mCAAmC,EAAED,GAAG;QAC3D;QAGFJ,SAAS;YACPC;YACA,GAAIC,AAAUT,WAAVS,QAAsB;gBAAEA;YAAM,IAAI,CAAC,CAAC;QAC1C;IACF;IAEA,OAAO;QACL,GAAIpB,UAAU;YAAEA;QAAQ,IAAI,CAAC,CAAC;QAC9B,GAAIE,OAAO;YAAEA;QAAK,IAAI,CAAC,CAAC;QACxBC;QACA,GAAIC,QAAQ;YAAEA;QAAM,IAAI,CAAC,CAAC;QAC1Bc;QACA,GAAIT,AAAoBE,WAApBF,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAIC,AAAoBC,WAApBD,kBAAgC;YAAEA;QAAgB,IAAI,CAAC,CAAC;QAC5D,GAAII,gBAAgB,SAAS;YAAEA;QAAe,IAAI,CAAC,CAAC;QACpD,GAAIE,qBAAqB,SAAS;YAAEA;QAAoB,IAAI,CAAC,CAAC;IAChE;AACF;AAEO,eAAeQ,KACpBC,eAAuB,EACvBC,IAUC;IAED,MAAM,EAAEC,OAAO,EAAEC,WAAW,EAAEC,mBAAmB,EAAE,GAAGH;IACtD,MAAM,EAAEI,IAAI,EAAE,GAAGH;IACjB,MAAMI,mBAAmBJ,QAAQ,UAAU,CAAC,MAAM;IAElD,MAAM,EAAE5B,WAAW,EAAE,GAAG6B;IAGxB,MAAMI,kBAAkBN,AAAmB,SAAnBA,KAAK,SAAS;IAEtC,MAAMO,eAAe,MAAMC,2BAA2B;QACpD,aAAaR,KAAK,WAAW;QAC7B3B;QACA,aAAa2B,KAAK,WAAW;QAC7B,gBAAgB;QAChBM;IACF;IAEA,IAAIG,eAAeJ;IACnB,IAAIK,aAAaN,KAAK,KAAK;IAC3B,IAAIO,cAAcP,KAAK,MAAM;IAK7B,IAAI/B,AAAgB,iBAAhBA,aAA8B;QAChC,MAAMuC,eAAe,MAAMC,4BAA4BJ;QACvDC,aAAaE,aAAa,KAAK;QAC/BD,cAAcC,aAAa,MAAM;QACjCH,eAAeG,aAAa,WAAW;IACzC;IAEA,MAAME,gBAAgBd,KAAK,aAAa,GACpC,CAAC,yBAAyB,EAAEA,KAAK,aAAa,CAAC,4BAA4B,CAAC,GAC5E;IAEJ,MAAMe,cAA4C;QAChD;YACE,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGD,cAAc,kBAAkB,EAAEf,gBAAgB,mBAAmB,CAAC;gBACjF;aACD;QACH;KACD;IAED,IAAIiB;IAGJ,MAAMC,eAAeX,kBACjBH,oBAAoB,cAAc,KAClC;IACJ,MAAMe,kBAAkBD,eAAe,CAAC,IAAI,EAAEA,cAAc,GAAG;IAG/D,MAAME,YAAYhB,oBAAoB,WAAW;IACjD,MAAMiB,eAAeD,YAAY,CAAC,IAAI,EAAEA,WAAW,GAAG;IAEtD,IAAIhB,oBAAoB,sBAAsB,EAAE;QAC9Ca,wBAAwB;YACtB,MAAM;YACN,SAAS;gBACP;oBACE,MAAM;oBACN,MAAM,GAAGb,oBAAoB,sBAAsB,CAAC,gFAAgF,EAAEiB,eAAeF,iBAAiB;gBACxK;gBACA;oBACE,MAAM;oBACN,WAAW;wBACT,KAAKT;wBACL,QAAQ;oBACV;gBACF;aACD;QACH;QAEAN,oBAAoB,mCAAmC;IACzD,OACEa,wBAAwB;QACtB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAM,CAAC,6BAA6B,EAAEI,eAAeF,iBAAiB;YACxE;YACA;gBACE,MAAM;gBACN,WAAW;oBACT,KAAKT;oBACL,QAAQ;gBACV;YACF;SACD;IACH;IAEFN,oBAAoB,MAAM,CAACa;IAG3Bb,oBAAoB,eAAe,CAAC,IAAI;IAExC,MAAMkB,aAAalB,oBAAoB,QAAQ,CAACH,KAAK,kBAAkB;IAEvE,MAAMsB,OAAqC;QACzC;YAAE,MAAM;YAAU,SAASf;QAAa;WACrCQ;WACAM;KACJ;IAED,MAAM,EACJ,SAASE,WAAW,EACpBC,KAAK,EACLC,iBAAiB,EAClB,GAAG,MAAMC,OAAOJ,MAAMpB,aAAa;QAClC,WAAWF,AAAmB,YAAnBA,KAAK,SAAS,GAAef,SAAYe,KAAK,SAAS;IACpE;IAGA,IAAI2B;IACJ,IAAI;QACFA,aAAaxD,yBAAyBoD,aAAalD;IACrD,EAAE,OAAOuD,YAAY;QAEnB,MAAMC,eACJD,sBAAsB/B,QAAQ+B,WAAW,OAAO,GAAGE,OAAOF;QAC5D,MAAM,IAAIG,qBACR,CAAC,iBAAiB,EAAEF,cAAc,EAClCN,aACAC;IAEJ;IAEA,IAAIG,WAAW,MAAM,IAAIA,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;QACjEK,QAAQ,IAAI,CACV;QAEFL,WAAW,eAAe,GAAG1C;QAC7B0C,WAAW,eAAe,GAAG1C;IAC/B;IAEA,MAAMgD,UAAUN,WAAW,MAAM,GAAG;QAACA,WAAW,MAAM;KAAC,GAAG,EAAE;IAC5D,IAAIO,yBAAyB;IAG7B,IAAIP,AAA+B1C,WAA/B0C,WAAW,eAAe,EAAgB;QAC5C1D,MAAM;QACNiE,yBAAyB;QAEzB,IAAI5B,iBACFH,oBAAoB,uBAAuB;IAE/C;IAEA,MAAMgC,cAAkC;QACtC,GAAGR,UAAU;QACbM;QACAV;QACAC;QACAC;QACA,UAAUW,uBAAuBH,SAASjC,KAAK,WAAW;QAC1DkC;IACF;IAEAG,OAAOV,YAAY;IAEnBM,QAAQ,OAAO,CAAC,CAACzC;QACf,MAAMC,OAAOD,OAAO,IAAI;QACxB,MAAM8C,sBAAsBtC,KAAK,WAAW,CAAC,IAAI,CAC/C,CAACR,SAAWA,OAAO,IAAI,KAAKC;QAG9BxB,MAAM,+BAA+BqE;QACrC,MAAMC,eAAeD,sBACjBE,4BAA4BF,oBAAoB,WAAW,IAC3D,EAAE;QAENrE,MAAM,gBAAgBsE;QAEtBA,aAAa,OAAO,CAAC,CAACE;YACpB,MAAMC,eAAelD,OAAO,KAAK,CAACiD,MAAM;YACxC,IAAIC,gBAAgBrE,AAAgBY,WAAhBZ,aAElBmB,OAAO,KAAK,CAACiD,MAAM,GAAGE,cACpBD,cACAhC,YACAC,aACAtC;QAGN;IACF;IAGA,IAAIiC,iBAAiB;QACnB,IAAIqB,WAAW,cAAc,EAAE,QAC7BxB,oBAAoB,WAAW,CAACwB,WAAW,cAAc;QAE3D,IAAIA,WAAW,mBAAmB,EAAE,QAClC,KAAK,MAAMiB,SAASjB,WAAW,mBAAmB,CAChDxB,oBAAoB,mBAAmB,CAACyC;IAG9C;IAGA,IAAIjB,WAAW,IAAI,EACjBxB,oBAAoB,UAAU,CAACwB,WAAW,IAAI;IAGhDxB,oBAAoB,MAAM,CAAC;QACzB,MAAM;QACN,SAAS;YACP;gBACE,MAAM;gBACN,MAAMoB;YACR;SACD;IACH;IAEA,OAAOY;AACT"}
@@ -160,8 +160,11 @@ Don't use this tag if no information needs to be preserved.
160
160
  Based on the current screenshot${shouldIncludeSubGoals ? ' and the status of all sub-goals' : ''}, determine if the entire task is completed.
161
161
 
162
162
  - Use the <complete-goal success="true|false">message</complete-goal> tag to output the result if the goal is accomplished or failed.
163
+ - IMPORTANT: The "goal" means strictly following the user's instruction - nothing more, nothing less. For example, if the user says "fill out the form", you should mark success="true" once all form fields are filled, do NOT submit the form. If the user says "click the login button", you should mark success="true" once the button is clicked, do NOT wait for the page to load or verify login success unless explicitly asked.
164
+
163
165
  - the 'success' attribute is required. It means whether the expected goal is accomplished based on what you observe in the current screenshot. No matter what actions were executed or what errors occurred during execution, if the expected goal is accomplished, set success="true". If the expected goal is not accomplished and cannot be accomplished, set success="false".
164
166
  - the 'message' is the information that will be provided to the user. If the user asks for a specific format, strictly follow that.
167
+ - If the user's instruction includes an assertion (e.g., "verify that...", "check that...", "assert..."), and you observe from the screenshot that the assertion condition is NOT satisfied and cannot be satisfied, you should use <complete-goal success="false">reason</complete-goal> to indicate the checkpoint failed, and explain why the assertion failed in the message.
165
168
  - If you output <complete-goal>, do NOT output <action-type> or <action-param-json>. The task ends here.
166
169
  - If the task is NOT complete, skip this section and continue to Step ${actionStepNumber}.
167
170
 
@@ -198,8 +201,6 @@ The <log> tag is a brief preamble message to the user explaining what you're abo
198
201
 
199
202
  - Use the <action-type> and <action-param-json> tags to output the action to be executed.
200
203
  - The <action-type> MUST be one of the supporting actions. 'complete-goal' is NOT a valid action-type.
201
- - Use <action-type>Print_Assert_Result</action-type> if the user clearly asks for an assertion.
202
-
203
204
  For example:
204
205
  <action-type>Tap</action-type>
205
206
  <action-param-json>
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TModelFamily } from '@midscene/shared/env';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\nconst vlLocateParam = (modelFamily: TModelFamily | undefined) => {\n if (modelFamily) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(modelFamily)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n modelFamily,\n includeBbox,\n includeThought,\n includeSubGoals,\n}: {\n actionSpace: DeviceAction<any>[];\n modelFamily: TModelFamily | undefined;\n includeBbox: boolean;\n includeThought?: boolean;\n includeSubGoals?: boolean;\n}) {\n const preferredLanguage = getPreferredLanguage();\n\n // Validate parameters: if includeBbox is true, modelFamily must be defined\n if (includeBbox && !modelFamily) {\n throw new Error(\n 'modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? modelFamily : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const shouldIncludeThought = includeThought ?? true;\n const shouldIncludeSubGoals = includeSubGoals ?? false;\n\n // Generate locate object examples based on includeBbox\n const locateExample1 = includeBbox\n ? `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\",\n \"bbox\": [345, 442, 458, 483]\n }`\n : `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\"\n }`;\n\n const thoughtTag = (content: string) =>\n shouldIncludeThought ? `<thought>${content}</thought>\\n` : '';\n\n // Sub-goals related content - only included when shouldIncludeSubGoals is true\n const step1Title = shouldIncludeSubGoals\n ? '## Step 1: Observe and Plan (related tags: <thought>, <update-plan-content>, <mark-sub-goal-done>)'\n : '## Step 1: Observe (related tags: <thought>)';\n\n const step1Description = shouldIncludeSubGoals\n ? \"First, observe the current screenshot and previous logs, then break down the user's instruction into multiple high-level sub-goals. Update the status of sub-goals based on what you see in the current screenshot.\"\n : 'First, observe the current screenshot and previous logs to understand the current state.';\n\n const thoughtTagDescription = shouldIncludeSubGoals\n ? \"Include your thought process in the <thought> tag. It should answer: What is the user's requirement? What is the current state based on the screenshot? Are all sub-goals completed? If not, what should be the next action? Write your thoughts naturally without numbering or section headers.\"\n : 'Include your thought process in the <thought> tag. It should answer: What is the current state based on the screenshot? What should be the next action? Write your thoughts naturally without numbering or section headers.';\n\n const subGoalTags = shouldIncludeSubGoals\n ? `\n\n* <update-plan-content> tag\n\nUse this structure to give or update your plan:\n\n<update-plan-content>\n <sub-goal index=\"1\" status=\"finished|pending\">sub goal description</sub-goal>\n <sub-goal index=\"2\" status=\"finished|pending\">sub goal description</sub-goal>\n ...\n</update-plan-content>\n\n* <mark-sub-goal-done> tag\n\nUse this structure to mark a sub-goal as done:\n\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n\nIMPORTANT: You MUST only mark a sub-goal as \"finished\" AFTER you have confirmed the task is actually completed by observing the result in the screenshot. Do NOT mark a sub-goal as done just because you expect the next action will complete it. Wait until you see visual confirmation in the screenshot that the sub-goal has been achieved.\n\n* Note\n\nDuring execution, you can call <update-plan-content> at any time to update the plan based on the latest screenshot and completed sub-goals.\n\n### Example\n\nIf the user wants to \"log in to a system using username and password, complete all to-do items, and submit a registration form\", you can break it down into the following sub-goals:\n\n<thought>...</thought>\n<update-plan-content>\n <sub-goal index=\"1\" status=\"pending\">Log in to the system</sub-goal>\n <sub-goal index=\"2\" status=\"pending\">Complete all to-do items</sub-goal>\n <sub-goal index=\"3\" status=\"pending\">Submit the registration form</sub-goal>\n</update-plan-content>\n\nAfter logging in and seeing the to-do items, you can mark the sub-goal as done:\n\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n\nAt this point, the status of all sub-goals is:\n\n<update-plan-content>\n <sub-goal index=\"1\" status=\"finished\" />\n <sub-goal index=\"2\" status=\"pending\" />\n <sub-goal index=\"3\" status=\"pending\" />\n</update-plan-content>\n\nAfter some time, when the last sub-goal is also completed, you can mark it as done as well:\n\n<mark-sub-goal-done>\n <sub-goal index=\"3\" status=\"finished\" />\n</mark-sub-goal-done>`\n : '';\n\n // Step numbering adjusts based on whether sub-goals are included\n const noteStepNumber = shouldIncludeSubGoals ? 2 : 2;\n const checkGoalStepNumber = shouldIncludeSubGoals ? 3 : 3;\n const actionStepNumber = shouldIncludeSubGoals ? 4 : 4;\n\n return `\nTarget: You are an expert to manipulate the UI to accomplish the user's instruction. User will give you an instruction, some screenshots, background knowledge and previous logs indicating what have been done. Your task is to accomplish the instruction by thinking through the path to complete the task and give the next action to execute.\n\n${step1Title}\n\n${step1Description}\n\n* <thought> tag\n\n${thoughtTagDescription}\n${subGoalTags}\n\n## Step ${noteStepNumber}: Note Data from Current Screenshot (related tags: <note>)\n\nWhile observing the current screenshot, if you notice any information that might be needed in follow-up actions, record it here. The current screenshot will NOT be available in subsequent steps, so this note is your only way to preserve essential information. Examples: extracted data, element states, content that needs to be referenced.\n\nDon't use this tag if no information needs to be preserved.\n\n## Step ${checkGoalStepNumber}: Check if Goal is Accomplished (related tags: <complete-goal>)\n\nBased on the current screenshot${shouldIncludeSubGoals ? ' and the status of all sub-goals' : ''}, determine if the entire task is completed.\n\n- Use the <complete-goal success=\"true|false\">message</complete-goal> tag to output the result if the goal is accomplished or failed.\n - the 'success' attribute is required. It means whether the expected goal is accomplished based on what you observe in the current screenshot. No matter what actions were executed or what errors occurred during execution, if the expected goal is accomplished, set success=\"true\". If the expected goal is not accomplished and cannot be accomplished, set success=\"false\".\n - the 'message' is the information that will be provided to the user. If the user asks for a specific format, strictly follow that.\n- If you output <complete-goal>, do NOT output <action-type> or <action-param-json>. The task ends here.\n- If the task is NOT complete, skip this section and continue to Step ${actionStepNumber}.\n\n## Step ${actionStepNumber}: Determine Next Action (related tags: <log>, <action-type>, <action-param-json>, <error>)\n\nONLY if the task is not complete: Think what the next action is according to the current screenshot${shouldIncludeSubGoals ? ' and the plan' : ''}.\n\n- Don't give extra actions or plans beyond the instruction or the plan. For example, don't try to submit the form if the instruction is only to fill something.\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully. Otherwise, retry or do something else to recover.\n- Give just the next ONE action you should do (if any)\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 3 times, you should think this is an error and set the \"error\" field to the error message.\n\n### Supporting actions list\n\n${actionList}\n\n### Log to give user feedback (preamble message)\n\nThe <log> tag is a brief preamble message to the user explaining what you're about to do. It should follow these principles and examples:\n\n- **Use ${preferredLanguage}**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- <log>Click the login button</log>\n- <log>Scroll to find the 'Yes' button in popup</log>\n- <log>Previous actions failed to find the 'Yes' button, i will try again</log>\n- <log>Go back to find the login button</log>\n\n### If there is some action to do ...\n\n- Use the <action-type> and <action-param-json> tags to output the action to be executed.\n- The <action-type> MUST be one of the supporting actions. 'complete-goal' is NOT a valid action-type.\n- Use <action-type>Print_Assert_Result</action-type> if the user clearly asks for an assertion.\n\nFor example:\n<action-type>Tap</action-type>\n<action-param-json>\n{\n \"locate\": ${locateExample1}\n}\n</action-param-json>\n\n### If you think there is an error ...\n\n- Use the <error> tag to output the error message.\n\nFor example:\n<error>Unable to find the required element on the page</error>\n\n### If there is no action to do ...\n\n- Don't output <action-type> or <action-param-json> if there is no action to do.\n\n## Return Format\n\nReturn in XML format following this decision flow:\n\n**Always include:**\n<!-- Step 1: Observe${shouldIncludeSubGoals ? ' and Plan' : ''} -->\n<thought>...</thought>\n${\n shouldIncludeSubGoals\n ? `\n<!-- required when no update-plan-content is provided in the previous response -->\n<update-plan-content>...</update-plan-content>\n\n<!-- required when any sub-goal is completed -->\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n`\n : ''\n}\n<!-- Step ${noteStepNumber}: Note data from current screenshot if needed -->\n<note>...</note>\n\n**Then choose ONE of the following paths:**\n\n**Path A: If the goal is accomplished or failed (Step ${checkGoalStepNumber})**\n<complete-goal success=\"true|false\">...</complete-goal>\n\n**Path B: If the goal is NOT complete yet (Step ${actionStepNumber})**\n<!-- Determine next action -->\n<log>...</log>\n<action-type>...</action-type>\n<action-param-json>...</action-param-json>\n\n<!-- OR if there's an error -->\n<error>...</error>\n`;\n}\n"],"names":["vlLocateParam","modelFamily","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","includeThought","includeSubGoals","preferredLanguage","getPreferredLanguage","Error","actionDescriptionList","actionList","shouldIncludeSubGoals","locateExample1","step1Title","step1Description","thoughtTagDescription","subGoalTags","noteStepNumber","checkGoalStepNumber","actionStepNumber"],"mappings":";;;AAUA,MAAMA,gBAAgB,CAACC;IACrB,IAAIA,aACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,cAAc;IAEvG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKd,MAAM,IAAIe,OAAO,OAAO,CAACF,OACxC,IAAIb,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMgB,aACJ,AACE,cADF,OAAQhB,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMiB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMI,WAAWC,eAAenB,OAAOO;gBAGvC,MAAMa,cAAcC,kBAAkBrB;gBAGtC,MAAMsB,eAAevB,iBAAiBC;gBACtC,MAAMuB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3ChB,WAAW,IAAI,CAACe;YAClB;YAIF,IAAIf,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACmB;oBAClBpB,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEoB,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,eAAeR;YAChC,MAAMS,cAAcC,kBAAkBV;YAGtC,IAAImB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBrB,OAAO,IAAI,CAACqB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAExB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeuB,2BAA2B,EAC/CC,WAAW,EACXnC,WAAW,EACXoC,WAAW,EACXC,cAAc,EACdC,eAAe,EAOhB;IACC,MAAMC,oBAAoBC;IAG1B,IAAIJ,eAAe,CAACpC,aAClB,MAAM,IAAIyC,MACR;IAIJ,MAAMC,wBAAwBP,YAAY,GAAG,CAAC,CAAC1B,SACtCD,qBACLC,QACAV,cAAcqC,cAAcpC,cAAc2B;IAG9C,MAAMgB,aAAaD,sBAAsB,IAAI,CAAC;IAG9C,MAAME,wBAAwBN,mBAAmB;IAGjD,MAAMO,iBAAiBT,cACnB,CAAC;;;GAGJ,CAAC,GACE,CAAC;;GAEJ,CAAC;IAMF,MAAMU,aAAaF,wBACf,uGACA;IAEJ,MAAMG,mBAAmBH,wBACrB,wNACA;IAEJ,MAAMI,wBAAwBJ,wBAC1B,qSACA;IAEJ,MAAMK,cAAcL,wBAChB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qBAuDc,CAAC,GAChB;IAGJ,MAAMM,iBAAiBN,wBAAwB,IAAI;IACnD,MAAMO,sBAAsBP,wBAAwB,IAAI;IACxD,MAAMQ,mBAAmBR,wBAAwB,IAAI;IAErD,OAAO,CAAC;;;AAGV,EAAEE,WAAW;;AAEb,EAAEC,iBAAiB;;;;AAInB,EAAEC,sBAAsB;AACxB,EAAEC,YAAY;;QAEN,EAAEC,eAAe;;;;;;QAMjB,EAAEC,oBAAoB;;+BAEC,EAAEP,wBAAwB,qCAAqC,GAAG;;;;;;sEAM3B,EAAEQ,iBAAiB;;QAEjF,EAAEA,iBAAiB;;mGAEwE,EAAER,wBAAwB,kBAAkB,GAAG;;;;;;;;;;AAUlJ,EAAED,WAAW;;;;;;QAML,EAAEJ,kBAAkB;;;;;;;;;;;;;;;;;;;;;YAqBhB,EAAEM,eAAe;;;;;;;;;;;;;;;;;;;;oBAoBT,EAAED,wBAAwB,cAAc,GAAG;;AAE/D,EACEA,wBACI,CAAC;;;;;;;;AAQP,CAAC,GACK,GACL;UACS,EAAEM,eAAe;;;;;sDAK2B,EAAEC,oBAAoB;;;gDAG5B,EAAEC,iBAAiB;;;;;;;;AAQnE,CAAC;AACD"}
1
+ {"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TModelFamily } from '@midscene/shared/env';\nimport { getPreferredLanguage } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\nconst vlLocateParam = (modelFamily: TModelFamily | undefined) => {\n if (modelFamily) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(modelFamily)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\n/**\n * Find ZodDefault in the wrapper chain and return its default value\n */\nconst findDefaultValue = (field: unknown): any | undefined => {\n let current = field;\n const visited = new Set<unknown>();\n\n while (current && !visited.has(current)) {\n visited.add(current);\n const currentWithDef = current as {\n _def?: {\n typeName?: string;\n defaultValue?: () => any;\n innerType?: unknown;\n };\n };\n\n if (!currentWithDef._def?.typeName) break;\n\n if (currentWithDef._def.typeName === 'ZodDefault') {\n return currentWithDef._def.defaultValue?.();\n }\n\n // Continue unwrapping if it's a wrapper type\n if (\n currentWithDef._def.typeName === 'ZodOptional' ||\n currentWithDef._def.typeName === 'ZodNullable'\n ) {\n current = currentWithDef._def.innerType;\n } else {\n break;\n }\n }\n\n return undefined;\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Check if field has a default value by searching the wrapper chain\n const defaultValue = findDefaultValue(field);\n const hasDefault = defaultValue !== undefined;\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n const comments: string[] = [];\n if (description) {\n comments.push(description);\n }\n if (hasDefault) {\n const defaultStr =\n typeof defaultValue === 'string'\n ? `\"${defaultValue}\"`\n : JSON.stringify(defaultValue);\n comments.push(`default: ${defaultStr}`);\n }\n if (comments.length > 0) {\n paramLine += ` // ${comments.join(', ')}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n modelFamily,\n includeBbox,\n includeThought,\n includeSubGoals,\n}: {\n actionSpace: DeviceAction<any>[];\n modelFamily: TModelFamily | undefined;\n includeBbox: boolean;\n includeThought?: boolean;\n includeSubGoals?: boolean;\n}) {\n const preferredLanguage = getPreferredLanguage();\n\n // Validate parameters: if includeBbox is true, modelFamily must be defined\n if (includeBbox && !modelFamily) {\n throw new Error(\n 'modelFamily cannot be undefined when includeBbox is true. A valid modelFamily is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? modelFamily : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const shouldIncludeThought = includeThought ?? true;\n const shouldIncludeSubGoals = includeSubGoals ?? false;\n\n // Generate locate object examples based on includeBbox\n const locateExample1 = includeBbox\n ? `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\",\n \"bbox\": [345, 442, 458, 483]\n }`\n : `{\n \"prompt\": \"Add to cart button for Sauce Labs Backpack\"\n }`;\n\n const thoughtTag = (content: string) =>\n shouldIncludeThought ? `<thought>${content}</thought>\\n` : '';\n\n // Sub-goals related content - only included when shouldIncludeSubGoals is true\n const step1Title = shouldIncludeSubGoals\n ? '## Step 1: Observe and Plan (related tags: <thought>, <update-plan-content>, <mark-sub-goal-done>)'\n : '## Step 1: Observe (related tags: <thought>)';\n\n const step1Description = shouldIncludeSubGoals\n ? \"First, observe the current screenshot and previous logs, then break down the user's instruction into multiple high-level sub-goals. Update the status of sub-goals based on what you see in the current screenshot.\"\n : 'First, observe the current screenshot and previous logs to understand the current state.';\n\n const thoughtTagDescription = shouldIncludeSubGoals\n ? \"Include your thought process in the <thought> tag. It should answer: What is the user's requirement? What is the current state based on the screenshot? Are all sub-goals completed? If not, what should be the next action? Write your thoughts naturally without numbering or section headers.\"\n : 'Include your thought process in the <thought> tag. It should answer: What is the current state based on the screenshot? What should be the next action? Write your thoughts naturally without numbering or section headers.';\n\n const subGoalTags = shouldIncludeSubGoals\n ? `\n\n* <update-plan-content> tag\n\nUse this structure to give or update your plan:\n\n<update-plan-content>\n <sub-goal index=\"1\" status=\"finished|pending\">sub goal description</sub-goal>\n <sub-goal index=\"2\" status=\"finished|pending\">sub goal description</sub-goal>\n ...\n</update-plan-content>\n\n* <mark-sub-goal-done> tag\n\nUse this structure to mark a sub-goal as done:\n\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n\nIMPORTANT: You MUST only mark a sub-goal as \"finished\" AFTER you have confirmed the task is actually completed by observing the result in the screenshot. Do NOT mark a sub-goal as done just because you expect the next action will complete it. Wait until you see visual confirmation in the screenshot that the sub-goal has been achieved.\n\n* Note\n\nDuring execution, you can call <update-plan-content> at any time to update the plan based on the latest screenshot and completed sub-goals.\n\n### Example\n\nIf the user wants to \"log in to a system using username and password, complete all to-do items, and submit a registration form\", you can break it down into the following sub-goals:\n\n<thought>...</thought>\n<update-plan-content>\n <sub-goal index=\"1\" status=\"pending\">Log in to the system</sub-goal>\n <sub-goal index=\"2\" status=\"pending\">Complete all to-do items</sub-goal>\n <sub-goal index=\"3\" status=\"pending\">Submit the registration form</sub-goal>\n</update-plan-content>\n\nAfter logging in and seeing the to-do items, you can mark the sub-goal as done:\n\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n\nAt this point, the status of all sub-goals is:\n\n<update-plan-content>\n <sub-goal index=\"1\" status=\"finished\" />\n <sub-goal index=\"2\" status=\"pending\" />\n <sub-goal index=\"3\" status=\"pending\" />\n</update-plan-content>\n\nAfter some time, when the last sub-goal is also completed, you can mark it as done as well:\n\n<mark-sub-goal-done>\n <sub-goal index=\"3\" status=\"finished\" />\n</mark-sub-goal-done>`\n : '';\n\n // Step numbering adjusts based on whether sub-goals are included\n const noteStepNumber = shouldIncludeSubGoals ? 2 : 2;\n const checkGoalStepNumber = shouldIncludeSubGoals ? 3 : 3;\n const actionStepNumber = shouldIncludeSubGoals ? 4 : 4;\n\n return `\nTarget: You are an expert to manipulate the UI to accomplish the user's instruction. User will give you an instruction, some screenshots, background knowledge and previous logs indicating what have been done. Your task is to accomplish the instruction by thinking through the path to complete the task and give the next action to execute.\n\n${step1Title}\n\n${step1Description}\n\n* <thought> tag\n\n${thoughtTagDescription}\n${subGoalTags}\n\n## Step ${noteStepNumber}: Note Data from Current Screenshot (related tags: <note>)\n\nWhile observing the current screenshot, if you notice any information that might be needed in follow-up actions, record it here. The current screenshot will NOT be available in subsequent steps, so this note is your only way to preserve essential information. Examples: extracted data, element states, content that needs to be referenced.\n\nDon't use this tag if no information needs to be preserved.\n\n## Step ${checkGoalStepNumber}: Check if Goal is Accomplished (related tags: <complete-goal>)\n\nBased on the current screenshot${shouldIncludeSubGoals ? ' and the status of all sub-goals' : ''}, determine if the entire task is completed.\n\n- Use the <complete-goal success=\"true|false\">message</complete-goal> tag to output the result if the goal is accomplished or failed.\n - IMPORTANT: The \"goal\" means strictly following the user's instruction - nothing more, nothing less. For example, if the user says \"fill out the form\", you should mark success=\"true\" once all form fields are filled, do NOT submit the form. If the user says \"click the login button\", you should mark success=\"true\" once the button is clicked, do NOT wait for the page to load or verify login success unless explicitly asked.\n\n - the 'success' attribute is required. It means whether the expected goal is accomplished based on what you observe in the current screenshot. No matter what actions were executed or what errors occurred during execution, if the expected goal is accomplished, set success=\"true\". If the expected goal is not accomplished and cannot be accomplished, set success=\"false\".\n - the 'message' is the information that will be provided to the user. If the user asks for a specific format, strictly follow that.\n - If the user's instruction includes an assertion (e.g., \"verify that...\", \"check that...\", \"assert...\"), and you observe from the screenshot that the assertion condition is NOT satisfied and cannot be satisfied, you should use <complete-goal success=\"false\">reason</complete-goal> to indicate the checkpoint failed, and explain why the assertion failed in the message.\n- If you output <complete-goal>, do NOT output <action-type> or <action-param-json>. The task ends here.\n- If the task is NOT complete, skip this section and continue to Step ${actionStepNumber}.\n\n## Step ${actionStepNumber}: Determine Next Action (related tags: <log>, <action-type>, <action-param-json>, <error>)\n\nONLY if the task is not complete: Think what the next action is according to the current screenshot${shouldIncludeSubGoals ? ' and the plan' : ''}.\n\n- Don't give extra actions or plans beyond the instruction or the plan. For example, don't try to submit the form if the instruction is only to fill something.\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully. Otherwise, retry or do something else to recover.\n- Give just the next ONE action you should do (if any)\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 3 times, you should think this is an error and set the \"error\" field to the error message.\n\n### Supporting actions list\n\n${actionList}\n\n### Log to give user feedback (preamble message)\n\nThe <log> tag is a brief preamble message to the user explaining what you're about to do. It should follow these principles and examples:\n\n- **Use ${preferredLanguage}**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what's been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- <log>Click the login button</log>\n- <log>Scroll to find the 'Yes' button in popup</log>\n- <log>Previous actions failed to find the 'Yes' button, i will try again</log>\n- <log>Go back to find the login button</log>\n\n### If there is some action to do ...\n\n- Use the <action-type> and <action-param-json> tags to output the action to be executed.\n- The <action-type> MUST be one of the supporting actions. 'complete-goal' is NOT a valid action-type.\nFor example:\n<action-type>Tap</action-type>\n<action-param-json>\n{\n \"locate\": ${locateExample1}\n}\n</action-param-json>\n\n### If you think there is an error ...\n\n- Use the <error> tag to output the error message.\n\nFor example:\n<error>Unable to find the required element on the page</error>\n\n### If there is no action to do ...\n\n- Don't output <action-type> or <action-param-json> if there is no action to do.\n\n## Return Format\n\nReturn in XML format following this decision flow:\n\n**Always include:**\n<!-- Step 1: Observe${shouldIncludeSubGoals ? ' and Plan' : ''} -->\n<thought>...</thought>\n${\n shouldIncludeSubGoals\n ? `\n<!-- required when no update-plan-content is provided in the previous response -->\n<update-plan-content>...</update-plan-content>\n\n<!-- required when any sub-goal is completed -->\n<mark-sub-goal-done>\n <sub-goal index=\"1\" status=\"finished\" />\n</mark-sub-goal-done>\n`\n : ''\n}\n<!-- Step ${noteStepNumber}: Note data from current screenshot if needed -->\n<note>...</note>\n\n**Then choose ONE of the following paths:**\n\n**Path A: If the goal is accomplished or failed (Step ${checkGoalStepNumber})**\n<complete-goal success=\"true|false\">...</complete-goal>\n\n**Path B: If the goal is NOT complete yet (Step ${actionStepNumber})**\n<!-- Determine next action -->\n<log>...</log>\n<action-type>...</action-type>\n<action-param-json>...</action-param-json>\n\n<!-- OR if there's an error -->\n<error>...</error>\n`;\n}\n"],"names":["vlLocateParam","modelFamily","bboxDescription","findDefaultValue","field","current","visited","Set","currentWithDef","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","defaultValue","hasDefault","undefined","paramLine","comments","defaultStr","JSON","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","includeThought","includeSubGoals","preferredLanguage","getPreferredLanguage","Error","actionDescriptionList","actionList","shouldIncludeSubGoals","locateExample1","step1Title","step1Description","thoughtTagDescription","subGoalTags","noteStepNumber","checkGoalStepNumber","actionStepNumber"],"mappings":";;;AAUA,MAAMA,gBAAgB,CAACC;IACrB,IAAIA,aACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,cAAc;IAEvG,OAAO;AACT;AAKA,MAAME,mBAAmB,CAACC;IACxB,IAAIC,UAAUD;IACd,MAAME,UAAU,IAAIC;IAEpB,MAAOF,WAAW,CAACC,QAAQ,GAAG,CAACD,SAAU;QACvCC,QAAQ,GAAG,CAACD;QACZ,MAAMG,iBAAiBH;QAQvB,IAAI,CAACG,eAAe,IAAI,EAAE,UAAU;QAEpC,IAAIA,AAAiC,iBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAC9B,OAAOA,eAAe,IAAI,CAAC,YAAY;QAIzC,IACEA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,IAC5BA,AAAiC,kBAAjCA,eAAe,IAAI,CAAC,QAAQ,EAE5BH,UAAUG,eAAe,IAAI,CAAC,SAAS;aAEvC;IAEJ;AAGF;AAEO,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKd,MAAM,IAAIe,OAAO,OAAO,CAACF,OACxC,IAAIb,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAMgB,aACJ,AACE,cADF,OAAQhB,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMiB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMI,WAAWC,eAAenB,OAAOO;gBAGvC,MAAMa,cAAcC,kBAAkBrB;gBAGtC,MAAMsB,eAAevB,iBAAiBC;gBACtC,MAAMuB,aAAaD,AAAiBE,WAAjBF;gBAGnB,IAAIG,YAAY,GAAGR,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,MAAMQ,WAAqB,EAAE;gBAC7B,IAAIN,aACFM,SAAS,IAAI,CAACN;gBAEhB,IAAIG,YAAY;oBACd,MAAMI,aACJ,AAAwB,YAAxB,OAAOL,eACH,CAAC,CAAC,EAAEA,aAAa,CAAC,CAAC,GACnBM,KAAK,SAAS,CAACN;oBACrBI,SAAS,IAAI,CAAC,CAAC,SAAS,EAAEC,YAAY;gBACxC;gBACA,IAAID,SAAS,MAAM,GAAG,GACpBD,aAAa,CAAC,IAAI,EAAEC,SAAS,IAAI,CAAC,OAAO;gBAG3ChB,WAAW,IAAI,CAACe;YAClB;YAIF,IAAIf,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACmB;oBAClBpB,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEoB,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAMX,WAAWC,eAAeR;YAChC,MAAMS,cAAcC,kBAAkBV;YAGtC,IAAImB,mBAAmB,CAAC,SAAS,EAAEZ,UAAU;YAC7C,IAAIE,aACFU,oBAAoB,CAAC,IAAI,EAAEV,aAAa;YAE1CU,oBAAoB;YAEpBrB,OAAO,IAAI,CAACqB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAExB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAeuB,2BAA2B,EAC/CC,WAAW,EACXnC,WAAW,EACXoC,WAAW,EACXC,cAAc,EACdC,eAAe,EAOhB;IACC,MAAMC,oBAAoBC;IAG1B,IAAIJ,eAAe,CAACpC,aAClB,MAAM,IAAIyC,MACR;IAIJ,MAAMC,wBAAwBP,YAAY,GAAG,CAAC,CAAC1B,SACtCD,qBACLC,QACAV,cAAcqC,cAAcpC,cAAc2B;IAG9C,MAAMgB,aAAaD,sBAAsB,IAAI,CAAC;IAG9C,MAAME,wBAAwBN,mBAAmB;IAGjD,MAAMO,iBAAiBT,cACnB,CAAC;;;GAGJ,CAAC,GACE,CAAC;;GAEJ,CAAC;IAMF,MAAMU,aAAaF,wBACf,uGACA;IAEJ,MAAMG,mBAAmBH,wBACrB,wNACA;IAEJ,MAAMI,wBAAwBJ,wBAC1B,qSACA;IAEJ,MAAMK,cAAcL,wBAChB,CAAC;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;qBAuDc,CAAC,GAChB;IAGJ,MAAMM,iBAAiBN,wBAAwB,IAAI;IACnD,MAAMO,sBAAsBP,wBAAwB,IAAI;IACxD,MAAMQ,mBAAmBR,wBAAwB,IAAI;IAErD,OAAO,CAAC;;;AAGV,EAAEE,WAAW;;AAEb,EAAEC,iBAAiB;;;;AAInB,EAAEC,sBAAsB;AACxB,EAAEC,YAAY;;QAEN,EAAEC,eAAe;;;;;;QAMjB,EAAEC,oBAAoB;;+BAEC,EAAEP,wBAAwB,qCAAqC,GAAG;;;;;;;;;sEAS3B,EAAEQ,iBAAiB;;QAEjF,EAAEA,iBAAiB;;mGAEwE,EAAER,wBAAwB,kBAAkB,GAAG;;;;;;;;;;AAUlJ,EAAED,WAAW;;;;;;QAML,EAAEJ,kBAAkB;;;;;;;;;;;;;;;;;;;YAmBhB,EAAEM,eAAe;;;;;;;;;;;;;;;;;;;;oBAoBT,EAAED,wBAAwB,cAAc,GAAG;;AAE/D,EACEA,wBACI,CAAC;;;;;;;;AAQP,CAAC,GACK,GACL;UACS,EAAEM,eAAe;;;;;sDAK2B,EAAEC,oBAAoB;;;gDAG5B,EAAEC,iBAAiB;;;;;;;;AAQnE,CAAC;AACD"}