@midscene/core 0.26.7-beta-20250818035341.0 → 0.26.7-beta-20250820105545.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78) hide show
  1. package/dist/es/ai-model/action-executor.mjs +0 -8
  2. package/dist/es/ai-model/action-executor.mjs.map +1 -1
  3. package/dist/es/ai-model/common.mjs +73 -52
  4. package/dist/es/ai-model/common.mjs.map +1 -1
  5. package/dist/es/ai-model/index.mjs +3 -3
  6. package/dist/es/ai-model/inspect.mjs +29 -66
  7. package/dist/es/ai-model/inspect.mjs.map +1 -1
  8. package/dist/es/ai-model/llm-planning.mjs +27 -24
  9. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  10. package/dist/es/ai-model/prompt/assertion.mjs +1 -25
  11. package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
  12. package/dist/es/ai-model/prompt/llm-planning.mjs +50 -23
  13. package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
  14. package/dist/es/ai-model/prompt/playwright-generator.mjs +9 -3
  15. package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
  16. package/dist/es/ai-model/prompt/util.mjs +2 -2
  17. package/dist/es/ai-model/prompt/util.mjs.map +1 -1
  18. package/dist/es/ai-model/prompt/yaml-generator.mjs +9 -3
  19. package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
  20. package/dist/es/ai-model/service-caller/index.mjs +75 -118
  21. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  22. package/dist/es/ai-model/ui-tars-planning.mjs +5 -5
  23. package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
  24. package/dist/es/index.mjs +3 -2
  25. package/dist/es/index.mjs.map +1 -1
  26. package/dist/es/insight/index.mjs +14 -97
  27. package/dist/es/insight/index.mjs.map +1 -1
  28. package/dist/es/insight/utils.mjs +1 -3
  29. package/dist/es/insight/utils.mjs.map +1 -1
  30. package/dist/es/types.mjs.map +1 -1
  31. package/dist/es/utils.mjs +5 -6
  32. package/dist/es/utils.mjs.map +1 -1
  33. package/dist/lib/ai-model/action-executor.js +0 -8
  34. package/dist/lib/ai-model/action-executor.js.map +1 -1
  35. package/dist/lib/ai-model/common.js +97 -55
  36. package/dist/lib/ai-model/common.js.map +1 -1
  37. package/dist/lib/ai-model/index.js +16 -4
  38. package/dist/lib/ai-model/inspect.js +29 -69
  39. package/dist/lib/ai-model/inspect.js.map +1 -1
  40. package/dist/lib/ai-model/llm-planning.js +26 -23
  41. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  42. package/dist/lib/ai-model/prompt/assertion.js +2 -29
  43. package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
  44. package/dist/lib/ai-model/prompt/llm-planning.js +52 -25
  45. package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
  46. package/dist/lib/ai-model/prompt/playwright-generator.js +9 -3
  47. package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
  48. package/dist/lib/ai-model/prompt/util.js +2 -2
  49. package/dist/lib/ai-model/prompt/util.js.map +1 -1
  50. package/dist/lib/ai-model/prompt/yaml-generator.js +9 -3
  51. package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
  52. package/dist/lib/ai-model/service-caller/index.js +78 -124
  53. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  54. package/dist/lib/ai-model/ui-tars-planning.js +5 -5
  55. package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
  56. package/dist/lib/index.js +20 -7
  57. package/dist/lib/index.js.map +1 -1
  58. package/dist/lib/insight/index.js +10 -93
  59. package/dist/lib/insight/index.js.map +1 -1
  60. package/dist/lib/insight/utils.js +1 -3
  61. package/dist/lib/insight/utils.js.map +1 -1
  62. package/dist/lib/types.js.map +1 -1
  63. package/dist/lib/utils.js +4 -5
  64. package/dist/lib/utils.js.map +1 -1
  65. package/dist/types/ai-model/common.d.ts +162 -8
  66. package/dist/types/ai-model/index.d.ts +2 -1
  67. package/dist/types/ai-model/inspect.d.ts +3 -8
  68. package/dist/types/ai-model/llm-planning.d.ts +1 -1
  69. package/dist/types/ai-model/prompt/assertion.d.ts +0 -3
  70. package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -2
  71. package/dist/types/ai-model/prompt/util.d.ts +2 -1
  72. package/dist/types/ai-model/service-caller/index.d.ts +6 -6
  73. package/dist/types/ai-model/ui-tars-planning.d.ts +3 -1
  74. package/dist/types/index.d.ts +3 -1
  75. package/dist/types/insight/index.d.ts +1 -5
  76. package/dist/types/types.d.ts +11 -12
  77. package/dist/types/yaml.d.ts +7 -6
  78. package/package.json +4 -3
@@ -1,5 +1,4 @@
1
1
  import { getVersion } from "../utils.mjs";
2
- import { MIDSCENE_MODEL_NAME, getAIConfig, uiTarsModelVersion, vlLocateMode } from "@midscene/shared/env";
3
2
  import { assert } from "@midscene/shared/utils";
4
3
  function _define_property(obj, key, value) {
5
4
  if (key in obj) Object.defineProperty(obj, key, {
@@ -108,15 +107,8 @@ class Executor {
108
107
  return null;
109
108
  }
110
109
  dump() {
111
- let modelDescription = '';
112
- if (vlLocateMode()) {
113
- const uiTarsModelVer = uiTarsModelVersion();
114
- modelDescription = uiTarsModelVer ? `UI-TARS=${uiTarsModelVer}` : `${vlLocateMode()} mode`;
115
- }
116
110
  const dumpData = {
117
111
  sdkVersion: getVersion(),
118
- model_name: getAIConfig(MIDSCENE_MODEL_NAME) || '',
119
- model_description: modelDescription,
120
112
  logTime: Date.now(),
121
113
  name: this.name,
122
114
  tasks: this.tasks
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/action-executor.mjs","sources":["webpack://@midscene/core/./src/ai-model/action-executor.ts"],"sourcesContent":["import type {\n ExecutionDump,\n ExecutionTask,\n ExecutionTaskApply,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskProgressOptions,\n ExecutionTaskReturn,\n ExecutorContext,\n} from '@/types';\nimport { getVersion } from '@/utils';\nimport {\n MIDSCENE_MODEL_NAME,\n getAIConfig,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { assert } from '@midscene/shared/utils';\n\nexport class Executor {\n name: string;\n\n tasks: ExecutionTask[];\n\n // status of executor\n status: 'init' | 'pending' | 'running' | 'completed' | 'error';\n\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n\n constructor(\n name: string,\n options?: ExecutionTaskProgressOptions & {\n tasks?: ExecutionTaskApply[];\n },\n ) {\n this.status =\n options?.tasks && options.tasks.length > 0 ? 'pending' : 'init';\n this.name = name;\n this.tasks = (options?.tasks || []).map((item) =>\n this.markTaskAsPending(item),\n );\n this.onTaskStart = options?.onTaskStart;\n }\n\n private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {\n return {\n status: 'pending',\n ...task,\n };\n }\n\n async append(task: ExecutionTaskApply[] | ExecutionTaskApply): Promise<void> {\n assert(\n this.status !== 'error',\n `executor is in error state, cannot append task\\nerror=${this.latestErrorTask()?.error}\\n${this.latestErrorTask()?.errorStack}`,\n );\n if (Array.isArray(task)) {\n this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));\n } else {\n this.tasks.push(this.markTaskAsPending(task));\n }\n if (this.status !== 'running') {\n this.status = 'pending';\n }\n }\n\n async flush(): Promise<{ output: any; thought?: string } | undefined> {\n if (this.status === 'init' && this.tasks.length > 0) {\n console.warn(\n 'illegal state for executor, status is init but tasks are not empty',\n );\n }\n\n assert(this.status !== 'running', 'executor is already running');\n assert(this.status !== 'completed', 'executor is already completed');\n assert(this.status !== 'error', 'executor is in error state');\n\n const nextPendingIndex = this.tasks.findIndex(\n (task) => task.status === 'pending',\n );\n if (nextPendingIndex < 0) {\n // all tasks are completed\n return;\n }\n\n this.status = 'running';\n let taskIndex = nextPendingIndex;\n let successfullyCompleted = true;\n\n let previousFindOutput: ExecutionTaskInsightLocateOutput | undefined;\n\n while (taskIndex < this.tasks.length) {\n const task = this.tasks[taskIndex];\n assert(\n task.status === 'pending',\n `task status should be pending, but got: ${task.status}`,\n );\n task.timing = {\n start: Date.now(),\n };\n try {\n task.status = 'running';\n try {\n if (this.onTaskStart) {\n await this.onTaskStart(task);\n }\n } catch (e) {\n console.error('error in onTaskStart', e);\n }\n assert(\n ['Insight', 'Action', 'Planning'].indexOf(task.type) >= 0,\n `unsupported task type: ${task.type}`,\n );\n\n const { executor, param } = task;\n assert(executor, `executor is required for task type: ${task.type}`);\n\n let returnValue;\n const executorContext: ExecutorContext = {\n task,\n element: previousFindOutput?.element,\n };\n\n if (task.type === 'Insight') {\n assert(\n task.subType === 'Locate' ||\n task.subType === 'Query' ||\n task.subType === 'Assert' ||\n task.subType === 'Boolean' ||\n task.subType === 'Number' ||\n task.subType === 'String',\n `unsupported insight subType: ${task.subType}`,\n );\n returnValue = await task.executor(param, executorContext);\n if (task.subType === 'Locate') {\n previousFindOutput = (\n returnValue as ExecutionTaskReturn<ExecutionTaskInsightLocateOutput>\n )?.output;\n }\n } else if (task.type === 'Action' || task.type === 'Planning') {\n returnValue = await task.executor(param, executorContext);\n } else {\n console.warn(\n `unsupported task type: ${task.type}, will try to execute it directly`,\n );\n returnValue = await task.executor(param, executorContext);\n }\n\n Object.assign(task, returnValue);\n task.status = 'finished';\n task.timing.end = Date.now();\n task.timing.cost = task.timing.end - task.timing.start;\n taskIndex++;\n } catch (e: any) {\n successfullyCompleted = false;\n task.error = e;\n task.errorMessage =\n e?.message || (typeof e === 'string' ? e : 'error-without-message');\n task.errorStack = e.stack;\n\n task.status = 'failed';\n task.timing.end = Date.now();\n task.timing.cost = task.timing.end - task.timing.start;\n break;\n }\n }\n\n // set all remaining tasks as cancelled\n for (let i = taskIndex + 1; i < this.tasks.length; i++) {\n this.tasks[i].status = 'cancelled';\n }\n\n if (successfullyCompleted) {\n this.status = 'completed';\n } else {\n this.status = 'error';\n }\n\n if (this.tasks.length) {\n // return the last output\n const outputIndex = Math.min(taskIndex, this.tasks.length - 1);\n const { thought, output } = this.tasks[outputIndex];\n return {\n thought,\n output,\n };\n }\n }\n\n isInErrorState(): boolean {\n return this.status === 'error';\n }\n\n latestErrorTask(): ExecutionTask | null {\n if (this.status !== 'error') {\n return null;\n }\n const errorTaskIndex = this.tasks.findIndex(\n (task) => task.status === 'failed',\n );\n if (errorTaskIndex >= 0) {\n return this.tasks[errorTaskIndex];\n }\n return null;\n }\n\n dump(): ExecutionDump {\n let modelDescription = '';\n\n if (vlLocateMode()) {\n const uiTarsModelVer = uiTarsModelVersion();\n if (uiTarsModelVer) {\n modelDescription = `UI-TARS=${uiTarsModelVer}`;\n } else {\n modelDescription = `${vlLocateMode()} mode`;\n }\n }\n const dumpData: ExecutionDump = {\n sdkVersion: getVersion(),\n model_name: getAIConfig(MIDSCENE_MODEL_NAME) || '',\n model_description: modelDescription,\n logTime: Date.now(),\n name: this.name,\n tasks: this.tasks,\n };\n return dumpData;\n }\n}\n"],"names":["Executor","task","_this_latestErrorTask","_this_latestErrorTask1","assert","Array","item","console","nextPendingIndex","taskIndex","successfullyCompleted","previousFindOutput","Date","e","executor","param","returnValue","executorContext","Object","i","outputIndex","Math","thought","output","errorTaskIndex","modelDescription","vlLocateMode","uiTarsModelVer","uiTarsModelVersion","dumpData","getVersion","getAIConfig","MIDSCENE_MODEL_NAME","name","options"],"mappings":";;;;;;;;;;;;;AAkBO,MAAMA;IAyBH,kBAAkBC,IAAwB,EAAiB;QACjE,OAAO;YACL,QAAQ;YACR,GAAGA,IAAI;QACT;IACF;IAEA,MAAM,OAAOA,IAA+C,EAAiB;YAGhBC,uBAAkCC;QAF7FC,OACE,AAAgB,YAAhB,IAAI,CAAC,MAAM,EACX,CAAC,sDAAsD,EAAE,QAAAF,CAAAA,wBAAAA,IAAI,CAAC,eAAe,EAAC,IAArBA,KAAAA,IAAAA,sBAAwB,KAAK,CAAC,EAAE,EAAE,QAAAC,CAAAA,yBAAAA,IAAI,CAAC,eAAe,EAAC,IAArBA,KAAAA,IAAAA,uBAAwB,UAAU,EAAE;QAEjI,IAAIE,MAAM,OAAO,CAACJ,OAChB,IAAI,CAAC,KAAK,CAAC,IAAI,IAAIA,KAAK,GAAG,CAAC,CAACK,OAAS,IAAI,CAAC,iBAAiB,CAACA;aAE7D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAACL;QAEzC,IAAI,AAAgB,cAAhB,IAAI,CAAC,MAAM,EACb,IAAI,CAAC,MAAM,GAAG;IAElB;IAEA,MAAM,QAAgE;QACpE,IAAI,AAAgB,WAAhB,IAAI,CAAC,MAAM,IAAe,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,GAChDM,QAAQ,IAAI,CACV;QAIJH,OAAO,AAAgB,cAAhB,IAAI,CAAC,MAAM,EAAgB;QAClCA,OAAO,AAAgB,gBAAhB,IAAI,CAAC,MAAM,EAAkB;QACpCA,OAAO,AAAgB,YAAhB,IAAI,CAAC,MAAM,EAAc;QAEhC,MAAMI,mBAAmB,IAAI,CAAC,KAAK,CAAC,SAAS,CAC3C,CAACP,OAASA,AAAgB,cAAhBA,KAAK,MAAM;QAEvB,IAAIO,mBAAmB,GAErB;QAGF,IAAI,CAAC,MAAM,GAAG;QACd,IAAIC,YAAYD;QAChB,IAAIE,wBAAwB;QAE5B,IAAIC;QAEJ,MAAOF,YAAY,IAAI,CAAC,KAAK,CAAC,MAAM,CAAE;YACpC,MAAMR,OAAO,IAAI,CAAC,KAAK,CAACQ,UAAU;YAClCL,OACEH,AAAgB,cAAhBA,KAAK,MAAM,EACX,CAAC,wCAAwC,EAAEA,KAAK,MAAM,EAAE;YAE1DA,KAAK,MAAM,GAAG;gBACZ,OAAOW,KAAK,GAAG;YACjB;YACA,IAAI;gBACFX,KAAK,MAAM,GAAG;gBACd,IAAI;oBACF,IAAI,IAAI,CAAC,WAAW,EAClB,MAAM,IAAI,CAAC,WAAW,CAACA;gBAE3B,EAAE,OAAOY,GAAG;oBACVN,QAAQ,KAAK,CAAC,wBAAwBM;gBACxC;gBACAT,OACE;oBAAC;oBAAW;oBAAU;iBAAW,CAAC,OAAO,CAACH,KAAK,IAAI,KAAK,GACxD,CAAC,uBAAuB,EAAEA,KAAK,IAAI,EAAE;gBAGvC,MAAM,EAAEa,QAAQ,EAAEC,KAAK,EAAE,GAAGd;gBAC5BG,OAAOU,UAAU,CAAC,oCAAoC,EAAEb,KAAK,IAAI,EAAE;gBAEnE,IAAIe;gBACJ,MAAMC,kBAAmC;oBACvChB;oBACA,SAASU,QAAAA,qBAAAA,KAAAA,IAAAA,mBAAoB,OAAO;gBACtC;gBAEA,IAAIV,AAAc,cAAdA,KAAK,IAAI,EAAgB;oBAC3BG,OACEH,AAAiB,aAAjBA,KAAK,OAAO,IACVA,AAAiB,YAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,IACZA,AAAiB,cAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,EACd,CAAC,6BAA6B,EAAEA,KAAK,OAAO,EAAE;oBAEhDe,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;oBACzC,IAAIhB,AAAiB,aAAjBA,KAAK,OAAO,EACdU,qBACEK,QAAAA,cAAAA,KAAAA,IAAAA,YACC,MAAM;gBAEb,OAAO,IAAIf,AAAc,aAAdA,KAAK,IAAI,IAAiBA,AAAc,eAAdA,KAAK,IAAI,EAC5Ce,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;qBACpC;oBACLV,QAAQ,IAAI,CACV,CAAC,uBAAuB,EAAEN,KAAK,IAAI,CAAC,iCAAiC,CAAC;oBAExEe,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;gBAC3C;gBAEAC,OAAO,MAAM,CAACjB,MAAMe;gBACpBf,KAAK,MAAM,GAAG;gBACdA,KAAK,MAAM,CAAC,GAAG,GAAGW,KAAK,GAAG;gBAC1BX,KAAK,MAAM,CAAC,IAAI,GAAGA,KAAK,MAAM,CAAC,GAAG,GAAGA,KAAK,MAAM,CAAC,KAAK;gBACtDQ;YACF,EAAE,OAAOI,GAAQ;gBACfH,wBAAwB;gBACxBT,KAAK,KAAK,GAAGY;gBACbZ,KAAK,YAAY,GACfY,AAAAA,CAAAA,QAAAA,IAAAA,KAAAA,IAAAA,EAAG,OAAO,AAAD,KAAM,CAAa,YAAb,OAAOA,IAAiBA,IAAI,uBAAsB;gBACnEZ,KAAK,UAAU,GAAGY,EAAE,KAAK;gBAEzBZ,KAAK,MAAM,GAAG;gBACdA,KAAK,MAAM,CAAC,GAAG,GAAGW,KAAK,GAAG;gBAC1BX,KAAK,MAAM,CAAC,IAAI,GAAGA,KAAK,MAAM,CAAC,GAAG,GAAGA,KAAK,MAAM,CAAC,KAAK;gBACtD;YACF;QACF;QAGA,IAAK,IAAIkB,IAAIV,YAAY,GAAGU,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAEA,IACjD,IAAI,CAAC,KAAK,CAACA,EAAE,CAAC,MAAM,GAAG;QAGzB,IAAIT,uBACF,IAAI,CAAC,MAAM,GAAG;aAEd,IAAI,CAAC,MAAM,GAAG;QAGhB,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE;YAErB,MAAMU,cAAcC,KAAK,GAAG,CAACZ,WAAW,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG;YAC5D,MAAM,EAAEa,OAAO,EAAEC,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAACH,YAAY;YACnD,OAAO;gBACLE;gBACAC;YACF;QACF;IACF;IAEA,iBAA0B;QACxB,OAAO,AAAgB,YAAhB,IAAI,CAAC,MAAM;IACpB;IAEA,kBAAwC;QACtC,IAAI,AAAgB,YAAhB,IAAI,CAAC,MAAM,EACb,OAAO;QAET,MAAMC,iBAAiB,IAAI,CAAC,KAAK,CAAC,SAAS,CACzC,CAACvB,OAASA,AAAgB,aAAhBA,KAAK,MAAM;QAEvB,IAAIuB,kBAAkB,GACpB,OAAO,IAAI,CAAC,KAAK,CAACA,eAAe;QAEnC,OAAO;IACT;IAEA,OAAsB;QACpB,IAAIC,mBAAmB;QAEvB,IAAIC,gBAAgB;YAClB,MAAMC,iBAAiBC;YAErBH,mBADEE,iBACiB,CAAC,QAAQ,EAAEA,gBAAgB,GAE3B,GAAGD,eAAe,KAAK,CAAC;QAE/C;QACA,MAAMG,WAA0B;YAC9B,YAAYC;YACZ,YAAYC,YAAYC,wBAAwB;YAChD,mBAAmBP;YACnB,SAASb,KAAK,GAAG;YACjB,MAAM,IAAI,CAAC,IAAI;YACf,OAAO,IAAI,CAAC,KAAK;QACnB;QACA,OAAOiB;IACT;IArMA,YACEI,IAAY,EACZC,OAEC,CACD;QAdF;QAEA;QAGA;QAEA;QAQE,IAAI,CAAC,MAAM,GACTA,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKA,QAAQ,KAAK,CAAC,MAAM,GAAG,IAAI,YAAY;QAC3D,IAAI,CAAC,IAAI,GAAGD;QACZ,IAAI,CAAC,KAAK,GAAIC,AAAAA,CAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAK,EAAC,EAAG,GAAG,CAAC,CAAC5B,OACvC,IAAI,CAAC,iBAAiB,CAACA;QAEzB,IAAI,CAAC,WAAW,GAAG4B,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,WAAW;IACzC;AAyLF"}
1
+ {"version":3,"file":"ai-model/action-executor.mjs","sources":["webpack://@midscene/core/./src/ai-model/action-executor.ts"],"sourcesContent":["import type {\n ExecutionDump,\n ExecutionTask,\n ExecutionTaskApply,\n ExecutionTaskInsightLocateOutput,\n ExecutionTaskProgressOptions,\n ExecutionTaskReturn,\n ExecutorContext,\n} from '@/types';\nimport { getVersion } from '@/utils';\nimport { assert } from '@midscene/shared/utils';\n\nexport class Executor {\n name: string;\n\n tasks: ExecutionTask[];\n\n // status of executor\n status: 'init' | 'pending' | 'running' | 'completed' | 'error';\n\n onTaskStart?: ExecutionTaskProgressOptions['onTaskStart'];\n\n constructor(\n name: string,\n options?: ExecutionTaskProgressOptions & {\n tasks?: ExecutionTaskApply[];\n },\n ) {\n this.status =\n options?.tasks && options.tasks.length > 0 ? 'pending' : 'init';\n this.name = name;\n this.tasks = (options?.tasks || []).map((item) =>\n this.markTaskAsPending(item),\n );\n this.onTaskStart = options?.onTaskStart;\n }\n\n private markTaskAsPending(task: ExecutionTaskApply): ExecutionTask {\n return {\n status: 'pending',\n ...task,\n };\n }\n\n async append(task: ExecutionTaskApply[] | ExecutionTaskApply): Promise<void> {\n assert(\n this.status !== 'error',\n `executor is in error state, cannot append task\\nerror=${this.latestErrorTask()?.error}\\n${this.latestErrorTask()?.errorStack}`,\n );\n if (Array.isArray(task)) {\n this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));\n } else {\n this.tasks.push(this.markTaskAsPending(task));\n }\n if (this.status !== 'running') {\n this.status = 'pending';\n }\n }\n\n async flush(): Promise<{ output: any; thought?: string } | undefined> {\n if (this.status === 'init' && this.tasks.length > 0) {\n console.warn(\n 'illegal state for executor, status is init but tasks are not empty',\n );\n }\n\n assert(this.status !== 'running', 'executor is already running');\n assert(this.status !== 'completed', 'executor is already completed');\n assert(this.status !== 'error', 'executor is in error state');\n\n const nextPendingIndex = this.tasks.findIndex(\n (task) => task.status === 'pending',\n );\n if (nextPendingIndex < 0) {\n // all tasks are completed\n return;\n }\n\n this.status = 'running';\n let taskIndex = nextPendingIndex;\n let successfullyCompleted = true;\n\n let previousFindOutput: ExecutionTaskInsightLocateOutput | undefined;\n\n while (taskIndex < this.tasks.length) {\n const task = this.tasks[taskIndex];\n assert(\n task.status === 'pending',\n `task status should be pending, but got: ${task.status}`,\n );\n task.timing = {\n start: Date.now(),\n };\n try {\n task.status = 'running';\n try {\n if (this.onTaskStart) {\n await this.onTaskStart(task);\n }\n } catch (e) {\n console.error('error in onTaskStart', e);\n }\n assert(\n ['Insight', 'Action', 'Planning'].indexOf(task.type) >= 0,\n `unsupported task type: ${task.type}`,\n );\n\n const { executor, param } = task;\n assert(executor, `executor is required for task type: ${task.type}`);\n\n let returnValue;\n const executorContext: ExecutorContext = {\n task,\n element: previousFindOutput?.element,\n };\n\n if (task.type === 'Insight') {\n assert(\n task.subType === 'Locate' ||\n task.subType === 'Query' ||\n task.subType === 'Assert' ||\n task.subType === 'Boolean' ||\n task.subType === 'Number' ||\n task.subType === 'String',\n `unsupported insight subType: ${task.subType}`,\n );\n returnValue = await task.executor(param, executorContext);\n if (task.subType === 'Locate') {\n previousFindOutput = (\n returnValue as ExecutionTaskReturn<ExecutionTaskInsightLocateOutput>\n )?.output;\n }\n } else if (task.type === 'Action' || task.type === 'Planning') {\n returnValue = await task.executor(param, executorContext);\n } else {\n console.warn(\n `unsupported task type: ${task.type}, will try to execute it directly`,\n );\n returnValue = await task.executor(param, executorContext);\n }\n\n Object.assign(task, returnValue);\n task.status = 'finished';\n task.timing.end = Date.now();\n task.timing.cost = task.timing.end - task.timing.start;\n taskIndex++;\n } catch (e: any) {\n successfullyCompleted = false;\n task.error = e;\n task.errorMessage =\n e?.message || (typeof e === 'string' ? e : 'error-without-message');\n task.errorStack = e.stack;\n\n task.status = 'failed';\n task.timing.end = Date.now();\n task.timing.cost = task.timing.end - task.timing.start;\n break;\n }\n }\n\n // set all remaining tasks as cancelled\n for (let i = taskIndex + 1; i < this.tasks.length; i++) {\n this.tasks[i].status = 'cancelled';\n }\n\n if (successfullyCompleted) {\n this.status = 'completed';\n } else {\n this.status = 'error';\n }\n\n if (this.tasks.length) {\n // return the last output\n const outputIndex = Math.min(taskIndex, this.tasks.length - 1);\n const { thought, output } = this.tasks[outputIndex];\n return {\n thought,\n output,\n };\n }\n }\n\n isInErrorState(): boolean {\n return this.status === 'error';\n }\n\n latestErrorTask(): ExecutionTask | null {\n if (this.status !== 'error') {\n return null;\n }\n const errorTaskIndex = this.tasks.findIndex(\n (task) => task.status === 'failed',\n );\n if (errorTaskIndex >= 0) {\n return this.tasks[errorTaskIndex];\n }\n return null;\n }\n\n dump(): ExecutionDump {\n const dumpData: ExecutionDump = {\n sdkVersion: getVersion(),\n logTime: Date.now(),\n name: this.name,\n tasks: this.tasks,\n };\n return dumpData;\n }\n}\n"],"names":["Executor","task","_this_latestErrorTask","_this_latestErrorTask1","assert","Array","item","console","nextPendingIndex","taskIndex","successfullyCompleted","previousFindOutput","Date","e","executor","param","returnValue","executorContext","Object","i","outputIndex","Math","thought","output","errorTaskIndex","dumpData","getVersion","name","options"],"mappings":";;;;;;;;;;;;AAYO,MAAMA;IAyBH,kBAAkBC,IAAwB,EAAiB;QACjE,OAAO;YACL,QAAQ;YACR,GAAGA,IAAI;QACT;IACF;IAEA,MAAM,OAAOA,IAA+C,EAAiB;YAGhBC,uBAAkCC;QAF7FC,OACE,AAAgB,YAAhB,IAAI,CAAC,MAAM,EACX,CAAC,sDAAsD,EAAE,QAAAF,CAAAA,wBAAAA,IAAI,CAAC,eAAe,EAAC,IAArBA,KAAAA,IAAAA,sBAAwB,KAAK,CAAC,EAAE,EAAE,QAAAC,CAAAA,yBAAAA,IAAI,CAAC,eAAe,EAAC,IAArBA,KAAAA,IAAAA,uBAAwB,UAAU,EAAE;QAEjI,IAAIE,MAAM,OAAO,CAACJ,OAChB,IAAI,CAAC,KAAK,CAAC,IAAI,IAAIA,KAAK,GAAG,CAAC,CAACK,OAAS,IAAI,CAAC,iBAAiB,CAACA;aAE7D,IAAI,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,iBAAiB,CAACL;QAEzC,IAAI,AAAgB,cAAhB,IAAI,CAAC,MAAM,EACb,IAAI,CAAC,MAAM,GAAG;IAElB;IAEA,MAAM,QAAgE;QACpE,IAAI,AAAgB,WAAhB,IAAI,CAAC,MAAM,IAAe,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG,GAChDM,QAAQ,IAAI,CACV;QAIJH,OAAO,AAAgB,cAAhB,IAAI,CAAC,MAAM,EAAgB;QAClCA,OAAO,AAAgB,gBAAhB,IAAI,CAAC,MAAM,EAAkB;QACpCA,OAAO,AAAgB,YAAhB,IAAI,CAAC,MAAM,EAAc;QAEhC,MAAMI,mBAAmB,IAAI,CAAC,KAAK,CAAC,SAAS,CAC3C,CAACP,OAASA,AAAgB,cAAhBA,KAAK,MAAM;QAEvB,IAAIO,mBAAmB,GAErB;QAGF,IAAI,CAAC,MAAM,GAAG;QACd,IAAIC,YAAYD;QAChB,IAAIE,wBAAwB;QAE5B,IAAIC;QAEJ,MAAOF,YAAY,IAAI,CAAC,KAAK,CAAC,MAAM,CAAE;YACpC,MAAMR,OAAO,IAAI,CAAC,KAAK,CAACQ,UAAU;YAClCL,OACEH,AAAgB,cAAhBA,KAAK,MAAM,EACX,CAAC,wCAAwC,EAAEA,KAAK,MAAM,EAAE;YAE1DA,KAAK,MAAM,GAAG;gBACZ,OAAOW,KAAK,GAAG;YACjB;YACA,IAAI;gBACFX,KAAK,MAAM,GAAG;gBACd,IAAI;oBACF,IAAI,IAAI,CAAC,WAAW,EAClB,MAAM,IAAI,CAAC,WAAW,CAACA;gBAE3B,EAAE,OAAOY,GAAG;oBACVN,QAAQ,KAAK,CAAC,wBAAwBM;gBACxC;gBACAT,OACE;oBAAC;oBAAW;oBAAU;iBAAW,CAAC,OAAO,CAACH,KAAK,IAAI,KAAK,GACxD,CAAC,uBAAuB,EAAEA,KAAK,IAAI,EAAE;gBAGvC,MAAM,EAAEa,QAAQ,EAAEC,KAAK,EAAE,GAAGd;gBAC5BG,OAAOU,UAAU,CAAC,oCAAoC,EAAEb,KAAK,IAAI,EAAE;gBAEnE,IAAIe;gBACJ,MAAMC,kBAAmC;oBACvChB;oBACA,SAASU,QAAAA,qBAAAA,KAAAA,IAAAA,mBAAoB,OAAO;gBACtC;gBAEA,IAAIV,AAAc,cAAdA,KAAK,IAAI,EAAgB;oBAC3BG,OACEH,AAAiB,aAAjBA,KAAK,OAAO,IACVA,AAAiB,YAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,IACZA,AAAiB,cAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,IACZA,AAAiB,aAAjBA,KAAK,OAAO,EACd,CAAC,6BAA6B,EAAEA,KAAK,OAAO,EAAE;oBAEhDe,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;oBACzC,IAAIhB,AAAiB,aAAjBA,KAAK,OAAO,EACdU,qBACEK,QAAAA,cAAAA,KAAAA,IAAAA,YACC,MAAM;gBAEb,OAAO,IAAIf,AAAc,aAAdA,KAAK,IAAI,IAAiBA,AAAc,eAAdA,KAAK,IAAI,EAC5Ce,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;qBACpC;oBACLV,QAAQ,IAAI,CACV,CAAC,uBAAuB,EAAEN,KAAK,IAAI,CAAC,iCAAiC,CAAC;oBAExEe,cAAc,MAAMf,KAAK,QAAQ,CAACc,OAAOE;gBAC3C;gBAEAC,OAAO,MAAM,CAACjB,MAAMe;gBACpBf,KAAK,MAAM,GAAG;gBACdA,KAAK,MAAM,CAAC,GAAG,GAAGW,KAAK,GAAG;gBAC1BX,KAAK,MAAM,CAAC,IAAI,GAAGA,KAAK,MAAM,CAAC,GAAG,GAAGA,KAAK,MAAM,CAAC,KAAK;gBACtDQ;YACF,EAAE,OAAOI,GAAQ;gBACfH,wBAAwB;gBACxBT,KAAK,KAAK,GAAGY;gBACbZ,KAAK,YAAY,GACfY,AAAAA,CAAAA,QAAAA,IAAAA,KAAAA,IAAAA,EAAG,OAAO,AAAD,KAAM,CAAa,YAAb,OAAOA,IAAiBA,IAAI,uBAAsB;gBACnEZ,KAAK,UAAU,GAAGY,EAAE,KAAK;gBAEzBZ,KAAK,MAAM,GAAG;gBACdA,KAAK,MAAM,CAAC,GAAG,GAAGW,KAAK,GAAG;gBAC1BX,KAAK,MAAM,CAAC,IAAI,GAAGA,KAAK,MAAM,CAAC,GAAG,GAAGA,KAAK,MAAM,CAAC,KAAK;gBACtD;YACF;QACF;QAGA,IAAK,IAAIkB,IAAIV,YAAY,GAAGU,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAEA,IACjD,IAAI,CAAC,KAAK,CAACA,EAAE,CAAC,MAAM,GAAG;QAGzB,IAAIT,uBACF,IAAI,CAAC,MAAM,GAAG;aAEd,IAAI,CAAC,MAAM,GAAG;QAGhB,IAAI,IAAI,CAAC,KAAK,CAAC,MAAM,EAAE;YAErB,MAAMU,cAAcC,KAAK,GAAG,CAACZ,WAAW,IAAI,CAAC,KAAK,CAAC,MAAM,GAAG;YAC5D,MAAM,EAAEa,OAAO,EAAEC,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAACH,YAAY;YACnD,OAAO;gBACLE;gBACAC;YACF;QACF;IACF;IAEA,iBAA0B;QACxB,OAAO,AAAgB,YAAhB,IAAI,CAAC,MAAM;IACpB;IAEA,kBAAwC;QACtC,IAAI,AAAgB,YAAhB,IAAI,CAAC,MAAM,EACb,OAAO;QAET,MAAMC,iBAAiB,IAAI,CAAC,KAAK,CAAC,SAAS,CACzC,CAACvB,OAASA,AAAgB,aAAhBA,KAAK,MAAM;QAEvB,IAAIuB,kBAAkB,GACpB,OAAO,IAAI,CAAC,KAAK,CAACA,eAAe;QAEnC,OAAO;IACT;IAEA,OAAsB;QACpB,MAAMC,WAA0B;YAC9B,YAAYC;YACZ,SAASd,KAAK,GAAG;YACjB,MAAM,IAAI,CAAC,IAAI;YACf,OAAO,IAAI,CAAC,KAAK;QACnB;QACA,OAAOa;IACT;IAzLA,YACEE,IAAY,EACZC,OAEC,CACD;QAdF;QAEA;QAGA;QAEA;QAQE,IAAI,CAAC,MAAM,GACTA,AAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAKA,QAAQ,KAAK,CAAC,MAAM,GAAG,IAAI,YAAY;QAC3D,IAAI,CAAC,IAAI,GAAGD;QACZ,IAAI,CAAC,KAAK,GAAIC,AAAAA,CAAAA,CAAAA,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,KAAK,AAAD,KAAK,EAAC,EAAG,GAAG,CAAC,CAACtB,OACvC,IAAI,CAAC,iBAAiB,CAACA;QAEzB,IAAI,CAAC,WAAW,GAAGsB,QAAAA,UAAAA,KAAAA,IAAAA,QAAS,WAAW;IACzC;AA6KF"}
@@ -1,10 +1,11 @@
1
1
  import { assert } from "@midscene/shared/utils";
2
- import { callToGetJSONObject, getModelName } from "./service-caller/index.mjs";
2
+ import { callToGetJSONObject } from "./service-caller/index.mjs";
3
3
  import { NodeType } from "@midscene/shared/constants";
4
- import { vlLocateMode } from "@midscene/shared/env";
4
+ import { getModelName, vlLocateMode } from "@midscene/shared/env";
5
5
  import { treeToList } from "@midscene/shared/extractor";
6
6
  import { compositeElementInfoImg } from "@midscene/shared/img";
7
7
  import { getDebug } from "@midscene/shared/logger";
8
+ import { z } from "zod";
8
9
  var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
9
10
  AIActionType[AIActionType["ASSERT"] = 0] = "ASSERT";
10
11
  AIActionType[AIActionType["INSPECT_ELEMENT"] = 1] = "INSPECT_ELEMENT";
@@ -13,8 +14,9 @@ var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
13
14
  AIActionType[AIActionType["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
14
15
  return AIActionType;
15
16
  }({});
16
- async function callAiFn(msgs, AIActionTypeValue) {
17
- const jsonObject = await callToGetJSONObject(msgs, AIActionTypeValue);
17
+ const actionSpaceTypePrefix = 'action_space_';
18
+ async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
19
+ const jsonObject = await callToGetJSONObject(msgs, AIActionTypeValue, modelPreferences);
18
20
  return {
19
21
  content: jsonObject.content,
20
22
  usage: jsonObject.usage
@@ -22,12 +24,12 @@ async function callAiFn(msgs, AIActionTypeValue) {
22
24
  }
23
25
  const defaultBboxSize = 20;
24
26
  const debugInspectUtils = getDebug('ai:common');
25
- function fillBboxParam(locate, width, height) {
27
+ function fillBboxParam(locate, width, height, modelPreferences) {
26
28
  if (locate.bbox_2d && !(null == locate ? void 0 : locate.bbox)) {
27
29
  locate.bbox = locate.bbox_2d;
28
30
  delete locate.bbox_2d;
29
31
  }
30
- if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height);
32
+ if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height, modelPreferences);
31
33
  return locate;
32
34
  }
33
35
  function adaptQwenBbox(bbox) {
@@ -89,9 +91,9 @@ function adaptDoubaoBbox(bbox, width, height) {
89
91
  const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
90
92
  throw new Error(msg);
91
93
  }
92
- function adaptBbox(bbox, width, height) {
93
- if ('doubao-vision' === vlLocateMode() || 'vlm-ui-tars' === vlLocateMode()) return adaptDoubaoBbox(bbox, width, height);
94
- if ('gemini' === vlLocateMode()) return adaptGeminiBbox(bbox, width, height);
94
+ function adaptBbox(bbox, width, height, modelPreferences) {
95
+ if ('doubao-vision' === vlLocateMode(modelPreferences) || 'vlm-ui-tars' === vlLocateMode(modelPreferences)) return adaptDoubaoBbox(bbox, width, height);
96
+ if ('gemini' === vlLocateMode(modelPreferences)) return adaptGeminiBbox(bbox, width, height);
95
97
  return adaptQwenBbox(bbox);
96
98
  }
97
99
  function adaptGeminiBbox(bbox, width, height) {
@@ -106,9 +108,9 @@ function adaptGeminiBbox(bbox, width, height) {
106
108
  bottom
107
109
  ];
108
110
  }
109
- function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
111
+ function adaptBboxToRect(bbox, width, height, modelPreferences, offsetX = 0, offsetY = 0) {
110
112
  debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);
111
- const [left, top, right, bottom] = adaptBbox(bbox, width, height);
113
+ const [left, top, right, bottom] = adaptBbox(bbox, width, height, modelPreferences);
112
114
  const rect = {
113
115
  left: left + offsetX,
114
116
  top: top + offsetY,
@@ -119,10 +121,10 @@ function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
119
121
  return rect;
120
122
  }
121
123
  let warned = false;
122
- function warnGPT4oSizeLimit(size) {
124
+ function warnGPT4oSizeLimit(size, modelPreferences) {
123
125
  var _getModelName;
124
126
  if (warned) return;
125
- if (null == (_getModelName = getModelName()) ? void 0 : _getModelName.toLowerCase().includes('gpt-4o')) {
127
+ if (null == (_getModelName = getModelName(modelPreferences)) ? void 0 : _getModelName.toLowerCase().includes('gpt-4o')) {
126
128
  const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;
127
129
  if (Math.max(size.width, size.height) > 2000 || Math.min(size.width, size.height) > 768) {
128
130
  console.warn(warningMsg);
@@ -145,8 +147,8 @@ function mergeRects(rects) {
145
147
  height: maxBottom - minTop
146
148
  };
147
149
  }
148
- function expandSearchArea(rect, screenSize) {
149
- const minEdgeSize = 'doubao-vision' === vlLocateMode() ? 500 : 300;
150
+ function expandSearchArea(rect, screenSize, modelPreferences) {
151
+ const minEdgeSize = 'doubao-vision' === vlLocateMode(modelPreferences) ? 500 : 300;
150
152
  const defaultPadding = 160;
151
153
  const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
152
154
  const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
@@ -169,51 +171,70 @@ async function markupImageForLLM(screenshotBase64, tree, size) {
169
171
  });
170
172
  return imagePayload;
171
173
  }
172
- function buildYamlFlowFromPlans(plans, sleep) {
174
+ function buildYamlFlowFromPlans(plans, actionSpace, sleep) {
173
175
  const flow = [];
174
176
  for (const plan of plans){
175
177
  var _plan_locate;
176
- const type = plan.type;
178
+ const verb = plan.type;
179
+ const action = actionSpace.find((action)=>action.name === verb);
180
+ if (!action) {
181
+ console.warn(`Cannot convert action ${verb} to yaml flow. Will ignore it.`);
182
+ continue;
183
+ }
177
184
  const locate = null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.prompt;
178
- if ('Tap' === type) flow.push({
179
- aiTap: locate
180
- });
181
- else if ('Hover' === type) flow.push({
182
- aiHover: locate
183
- });
184
- else if ('Input' === type) {
185
- const param = plan.param;
186
- flow.push({
187
- aiInput: param.value,
188
- locate
189
- });
190
- } else if ('KeyboardPress' === type) {
191
- const param = plan.param;
192
- flow.push({
193
- aiKeyboardPress: param.value,
194
- locate
195
- });
196
- } else if ('Scroll' === type) {
197
- const param = plan.param;
198
- flow.push({
199
- aiScroll: null,
200
- locate,
201
- direction: param.direction,
202
- scrollType: param.scrollType,
203
- distance: param.distance
204
- });
205
- } else if ('Sleep' === type) {
206
- const param = plan.param;
207
- flow.push({
208
- sleep: param.timeMs
209
- });
210
- } else 'AndroidBackButton' === type || 'AndroidHomeButton' === type || 'AndroidRecentAppsButton' === type || 'AndroidLongPress' === type || 'AndroidPull' === type || 'Error' === type || 'Assert' === type || 'AssertWithoutThrow' === type || 'Finished' === type || console.warn(`Cannot convert action ${type} to yaml flow. This should be a bug of Midscene.`);
185
+ const flowKey = action.interfaceAlias || `${actionSpaceTypePrefix}${verb}`;
186
+ const flowItem = {
187
+ [flowKey]: locate || '',
188
+ ...plan.param || {}
189
+ };
190
+ flow.push(flowItem);
211
191
  }
212
192
  if (sleep) flow.push({
213
- sleep: sleep
193
+ sleep
214
194
  });
215
195
  return flow;
216
196
  }
217
- export { common_AIActionType as AIActionType, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, callAiFn, expandSearchArea, fillBboxParam, markupImageForLLM, mergeRects, warnGPT4oSizeLimit };
197
+ const PointSchema = z.object({
198
+ left: z.number(),
199
+ top: z.number()
200
+ });
201
+ const SizeSchema = z.object({
202
+ width: z.number(),
203
+ height: z.number(),
204
+ dpr: z.number().optional()
205
+ });
206
+ const RectSchema = PointSchema.and(SizeSchema).and(z.object({
207
+ zoom: z.number().optional()
208
+ }));
209
+ const MidsceneLocation = z.object({
210
+ midscene_location_field_flag: z.literal(true),
211
+ prompt: z.string(),
212
+ center: z.tuple([
213
+ z.number(),
214
+ z.number()
215
+ ]),
216
+ rect: RectSchema
217
+ }).passthrough();
218
+ const ifMidsceneLocatorField = (field)=>{
219
+ var _actualField__def, _actualField__def1;
220
+ let actualField = field;
221
+ if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodOptional') actualField = actualField._def.innerType;
222
+ if ((null == (_actualField__def1 = actualField._def) ? void 0 : _actualField__def1.typeName) === 'ZodObject') {
223
+ const shape = actualField._def.shape();
224
+ return 'midscene_location_field_flag' in shape;
225
+ }
226
+ return false;
227
+ };
228
+ const findAllMidsceneLocatorField = (zodType)=>{
229
+ var _zodObject__def;
230
+ if (!zodType) return [];
231
+ const zodObject = zodType;
232
+ if ((null == (_zodObject__def = zodObject._def) ? void 0 : _zodObject__def.typeName) === 'ZodObject' && zodObject.shape) {
233
+ const keys = Object.keys(zodObject.shape);
234
+ return keys.filter((key)=>ifMidsceneLocatorField(zodObject.shape[key]));
235
+ }
236
+ return [];
237
+ };
238
+ export { common_AIActionType as AIActionType, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBbox, adaptBboxToRect, adaptDoubaoBbox, adaptGeminiBbox, adaptQwenBbox, buildYamlFlowFromPlans, callAiFn, expandSearchArea, fillBboxParam, findAllMidsceneLocatorField, ifMidsceneLocatorField, markupImageForLLM, mergeRects, warnGPT4oSizeLimit };
218
239
 
219
240
  //# sourceMappingURL=common.mjs.map
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n PlanningActionParamInputOrKeyPress,\n PlanningActionParamSleep,\n Rect,\n ScrollParam,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport {\n call,\n callToGetJSONObject,\n getModelName,\n} from './service-caller/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport { vlLocateMode } from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n}\n\nexport async function callAiFn<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const jsonObject = await callToGetJSONObject<T>(msgs, AIActionTypeValue);\n\n return {\n content: jsonObject.content,\n usage: jsonObject.usage,\n };\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(locate.bbox, width, height);\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n if (vlLocateMode() === 'doubao-vision' || vlLocateMode() === 'vlm-ui-tars') {\n return adaptDoubaoBbox(bbox, width, height);\n }\n\n if (vlLocateMode() === 'gemini') {\n return adaptGeminiBbox(bbox, width, height);\n }\n\n return adaptQwenBbox(bbox);\n}\n\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: number,\n offsetX = 0,\n offsetY = 0,\n): Rect {\n debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);\n const [left, top, right, bottom] = adaptBbox(bbox, width, height);\n const rect = {\n left: left + offsetX,\n top: top + offsetY,\n width: right - left,\n height: bottom - top,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(size: Size) {\n if (warned) return;\n if (getModelName()?.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(rect: Rect, screenSize: Size) {\n const minEdgeSize = vlLocateMode() === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n rect.left = Math.max(0, rect.left - paddingSizeHorizontal);\n rect.width = Math.min(\n rect.width + paddingSizeHorizontal * 2,\n screenSize.width - rect.left,\n );\n rect.top = Math.max(0, rect.top - paddingSizeVertical);\n rect.height = Math.min(\n rect.height + paddingSizeVertical * 2,\n screenSize.height - rect.top,\n );\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const type = plan.type;\n const locate = plan.locate?.prompt!; // TODO: check if locate is null\n\n if (type === 'Tap') {\n flow.push({\n aiTap: locate!,\n });\n } else if (type === 'Hover') {\n flow.push({\n aiHover: locate!,\n });\n } else if (type === 'Input') {\n const param = plan.param as PlanningActionParamInputOrKeyPress;\n flow.push({\n aiInput: param.value,\n locate,\n });\n } else if (type === 'KeyboardPress') {\n const param = plan.param as PlanningActionParamInputOrKeyPress;\n flow.push({\n aiKeyboardPress: param.value,\n locate,\n });\n } else if (type === 'Scroll') {\n const param = plan.param as ScrollParam;\n flow.push({\n aiScroll: null,\n locate,\n direction: param.direction,\n scrollType: param.scrollType,\n distance: param.distance,\n });\n } else if (type === 'Sleep') {\n const param = plan.param as PlanningActionParamSleep;\n flow.push({\n sleep: param.timeMs,\n });\n } else if (\n type === 'AndroidBackButton' ||\n type === 'AndroidHomeButton' ||\n type === 'AndroidRecentAppsButton' ||\n type === 'AndroidLongPress' ||\n type === 'AndroidPull'\n ) {\n // not implemented in yaml yet\n } else if (\n type === 'Error' ||\n type === 'Assert' ||\n type === 'AssertWithoutThrow' ||\n type === 'Finished'\n ) {\n // do nothing\n } else {\n console.warn(\n `Cannot convert action ${type} to yaml flow. This should be a bug of Midscene.`,\n );\n }\n }\n\n if (sleep) {\n flow.push({\n sleep: sleep,\n });\n }\n\n return flow;\n}\n"],"names":["AIActionType","callAiFn","msgs","AIActionTypeValue","jsonObject","callToGetJSONObject","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","vlLocateMode","adaptGeminiBbox","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rect","warned","warnGPT4oSizeLimit","size","_getModelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","sleep","flow","plan","_plan_locate","type","param"],"mappings":";;;;;;;AAoCO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;WAAZA;;AAQL,eAAeC,SACpBC,IAAY,EACZC,iBAA+B;IAE/B,MAAMC,aAAa,MAAMC,oBAAuBH,MAAMC;IAEtD,OAAO;QACL,SAASC,WAAW,OAAO;QAC3B,OAAOA,WAAW,KAAK;IACzB;AACF;AAEA,MAAME,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc;IAGd,IAAKF,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGG,UAAUH,OAAO,IAAI,EAAEC,OAAOC;IAG9C,OAAOF;AACT;AAEO,SAASI,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;QACN,YAAnB,OAAOS,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;KAC1B;IACD,OAAOa;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCJ,KAAa,EACbC,MAAc;IAEdU,OACEX,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOG,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;YAC5CQ,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;SAC7C;QAEH,MAAM,IAAIM,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAACF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAIH,IACEc,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;QAEhEc,KAAK,GAAG,CACNT,OACAS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACNR,QACAQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIS,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAGH,MAAMI,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,IAAIkB,AAAmB,oBAAnBA,kBAAsCA,AAAmB,kBAAnBA,gBACxC,OAAOT,gBAAgBN,MAAMJ,OAAOC;IAGtC,IAAIkB,AAAmB,aAAnBA,gBACF,OAAOC,gBAAgBhB,MAAMJ,OAAOC;IAGtC,OAAOE,cAAcC;AACvB;AAEO,SAASgB,gBACdhB,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,MAAMoB,OAAOZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC5C,MAAMsB,MAAMb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC5C,MAAMsB,QAAQd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC7C,MAAMwB,SAASf,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC/C,OAAO;QAACoB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdrB,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdyB,UAAU,CAAC,EACXC,UAAU,CAAC;IAEX/B,kBAAkB,mBAAmBQ,MAAMJ,OAAOC,QAAQyB,SAASC;IACnE,MAAM,CAACN,MAAMC,KAAKC,OAAOC,OAAO,GAAGtB,UAAUE,MAAMJ,OAAOC;IAC1D,MAAM2B,OAAO;QACX,MAAMP,OAAOK;QACb,KAAKJ,MAAMK;QACX,OAAOJ,QAAQF;QACf,QAAQG,SAASF;IACnB;IACA1B,kBAAkB,4BAA4BgC;IAC9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBAAmBC,IAAU;QAEvCC;IADJ,IAAIH,QAAQ;IACZ,IAAI,QAAAG,CAAAA,gBAAAA,cAAa,IAAbA,KAAAA,IAAAA,cAAgB,WAAW,GAAG,QAAQ,CAAC,WAAW;QACpD,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,wFAAwF,CAAC;QAEhN,IACEtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpCtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAU5B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAAS9B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAW/B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYhC,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBAAiBd,IAAU,EAAEe,UAAgB;IAC3D,MAAMC,cAAczB,AAAmB,oBAAnBA,iBAAqC,MAAM;IAC/D,MAAM0B,iBAAiB;IAEvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IACNjB,KAAK,IAAI,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,IAAI,GAAGkB;IACpClB,KAAK,KAAK,GAAGnB,KAAK,GAAG,CACnBmB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA,uBACbH,WAAW,KAAK,GAAGf,KAAK,IAAI;IAE9BA,KAAK,GAAG,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,GAAG,GAAGmB;IAClCnB,KAAK,MAAM,GAAGnB,KAAK,GAAG,CACpBmB,KAAK,MAAM,GAAGmB,AAAsB,IAAtBA,qBACdJ,WAAW,MAAM,GAAGf,KAAK,GAAG;IAE9B,OAAOA;AACT;AAEO,eAAeoB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCnB,IAAU;IAEV,MAAMoB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtBtB;IACF;IACA,OAAOyB;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQH,MAAO;YAETI;QADf,MAAMC,OAAOF,KAAK,IAAI;QACtB,MAAM/D,SAAS,QAAAgE,CAAAA,eAAAA,KAAK,MAAM,AAAD,IAAVA,KAAAA,IAAAA,aAAa,MAAM;QAElC,IAAIC,AAAS,UAATA,MACFH,KAAK,IAAI,CAAC;YACR,OAAO9D;QACT;aACK,IAAIiE,AAAS,YAATA,MACTH,KAAK,IAAI,CAAC;YACR,SAAS9D;QACX;aACK,IAAIiE,AAAS,YAATA,MAAkB;YAC3B,MAAMC,QAAQH,KAAK,KAAK;YACxBD,KAAK,IAAI,CAAC;gBACR,SAASI,MAAM,KAAK;gBACpBlE;YACF;QACF,OAAO,IAAIiE,AAAS,oBAATA,MAA0B;YACnC,MAAMC,QAAQH,KAAK,KAAK;YACxBD,KAAK,IAAI,CAAC;gBACR,iBAAiBI,MAAM,KAAK;gBAC5BlE;YACF;QACF,OAAO,IAAIiE,AAAS,aAATA,MAAmB;YAC5B,MAAMC,QAAQH,KAAK,KAAK;YACxBD,KAAK,IAAI,CAAC;gBACR,UAAU;gBACV9D;gBACA,WAAWkE,MAAM,SAAS;gBAC1B,YAAYA,MAAM,UAAU;gBAC5B,UAAUA,MAAM,QAAQ;YAC1B;QACF,OAAO,IAAID,AAAS,YAATA,MAAkB;YAC3B,MAAMC,QAAQH,KAAK,KAAK;YACxBD,KAAK,IAAI,CAAC;gBACR,OAAOI,MAAM,MAAM;YACrB;QACF,OACW,wBAATD,QACAA,AAAS,wBAATA,QACAA,AAAS,8BAATA,QACAA,AAAS,uBAATA,QACAA,AAAS,kBAATA,QAIAA,AAAS,YAATA,QACAA,AAAS,aAATA,QACAA,AAAS,yBAATA,QACAA,AAAS,eAATA,QAIA9B,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAE8B,KAAK,gDAAgD,CAAC;IAGrF;IAEA,IAAIJ,OACFC,KAAK,IAAI,CAAC;QACR,OAAOD;IACT;IAGF,OAAOC;AACT"}
1
+ {"version":3,"file":"ai-model/common.mjs","sources":["webpack://@midscene/core/./src/ai-model/common.ts"],"sourcesContent":["import type {\n AIUsageInfo,\n BaseElement,\n DeviceAction,\n ElementTreeNode,\n MidsceneYamlFlowItem,\n PlanningAction,\n Rect,\n Size,\n} from '@/types';\nimport { assert } from '@midscene/shared/utils';\n\nimport type {\n ChatCompletionSystemMessageParam,\n ChatCompletionUserMessageParam,\n} from 'openai/resources/index';\nimport { callToGetJSONObject } from './service-caller/index';\n\nimport type { PlanningLocateParam } from '@/types';\nimport { NodeType } from '@midscene/shared/constants';\nimport {\n type IModelPreferences,\n getModelName,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { treeToList } from '@midscene/shared/extractor';\nimport { compositeElementInfoImg } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { z } from 'zod';\n\nexport type AIArgs = [\n ChatCompletionSystemMessageParam,\n ...ChatCompletionUserMessageParam[],\n];\n\nexport enum AIActionType {\n ASSERT = 0,\n INSPECT_ELEMENT = 1,\n EXTRACT_DATA = 2,\n PLAN = 3,\n DESCRIBE_ELEMENT = 4,\n}\n\nexport const actionSpaceTypePrefix = 'action_space_';\n\nexport async function callAiFn<T>(\n msgs: AIArgs,\n AIActionTypeValue: AIActionType,\n modelPreferences: IModelPreferences,\n): Promise<{ content: T; usage?: AIUsageInfo }> {\n const jsonObject = await callToGetJSONObject<T>(\n msgs,\n AIActionTypeValue,\n modelPreferences,\n );\n\n return {\n content: jsonObject.content,\n usage: jsonObject.usage,\n };\n}\n\nconst defaultBboxSize = 20; // must be even number\nconst debugInspectUtils = getDebug('ai:common');\n\n// transform the param of locate from qwen mode\nexport function fillBboxParam(\n locate: PlanningLocateParam,\n width: number,\n height: number,\n modelPreferences: IModelPreferences,\n) {\n // The Qwen model might have hallucinations of naming bbox as bbox_2d.\n if ((locate as any).bbox_2d && !locate?.bbox) {\n locate.bbox = (locate as any).bbox_2d;\n // biome-ignore lint/performance/noDelete: <explanation>\n delete (locate as any).bbox_2d;\n }\n\n if (locate?.bbox) {\n locate.bbox = adaptBbox(locate.bbox, width, height, modelPreferences);\n }\n\n return locate;\n}\n\nexport function adaptQwenBbox(\n bbox: number[],\n): [number, number, number, number] {\n if (bbox.length < 2) {\n const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n }\n\n const result: [number, number, number, number] = [\n Math.round(bbox[0]),\n Math.round(bbox[1]),\n typeof bbox[2] === 'number'\n ? Math.round(bbox[2])\n : Math.round(bbox[0] + defaultBboxSize),\n typeof bbox[3] === 'number'\n ? Math.round(bbox[3])\n : Math.round(bbox[1] + defaultBboxSize),\n ];\n return result;\n}\n\nexport function adaptDoubaoBbox(\n bbox: string[] | number[] | string,\n width: number,\n height: number,\n): [number, number, number, number] {\n assert(\n width > 0 && height > 0,\n 'width and height must be greater than 0 in doubao mode',\n );\n\n if (typeof bbox === 'string') {\n assert(\n /^(\\d+)\\s(\\d+)\\s(\\d+)\\s(\\d+)$/.test(bbox.trim()),\n `invalid bbox data string for doubao-vision mode: ${bbox}`,\n );\n const splitted = bbox.split(' ');\n if (splitted.length === 4) {\n return [\n Math.round((Number(splitted[0]) * width) / 1000),\n Math.round((Number(splitted[1]) * height) / 1000),\n Math.round((Number(splitted[2]) * width) / 1000),\n Math.round((Number(splitted[3]) * height) / 1000),\n ];\n }\n throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);\n }\n\n if (Array.isArray(bbox) && Array.isArray(bbox[0])) {\n bbox = bbox[0];\n }\n\n let bboxList: number[] = [];\n if (Array.isArray(bbox) && typeof bbox[0] === 'string') {\n bbox.forEach((item) => {\n if (typeof item === 'string' && item.includes(',')) {\n const [x, y] = item.split(',');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else if (typeof item === 'string' && item.includes(' ')) {\n const [x, y] = item.split(' ');\n bboxList.push(Number(x.trim()), Number(y.trim()));\n } else {\n bboxList.push(Number(item));\n }\n });\n } else {\n bboxList = bbox as any;\n }\n\n if (bboxList.length === 4 || bboxList.length === 5) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[2] * width) / 1000),\n Math.round((bboxList[3] * height) / 1000),\n ];\n }\n\n // treat the bbox as a center point\n if (\n bboxList.length === 6 ||\n bboxList.length === 2 ||\n bboxList.length === 3 ||\n bboxList.length === 7\n ) {\n return [\n Math.max(\n 0,\n Math.round((bboxList[0] * width) / 1000) - defaultBboxSize / 2,\n ),\n Math.max(\n 0,\n Math.round((bboxList[1] * height) / 1000) - defaultBboxSize / 2,\n ),\n Math.min(\n width,\n Math.round((bboxList[0] * width) / 1000) + defaultBboxSize / 2,\n ),\n Math.min(\n height,\n Math.round((bboxList[1] * height) / 1000) + defaultBboxSize / 2,\n ),\n ];\n }\n\n if (bbox.length === 8) {\n return [\n Math.round((bboxList[0] * width) / 1000),\n Math.round((bboxList[1] * height) / 1000),\n Math.round((bboxList[4] * width) / 1000),\n Math.round((bboxList[5] * height) / 1000),\n ];\n }\n\n const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;\n throw new Error(msg);\n}\n\nexport function adaptBbox(\n bbox: number[],\n width: number,\n height: number,\n modelPreferences: IModelPreferences,\n): [number, number, number, number] {\n if (\n vlLocateMode(modelPreferences) === 'doubao-vision' ||\n vlLocateMode(modelPreferences) === 'vlm-ui-tars'\n ) {\n return adaptDoubaoBbox(bbox, width, height);\n }\n\n if (vlLocateMode(modelPreferences) === 'gemini') {\n return adaptGeminiBbox(bbox, width, height);\n }\n\n return adaptQwenBbox(bbox);\n}\n\nexport function adaptGeminiBbox(\n bbox: number[],\n width: number,\n height: number,\n): [number, number, number, number] {\n const left = Math.round((bbox[1] * width) / 1000);\n const top = Math.round((bbox[0] * height) / 1000);\n const right = Math.round((bbox[3] * width) / 1000);\n const bottom = Math.round((bbox[2] * height) / 1000);\n return [left, top, right, bottom];\n}\n\nexport function adaptBboxToRect(\n bbox: number[],\n width: number,\n height: number,\n modelPreferences: IModelPreferences,\n offsetX = 0,\n offsetY = 0,\n): Rect {\n debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);\n const [left, top, right, bottom] = adaptBbox(\n bbox,\n width,\n height,\n modelPreferences,\n );\n const rect = {\n left: left + offsetX,\n top: top + offsetY,\n width: right - left,\n height: bottom - top,\n };\n debugInspectUtils('adaptBboxToRect, result=', rect);\n return rect;\n}\n\nlet warned = false;\nexport function warnGPT4oSizeLimit(\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (warned) return;\n if (getModelName(modelPreferences)?.toLowerCase().includes('gpt-4o')) {\n const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;\n\n if (\n Math.max(size.width, size.height) > 2000 ||\n Math.min(size.width, size.height) > 768\n ) {\n console.warn(warningMsg);\n warned = true;\n }\n } else if (size.width > 1800 || size.height > 1800) {\n console.warn(\n `The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`,\n );\n warned = true;\n }\n}\n\nexport function mergeRects(rects: Rect[]) {\n const minLeft = Math.min(...rects.map((r) => r.left));\n const minTop = Math.min(...rects.map((r) => r.top));\n const maxRight = Math.max(...rects.map((r) => r.left + r.width));\n const maxBottom = Math.max(...rects.map((r) => r.top + r.height));\n return {\n left: minLeft,\n top: minTop,\n width: maxRight - minLeft,\n height: maxBottom - minTop,\n };\n}\n\n// expand the search area to at least 300 x 300, or add a default padding\nexport function expandSearchArea(\n rect: Rect,\n screenSize: Size,\n modelPreferences: IModelPreferences,\n) {\n const minEdgeSize =\n vlLocateMode(modelPreferences) === 'doubao-vision' ? 500 : 300;\n const defaultPadding = 160;\n\n const paddingSizeHorizontal =\n rect.width < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.width) / 2)\n : defaultPadding;\n const paddingSizeVertical =\n rect.height < minEdgeSize\n ? Math.ceil((minEdgeSize - rect.height) / 2)\n : defaultPadding;\n rect.left = Math.max(0, rect.left - paddingSizeHorizontal);\n rect.width = Math.min(\n rect.width + paddingSizeHorizontal * 2,\n screenSize.width - rect.left,\n );\n rect.top = Math.max(0, rect.top - paddingSizeVertical);\n rect.height = Math.min(\n rect.height + paddingSizeVertical * 2,\n screenSize.height - rect.top,\n );\n return rect;\n}\n\nexport async function markupImageForLLM(\n screenshotBase64: string,\n tree: ElementTreeNode<BaseElement>,\n size: Size,\n) {\n const elementsInfo = treeToList(tree);\n const elementsPositionInfoWithoutText = elementsInfo!.filter(\n (elementInfo) => {\n if (elementInfo.attributes.nodeType === NodeType.TEXT) {\n return false;\n }\n return true;\n },\n );\n\n const imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n elementsPositionInfo: elementsPositionInfoWithoutText,\n size,\n });\n return imagePayload;\n}\n\nexport function buildYamlFlowFromPlans(\n plans: PlanningAction[],\n actionSpace: DeviceAction<any>[],\n sleep?: number,\n): MidsceneYamlFlowItem[] {\n const flow: MidsceneYamlFlowItem[] = [];\n\n for (const plan of plans) {\n const verb = plan.type;\n\n const action = actionSpace.find((action) => action.name === verb);\n if (!action) {\n console.warn(\n `Cannot convert action ${verb} to yaml flow. Will ignore it.`,\n );\n continue;\n }\n\n const locate = plan.locate?.prompt;\n const flowKey = action.interfaceAlias || `${actionSpaceTypePrefix}${verb}`;\n\n const flowItem: MidsceneYamlFlowItem = {\n [flowKey]: locate || '',\n ...(plan.param || {}),\n };\n\n flow.push(flowItem);\n }\n\n if (sleep) {\n flow.push({\n sleep,\n });\n }\n\n return flow;\n}\n\n// Zod schemas for shared types\nexport const PointSchema = z.object({\n left: z.number(),\n top: z.number(),\n});\n\nexport const SizeSchema = z.object({\n width: z.number(),\n height: z.number(),\n dpr: z.number().optional(),\n});\n\nexport const RectSchema = PointSchema.and(SizeSchema).and(\n z.object({\n zoom: z.number().optional(),\n }),\n);\n\nexport const MidsceneLocation = z\n .object({\n midscene_location_field_flag: z.literal(true),\n prompt: z.string(),\n center: z.tuple([z.number(), z.number()]),\n rect: RectSchema,\n })\n .passthrough();\n\nexport type MidsceneLocationType = z.infer<typeof MidsceneLocation>;\n\nexport const ifMidsceneLocatorField = (field: any): boolean => {\n // Handle optional fields by getting the inner type\n let actualField = field;\n if (actualField._def?.typeName === 'ZodOptional') {\n actualField = actualField._def.innerType;\n }\n\n // Check if this is a ZodObject with midscene_location_field_flag\n if (actualField._def?.typeName === 'ZodObject') {\n const shape = actualField._def.shape();\n return 'midscene_location_field_flag' in shape;\n }\n\n return false;\n};\n\nexport const findAllMidsceneLocatorField = (\n zodType?: z.ZodType<any>,\n): string[] => {\n if (!zodType) {\n return [];\n }\n\n // Check if this is a ZodObject by checking if it has a shape property\n const zodObject = zodType as any;\n if (zodObject._def?.typeName === 'ZodObject' && zodObject.shape) {\n const keys = Object.keys(zodObject.shape);\n return keys.filter((key) => ifMidsceneLocatorField(zodObject.shape[key]));\n }\n\n // For other ZodType instances, we can't extract field names\n return [];\n};\n"],"names":["AIActionType","actionSpaceTypePrefix","callAiFn","msgs","AIActionTypeValue","modelPreferences","jsonObject","callToGetJSONObject","defaultBboxSize","debugInspectUtils","getDebug","fillBboxParam","locate","width","height","adaptBbox","adaptQwenBbox","bbox","msg","JSON","Error","result","Math","adaptDoubaoBbox","assert","splitted","Number","Array","bboxList","item","x","y","vlLocateMode","adaptGeminiBbox","left","top","right","bottom","adaptBboxToRect","offsetX","offsetY","rect","warned","warnGPT4oSizeLimit","size","_getModelName","warningMsg","console","mergeRects","rects","minLeft","r","minTop","maxRight","maxBottom","expandSearchArea","screenSize","minEdgeSize","defaultPadding","paddingSizeHorizontal","paddingSizeVertical","markupImageForLLM","screenshotBase64","tree","elementsInfo","treeToList","elementsPositionInfoWithoutText","elementInfo","NodeType","imagePayload","compositeElementInfoImg","buildYamlFlowFromPlans","plans","actionSpace","sleep","flow","plan","_plan_locate","verb","action","flowKey","flowItem","PointSchema","z","SizeSchema","RectSchema","MidsceneLocation","ifMidsceneLocatorField","field","_actualField__def","_actualField__def1","actualField","shape","findAllMidsceneLocatorField","zodType","_zodObject__def","zodObject","keys","Object","key"],"mappings":";;;;;;;;AAmCO,IAAKA,sBAAYA,WAAAA,GAAAA,SAAZA,YAAY;;;;;;WAAZA;;AAQL,MAAMC,wBAAwB;AAE9B,eAAeC,SACpBC,IAAY,EACZC,iBAA+B,EAC/BC,gBAAmC;IAEnC,MAAMC,aAAa,MAAMC,oBACvBJ,MACAC,mBACAC;IAGF,OAAO;QACL,SAASC,WAAW,OAAO;QAC3B,OAAOA,WAAW,KAAK;IACzB;AACF;AAEA,MAAME,kBAAkB;AACxB,MAAMC,oBAAoBC,SAAS;AAG5B,SAASC,cACdC,MAA2B,EAC3BC,KAAa,EACbC,MAAc,EACdT,gBAAmC;IAGnC,IAAKO,OAAe,OAAO,IAAI,CAACA,CAAAA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,AAAD,GAAG;QAC5CA,OAAO,IAAI,GAAIA,OAAe,OAAO;QAErC,OAAQA,OAAe,OAAO;IAChC;IAEA,IAAIA,QAAAA,SAAAA,KAAAA,IAAAA,OAAQ,IAAI,EACdA,OAAO,IAAI,GAAGG,UAAUH,OAAO,IAAI,EAAEC,OAAOC,QAAQT;IAGtD,OAAOO;AACT;AAEO,SAASI,cACdC,IAAc;IAEd,IAAIA,KAAK,MAAM,GAAG,GAAG;QACnB,MAAMC,MAAM,CAAC,oCAAoC,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;QAC1E,MAAM,IAAIG,MAAMF;IAClB;IAEA,MAAMG,SAA2C;QAC/CC,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE;QACC,YAAnB,OAAOA,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;QACN,YAAnB,OAAOS,IAAI,CAAC,EAAE,GACVK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,IAClBK,KAAK,KAAK,CAACL,IAAI,CAAC,EAAE,GAAGT;KAC1B;IACD,OAAOa;AACT;AAEO,SAASE,gBACdN,IAAkC,EAClCJ,KAAa,EACbC,MAAc;IAEdU,OACEX,QAAQ,KAAKC,SAAS,GACtB;IAGF,IAAI,AAAgB,YAAhB,OAAOG,MAAmB;QAC5BO,OACE,+BAA+B,IAAI,CAACP,KAAK,IAAI,KAC7C,CAAC,iDAAiD,EAAEA,MAAM;QAE5D,MAAMQ,WAAWR,KAAK,KAAK,CAAC;QAC5B,IAAIQ,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACLH,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;YAC5CQ,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIZ,QAAS;YAC3CS,KAAK,KAAK,CAAEI,OAAOD,QAAQ,CAAC,EAAE,IAAIX,SAAU;SAC7C;QAEH,MAAM,IAAIM,MAAM,CAAC,iDAAiD,EAAEH,MAAM;IAC5E;IAEA,IAAIU,MAAM,OAAO,CAACV,SAASU,MAAM,OAAO,CAACV,IAAI,CAAC,EAAE,GAC9CA,OAAOA,IAAI,CAAC,EAAE;IAGhB,IAAIW,WAAqB,EAAE;IAC3B,IAAID,MAAM,OAAO,CAACV,SAAS,AAAmB,YAAnB,OAAOA,IAAI,CAAC,EAAE,EACvCA,KAAK,OAAO,CAAC,CAACY;QACZ,IAAI,AAAgB,YAAhB,OAAOA,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YAClD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OAAO,IAAI,AAAgB,YAAhB,OAAOF,QAAqBA,KAAK,QAAQ,CAAC,MAAM;YACzD,MAAM,CAACC,GAAGC,EAAE,GAAGF,KAAK,KAAK,CAAC;YAC1BD,SAAS,IAAI,CAACF,OAAOI,EAAE,IAAI,KAAKJ,OAAOK,EAAE,IAAI;QAC/C,OACEH,SAAS,IAAI,CAACF,OAAOG;IAEzB;SAEAD,WAAWX;IAGb,IAAIW,AAAoB,MAApBA,SAAS,MAAM,IAAUA,AAAoB,MAApBA,SAAS,MAAM,EAC1C,OAAO;QACLN,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAIH,IACEc,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,IACfA,AAAoB,MAApBA,SAAS,MAAM,EAEf,OAAO;QACLN,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACN,GACAA,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;QAEhEc,KAAK,GAAG,CACNT,OACAS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS,QAAQL,kBAAkB;QAE/Dc,KAAK,GAAG,CACNR,QACAQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU,QAAQN,kBAAkB;KAEjE;IAGH,IAAIS,AAAgB,MAAhBA,KAAK,MAAM,EACb,OAAO;QACLK,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;QACpCQ,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGf,QAAS;QACnCS,KAAK,KAAK,CAAEM,QAAQ,CAAC,EAAE,GAAGd,SAAU;KACrC;IAGH,MAAMI,MAAM,CAAC,0CAA0C,EAAEC,KAAK,SAAS,CAACF,MAAM,CAAC,CAAC;IAChF,MAAM,IAAIG,MAAMF;AAClB;AAEO,SAASH,UACdE,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdT,gBAAmC;IAEnC,IACE2B,AAAmC,oBAAnCA,aAAa3B,qBACb2B,AAAmC,kBAAnCA,aAAa3B,mBAEb,OAAOkB,gBAAgBN,MAAMJ,OAAOC;IAGtC,IAAIkB,AAAmC,aAAnCA,aAAa3B,mBACf,OAAO4B,gBAAgBhB,MAAMJ,OAAOC;IAGtC,OAAOE,cAAcC;AACvB;AAEO,SAASgB,gBACdhB,IAAc,EACdJ,KAAa,EACbC,MAAc;IAEd,MAAMoB,OAAOZ,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC5C,MAAMsB,MAAMb,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC5C,MAAMsB,QAAQd,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGJ,QAAS;IAC7C,MAAMwB,SAASf,KAAK,KAAK,CAAEL,IAAI,CAAC,EAAE,GAAGH,SAAU;IAC/C,OAAO;QAACoB;QAAMC;QAAKC;QAAOC;KAAO;AACnC;AAEO,SAASC,gBACdrB,IAAc,EACdJ,KAAa,EACbC,MAAc,EACdT,gBAAmC,EACnCkC,UAAU,CAAC,EACXC,UAAU,CAAC;IAEX/B,kBAAkB,mBAAmBQ,MAAMJ,OAAOC,QAAQyB,SAASC;IACnE,MAAM,CAACN,MAAMC,KAAKC,OAAOC,OAAO,GAAGtB,UACjCE,MACAJ,OACAC,QACAT;IAEF,MAAMoC,OAAO;QACX,MAAMP,OAAOK;QACb,KAAKJ,MAAMK;QACX,OAAOJ,QAAQF;QACf,QAAQG,SAASF;IACnB;IACA1B,kBAAkB,4BAA4BgC;IAC9C,OAAOA;AACT;AAEA,IAAIC,SAAS;AACN,SAASC,mBACdC,IAAU,EACVvC,gBAAmC;QAG/BwC;IADJ,IAAIH,QAAQ;IACZ,IAAI,QAAAG,CAAAA,gBAAAA,aAAaxC,iBAAgB,IAA7BwC,KAAAA,IAAAA,cAAgC,WAAW,GAAG,QAAQ,CAAC,WAAW;QACpE,MAAMC,aAAa,CAAC,uEAAuE,EAAEF,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,wFAAwF,CAAC;QAEhN,IACEtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,QACpCtB,KAAK,GAAG,CAACsB,KAAK,KAAK,EAAEA,KAAK,MAAM,IAAI,KACpC;YACAG,QAAQ,IAAI,CAACD;YACbJ,SAAS;QACX;IACF,OAAO,IAAIE,KAAK,KAAK,GAAG,QAAQA,KAAK,MAAM,GAAG,MAAM;QAClDG,QAAQ,IAAI,CACV,CAAC,gCAAgC,EAAEH,KAAK,KAAK,CAAC,CAAC,EAAEA,KAAK,MAAM,CAAC,2EAA2E,CAAC;QAE3IF,SAAS;IACX;AACF;AAEO,SAASM,WAAWC,KAAa;IACtC,MAAMC,UAAU5B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI;IACnD,MAAMC,SAAS9B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG;IACjD,MAAME,WAAW/B,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,IAAI,GAAGA,EAAE,KAAK;IAC9D,MAAMG,YAAYhC,KAAK,GAAG,IAAI2B,MAAM,GAAG,CAAC,CAACE,IAAMA,EAAE,GAAG,GAAGA,EAAE,MAAM;IAC/D,OAAO;QACL,MAAMD;QACN,KAAKE;QACL,OAAOC,WAAWH;QAClB,QAAQI,YAAYF;IACtB;AACF;AAGO,SAASG,iBACdd,IAAU,EACVe,UAAgB,EAChBnD,gBAAmC;IAEnC,MAAMoD,cACJzB,AAAmC,oBAAnCA,aAAa3B,oBAAwC,MAAM;IAC7D,MAAMqD,iBAAiB;IAEvB,MAAMC,wBACJlB,KAAK,KAAK,GAAGgB,cACTnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,KAAI,IAAK,KACvCiB;IACN,MAAME,sBACJnB,KAAK,MAAM,GAAGgB,cACVnC,KAAK,IAAI,CAAEmC,AAAAA,CAAAA,cAAchB,KAAK,MAAK,IAAK,KACxCiB;IACNjB,KAAK,IAAI,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,IAAI,GAAGkB;IACpClB,KAAK,KAAK,GAAGnB,KAAK,GAAG,CACnBmB,KAAK,KAAK,GAAGkB,AAAwB,IAAxBA,uBACbH,WAAW,KAAK,GAAGf,KAAK,IAAI;IAE9BA,KAAK,GAAG,GAAGnB,KAAK,GAAG,CAAC,GAAGmB,KAAK,GAAG,GAAGmB;IAClCnB,KAAK,MAAM,GAAGnB,KAAK,GAAG,CACpBmB,KAAK,MAAM,GAAGmB,AAAsB,IAAtBA,qBACdJ,WAAW,MAAM,GAAGf,KAAK,GAAG;IAE9B,OAAOA;AACT;AAEO,eAAeoB,kBACpBC,gBAAwB,EACxBC,IAAkC,EAClCnB,IAAU;IAEV,MAAMoB,eAAeC,WAAWF;IAChC,MAAMG,kCAAkCF,aAAc,MAAM,CAC1D,CAACG;QACC,IAAIA,YAAY,UAAU,CAAC,QAAQ,KAAKC,SAAS,IAAI,EACnD,OAAO;QAET,OAAO;IACT;IAGF,MAAMC,eAAe,MAAMC,wBAAwB;QACjD,gBAAgBR;QAChB,sBAAsBI;QACtBtB;IACF;IACA,OAAOyB;AACT;AAEO,SAASE,uBACdC,KAAuB,EACvBC,WAAgC,EAChCC,KAAc;IAEd,MAAMC,OAA+B,EAAE;IAEvC,KAAK,MAAMC,QAAQJ,MAAO;YAWTK;QAVf,MAAMC,OAAOF,KAAK,IAAI;QAEtB,MAAMG,SAASN,YAAY,IAAI,CAAC,CAACM,SAAWA,OAAO,IAAI,KAAKD;QAC5D,IAAI,CAACC,QAAQ;YACXhC,QAAQ,IAAI,CACV,CAAC,sBAAsB,EAAE+B,KAAK,8BAA8B,CAAC;YAE/D;QACF;QAEA,MAAMlE,SAAS,QAAAiE,CAAAA,eAAAA,KAAK,MAAM,AAAD,IAAVA,KAAAA,IAAAA,aAAa,MAAM;QAClC,MAAMG,UAAUD,OAAO,cAAc,IAAI,GAAG9E,wBAAwB6E,MAAM;QAE1E,MAAMG,WAAiC;YACrC,CAACD,QAAQ,EAAEpE,UAAU;YACrB,GAAIgE,KAAK,KAAK,IAAI,CAAC,CAAC;QACtB;QAEAD,KAAK,IAAI,CAACM;IACZ;IAEA,IAAIP,OACFC,KAAK,IAAI,CAAC;QACRD;IACF;IAGF,OAAOC;AACT;AAGO,MAAMO,cAAcC,EAAE,MAAM,CAAC;IAClC,MAAMA,EAAE,MAAM;IACd,KAAKA,EAAE,MAAM;AACf;AAEO,MAAMC,aAAaD,EAAE,MAAM,CAAC;IACjC,OAAOA,EAAE,MAAM;IACf,QAAQA,EAAE,MAAM;IAChB,KAAKA,EAAE,MAAM,GAAG,QAAQ;AAC1B;AAEO,MAAME,aAAaH,YAAY,GAAG,CAACE,YAAY,GAAG,CACvDD,EAAE,MAAM,CAAC;IACP,MAAMA,EAAE,MAAM,GAAG,QAAQ;AAC3B;AAGK,MAAMG,mBAAmBH,EAAAA,MACvB,CAAC;IACN,8BAA8BA,EAAE,OAAO,CAAC;IACxC,QAAQA,EAAE,MAAM;IAChB,QAAQA,EAAE,KAAK,CAAC;QAACA,EAAE,MAAM;QAAIA,EAAE,MAAM;KAAG;IACxC,MAAME;AACR,GACC,WAAW;AAIP,MAAME,yBAAyB,CAACC;QAGjCC,mBAKAC;IANJ,IAAIC,cAAcH;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,eACjCE,cAAcA,YAAY,IAAI,CAAC,SAAS;IAI1C,IAAID,AAAAA,SAAAA,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,QAAQ,AAAD,MAAM,aAAa;QAC9C,MAAME,QAAQD,YAAY,IAAI,CAAC,KAAK;QACpC,OAAO,kCAAkCC;IAC3C;IAEA,OAAO;AACT;AAEO,MAAMC,8BAA8B,CACzCC;QAQIC;IANJ,IAAI,CAACD,SACH,OAAO,EAAE;IAIX,MAAME,YAAYF;IAClB,IAAIC,AAAAA,SAAAA,CAAAA,kBAAAA,UAAU,IAAI,AAAD,IAAbA,KAAAA,IAAAA,gBAAgB,QAAQ,AAAD,MAAM,eAAeC,UAAU,KAAK,EAAE;QAC/D,MAAMC,OAAOC,OAAO,IAAI,CAACF,UAAU,KAAK;QACxC,OAAOC,KAAK,MAAM,CAAC,CAACE,MAAQZ,uBAAuBS,UAAU,KAAK,CAACG,IAAI;IACzE;IAGA,OAAO,EAAE;AACX"}
@@ -3,8 +3,8 @@ import { systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
3
3
  import { describeUserPage, elementByPositionWithElementInfo } from "./prompt/util.mjs";
4
4
  import { generatePlaywrightTest, generatePlaywrightTestStream } from "./prompt/playwright-generator.mjs";
5
5
  import { generateYamlTest, generateYamlTestStream } from "./prompt/yaml-generator.mjs";
6
- import { AiAssert, AiExtractElementInfo, AiLocateElement, AiLocateSection } from "./inspect.mjs";
6
+ import { AiExtractElementInfo, AiLocateElement, AiLocateSection } from "./inspect.mjs";
7
7
  import { plan } from "./llm-planning.mjs";
8
- import { AIActionType, adaptBboxToRect, callAiFn } from "./common.mjs";
8
+ import { AIActionType, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBboxToRect, callAiFn } from "./common.mjs";
9
9
  import { resizeImageForUiTars, vlmPlanning } from "./ui-tars-planning.mjs";
10
- export { AIActionType, AiAssert, AiExtractElementInfo, AiLocateElement, AiLocateSection, adaptBboxToRect, call as callAi, callAiFn, callAiFnWithStringResponse, callToGetJSONObject, describeUserPage, elementByPositionWithElementInfo, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, plan, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
10
+ export { AIActionType, AiExtractElementInfo, AiLocateElement, AiLocateSection, MidsceneLocation, PointSchema, RectSchema, SizeSchema, actionSpaceTypePrefix, adaptBboxToRect, call as callAi, callAiFn, callAiFnWithStringResponse, callToGetJSONObject, describeUserPage, elementByPositionWithElementInfo, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, plan, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
@@ -1,9 +1,8 @@
1
- import { MIDSCENE_USE_QWEN_VL, MIDSCENE_USE_VLM_UI_TARS, getAIConfigInBoolean, vlLocateMode } from "@midscene/shared/env";
1
+ import { getIsUseQwenVl, vlLocateMode } from "@midscene/shared/env";
2
2
  import { cropByRect, paddingToMatchBlockByBase64, preProcessImageUrl } from "@midscene/shared/img";
3
3
  import { getDebug } from "@midscene/shared/logger";
4
4
  import { assert } from "@midscene/shared/utils";
5
5
  import { AIActionType, adaptBboxToRect, callAiFn, expandSearchArea, markupImageForLLM, mergeRects } from "./common.mjs";
6
- import { systemPromptToAssert } from "./prompt/assertion.mjs";
7
6
  import { extractDataQueryPrompt, systemPromptToExtract } from "./prompt/extraction.mjs";
8
7
  import { findElementPrompt, systemPromptToLocateElement } from "./prompt/llm-locator.mjs";
9
8
  import { sectionLocatorInstruction, systemPromptToLocateSection } from "./prompt/llm-section-locator.mjs";
@@ -58,20 +57,23 @@ const promptsToChatParam = async (multimodalPrompt)=>{
58
57
  async function AiLocateElement(options) {
59
58
  const { context, targetElementDescription, callAI } = options;
60
59
  const { screenshotBase64 } = context;
61
- const { description, elementById, insertElementByPosition } = await describeUserPage(context);
60
+ const modelPreferences = {
61
+ intent: 'grounding'
62
+ };
63
+ const { description, elementById, insertElementByPosition } = await describeUserPage(context, modelPreferences);
62
64
  assert(targetElementDescription, "cannot find the target element description");
63
65
  const userInstructionPrompt = await findElementPrompt.format({
64
66
  pageDescription: description,
65
67
  targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
66
68
  });
67
- const systemPrompt = systemPromptToLocateElement(vlLocateMode());
69
+ const systemPrompt = systemPromptToLocateElement(vlLocateMode(modelPreferences));
68
70
  let imagePayload = screenshotBase64;
69
71
  if (options.searchConfig) {
70
72
  assert(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
71
73
  assert(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
72
74
  imagePayload = options.searchConfig.imageBase64;
73
- } else if ('qwen-vl' === vlLocateMode()) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
74
- else if (!vlLocateMode()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
75
+ } else if ('qwen-vl' === vlLocateMode(modelPreferences)) imagePayload = await paddingToMatchBlockByBase64(imagePayload);
76
+ else if (!vlLocateMode(modelPreferences)) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
75
77
  const msgs = [
76
78
  {
77
79
  role: 'system',
@@ -102,7 +104,9 @@ async function AiLocateElement(options) {
102
104
  msgs.push(...addOns);
103
105
  }
104
106
  const callAIFn = callAI || callToGetJSONObject;
105
- const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT);
107
+ const res = await callAIFn(msgs, AIActionType.INSPECT_ELEMENT, {
108
+ intent: 'grounding'
109
+ });
106
110
  const rawResponse = JSON.stringify(res.content);
107
111
  let resRect;
108
112
  let matchedElements = 'elements' in res.content ? res.content.elements : [];
@@ -110,7 +114,7 @@ async function AiLocateElement(options) {
110
114
  try {
111
115
  if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
112
116
  var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
113
- resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
117
+ resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, modelPreferences, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
114
118
  debugInspect('resRect', resRect);
115
119
  const rectCenter = {
116
120
  x: resRect.left + resRect.width / 2,
@@ -151,7 +155,10 @@ async function AiLocateElement(options) {
151
155
  async function AiLocateSection(options) {
152
156
  const { context, sectionDescription } = options;
153
157
  const { screenshotBase64 } = context;
154
- const systemPrompt = systemPromptToLocateSection(vlLocateMode());
158
+ const modelPreferences = {
159
+ intent: 'grounding'
160
+ };
161
+ const systemPrompt = systemPromptToLocateSection(vlLocateMode(modelPreferences));
155
162
  const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
156
163
  sectionDescription: extraTextFromUserPrompt(sectionDescription)
157
164
  });
@@ -184,26 +191,30 @@ async function AiLocateSection(options) {
184
191
  });
185
192
  msgs.push(...addOns);
186
193
  }
187
- const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
194
+ const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA, {
195
+ intent: 'grounding'
196
+ });
188
197
  let sectionRect;
189
198
  const sectionBbox = result.content.bbox;
190
199
  if (sectionBbox) {
191
- const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height);
200
+ const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height, modelPreferences);
192
201
  debugSection('original targetRect %j', targetRect);
193
202
  const referenceBboxList = result.content.references_bbox || [];
194
203
  debugSection('referenceBboxList %j', referenceBboxList);
195
- const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height));
204
+ const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height, modelPreferences));
196
205
  debugSection('referenceRects %j', referenceRects);
197
206
  const mergedRect = mergeRects([
198
207
  targetRect,
199
208
  ...referenceRects
200
209
  ]);
201
210
  debugSection('mergedRect %j', mergedRect);
202
- sectionRect = expandSearchArea(mergedRect, context.size);
211
+ sectionRect = expandSearchArea(mergedRect, context.size, modelPreferences);
203
212
  debugSection('expanded sectionRect %j', sectionRect);
204
213
  }
205
214
  let imageBase64 = screenshotBase64;
206
- if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, getAIConfigInBoolean(MIDSCENE_USE_QWEN_VL));
215
+ if (sectionRect) imageBase64 = await cropByRect(screenshotBase64, sectionRect, getIsUseQwenVl({
216
+ intent: 'grounding'
217
+ }));
207
218
  return {
208
219
  rect: sectionRect,
209
220
  imageBase64,
@@ -214,10 +225,10 @@ async function AiLocateSection(options) {
214
225
  }
215
226
  async function AiExtractElementInfo(options) {
216
227
  var _options_extractOption;
217
- const { dataQuery, context, extractOption, multimodalPrompt } = options;
228
+ const { dataQuery, context, extractOption, multimodalPrompt, modelPreferences } = options;
218
229
  const systemPrompt = systemPromptToExtract();
219
230
  const { screenshotBase64 } = context;
220
- const { description, elementById } = await describeUserPage(context, {
231
+ const { description, elementById } = await describeUserPage(context, modelPreferences, {
221
232
  truncateTextLength: 200,
222
233
  filterNonTextContent: false,
223
234
  visibleOnly: false,
@@ -257,61 +268,13 @@ async function AiExtractElementInfo(options) {
257
268
  });
258
269
  msgs.push(...addOns);
259
270
  }
260
- const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA);
271
+ const result = await callAiFn(msgs, AIActionType.EXTRACT_DATA, modelPreferences);
261
272
  return {
262
273
  parseResult: result.content,
263
274
  elementById,
264
275
  usage: result.usage
265
276
  };
266
277
  }
267
- async function AiAssert(options) {
268
- const { assertion, context } = options;
269
- assert(assertion, 'assertion should not be empty');
270
- const { screenshotBase64 } = context;
271
- const systemPrompt = systemPromptToAssert({
272
- isUITars: getAIConfigInBoolean(MIDSCENE_USE_VLM_UI_TARS)
273
- });
274
- const assertionText = extraTextFromUserPrompt(assertion);
275
- const msgs = [
276
- {
277
- role: 'system',
278
- content: systemPrompt
279
- },
280
- {
281
- role: 'user',
282
- content: [
283
- {
284
- type: 'image_url',
285
- image_url: {
286
- url: screenshotBase64,
287
- detail: 'high'
288
- }
289
- },
290
- {
291
- type: 'text',
292
- text: `
293
- Here is the assertion. Please tell whether it is truthy according to the screenshot.
294
- =====================================
295
- ${assertionText}
296
- =====================================
297
- `
298
- }
299
- ]
300
- }
301
- ];
302
- if ('string' != typeof assertion) {
303
- const addOns = await promptsToChatParam({
304
- images: assertion.images,
305
- convertHttpImage2Base64: assertion.convertHttpImage2Base64
306
- });
307
- msgs.push(...addOns);
308
- }
309
- const { content: assertResult, usage } = await callAiFn(msgs, AIActionType.ASSERT);
310
- return {
311
- content: assertResult,
312
- usage
313
- };
314
- }
315
- export { AiAssert, AiExtractElementInfo, AiLocateElement, AiLocateSection };
278
+ export { AiExtractElementInfo, AiLocateElement, AiLocateSection };
316
279
 
317
280
  //# sourceMappingURL=inspect.mjs.map