@midscene/core 0.26.7-beta-20250821124744.0 → 0.26.7-beta-20250821134240.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -68,47 +68,52 @@ async function vlmPlanning(options) {
68
68
  },
69
69
  modelVer: modelVer || void 0
70
70
  });
71
- debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));
71
+ debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));
72
72
  const transformActions = [];
73
73
  parsed.forEach((action)=>{
74
74
  if ('click' === action.action_type) {
75
75
  (0, utils_namespaceObject.assert)(action.action_inputs.start_box, 'start_box is required');
76
76
  const point = getPoint(action.action_inputs.start_box, size);
77
77
  transformActions.push({
78
- type: 'Tap',
79
- param: {
80
- locate: {
81
- prompt: action.thought || '',
82
- bbox: pointToBbox({
83
- x: point[0],
84
- y: point[1]
85
- }, size.width, size.height)
86
- }
78
+ type: 'Locate',
79
+ param: {},
80
+ locate: {
81
+ prompt: action.thought || '',
82
+ bbox: pointToBbox({
83
+ x: point[0],
84
+ y: point[1]
85
+ }, size.width, size.height)
87
86
  }
88
87
  });
88
+ transformActions.push({
89
+ type: 'Tap',
90
+ locate: {
91
+ prompt: action.thought || '',
92
+ bbox: pointToBbox({
93
+ x: point[0],
94
+ y: point[1]
95
+ }, size.width, size.height)
96
+ },
97
+ param: action.thought || ''
98
+ });
89
99
  } else if ('drag' === action.action_type) {
90
100
  (0, utils_namespaceObject.assert)(action.action_inputs.start_box, 'start_box is required');
91
101
  (0, utils_namespaceObject.assert)(action.action_inputs.end_box, 'end_box is required');
92
102
  const startPoint = getPoint(action.action_inputs.start_box, size);
93
103
  const endPoint = getPoint(action.action_inputs.end_box, size);
94
104
  transformActions.push({
95
- type: 'DragAndDrop',
105
+ type: 'Drag',
96
106
  param: {
97
- from: {
98
- prompt: action.thought || '',
99
- bbox: pointToBbox({
100
- x: startPoint[0],
101
- y: startPoint[1]
102
- }, size.width, size.height)
107
+ start_box: {
108
+ x: startPoint[0],
109
+ y: startPoint[1]
103
110
  },
104
- to: {
105
- prompt: action.thought || '',
106
- bbox: pointToBbox({
107
- x: endPoint[0],
108
- y: endPoint[1]
109
- }, size.width, size.height)
111
+ end_box: {
112
+ x: endPoint[0],
113
+ y: endPoint[1]
110
114
  }
111
115
  },
116
+ locate: null,
112
117
  thought: action.thought || ''
113
118
  });
114
119
  } else if ('type' === action.action_type) transformActions.push({
@@ -116,6 +121,7 @@ async function vlmPlanning(options) {
116
121
  param: {
117
122
  value: action.action_inputs.content
118
123
  },
124
+ locate: null,
119
125
  thought: action.thought || ''
120
126
  });
121
127
  else if ('scroll' === action.action_type) transformActions.push({
@@ -123,11 +129,13 @@ async function vlmPlanning(options) {
123
129
  param: {
124
130
  direction: action.action_inputs.direction
125
131
  },
132
+ locate: null,
126
133
  thought: action.thought || ''
127
134
  });
128
135
  else if ('finished' === action.action_type) transformActions.push({
129
136
  type: 'Finished',
130
137
  param: {},
138
+ locate: null,
131
139
  thought: action.thought || ''
132
140
  });
133
141
  else if ('hotkey' === action.action_type) if (action.action_inputs.key) {
@@ -135,8 +143,9 @@ async function vlmPlanning(options) {
135
143
  transformActions.push({
136
144
  type: 'KeyboardPress',
137
145
  param: {
138
- keyName: keys
146
+ value: keys
139
147
  },
148
+ locate: null,
140
149
  thought: action.thought || ''
141
150
  });
142
151
  } else console.warn('No key found in action: hotkey. Will not perform action.');
@@ -145,16 +154,19 @@ async function vlmPlanning(options) {
145
154
  param: {
146
155
  timeMs: 1000
147
156
  },
157
+ locate: null,
148
158
  thought: action.thought || ''
149
159
  });
150
160
  else if ('androidBackButton' === action.action_type) transformActions.push({
151
161
  type: 'AndroidBackButton',
152
162
  param: {},
163
+ locate: null,
153
164
  thought: action.thought || ''
154
165
  });
155
166
  else if ('androidHomeButton' === action.action_type) transformActions.push({
156
167
  type: 'AndroidHomeButton',
157
168
  param: {},
169
+ locate: null,
158
170
  thought: action.thought || ''
159
171
  });
160
172
  else if ('androidRecentAppsButton' === action.action_type) transformActions.push({
@@ -162,31 +174,33 @@ async function vlmPlanning(options) {
162
174
  param: {}
163
175
  });
164
176
  else if ('androidLongPress' === action.action_type) {
165
- (0, utils_namespaceObject.assert)(action.action_inputs.start_box, 'start_box is required for androidLongPress');
166
- const start = getPoint(action.action_inputs.start_box, size);
177
+ (0, utils_namespaceObject.assert)(action.action_inputs.start_coords, 'start_coords is required for androidLongPress');
178
+ const point = action.action_inputs.start_coords;
167
179
  transformActions.push({
168
180
  type: 'AndroidLongPress',
169
181
  param: {
170
- locate: {
171
- prompt: action.thought || '',
172
- bbox: pointToBbox({
173
- x: start[0],
174
- y: start[1]
175
- }, size.width, size.height)
176
- },
182
+ x: point[0],
183
+ y: point[1],
177
184
  duration: 1000
178
185
  },
186
+ locate: null,
179
187
  thought: action.thought || ''
180
188
  });
181
189
  } else if ('androidPull' === action.action_type) {
182
190
  const pullDirection = action.action_inputs.direction || 'down';
191
+ const startPoint = action.action_inputs.start_coords ? {
192
+ x: action.action_inputs.start_coords[0],
193
+ y: action.action_inputs.start_coords[1]
194
+ } : void 0;
183
195
  transformActions.push({
184
196
  type: 'AndroidPull',
185
197
  param: {
186
198
  direction: pullDirection,
199
+ startPoint,
187
200
  distance: action.action_inputs.distance,
188
201
  duration: action.action_inputs.duration || 500
189
202
  },
203
+ locate: null,
190
204
  thought: action.thought || ''
191
205
  });
192
206
  }
@@ -197,7 +211,6 @@ async function vlmPlanning(options) {
197
211
  parsed
198
212
  }
199
213
  });
200
- debug('transformActions', JSON.stringify(transformActions, null, 2));
201
214
  return {
202
215
  actions: transformActions,
203
216
  actionsFromModel: parsed,
@@ -1 +1 @@
1
- {"version":3,"file":"ai-model/ui-tars-planning.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n type IModelPreferences,\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait'\n | 'androidBackButton'\n | 'androidHomeButton'\n | 'androidRecentAppsButton'\n | 'androidLongPress'\n | 'androidPull';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelPreferences: IModelPreferences;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelPreferences } =\n options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelPreferences,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion(modelPreferences);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('ui-tars modelVer', modelVer, ', parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Tap',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n },\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'DragAndDrop',\n param: {\n from: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: startPoint[0], y: startPoint[1] },\n size.width,\n size.height,\n ),\n },\n to: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: endPoint[0], y: endPoint[1] },\n size.width,\n size.height,\n ),\n },\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n keyName: keys,\n },\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidBackButton') {\n transformActions.push({\n type: 'AndroidBackButton',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidHomeButton') {\n transformActions.push({\n type: 'AndroidHomeButton',\n param: {},\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidRecentAppsButton') {\n transformActions.push({\n type: 'AndroidRecentAppsButton',\n param: {},\n });\n } else if (action.action_type === 'androidLongPress') {\n assert(\n action.action_inputs.start_box,\n 'start_box is required for androidLongPress',\n );\n const start = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'AndroidLongPress',\n param: {\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: start[0], y: start[1] },\n size.width,\n size.height,\n ),\n },\n duration: 1000,\n },\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidPull') {\n const pullDirection = action.action_inputs.direction || 'down';\n transformActions.push({\n type: 'AndroidPull',\n param: {\n direction: pullDirection as 'up' | 'down',\n distance: (action.action_inputs as any).distance,\n duration: (action.action_inputs as any).duration || 500,\n },\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n debug('transformActions', JSON.stringify(transformActions, null, 2));\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\ninterface AndroidLongPressAction extends BaseAction {\n action_type: 'androidLongPress';\n action_inputs: {\n start_coords: [number, number]; // Coordinates for long press\n duration?: number; // Duration in milliseconds\n };\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction\n | AndroidLongPressAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (\n vlLocateMode(modelPreferences) === 'vlm-ui-tars' &&\n uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelPreferences","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","start","pullDirection","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;AC6BA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,gBAAgB,EAAE,GACpEJ;IACF,MAAMK,eAAeC,AAAAA,IAAAA,oCAAAA,uBAAAA,AAAAA,MAA4BJ;IAEjD,MAAMK,MAAM,MAAMC,AAAAA,IAAAA,yBAAAA,IAAAA,AAAAA,EAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGJ;KACJ,EACDQ,mCAAAA,YAAAA,CAAAA,eAA4B,EAC5BL;IAEF,MAAMM,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EAAmBT;IAEpC,MAAM,EAAEU,MAAM,EAAE,GAAGC,AAAAA,IAAAA,8BAAAA,YAAAA,AAAAA,EAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOP,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUS,YAAYI;IACxB;IAEAzB,MAAM,oBAAoBqB,UAAU,YAAYK,KAAK,SAAS,CAACH;IAE/D,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMxB,QAAQ0B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAGC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;YACF;QACF,OAAO,IAAIgB,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YAC5D,MAAMoB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEhB;YACxDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,MAAM;wBACJ,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG4B,UAAU,CAAC,EAAE;4BAAE,GAAGA,UAAU,CAAC,EAAE;wBAAC,GACrCnB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,IAAI;wBACF,QAAQgB,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAG6B,QAAQ,CAAC,EAAE;4BAAE,GAAGA,QAAQ,CAAC,EAAE;wBAAC,GACjCpB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;gBACF;gBACA,SAASgB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,AAAAA,IAAAA,mCAAAA,oBAAAA,AAAAA,EAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,SAASM;gBACX;gBACA,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAbEO,QAAQ,IAAI,CACV;aAaC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,8BAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;QACV;aACK,IAAIC,AAAuB,uBAAvBA,OAAO,WAAW,EAAyB;YACpDC,IAAAA,sBAAAA,MAAAA,AAAAA,EACED,OAAO,aAAa,CAAC,SAAS,EAC9B;YAEF,MAAMQ,QAAQN,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,QAAQ;wBACN,QAAQC,OAAO,OAAO,IAAI;wBAC1B,MAAMzB,YACJ;4BAAE,GAAGiC,KAAK,CAAC,EAAE;4BAAE,GAAGA,KAAK,CAAC,EAAE;wBAAC,GAC3BxB,KAAK,KAAK,EACVA,KAAK,MAAM;oBAEf;oBACA,UAAU;gBACZ;gBACA,SAASgB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,kBAAvBA,OAAO,WAAW,EAAoB;YAC/C,MAAMS,gBAAgBT,OAAO,aAAa,CAAC,SAAS,IAAI;YACxDD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAWU;oBACX,UAAWT,OAAO,aAAa,CAAS,QAAQ;oBAChD,UAAWA,OAAO,aAAa,CAAS,QAAQ,IAAI;gBACtD;gBACA,SAASA,OAAO,OAAO,IAAI;YAC7B;QACF;IACF;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIW,MAAM,CAAC,4BAA4B,EAAEtB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGFvB,MAAM,oBAAoB0B,KAAK,SAAS,CAACC,kBAAkB,MAAM;IAEjE,OAAO;QACL,SAASA;QACT,kBAAkBJ;QAClB,gBAAgBgB,AAAAA,IAAAA,oCAAAA,UAAAA,AAAAA,EAAWvB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBoB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI9C,KAAK,KAAK,CAAEyC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI/C,KAAK,KAAK,CAAE2C,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASZ,SAAS0B,QAAgB,EAAE5C,IAAuC;IACzE,MAAM,CAACyC,GAAGC,EAAE,GAAG5B,KAAK,KAAK,CAAC8B;IAC1B,OAAO;QAACH,IAAIzC,KAAK,KAAK;QAAE0C,IAAI1C,KAAK,MAAM;KAAC;AAC1C;AA2EO,eAAe6C,qBACpBC,WAAmB,EACnB9C,IAAU,EACVC,gBAAmC;IAEnC,IACE8C,AAAmC,kBAAnCA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAa9C,qBACbS,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EAAmBT,sBAAsB+C,oBAAAA,kBAAAA,CAAAA,IAAuB,EAChE;QACA5D,MAAM,uCAAuCY;QAC7C,MAAMiD,gBAAgBjD,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMkD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAexD,KAAK,IAAI,CAACuD,YAAYD;YAC3C,MAAMG,WAAWzD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGmD;YACzC,MAAME,YAAY1D,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGmD;YAC3C/D,MACE,2DACAgE,UACAC;YAEF,MAAMC,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,eAAAA,AAAAA,EAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
1
+ {"version":3,"file":"ai-model/ui-tars-planning.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/ai-model/ui-tars-planning.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import type {\n AIUsageInfo,\n MidsceneYamlFlowItem,\n PlanningAction,\n Size,\n} from '@/types';\nimport {\n type IModelPreferences,\n UITarsModelVersion,\n uiTarsModelVersion,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { resizeImgBase64 } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { transformHotkeyInput } from '@midscene/shared/us-keyboard-layout';\nimport { assert } from '@midscene/shared/utils';\nimport { actionParser } from '@ui-tars/action-parser';\nimport type { ChatCompletionMessageParam } from 'openai/resources/index';\nimport { AIActionType } from './common';\nimport { getSummary, getUiTarsPlanningPrompt } from './prompt/ui-tars-planning';\nimport { call } from './service-caller/index';\ntype ActionType =\n | 'click'\n | 'drag'\n | 'type'\n | 'hotkey'\n | 'finished'\n | 'scroll'\n | 'wait'\n | 'androidBackButton'\n | 'androidHomeButton'\n | 'androidRecentAppsButton'\n | 'androidLongPress'\n | 'androidPull';\n\nconst debug = getDebug('ui-tars-planning');\nconst bboxSize = 10;\nconst pointToBbox = (\n point: { x: number; y: number },\n width: number,\n height: number,\n): [number, number, number, number] => {\n return [\n Math.round(Math.max(point.x - bboxSize / 2, 0)),\n Math.round(Math.max(point.y - bboxSize / 2, 0)),\n Math.round(Math.min(point.x + bboxSize / 2, width)),\n Math.round(Math.min(point.y + bboxSize / 2, height)),\n ];\n};\n\nexport async function vlmPlanning(options: {\n userInstruction: string;\n conversationHistory: ChatCompletionMessageParam[];\n size: { width: number; height: number };\n modelPreferences: IModelPreferences;\n}): Promise<{\n actions: PlanningAction<any>[];\n actionsFromModel: ReturnType<typeof actionParser>['parsed'];\n action_summary: string;\n yamlFlow?: MidsceneYamlFlowItem[];\n usage?: AIUsageInfo;\n rawResponse?: string;\n}> {\n const { conversationHistory, userInstruction, size, modelPreferences } =\n options;\n const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;\n\n const res = await call(\n [\n {\n role: 'user',\n content: systemPrompt,\n },\n ...conversationHistory,\n ],\n AIActionType.INSPECT_ELEMENT,\n modelPreferences,\n );\n const convertedText = convertBboxToCoordinates(res.content);\n\n const modelVer = uiTarsModelVersion(modelPreferences);\n\n const { parsed } = actionParser({\n prediction: convertedText,\n factor: [1000, 1000],\n screenContext: {\n width: size.width,\n height: size.height,\n },\n modelVer: modelVer || undefined,\n });\n\n debug('modelVer', modelVer, 'parsed', JSON.stringify(parsed));\n\n const transformActions: PlanningAction[] = [];\n parsed.forEach((action) => {\n if (action.action_type === 'click') {\n assert(action.action_inputs.start_box, 'start_box is required');\n const point = getPoint(action.action_inputs.start_box, size);\n transformActions.push({\n type: 'Locate',\n param: {},\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n });\n transformActions.push({\n type: 'Tap',\n locate: {\n prompt: action.thought || '',\n bbox: pointToBbox(\n { x: point[0], y: point[1] },\n size.width,\n size.height,\n ),\n },\n param: action.thought || '',\n });\n } else if (action.action_type === 'drag') {\n assert(action.action_inputs.start_box, 'start_box is required');\n assert(action.action_inputs.end_box, 'end_box is required');\n const startPoint = getPoint(action.action_inputs.start_box, size);\n const endPoint = getPoint(action.action_inputs.end_box, size);\n transformActions.push({\n type: 'Drag',\n param: {\n start_box: { x: startPoint[0], y: startPoint[1] },\n end_box: { x: endPoint[0], y: endPoint[1] },\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'type') {\n transformActions.push({\n type: 'Input',\n param: {\n value: action.action_inputs.content,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'scroll') {\n transformActions.push({\n type: 'Scroll',\n param: {\n direction: action.action_inputs.direction,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'finished') {\n transformActions.push({\n type: 'Finished',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'hotkey') {\n if (!action.action_inputs.key) {\n console.warn(\n 'No key found in action: hotkey. Will not perform action.',\n );\n } else {\n const keys = transformHotkeyInput(action.action_inputs.key);\n\n transformActions.push({\n type: 'KeyboardPress',\n param: {\n value: keys,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n } else if (action.action_type === 'wait') {\n transformActions.push({\n type: 'Sleep',\n param: {\n timeMs: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidBackButton') {\n transformActions.push({\n type: 'AndroidBackButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidHomeButton') {\n transformActions.push({\n type: 'AndroidHomeButton',\n param: {},\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidRecentAppsButton') {\n transformActions.push({\n type: 'AndroidRecentAppsButton',\n param: {},\n });\n } else if (action.action_type === 'androidLongPress') {\n assert(\n action.action_inputs.start_coords,\n 'start_coords is required for androidLongPress',\n );\n const point = action.action_inputs.start_coords;\n transformActions.push({\n type: 'AndroidLongPress',\n param: {\n x: point[0],\n y: point[1],\n duration: 1000,\n },\n locate: null,\n thought: action.thought || '',\n });\n } else if (action.action_type === 'androidPull') {\n const pullDirection = action.action_inputs.direction || 'down';\n const startPoint = action.action_inputs.start_coords\n ? {\n x: action.action_inputs.start_coords[0],\n y: action.action_inputs.start_coords[1],\n }\n : undefined;\n\n transformActions.push({\n type: 'AndroidPull',\n param: {\n direction: pullDirection as 'up' | 'down',\n startPoint,\n distance: (action.action_inputs as any).distance,\n duration: (action.action_inputs as any).duration || 500,\n },\n locate: null,\n thought: action.thought || '',\n });\n }\n });\n\n if (transformActions.length === 0) {\n throw new Error(`No actions found, response: ${res.content}`, {\n cause: {\n prediction: res.content,\n parsed,\n },\n });\n }\n\n return {\n actions: transformActions,\n actionsFromModel: parsed,\n action_summary: getSummary(res.content),\n usage: res.usage,\n rawResponse: JSON.stringify(res.content, undefined, 2),\n };\n}\n\n/**\n * Converts bounding box notation to coordinate points\n * @param text - The text containing bbox tags to be converted\n * @returns The text with bbox tags replaced by coordinate points\n */\nfunction convertBboxToCoordinates(text: string): string {\n // Match the four numbers after <bbox>\n const pattern = /<bbox>(\\d+)\\s+(\\d+)\\s+(\\d+)\\s+(\\d+)<\\/bbox>/g;\n\n function replaceMatch(\n match: string,\n x1: string,\n y1: string,\n x2: string,\n y2: string,\n ): string {\n // Convert strings to numbers and calculate center point\n const x1Num = Number.parseInt(x1, 10);\n const y1Num = Number.parseInt(y1, 10);\n const x2Num = Number.parseInt(x2, 10);\n const y2Num = Number.parseInt(y2, 10);\n\n // Use Math.floor to truncate and calculate center point\n const x = Math.floor((x1Num + x2Num) / 2);\n const y = Math.floor((y1Num + y2Num) / 2);\n\n // Return formatted coordinate string\n return `(${x},${y})`;\n }\n\n // Remove [EOS] and replace <bbox> coordinates\n const cleanedText = text.replace(/\\[EOS\\]/g, '');\n return cleanedText.replace(pattern, replaceMatch).trim();\n}\n\nfunction getPoint(startBox: string, size: { width: number; height: number }) {\n const [x, y] = JSON.parse(startBox);\n return [x * size.width, y * size.height];\n}\n\ninterface BaseAction {\n action_type: ActionType;\n action_inputs: Record<string, any>;\n reflection: string | null;\n thought: string | null;\n}\n\ninterface ClickAction extends BaseAction {\n action_type: 'click';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface DragAction extends BaseAction {\n action_type: 'drag';\n action_inputs: {\n start_box: string; // JSON string of [x, y] coordinates\n end_box: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface WaitAction extends BaseAction {\n action_type: 'wait';\n action_inputs: {\n time: string; // JSON string of [x, y] coordinates\n };\n}\n\ninterface TypeAction extends BaseAction {\n action_type: 'type';\n action_inputs: {\n content: string;\n };\n}\n\ninterface HotkeyAction extends BaseAction {\n action_type: 'hotkey';\n action_inputs: {\n key: string;\n };\n}\n\ninterface ScrollAction extends BaseAction {\n action_type: 'scroll';\n action_inputs: {\n direction: 'up' | 'down';\n };\n}\n\ninterface FinishedAction extends BaseAction {\n action_type: 'finished';\n action_inputs: Record<string, never>;\n}\n\ninterface AndroidLongPressAction extends BaseAction {\n action_type: 'androidLongPress';\n action_inputs: {\n start_coords: [number, number]; // Coordinates for long press\n duration?: number; // Duration in milliseconds\n };\n}\n\nexport type Action =\n | ClickAction\n | DragAction\n | TypeAction\n | HotkeyAction\n | ScrollAction\n | FinishedAction\n | WaitAction\n | AndroidLongPressAction;\n\nexport async function resizeImageForUiTars(\n imageBase64: string,\n size: Size,\n modelPreferences: IModelPreferences,\n) {\n if (\n vlLocateMode(modelPreferences) === 'vlm-ui-tars' &&\n uiTarsModelVersion(modelPreferences) === UITarsModelVersion.V1_5\n ) {\n debug('ui-tars-v1.5, will check image size', size);\n const currentPixels = size.width * size.height;\n const maxPixels = 16384 * 28 * 28; //\n if (currentPixels > maxPixels) {\n const resizeFactor = Math.sqrt(maxPixels / currentPixels);\n const newWidth = Math.floor(size.width * resizeFactor);\n const newHeight = Math.floor(size.height * resizeFactor);\n debug(\n 'resize image for ui-tars, new width: %s, new height: %s',\n newWidth,\n newHeight,\n );\n const resizedImage = await resizeImgBase64(imageBase64, {\n width: newWidth,\n height: newHeight,\n });\n return resizedImage;\n }\n }\n return imageBase64;\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","bboxSize","pointToBbox","point","width","height","Math","vlmPlanning","options","conversationHistory","userInstruction","size","modelPreferences","systemPrompt","getUiTarsPlanningPrompt","res","call","AIActionType","convertedText","convertBboxToCoordinates","modelVer","uiTarsModelVersion","parsed","actionParser","undefined","JSON","transformActions","action","assert","getPoint","startPoint","endPoint","keys","transformHotkeyInput","console","pullDirection","Error","getSummary","text","pattern","replaceMatch","match","x1","y1","x2","y2","x1Num","Number","y1Num","x2Num","y2Num","x","y","cleanedText","startBox","resizeImageForUiTars","imageBase64","vlLocateMode","UITarsModelVersion","currentPixels","maxPixels","resizeFactor","newWidth","newHeight","resizedImage","resizeImgBase64"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;AC6BA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACvB,MAAMC,WAAW;AACjB,MAAMC,cAAc,CAClBC,OACAC,OACAC,SAEO;QACLC,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAG;QAC5CK,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGG;QAC5CE,KAAK,KAAK,CAACA,KAAK,GAAG,CAACH,MAAM,CAAC,GAAGF,WAAW,GAAGI;KAC7C;AAGI,eAAeE,YAAYC,OAKjC;IAQC,MAAM,EAAEC,mBAAmB,EAAEC,eAAe,EAAEC,IAAI,EAAEC,gBAAgB,EAAE,GACpEJ;IACF,MAAMK,eAAeC,AAAAA,IAAAA,oCAAAA,uBAAAA,AAAAA,MAA4BJ;IAEjD,MAAMK,MAAM,MAAMC,AAAAA,IAAAA,yBAAAA,IAAAA,AAAAA,EAChB;QACE;YACE,MAAM;YACN,SAASH;QACX;WACGJ;KACJ,EACDQ,mCAAAA,YAAAA,CAAAA,eAA4B,EAC5BL;IAEF,MAAMM,gBAAgBC,yBAAyBJ,IAAI,OAAO;IAE1D,MAAMK,WAAWC,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EAAmBT;IAEpC,MAAM,EAAEU,MAAM,EAAE,GAAGC,AAAAA,IAAAA,8BAAAA,YAAAA,AAAAA,EAAa;QAC9B,YAAYL;QACZ,QAAQ;YAAC;YAAM;SAAK;QACpB,eAAe;YACb,OAAOP,KAAK,KAAK;YACjB,QAAQA,KAAK,MAAM;QACrB;QACA,UAAUS,YAAYI;IACxB;IAEAzB,MAAM,YAAYqB,UAAU,UAAUK,KAAK,SAAS,CAACH;IAErD,MAAMI,mBAAqC,EAAE;IAC7CJ,OAAO,OAAO,CAAC,CAACK;QACd,IAAIA,AAAuB,YAAvBA,OAAO,WAAW,EAAc;YAClCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvC,MAAMxB,QAAQ0B,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YACvDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO,CAAC;gBACR,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMzB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;YACF;YACAe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,QAAQ;oBACN,QAAQC,OAAO,OAAO,IAAI;oBAC1B,MAAMzB,YACJ;wBAAE,GAAGC,KAAK,CAAC,EAAE;wBAAE,GAAGA,KAAK,CAAC,EAAE;oBAAC,GAC3BQ,KAAK,KAAK,EACVA,KAAK,MAAM;gBAEf;gBACA,OAAOgB,OAAO,OAAO,IAAI;YAC3B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAAa;YACxCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,SAAS,EAAE;YACvCC,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,OAAO,aAAa,CAAC,OAAO,EAAE;YACrC,MAAMG,aAAaD,SAASF,OAAO,aAAa,CAAC,SAAS,EAAEhB;YAC5D,MAAMoB,WAAWF,SAASF,OAAO,aAAa,CAAC,OAAO,EAAEhB;YACxDe,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAW;wBAAE,GAAGI,UAAU,CAAC,EAAE;wBAAE,GAAGA,UAAU,CAAC,EAAE;oBAAC;oBAChD,SAAS;wBAAE,GAAGC,QAAQ,CAAC,EAAE;wBAAE,GAAGA,QAAQ,CAAC,EAAE;oBAAC;gBAC5C;gBACA,QAAQ;gBACR,SAASJ,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,OAAOC,OAAO,aAAa,CAAC,OAAO;YACrC;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,WAAWC,OAAO,aAAa,CAAC,SAAS;YAC3C;YACA,QAAQ;YACR,SAASA,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,eAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,aAAvBA,OAAO,WAAW,EAC3B,IAAKA,OAAO,aAAa,CAAC,GAAG,EAItB;YACL,MAAMK,OAAOC,AAAAA,IAAAA,mCAAAA,oBAAAA,AAAAA,EAAqBN,OAAO,aAAa,CAAC,GAAG;YAE1DD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,OAAOM;gBACT;gBACA,QAAQ;gBACR,SAASL,OAAO,OAAO,IAAI;YAC7B;QACF,OAdEO,QAAQ,IAAI,CACV;aAcC,IAAIP,AAAuB,WAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO;gBACL,QAAQ;YACV;YACA,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,wBAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;YACR,QAAQ;YACR,SAASC,OAAO,OAAO,IAAI;QAC7B;aACK,IAAIA,AAAuB,8BAAvBA,OAAO,WAAW,EAC3BD,iBAAiB,IAAI,CAAC;YACpB,MAAM;YACN,OAAO,CAAC;QACV;aACK,IAAIC,AAAuB,uBAAvBA,OAAO,WAAW,EAAyB;YACpDC,IAAAA,sBAAAA,MAAAA,AAAAA,EACED,OAAO,aAAa,CAAC,YAAY,EACjC;YAEF,MAAMxB,QAAQwB,OAAO,aAAa,CAAC,YAAY;YAC/CD,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,GAAGvB,KAAK,CAAC,EAAE;oBACX,GAAGA,KAAK,CAAC,EAAE;oBACX,UAAU;gBACZ;gBACA,QAAQ;gBACR,SAASwB,OAAO,OAAO,IAAI;YAC7B;QACF,OAAO,IAAIA,AAAuB,kBAAvBA,OAAO,WAAW,EAAoB;YAC/C,MAAMQ,gBAAgBR,OAAO,aAAa,CAAC,SAAS,IAAI;YACxD,MAAMG,aAAaH,OAAO,aAAa,CAAC,YAAY,GAChD;gBACE,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;gBACvC,GAAGA,OAAO,aAAa,CAAC,YAAY,CAAC,EAAE;YACzC,IACAH;YAEJE,iBAAiB,IAAI,CAAC;gBACpB,MAAM;gBACN,OAAO;oBACL,WAAWS;oBACXL;oBACA,UAAWH,OAAO,aAAa,CAAS,QAAQ;oBAChD,UAAWA,OAAO,aAAa,CAAS,QAAQ,IAAI;gBACtD;gBACA,QAAQ;gBACR,SAASA,OAAO,OAAO,IAAI;YAC7B;QACF;IACF;IAEA,IAAID,AAA4B,MAA5BA,iBAAiB,MAAM,EACzB,MAAM,IAAIU,MAAM,CAAC,4BAA4B,EAAErB,IAAI,OAAO,EAAE,EAAE;QAC5D,OAAO;YACL,YAAYA,IAAI,OAAO;YACvBO;QACF;IACF;IAGF,OAAO;QACL,SAASI;QACT,kBAAkBJ;QAClB,gBAAgBe,AAAAA,IAAAA,oCAAAA,UAAAA,AAAAA,EAAWtB,IAAI,OAAO;QACtC,OAAOA,IAAI,KAAK;QAChB,aAAaU,KAAK,SAAS,CAACV,IAAI,OAAO,EAAES,QAAW;IACtD;AACF;AAOA,SAASL,yBAAyBmB,IAAY;IAE5C,MAAMC,UAAU;IAEhB,SAASC,aACPC,KAAa,EACbC,EAAU,EACVC,EAAU,EACVC,EAAU,EACVC,EAAU;QAGV,MAAMC,QAAQC,OAAO,QAAQ,CAACL,IAAI;QAClC,MAAMM,QAAQD,OAAO,QAAQ,CAACJ,IAAI;QAClC,MAAMM,QAAQF,OAAO,QAAQ,CAACH,IAAI;QAClC,MAAMM,QAAQH,OAAO,QAAQ,CAACF,IAAI;QAGlC,MAAMM,IAAI7C,KAAK,KAAK,CAAEwC,AAAAA,CAAAA,QAAQG,KAAI,IAAK;QACvC,MAAMG,IAAI9C,KAAK,KAAK,CAAE0C,AAAAA,CAAAA,QAAQE,KAAI,IAAK;QAGvC,OAAO,CAAC,CAAC,EAAEC,EAAE,CAAC,EAAEC,EAAE,CAAC,CAAC;IACtB;IAGA,MAAMC,cAAcf,KAAK,OAAO,CAAC,YAAY;IAC7C,OAAOe,YAAY,OAAO,CAACd,SAASC,cAAc,IAAI;AACxD;AAEA,SAASX,SAASyB,QAAgB,EAAE3C,IAAuC;IACzE,MAAM,CAACwC,GAAGC,EAAE,GAAG3B,KAAK,KAAK,CAAC6B;IAC1B,OAAO;QAACH,IAAIxC,KAAK,KAAK;QAAEyC,IAAIzC,KAAK,MAAM;KAAC;AAC1C;AA2EO,eAAe4C,qBACpBC,WAAmB,EACnB7C,IAAU,EACVC,gBAAmC;IAEnC,IACE6C,AAAmC,kBAAnCA,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAa7C,qBACbS,AAAAA,IAAAA,oBAAAA,kBAAAA,AAAAA,EAAmBT,sBAAsB8C,oBAAAA,kBAAAA,CAAAA,IAAuB,EAChE;QACA3D,MAAM,uCAAuCY;QAC7C,MAAMgD,gBAAgBhD,KAAK,KAAK,GAAGA,KAAK,MAAM;QAC9C,MAAMiD,YAAY;QAClB,IAAID,gBAAgBC,WAAW;YAC7B,MAAMC,eAAevD,KAAK,IAAI,CAACsD,YAAYD;YAC3C,MAAMG,WAAWxD,KAAK,KAAK,CAACK,KAAK,KAAK,GAAGkD;YACzC,MAAME,YAAYzD,KAAK,KAAK,CAACK,KAAK,MAAM,GAAGkD;YAC3C9D,MACE,2DACA+D,UACAC;YAEF,MAAMC,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,eAAAA,AAAAA,EAAgBT,aAAa;gBACtD,OAAOM;gBACP,QAAQC;YACV;YACA,OAAOC;QACT;IACF;IACA,OAAOR;AACT"}
@@ -196,9 +196,6 @@ class Insight {
196
196
  const context = await this.contextRetrieverFn('describe');
197
197
  const { screenshotBase64, size } = context;
198
198
  (0, utils_namespaceObject.assert)(screenshotBase64, 'screenshot is required for insight.describe');
199
- const modelPreferences = {
200
- intent: 'grounding'
201
- };
202
199
  const systemPrompt = (0, describe_js_namespaceObject.elementDescriberInstruction)();
203
200
  const defaultRectSize = 30;
204
201
  const targetRect = Array.isArray(target) ? {
@@ -218,6 +215,9 @@ class Insight {
218
215
  borderThickness: 3
219
216
  });
220
217
  if (null == opt ? void 0 : opt.deepThink) {
218
+ const modelPreferences = {
219
+ intent: 'grounding'
220
+ };
221
221
  const searchArea = (0, common_js_namespaceObject.expandSearchArea)(targetRect, context.size, modelPreferences);
222
222
  debug('describe: set searchArea', searchArea);
223
223
  imagePayload = await (0, img_namespaceObject.cropByRect)(imagePayload, searchArea, (0, env_namespaceObject.getIsUseQwenVl)(modelPreferences));
@@ -241,7 +241,7 @@ class Insight {
241
241
  }
242
242
  ];
243
243
  const callAIFn = this.aiVendorFn || index_js_namespaceObject.callToGetJSONObject;
244
- const res = await callAIFn(msgs, common_js_namespaceObject.AIActionType.DESCRIBE_ELEMENT, modelPreferences);
244
+ const res = await callAIFn(msgs, common_js_namespaceObject.AIActionType.DESCRIBE_ELEMENT);
245
245
  const { content } = res;
246
246
  (0, utils_namespaceObject.assert)(!content.error, `describe failed: ${content.error}`);
247
247
  (0, utils_namespaceObject.assert)(content.description, 'failed to describe the element');
@@ -1 +1 @@
1
- {"version":3,"file":"insight/index.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import {\n AIActionType,\n type AIArgs,\n callAiFn,\n expandSearchArea,\n} from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callToGetJSONObject,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightOptions,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n MIDSCENE_FORCE_DEEP_THINK,\n getAIConfigInBoolean,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n callAI?: typeof callAiFn<AIElementResponse>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: (...args: Array<any>) => Promise<any> = callAiFn;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt?: LocateOpts,\n ): Promise<LocateResult> {\n const { callAI } = opt || {};\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = getAIConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAI: callAI || this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const modelPreferences: IModelPreferences = {\n intent: 'VQA',\n };\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelPreferences,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const modelPreferences: IModelPreferences = { intent: 'grounding' };\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n const searchArea = expandSearchArea(\n targetRect,\n context.size,\n modelPreferences,\n );\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n getIsUseQwenVl(modelPreferences),\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn =\n this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;\n\n const res = await callAIFn(\n msgs,\n AIActionType.DESCRIBE_ELEMENT,\n modelPreferences,\n );\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","Insight","query","opt","_parseResult_errors","callAI","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","getAIConfigInBoolean","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","modelPreferences","vlLocateMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","getIsUseQwenVl","msgs","callAIFn","callToGetJSONObject","res","AIActionType","content","callAiFn","Promise"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;;;;;;;;AC8CA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACR,MAAMC;IAmCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAgB,EACO;YAkFnBC;QAjFJ,MAAM,EAAEC,MAAM,EAAE,GAAGF,OAAO,CAAC;QAC3B,MAAMG,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA,EAC5BC,oBAAAA,yBAAyBA;QAE3B,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAEjC,MAAMY,mBAAsC;YAC1C,QAAQ;QACV;QAEA,IAAID,oBAAoB,CAACE,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaD,mBAAmB;YACvDE,QAAQ,IAAI,CACV;YAEFH,mBAAmBJ;QACrB;QAEA,MAAMQ,UAAUd,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIe;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,AAAAA,IAAAA,2BAAAA,eAAAA,AAAAA,EAAgB;gBACzCL;gBACA,oBAAoBJ;YACtB;YACAN,IAAAA,sBAAAA,MAAAA,AAAAA,EACEc,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,AAAAA,IAAAA,yBAAAA,eAAAA,AAAAA,EAAgB;YACxB,QAAQ1B,UAAU,IAAI,CAAC,UAAU;YACjCY;YACA,0BAA0BX;YAC1B,cAAce;QAChB;QAEA,MAAMW,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS9B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAaoB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,IAAAA,kCAAAA,eAAAA,AAAAA,EACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA7B;QAGF,IAAI2B,UACF,MAAM,IAAIM,MAAMN;QAGlB5B,IAAAA,sBAAAA,MAAAA,AAAAA,EACE8B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BvC,GAA0B,EAC1BwC,gBAAoC,EAKnC;YA+BGvC;QA9BJG,IAAAA,sBAAAA,MAAAA,AAAAA,EACE,AAAsB,YAAtB,OAAOmC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMlC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMK,mBAAsC;YAC1C,QAAQ;QACV;QAEA,MAAMG,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,AAAAA,IAAAA,yBAAAA,oBAAAA,AAAAA,EAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAexC;YACfW;QACF;QAEA,MAAMkB,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,IAAAA,kCAAAA,eAAAA,AAAAA,EACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACArC;QAGF,IAAI2B,YAAY,CAACU,QAAQ,CAAC1C,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIsC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B5C,GAEC,EACwD;QACzDI,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOwC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCV,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOyC,kBAAkB;QAEzB,MAAMlC,mBAAsC;YAAE,QAAQ;QAAY;QAClE,MAAMoC,eAAeC,AAAAA,IAAAA,4BAAAA,2BAAAA,AAAAA;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,uBAAAA,AAAAA,EAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIlD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAClB,MAAMe,aAAawC,AAAAA,IAAAA,0BAAAA,gBAAAA,AAAAA,EACjBL,YACApC,QAAQ,IAAI,EACZH;YAEFf,MAAM,4BAA4BmB;YAClCsC,eAAe,MAAMG,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EACnBH,cACAtC,YACA0C,AAAAA,IAAAA,oBAAAA,cAAAA,AAAAA,EAAe9C;QAEnB;QAEA,MAAM+C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WACJ,IAAI,CAAC,UAAU,IAAIC,yBAAAA,mBAAmBA;QAExC,MAAMC,MAAM,MAAMF,SAChBD,MACAI,0BAAAA,YAAAA,CAAAA,gBAA6B,EAC7BnD;QAGF,MAAM,EAAEoD,OAAO,EAAE,GAAGF;QACpBzD,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,CAAC2D,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1D3D,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO2D,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IA1UA,YACEjD,OAEmE,EACnEd,GAAoB,CACpB;QAfF;QAIA,qCAAoDgE,0BAAAA,QAAQA;QAE5D;QAEA;QAQE5D,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOU,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMmD,QAAQ,OAAO,CAACnD;QAGlD,IAAI,AAA2B,WAApBd,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAwTF"}
1
+ {"version":3,"file":"insight/index.js","sources":["webpack://@midscene/core/webpack/runtime/define_property_getters","webpack://@midscene/core/webpack/runtime/has_own_property","webpack://@midscene/core/webpack/runtime/make_namespace_object","webpack://@midscene/core/./src/insight/index.ts"],"sourcesContent":["__webpack_require__.d = (exports, definition) => {\n\tfor(var key in definition) {\n if(__webpack_require__.o(definition, key) && !__webpack_require__.o(exports, key)) {\n Object.defineProperty(exports, key, { enumerable: true, get: definition[key] });\n }\n }\n};","__webpack_require__.o = (obj, prop) => (Object.prototype.hasOwnProperty.call(obj, prop))","// define __esModule on exports\n__webpack_require__.r = (exports) => {\n\tif(typeof Symbol !== 'undefined' && Symbol.toStringTag) {\n\t\tObject.defineProperty(exports, Symbol.toStringTag, { value: 'Module' });\n\t}\n\tObject.defineProperty(exports, '__esModule', { value: true });\n};","import {\n AIActionType,\n type AIArgs,\n callAiFn,\n expandSearchArea,\n} from '@/ai-model/common';\nimport {\n AiExtractElementInfo,\n AiLocateElement,\n callToGetJSONObject,\n} from '@/ai-model/index';\nimport { AiLocateSection } from '@/ai-model/inspect';\nimport { elementDescriberInstruction } from '@/ai-model/prompt/describe';\nimport type {\n AIDescribeElementResponse,\n AIElementResponse,\n AIUsageInfo,\n BaseElement,\n DetailedLocateParam,\n DumpSubscriber,\n InsightAction,\n InsightExtractOption,\n InsightExtractParam,\n InsightOptions,\n InsightTaskInfo,\n LocateResult,\n PartialInsightDumpFromSDK,\n Rect,\n UIContext,\n} from '@/types';\nimport {\n type IModelPreferences,\n MIDSCENE_FORCE_DEEP_THINK,\n getAIConfigInBoolean,\n getIsUseQwenVl,\n vlLocateMode,\n} from '@midscene/shared/env';\nimport { compositeElementInfoImg, cropByRect } from '@midscene/shared/img';\nimport { getDebug } from '@midscene/shared/logger';\nimport { assert } from '@midscene/shared/utils';\nimport type { TMultimodalPrompt } from '../ai-model/common';\nimport { emitInsightDump } from './utils';\n\nexport interface LocateOpts {\n context?: UIContext<BaseElement>;\n callAI?: typeof callAiFn<AIElementResponse>;\n}\n\nexport type AnyValue<T> = {\n [K in keyof T]: unknown extends T[K] ? any : T[K];\n};\n\nconst debug = getDebug('ai:insight');\nexport default class Insight<\n ElementType extends BaseElement = BaseElement,\n ContextType extends UIContext<ElementType> = UIContext<ElementType>,\n> {\n contextRetrieverFn: (\n action: InsightAction,\n ) => Promise<ContextType> | ContextType;\n\n aiVendorFn: (...args: Array<any>) => Promise<any> = callAiFn;\n\n onceDumpUpdatedFn?: DumpSubscriber;\n\n taskInfo?: Omit<InsightTaskInfo, 'durationMs'>;\n\n constructor(\n context:\n | ContextType\n | ((action: InsightAction) => Promise<ContextType> | ContextType),\n opt?: InsightOptions,\n ) {\n assert(context, 'context is required for Insight');\n if (typeof context === 'function') {\n this.contextRetrieverFn = context;\n } else {\n this.contextRetrieverFn = () => Promise.resolve(context);\n }\n\n if (typeof opt?.aiVendorFn !== 'undefined') {\n this.aiVendorFn = opt.aiVendorFn;\n }\n if (typeof opt?.taskInfo !== 'undefined') {\n this.taskInfo = opt.taskInfo;\n }\n }\n\n async locate(\n query: DetailedLocateParam,\n opt?: LocateOpts,\n ): Promise<LocateResult> {\n const { callAI } = opt || {};\n const queryPrompt = typeof query === 'string' ? query : query.prompt;\n assert(queryPrompt, 'query is required for locate');\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n assert(typeof query === 'object', 'query should be an object for locate');\n\n const globalDeepThinkSwitch = getAIConfigInBoolean(\n MIDSCENE_FORCE_DEEP_THINK,\n );\n if (globalDeepThinkSwitch) {\n debug('globalDeepThinkSwitch', globalDeepThinkSwitch);\n }\n let searchAreaPrompt;\n if (query.deepThink || globalDeepThinkSwitch) {\n searchAreaPrompt = query.prompt;\n }\n const modelPreferences: IModelPreferences = {\n intent: 'grounding',\n };\n\n if (searchAreaPrompt && !vlLocateMode(modelPreferences)) {\n console.warn(\n 'The \"deepThink\" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model',\n );\n searchAreaPrompt = undefined;\n }\n\n const context = opt?.context || (await this.contextRetrieverFn('locate'));\n\n let searchArea: Rect | undefined = undefined;\n let searchAreaRawResponse: string | undefined = undefined;\n let searchAreaUsage: AIUsageInfo | undefined = undefined;\n let searchAreaResponse:\n | Awaited<ReturnType<typeof AiLocateSection>>\n | undefined = undefined;\n if (searchAreaPrompt) {\n searchAreaResponse = await AiLocateSection({\n context,\n sectionDescription: searchAreaPrompt,\n });\n assert(\n searchAreaResponse.rect,\n `cannot find search area for \"${searchAreaPrompt}\"${\n searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''\n }`,\n );\n searchAreaRawResponse = searchAreaResponse.rawResponse;\n searchAreaUsage = searchAreaResponse.usage;\n searchArea = searchAreaResponse.rect;\n }\n\n const startTime = Date.now();\n const {\n parseResult,\n rect,\n elementById,\n rawResponse,\n usage,\n isOrderSensitive,\n } = await AiLocateElement({\n callAI: callAI || this.aiVendorFn,\n context,\n targetElementDescription: queryPrompt,\n searchConfig: searchAreaResponse,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(rawResponse),\n formatResponse: JSON.stringify(parseResult),\n usage,\n searchArea,\n searchAreaRawResponse,\n searchAreaUsage,\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI model failed to locate: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'locate',\n userQuery: {\n element: queryPrompt,\n },\n matchedElement: [],\n matchedRect: rect,\n data: null,\n taskInfo,\n deepThink: !!searchArea,\n error: errorLog,\n };\n\n const elements: BaseElement[] = [];\n (parseResult.elements || []).forEach((item) => {\n if ('id' in item) {\n const element = elementById(item?.id);\n\n if (!element) {\n console.warn(\n `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`,\n );\n return;\n }\n elements.push(element);\n }\n });\n\n emitInsightDump(\n {\n ...dumpData,\n matchedElement: elements,\n },\n dumpSubscriber,\n );\n\n if (errorLog) {\n throw new Error(errorLog);\n }\n\n assert(\n elements.length <= 1,\n `locate: multiple elements found, length = ${elements.length}`,\n );\n\n if (elements.length === 1) {\n return {\n element: {\n id: elements[0]!.id,\n indexId: elements[0]!.indexId,\n center: elements[0]!.center,\n rect: elements[0]!.rect,\n xpaths: elements[0]!.xpaths || [],\n attributes: elements[0]!.attributes,\n isOrderSensitive,\n },\n rect,\n };\n }\n return {\n element: null,\n rect,\n };\n }\n\n async extract<T>(\n dataDemand: InsightExtractParam,\n opt?: InsightExtractOption,\n multimodalPrompt?: TMultimodalPrompt,\n ): Promise<{\n data: T;\n thought?: string;\n usage?: AIUsageInfo;\n }> {\n assert(\n typeof dataDemand === 'object' || typeof dataDemand === 'string',\n `dataDemand should be object or string, but get ${typeof dataDemand}`,\n );\n const dumpSubscriber = this.onceDumpUpdatedFn;\n this.onceDumpUpdatedFn = undefined;\n\n const modelPreferences: IModelPreferences = {\n intent: 'VQA',\n };\n\n const context = await this.contextRetrieverFn('extract');\n\n const startTime = Date.now();\n const { parseResult, usage } = await AiExtractElementInfo<T>({\n context,\n dataQuery: dataDemand,\n multimodalPrompt,\n extractOption: opt,\n modelPreferences,\n });\n\n const timeCost = Date.now() - startTime;\n const taskInfo: InsightTaskInfo = {\n ...(this.taskInfo ? this.taskInfo : {}),\n durationMs: timeCost,\n rawResponse: JSON.stringify(parseResult),\n };\n\n let errorLog: string | undefined;\n if (parseResult.errors?.length) {\n errorLog = `AI response error: \\n${parseResult.errors.join('\\n')}`;\n }\n\n const dumpData: PartialInsightDumpFromSDK = {\n type: 'extract',\n userQuery: {\n dataDemand,\n },\n matchedElement: [],\n data: null,\n taskInfo,\n error: errorLog,\n };\n\n const { data, thought } = parseResult || {};\n\n // 4\n emitInsightDump(\n {\n ...dumpData,\n data,\n },\n dumpSubscriber,\n );\n\n if (errorLog && !data && !opt?.doNotThrowError) {\n throw new Error(errorLog);\n }\n\n return {\n data,\n thought,\n usage,\n };\n }\n\n async describe(\n target: Rect | [number, number],\n opt?: {\n deepThink?: boolean;\n },\n ): Promise<Pick<AIDescribeElementResponse, 'description'>> {\n assert(target, 'target is required for insight.describe');\n const context = await this.contextRetrieverFn('describe');\n const { screenshotBase64, size } = context;\n assert(screenshotBase64, 'screenshot is required for insight.describe');\n\n const systemPrompt = elementDescriberInstruction();\n\n // Convert [x,y] center point to Rect if needed\n const defaultRectSize = 30;\n const targetRect: Rect = Array.isArray(target)\n ? {\n left: Math.floor(target[0] - defaultRectSize / 2),\n top: Math.floor(target[1] - defaultRectSize / 2),\n width: defaultRectSize,\n height: defaultRectSize,\n }\n : target;\n\n let imagePayload = await compositeElementInfoImg({\n inputImgBase64: screenshotBase64,\n size,\n elementsPositionInfo: [\n {\n rect: targetRect,\n },\n ],\n borderThickness: 3,\n });\n\n if (opt?.deepThink) {\n // The result of the \"describe\" function will be used for positioning, so essentially it is a form of grounding.\n const modelPreferences: IModelPreferences = { intent: 'grounding' };\n const searchArea = expandSearchArea(\n targetRect,\n context.size,\n modelPreferences,\n );\n debug('describe: set searchArea', searchArea);\n imagePayload = await cropByRect(\n imagePayload,\n searchArea,\n getIsUseQwenVl(modelPreferences),\n );\n }\n\n const msgs: AIArgs = [\n { role: 'system', content: systemPrompt },\n {\n role: 'user',\n content: [\n {\n type: 'image_url',\n image_url: {\n url: imagePayload,\n detail: 'high',\n },\n },\n ],\n },\n ];\n\n const callAIFn =\n this.aiVendorFn || callToGetJSONObject<AIDescribeElementResponse>;\n\n const res = await callAIFn(msgs, AIActionType.DESCRIBE_ELEMENT);\n\n const { content } = res;\n assert(!content.error, `describe failed: ${content.error}`);\n assert(content.description, 'failed to describe the element');\n return content;\n }\n}\n"],"names":["__webpack_require__","definition","key","Object","obj","prop","Symbol","debug","getDebug","Insight","query","opt","_parseResult_errors","callAI","queryPrompt","assert","dumpSubscriber","undefined","globalDeepThinkSwitch","getAIConfigInBoolean","MIDSCENE_FORCE_DEEP_THINK","searchAreaPrompt","modelPreferences","vlLocateMode","console","context","searchArea","searchAreaRawResponse","searchAreaUsage","searchAreaResponse","AiLocateSection","startTime","Date","parseResult","rect","elementById","rawResponse","usage","isOrderSensitive","AiLocateElement","timeCost","taskInfo","JSON","errorLog","dumpData","elements","item","element","emitInsightDump","Error","dataDemand","multimodalPrompt","AiExtractElementInfo","data","thought","target","screenshotBase64","size","systemPrompt","elementDescriberInstruction","defaultRectSize","targetRect","Array","Math","imagePayload","compositeElementInfoImg","expandSearchArea","cropByRect","getIsUseQwenVl","msgs","callAIFn","callToGetJSONObject","res","AIActionType","content","callAiFn","Promise"],"mappings":";;;IAAAA,oBAAoB,CAAC,GAAG,CAAC,UAASC;QACjC,IAAI,IAAIC,OAAOD,WACR,IAAGD,oBAAoB,CAAC,CAACC,YAAYC,QAAQ,CAACF,oBAAoB,CAAC,CAAC,UAASE,MACzEC,OAAO,cAAc,CAAC,UAASD,KAAK;YAAE,YAAY;YAAM,KAAKD,UAAU,CAACC,IAAI;QAAC;IAGzF;;;ICNAF,oBAAoB,CAAC,GAAG,CAACI,KAAKC,OAAUF,OAAO,SAAS,CAAC,cAAc,CAAC,IAAI,CAACC,KAAKC;;;ICClFL,oBAAoB,CAAC,GAAG,CAAC;QACxB,IAAG,AAAkB,eAAlB,OAAOM,UAA0BA,OAAO,WAAW,EACrDH,OAAO,cAAc,CAAC,UAASG,OAAO,WAAW,EAAE;YAAE,OAAO;QAAS;QAEtEH,OAAO,cAAc,CAAC,UAAS,cAAc;YAAE,OAAO;QAAK;IAC5D;;;;;;;;;;;;;;;;;;;;;;;;;;AC8CA,MAAMI,QAAQC,AAAAA,IAAAA,uBAAAA,QAAAA,AAAAA,EAAS;AACR,MAAMC;IAmCnB,MAAM,OACJC,KAA0B,EAC1BC,GAAgB,EACO;YAkFnBC;QAjFJ,MAAM,EAAEC,MAAM,EAAE,GAAGF,OAAO,CAAC;QAC3B,MAAMG,cAAc,AAAiB,YAAjB,OAAOJ,QAAqBA,QAAQA,MAAM,MAAM;QACpEK,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOD,aAAa;QACpB,MAAME,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzBF,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,AAAiB,YAAjB,OAAOL,OAAoB;QAElC,MAAMQ,wBAAwBC,AAAAA,IAAAA,oBAAAA,oBAAAA,AAAAA,EAC5BC,oBAAAA,yBAAyBA;QAE3B,IAAIF,uBACFX,MAAM,yBAAyBW;QAEjC,IAAIG;QACJ,IAAIX,MAAM,SAAS,IAAIQ,uBACrBG,mBAAmBX,MAAM,MAAM;QAEjC,MAAMY,mBAAsC;YAC1C,QAAQ;QACV;QAEA,IAAID,oBAAoB,CAACE,AAAAA,IAAAA,oBAAAA,YAAAA,AAAAA,EAAaD,mBAAmB;YACvDE,QAAQ,IAAI,CACV;YAEFH,mBAAmBJ;QACrB;QAEA,MAAMQ,UAAUd,AAAAA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,OAAO,AAAD,KAAM,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE/D,IAAIe;QACJ,IAAIC;QACJ,IAAIC;QACJ,IAAIC;QAGJ,IAAIR,kBAAkB;YACpBQ,qBAAqB,MAAMC,AAAAA,IAAAA,2BAAAA,eAAAA,AAAAA,EAAgB;gBACzCL;gBACA,oBAAoBJ;YACtB;YACAN,IAAAA,sBAAAA,MAAAA,AAAAA,EACEc,mBAAmB,IAAI,EACvB,CAAC,6BAA6B,EAAER,iBAAiB,CAAC,EAChDQ,mBAAmB,KAAK,GAAG,CAAC,EAAE,EAAEA,mBAAmB,KAAK,EAAE,GAAG,IAC7D;YAEJF,wBAAwBE,mBAAmB,WAAW;YACtDD,kBAAkBC,mBAAmB,KAAK;YAC1CH,aAAaG,mBAAmB,IAAI;QACtC;QAEA,MAAME,YAAYC,KAAK,GAAG;QAC1B,MAAM,EACJC,WAAW,EACXC,IAAI,EACJC,WAAW,EACXC,WAAW,EACXC,KAAK,EACLC,gBAAgB,EACjB,GAAG,MAAMC,AAAAA,IAAAA,yBAAAA,eAAAA,AAAAA,EAAgB;YACxB,QAAQ1B,UAAU,IAAI,CAAC,UAAU;YACjCY;YACA,0BAA0BX;YAC1B,cAAce;QAChB;QAEA,MAAMW,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACN;YAC5B,gBAAgBM,KAAK,SAAS,CAACT;YAC/BI;YACAX;YACAC;YACAC;QACF;QAEA,IAAIe;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,6BAA6B,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAG5E,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACT,SAAS9B;YACX;YACA,gBAAgB,EAAE;YAClB,aAAaoB;YACb,MAAM;YACNO;YACA,WAAW,CAAC,CAACf;YACb,OAAOiB;QACT;QAEA,MAAME,WAA0B,EAAE;QACjCZ,CAAAA,YAAY,QAAQ,IAAI,EAAC,EAAG,OAAO,CAAC,CAACa;YACpC,IAAI,QAAQA,MAAM;gBAChB,MAAMC,UAAUZ,YAAYW,QAAAA,OAAAA,KAAAA,IAAAA,KAAM,EAAE;gBAEpC,IAAI,CAACC,SAAS,YACZvB,QAAQ,IAAI,CACV,CAAC,+BAA+B,EAAEsB,KAAK,EAAE,CAAC,0CAA0C,CAAC;gBAIzFD,SAAS,IAAI,CAACE;YAChB;QACF;QAEAC,IAAAA,kCAAAA,eAAAA,AAAAA,EACE;YACE,GAAGJ,QAAQ;YACX,gBAAgBC;QAClB,GACA7B;QAGF,IAAI2B,UACF,MAAM,IAAIM,MAAMN;QAGlB5B,IAAAA,sBAAAA,MAAAA,AAAAA,EACE8B,SAAS,MAAM,IAAI,GACnB,CAAC,0CAA0C,EAAEA,SAAS,MAAM,EAAE;QAGhE,IAAIA,AAAoB,MAApBA,SAAS,MAAM,EACjB,OAAO;YACL,SAAS;gBACP,IAAIA,QAAQ,CAAC,EAAE,CAAE,EAAE;gBACnB,SAASA,QAAQ,CAAC,EAAE,CAAE,OAAO;gBAC7B,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM;gBAC3B,MAAMA,QAAQ,CAAC,EAAE,CAAE,IAAI;gBACvB,QAAQA,QAAQ,CAAC,EAAE,CAAE,MAAM,IAAI,EAAE;gBACjC,YAAYA,QAAQ,CAAC,EAAE,CAAE,UAAU;gBACnCP;YACF;YACAJ;QACF;QAEF,OAAO;YACL,SAAS;YACTA;QACF;IACF;IAEA,MAAM,QACJgB,UAA+B,EAC/BvC,GAA0B,EAC1BwC,gBAAoC,EAKnC;YA+BGvC;QA9BJG,IAAAA,sBAAAA,MAAAA,AAAAA,EACE,AAAsB,YAAtB,OAAOmC,cAA2B,AAAsB,YAAtB,OAAOA,YACzC,CAAC,+CAA+C,EAAE,OAAOA,YAAY;QAEvE,MAAMlC,iBAAiB,IAAI,CAAC,iBAAiB;QAC7C,IAAI,CAAC,iBAAiB,GAAGC;QAEzB,MAAMK,mBAAsC;YAC1C,QAAQ;QACV;QAEA,MAAMG,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAE9C,MAAMM,YAAYC,KAAK,GAAG;QAC1B,MAAM,EAAEC,WAAW,EAAEI,KAAK,EAAE,GAAG,MAAMe,AAAAA,IAAAA,yBAAAA,oBAAAA,AAAAA,EAAwB;YAC3D3B;YACA,WAAWyB;YACXC;YACA,eAAexC;YACfW;QACF;QAEA,MAAMkB,WAAWR,KAAK,GAAG,KAAKD;QAC9B,MAAMU,WAA4B;YAChC,GAAI,IAAI,CAAC,QAAQ,GAAG,IAAI,CAAC,QAAQ,GAAG,CAAC,CAAC;YACtC,YAAYD;YACZ,aAAaE,KAAK,SAAS,CAACT;QAC9B;QAEA,IAAIU;QACJ,IAAI,QAAA/B,CAAAA,sBAAAA,YAAY,MAAM,AAAD,IAAjBA,KAAAA,IAAAA,oBAAoB,MAAM,EAC5B+B,WAAW,CAAC,qBAAqB,EAAEV,YAAY,MAAM,CAAC,IAAI,CAAC,OAAO;QAGpE,MAAMW,WAAsC;YAC1C,MAAM;YACN,WAAW;gBACTM;YACF;YACA,gBAAgB,EAAE;YAClB,MAAM;YACNT;YACA,OAAOE;QACT;QAEA,MAAM,EAAEU,IAAI,EAAEC,OAAO,EAAE,GAAGrB,eAAe,CAAC;QAG1Ce,IAAAA,kCAAAA,eAAAA,AAAAA,EACE;YACE,GAAGJ,QAAQ;YACXS;QACF,GACArC;QAGF,IAAI2B,YAAY,CAACU,QAAQ,CAAC1C,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,eAAe,AAAD,GAC3C,MAAM,IAAIsC,MAAMN;QAGlB,OAAO;YACLU;YACAC;YACAjB;QACF;IACF;IAEA,MAAM,SACJkB,MAA+B,EAC/B5C,GAEC,EACwD;QACzDI,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOwC,QAAQ;QACf,MAAM9B,UAAU,MAAM,IAAI,CAAC,kBAAkB,CAAC;QAC9C,MAAM,EAAE+B,gBAAgB,EAAEC,IAAI,EAAE,GAAGhC;QACnCV,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOyC,kBAAkB;QAEzB,MAAME,eAAeC,AAAAA,IAAAA,4BAAAA,2BAAAA,AAAAA;QAGrB,MAAMC,kBAAkB;QACxB,MAAMC,aAAmBC,MAAM,OAAO,CAACP,UACnC;YACE,MAAMQ,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC/C,KAAKG,KAAK,KAAK,CAACR,MAAM,CAAC,EAAE,GAAGK,kBAAkB;YAC9C,OAAOA;YACP,QAAQA;QACV,IACAL;QAEJ,IAAIS,eAAe,MAAMC,AAAAA,IAAAA,oBAAAA,uBAAAA,AAAAA,EAAwB;YAC/C,gBAAgBT;YAChBC;YACA,sBAAsB;gBACpB;oBACE,MAAMI;gBACR;aACD;YACD,iBAAiB;QACnB;QAEA,IAAIlD,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,SAAS,EAAE;YAElB,MAAMW,mBAAsC;gBAAE,QAAQ;YAAY;YAClE,MAAMI,aAAawC,AAAAA,IAAAA,0BAAAA,gBAAAA,AAAAA,EACjBL,YACApC,QAAQ,IAAI,EACZH;YAEFf,MAAM,4BAA4BmB;YAClCsC,eAAe,MAAMG,AAAAA,IAAAA,oBAAAA,UAAAA,AAAAA,EACnBH,cACAtC,YACA0C,AAAAA,IAAAA,oBAAAA,cAAAA,AAAAA,EAAe9C;QAEnB;QAEA,MAAM+C,OAAe;YACnB;gBAAE,MAAM;gBAAU,SAASX;YAAa;YACxC;gBACE,MAAM;gBACN,SAAS;oBACP;wBACE,MAAM;wBACN,WAAW;4BACT,KAAKM;4BACL,QAAQ;wBACV;oBACF;iBACD;YACH;SACD;QAED,MAAMM,WACJ,IAAI,CAAC,UAAU,IAAIC,yBAAAA,mBAAmBA;QAExC,MAAMC,MAAM,MAAMF,SAASD,MAAMI,0BAAAA,YAAAA,CAAAA,gBAA6B;QAE9D,MAAM,EAAEC,OAAO,EAAE,GAAGF;QACpBzD,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO,CAAC2D,QAAQ,KAAK,EAAE,CAAC,iBAAiB,EAAEA,QAAQ,KAAK,EAAE;QAC1D3D,IAAAA,sBAAAA,MAAAA,AAAAA,EAAO2D,QAAQ,WAAW,EAAE;QAC5B,OAAOA;IACT;IAvUA,YACEjD,OAEmE,EACnEd,GAAoB,CACpB;QAfF;QAIA,qCAAoDgE,0BAAAA,QAAQA;QAE5D;QAEA;QAQE5D,IAAAA,sBAAAA,MAAAA,AAAAA,EAAOU,SAAS;QAChB,IAAI,AAAmB,cAAnB,OAAOA,SACT,IAAI,CAAC,kBAAkB,GAAGA;aAE1B,IAAI,CAAC,kBAAkB,GAAG,IAAMmD,QAAQ,OAAO,CAACnD;QAGlD,IAAI,AAA2B,WAApBd,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,UAAU,AAAD,GACvB,IAAI,CAAC,UAAU,GAAGA,IAAI,UAAU;QAElC,IAAI,AAAyB,WAAlBA,CAAAA,QAAAA,MAAAA,KAAAA,IAAAA,IAAK,QAAQ,AAAD,GACrB,IAAI,CAAC,QAAQ,GAAGA,IAAI,QAAQ;IAEhC;AAqTF"}