@midscene/core 0.26.2 → 0.26.3-beta-20250813021342.0
This diff shows the content of publicly available package versions as released to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as published.
- package/dist/es/ai-model.mjs +2502 -0
- package/dist/es/ai-model.mjs.map +1 -0
- package/dist/es/index.mjs +2362 -0
- package/dist/es/index.mjs.map +1 -0
- package/dist/es/tree.mjs +2 -0
- package/dist/es/utils.mjs +188 -0
- package/dist/es/{chunk-O3KUKF2A.js.map → utils.mjs.map} +1 -1
- package/dist/lib/ai-model.js +2581 -3
- package/dist/lib/ai-model.js.map +1 -0
- package/dist/lib/index.js +2375 -493
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/tree.js +42 -11
- package/dist/lib/tree.js.map +1 -1
- package/dist/lib/utils.js +257 -29
- package/dist/lib/utils.js.map +1 -0
- package/dist/types/ai-model.d.ts +505 -99
- package/dist/types/index.d.ts +1299 -53
- package/dist/types/tree.d.ts +11 -1
- package/dist/types/utils.d.ts +47 -33
- package/package.json +28 -12
- package/dist/es/ai-model.d.ts +0 -99
- package/dist/es/ai-model.js +0 -44
- package/dist/es/chunk-DDYIQHOA.js +0 -2883
- package/dist/es/chunk-DDYIQHOA.js.map +0 -1
- package/dist/es/chunk-O3KUKF2A.js +0 -265
- package/dist/es/index.d.ts +0 -53
- package/dist/es/index.js +0 -570
- package/dist/es/index.js.map +0 -1
- package/dist/es/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/es/tree.d.ts +0 -1
- package/dist/es/tree.js +0 -13
- package/dist/es/tree.js.map +0 -1
- package/dist/es/types-8a6be57c.d.ts +0 -577
- package/dist/es/utils.d.ts +0 -33
- package/dist/es/utils.js +0 -30
- package/dist/lib/ai-model.d.ts +0 -99
- package/dist/lib/chunk-DDYIQHOA.js +0 -2883
- package/dist/lib/chunk-DDYIQHOA.js.map +0 -1
- package/dist/lib/chunk-O3KUKF2A.js +0 -265
- package/dist/lib/chunk-O3KUKF2A.js.map +0 -1
- package/dist/lib/index.d.ts +0 -53
- package/dist/lib/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/lib/tree.d.ts +0 -1
- package/dist/lib/types-8a6be57c.d.ts +0 -577
- package/dist/lib/utils.d.ts +0 -33
- package/dist/types/llm-planning-4e0c16fe.d.ts +0 -106
- package/dist/types/types-8a6be57c.d.ts +0 -577
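
The new `dist/lib/index.js` below is a webpack-bundled CommonJS build that replaces the previous chunked output and re-exports the public API (`Executor`, `Insight`, `AiAssert`, `AiLocateElement`, `describeUserPage`, `plan`, `getVersion`, `getAIConfig`). As a quick orientation, here is a minimal sketch of driving the `Executor` class visible in this bundle; the import path and task shape are assumptions inferred from the bundled code, not from package documentation:

```js
// Minimal sketch, assuming `Executor` is re-exported from the package root
// as the bundle's export map suggests.
const { Executor } = require('@midscene/core');

(async () => {
  const executor = new Executor('demo', {
    // onTaskStart runs before each task; errors thrown here are logged, not fatal
    onTaskStart: async (task) => console.log('starting task:', task.type),
  });

  // Task shape inferred from the bundle: `type` must be one of
  // 'Insight' | 'Action' | 'Planning', and an `executor` function is required.
  await executor.append({
    type: 'Action',
    executor: async (param, context) => ({ output: 'done' }),
  });

  // flush() runs pending tasks in order and resolves to the last task's
  // { thought, output }; a failed task moves the executor into the 'error' state.
  const result = await executor.flush();
  console.log(result?.output, executor.isInErrorState());
})();
```

Note that, per the bundled code, `append()` refuses new tasks once the executor is in the 'error' state, and `flush()` asserts that it is not already running or completed.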
package/dist/lib/index.js
CHANGED
@@ -1,197 +1,2126 @@
-"use strict";
+"use strict";
+var __webpack_modules__ = {
+"langsmith/wrappers": function(module) {
+module.exports = import("langsmith/wrappers").then(function(module) {
+return module;
+});
+}
+};
+var __webpack_module_cache__ = {};
+function __webpack_require__(moduleId) {
+var cachedModule = __webpack_module_cache__[moduleId];
+if (void 0 !== cachedModule) return cachedModule.exports;
+var module = __webpack_module_cache__[moduleId] = {
+exports: {}
+};
+__webpack_modules__[moduleId](module, module.exports, __webpack_require__);
+return module.exports;
+}
+(()=>{
+__webpack_require__.n = (module)=>{
+var getter = module && module.__esModule ? ()=>module['default'] : ()=>module;
+__webpack_require__.d(getter, {
+a: getter
+});
+return getter;
+};
+})();
+(()=>{
+__webpack_require__.d = (exports1, definition)=>{
+for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
+enumerable: true,
+get: definition[key]
+});
+};
+})();
+(()=>{
+__webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
+})();
+(()=>{
+__webpack_require__.r = (exports1)=>{
+if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
+value: 'Module'
+});
+Object.defineProperty(exports1, '__esModule', {
+value: true
+});
+};
+})();
+var __webpack_exports__ = {};
+(()=>{
+__webpack_require__.r(__webpack_exports__);
+__webpack_require__.d(__webpack_exports__, {
+AiAssert: ()=>AiAssert,
+MIDSCENE_MODEL_NAME: ()=>env_namespaceObject.MIDSCENE_MODEL_NAME,
+default: ()=>src,
+getVersion: ()=>getVersion,
+plan: ()=>llm_planning_plan,
+getAIConfig: ()=>env_namespaceObject.getAIConfig,
+AiLocateElement: ()=>AiLocateElement,
+Insight: ()=>Insight,
+describeUserPage: ()=>describeUserPage,
+Executor: ()=>Executor
+});
+require("node:child_process");
+require("node:fs");
+require("node:os");
+require("node:path");
+require("@midscene/shared/common");
+const env_namespaceObject = require("@midscene/shared/env");
+require("@midscene/shared/node");
+const utils_namespaceObject = require("@midscene/shared/utils");
+new Map();
+function getVersion() {
+return "0.26.3-beta-20250813021342.0";
+}
+function _define_property(obj, key, value) {
+if (key in obj) Object.defineProperty(obj, key, {
+value: value,
+enumerable: true,
+configurable: true,
+writable: true
+});
+else obj[key] = value;
+return obj;
+}
+class Executor {
+markTaskAsPending(task) {
+return {
+status: 'pending',
+...task
+};
+}
+async append(task) {
+var _this_latestErrorTask, _this_latestErrorTask1;
+(0, utils_namespaceObject.assert)('error' !== this.status, `executor is in error state, cannot append task\nerror=${null == (_this_latestErrorTask = this.latestErrorTask()) ? void 0 : _this_latestErrorTask.error}\n${null == (_this_latestErrorTask1 = this.latestErrorTask()) ? void 0 : _this_latestErrorTask1.errorStack}`);
+if (Array.isArray(task)) this.tasks.push(...task.map((item)=>this.markTaskAsPending(item)));
+else this.tasks.push(this.markTaskAsPending(task));
+if ('running' !== this.status) this.status = 'pending';
+}
+async flush() {
+if ('init' === this.status && this.tasks.length > 0) console.warn('illegal state for executor, status is init but tasks are not empty');
+(0, utils_namespaceObject.assert)('running' !== this.status, 'executor is already running');
+(0, utils_namespaceObject.assert)('completed' !== this.status, 'executor is already completed');
+(0, utils_namespaceObject.assert)('error' !== this.status, 'executor is in error state');
+const nextPendingIndex = this.tasks.findIndex((task)=>'pending' === task.status);
+if (nextPendingIndex < 0) return;
+this.status = 'running';
+let taskIndex = nextPendingIndex;
+let successfullyCompleted = true;
+let previousFindOutput;
+while(taskIndex < this.tasks.length){
+const task = this.tasks[taskIndex];
+(0, utils_namespaceObject.assert)('pending' === task.status, `task status should be pending, but got: ${task.status}`);
+task.timing = {
+start: Date.now()
+};
+try {
+task.status = 'running';
+try {
+if (this.onTaskStart) await this.onTaskStart(task);
+} catch (e) {
+console.error('error in onTaskStart', e);
+}
+(0, utils_namespaceObject.assert)([
+'Insight',
+'Action',
+'Planning'
+].indexOf(task.type) >= 0, `unsupported task type: ${task.type}`);
+const { executor, param } = task;
+(0, utils_namespaceObject.assert)(executor, `executor is required for task type: ${task.type}`);
+let returnValue;
+const executorContext = {
+task,
+element: null == previousFindOutput ? void 0 : previousFindOutput.element
+};
+if ('Insight' === task.type) {
+(0, utils_namespaceObject.assert)('Locate' === task.subType || 'Query' === task.subType || 'Assert' === task.subType || 'Boolean' === task.subType || 'Number' === task.subType || 'String' === task.subType, `unsupported insight subType: ${task.subType}`);
+returnValue = await task.executor(param, executorContext);
+if ('Locate' === task.subType) previousFindOutput = null == returnValue ? void 0 : returnValue.output;
+} else if ('Action' === task.type || 'Planning' === task.type) returnValue = await task.executor(param, executorContext);
+else {
+console.warn(`unsupported task type: ${task.type}, will try to execute it directly`);
+returnValue = await task.executor(param, executorContext);
+}
+Object.assign(task, returnValue);
+task.status = 'finished';
+task.timing.end = Date.now();
+task.timing.cost = task.timing.end - task.timing.start;
+taskIndex++;
+} catch (e) {
+successfullyCompleted = false;
+task.error = e;
+task.errorMessage = (null == e ? void 0 : e.message) || ('string' == typeof e ? e : 'error-without-message');
+task.errorStack = e.stack;
+task.status = 'failed';
+task.timing.end = Date.now();
+task.timing.cost = task.timing.end - task.timing.start;
+break;
+}
+}
+for(let i = taskIndex + 1; i < this.tasks.length; i++)this.tasks[i].status = 'cancelled';
+if (successfullyCompleted) this.status = 'completed';
+else this.status = 'error';
+if (this.tasks.length) {
+const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
+const { thought, output } = this.tasks[outputIndex];
+return {
+thought,
+output
+};
+}
+}
+isInErrorState() {
+return 'error' === this.status;
+}
+latestErrorTask() {
+if ('error' !== this.status) return null;
+const errorTaskIndex = this.tasks.findIndex((task)=>'failed' === task.status);
+if (errorTaskIndex >= 0) return this.tasks[errorTaskIndex];
+return null;
+}
+dump() {
+let modelDescription = '';
+if ((0, env_namespaceObject.vlLocateMode)()) {
+const uiTarsModelVer = (0, env_namespaceObject.uiTarsModelVersion)();
+modelDescription = uiTarsModelVer ? `UI-TARS=${uiTarsModelVer}` : `${(0, env_namespaceObject.vlLocateMode)()} mode`;
+}
+const dumpData = {
+sdkVersion: getVersion(),
+model_name: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_MODEL_NAME) || '',
+model_description: modelDescription,
+logTime: Date.now(),
+name: this.name,
+tasks: this.tasks
+};
+return dumpData;
+}
+constructor(name, options){
+_define_property(this, "name", void 0);
+_define_property(this, "tasks", void 0);
+_define_property(this, "status", void 0);
+_define_property(this, "onTaskStart", void 0);
+this.status = (null == options ? void 0 : options.tasks) && options.tasks.length > 0 ? 'pending' : 'init';
+this.name = name;
+this.tasks = ((null == options ? void 0 : options.tasks) || []).map((item)=>this.markTaskAsPending(item));
+this.onTaskStart = null == options ? void 0 : options.onTaskStart;
+}
+}
+var types_AIResponseFormat = /*#__PURE__*/ function(AIResponseFormat) {
+AIResponseFormat["JSON"] = "json_object";
+AIResponseFormat["TEXT"] = "text";
+return AIResponseFormat;
+}({});
+const sdk_namespaceObject = require("@anthropic-ai/sdk");
+const identity_namespaceObject = require("@azure/identity");
+const logger_namespaceObject = require("@midscene/shared/logger");
+const external_https_proxy_agent_namespaceObject = require("https-proxy-agent");
+const external_jsonrepair_namespaceObject = require("jsonrepair");
+const external_openai_namespaceObject = require("openai");
+var external_openai_default = /*#__PURE__*/ __webpack_require__.n(external_openai_namespaceObject);
+const external_socks_proxy_agent_namespaceObject = require("socks-proxy-agent");
+const defaultAssertionPrompt = 'You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.';
+const defaultAssertionResponseJsonFormat = `Return in the following JSON format:
+{
+pass: boolean, // whether the assertion is truthy
+thought: string | null, // string, if the result is falsy, give the reason why it is falsy. Otherwise, put null.
+}`;
+const getUiTarsAssertionResponseJsonFormat = ()=>`## Output Json String Format
+\`\`\`
+"{
+"pass": <<is a boolean value from the enum [true, false], true means the assertion is truthy>>,
+"thought": "<<is a string, give the reason why the assertion is falsy or truthy. Otherwise.>>"
+}"
+\`\`\`

-
+## Rules **MUST** follow
+- Make sure to return **only** the JSON, with **no additional** text or explanations.
+- Use ${(0, env_namespaceObject.getPreferredLanguage)()} in \`thought\` part.
+- You **MUST** strictly follow up the **Output Json String Format**.`;
+function systemPromptToAssert(model) {
+return `${defaultAssertionPrompt}

+${model.isUITars ? getUiTarsAssertionResponseJsonFormat() : defaultAssertionResponseJsonFormat}`;
+}
+const assertSchema = {
+type: 'json_schema',
+json_schema: {
+name: 'assert',
+strict: true,
+schema: {
+type: 'object',
+properties: {
+pass: {
+type: 'boolean',
+description: 'Whether the assertion passed or failed'
+},
+thought: {
+type: [
+'string',
+'null'
+],
+description: 'The thought process behind the assertion'
+}
+},
+required: [
+'pass',
+'thought'
+],
+additionalProperties: false
+}
+}
+};
+const prompts_namespaceObject = require("@langchain/core/prompts");
+function bboxDescription(vlMode) {
+if ('gemini' === vlMode) return '2d bounding box as [ymin, xmin, ymax, xmax]';
+return '2d bounding box as [xmin, ymin, xmax, ymax]';
+}
+function systemPromptToLocateElement(vlMode) {
+if (vlMode) {
+const bboxComment = bboxDescription(vlMode);
+return `
+## Role:
+You are an expert in software testing.

+## Objective:
+- Identify elements in screenshots and text that match the user's description.
+- Give the coordinates of the element that matches the user's description best in the screenshot.
+- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).

+## Output Format:
+\`\`\`json
+{
+"bbox": [number, number, number, number], // ${bboxComment}
+"errors"?: string[],
+"isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
+}
+\`\`\`

+Fields:
+* \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
+* \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
+* \`errors\` is an optional array of error messages (if any)

+Order-sensitive means the description contains phrases like:
+- "the third item in the list"
+- "the last button"
+- "the first input box"
+- "the second row"

+Not order-sensitive means the description is like:
+- "confirm button"
+- "search box"
+- "password input"

+For example, when an element is found and the description is order-sensitive:
+\`\`\`json
+{
+"bbox": [100, 100, 200, 200],
+"isOrderSensitive": true,
+"errors": []
+}
+\`\`\`

+When no element is found and the description is not order-sensitive:
+\`\`\`json
+{
+"bbox": [],
+"isOrderSensitive": false,
+"errors": ["I can see ..., but {some element} is not found"]
+}
+\`\`\`
+`;
+}
+return `
+## Role:
+You are an expert in software page image (2D) and page element text analysis.

+## Objective:
+- Identify elements in screenshots and text that match the user's description.
+- Return JSON data containing the selection reason and element ID.
+- Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).

-
+## Skills:
+- Image analysis and recognition
+- Multilingual text understanding
+- Software UI design and testing

-
+## Workflow:
+1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
+2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
+3. Found the required number of elements
+4. Return JSON data containing the selection reason and element ID.
+5. Judge whether the user's description is order-sensitive (see below for definition and examples).

+## Constraints:
+- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
+- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
+- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
+- If no elements are found, the "elements" array should be empty.
+- The returned data must conform to the specified JSON format.
+- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)

+## Order-Sensitive Definition:
+- If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
+- If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).

+## Output Format:

+Please return the result in JSON format as follows:

-
-
-
-
-
-
-
-(
-
-
-
-
-
-
-
+\`\`\`json
+{
+"elements": [
+// If no matching elements are found, return an empty array []
+{
+"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
+"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
+}
+// More elements...
+],
+"isOrderSensitive": true, // or false, depending on the user's description
+"errors": [] // Array of strings containing any error messages
+}
+\`\`\`
+
+## Example:
+Example 1:
+Input Example:
+\`\`\`json
+// Description: "Shopping cart icon in the upper right corner"
+{
+"description": "PLACEHOLDER", // Description of the target element
+"screenshot": "path/screenshot.png",
+"text": '{
+"pageSize": {
+"width": 400, // Width of the page
+"height": 905 // Height of the page
+},
+"elementInfos": [
+{
+"id": "1231", // ID of the element
+"indexId": "0", // Index of the element，The image is labeled to the left of the element
+"attributes": { // Attributes of the element
+"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
+"src": "https://ap-southeast-3.m",
+"class": ".img"
+},
+"content": "", // Text content of the element
+"rect": {
+"left": 280, // Distance from the left side of the page
+"top": 8, // Distance from the top of the page
+"width": 44, // Width of the element
+"height": 44 // Height of the element
+}
+},
+{
+"id": "66551", // ID of the element
+"indexId": "1", // Index of the element,The image is labeled to the left of the element
+"attributes": { // Attributes of the element
+"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
+"src": "data:image/png;base64,iVBORw0KGgoAAAANSU...",
+"class": ".icon"
+},
+"content": "", // Text content of the element
+"rect": {
+"left": 350, // Distance from the left side of the page
+"top": 16, // Distance from the top of the page
+"width": 25, // Width of the element
+"height": 25 // Height of the element
+}
+},
+...
+{
+"id": "12344",
+"indexId": "2", // Index of the element，The image is labeled to the left of the element
+"attributes": {
+"nodeType": "TEXT Node",
+"class": ".product-name"
+},
+"center": [
+288,
+834
+],
+"content": "Mango Drink",
+"rect": {
+"left": 188,
+"top": 827,
+"width": 199,
+"height": 13
+}
+},
+...
+]
+}
+'
+}
+\`\`\`
+Output Example:
+\`\`\`json
+{
+"elements": [
+{
+// Describe the reason for finding this element, replace with actual value in practice
+"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
+"text": "",
+// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
+"id": "1231"
+}
+],
+"isOrderSensitive": true,
+"errors": []
+}
+\`\`\`
+
+`;
+}
+const locatorSchema = {
+type: 'json_schema',
+json_schema: {
+name: 'find_elements',
+strict: true,
+schema: {
+type: 'object',
+properties: {
+elements: {
+type: 'array',
+items: {
+type: 'object',
+properties: {
+reason: {
+type: 'string',
+description: 'Reason for finding this element'
+},
+text: {
+type: 'string',
+description: 'Text content of the element'
+},
+id: {
+type: 'string',
+description: 'ID of this element'
+}
+},
+required: [
+'reason',
+'text',
+'id'
+],
+additionalProperties: false
+},
+description: 'List of found elements'
+},
+isOrderSensitive: {
+type: 'boolean',
+description: "Whether the targetElementDescription is order-sensitive (true/false)"
+},
+errors: {
+type: 'array',
+items: {
+type: 'string'
+},
+description: 'List of error messages, if any'
+}
+},
+required: [
+'elements',
+'isOrderSensitive',
+'errors'
+],
+additionalProperties: false
+}
+}
 };
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+const findElementPrompt = new prompts_namespaceObject.PromptTemplate({
+template: `
+Here is the item user want to find:
+=====================================
+{targetElementDescription}
+=====================================
+
+{pageDescription}
+`,
+inputVariables: [
+"pageDescription",
+"targetElementDescription"
+]
+});
+const external_node_assert_namespaceObject = require("node:assert");
+var external_node_assert_default = /*#__PURE__*/ __webpack_require__.n(external_node_assert_namespaceObject);
+const vlCoTLog = '"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. ';
+const vlCurrentLog = '"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do .. first". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
+const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
+const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
+"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
+const vlLocateParam = (required)=>`locate${required ? '' : '?'}: {bbox: [number, number, number, number], prompt: string }`;
+const llmLocateParam = (required)=>`locate${required ? '' : '?'}: {"id": string, "prompt": string}`;
+const descriptionForAction = (action, locatorScheme)=>{
+const tab = ' ';
+let locateParam = '';
+if ('required' === action.location) locateParam = locatorScheme;
+else if ('optional' === action.location) locateParam = `${locatorScheme} | null`;
+else if (false === action.location) locateParam = '';
+const locatorParam = locateParam ? `- ${locateParam}` : '';
+if (action.whatToLocate) if (locateParam) locateParam += ` // ${action.whatToLocate}`;
+else console.warn(`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`);
+let paramSchema = '';
+if (action.paramSchema) paramSchema = `- param: ${action.paramSchema}`;
+if (action.paramDescription) {
+external_node_assert_default()(paramSchema, `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`);
+paramSchema += ` // ${action.paramDescription}`;
+}
+const fields = [
+paramSchema,
+locatorParam
+].filter(Boolean);
+return `- ${action.name}, ${action.description}
+${tab}- type: "${action.name}"
+${tab}${fields.join(`\n${tab}`)}
+`.trim();
+};
+const systemTemplateOfVLPlanning = ({ actionSpace, vlMode })=>{
+const actionNameList = actionSpace.map((action)=>action.name).join(', ');
+const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam('required' === action.location)));
+const actionList = actionDescriptionList.join('\n');
+return `
+Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
+
+Restriction:
+- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
+- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
+- Don't repeat actions in the previous logs.
+- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
+
+Supporting actions:
+${actionList}
+
+Field description:
+* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
+
+Return in JSON format:
+{
+${vlCoTLog}
+${vlCurrentLog}
+${commonOutputFields}
+"action":
+{
+// one of the supporting actions
+} | null,
+,
+"sleep"?: number, // The sleep time after the action, in milliseconds.
+}
+
+For example, when the instruction is "click 'Confirm' button, and click 'Yes' in popup" and the log is "I will use action Tap to click 'Confirm' button", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.
+
+this and output the JSON:
+
+{
+"what_the_user_wants_to_do_next_by_instruction": "We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup",
+"log": "I will use action Tap to click 'Yes' in popup",
+"more_actions_needed_by_instruction": false,
+"action": {
+"type": "Tap",
+"locate": {
+"bbox": [100, 100, 200, 200],
+"prompt": "The 'Yes' button in popup"
 }
 }
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+}
+`;
+};
+const systemTemplateOfLLM = ({ actionSpace })=>{
+const actionNameList = actionSpace.map((action)=>action.name).join(' / ');
+const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam('required' === action.location)));
+const actionList = actionDescriptionList.join('\n');
+return `
+## Role
+
+You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
+
+## Objective
+
+- Decompose the instruction user asked into a series of actions
+- Locate the target element if possible
+- If the instruction cannot be accomplished, give a further plan.
+
+## Workflow
+
+1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
+2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
+3. Consider whether the user's instruction will be accomplished after the actions you composed.
+- If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
+- If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
+4. If the task is not feasible on this page, set \`error\` field to the reason.
+
+## Constraints
+
+- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
+- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
+- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
+- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
+
+## About the \`actions\` field
+
+The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
+
+type LocateParam = {
+"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
+"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
+} | null // If it's not on the page, the LocateParam should be null
+
+## Supported actions
+
+Each action has a \`type\` and corresponding \`param\`. To be detailed:
+${actionList}
+
+`.trim();
+};
+const outputTemplate = `
+## Output JSON Format:
+
+The JSON format is as follows:
+
+{
+"actions": [
+// ... some actions
+],
+${llmCurrentLog}
+${commonOutputFields}
+}
+
+## Examples
+
+### Example: Decompose a task
+
+When you received the following information:
+
+* Instruction: 'Click the language switch button, wait 1s, click "English"'
+* Logs: null
+* Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
+
+By viewing the page screenshot and description, you should consider this and output the JSON:
+
+* The user intent is: tap the switch button, sleep, and tap the 'English' option
+* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
+* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
+* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
+* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
+* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
+
+{
+"actions":[
+{
+"thought": "Click the language switch button to open the language options.",
+"type": "Tap",
+"param": null,
+"locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
+},
+{
+"thought": "Wait for 1 second to ensure the language options are displayed.",
+"type": "Sleep",
+"param": { "timeMs": 1000 },
+}
+],
+"error": null,
+"more_actions_needed_by_instruction": true,
+"log": "Click the language switch button to open the language options. Wait for 1 second",
+}
+
+### Example: What NOT to do
+Wrong output:
+{
+"actions":[
+{
+"thought": "Click the language switch button to open the language options.",
+"type": "Tap",
+"param": null,
+"locate": {
+{ "id": "c81c4e9a33" }, // WRONG: prompt is missing, this is not a valid LocateParam
+}
+},
+{
+"thought": "Click the English option",
+"type": "Tap",
+"param": null,
+"locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
+}
+],
+"more_actions_needed_by_instruction": false, // WRONG: should be true
+"log": "Click the language switch button to open the language options",
+}
+`;
+async function systemPromptToTaskPlanning({ actionSpace, vlMode }) {
+if (vlMode) return systemTemplateOfVLPlanning({
+actionSpace,
+vlMode
+});
+return `${systemTemplateOfLLM({
+actionSpace
+})}\n\n${outputTemplate}`;
+}
+const planSchema = {
+type: 'json_schema',
+json_schema: {
+name: 'action_items',
+strict: false,
+schema: {
+type: 'object',
+strict: false,
+properties: {
+actions: {
+type: 'array',
+items: {
+type: 'object',
+strict: false,
+properties: {
+thought: {
+type: 'string',
+description: 'Reasons for generating this task, and why this task is feasible on this page'
+},
+type: {
+type: 'string',
+description: 'Type of action'
+},
+param: {
+anyOf: [
+{
+type: 'null'
+},
+{
+type: 'object',
+additionalProperties: true
+}
+],
+description: 'Parameter of the action'
+},
+locate: {
+type: [
+'object',
+'null'
+],
+properties: {
+id: {
+type: 'string'
+},
+prompt: {
+type: 'string'
+}
+},
+required: [
+'id',
+'prompt'
+],
+additionalProperties: false,
+description: 'Location information for the target element'
+}
+},
+required: [
+'thought',
+'type',
+'param',
+'locate'
+],
+additionalProperties: false
+},
+description: 'List of actions to be performed'
+},
+more_actions_needed_by_instruction: {
+type: 'boolean',
+description: 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.'
+},
+log: {
+type: 'string',
+description: 'Log what these planned actions do. Do not include further actions that have not been planned.'
+},
+error: {
+type: [
+'string',
+'null'
+],
+description: 'Error messages about unexpected situations'
+}
+},
+required: [
+'actions',
+'more_actions_needed_by_instruction',
+'log',
+'error'
+],
+additionalProperties: false
+}
+}
+};
+const generateTaskBackgroundContext = (userInstruction, log, userActionContext)=>{
+if (log) return `
+Here is the user's instruction:
+
+<instruction>
+<high_priority_knowledge>
+${userActionContext}
+</high_priority_knowledge>
+
+${userInstruction}
+</instruction>
+
+These are the logs from previous executions, which indicate what was done in the previous actions.
+Do NOT repeat these actions.
+<previous_logs>
+${log}
+</previous_logs>
+`;
+return `
+Here is the user's instruction:
+<instruction>
+<high_priority_knowledge>
+${userActionContext}
+</high_priority_knowledge>
+
+${userInstruction}
+</instruction>
+`;
+};
+const automationUserPrompt = (vlMode)=>{
+if (vlMode) return new prompts_namespaceObject.PromptTemplate({
+template: '{taskBackgroundContext}',
+inputVariables: [
+'taskBackgroundContext'
+]
+});
+return new prompts_namespaceObject.PromptTemplate({
+template: `
+pageDescription:
+=====================================
+{pageDescription}
+=====================================
+
+{taskBackgroundContext}`,
+inputVariables: [
+"pageDescription",
+'taskBackgroundContext'
+]
+});
+};
+function checkAIConfig() {
+const openaiKey = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_API_KEY);
+const azureConfig = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_USE_AZURE_OPENAI);
+const anthropicKey = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.ANTHROPIC_API_KEY);
+const initConfigJson = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+if (openaiKey) return true;
+if (azureConfig) return true;
+if (anthropicKey) return true;
+return Boolean(initConfigJson);
+}
+let debugConfigInitialized = false;
+function initDebugConfig() {
+if (debugConfigInitialized) return;
+const shouldPrintTiming = (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_DEBUG_AI_PROFILE);
+let debugConfig = '';
+if (shouldPrintTiming) {
+console.warn('MIDSCENE_DEBUG_AI_PROFILE is deprecated, use DEBUG=midscene:ai:profile instead');
+debugConfig = 'ai:profile';
+}
+const shouldPrintAIResponse = (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_DEBUG_AI_RESPONSE);
+if (shouldPrintAIResponse) {
+console.warn('MIDSCENE_DEBUG_AI_RESPONSE is deprecated, use DEBUG=midscene:ai:response instead');
+debugConfig = debugConfig ? 'ai:*' : 'ai:call';
+}
+if (debugConfig) (0, logger_namespaceObject.enableDebug)(debugConfig);
+debugConfigInitialized = true;
+}
+const defaultModel = 'gpt-4o';
+function getModelName() {
+let modelName = defaultModel;
+const nameInConfig = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_MODEL_NAME);
+if (nameInConfig) modelName = nameInConfig;
+return modelName;
+}
+async function createChatClient({ AIActionTypeValue }) {
+initDebugConfig();
+let openai;
+const extraConfig = (0, env_namespaceObject.getAIConfigInJson)(env_namespaceObject.MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+const socksProxy = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_OPENAI_SOCKS_PROXY);
+const httpProxy = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_OPENAI_HTTP_PROXY);
+let proxyAgent;
+const debugProxy = (0, logger_namespaceObject.getDebug)('ai:call:proxy');
+if (httpProxy) {
+debugProxy('using http proxy', httpProxy);
+proxyAgent = new external_https_proxy_agent_namespaceObject.HttpsProxyAgent(httpProxy);
+} else if (socksProxy) {
+debugProxy('using socks proxy', socksProxy);
+proxyAgent = new external_socks_proxy_agent_namespaceObject.SocksProxyAgent(socksProxy);
+}
+if ((0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_USE_AZURE)) openai = new external_openai_namespaceObject.AzureOpenAI({
+baseURL: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_BASE_URL),
+apiKey: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_API_KEY),
+httpAgent: proxyAgent,
+...extraConfig,
+dangerouslyAllowBrowser: true
+});
+else if ((0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_USE_AZURE_OPENAI)) {
+const extraAzureConfig = (0, env_namespaceObject.getAIConfigInJson)(env_namespaceObject.MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON);
+const scope = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_AZURE_OPENAI_SCOPE);
+let tokenProvider;
+if (scope) {
+(0, utils_namespaceObject.assert)(!utils_namespaceObject.ifInBrowser, 'Azure OpenAI is not supported in browser with Midscene.');
+const credential = new identity_namespaceObject.DefaultAzureCredential();
+(0, utils_namespaceObject.assert)(scope, 'MIDSCENE_AZURE_OPENAI_SCOPE is required');
+tokenProvider = (0, identity_namespaceObject.getBearerTokenProvider)(credential, scope);
+openai = new external_openai_namespaceObject.AzureOpenAI({
+azureADTokenProvider: tokenProvider,
+endpoint: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_ENDPOINT),
+apiVersion: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_API_VERSION),
+deployment: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_DEPLOYMENT),
+...extraConfig,
+...extraAzureConfig
+});
+} else openai = new external_openai_namespaceObject.AzureOpenAI({
+apiKey: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_KEY),
+endpoint: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_ENDPOINT),
+apiVersion: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_API_VERSION),
+deployment: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.AZURE_OPENAI_DEPLOYMENT),
+dangerouslyAllowBrowser: true,
+...extraConfig,
+...extraAzureConfig
+});
+} else if (!(0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_USE_ANTHROPIC_SDK)) {
+const baseURL = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_BASE_URL);
+if ('string' == typeof baseURL) {
+if (!/^https?:\/\//.test(baseURL)) throw new Error(`OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}\nPlease check your config.`);
+}
+openai = new (external_openai_default())({
+baseURL: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_BASE_URL),
+apiKey: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_API_KEY),
+httpAgent: proxyAgent,
+...extraConfig,
+defaultHeaders: {
+...(null == extraConfig ? void 0 : extraConfig.defaultHeaders) || {},
+[env_namespaceObject.MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
+},
+dangerouslyAllowBrowser: true
+});
+}
+if (openai && (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_LANGSMITH_DEBUG)) {
+if (utils_namespaceObject.ifInBrowser) throw new Error('langsmith is not supported in browser');
+console.log('DEBUGGING MODE: langsmith wrapper enabled');
+const { wrapOpenAI } = await Promise.resolve().then(__webpack_require__.bind(__webpack_require__, "langsmith/wrappers"));
+openai = wrapOpenAI(openai);
+}
+if (void 0 !== openai) return {
+completion: openai.chat.completions,
+style: 'openai'
+};
+if ((0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_USE_ANTHROPIC_SDK)) {
+const apiKey = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.ANTHROPIC_API_KEY);
+(0, utils_namespaceObject.assert)(apiKey, 'ANTHROPIC_API_KEY is required');
+openai = new sdk_namespaceObject.Anthropic({
+apiKey,
+httpAgent: proxyAgent,
+dangerouslyAllowBrowser: true
+});
+}
+if (void 0 !== openai && openai.messages) return {
+completion: openai.messages,
+style: 'anthropic'
+};
+throw new Error('Openai SDK or Anthropic SDK is not initialized');
+}
+async function service_caller_call(messages, AIActionTypeValue, responseFormat, options) {
+(0, utils_namespaceObject.assert)(checkAIConfig(), 'Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html');
+const { completion, style } = await createChatClient({
+AIActionTypeValue
+});
+const maxTokens = (0, env_namespaceObject.getAIConfig)(env_namespaceObject.OPENAI_MAX_TOKENS);
+const debugCall = (0, logger_namespaceObject.getDebug)('ai:call');
+const debugProfileStats = (0, logger_namespaceObject.getDebug)('ai:profile:stats');
+const debugProfileDetail = (0, logger_namespaceObject.getDebug)('ai:profile:detail');
+const startTime = Date.now();
+const model = getModelName();
+const isStreaming = (null == options ? void 0 : options.stream) && (null == options ? void 0 : options.onChunk);
+let content;
+let accumulated = '';
+let usage;
+let timeCost;
+const commonConfig = {
+temperature: 'vlm-ui-tars' === (0, env_namespaceObject.vlLocateMode)() ? 0.0 : 0.1,
+stream: !!isStreaming,
+max_tokens: 'number' == typeof maxTokens ? maxTokens : Number.parseInt(maxTokens || '2048', 10),
+...'qwen-vl' === (0, env_namespaceObject.vlLocateMode)() ? {
+vl_high_resolution_images: true
+} : {}
+};
 try {
-
-
-
+if ('openai' === style) {
+debugCall(`sending ${isStreaming ? 'streaming ' : ''}request to ${model}`);
+if (isStreaming) {
+const stream = await completion.create({
+model,
+messages,
+response_format: responseFormat,
+...commonConfig
+}, {
+stream: true
+});
+for await (const chunk of stream){
+var _chunk_choices__delta, _chunk_choices_, _chunk_choices, _chunk_choices__delta1, _chunk_choices_1, _chunk_choices1, _chunk_choices_2, _chunk_choices2;
+const content = (null == (_chunk_choices = chunk.choices) ? void 0 : null == (_chunk_choices_ = _chunk_choices[0]) ? void 0 : null == (_chunk_choices__delta = _chunk_choices_.delta) ? void 0 : _chunk_choices__delta.content) || '';
+const reasoning_content = (null == (_chunk_choices1 = chunk.choices) ? void 0 : null == (_chunk_choices_1 = _chunk_choices1[0]) ? void 0 : null == (_chunk_choices__delta1 = _chunk_choices_1.delta) ? void 0 : _chunk_choices__delta1.reasoning_content) || '';
+if (chunk.usage) usage = chunk.usage;
+if (content || reasoning_content) {
+accumulated += content;
+const chunkData = {
+content,
+reasoning_content,
+accumulated,
+isComplete: false,
+usage: void 0
+};
+options.onChunk(chunkData);
+}
+if (null == (_chunk_choices2 = chunk.choices) ? void 0 : null == (_chunk_choices_2 = _chunk_choices2[0]) ? void 0 : _chunk_choices_2.finish_reason) {
+timeCost = Date.now() - startTime;
+if (!usage) {
+const estimatedTokens = Math.max(1, Math.floor(accumulated.length / 4));
+usage = {
+prompt_tokens: estimatedTokens,
+completion_tokens: estimatedTokens,
+total_tokens: 2 * estimatedTokens
+};
+}
+const finalChunk = {
+content: '',
+accumulated,
+reasoning_content: '',
+isComplete: true,
+usage: {
+prompt_tokens: usage.prompt_tokens ?? 0,
+completion_tokens: usage.completion_tokens ?? 0,
+total_tokens: usage.total_tokens ?? 0,
+time_cost: timeCost ?? 0
+}
+};
+options.onChunk(finalChunk);
+break;
+}
+}
+content = accumulated;
+debugProfileStats(`streaming model, ${model}, mode, ${(0, env_namespaceObject.vlLocateMode)() || 'default'}, cost-ms, ${timeCost}`);
+} else {
+var _result_usage, _result_usage1, _result_usage2;
+const result = await completion.create({
+model,
+messages,
+response_format: responseFormat,
+...commonConfig
+});
+timeCost = Date.now() - startTime;
+debugProfileStats(`model, ${model}, mode, ${(0, env_namespaceObject.vlLocateMode)() || 'default'}, ui-tars-version, ${(0, env_namespaceObject.uiTarsModelVersion)()}, prompt-tokens, ${(null == (_result_usage = result.usage) ? void 0 : _result_usage.prompt_tokens) || ''}, completion-tokens, ${(null == (_result_usage1 = result.usage) ? void 0 : _result_usage1.completion_tokens) || ''}, total-tokens, ${(null == (_result_usage2 = result.usage) ? void 0 : _result_usage2.total_tokens) || ''}, cost-ms, ${timeCost}, requestId, ${result._request_id || ''}`);
+debugProfileDetail(`model usage detail: ${JSON.stringify(result.usage)}`);
+(0, utils_namespaceObject.assert)(result.choices, `invalid response from LLM service: ${JSON.stringify(result)}`);
+content = result.choices[0].message.content;
+usage = result.usage;
+}
+debugCall(`response: ${content}`);
+(0, utils_namespaceObject.assert)(content, 'empty content');
+} else if ('anthropic' === style) {
+const convertImageContent = (content)=>{
+if ('image_url' === content.type) {
+const imgBase64 = content.image_url.url;
+(0, utils_namespaceObject.assert)(imgBase64, 'image_url is required');
+return {
+source: {
+type: 'base64',
+media_type: imgBase64.includes('data:image/png;base64,') ? 'image/png' : 'image/jpeg',
+data: imgBase64.split(',')[1]
+},
+type: 'image'
+};
+}
+return content;
+};
+if (isStreaming) {
+const stream = await completion.create({
+model,
+system: 'You are a versatile professional in software UI automation',
+messages: messages.map((m)=>({
+role: 'user',
+content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
+})),
+response_format: responseFormat,
+...commonConfig
+});
+for await (const chunk of stream){
+var _chunk_delta;
+const content = (null == (_chunk_delta = chunk.delta) ? void 0 : _chunk_delta.text) || '';
+if (content) {
+accumulated += content;
+const chunkData = {
+content,
+accumulated,
+reasoning_content: '',
+isComplete: false,
+usage: void 0
+};
+options.onChunk(chunkData);
+}
+if ('message_stop' === chunk.type) {
+timeCost = Date.now() - startTime;
+const anthropicUsage = chunk.usage;
+const finalChunk = {
+content: '',
+accumulated,
+reasoning_content: '',
+isComplete: true,
+usage: anthropicUsage ? {
+prompt_tokens: anthropicUsage.input_tokens ?? 0,
+completion_tokens: anthropicUsage.output_tokens ?? 0,
+total_tokens: (anthropicUsage.input_tokens ?? 0) + (anthropicUsage.output_tokens ?? 0),
+time_cost: timeCost ?? 0
+} : void 0
+};
+options.onChunk(finalChunk);
+break;
+}
+}
+content = accumulated;
+} else {
+const result = await completion.create({
|
|
1187
|
+
model,
|
|
1188
|
+
system: 'You are a versatile professional in software UI automation',
|
|
1189
|
+
messages: messages.map((m)=>({
|
|
1190
|
+
role: 'user',
|
|
1191
|
+
content: Array.isArray(m.content) ? m.content.map(convertImageContent) : m.content
|
|
1192
|
+
})),
|
|
1193
|
+
response_format: responseFormat,
|
|
1194
|
+
...commonConfig
|
|
1195
|
+
});
|
|
1196
|
+
timeCost = Date.now() - startTime;
|
|
1197
|
+
content = result.content[0].text;
|
|
1198
|
+
usage = result.usage;
|
|
1199
|
+
}
|
|
1200
|
+
(0, utils_namespaceObject.assert)(content, 'empty content');
|
|
1201
|
+
}
|
|
1202
|
+
if (isStreaming && !usage) {
|
|
1203
|
+
const estimatedTokens = Math.max(1, Math.floor((content || '').length / 4));
|
|
1204
|
+
usage = {
|
|
1205
|
+
prompt_tokens: estimatedTokens,
|
|
1206
|
+
completion_tokens: estimatedTokens,
|
|
1207
|
+
total_tokens: 2 * estimatedTokens
|
|
1208
|
+
};
|
|
1209
|
+
}
|
|
1210
|
+
return {
|
|
1211
|
+
content: content || '',
|
|
1212
|
+
usage: usage ? {
|
|
1213
|
+
prompt_tokens: usage.prompt_tokens ?? 0,
|
|
1214
|
+
completion_tokens: usage.completion_tokens ?? 0,
|
|
1215
|
+
total_tokens: usage.total_tokens ?? 0,
|
|
1216
|
+
time_cost: timeCost ?? 0
|
|
1217
|
+
} : void 0,
|
|
1218
|
+
isStreamed: !!isStreaming
|
|
1219
|
+
};
|
|
89
1220
|
} catch (e) {
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
let
|
|
99
|
-
const
|
|
100
|
-
|
|
101
|
-
|
|
1221
|
+
console.error(' call AI error', e);
|
|
1222
|
+
const newError = new Error(`failed to call ${isStreaming ? 'streaming ' : ''}AI model service: ${e.message}. Trouble shooting: https://midscenejs.com/model-provider.html`, {
|
|
1223
|
+
cause: e
|
|
1224
|
+
});
|
|
1225
|
+
throw newError;
|
|
1226
|
+
}
|
|
1227
|
+
}
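// (editor's note) The block above is the new streaming path of the service caller.
// A minimal consumer sketch, assuming the fourth `options` argument keeps the shape
// used in this bundle ({ stream, onChunk }); this is internal, not a documented API:
//
//   const res = await call(messages, actionType, undefined, {
//     stream: true,
//     onChunk: (chunk) => {
//       if (chunk.isComplete) console.log('usage:', chunk.usage);
//       else process.stdout.write(chunk.content);
//     },
//   });
//   // res.content holds the accumulated text and res.isStreamed === true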
+async function callToGetJSONObject(messages, AIActionTypeValue) {
+    let responseFormat;
+    const model = getModelName();
+    if (model.includes('gpt-4')) switch(AIActionTypeValue){
+        case common_AIActionType.ASSERT:
+            responseFormat = assertSchema;
+            break;
+        case common_AIActionType.INSPECT_ELEMENT:
+            responseFormat = locatorSchema;
+            break;
+        case common_AIActionType.PLAN:
+            responseFormat = planSchema;
+            break;
+        case common_AIActionType.EXTRACT_DATA:
+        case common_AIActionType.DESCRIBE_ELEMENT:
+            responseFormat = {
+                type: types_AIResponseFormat.JSON
+            };
+            break;
+    }
+    if ('gpt-4o-2024-05-13' === model) responseFormat = {
+        type: types_AIResponseFormat.JSON
+    };
+    const response = await service_caller_call(messages, AIActionTypeValue, responseFormat);
+    (0, utils_namespaceObject.assert)(response, 'empty response');
+    const jsonContent = safeParseJson(response.content);
+    return {
+        content: jsonContent,
+        usage: response.usage
     };
-    if (task.type === "Insight") {
-        _utils.assert.call(void 0,
-            task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert" || task.subType === "Boolean" || task.subType === "Number" || task.subType === "String",
-            `unsupported insight subType: ${task.subType}`
-        );
-        returnValue = await task.executor(param, executorContext);
-        if (task.subType === "Locate") {
-            previousFindOutput = _optionalChain([returnValue, 'optionalAccess', _11 => _11.output]);
-        }
-    } else if (task.type === "Action" || task.type === "Planning") {
-        returnValue = await task.executor(param, executorContext);
-    } else {
-        console.warn(
-            `unsupported task type: ${task.type}, will try to execute it directly`
-        );
-        returnValue = await task.executor(param, executorContext);
-    }
-    Object.assign(task, returnValue);
-    task.status = "finished";
-    task.timing.end = Date.now();
-    task.timing.cost = task.timing.end - task.timing.start;
-    taskIndex++;
-} catch (e) {
-    successfullyCompleted = false;
-    task.error = e;
-    task.errorMessage = _optionalChain([e, 'optionalAccess', _12 => _12.message]) || (typeof e === "string" ? e : "error-without-message");
-    task.errorStack = e.stack;
-    task.status = "failed";
-    task.timing.end = Date.now();
-    task.timing.cost = task.timing.end - task.timing.start;
-    break;
-}
 }
-
-
+function extractJSONFromCodeBlock(response) {
+    try {
+        const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
+        if (jsonMatch) return jsonMatch[1];
+        const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
+        if (codeBlockMatch) return codeBlockMatch[1];
+        const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
+        if (jsonLikeMatch) return jsonLikeMatch[0];
+    } catch {}
+    return response;
 }
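// (editor's note) extractJSONFromCodeBlock tries three shapes in order: a bare JSON
// object, a ```json fenced block, then any {...} substring. For example:
//   extractJSONFromCodeBlock('{"a":1}')                  -> '{"a":1}'
//   extractJSONFromCodeBlock('```json\n{"a":1}\n```')    -> '{"a":1}'
//   extractJSONFromCodeBlock('text {"a":1} text')        -> '{"a":1}'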
-
-
-
-    this.status = "error";
+function preprocessDoubaoBboxJson(input) {
+    if (input.includes('bbox')) while(/\d+\s+\d+/.test(input))input = input.replace(/(\d+)\s+(\d+)/g, '$1,$2');
+    return input;
 }
-
-
-
-
-
-
-
+function safeParseJson(input) {
+    const cleanJsonString = extractJSONFromCodeBlock(input);
+    if (null == cleanJsonString ? void 0 : cleanJsonString.match(/\((\d+),(\d+)\)/)) {
+        var _cleanJsonString_match;
+        return null == (_cleanJsonString_match = cleanJsonString.match(/\((\d+),(\d+)\)/)) ? void 0 : _cleanJsonString_match.slice(1).map(Number);
+    }
+    try {
+        return JSON.parse(cleanJsonString);
+    } catch {}
+    try {
+        return JSON.parse((0, external_jsonrepair_namespaceObject.jsonrepair)(cleanJsonString));
+    } catch (e) {}
+    if ('doubao-vision' === (0, env_namespaceObject.vlLocateMode)() || 'vlm-ui-tars' === (0, env_namespaceObject.vlLocateMode)()) {
+        const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
+        return JSON.parse((0, external_jsonrepair_namespaceObject.jsonrepair)(jsonString));
+    }
+    throw Error(`failed to parse json response: ${input}`);
 }
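// (editor's note) safeParseJson falls back through JSON.parse -> jsonrepair -> (for
// doubao-vision / vlm-ui-tars) a bbox-specific cleanup that turns space-separated
// numbers into comma-separated ones before repairing again, e.g.
//   '{"bbox": [100 200 300 400]}'  =>  '{"bbox": [100,200,300,400]}'
// A response shaped like "(123,456)" short-circuits to the number pair [123, 456].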
-
-
-
-
-
-
-
+const constants_namespaceObject = require("@midscene/shared/constants");
+const extractor_namespaceObject = require("@midscene/shared/extractor");
+const img_namespaceObject = require("@midscene/shared/img");
+var common_AIActionType = /*#__PURE__*/ function(AIActionType) {
+    AIActionType[AIActionType["ASSERT"] = 0] = "ASSERT";
+    AIActionType[AIActionType["INSPECT_ELEMENT"] = 1] = "INSPECT_ELEMENT";
+    AIActionType[AIActionType["EXTRACT_DATA"] = 2] = "EXTRACT_DATA";
+    AIActionType[AIActionType["PLAN"] = 3] = "PLAN";
+    AIActionType[AIActionType["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
+    return AIActionType;
+}({});
+async function callAiFn(msgs, AIActionTypeValue) {
+    const jsonObject = await callToGetJSONObject(msgs, AIActionTypeValue);
+    return {
+        content: jsonObject.content,
+        usage: jsonObject.usage
+    };
 }
-    const
-
-    )
-
-
+const defaultBboxSize = 20;
+const debugInspectUtils = (0, logger_namespaceObject.getDebug)('ai:common');
+function fillBboxParam(locate, width, height) {
+    if (locate.bbox_2d && !(null == locate ? void 0 : locate.bbox)) {
+        locate.bbox = locate.bbox_2d;
+        delete locate.bbox_2d;
+    }
+    if (null == locate ? void 0 : locate.bbox) locate.bbox = adaptBbox(locate.bbox, width, height);
+    return locate;
 }
-
-
-
-
-
-
-
-
-
-
-
+function adaptQwenBbox(bbox) {
+    if (bbox.length < 2) {
+        const msg = `invalid bbox data for qwen-vl mode: ${JSON.stringify(bbox)} `;
+        throw new Error(msg);
+    }
+    const result = [
+        Math.round(bbox[0]),
+        Math.round(bbox[1]),
+        'number' == typeof bbox[2] ? Math.round(bbox[2]) : Math.round(bbox[0] + defaultBboxSize),
+        'number' == typeof bbox[3] ? Math.round(bbox[3]) : Math.round(bbox[1] + defaultBboxSize)
+    ];
+    return result;
 }
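// (editor's note) adaptQwenBbox rounds coordinates and, when the model returns only
// a point, expands it into a defaultBboxSize (20px) square, e.g.
//   adaptQwenBbox([100.4, 50.6])       -> [100, 51, 120, 71]
//   adaptQwenBbox([100, 50, 180, 90])  -> [100, 50, 180, 90]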
-
-
-
-
-
-
-
+function adaptDoubaoBbox(bbox, width, height) {
+    (0, utils_namespaceObject.assert)(width > 0 && height > 0, 'width and height must be greater than 0 in doubao mode');
+    if ('string' == typeof bbox) {
+        (0, utils_namespaceObject.assert)(/^(\d+)\s(\d+)\s(\d+)\s(\d+)$/.test(bbox.trim()), `invalid bbox data string for doubao-vision mode: ${bbox}`);
+        const splitted = bbox.split(' ');
+        if (4 === splitted.length) return [
+            Math.round(Number(splitted[0]) * width / 1000),
+            Math.round(Number(splitted[1]) * height / 1000),
+            Math.round(Number(splitted[2]) * width / 1000),
+            Math.round(Number(splitted[3]) * height / 1000)
+        ];
+        throw new Error(`invalid bbox data string for doubao-vision mode: ${bbox}`);
+    }
+    if (Array.isArray(bbox) && Array.isArray(bbox[0])) bbox = bbox[0];
+    let bboxList = [];
+    if (Array.isArray(bbox) && 'string' == typeof bbox[0]) bbox.forEach((item)=>{
+        if ('string' == typeof item && item.includes(',')) {
+            const [x, y] = item.split(',');
+            bboxList.push(Number(x.trim()), Number(y.trim()));
+        } else if ('string' == typeof item && item.includes(' ')) {
+            const [x, y] = item.split(' ');
+            bboxList.push(Number(x.trim()), Number(y.trim()));
+        } else bboxList.push(Number(item));
+    });
+    else bboxList = bbox;
+    if (4 === bboxList.length || 5 === bboxList.length) return [
+        Math.round(bboxList[0] * width / 1000),
+        Math.round(bboxList[1] * height / 1000),
+        Math.round(bboxList[2] * width / 1000),
+        Math.round(bboxList[3] * height / 1000)
+    ];
+    if (6 === bboxList.length || 2 === bboxList.length || 3 === bboxList.length || 7 === bboxList.length) return [
+        Math.max(0, Math.round(bboxList[0] * width / 1000) - defaultBboxSize / 2),
+        Math.max(0, Math.round(bboxList[1] * height / 1000) - defaultBboxSize / 2),
+        Math.min(width, Math.round(bboxList[0] * width / 1000) + defaultBboxSize / 2),
+        Math.min(height, Math.round(bboxList[1] * height / 1000) + defaultBboxSize / 2)
+    ];
+    if (8 === bbox.length) return [
+        Math.round(bboxList[0] * width / 1000),
+        Math.round(bboxList[1] * height / 1000),
+        Math.round(bboxList[4] * width / 1000),
+        Math.round(bboxList[5] * height / 1000)
+    ];
+    const msg = `invalid bbox data for doubao-vision mode: ${JSON.stringify(bbox)} `;
+    throw new Error(msg);
+}
+function adaptBbox(bbox, width, height) {
+    if ('doubao-vision' === (0, env_namespaceObject.vlLocateMode)() || 'vlm-ui-tars' === (0, env_namespaceObject.vlLocateMode)()) return adaptDoubaoBbox(bbox, width, height);
+    if ('gemini' === (0, env_namespaceObject.vlLocateMode)()) return adaptGeminiBbox(bbox, width, height);
+    return adaptQwenBbox(bbox);
+}
+function adaptGeminiBbox(bbox, width, height) {
+    const left = Math.round(bbox[1] * width / 1000);
+    const top = Math.round(bbox[0] * height / 1000);
+    const right = Math.round(bbox[3] * width / 1000);
+    const bottom = Math.round(bbox[2] * height / 1000);
+    return [
+        left,
+        top,
+        right,
+        bottom
+    ];
+}
+function adaptBboxToRect(bbox, width, height, offsetX = 0, offsetY = 0) {
+    debugInspectUtils('adaptBboxToRect', bbox, width, height, offsetX, offsetY);
+    const [left, top, right, bottom] = adaptBbox(bbox, width, height);
+    const rect = {
+        left: left + offsetX,
+        top: top + offsetY,
+        width: right - left,
+        height: bottom - top
+    };
+    debugInspectUtils('adaptBboxToRect, result=', rect);
+    return rect;
+}
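// (editor's note) Doubao and Gemini report boxes on a 0-1000 virtual canvas, so the
// adapters rescale into pixels; Gemini additionally orders its tuple as
// [top, left, bottom, right]. For a 1280x720 screenshot:
//   adaptDoubaoBbox([250, 500, 750, 900], 1280, 720) -> [320, 360, 960, 648]
//   adaptGeminiBbox([500, 250, 900, 750], 1280, 720) -> [320, 360, 960, 648]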
+let warned = false;
+function warnGPT4oSizeLimit(size) {
+    var _getModelName;
+    if (warned) return;
+    if (null == (_getModelName = getModelName()) ? void 0 : _getModelName.toLowerCase().includes('gpt-4o')) {
+        const warningMsg = `GPT-4o has a maximum image input size of 2000x768 or 768x2000, but got ${size.width}x${size.height}. Please set your page to a smaller resolution. Otherwise, the result may be inaccurate.`;
+        if (Math.max(size.width, size.height) > 2000 || Math.min(size.width, size.height) > 768) {
+            console.warn(warningMsg);
+            warned = true;
+        }
+    } else if (size.width > 1800 || size.height > 1800) {
+        console.warn(`The image size seems too large (${size.width}x${size.height}). It may lead to more token usage, slower response, and inaccurate result.`);
+        warned = true;
+    }
+}
+function mergeRects(rects) {
+    const minLeft = Math.min(...rects.map((r)=>r.left));
+    const minTop = Math.min(...rects.map((r)=>r.top));
+    const maxRight = Math.max(...rects.map((r)=>r.left + r.width));
+    const maxBottom = Math.max(...rects.map((r)=>r.top + r.height));
+    return {
+        left: minLeft,
+        top: minTop,
+        width: maxRight - minLeft,
+        height: maxBottom - minTop
+    };
+}
+function expandSearchArea(rect, screenSize) {
+    const minEdgeSize = 'doubao-vision' === (0, env_namespaceObject.vlLocateMode)() ? 500 : 300;
+    const defaultPadding = 160;
+    const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
+    const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
+    rect.left = Math.max(0, rect.left - paddingSizeHorizontal);
+    rect.width = Math.min(rect.width + 2 * paddingSizeHorizontal, screenSize.width - rect.left);
+    rect.top = Math.max(0, rect.top - paddingSizeVertical);
+    rect.height = Math.min(rect.height + 2 * paddingSizeVertical, screenSize.height - rect.top);
+    return rect;
+}
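// (editor's note) expandSearchArea pads a located rect so the follow-up crop gives
// the model enough context: edges shorter than minEdgeSize (300px, or 500px for
// doubao-vision) are grown to that size, longer edges get 160px of padding, and the
// result is clamped to the screen. E.g. a 100x100 rect at (400, 300) on a 1280x720
// screen becomes a 300x300 rect at (300, 200) in the default mode.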
+async function markupImageForLLM(screenshotBase64, tree, size) {
+    const elementsInfo = (0, extractor_namespaceObject.treeToList)(tree);
+    const elementsPositionInfoWithoutText = elementsInfo.filter((elementInfo)=>{
+        if (elementInfo.attributes.nodeType === constants_namespaceObject.NodeType.TEXT) return false;
+        return true;
+    });
+    const imagePayload = await (0, img_namespaceObject.compositeElementInfoImg)({
+        inputImgBase64: screenshotBase64,
+        elementsPositionInfo: elementsPositionInfoWithoutText,
+        size
+    });
+    return imagePayload;
+}
+function buildYamlFlowFromPlans(plans, sleep) {
+    const flow = [];
+    for (const plan of plans){
+        var _plan_locate;
+        const type = plan.type;
+        const locate = null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.prompt;
+        if ('Tap' === type) flow.push({
+            aiTap: locate
+        });
+        else if ('Hover' === type) flow.push({
+            aiHover: locate
+        });
+        else if ('Input' === type) {
+            const param = plan.param;
+            flow.push({
+                aiInput: param.value,
+                locate
+            });
+        } else if ('KeyboardPress' === type) {
+            const param = plan.param;
+            flow.push({
+                aiKeyboardPress: param.value,
+                locate
+            });
+        } else if ('Scroll' === type) {
+            const param = plan.param;
+            flow.push({
+                aiScroll: null,
+                locate,
+                direction: param.direction,
+                scrollType: param.scrollType,
+                distance: param.distance
+            });
+        } else if ('Sleep' === type) {
+            const param = plan.param;
+            flow.push({
+                sleep: param.timeMs
+            });
+        } else 'AndroidBackButton' === type || 'AndroidHomeButton' === type || 'AndroidRecentAppsButton' === type || 'AndroidLongPress' === type || 'AndroidPull' === type || 'Error' === type || 'Assert' === type || 'AssertWithoutThrow' === type || 'Finished' === type || console.warn(`Cannot convert action ${type} to yaml flow. This should be a bug of Midscene.`);
+    }
+    if (sleep) flow.push({
+        sleep: sleep
+    });
+    return flow;
+}
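// (editor's note) buildYamlFlowFromPlans replays planned actions as a YAML-style
// flow. A sketch of the mapping, with keys taken from the branches above:
//   [{ type: 'Tap', locate: { prompt: 'login button' } },
//    { type: 'Input', param: { value: 'hi' }, locate: { prompt: 'search box' } }]
//   => [{ aiTap: 'login button' }, { aiInput: 'hi', locate: 'search box' }]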
+function describeSize(size) {
+    return `${size.width} x ${size.height}`;
+}
+const distanceThreshold = 16;
+function elementByPositionWithElementInfo(treeRoot, position, options) {
+    const requireStrictDistance = (null == options ? void 0 : options.requireStrictDistance) ?? true;
+    const filterPositionElements = (null == options ? void 0 : options.filterPositionElements) ?? false;
+    (0, utils_namespaceObject.assert)(void 0 !== position, 'position is required for query');
+    const matchingElements = [];
+    function dfs(node) {
+        if (null == node ? void 0 : node.node) {
+            const item = node.node;
+            if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
+                var _item_attributes;
+                if (!(filterPositionElements && (null == (_item_attributes = item.attributes) ? void 0 : _item_attributes.nodeType) === constants_namespaceObject.NodeType.POSITION) && item.isVisible) matchingElements.push(item);
+            }
+        }
+        for (const child of node.children)dfs(child);
+    }
+    dfs(treeRoot);
+    if (0 === matchingElements.length) return;
+    const element = matchingElements.reduce((smallest, current)=>{
+        const smallestArea = smallest.rect.width * smallest.rect.height;
+        const currentArea = current.rect.width * current.rect.height;
+        return currentArea < smallestArea ? current : smallest;
+    });
+    const distanceToCenter = distance({
+        x: element.center[0],
+        y: element.center[1]
+    }, position);
+    if (requireStrictDistance) return distanceToCenter <= distanceThreshold ? element : void 0;
+    return element;
+}
+function distance(point1, point2) {
+    return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
+}
+async function describeUserPage(context, opt) {
+    const { screenshotBase64 } = context;
+    let width;
+    let height;
+    if (context.size) ({ width, height } = context.size);
+    else {
+        const imgSize = await (0, img_namespaceObject.imageInfoOfBase64)(screenshotBase64);
+        ({ width, height } = imgSize);
+    }
+    const treeRoot = context.tree;
+    const idElementMap = {};
+    const flatElements = (0, extractor_namespaceObject.treeToList)(treeRoot);
+    if ((null == opt ? void 0 : opt.domIncluded) === true && flatElements.length >= 5000) console.warn('The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements');
+    flatElements.forEach((element)=>{
+        idElementMap[element.id] = element;
+        if (void 0 !== element.indexId) idElementMap[`${element.indexId}`] = element;
+    });
+    let pageDescription = '';
+    const visibleOnly = (null == opt ? void 0 : opt.visibleOnly) ?? (null == opt ? void 0 : opt.domIncluded) === 'visible-only';
+    if ((null == opt ? void 0 : opt.domIncluded) || !(0, env_namespaceObject.vlLocateMode)()) {
+        const contentTree = await (0, extractor_namespaceObject.descriptionOfTree)(treeRoot, null == opt ? void 0 : opt.truncateTextLength, null == opt ? void 0 : opt.filterNonTextContent, visibleOnly);
+        const sizeDescription = describeSize({
+            width,
+            height
+        });
+        pageDescription = `The size of the page: ${sizeDescription} \n The page elements tree:\n${contentTree}`;
+    }
+    return {
+        description: pageDescription,
+        elementById (idOrIndexId) {
+            (0, utils_namespaceObject.assert)(void 0 !== idOrIndexId, 'id is required for query');
+            const item = idElementMap[`${idOrIndexId}`];
+            return item;
+        },
+        elementByPosition (position, size) {
+            return elementByPositionWithElementInfo(treeRoot, position);
+        },
+        insertElementByPosition (position) {
+            const element = (0, extractor_namespaceObject.generateElementByPosition)(position);
+            treeRoot.children.push({
+                node: element,
+                children: []
+            });
+            flatElements.push(element);
+            idElementMap[element.id] = element;
+            return element;
+        },
+        size: {
+            width,
+            height
+        }
+    };
+}
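// (editor's note) describeUserPage builds the locator context: an id -> element map
// (indexId is accepted as a key too), an optional serialized element tree for
// non-VL models, and lookup helpers. elementByPositionWithElementInfo hit-tests that
// tree: every visible node whose rect contains the point is collected, the
// smallest-area node wins, and by default it is only accepted when the point lies
// within distanceThreshold (16px) of the node's center.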
+function systemPromptToExtract() {
+    return `
+You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
+
+The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
+
+If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
+
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
+
+If the user requests reasons to be provided, please provide the thought field in response, less then 100 words.
+
+Return in the following JSON format:
+{
+thought: string, // the thought process of the extraction, less then 100 words, not required by default.
+data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
+errors: [], // string[], error message if any
+}
+
+# Example 1
+For example, if the DATA_DEMAND is:
+
+<DATA_DEMAND>
+{
+"name": "name shows on the left panel, string",
+"age": "age shows on the right panel, number",
+"isAdmin": "if the user is admin, boolean"
+}
+</DATA_DEMAND>
+
+By viewing the screenshot and page contents, you can extract the following data:
+
+{
+data: {
+name: "John",
+age: 30,
+isAdmin: true
+},
+}
+
+# Example 2
+If the DATA_DEMAND is:
+
+<DATA_DEMAND>
+the todo items list, string[]
+</DATA_DEMAND>
+
+By viewing the screenshot and page contents, you can extract the following data:
+
+{
+data: ["todo 1", "todo 2", "todo 3"],
+}
+
+# Example 3
+If the DATA_DEMAND is:
+
+<DATA_DEMAND>
+the page title, string
+</DATA_DEMAND>
+
+By viewing the screenshot and page contents, you can extract the following data:
+
+{
+data: "todo list",
+}
+
+# Example 4
+If the DATA_DEMAND is:
+
+<DATA_DEMAND>
+{
+"result": "Boolean, is it currently the SMS page?"
+}
+</DATA_DEMAND>
+
+By viewing the screenshot and page contents, you can extract the following data:
+
+{
+data: { result: true },
+}
+`;
+}
+const extractDataQueryPrompt = async (pageDescription, dataQuery)=>{
+    let dataQueryText = '';
+    dataQueryText = 'string' == typeof dataQuery ? dataQuery : JSON.stringify(dataQuery, null, 2);
+    const extractDataPrompt = new prompts_namespaceObject.PromptTemplate({
+        template: `
+<PageDescription>
+{pageDescription}
+</PageDescription>
+
+<DATA_DEMAND>
+{dataQuery}
+</DATA_DEMAND>
+`,
+        inputVariables: [
+            "pageDescription",
+            'dataQuery'
+        ]
+    });
+    return await extractDataPrompt.format({
+        pageDescription,
+        dataQuery: dataQueryText
+    });
 };
-
-
-
+function systemPromptToLocateSection(vlMode) {
+    return `
+You goal is to find out one section containing the target element in the screenshot, put it in the \`bbox\` field. If the user describe the target element with some reference elements, you should also find the section containing the reference elements, put it in the \`references_bbox\` field.
+
+Usually, it should be approximately an area not more than 300x300px. Changes of the size are allowed if there are many elements to cover.
+
+return in this JSON format:
+\`\`\`json
+{
+"bbox": [number, number, number, number],
+"references_bbox"?: [
+[number, number, number, number],
+[number, number, number, number],
+...
+],
+"error"?: string
+}
+\`\`\`
 
-
+In which, all the numbers in the \`bbox\` and \`references_bbox\` represent ${bboxDescription(vlMode)}.
 
-
-
-
+For example, if the user describe the target element as "the delete button on the second row with title 'Peter'", you should put the bounding box of the delete button in the \`bbox\` field, and the bounding box of the second row in the \`references_bbox\` field.
+
+the return value should be like this:
+\`\`\`json
+{
+"bbox": [100, 100, 200, 200],
+"references_bbox": [[100, 100, 200, 200]]
+}
+\`\`\`
+`;
+}
+const sectionLocatorInstruction = new prompts_namespaceObject.PromptTemplate({
+    template: `Here is the target element user interested in:
+<targetDescription>
+{sectionDescription}
+</targetDescription>
+`,
+    inputVariables: [
+        "sectionDescription"
+    ]
+});
+const debugInspect = (0, logger_namespaceObject.getDebug)('ai:inspect');
+const debugSection = (0, logger_namespaceObject.getDebug)('ai:section');
+const extraTextFromUserPrompt = (prompt)=>{
+    if ('string' == typeof prompt) return prompt;
+    return prompt.prompt;
+};
+const promptsToChatParam = async (multimodalPrompt)=>{
+    var _multimodalPrompt_images;
+    const msgs = [];
+    if (null == multimodalPrompt ? void 0 : null == (_multimodalPrompt_images = multimodalPrompt.images) ? void 0 : _multimodalPrompt_images.length) {
+        msgs.push({
+            role: 'user',
+            content: [
+                {
+                    type: 'text',
+                    text: 'Next, I will provide all the reference images.'
+                }
+            ]
+        });
+        for (const item of multimodalPrompt.images){
+            const base64 = await (0, img_namespaceObject.preProcessImageUrl)(item.url, !!multimodalPrompt.convertHttpImage2Base64);
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'text',
+                        text: `reference image ${item.name}:`
+                    }
+                ]
+            });
+            msgs.push({
+                role: 'user',
+                content: [
+                    {
+                        type: 'image_url',
+                        image_url: {
+                            url: base64,
+                            detail: 'high'
+                        }
+                    }
+                ]
+            });
+        }
+    }
+    return msgs;
+};
+async function AiLocateElement(options) {
+    const { context, targetElementDescription, callAI } = options;
+    const { screenshotBase64 } = context;
+    const { description, elementById, insertElementByPosition } = await describeUserPage(context);
+    (0, utils_namespaceObject.assert)(targetElementDescription, "cannot find the target element description");
+    const userInstructionPrompt = await findElementPrompt.format({
+        pageDescription: description,
+        targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
+    });
+    const systemPrompt = systemPromptToLocateElement((0, env_namespaceObject.vlLocateMode)());
+    let imagePayload = screenshotBase64;
+    if (options.searchConfig) {
+        (0, utils_namespaceObject.assert)(options.searchConfig.rect, 'searchArea is provided but its rect cannot be found. Failed to locate element');
+        (0, utils_namespaceObject.assert)(options.searchConfig.imageBase64, 'searchArea is provided but its imageBase64 cannot be found. Failed to locate element');
+        imagePayload = options.searchConfig.imageBase64;
+    } else if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
+    else if (!(0, env_namespaceObject.vlLocateMode)()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: userInstructionPrompt
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof targetElementDescription) {
+        const addOns = await promptsToChatParam({
+            images: targetElementDescription.images,
+            convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const callAIFn = callAI || callToGetJSONObject;
+    const res = await callAIFn(msgs, common_AIActionType.INSPECT_ELEMENT);
+    const rawResponse = JSON.stringify(res.content);
+    let resRect;
+    let matchedElements = 'elements' in res.content ? res.content.elements : [];
+    let errors = 'errors' in res.content ? res.content.errors : [];
+    try {
+        if ('bbox' in res.content && Array.isArray(res.content.bbox)) {
+            var _options_searchConfig_rect, _options_searchConfig, _options_searchConfig_rect1, _options_searchConfig1, _options_searchConfig_rect2, _options_searchConfig2, _options_searchConfig_rect3, _options_searchConfig3;
+            resRect = adaptBboxToRect(res.content.bbox, (null == (_options_searchConfig = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect = _options_searchConfig.rect) ? void 0 : _options_searchConfig_rect.width) || context.size.width, (null == (_options_searchConfig1 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect1 = _options_searchConfig1.rect) ? void 0 : _options_searchConfig_rect1.height) || context.size.height, null == (_options_searchConfig2 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect2 = _options_searchConfig2.rect) ? void 0 : _options_searchConfig_rect2.left, null == (_options_searchConfig3 = options.searchConfig) ? void 0 : null == (_options_searchConfig_rect3 = _options_searchConfig3.rect) ? void 0 : _options_searchConfig_rect3.top);
+            debugInspect('resRect', resRect);
+            const rectCenter = {
+                x: resRect.left + resRect.width / 2,
+                y: resRect.top + resRect.height / 2
+            };
+            let element = elementByPositionWithElementInfo(context.tree, rectCenter);
+            const distanceToCenter = element ? distance({
+                x: element.center[0],
+                y: element.center[1]
+            }, rectCenter) : 0;
+            if (!element || distanceToCenter > distanceThreshold) element = insertElementByPosition(rectCenter);
+            if (element) {
+                matchedElements = [
+                    element
+                ];
+                errors = [];
+            }
+        }
+    } catch (e) {
+        const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : 'unknown error in locate';
+        if (errors && (null == errors ? void 0 : errors.length) !== 0) errors.push(`(${msg})`);
+        else errors = [
+            msg
+        ];
+    }
+    return {
+        rect: resRect,
+        parseResult: {
+            elements: matchedElements,
+            errors
+        },
+        rawResponse,
+        elementById,
+        usage: res.usage,
+        isOrderSensitive: 'object' == typeof res.content && null !== res.content && 'isOrderSensitive' in res.content ? res.content.isOrderSensitive : void 0
+    };
+}
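// (editor's note) A minimal AiLocateElement usage sketch; the `context` shape
// (screenshotBase64 / tree / size) follows describeUserPage above. This is internal
// API of the bundle, not a documented entry point, and may change between versions:
//
//   const { parseResult, rect } = await AiLocateElement({
//     context,
//     targetElementDescription: 'the blue "Submit" button',
//   });
//   // parseResult.elements holds the matched element, parseResult.errors any failures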
+async function AiLocateSection(options) {
+    const { context, sectionDescription } = options;
+    const { screenshotBase64 } = context;
+    const systemPrompt = systemPromptToLocateSection((0, env_namespaceObject.vlLocateMode)());
+    const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
+        sectionDescription: extraTextFromUserPrompt(sectionDescription)
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: screenshotBase64,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: sectionLocatorInstructionText
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof sectionDescription) {
+        const addOns = await promptsToChatParam({
+            images: sectionDescription.images,
+            convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAiFn(msgs, common_AIActionType.EXTRACT_DATA);
+    let sectionRect;
+    const sectionBbox = result.content.bbox;
+    if (sectionBbox) {
+        const targetRect = adaptBboxToRect(sectionBbox, context.size.width, context.size.height);
+        debugSection('original targetRect %j', targetRect);
+        const referenceBboxList = result.content.references_bbox || [];
+        debugSection('referenceBboxList %j', referenceBboxList);
+        const referenceRects = referenceBboxList.filter((bbox)=>Array.isArray(bbox)).map((bbox)=>adaptBboxToRect(bbox, context.size.width, context.size.height));
+        debugSection('referenceRects %j', referenceRects);
+        const mergedRect = mergeRects([
+            targetRect,
+            ...referenceRects
+        ]);
+        debugSection('mergedRect %j', mergedRect);
+        sectionRect = expandSearchArea(mergedRect, context.size);
+        debugSection('expanded sectionRect %j', sectionRect);
+    }
+    let imageBase64 = screenshotBase64;
+    if (sectionRect) imageBase64 = await (0, img_namespaceObject.cropByRect)(screenshotBase64, sectionRect, (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_USE_QWEN_VL));
+    return {
+        rect: sectionRect,
+        imageBase64,
+        error: result.content.error,
+        rawResponse: JSON.stringify(result.content),
+        usage: result.usage
+    };
+}
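// (editor's note) AiLocateSection is the first stage of the "deep think" flow: the
// model proposes a section bbox plus optional reference bboxes, these are merged and
// padded via mergeRects/expandSearchArea, and the screenshot is cropped to that rect
// so AiLocateElement can run again (via searchConfig) on a smaller, higher-signal image.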
+async function AiExtractElementInfo(options) {
+    var _options_extractOption;
+    const { dataQuery, context, extractOption, multimodalPrompt } = options;
+    const systemPrompt = systemPromptToExtract();
+    const { screenshotBase64 } = context;
+    const { description, elementById } = await describeUserPage(context, {
+        truncateTextLength: 200,
+        filterNonTextContent: false,
+        visibleOnly: false,
+        domIncluded: null == extractOption ? void 0 : extractOption.domIncluded
+    });
+    const extractDataPromptText = await extractDataQueryPrompt(description, dataQuery);
+    const userContent = [];
+    if ((null == extractOption ? void 0 : extractOption.screenshotIncluded) !== false) userContent.push({
+        type: 'image_url',
+        image_url: {
+            url: screenshotBase64,
+            detail: 'high'
+        }
+    });
+    userContent.push({
+        type: 'text',
+        text: extractDataPromptText
+    });
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: userContent
+        }
+    ];
+    if (null == (_options_extractOption = options.extractOption) ? void 0 : _options_extractOption.returnThought) msgs.push({
+        role: 'user',
+        content: 'Please provide reasons.'
+    });
+    if (multimodalPrompt) {
+        const addOns = await promptsToChatParam({
+            images: multimodalPrompt.images,
+            convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const result = await callAiFn(msgs, common_AIActionType.EXTRACT_DATA);
+    return {
+        parseResult: result.content,
+        elementById,
+        usage: result.usage
+    };
+}
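// (editor's note) AiExtractElementInfo assembles the extraction call from the
// options visible above: extractOption.domIncluded controls whether the element
// tree is serialized into the prompt, screenshotIncluded (default true) attaches
// the screenshot, and returnThought appends a request for the `thought` field.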
+async function AiAssert(options) {
+    const { assertion, context } = options;
+    (0, utils_namespaceObject.assert)(assertion, 'assertion should not be empty');
+    const { screenshotBase64 } = context;
+    const systemPrompt = systemPromptToAssert({
+        isUITars: (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_USE_VLM_UI_TARS)
+    });
+    const assertionText = extraTextFromUserPrompt(assertion);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: screenshotBase64,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: `
+Here is the assertion. Please tell whether it is truthy according to the screenshot.
+=====================================
+${assertionText}
+=====================================
+`
+                }
+            ]
+        }
+    ];
+    if ('string' != typeof assertion) {
+        const addOns = await promptsToChatParam({
+            images: assertion.images,
+            convertHttpImage2Base64: assertion.convertHttpImage2Base64
+        });
+        msgs.push(...addOns);
+    }
+    const { content: assertResult, usage } = await callAiFn(msgs, common_AIActionType.ASSERT);
+    return {
+        content: assertResult,
+        usage
+    };
+}
+async function llm_planning_plan(userInstruction, opts) {
+    var _planFromAI_action;
+    const { callAI, context } = opts || {};
+    const { screenshotBase64, size } = context;
+    const { description: pageDescription, elementById } = await describeUserPage(context);
+    const systemPrompt = await systemPromptToTaskPlanning({
+        actionSpace: opts.actionSpace,
+        vlMode: (0, env_namespaceObject.vlLocateMode)()
+    });
+    const taskBackgroundContextText = generateTaskBackgroundContext(userInstruction, opts.log, opts.actionContext);
+    const userInstructionPrompt = await automationUserPrompt((0, env_namespaceObject.vlLocateMode)()).format({
+        pageDescription,
+        taskBackgroundContext: taskBackgroundContextText
+    });
+    let imagePayload = screenshotBase64;
+    if ('qwen-vl' === (0, env_namespaceObject.vlLocateMode)()) imagePayload = await (0, img_namespaceObject.paddingToMatchBlockByBase64)(imagePayload);
+    else if (!(0, env_namespaceObject.vlLocateMode)()) imagePayload = await markupImageForLLM(screenshotBase64, context.tree, context.size);
+    warnGPT4oSizeLimit(size);
+    const msgs = [
+        {
+            role: 'system',
+            content: systemPrompt
+        },
+        {
+            role: 'user',
+            content: [
+                {
+                    type: 'image_url',
+                    image_url: {
+                        url: imagePayload,
+                        detail: 'high'
+                    }
+                },
+                {
+                    type: 'text',
+                    text: userInstructionPrompt
+                }
+            ]
+        }
+    ];
+    const call = callAI || callAiFn;
+    const { content, usage } = await call(msgs, common_AIActionType.PLAN);
+    const rawResponse = JSON.stringify(content, void 0, 2);
+    const planFromAI = content;
+    const actions = ((null == (_planFromAI_action = planFromAI.action) ? void 0 : _planFromAI_action.type) ? [
+        planFromAI.action
+    ] : planFromAI.actions) || [];
+    const returnValue = {
+        ...planFromAI,
+        actions,
+        rawResponse,
+        usage,
+        yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
+    };
+    (0, utils_namespaceObject.assert)(planFromAI, "can't get plans from AI");
+    if ((0, env_namespaceObject.vlLocateMode)()) {
+        actions.forEach((action)=>{
+            if (action.locate) try {
+                action.locate = fillBboxParam(action.locate, size.width, size.height);
+            } catch (e) {
+                throw new Error(`Failed to fill locate param: ${planFromAI.error} (${e instanceof Error ? e.message : 'unknown error'})`, {
+                    cause: e
+                });
+            }
+        });
+        (0, utils_namespaceObject.assert)(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
+    } else actions.forEach((action)=>{
+        var _action_locate;
+        if (null == (_action_locate = action.locate) ? void 0 : _action_locate.id) {
+            const element = elementById(action.locate.id);
+            if (element) action.locate.id = element.id;
+        }
+    });
+    if (0 === actions.length && returnValue.more_actions_needed_by_instruction && !returnValue.sleep) console.warn('No actions planned for the prompt, but model said more actions are needed:', userInstruction);
+    return returnValue;
+}
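// (editor's note) plan() normalizes the model output: a single `action` object is
// wrapped into an `actions` array, bbox-style locators are converted to pixel rects
// via fillBboxParam when a VL mode is active, and element ids are re-resolved
// through elementById otherwise; the planned actions are also mirrored into a YAML
// flow via buildYamlFlowFromPlans.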
+require("@midscene/shared/us-keyboard-layout");
+require("@ui-tars/action-parser");
+(0, logger_namespaceObject.getDebug)('ui-tars-planning');
+const elementDescriberInstruction = ()=>`
+Describe the element in the red rectangle for precise identification. Use ${(0, env_namespaceObject.getPreferredLanguage)()}.
 
 CRITICAL REQUIREMENTS:
 1. UNIQUENESS: The description must uniquely identify this element on the current page
@@ -230,341 +2159,294 @@ Return JSON:
 "description": "unique element identifier",
 "error"?: "error message if any"
 }`;
-
-
-
-
-
-
-
-
-
-
-
-
-
-// src/insight/utils.ts
-
-
-function emitInsightDump(data, dumpSubscriber) {
-    const baseData = {
-        sdkVersion: _chunkO3KUKF2Ajs.getVersion.call(void 0, ),
-        logTime: Date.now(),
-        model_name: _env.getAIConfig.call(void 0, _env.MIDSCENE_MODEL_NAME) || ""
-    };
-    const finalData = {
-        logId: _utils.uuid.call(void 0, ),
-        ...baseData,
-        ...data
-    };
-    _optionalChain([dumpSubscriber, 'optionalCall', _13 => _13(finalData)]);
-}
-
-// src/insight/index.ts
-var debug = _logger.getDebug.call(void 0, "ai:insight");
-var Insight = class {
-    constructor(context, opt) {
-        this.aiVendorFn = _chunkDDYIQHOAjs.callAiFn;
-        _utils.assert.call(void 0, context, "context is required for Insight");
-        if (typeof context === "function") {
-            this.contextRetrieverFn = context;
-        } else {
-            this.contextRetrieverFn = () => Promise.resolve(context);
-        }
-        if (typeof _optionalChain([opt, 'optionalAccess', _14 => _14.aiVendorFn]) !== "undefined") {
-            this.aiVendorFn = opt.aiVendorFn;
-        }
-        if (typeof _optionalChain([opt, 'optionalAccess', _15 => _15.taskInfo]) !== "undefined") {
-            this.taskInfo = opt.taskInfo;
+function emitInsightDump(data, dumpSubscriber) {
+    const baseData = {
+        sdkVersion: getVersion(),
+        logTime: Date.now(),
+        model_name: (0, env_namespaceObject.getAIConfig)(env_namespaceObject.MIDSCENE_MODEL_NAME) || ''
+    };
+    const finalData = {
+        logId: (0, utils_namespaceObject.uuid)(),
+        ...baseData,
+        ...data
+    };
+    null == dumpSubscriber || dumpSubscriber(finalData);
 }
-
-
-
-
-
-
-
-
-
-            _env.MIDSCENE_FORCE_DEEP_THINK
-        );
-        if (globalDeepThinkSwitch) {
-            debug("globalDeepThinkSwitch", globalDeepThinkSwitch);
-        }
-        let searchAreaPrompt;
-        if (query.deepThink || globalDeepThinkSwitch) {
-            searchAreaPrompt = query.prompt;
-        }
-        if (searchAreaPrompt && !_env.vlLocateMode.call(void 0, )) {
-            console.warn(
-                'The "deepThink" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model'
-            );
-            searchAreaPrompt = void 0;
-        }
-        const context = _optionalChain([opt, 'optionalAccess', _16 => _16.context]) || await this.contextRetrieverFn("locate");
-        let searchArea = void 0;
-        let searchAreaRawResponse = void 0;
-        let searchAreaUsage = void 0;
-        let searchAreaResponse = void 0;
-        if (searchAreaPrompt) {
-            searchAreaResponse = await _chunkDDYIQHOAjs.AiLocateSection.call(void 0, {
-                context,
-                sectionDescription: searchAreaPrompt
-            });
-            _utils.assert.call(void 0,
-                searchAreaResponse.rect,
-                `cannot find search area for "${searchAreaPrompt}"${searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ""}`
-            );
-            searchAreaRawResponse = searchAreaResponse.rawResponse;
-            searchAreaUsage = searchAreaResponse.usage;
-            searchArea = searchAreaResponse.rect;
-        }
-        const startTime = Date.now();
-        const {
-            parseResult,
-            rect,
-            elementById,
-            rawResponse,
-            usage,
-            isOrderSensitive
-        } = await _chunkDDYIQHOAjs.AiLocateElement.call(void 0, {
-            callAI: callAI || this.aiVendorFn,
-            context,
-            targetElementDescription: queryPrompt,
-            searchConfig: searchAreaResponse
-        });
-        const timeCost = Date.now() - startTime;
-        const taskInfo = {
-            ...this.taskInfo ? this.taskInfo : {},
-            durationMs: timeCost,
-            rawResponse: JSON.stringify(rawResponse),
-            formatResponse: JSON.stringify(parseResult),
-            usage,
-            searchArea,
-            searchAreaRawResponse,
-            searchAreaUsage
-        };
-        let errorLog;
-        if (_optionalChain([parseResult, 'access', _17 => _17.errors, 'optionalAccess', _18 => _18.length])) {
-            errorLog = `AI model failed to locate:
-${parseResult.errors.join("\n")}`;
-        }
-        const dumpData = {
-            type: "locate",
-            userQuery: {
-                element: queryPrompt
-            },
-            matchedElement: [],
-            matchedRect: rect,
-            data: null,
-            taskInfo,
-            deepThink: !!searchArea,
-            error: errorLog
-        };
-        const elements = [];
-        (parseResult.elements || []).forEach((item) => {
-            if ("id" in item) {
-                const element = elementById(_optionalChain([item, 'optionalAccess', _19 => _19.id]));
-                if (!element) {
-                    console.warn(
-                        `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
-                    );
-                    return;
-                }
-                elements.push(element);
-            }
-        });
-        emitInsightDump(
-            {
-                ...dumpData,
-                matchedElement: elements
-            },
-            dumpSubscriber
-        );
-        if (errorLog) {
-            throw new Error(errorLog);
-        }
-        _utils.assert.call(void 0,
-            elements.length <= 1,
-            `locate: multiple elements found, length = ${elements.length}`
-        );
-        if (elements.length === 1) {
-            return {
-                element: {
-                    id: elements[0].id,
-                    indexId: elements[0].indexId,
-                    center: elements[0].center,
-                    rect: elements[0].rect,
-                    xpaths: elements[0].xpaths || [],
-                    attributes: elements[0].attributes,
-                    isOrderSensitive
-                },
-                rect
-            };
+function insight_define_property(obj, key, value) {
+    if (key in obj) Object.defineProperty(obj, key, {
+        value: value,
+        enumerable: true,
+        configurable: true,
+        writable: true
+    });
+    else obj[key] = value;
+    return obj;
 }
|
|
405
|
-
|
|
406
|
-
|
|
407
|
-
|
|
408
|
-
|
|
409
|
-
|
|
410
|
-
|
|
411
|
-
|
|
412
|
-
|
|
413
|
-
|
|
414
|
-
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
420
|
-
|
|
421
|
-
|
|
422
|
-
|
|
423
|
-
|
|
424
|
-
|
|
425
|
-
|
|
426
|
-
|
|
427
|
-
|
|
428
|
-
|
|
429
|
-
|
|
430
|
-
|
|
431
|
-
|
|
432
|
-
|
|
433
|
-
|
|
434
|
-
|
|
435
|
-
|
|
436
|
-
|
|
437
|
-
|
|
438
|
-
|
|
439
|
-
|
|
440
|
-
|
|
441
|
-
|
|
442
|
-
|
|
443
|
-
|
|
444
|
-
|
|
445
|
-
|
|
446
|
-
|
|
447
|
-
|
|
448
|
-
|
|
449
|
-
|
|
450
|
-
|
|
451
|
-
|
|
452
|
-
|
|
453
|
-
|
|
454
|
-
|
|
455
|
-
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
460
|
-
|
|
461
|
-
|
|
462
|
-
|
|
463
|
-
|
|
464
|
-
|
|
465
|
-
|
|
466
|
-
|
|
467
|
-
|
|
468
|
-
|
|
469
|
-
|
|
470
|
-
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
488
|
-
|
|
489
|
-
|
|
490
|
-
|
|
491
|
-
|
|
492
|
-
|
|
493
|
-
|
|
494
|
-
|
|
495
|
-
|
|
496
|
-
|
|
497
|
-
|
|
498
|
-
|
|
499
|
-
|
|
500
|
-
const context = await this.contextRetrieverFn("describe");
|
|
501
|
-
const { screenshotBase64, size } = context;
|
|
502
|
-
_utils.assert.call(void 0, screenshotBase64, "screenshot is required for insight.describe");
|
|
503
|
-
const systemPrompt = elementDescriberInstruction();
|
|
504
|
-
const defaultRectSize = 30;
|
|
505
|
-
const targetRect = Array.isArray(target) ? {
|
|
506
|
-
left: Math.floor(target[0] - defaultRectSize / 2),
|
|
507
|
-
top: Math.floor(target[1] - defaultRectSize / 2),
|
|
508
|
-
width: defaultRectSize,
|
|
509
|
-
height: defaultRectSize
|
|
510
|
-
} : target;
|
|
511
|
-
let imagePayload = await _img.compositeElementInfoImg.call(void 0, {
|
|
512
|
-
inputImgBase64: screenshotBase64,
|
|
513
|
-
size,
|
|
514
|
-
elementsPositionInfo: [
|
|
515
|
-
{
|
|
516
|
-
rect: targetRect
|
|
2185
|
+
const insight_debug = (0, logger_namespaceObject.getDebug)('ai:insight');
|
|
2186
|
+
class Insight {
|
|
2187
|
+
async locate(query, opt) {
|
|
2188
|
+
var _parseResult_errors;
|
|
2189
|
+
const { callAI } = opt || {};
|
|
2190
|
+
const queryPrompt = 'string' == typeof query ? query : query.prompt;
|
|
2191
|
+
(0, utils_namespaceObject.assert)(queryPrompt, 'query is required for locate');
|
|
2192
|
+
const dumpSubscriber = this.onceDumpUpdatedFn;
|
|
2193
|
+
this.onceDumpUpdatedFn = void 0;
|
|
2194
|
+
(0, utils_namespaceObject.assert)('object' == typeof query, 'query should be an object for locate');
|
|
2195
|
+
const globalDeepThinkSwitch = (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_FORCE_DEEP_THINK);
|
|
2196
|
+
if (globalDeepThinkSwitch) insight_debug('globalDeepThinkSwitch', globalDeepThinkSwitch);
|
|
2197
|
+
let searchAreaPrompt;
|
|
2198
|
+
if (query.deepThink || globalDeepThinkSwitch) searchAreaPrompt = query.prompt;
|
|
2199
|
+
if (searchAreaPrompt && !(0, env_namespaceObject.vlLocateMode)()) {
|
|
2200
|
+
console.warn('The "deepThink" feature is not supported with multimodal LLM. Please config VL model for Midscene. https://midscenejs.com/choose-a-model');
|
|
2201
|
+
searchAreaPrompt = void 0;
|
|
2202
|
+
}
|
|
2203
|
+
const context = (null == opt ? void 0 : opt.context) || await this.contextRetrieverFn('locate');
|
|
2204
|
+
let searchArea;
|
|
2205
|
+
let searchAreaRawResponse;
|
|
2206
|
+
let searchAreaUsage;
|
|
2207
|
+
let searchAreaResponse;
|
|
2208
|
+
if (searchAreaPrompt) {
|
|
2209
|
+
searchAreaResponse = await AiLocateSection({
|
|
2210
|
+
context,
|
|
2211
|
+
sectionDescription: searchAreaPrompt
|
|
2212
|
+
});
|
|
2213
|
+
(0, utils_namespaceObject.assert)(searchAreaResponse.rect, `cannot find search area for "${searchAreaPrompt}"${searchAreaResponse.error ? `: ${searchAreaResponse.error}` : ''}`);
|
|
2214
|
+
searchAreaRawResponse = searchAreaResponse.rawResponse;
|
|
2215
|
+
searchAreaUsage = searchAreaResponse.usage;
|
|
2216
|
+
searchArea = searchAreaResponse.rect;
|
|
2217
|
+
}
|
|
2218
|
+
const startTime = Date.now();
|
|
2219
|
+
const { parseResult, rect, elementById, rawResponse, usage, isOrderSensitive } = await AiLocateElement({
|
|
2220
|
+
callAI: callAI || this.aiVendorFn,
|
|
2221
|
+
context,
|
|
2222
|
+
targetElementDescription: queryPrompt,
|
|
2223
|
+
searchConfig: searchAreaResponse
|
|
2224
|
+
});
|
|
2225
|
+
const timeCost = Date.now() - startTime;
|
|
2226
|
+
const taskInfo = {
|
|
2227
|
+
...this.taskInfo ? this.taskInfo : {},
|
|
2228
|
+
durationMs: timeCost,
|
|
2229
|
+
rawResponse: JSON.stringify(rawResponse),
|
|
2230
|
+
formatResponse: JSON.stringify(parseResult),
|
|
2231
|
+
usage,
|
|
2232
|
+
searchArea,
|
|
2233
|
+
searchAreaRawResponse,
|
|
2234
|
+
searchAreaUsage
|
|
2235
|
+
};
|
|
2236
|
+
let errorLog;
|
|
2237
|
+
if (null == (_parseResult_errors = parseResult.errors) ? void 0 : _parseResult_errors.length) errorLog = `AI model failed to locate: \n${parseResult.errors.join('\n')}`;
|
|
2238
|
+
const dumpData = {
|
|
2239
|
+
type: 'locate',
|
|
2240
|
+
userQuery: {
|
|
2241
|
+
element: queryPrompt
|
|
2242
|
+
},
|
|
2243
|
+
matchedElement: [],
|
|
2244
|
+
matchedRect: rect,
|
|
2245
|
+
data: null,
|
|
2246
|
+
taskInfo,
|
|
2247
|
+
deepThink: !!searchArea,
|
|
2248
|
+
error: errorLog
|
|
2249
|
+
};
|
|
2250
|
+
const elements = [];
|
|
2251
|
+
(parseResult.elements || []).forEach((item)=>{
|
|
2252
|
+
if ('id' in item) {
|
|
2253
|
+
const element = elementById(null == item ? void 0 : item.id);
|
|
2254
|
+
if (!element) return void console.warn(`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
|
|
2255
|
+
elements.push(element);
|
|
2256
|
+
}
|
|
2257
|
+
});
|
|
2258
|
+
emitInsightDump({
|
|
2259
|
+
...dumpData,
|
|
2260
|
+
matchedElement: elements
|
|
2261
|
+
}, dumpSubscriber);
|
|
2262
|
+
if (errorLog) throw new Error(errorLog);
|
|
2263
|
+
(0, utils_namespaceObject.assert)(elements.length <= 1, `locate: multiple elements found, length = ${elements.length}`);
|
|
2264
|
+
if (1 === elements.length) return {
|
|
2265
|
+
element: {
|
|
2266
|
+
id: elements[0].id,
|
|
2267
|
+
indexId: elements[0].indexId,
|
|
2268
|
+
center: elements[0].center,
|
|
2269
|
+
rect: elements[0].rect,
|
|
2270
|
+
xpaths: elements[0].xpaths || [],
|
|
2271
|
+
attributes: elements[0].attributes,
|
|
2272
|
+
isOrderSensitive
|
|
2273
|
+
},
|
|
2274
|
+
rect
|
|
2275
|
+
};
|
|
2276
|
+
return {
|
|
2277
|
+
element: null,
|
|
2278
|
+
rect
|
|
2279
|
+
};
|
|
517
2280
|
}
|
|
518
|
-
|
|
519
|
-
|
|
520
|
-
|
|
521
|
-
|
|
522
|
-
|
|
523
|
-
|
|
524
|
-
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
|
|
538
|
-
|
|
539
|
-
|
|
2281
|
+
async extract(dataDemand, opt, multimodalPrompt) {
|
|
2282
|
+
var _parseResult_errors;
|
|
2283
|
+
(0, utils_namespaceObject.assert)('object' == typeof dataDemand || 'string' == typeof dataDemand, `dataDemand should be object or string, but get ${typeof dataDemand}`);
|
|
2284
|
+
const dumpSubscriber = this.onceDumpUpdatedFn;
|
|
2285
|
+
this.onceDumpUpdatedFn = void 0;
|
|
2286
|
+
const context = await this.contextRetrieverFn('extract');
|
|
2287
|
+
const startTime = Date.now();
|
|
2288
|
+
const { parseResult, usage } = await AiExtractElementInfo({
|
|
2289
|
+
context,
|
|
2290
|
+
dataQuery: dataDemand,
|
|
2291
|
+
multimodalPrompt,
|
|
2292
|
+
extractOption: opt
|
|
2293
|
+
});
|
|
2294
|
+
const timeCost = Date.now() - startTime;
|
|
2295
|
+
const taskInfo = {
|
|
2296
|
+
...this.taskInfo ? this.taskInfo : {},
|
|
2297
|
+
durationMs: timeCost,
|
|
2298
|
+
rawResponse: JSON.stringify(parseResult)
|
|
2299
|
+
};
|
|
2300
|
+
let errorLog;
|
|
2301
|
+
if (null == (_parseResult_errors = parseResult.errors) ? void 0 : _parseResult_errors.length) errorLog = `AI response error: \n${parseResult.errors.join('\n')}`;
|
|
2302
|
+
const dumpData = {
|
|
2303
|
+
type: 'extract',
|
|
2304
|
+
userQuery: {
|
|
2305
|
+
dataDemand
|
|
2306
|
+
},
|
|
2307
|
+
matchedElement: [],
|
|
2308
|
+
data: null,
|
|
2309
|
+
taskInfo,
|
|
2310
|
+
error: errorLog
|
|
2311
|
+
};
|
|
2312
|
+
const { data, thought } = parseResult || {};
|
|
2313
|
+
emitInsightDump({
|
|
2314
|
+
...dumpData,
|
|
2315
|
+
data
|
|
2316
|
+
}, dumpSubscriber);
|
|
2317
|
+
if (errorLog && !data) throw new Error(errorLog);
|
|
2318
|
+
return {
|
|
2319
|
+
data,
|
|
2320
|
+
thought,
|
|
2321
|
+
usage
|
|
2322
|
+
};
|
|
2323
|
+
}
|
|
2324
|
+
async assert(assertion) {
|
|
2325
|
+
const dumpSubscriber = this.onceDumpUpdatedFn;
|
|
2326
|
+
this.onceDumpUpdatedFn = void 0;
|
|
2327
|
+
const context = await this.contextRetrieverFn('assert');
|
|
2328
|
+
const startTime = Date.now();
|
|
2329
|
+
const assertResult = await AiAssert({
|
|
2330
|
+
assertion,
|
|
2331
|
+
context
|
|
2332
|
+
});
|
|
2333
|
+
const timeCost = Date.now() - startTime;
|
|
2334
|
+
const taskInfo = {
|
|
2335
|
+
...this.taskInfo ? this.taskInfo : {},
|
|
2336
|
+
durationMs: timeCost,
|
|
2337
|
+
rawResponse: JSON.stringify(assertResult.content)
|
|
2338
|
+
};
|
|
2339
|
+
const { thought, pass } = assertResult.content;
|
|
2340
|
+
const dumpData = {
|
|
2341
|
+
type: 'assert',
|
|
2342
|
+
userQuery: {
|
|
2343
|
+
assertion
|
|
2344
|
+
},
|
|
2345
|
+
matchedElement: [],
|
|
2346
|
+
data: null,
|
|
2347
|
+
taskInfo,
|
|
2348
|
+
assertionPass: pass,
|
|
2349
|
+
assertionThought: thought,
|
|
2350
|
+
error: pass ? void 0 : thought
|
|
2351
|
+
};
|
|
2352
|
+
emitInsightDump(dumpData, dumpSubscriber);
|
|
2353
|
+
return {
|
|
2354
|
+
pass,
|
|
2355
|
+
thought,
|
|
2356
|
+
usage: assertResult.usage
|
|
2357
|
+
};
|
|
2358
|
+
}
|
|
2359
|
+
async describe(target, opt) {
|
|
2360
|
+
(0, utils_namespaceObject.assert)(target, 'target is required for insight.describe');
|
|
2361
|
+
const context = await this.contextRetrieverFn('describe');
|
|
2362
|
+
const { screenshotBase64, size } = context;
|
|
2363
|
+
(0, utils_namespaceObject.assert)(screenshotBase64, 'screenshot is required for insight.describe');
|
|
2364
|
+
const systemPrompt = elementDescriberInstruction();
|
|
2365
|
+
const defaultRectSize = 30;
|
|
2366
|
+
const targetRect = Array.isArray(target) ? {
|
|
2367
|
+
left: Math.floor(target[0] - defaultRectSize / 2),
|
|
2368
|
+
top: Math.floor(target[1] - defaultRectSize / 2),
|
|
2369
|
+
width: defaultRectSize,
|
|
2370
|
+
height: defaultRectSize
|
|
2371
|
+
} : target;
|
|
2372
|
+
let imagePayload = await (0, img_namespaceObject.compositeElementInfoImg)({
|
|
2373
|
+
inputImgBase64: screenshotBase64,
|
|
2374
|
+
size,
|
|
2375
|
+
elementsPositionInfo: [
|
|
2376
|
+
{
|
|
2377
|
+
rect: targetRect
|
|
2378
|
+
}
|
|
2379
|
+
],
|
|
2380
|
+
borderThickness: 3
|
|
2381
|
+
});
|
|
2382
|
+
if (null == opt ? void 0 : opt.deepThink) {
|
|
2383
|
+
const searchArea = expandSearchArea(targetRect, context.size);
|
|
2384
|
+
insight_debug('describe: set searchArea', searchArea);
|
|
2385
|
+
imagePayload = await (0, img_namespaceObject.cropByRect)(imagePayload, searchArea, (0, env_namespaceObject.getAIConfigInBoolean)(env_namespaceObject.MIDSCENE_USE_QWEN_VL));
|
|
540
2386
|
}
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
|
|
544
|
-
|
|
545
|
-
|
|
546
|
-
|
|
547
|
-
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
2387
|
+
const msgs = [
|
|
2388
|
+
{
|
|
2389
|
+
role: 'system',
|
|
2390
|
+
content: systemPrompt
|
|
2391
|
+
},
|
|
2392
|
+
{
|
|
2393
|
+
role: 'user',
|
|
2394
|
+
content: [
|
|
2395
|
+
{
|
|
2396
|
+
type: 'image_url',
|
|
2397
|
+
image_url: {
|
|
2398
|
+
url: imagePayload,
|
|
2399
|
+
detail: 'high'
|
|
2400
|
+
}
|
|
2401
|
+
}
|
|
2402
|
+
]
|
|
2403
|
+
}
|
|
2404
|
+
];
|
|
2405
|
+
const callAIFn = this.aiVendorFn || callToGetJSONObject;
|
|
2406
|
+
const res = await callAIFn(msgs, common_AIActionType.DESCRIBE_ELEMENT);
|
|
2407
|
+
const { content } = res;
|
|
2408
|
+
(0, utils_namespaceObject.assert)(!content.error, `describe failed: ${content.error}`);
|
|
2409
|
+
(0, utils_namespaceObject.assert)(content.description, 'failed to describe the element');
|
|
2410
|
+
return content;
|
|
2411
|
+
}
|
|
2412
|
+
constructor(context, opt){
|
|
2413
|
+
insight_define_property(this, "contextRetrieverFn", void 0);
|
|
2414
|
+
insight_define_property(this, "aiVendorFn", callAiFn);
|
|
2415
|
+
insight_define_property(this, "onceDumpUpdatedFn", void 0);
|
|
2416
|
+
insight_define_property(this, "taskInfo", void 0);
|
|
2417
|
+
(0, utils_namespaceObject.assert)(context, 'context is required for Insight');
|
|
2418
|
+
if ('function' == typeof context) this.contextRetrieverFn = context;
|
|
2419
|
+
else this.contextRetrieverFn = ()=>Promise.resolve(context);
|
|
2420
|
+
if (void 0 !== (null == opt ? void 0 : opt.aiVendorFn)) this.aiVendorFn = opt.aiVendorFn;
|
|
2421
|
+
if (void 0 !== (null == opt ? void 0 : opt.taskInfo)) this.taskInfo = opt.taskInfo;
|
|
2422
|
+
}
|
|
2423
|
+
}
|
|
2424
|
+
const src = Insight;
|
|
2425
|
+
})();
|
|
2426
|
+
exports.AiAssert = __webpack_exports__.AiAssert;
|
|
2427
|
+
exports.AiLocateElement = __webpack_exports__.AiLocateElement;
|
|
2428
|
+
exports.Executor = __webpack_exports__.Executor;
|
|
2429
|
+
exports.Insight = __webpack_exports__.Insight;
|
|
2430
|
+
exports.MIDSCENE_MODEL_NAME = __webpack_exports__.MIDSCENE_MODEL_NAME;
|
|
2431
|
+
exports["default"] = __webpack_exports__["default"];
|
|
2432
|
+
exports.describeUserPage = __webpack_exports__.describeUserPage;
|
|
2433
|
+
exports.getAIConfig = __webpack_exports__.getAIConfig;
|
|
2434
|
+
exports.getVersion = __webpack_exports__.getVersion;
|
|
2435
|
+
exports.plan = __webpack_exports__.plan;
|
|
2436
|
+
for(var __webpack_i__ in __webpack_exports__)if (-1 === [
|
|
2437
|
+
"AiAssert",
|
|
2438
|
+
"AiLocateElement",
|
|
2439
|
+
"Executor",
|
|
2440
|
+
"Insight",
|
|
2441
|
+
"MIDSCENE_MODEL_NAME",
|
|
2442
|
+
"default",
|
|
2443
|
+
"describeUserPage",
|
|
2444
|
+
"getAIConfig",
|
|
2445
|
+
"getVersion",
|
|
2446
|
+
"plan"
|
|
2447
|
+
].indexOf(__webpack_i__)) exports[__webpack_i__] = __webpack_exports__[__webpack_i__];
|
|
2448
|
+
Object.defineProperty(exports, '__esModule', {
|
|
2449
|
+
value: true
|
|
2450
|
+
});
|
|
569
2451
|
|
|
570
2452
|
//# sourceMappingURL=index.js.map
|