@midscene/core 0.27.1-beta-20250822053848.0 → 0.27.1-beta-20250822103738.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +518 -0
- package/dist/es/agent/agent.mjs.map +1 -0
- package/dist/es/agent/index.mjs +6 -0
- package/dist/es/agent/task-cache.mjs +149 -0
- package/dist/es/agent/task-cache.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +767 -0
- package/dist/es/agent/tasks.mjs.map +1 -0
- package/dist/es/agent/ui-utils.mjs +89 -0
- package/dist/es/agent/ui-utils.mjs.map +1 -0
- package/dist/es/agent/utils.mjs +303 -0
- package/dist/es/agent/utils.mjs.map +1 -0
- package/dist/es/device/index.mjs +20 -0
- package/dist/es/device/index.mjs.map +1 -0
- package/dist/es/index.mjs +2 -1
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/builder.mjs +13 -0
- package/dist/es/yaml/builder.mjs.map +1 -0
- package/dist/es/yaml/index.mjs +3 -0
- package/dist/es/yaml/player.mjs +375 -0
- package/dist/es/yaml/player.mjs.map +1 -0
- package/dist/es/yaml/utils.mjs +75 -0
- package/dist/es/yaml/utils.mjs.map +1 -0
- package/dist/lib/agent/agent.js +562 -0
- package/dist/lib/agent/agent.js.map +1 -0
- package/dist/lib/agent/index.js +84 -0
- package/dist/lib/agent/index.js.map +1 -0
- package/dist/lib/agent/task-cache.js +201 -0
- package/dist/lib/agent/task-cache.js.map +1 -0
- package/dist/lib/agent/tasks.js +804 -0
- package/dist/lib/agent/tasks.js.map +1 -0
- package/dist/lib/agent/ui-utils.js +141 -0
- package/dist/lib/agent/ui-utils.js.map +1 -0
- package/dist/lib/agent/utils.js +377 -0
- package/dist/lib/agent/utils.js.map +1 -0
- package/dist/lib/device/index.js +54 -0
- package/dist/lib/device/index.js.map +1 -0
- package/dist/lib/index.js +5 -1
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/builder.js +57 -0
- package/dist/lib/yaml/builder.js.map +1 -0
- package/dist/lib/yaml/index.js +80 -0
- package/dist/lib/yaml/index.js.map +1 -0
- package/dist/lib/yaml/player.js +409 -0
- package/dist/lib/yaml/player.js.map +1 -0
- package/dist/lib/yaml/utils.js +128 -0
- package/dist/lib/yaml/utils.js.map +1 -0
- package/dist/types/agent/agent.d.ts +123 -0
- package/dist/types/agent/index.d.ts +9 -0
- package/dist/types/agent/task-cache.d.ts +38 -0
- package/dist/types/agent/tasks.d.ts +56 -0
- package/dist/types/agent/ui-utils.d.ts +11 -0
- package/dist/types/agent/utils.d.ts +49 -0
- package/dist/types/device/index.d.ts +55 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/types.d.ts +1 -0
- package/dist/types/yaml/builder.d.ts +2 -0
- package/dist/types/yaml/index.d.ts +3 -0
- package/dist/types/yaml/player.d.ts +34 -0
- package/dist/types/yaml/utils.d.ts +8 -0
- package/package.json +27 -3
|
@@ -0,0 +1,767 @@
|
|
|
1
|
+
import { elementByPositionWithElementInfo, findAllMidsceneLocatorField, resizeImageForUiTars, vlmPlanning } from "../ai-model/index.mjs";
|
|
2
|
+
import { Executor, plan as external_index_mjs_plan } from "../index.mjs";
|
|
3
|
+
import { sleep as external_utils_mjs_sleep } from "../utils.mjs";
|
|
4
|
+
import { NodeType } from "@midscene/shared/constants";
|
|
5
|
+
import { MIDSCENE_REPLANNING_CYCLE_LIMIT, getAIConfigInNumber } from "@midscene/shared/env";
|
|
6
|
+
import { getDebug } from "@midscene/shared/logger";
|
|
7
|
+
import { assert } from "@midscene/shared/utils";
|
|
8
|
+
import { taskTitleStr } from "./ui-utils.mjs";
|
|
9
|
+
import { matchElementFromCache, matchElementFromPlan, parsePrompt } from "./utils.mjs";
|
|
10
|
+
/**
 * Compiler helper used to initialise class fields.
 *
 * When `key` is already reachable on `obj` (own or inherited), an own data
 * property is defined explicitly so inherited setters are bypassed; otherwise
 * a plain assignment is enough.
 *
 * @param {object} obj - Target object.
 * @param {PropertyKey} key - Property name.
 * @param {*} value - Value to store.
 * @returns {object} The same `obj`, for chaining.
 */
function _define_property(obj, key, value) {
    if (!(key in obj)) {
        obj[key] = value;
        return obj;
    }
    const descriptor = {
        configurable: true,
        enumerable: true,
        writable: true,
        value
    };
    Object.defineProperty(obj, key, descriptor);
    return obj;
}
|
|
20
|
+
// Module-level debug logger, created under the 'device-task-executor' namespace.
const debug = getDebug('device-task-executor');
|
|
21
|
+
// Fallback replanning limit used by `action` when MIDSCENE_REPLANNING_CYCLE_LIMIT resolves to a falsy value.
const defaultReplanningCycleLimit = 10;
|
|
22
|
+
/**
 * Wrap a locate parameter into a `Locate` planning action.
 *
 * @param {string|object} param - A prompt string, or an already-detailed
 *   locate parameter object (used as-is, not copied).
 * @returns {{type: 'Locate', locate: object, param: object, thought: string}}
 *   A plan whose `locate` and `param` reference the same object.
 */
function locatePlanForLocate(param) {
    let locate = param;
    if (typeof param === 'string') {
        locate = {
            prompt: param
        };
    }
    return {
        type: 'Locate',
        locate,
        param: locate,
        thought: ''
    };
}
|
|
34
|
+
class PageTaskExecutor {
|
|
35
|
+
async recordScreenshot(timing) {
|
|
36
|
+
const base64 = await this.page.screenshotBase64();
|
|
37
|
+
const item = {
|
|
38
|
+
type: 'screenshot',
|
|
39
|
+
ts: Date.now(),
|
|
40
|
+
screenshot: base64,
|
|
41
|
+
timing
|
|
42
|
+
};
|
|
43
|
+
return item;
|
|
44
|
+
}
|
|
45
|
+
async getElementXpath(pageContext, element) {
|
|
46
|
+
var _element_attributes;
|
|
47
|
+
if (!this.page.getXpathsByPoint) return void debug('getXpathsByPoint is not supported for this page');
|
|
48
|
+
let elementId = null == element ? void 0 : element.id;
|
|
49
|
+
if ((null == element ? void 0 : element.isOrderSensitive) !== void 0) {
|
|
50
|
+
const xpaths = await this.page.getXpathsByPoint({
|
|
51
|
+
left: element.center[0],
|
|
52
|
+
top: element.center[1]
|
|
53
|
+
}, null == element ? void 0 : element.isOrderSensitive);
|
|
54
|
+
return xpaths;
|
|
55
|
+
}
|
|
56
|
+
if ((null == element ? void 0 : null == (_element_attributes = element.attributes) ? void 0 : _element_attributes.nodeType) === NodeType.POSITION) {
|
|
57
|
+
await this.insight.contextRetrieverFn('locate');
|
|
58
|
+
const info = elementByPositionWithElementInfo(pageContext.tree, {
|
|
59
|
+
x: element.center[0],
|
|
60
|
+
y: element.center[1]
|
|
61
|
+
}, {
|
|
62
|
+
requireStrictDistance: false,
|
|
63
|
+
filterPositionElements: true
|
|
64
|
+
});
|
|
65
|
+
if (null == info ? void 0 : info.id) elementId = info.id;
|
|
66
|
+
else debug('no element id found for position node, will not update cache', element);
|
|
67
|
+
}
|
|
68
|
+
if (!elementId) return;
|
|
69
|
+
try {
|
|
70
|
+
const result = await this.page.getXpathsById(elementId);
|
|
71
|
+
return result;
|
|
72
|
+
} catch (error) {
|
|
73
|
+
debug('getXpathsById error: ', error);
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
|
|
77
|
+
const taskWithScreenshot = {
|
|
78
|
+
...taskApply,
|
|
79
|
+
executor: async (param, context, ...args)=>{
|
|
80
|
+
const recorder = [];
|
|
81
|
+
const { task } = context;
|
|
82
|
+
task.recorder = recorder;
|
|
83
|
+
const shot = await this.recordScreenshot(`before ${task.type}`);
|
|
84
|
+
recorder.push(shot);
|
|
85
|
+
const result = await taskApply.executor(param, context, ...args);
|
|
86
|
+
if ('Action' === taskApply.type) await Promise.all([
|
|
87
|
+
(async ()=>{
|
|
88
|
+
await external_utils_mjs_sleep(100);
|
|
89
|
+
if (this.page.beforeAction) {
|
|
90
|
+
debug('will call "beforeAction" for page');
|
|
91
|
+
await this.page.beforeAction();
|
|
92
|
+
}
|
|
93
|
+
})(),
|
|
94
|
+
external_utils_mjs_sleep(200)
|
|
95
|
+
]);
|
|
96
|
+
if (appendAfterExecution) {
|
|
97
|
+
const shot2 = await this.recordScreenshot('after Action');
|
|
98
|
+
recorder.push(shot2);
|
|
99
|
+
}
|
|
100
|
+
return result;
|
|
101
|
+
}
|
|
102
|
+
};
|
|
103
|
+
return taskWithScreenshot;
|
|
104
|
+
}
|
|
105
|
+
async convertPlanToExecutable(plans) {
|
|
106
|
+
const tasks = [];
|
|
107
|
+
const taskForLocatePlan = (plan, detailedLocateParam, onResult)=>{
|
|
108
|
+
if ('string' == typeof detailedLocateParam) detailedLocateParam = {
|
|
109
|
+
prompt: detailedLocateParam
|
|
110
|
+
};
|
|
111
|
+
const taskFind = {
|
|
112
|
+
type: 'Insight',
|
|
113
|
+
subType: 'Locate',
|
|
114
|
+
param: detailedLocateParam,
|
|
115
|
+
thought: plan.thought,
|
|
116
|
+
executor: async (param, taskContext)=>{
|
|
117
|
+
var _this_taskCache, _locateCacheRecord_cacheContent;
|
|
118
|
+
const { task } = taskContext;
|
|
119
|
+
assert((null == param ? void 0 : param.prompt) || (null == param ? void 0 : param.id) || (null == param ? void 0 : param.bbox), `No prompt or id or position or bbox to locate, param=${JSON.stringify(param)}`);
|
|
120
|
+
let insightDump;
|
|
121
|
+
let usage;
|
|
122
|
+
const dumpCollector = (dump)=>{
|
|
123
|
+
var _dump_taskInfo;
|
|
124
|
+
insightDump = dump;
|
|
125
|
+
usage = null == dump ? void 0 : null == (_dump_taskInfo = dump.taskInfo) ? void 0 : _dump_taskInfo.usage;
|
|
126
|
+
task.log = {
|
|
127
|
+
dump: insightDump
|
|
128
|
+
};
|
|
129
|
+
task.usage = usage;
|
|
130
|
+
};
|
|
131
|
+
this.insight.onceDumpUpdatedFn = dumpCollector;
|
|
132
|
+
const shotTime = Date.now();
|
|
133
|
+
const pageContext = await this.insight.contextRetrieverFn('locate');
|
|
134
|
+
task.pageContext = pageContext;
|
|
135
|
+
const recordItem = {
|
|
136
|
+
type: 'screenshot',
|
|
137
|
+
ts: shotTime,
|
|
138
|
+
screenshot: pageContext.screenshotBase64,
|
|
139
|
+
timing: 'before Insight'
|
|
140
|
+
};
|
|
141
|
+
task.recorder = [
|
|
142
|
+
recordItem
|
|
143
|
+
];
|
|
144
|
+
const elementFromXpath = param.xpath && this.page.getElementInfoByXpath ? await this.page.getElementInfoByXpath(param.xpath) : void 0;
|
|
145
|
+
const userExpectedPathHitFlag = !!elementFromXpath;
|
|
146
|
+
const cachePrompt = param.prompt;
|
|
147
|
+
const locateCacheRecord = null == (_this_taskCache = this.taskCache) ? void 0 : _this_taskCache.matchLocateCache(cachePrompt);
|
|
148
|
+
const xpaths = null == locateCacheRecord ? void 0 : null == (_locateCacheRecord_cacheContent = locateCacheRecord.cacheContent) ? void 0 : _locateCacheRecord_cacheContent.xpaths;
|
|
149
|
+
const elementFromCache = userExpectedPathHitFlag ? null : await matchElementFromCache(this, xpaths, cachePrompt, param.cacheable);
|
|
150
|
+
const cacheHitFlag = !!elementFromCache;
|
|
151
|
+
const elementFromPlan = userExpectedPathHitFlag || cacheHitFlag ? void 0 : matchElementFromPlan(param, pageContext.tree);
|
|
152
|
+
const planHitFlag = !!elementFromPlan;
|
|
153
|
+
const elementFromAiLocate = userExpectedPathHitFlag || cacheHitFlag || planHitFlag ? void 0 : (await this.insight.locate(param, {
|
|
154
|
+
context: pageContext
|
|
155
|
+
})).element;
|
|
156
|
+
const aiLocateHitFlag = !!elementFromAiLocate;
|
|
157
|
+
const element = elementFromXpath || elementFromCache || elementFromPlan || elementFromAiLocate;
|
|
158
|
+
let currentXpaths;
|
|
159
|
+
if (element && this.taskCache && !cacheHitFlag && (null == param ? void 0 : param.cacheable) !== false) {
|
|
160
|
+
const elementXpaths = await this.getElementXpath(pageContext, element);
|
|
161
|
+
if (null == elementXpaths ? void 0 : elementXpaths.length) {
|
|
162
|
+
currentXpaths = elementXpaths;
|
|
163
|
+
this.taskCache.updateOrAppendCacheRecord({
|
|
164
|
+
type: 'locate',
|
|
165
|
+
prompt: cachePrompt,
|
|
166
|
+
xpaths: elementXpaths
|
|
167
|
+
}, locateCacheRecord);
|
|
168
|
+
} else debug('no xpaths found, will not update cache', cachePrompt, elementXpaths);
|
|
169
|
+
}
|
|
170
|
+
if (!element) throw new Error(`Element not found: ${param.prompt}`);
|
|
171
|
+
let hitBy;
|
|
172
|
+
if (userExpectedPathHitFlag) hitBy = {
|
|
173
|
+
from: 'User expected path',
|
|
174
|
+
context: {
|
|
175
|
+
xpath: param.xpath
|
|
176
|
+
}
|
|
177
|
+
};
|
|
178
|
+
else if (cacheHitFlag) hitBy = {
|
|
179
|
+
from: 'Cache',
|
|
180
|
+
context: {
|
|
181
|
+
xpathsFromCache: xpaths,
|
|
182
|
+
xpathsToSave: currentXpaths
|
|
183
|
+
}
|
|
184
|
+
};
|
|
185
|
+
else if (planHitFlag) hitBy = {
|
|
186
|
+
from: 'Planning',
|
|
187
|
+
context: {
|
|
188
|
+
id: null == elementFromPlan ? void 0 : elementFromPlan.id,
|
|
189
|
+
bbox: null == elementFromPlan ? void 0 : elementFromPlan.bbox
|
|
190
|
+
}
|
|
191
|
+
};
|
|
192
|
+
else if (aiLocateHitFlag) hitBy = {
|
|
193
|
+
from: 'AI model',
|
|
194
|
+
context: {
|
|
195
|
+
prompt: param.prompt
|
|
196
|
+
}
|
|
197
|
+
};
|
|
198
|
+
null == onResult || onResult(element);
|
|
199
|
+
return {
|
|
200
|
+
output: {
|
|
201
|
+
element
|
|
202
|
+
},
|
|
203
|
+
pageContext,
|
|
204
|
+
hitBy
|
|
205
|
+
};
|
|
206
|
+
}
|
|
207
|
+
};
|
|
208
|
+
return taskFind;
|
|
209
|
+
};
|
|
210
|
+
for (const plan of plans)if ('Locate' === plan.type) {
|
|
211
|
+
var _plan_locate, _plan_locate1;
|
|
212
|
+
if (!plan.locate || null === plan.locate || (null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.id) === null || (null == (_plan_locate1 = plan.locate) ? void 0 : _plan_locate1.id) === 'null') {
|
|
213
|
+
debug('Locate action with id is null, will be ignored', plan);
|
|
214
|
+
continue;
|
|
215
|
+
}
|
|
216
|
+
const taskLocate = taskForLocatePlan(plan, plan.locate);
|
|
217
|
+
tasks.push(taskLocate);
|
|
218
|
+
} else if ('Error' === plan.type) {
|
|
219
|
+
var _plan_param;
|
|
220
|
+
const taskActionError = {
|
|
221
|
+
type: 'Action',
|
|
222
|
+
subType: 'Error',
|
|
223
|
+
param: plan.param,
|
|
224
|
+
thought: plan.thought || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought),
|
|
225
|
+
locate: plan.locate,
|
|
226
|
+
executor: async ()=>{
|
|
227
|
+
var _plan_param;
|
|
228
|
+
throw new Error((null == plan ? void 0 : plan.thought) || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought) || 'error without thought');
|
|
229
|
+
}
|
|
230
|
+
};
|
|
231
|
+
tasks.push(taskActionError);
|
|
232
|
+
} else if ('Finished' === plan.type) {
|
|
233
|
+
const taskActionFinished = {
|
|
234
|
+
type: 'Action',
|
|
235
|
+
subType: 'Finished',
|
|
236
|
+
param: null,
|
|
237
|
+
thought: plan.thought,
|
|
238
|
+
locate: plan.locate,
|
|
239
|
+
executor: async (param)=>{}
|
|
240
|
+
};
|
|
241
|
+
tasks.push(taskActionFinished);
|
|
242
|
+
} else if ('Sleep' === plan.type) {
|
|
243
|
+
const taskActionSleep = {
|
|
244
|
+
type: 'Action',
|
|
245
|
+
subType: 'Sleep',
|
|
246
|
+
param: plan.param,
|
|
247
|
+
thought: plan.thought,
|
|
248
|
+
locate: plan.locate,
|
|
249
|
+
executor: async (taskParam)=>{
|
|
250
|
+
await external_utils_mjs_sleep((null == taskParam ? void 0 : taskParam.timeMs) || 3000);
|
|
251
|
+
}
|
|
252
|
+
};
|
|
253
|
+
tasks.push(taskActionSleep);
|
|
254
|
+
} else if ('Drag' === plan.type) {
|
|
255
|
+
const taskActionDrag = {
|
|
256
|
+
type: 'Action',
|
|
257
|
+
subType: 'Drag',
|
|
258
|
+
param: plan.param,
|
|
259
|
+
thought: plan.thought,
|
|
260
|
+
locate: plan.locate,
|
|
261
|
+
executor: async (taskParam)=>{
|
|
262
|
+
assert((null == taskParam ? void 0 : taskParam.start_box) && (null == taskParam ? void 0 : taskParam.end_box), 'No start_box or end_box to drag');
|
|
263
|
+
await this.page.mouse.drag(taskParam.start_box, taskParam.end_box);
|
|
264
|
+
}
|
|
265
|
+
};
|
|
266
|
+
tasks.push(taskActionDrag);
|
|
267
|
+
} else {
|
|
268
|
+
const planType = plan.type;
|
|
269
|
+
const actionSpace = await this.page.actionSpace();
|
|
270
|
+
const action = actionSpace.find((action)=>action.name === planType);
|
|
271
|
+
const param = plan.param;
|
|
272
|
+
if (!action) throw new Error(`Action type '${planType}' not found`);
|
|
273
|
+
const locateFields = action ? findAllMidsceneLocatorField(action.paramSchema) : [];
|
|
274
|
+
const requiredLocateFields = action ? findAllMidsceneLocatorField(action.paramSchema, true) : [];
|
|
275
|
+
locateFields.forEach((field)=>{
|
|
276
|
+
if (param[field]) {
|
|
277
|
+
const locatePlan = locatePlanForLocate(param[field]);
|
|
278
|
+
debug('will prepend locate param for field', `action.type=${planType}`, `param=${JSON.stringify(param[field])}`, `locatePlan=${JSON.stringify(locatePlan)}`);
|
|
279
|
+
const locateTask = taskForLocatePlan(locatePlan, param[field], (result)=>{
|
|
280
|
+
param[field] = result;
|
|
281
|
+
});
|
|
282
|
+
tasks.push(locateTask);
|
|
283
|
+
} else {
|
|
284
|
+
assert(!requiredLocateFields.includes(field), `Required locate field '${field}' is not provided for action ${planType}`);
|
|
285
|
+
debug(`field '${field}' is not provided for action ${planType}`);
|
|
286
|
+
}
|
|
287
|
+
});
|
|
288
|
+
const task = {
|
|
289
|
+
type: 'Action',
|
|
290
|
+
subType: planType,
|
|
291
|
+
thought: plan.thought,
|
|
292
|
+
param: plan.param,
|
|
293
|
+
executor: async (param, context)=>{
|
|
294
|
+
var _context_element;
|
|
295
|
+
debug('executing action', planType, param, `context.element.center: ${null == (_context_element = context.element) ? void 0 : _context_element.center}`);
|
|
296
|
+
const pageContext = await this.insight.contextRetrieverFn('locate');
|
|
297
|
+
context.task.pageContext = pageContext;
|
|
298
|
+
requiredLocateFields.forEach((field)=>{
|
|
299
|
+
assert(param[field], `field '${field}' is required for action ${planType} but not provided. Cannot execute action ${planType}.`);
|
|
300
|
+
});
|
|
301
|
+
const actionFn = action.call.bind(this.page);
|
|
302
|
+
await actionFn(param, context);
|
|
303
|
+
return {
|
|
304
|
+
output: {
|
|
305
|
+
success: true,
|
|
306
|
+
action: planType,
|
|
307
|
+
param: param
|
|
308
|
+
}
|
|
309
|
+
};
|
|
310
|
+
}
|
|
311
|
+
};
|
|
312
|
+
tasks.push(task);
|
|
313
|
+
}
|
|
314
|
+
const wrappedTasks = tasks.map((task, index)=>{
|
|
315
|
+
if ('Action' === task.type) return this.prependExecutorWithScreenshot(task, index === tasks.length - 1);
|
|
316
|
+
return task;
|
|
317
|
+
});
|
|
318
|
+
return {
|
|
319
|
+
tasks: wrappedTasks
|
|
320
|
+
};
|
|
321
|
+
}
|
|
322
|
+
async setupPlanningContext(executorContext) {
|
|
323
|
+
const shotTime = Date.now();
|
|
324
|
+
const pageContext = await this.insight.contextRetrieverFn('locate');
|
|
325
|
+
const recordItem = {
|
|
326
|
+
type: 'screenshot',
|
|
327
|
+
ts: shotTime,
|
|
328
|
+
screenshot: pageContext.screenshotBase64,
|
|
329
|
+
timing: 'before Planning'
|
|
330
|
+
};
|
|
331
|
+
executorContext.task.recorder = [
|
|
332
|
+
recordItem
|
|
333
|
+
];
|
|
334
|
+
executorContext.task.pageContext = pageContext;
|
|
335
|
+
return {
|
|
336
|
+
pageContext
|
|
337
|
+
};
|
|
338
|
+
}
|
|
339
|
+
async loadYamlFlowAsPlanning(userInstruction, yamlString) {
|
|
340
|
+
const taskExecutor = new Executor(taskTitleStr('Action', userInstruction), {
|
|
341
|
+
onTaskStart: this.onTaskStartCallback
|
|
342
|
+
});
|
|
343
|
+
const task = {
|
|
344
|
+
type: 'Planning',
|
|
345
|
+
subType: 'LoadYaml',
|
|
346
|
+
locate: null,
|
|
347
|
+
param: {
|
|
348
|
+
userInstruction
|
|
349
|
+
},
|
|
350
|
+
executor: async (param, executorContext)=>{
|
|
351
|
+
await this.setupPlanningContext(executorContext);
|
|
352
|
+
return {
|
|
353
|
+
output: {
|
|
354
|
+
actions: [],
|
|
355
|
+
more_actions_needed_by_instruction: false,
|
|
356
|
+
log: '',
|
|
357
|
+
yamlString
|
|
358
|
+
},
|
|
359
|
+
cache: {
|
|
360
|
+
hit: true
|
|
361
|
+
},
|
|
362
|
+
hitBy: {
|
|
363
|
+
from: 'Cache',
|
|
364
|
+
context: {
|
|
365
|
+
yamlString
|
|
366
|
+
}
|
|
367
|
+
}
|
|
368
|
+
};
|
|
369
|
+
}
|
|
370
|
+
};
|
|
371
|
+
await taskExecutor.append(task);
|
|
372
|
+
await taskExecutor.flush();
|
|
373
|
+
return {
|
|
374
|
+
executor: taskExecutor
|
|
375
|
+
};
|
|
376
|
+
}
|
|
377
|
+
planningTaskFromPrompt(userInstruction, log, actionContext) {
|
|
378
|
+
const task = {
|
|
379
|
+
type: 'Planning',
|
|
380
|
+
subType: 'Plan',
|
|
381
|
+
locate: null,
|
|
382
|
+
param: {
|
|
383
|
+
userInstruction,
|
|
384
|
+
log
|
|
385
|
+
},
|
|
386
|
+
executor: async (param, executorContext)=>{
|
|
387
|
+
const startTime = Date.now();
|
|
388
|
+
const { pageContext } = await this.setupPlanningContext(executorContext);
|
|
389
|
+
assert(this.page.actionSpace, 'actionSpace for device is not implemented');
|
|
390
|
+
const actionSpace = await this.page.actionSpace();
|
|
391
|
+
debug('actionSpace for page is:', actionSpace.map((action)=>action.name).join(', '));
|
|
392
|
+
assert(Array.isArray(actionSpace), 'actionSpace must be an array');
|
|
393
|
+
if (0 === actionSpace.length) console.warn(`ActionSpace for ${this.page.pageType} is empty. This may lead to unexpected behavior.`);
|
|
394
|
+
const planResult = await external_index_mjs_plan(param.userInstruction, {
|
|
395
|
+
context: pageContext,
|
|
396
|
+
log: param.log,
|
|
397
|
+
actionContext,
|
|
398
|
+
pageType: this.page.pageType,
|
|
399
|
+
actionSpace
|
|
400
|
+
});
|
|
401
|
+
const { actions, log, more_actions_needed_by_instruction, error, usage, rawResponse, sleep } = planResult;
|
|
402
|
+
executorContext.task.log = {
|
|
403
|
+
...executorContext.task.log || {},
|
|
404
|
+
rawResponse
|
|
405
|
+
};
|
|
406
|
+
executorContext.task.usage = usage;
|
|
407
|
+
const finalActions = actions || [];
|
|
408
|
+
if (sleep) {
|
|
409
|
+
const timeNow = Date.now();
|
|
410
|
+
const timeRemaining = sleep - (timeNow - startTime);
|
|
411
|
+
if (timeRemaining > 0) finalActions.push({
|
|
412
|
+
type: 'Sleep',
|
|
413
|
+
param: {
|
|
414
|
+
timeMs: timeRemaining
|
|
415
|
+
},
|
|
416
|
+
locate: null
|
|
417
|
+
});
|
|
418
|
+
}
|
|
419
|
+
if (0 === finalActions.length) assert(!more_actions_needed_by_instruction || sleep, error ? `Failed to plan: ${error}` : 'No plan found');
|
|
420
|
+
return {
|
|
421
|
+
output: {
|
|
422
|
+
actions: finalActions,
|
|
423
|
+
more_actions_needed_by_instruction,
|
|
424
|
+
log,
|
|
425
|
+
yamlFlow: planResult.yamlFlow
|
|
426
|
+
},
|
|
427
|
+
cache: {
|
|
428
|
+
hit: false
|
|
429
|
+
},
|
|
430
|
+
pageContext
|
|
431
|
+
};
|
|
432
|
+
}
|
|
433
|
+
};
|
|
434
|
+
return task;
|
|
435
|
+
}
|
|
436
|
+
planningTaskToGoal(userInstruction, modelPreferences) {
|
|
437
|
+
const task = {
|
|
438
|
+
type: 'Planning',
|
|
439
|
+
subType: 'Plan',
|
|
440
|
+
locate: null,
|
|
441
|
+
param: {
|
|
442
|
+
userInstruction
|
|
443
|
+
},
|
|
444
|
+
executor: async (param, executorContext)=>{
|
|
445
|
+
var _actions_;
|
|
446
|
+
const { pageContext } = await this.setupPlanningContext(executorContext);
|
|
447
|
+
const imagePayload = await resizeImageForUiTars(pageContext.screenshotBase64, pageContext.size, modelPreferences);
|
|
448
|
+
this.appendConversationHistory({
|
|
449
|
+
role: 'user',
|
|
450
|
+
content: [
|
|
451
|
+
{
|
|
452
|
+
type: 'image_url',
|
|
453
|
+
image_url: {
|
|
454
|
+
url: imagePayload
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
]
|
|
458
|
+
});
|
|
459
|
+
const planResult = await vlmPlanning({
|
|
460
|
+
userInstruction: param.userInstruction,
|
|
461
|
+
conversationHistory: this.conversationHistory,
|
|
462
|
+
size: pageContext.size,
|
|
463
|
+
modelPreferences
|
|
464
|
+
});
|
|
465
|
+
const { actions, action_summary, usage } = planResult;
|
|
466
|
+
executorContext.task.log = {
|
|
467
|
+
...executorContext.task.log || {},
|
|
468
|
+
rawResponse: planResult.rawResponse
|
|
469
|
+
};
|
|
470
|
+
executorContext.task.usage = usage;
|
|
471
|
+
this.appendConversationHistory({
|
|
472
|
+
role: 'assistant',
|
|
473
|
+
content: action_summary
|
|
474
|
+
});
|
|
475
|
+
return {
|
|
476
|
+
output: {
|
|
477
|
+
actions,
|
|
478
|
+
thought: null == (_actions_ = actions[0]) ? void 0 : _actions_.thought,
|
|
479
|
+
actionType: actions[0].type,
|
|
480
|
+
more_actions_needed_by_instruction: true,
|
|
481
|
+
log: '',
|
|
482
|
+
yamlFlow: planResult.yamlFlow
|
|
483
|
+
},
|
|
484
|
+
cache: {
|
|
485
|
+
hit: false
|
|
486
|
+
}
|
|
487
|
+
};
|
|
488
|
+
}
|
|
489
|
+
};
|
|
490
|
+
return task;
|
|
491
|
+
}
|
|
492
|
+
async runPlans(title, plans) {
|
|
493
|
+
const taskExecutor = new Executor(title, {
|
|
494
|
+
onTaskStart: this.onTaskStartCallback
|
|
495
|
+
});
|
|
496
|
+
const { tasks } = await this.convertPlanToExecutable(plans);
|
|
497
|
+
await taskExecutor.append(tasks);
|
|
498
|
+
const result = await taskExecutor.flush();
|
|
499
|
+
const { output } = result;
|
|
500
|
+
return {
|
|
501
|
+
output,
|
|
502
|
+
executor: taskExecutor
|
|
503
|
+
};
|
|
504
|
+
}
|
|
505
|
+
async action(userPrompt, actionContext) {
|
|
506
|
+
const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
|
|
507
|
+
onTaskStart: this.onTaskStartCallback
|
|
508
|
+
});
|
|
509
|
+
let planningTask = this.planningTaskFromPrompt(userPrompt, void 0, actionContext);
|
|
510
|
+
let replanCount = 0;
|
|
511
|
+
const logList = [];
|
|
512
|
+
const yamlFlow = [];
|
|
513
|
+
const replanningCycleLimit = getAIConfigInNumber(MIDSCENE_REPLANNING_CYCLE_LIMIT) || defaultReplanningCycleLimit;
|
|
514
|
+
while(planningTask){
|
|
515
|
+
if (replanCount > replanningCycleLimit) {
|
|
516
|
+
const errorMsg = 'Replanning too many times, please split the task into multiple steps';
|
|
517
|
+
return this.appendErrorPlan(taskExecutor, errorMsg);
|
|
518
|
+
}
|
|
519
|
+
await taskExecutor.append(planningTask);
|
|
520
|
+
const result = await taskExecutor.flush();
|
|
521
|
+
const planResult = null == result ? void 0 : result.output;
|
|
522
|
+
if (taskExecutor.isInErrorState()) return {
|
|
523
|
+
output: planResult,
|
|
524
|
+
executor: taskExecutor
|
|
525
|
+
};
|
|
526
|
+
const plans = planResult.actions || [];
|
|
527
|
+
yamlFlow.push(...planResult.yamlFlow || []);
|
|
528
|
+
let executables;
|
|
529
|
+
try {
|
|
530
|
+
executables = await this.convertPlanToExecutable(plans);
|
|
531
|
+
taskExecutor.append(executables.tasks);
|
|
532
|
+
} catch (error) {
|
|
533
|
+
return this.appendErrorPlan(taskExecutor, `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
|
|
534
|
+
}
|
|
535
|
+
await taskExecutor.flush();
|
|
536
|
+
if (taskExecutor.isInErrorState()) return {
|
|
537
|
+
output: void 0,
|
|
538
|
+
executor: taskExecutor
|
|
539
|
+
};
|
|
540
|
+
if (null == planResult ? void 0 : planResult.log) logList.push(planResult.log);
|
|
541
|
+
if (!planResult.more_actions_needed_by_instruction) {
|
|
542
|
+
planningTask = null;
|
|
543
|
+
break;
|
|
544
|
+
}
|
|
545
|
+
planningTask = this.planningTaskFromPrompt(userPrompt, logList.length > 0 ? `- ${logList.join('\n- ')}` : void 0, actionContext);
|
|
546
|
+
replanCount++;
|
|
547
|
+
}
|
|
548
|
+
return {
|
|
549
|
+
output: {
|
|
550
|
+
yamlFlow
|
|
551
|
+
},
|
|
552
|
+
executor: taskExecutor
|
|
553
|
+
};
|
|
554
|
+
}
|
|
555
|
+
async actionToGoal(userPrompt) {
|
|
556
|
+
const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
|
|
557
|
+
onTaskStart: this.onTaskStartCallback
|
|
558
|
+
});
|
|
559
|
+
this.conversationHistory = [];
|
|
560
|
+
const isCompleted = false;
|
|
561
|
+
let currentActionCount = 0;
|
|
562
|
+
const maxActionNumber = 40;
|
|
563
|
+
const yamlFlow = [];
|
|
564
|
+
while(!isCompleted && currentActionCount < maxActionNumber){
|
|
565
|
+
currentActionCount++;
|
|
566
|
+
debug('actionToGoal, currentActionCount:', currentActionCount, 'userPrompt:', userPrompt);
|
|
567
|
+
const planningTask = this.planningTaskToGoal(userPrompt, {
|
|
568
|
+
intent: 'planning'
|
|
569
|
+
});
|
|
570
|
+
await taskExecutor.append(planningTask);
|
|
571
|
+
const result = await taskExecutor.flush();
|
|
572
|
+
if (taskExecutor.isInErrorState()) return {
|
|
573
|
+
output: void 0,
|
|
574
|
+
executor: taskExecutor
|
|
575
|
+
};
|
|
576
|
+
if (!result) throw new Error('result of taskExecutor.flush() is undefined in function actionToGoal');
|
|
577
|
+
const { output } = result;
|
|
578
|
+
const plans = output.actions;
|
|
579
|
+
yamlFlow.push(...output.yamlFlow || []);
|
|
580
|
+
let executables;
|
|
581
|
+
try {
|
|
582
|
+
executables = await this.convertPlanToExecutable(plans);
|
|
583
|
+
taskExecutor.append(executables.tasks);
|
|
584
|
+
} catch (error) {
|
|
585
|
+
return this.appendErrorPlan(taskExecutor, `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
|
|
586
|
+
}
|
|
587
|
+
await taskExecutor.flush();
|
|
588
|
+
if (taskExecutor.isInErrorState()) return {
|
|
589
|
+
output: void 0,
|
|
590
|
+
executor: taskExecutor
|
|
591
|
+
};
|
|
592
|
+
if ('Finished' === plans[0].type) break;
|
|
593
|
+
}
|
|
594
|
+
return {
|
|
595
|
+
output: {
|
|
596
|
+
yamlFlow
|
|
597
|
+
},
|
|
598
|
+
executor: taskExecutor
|
|
599
|
+
};
|
|
600
|
+
}
|
|
601
|
+
createTypeQueryTask(type, demand, opt, multimodalPrompt) {
|
|
602
|
+
const queryTask = {
|
|
603
|
+
type: 'Insight',
|
|
604
|
+
subType: type,
|
|
605
|
+
locate: null,
|
|
606
|
+
param: {
|
|
607
|
+
dataDemand: multimodalPrompt ? {
|
|
608
|
+
demand,
|
|
609
|
+
multimodalPrompt
|
|
610
|
+
} : demand
|
|
611
|
+
},
|
|
612
|
+
executor: async (param, taskContext)=>{
|
|
613
|
+
const { task } = taskContext;
|
|
614
|
+
let insightDump;
|
|
615
|
+
const dumpCollector = (dump)=>{
|
|
616
|
+
insightDump = dump;
|
|
617
|
+
};
|
|
618
|
+
this.insight.onceDumpUpdatedFn = dumpCollector;
|
|
619
|
+
const shotTime = Date.now();
|
|
620
|
+
const pageContext = await this.insight.contextRetrieverFn('extract');
|
|
621
|
+
task.pageContext = pageContext;
|
|
622
|
+
const recordItem = {
|
|
623
|
+
type: 'screenshot',
|
|
624
|
+
ts: shotTime,
|
|
625
|
+
screenshot: pageContext.screenshotBase64,
|
|
626
|
+
timing: 'before Extract'
|
|
627
|
+
};
|
|
628
|
+
task.recorder = [
|
|
629
|
+
recordItem
|
|
630
|
+
];
|
|
631
|
+
const ifTypeRestricted = 'Query' !== type;
|
|
632
|
+
let demandInput = demand;
|
|
633
|
+
if (ifTypeRestricted) {
|
|
634
|
+
const returnType = 'Assert' === type ? 'Boolean' : type;
|
|
635
|
+
demandInput = {
|
|
636
|
+
result: `${returnType}, ${demand}`
|
|
637
|
+
};
|
|
638
|
+
}
|
|
639
|
+
const { data, usage, thought } = await this.insight.extract(demandInput, opt, multimodalPrompt);
|
|
640
|
+
let outputResult = data;
|
|
641
|
+
if (ifTypeRestricted) {
|
|
642
|
+
assert((null == data ? void 0 : data.result) !== void 0, 'No result in query data');
|
|
643
|
+
outputResult = data.result;
|
|
644
|
+
}
|
|
645
|
+
return {
|
|
646
|
+
output: outputResult,
|
|
647
|
+
log: {
|
|
648
|
+
dump: insightDump,
|
|
649
|
+
isWaitForAssert: null == opt ? void 0 : opt.isWaitForAssert
|
|
650
|
+
},
|
|
651
|
+
usage,
|
|
652
|
+
thought
|
|
653
|
+
};
|
|
654
|
+
}
|
|
655
|
+
};
|
|
656
|
+
return queryTask;
|
|
657
|
+
}
|
|
658
|
+
async createTypeQueryExecution(type, demand, opt, multimodalPrompt) {
|
|
659
|
+
const taskExecutor = new Executor(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)), {
|
|
660
|
+
onTaskStart: this.onTaskStartCallback
|
|
661
|
+
});
|
|
662
|
+
const queryTask = await this.createTypeQueryTask(type, demand, opt, multimodalPrompt);
|
|
663
|
+
await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
|
|
664
|
+
const result = await taskExecutor.flush();
|
|
665
|
+
if (!result) throw new Error('result of taskExecutor.flush() is undefined in function createTypeQueryTask');
|
|
666
|
+
const { output, thought } = result;
|
|
667
|
+
return {
|
|
668
|
+
output,
|
|
669
|
+
thought,
|
|
670
|
+
executor: taskExecutor
|
|
671
|
+
};
|
|
672
|
+
}
|
|
673
|
+
async assert(assertion, opt) {
|
|
674
|
+
const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
|
|
675
|
+
return await this.createTypeQueryExecution('Assert', textPrompt, opt, multimodalPrompt);
|
|
676
|
+
}
|
|
677
|
+
appendConversationHistory(conversationHistory) {
|
|
678
|
+
if ('user' === conversationHistory.role) {
|
|
679
|
+
const userImgItems = this.conversationHistory.filter((item)=>'user' === item.role);
|
|
680
|
+
if (userImgItems.length >= 4 && 'user' === conversationHistory.role) {
|
|
681
|
+
const firstUserImgIndex = this.conversationHistory.findIndex((item)=>'user' === item.role);
|
|
682
|
+
if (firstUserImgIndex >= 0) this.conversationHistory.splice(firstUserImgIndex, 1);
|
|
683
|
+
}
|
|
684
|
+
}
|
|
685
|
+
this.conversationHistory.push(conversationHistory);
|
|
686
|
+
}
|
|
687
|
+
async appendErrorPlan(taskExecutor, errorMsg) {
|
|
688
|
+
const errorPlan = {
|
|
689
|
+
type: 'Error',
|
|
690
|
+
param: {
|
|
691
|
+
thought: errorMsg
|
|
692
|
+
},
|
|
693
|
+
locate: null
|
|
694
|
+
};
|
|
695
|
+
const { tasks } = await this.convertPlanToExecutable([
|
|
696
|
+
errorPlan
|
|
697
|
+
]);
|
|
698
|
+
await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
|
|
699
|
+
await taskExecutor.flush();
|
|
700
|
+
return {
|
|
701
|
+
output: void 0,
|
|
702
|
+
executor: taskExecutor
|
|
703
|
+
};
|
|
704
|
+
}
|
|
705
|
+
async waitFor(assertion, opt) {
|
|
706
|
+
const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
|
|
707
|
+
const description = `waitFor: ${textPrompt}`;
|
|
708
|
+
const taskExecutor = new Executor(taskTitleStr('WaitFor', description), {
|
|
709
|
+
onTaskStart: this.onTaskStartCallback
|
|
710
|
+
});
|
|
711
|
+
const { timeoutMs, checkIntervalMs } = opt;
|
|
712
|
+
assert(assertion, 'No assertion for waitFor');
|
|
713
|
+
assert(timeoutMs, 'No timeoutMs for waitFor');
|
|
714
|
+
assert(checkIntervalMs, 'No checkIntervalMs for waitFor');
|
|
715
|
+
assert(checkIntervalMs <= timeoutMs, `wrong config for waitFor: checkIntervalMs must be less than timeoutMs, config: {checkIntervalMs: ${checkIntervalMs}, timeoutMs: ${timeoutMs}}`);
|
|
716
|
+
const overallStartTime = Date.now();
|
|
717
|
+
let startTime = Date.now();
|
|
718
|
+
let errorThought = '';
|
|
719
|
+
while(Date.now() - overallStartTime < timeoutMs){
|
|
720
|
+
startTime = Date.now();
|
|
721
|
+
const queryTask = await this.createTypeQueryTask('Assert', textPrompt, {
|
|
722
|
+
isWaitForAssert: true,
|
|
723
|
+
returnThought: true,
|
|
724
|
+
doNotThrowError: true
|
|
725
|
+
}, multimodalPrompt);
|
|
726
|
+
await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
|
|
727
|
+
const result = await taskExecutor.flush();
|
|
728
|
+
if (!result) throw new Error('result of taskExecutor.flush() is undefined in function waitFor');
|
|
729
|
+
if (null == result ? void 0 : result.output) return {
|
|
730
|
+
output: void 0,
|
|
731
|
+
executor: taskExecutor
|
|
732
|
+
};
|
|
733
|
+
errorThought = (null == result ? void 0 : result.thought) || `unknown error when waiting for assertion: ${textPrompt}`;
|
|
734
|
+
const now = Date.now();
|
|
735
|
+
if (now - startTime < checkIntervalMs) {
|
|
736
|
+
const timeRemaining = checkIntervalMs - (now - startTime);
|
|
737
|
+
const sleepPlan = {
|
|
738
|
+
type: 'Sleep',
|
|
739
|
+
param: {
|
|
740
|
+
timeMs: timeRemaining
|
|
741
|
+
},
|
|
742
|
+
locate: null
|
|
743
|
+
};
|
|
744
|
+
const { tasks: sleepTasks } = await this.convertPlanToExecutable([
|
|
745
|
+
sleepPlan
|
|
746
|
+
]);
|
|
747
|
+
await taskExecutor.append(this.prependExecutorWithScreenshot(sleepTasks[0]));
|
|
748
|
+
await taskExecutor.flush();
|
|
749
|
+
}
|
|
750
|
+
}
|
|
751
|
+
return this.appendErrorPlan(taskExecutor, `waitFor timeout: ${errorThought}`);
|
|
752
|
+
}
|
|
753
|
+
constructor(page, insight, opts){
|
|
754
|
+
_define_property(this, "page", void 0);
|
|
755
|
+
_define_property(this, "insight", void 0);
|
|
756
|
+
_define_property(this, "taskCache", void 0);
|
|
757
|
+
_define_property(this, "conversationHistory", []);
|
|
758
|
+
_define_property(this, "onTaskStartCallback", void 0);
|
|
759
|
+
this.page = page;
|
|
760
|
+
this.insight = insight;
|
|
761
|
+
this.taskCache = opts.taskCache;
|
|
762
|
+
this.onTaskStartCallback = null == opts ? void 0 : opts.onTaskStart;
|
|
763
|
+
}
|
|
764
|
+
}
|
|
765
|
+
export { PageTaskExecutor, locatePlanForLocate };
|
|
766
|
+
|
|
767
|
+
//# sourceMappingURL=tasks.mjs.map
|