@midscene/core 0.30.6-beta-20251022093704.0 → 1.0.1-beta-20251022061922.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +41 -33
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/task-builder.mjs +315 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +80 -405
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/ui-utils.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +6 -6
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/common.mjs +1 -15
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/inspect.mjs +2 -3
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +11 -30
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +3 -204
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +101 -231
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/index.mjs +3 -2
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +18 -19
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/insight/utils.mjs +3 -3
- package/dist/es/insight/utils.mjs.map +1 -1
- package/dist/es/report.mjs.map +1 -1
- package/dist/es/{ai-model/action-executor.mjs → task-runner.mjs} +81 -10
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/types.mjs +18 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +18 -14
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +41 -33
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/task-builder.js +352 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/tasks.js +80 -405
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/ui-utils.js.map +1 -1
- package/dist/lib/agent/utils.js +6 -6
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/common.js +2 -19
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +1 -2
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +10 -29
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +2 -206
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +236 -384
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/index.js +9 -5
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +17 -18
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/insight/utils.js +5 -5
- package/dist/lib/insight/utils.js.map +1 -1
- package/dist/lib/report.js.map +1 -1
- package/dist/lib/{ai-model/action-executor.js → task-runner.js} +83 -12
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/types.js +22 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +18 -14
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +16 -0
- package/dist/types/agent/execution-session.d.ts +27 -0
- package/dist/types/agent/task-builder.d.ts +34 -0
- package/dist/types/agent/tasks.d.ts +14 -13
- package/dist/types/agent/ui-utils.d.ts +2 -2
- package/dist/types/agent/utils.d.ts +6 -2
- package/dist/types/ai-model/common.d.ts +0 -1
- package/dist/types/ai-model/prompt/llm-locator.d.ts +0 -2
- package/dist/types/device/index.d.ts +20 -20
- package/dist/types/index.d.ts +4 -3
- package/dist/types/insight/index.d.ts +5 -10
- package/dist/types/insight/utils.d.ts +2 -2
- package/dist/types/task-runner.d.ts +31 -0
- package/dist/types/types.d.ts +53 -14
- package/dist/types/yaml.d.ts +3 -1
- package/package.json +4 -7
- package/dist/es/ai-model/action-executor.mjs.map +0 -1
- package/dist/lib/ai-model/action-executor.js.map +0 -1
- package/dist/types/ai-model/action-executor.d.ts +0 -19
package/dist/es/agent/tasks.mjs
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import { ConversationHistory,
|
|
2
|
-
import {
|
|
3
|
-
import { sleep as external_utils_mjs_sleep } from "../utils.mjs";
|
|
1
|
+
import { ConversationHistory, plan, uiTarsPlanning } from "../ai-model/index.mjs";
|
|
2
|
+
import { InsightError } from "../types.mjs";
|
|
4
3
|
import { MIDSCENE_REPLANNING_CYCLE_LIMIT, globalConfigManager } from "@midscene/shared/env";
|
|
5
4
|
import { getDebug } from "@midscene/shared/logger";
|
|
6
5
|
import { assert } from "@midscene/shared/utils";
|
|
6
|
+
import { ExecutionSession } from "./execution-session.mjs";
|
|
7
|
+
import { TaskBuilder, locatePlanForLocate } from "./task-builder.mjs";
|
|
7
8
|
import { taskTitleStr } from "./ui-utils.mjs";
|
|
8
|
-
import {
|
|
9
|
+
import { parsePrompt } from "./utils.mjs";
|
|
9
10
|
function _define_property(obj, key, value) {
|
|
10
11
|
if (key in obj) Object.defineProperty(obj, key, {
|
|
11
12
|
value: value,
|
|
@@ -19,325 +20,21 @@ function _define_property(obj, key, value) {
|
|
|
19
20
|
const debug = getDebug('device-task-executor');
|
|
20
21
|
const defaultReplanningCycleLimit = 10;
|
|
21
22
|
const defaultVlmUiTarsReplanningCycleLimit = 40;
|
|
22
|
-
function locatePlanForLocate(param) {
|
|
23
|
-
const locate = 'string' == typeof param ? {
|
|
24
|
-
prompt: param
|
|
25
|
-
} : param;
|
|
26
|
-
const locatePlan = {
|
|
27
|
-
type: 'Locate',
|
|
28
|
-
locate,
|
|
29
|
-
param: locate,
|
|
30
|
-
thought: ''
|
|
31
|
-
};
|
|
32
|
-
return locatePlan;
|
|
33
|
-
}
|
|
34
23
|
class TaskExecutor {
|
|
35
24
|
get page() {
|
|
36
25
|
return this.interface;
|
|
37
26
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
ts: Date.now(),
|
|
43
|
-
screenshot: base64,
|
|
44
|
-
timing
|
|
45
|
-
};
|
|
46
|
-
return item;
|
|
47
|
-
}
|
|
48
|
-
prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
|
|
49
|
-
const taskWithScreenshot = {
|
|
50
|
-
...taskApply,
|
|
51
|
-
executor: async (param, context, ...args)=>{
|
|
52
|
-
const recorder = [];
|
|
53
|
-
const { task } = context;
|
|
54
|
-
task.recorder = recorder;
|
|
55
|
-
const shot = await this.recordScreenshot(`before ${task.type}`);
|
|
56
|
-
recorder.push(shot);
|
|
57
|
-
const result = await taskApply.executor(param, context, ...args);
|
|
58
|
-
if (appendAfterExecution) {
|
|
59
|
-
const shot2 = await this.recordScreenshot('after Action');
|
|
60
|
-
recorder.push(shot2);
|
|
61
|
-
}
|
|
62
|
-
return result;
|
|
63
|
-
}
|
|
64
|
-
};
|
|
65
|
-
return taskWithScreenshot;
|
|
66
|
-
}
|
|
67
|
-
async convertPlanToExecutable(plans, modelConfig, cacheable) {
|
|
68
|
-
const tasks = [];
|
|
69
|
-
const taskForLocatePlan = (plan, detailedLocateParam, onResult)=>{
|
|
70
|
-
if ('string' == typeof detailedLocateParam) detailedLocateParam = {
|
|
71
|
-
prompt: detailedLocateParam
|
|
72
|
-
};
|
|
73
|
-
if (void 0 !== cacheable) detailedLocateParam = {
|
|
74
|
-
...detailedLocateParam,
|
|
75
|
-
cacheable
|
|
76
|
-
};
|
|
77
|
-
const taskFind = {
|
|
78
|
-
type: 'Insight',
|
|
79
|
-
subType: 'Locate',
|
|
80
|
-
param: detailedLocateParam,
|
|
81
|
-
thought: plan.thought,
|
|
82
|
-
executor: async (param, taskContext)=>{
|
|
83
|
-
var _this_taskCache, _locateCacheRecord_cacheContent;
|
|
84
|
-
const { task } = taskContext;
|
|
85
|
-
assert((null == param ? void 0 : param.prompt) || (null == param ? void 0 : param.id) || (null == param ? void 0 : param.bbox), `No prompt or id or position or bbox to locate, param=${JSON.stringify(param)}`);
|
|
86
|
-
let insightDump;
|
|
87
|
-
let usage;
|
|
88
|
-
const dumpCollector = (dump)=>{
|
|
89
|
-
var _dump_taskInfo, _dump_taskInfo1;
|
|
90
|
-
insightDump = dump;
|
|
91
|
-
usage = null == dump ? void 0 : null == (_dump_taskInfo = dump.taskInfo) ? void 0 : _dump_taskInfo.usage;
|
|
92
|
-
task.log = {
|
|
93
|
-
dump: insightDump
|
|
94
|
-
};
|
|
95
|
-
task.usage = usage;
|
|
96
|
-
if (null == dump ? void 0 : null == (_dump_taskInfo1 = dump.taskInfo) ? void 0 : _dump_taskInfo1.searchAreaUsage) task.searchAreaUsage = dump.taskInfo.searchAreaUsage;
|
|
97
|
-
};
|
|
98
|
-
this.insight.onceDumpUpdatedFn = dumpCollector;
|
|
99
|
-
const shotTime = Date.now();
|
|
100
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
101
|
-
task.uiContext = uiContext;
|
|
102
|
-
const recordItem = {
|
|
103
|
-
type: 'screenshot',
|
|
104
|
-
ts: shotTime,
|
|
105
|
-
screenshot: uiContext.screenshotBase64,
|
|
106
|
-
timing: 'before Insight'
|
|
107
|
-
};
|
|
108
|
-
task.recorder = [
|
|
109
|
-
recordItem
|
|
110
|
-
];
|
|
111
|
-
const elementFromXpath = param.xpath && this.interface.getElementInfoByXpath ? await this.interface.getElementInfoByXpath(param.xpath) : void 0;
|
|
112
|
-
const userExpectedPathHitFlag = !!elementFromXpath;
|
|
113
|
-
const cachePrompt = param.prompt;
|
|
114
|
-
const locateCacheRecord = null == (_this_taskCache = this.taskCache) ? void 0 : _this_taskCache.matchLocateCache(cachePrompt);
|
|
115
|
-
const cacheEntry = null == locateCacheRecord ? void 0 : null == (_locateCacheRecord_cacheContent = locateCacheRecord.cacheContent) ? void 0 : _locateCacheRecord_cacheContent.cache;
|
|
116
|
-
const elementFromCache = userExpectedPathHitFlag ? null : await matchElementFromCache(this, cacheEntry, cachePrompt, param.cacheable);
|
|
117
|
-
const cacheHitFlag = !!elementFromCache;
|
|
118
|
-
const elementFromPlan = userExpectedPathHitFlag || cacheHitFlag ? void 0 : matchElementFromPlan(param, uiContext.tree);
|
|
119
|
-
const planHitFlag = !!elementFromPlan;
|
|
120
|
-
const elementFromAiLocate = userExpectedPathHitFlag || cacheHitFlag || planHitFlag ? void 0 : (await this.insight.locate(param, {
|
|
121
|
-
context: uiContext
|
|
122
|
-
}, modelConfig)).element;
|
|
123
|
-
const aiLocateHitFlag = !!elementFromAiLocate;
|
|
124
|
-
const element = elementFromXpath || elementFromCache || elementFromPlan || elementFromAiLocate;
|
|
125
|
-
let currentCacheEntry;
|
|
126
|
-
if (element && this.taskCache && !cacheHitFlag && (null == param ? void 0 : param.cacheable) !== false) if (this.interface.cacheFeatureForRect) try {
|
|
127
|
-
const feature = await this.interface.cacheFeatureForRect(element.rect, void 0 !== element.isOrderSensitive ? {
|
|
128
|
-
_orderSensitive: element.isOrderSensitive
|
|
129
|
-
} : void 0);
|
|
130
|
-
if (feature && Object.keys(feature).length > 0) {
|
|
131
|
-
debug('update cache, prompt: %s, cache: %o', cachePrompt, feature);
|
|
132
|
-
currentCacheEntry = feature;
|
|
133
|
-
this.taskCache.updateOrAppendCacheRecord({
|
|
134
|
-
type: 'locate',
|
|
135
|
-
prompt: cachePrompt,
|
|
136
|
-
cache: feature
|
|
137
|
-
}, locateCacheRecord);
|
|
138
|
-
} else debug('no cache data returned, skip cache update, prompt: %s', cachePrompt);
|
|
139
|
-
} catch (error) {
|
|
140
|
-
debug('cacheFeatureForRect failed: %s', error);
|
|
141
|
-
}
|
|
142
|
-
else debug('cacheFeatureForRect is not supported, skip cache update');
|
|
143
|
-
if (!element) throw new Error(`Element not found: ${param.prompt}`);
|
|
144
|
-
let hitBy;
|
|
145
|
-
if (userExpectedPathHitFlag) hitBy = {
|
|
146
|
-
from: 'User expected path',
|
|
147
|
-
context: {
|
|
148
|
-
xpath: param.xpath
|
|
149
|
-
}
|
|
150
|
-
};
|
|
151
|
-
else if (cacheHitFlag) hitBy = {
|
|
152
|
-
from: 'Cache',
|
|
153
|
-
context: {
|
|
154
|
-
cacheEntry,
|
|
155
|
-
cacheToSave: currentCacheEntry
|
|
156
|
-
}
|
|
157
|
-
};
|
|
158
|
-
else if (planHitFlag) hitBy = {
|
|
159
|
-
from: 'Planning',
|
|
160
|
-
context: {
|
|
161
|
-
id: null == elementFromPlan ? void 0 : elementFromPlan.id,
|
|
162
|
-
bbox: null == elementFromPlan ? void 0 : elementFromPlan.bbox
|
|
163
|
-
}
|
|
164
|
-
};
|
|
165
|
-
else if (aiLocateHitFlag) hitBy = {
|
|
166
|
-
from: 'AI model',
|
|
167
|
-
context: {
|
|
168
|
-
prompt: param.prompt
|
|
169
|
-
}
|
|
170
|
-
};
|
|
171
|
-
null == onResult || onResult(element);
|
|
172
|
-
return {
|
|
173
|
-
output: {
|
|
174
|
-
element
|
|
175
|
-
},
|
|
176
|
-
uiContext,
|
|
177
|
-
hitBy
|
|
178
|
-
};
|
|
179
|
-
}
|
|
180
|
-
};
|
|
181
|
-
return taskFind;
|
|
182
|
-
};
|
|
183
|
-
for (const plan of plans)if ('Locate' === plan.type) {
|
|
184
|
-
var _plan_locate, _plan_locate1;
|
|
185
|
-
if (!plan.locate || null === plan.locate || (null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.id) === null || (null == (_plan_locate1 = plan.locate) ? void 0 : _plan_locate1.id) === 'null') {
|
|
186
|
-
debug('Locate action with id is null, will be ignored', plan);
|
|
187
|
-
continue;
|
|
188
|
-
}
|
|
189
|
-
const taskLocate = taskForLocatePlan(plan, plan.locate);
|
|
190
|
-
tasks.push(taskLocate);
|
|
191
|
-
} else if ('Error' === plan.type) {
|
|
192
|
-
var _plan_param;
|
|
193
|
-
const taskActionError = {
|
|
194
|
-
type: 'Action',
|
|
195
|
-
subType: 'Error',
|
|
196
|
-
param: plan.param,
|
|
197
|
-
thought: plan.thought || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought),
|
|
198
|
-
locate: plan.locate,
|
|
199
|
-
executor: async ()=>{
|
|
200
|
-
var _plan_param;
|
|
201
|
-
throw new Error((null == plan ? void 0 : plan.thought) || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought) || 'error without thought');
|
|
202
|
-
}
|
|
203
|
-
};
|
|
204
|
-
tasks.push(taskActionError);
|
|
205
|
-
} else if ('Finished' === plan.type) {
|
|
206
|
-
const taskActionFinished = {
|
|
207
|
-
type: 'Action',
|
|
208
|
-
subType: 'Finished',
|
|
209
|
-
param: null,
|
|
210
|
-
thought: plan.thought,
|
|
211
|
-
locate: plan.locate,
|
|
212
|
-
executor: async (param)=>{}
|
|
213
|
-
};
|
|
214
|
-
tasks.push(taskActionFinished);
|
|
215
|
-
} else if ('Sleep' === plan.type) {
|
|
216
|
-
const taskActionSleep = {
|
|
217
|
-
type: 'Action',
|
|
218
|
-
subType: 'Sleep',
|
|
219
|
-
param: plan.param,
|
|
220
|
-
thought: plan.thought,
|
|
221
|
-
locate: plan.locate,
|
|
222
|
-
executor: async (taskParam)=>{
|
|
223
|
-
await external_utils_mjs_sleep((null == taskParam ? void 0 : taskParam.timeMs) || 3000);
|
|
224
|
-
}
|
|
225
|
-
};
|
|
226
|
-
tasks.push(taskActionSleep);
|
|
227
|
-
} else {
|
|
228
|
-
const planType = plan.type;
|
|
229
|
-
const actionSpace = await this.interface.actionSpace();
|
|
230
|
-
const action = actionSpace.find((action)=>action.name === planType);
|
|
231
|
-
const param = plan.param;
|
|
232
|
-
if (!action) throw new Error(`Action type '${planType}' not found`);
|
|
233
|
-
const locateFields = action ? findAllMidsceneLocatorField(action.paramSchema) : [];
|
|
234
|
-
const requiredLocateFields = action ? findAllMidsceneLocatorField(action.paramSchema, true) : [];
|
|
235
|
-
locateFields.forEach((field)=>{
|
|
236
|
-
if (param[field]) {
|
|
237
|
-
const locatePlan = locatePlanForLocate(param[field]);
|
|
238
|
-
debug('will prepend locate param for field', `action.type=${planType}`, `param=${JSON.stringify(param[field])}`, `locatePlan=${JSON.stringify(locatePlan)}`);
|
|
239
|
-
const locateTask = taskForLocatePlan(locatePlan, param[field], (result)=>{
|
|
240
|
-
param[field] = result;
|
|
241
|
-
});
|
|
242
|
-
tasks.push(locateTask);
|
|
243
|
-
} else {
|
|
244
|
-
assert(!requiredLocateFields.includes(field), `Required locate field '${field}' is not provided for action ${planType}`);
|
|
245
|
-
debug(`field '${field}' is not provided for action ${planType}`);
|
|
246
|
-
}
|
|
247
|
-
});
|
|
248
|
-
const task = {
|
|
249
|
-
type: 'Action',
|
|
250
|
-
subType: planType,
|
|
251
|
-
thought: plan.thought,
|
|
252
|
-
param: plan.param,
|
|
253
|
-
executor: async (param, context)=>{
|
|
254
|
-
var _context_element;
|
|
255
|
-
debug('executing action', planType, param, `context.element.center: ${null == (_context_element = context.element) ? void 0 : _context_element.center}`);
|
|
256
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
257
|
-
context.task.uiContext = uiContext;
|
|
258
|
-
requiredLocateFields.forEach((field)=>{
|
|
259
|
-
assert(param[field], `field '${field}' is required for action ${planType} but not provided. Cannot execute action ${planType}.`);
|
|
260
|
-
});
|
|
261
|
-
try {
|
|
262
|
-
await Promise.all([
|
|
263
|
-
(async ()=>{
|
|
264
|
-
if (this.interface.beforeInvokeAction) {
|
|
265
|
-
debug('will call "beforeInvokeAction" for interface');
|
|
266
|
-
await this.interface.beforeInvokeAction(action.name, param);
|
|
267
|
-
debug('called "beforeInvokeAction" for interface');
|
|
268
|
-
}
|
|
269
|
-
})(),
|
|
270
|
-
external_utils_mjs_sleep(200)
|
|
271
|
-
]);
|
|
272
|
-
} catch (originalError) {
|
|
273
|
-
const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
|
|
274
|
-
throw new Error(`error in running beforeInvokeAction for ${action.name}: ${originalMessage}`, {
|
|
275
|
-
cause: originalError
|
|
276
|
-
});
|
|
277
|
-
}
|
|
278
|
-
if (action.paramSchema) try {
|
|
279
|
-
param = parseActionParam(param, action.paramSchema);
|
|
280
|
-
} catch (error) {
|
|
281
|
-
throw new Error(`Invalid parameters for action ${action.name}: ${error.message}\nParameters: ${JSON.stringify(param)}`, {
|
|
282
|
-
cause: error
|
|
283
|
-
});
|
|
284
|
-
}
|
|
285
|
-
debug('calling action', action.name);
|
|
286
|
-
const actionFn = action.call.bind(this.interface);
|
|
287
|
-
await actionFn(param, context);
|
|
288
|
-
debug('called action', action.name);
|
|
289
|
-
try {
|
|
290
|
-
if (this.interface.afterInvokeAction) {
|
|
291
|
-
debug('will call "afterInvokeAction" for interface');
|
|
292
|
-
await this.interface.afterInvokeAction(action.name, param);
|
|
293
|
-
debug('called "afterInvokeAction" for interface');
|
|
294
|
-
}
|
|
295
|
-
} catch (originalError) {
|
|
296
|
-
const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
|
|
297
|
-
throw new Error(`error in running afterInvokeAction for ${action.name}: ${originalMessage}`, {
|
|
298
|
-
cause: originalError
|
|
299
|
-
});
|
|
300
|
-
}
|
|
301
|
-
return {
|
|
302
|
-
output: {
|
|
303
|
-
success: true,
|
|
304
|
-
action: planType,
|
|
305
|
-
param: param
|
|
306
|
-
}
|
|
307
|
-
};
|
|
308
|
-
}
|
|
309
|
-
};
|
|
310
|
-
tasks.push(task);
|
|
311
|
-
}
|
|
312
|
-
const wrappedTasks = tasks.map((task, index)=>{
|
|
313
|
-
if ('Action' === task.type) return this.prependExecutorWithScreenshot(task, index === tasks.length - 1);
|
|
314
|
-
return task;
|
|
27
|
+
createExecutionSession(title, options) {
|
|
28
|
+
return new ExecutionSession(title, ()=>Promise.resolve(this.insight.contextRetrieverFn()), {
|
|
29
|
+
onTaskStart: this.onTaskStartCallback,
|
|
30
|
+
tasks: null == options ? void 0 : options.tasks
|
|
315
31
|
});
|
|
316
|
-
return {
|
|
317
|
-
tasks: wrappedTasks
|
|
318
|
-
};
|
|
319
32
|
}
|
|
320
|
-
async
|
|
321
|
-
|
|
322
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
323
|
-
const recordItem = {
|
|
324
|
-
type: 'screenshot',
|
|
325
|
-
ts: shotTime,
|
|
326
|
-
screenshot: uiContext.screenshotBase64,
|
|
327
|
-
timing: 'before Planning'
|
|
328
|
-
};
|
|
329
|
-
executorContext.task.recorder = [
|
|
330
|
-
recordItem
|
|
331
|
-
];
|
|
332
|
-
executorContext.task.uiContext = uiContext;
|
|
333
|
-
return {
|
|
334
|
-
uiContext
|
|
335
|
-
};
|
|
33
|
+
async convertPlanToExecutable(plans, modelConfig, options) {
|
|
34
|
+
return this.taskBuilder.build(plans, modelConfig, options);
|
|
336
35
|
}
|
|
337
36
|
async loadYamlFlowAsPlanning(userInstruction, yamlString) {
|
|
338
|
-
const
|
|
339
|
-
onTaskStart: this.onTaskStartCallback
|
|
340
|
-
});
|
|
37
|
+
const session = this.createExecutionSession(taskTitleStr('Action', userInstruction));
|
|
341
38
|
const task = {
|
|
342
39
|
type: 'Planning',
|
|
343
40
|
subType: 'LoadYaml',
|
|
@@ -346,7 +43,8 @@ class TaskExecutor {
|
|
|
346
43
|
userInstruction
|
|
347
44
|
},
|
|
348
45
|
executor: async (param, executorContext)=>{
|
|
349
|
-
|
|
46
|
+
const { uiContext } = executorContext;
|
|
47
|
+
assert(uiContext, 'uiContext is required for Planning task');
|
|
350
48
|
return {
|
|
351
49
|
output: {
|
|
352
50
|
actions: [],
|
|
@@ -366,10 +64,9 @@ class TaskExecutor {
|
|
|
366
64
|
};
|
|
367
65
|
}
|
|
368
66
|
};
|
|
369
|
-
await
|
|
370
|
-
await taskExecutor.flush();
|
|
67
|
+
await session.appendAndRun(task);
|
|
371
68
|
return {
|
|
372
|
-
|
|
69
|
+
runner: session.getRunner()
|
|
373
70
|
};
|
|
374
71
|
}
|
|
375
72
|
createPlanningTask(userInstruction, actionContext, modelConfig) {
|
|
@@ -382,7 +79,8 @@ class TaskExecutor {
|
|
|
382
79
|
},
|
|
383
80
|
executor: async (param, executorContext)=>{
|
|
384
81
|
const startTime = Date.now();
|
|
385
|
-
const { uiContext } =
|
|
82
|
+
const { uiContext } = executorContext;
|
|
83
|
+
assert(uiContext, 'uiContext is required for Planning task');
|
|
386
84
|
const { vlMode } = modelConfig;
|
|
387
85
|
const uiTarsModelVersion = 'vlm-ui-tars' === vlMode ? modelConfig.uiTarsModelVersion : void 0;
|
|
388
86
|
assert(this.interface.actionSpace, 'actionSpace for device is not implemented');
|
|
@@ -390,7 +88,7 @@ class TaskExecutor {
|
|
|
390
88
|
debug('actionSpace for this interface is:', actionSpace.map((action)=>action.name).join(', '));
|
|
391
89
|
assert(Array.isArray(actionSpace), 'actionSpace must be an array');
|
|
392
90
|
if (0 === actionSpace.length) console.warn(`ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`);
|
|
393
|
-
const planResult = await (uiTarsModelVersion ? uiTarsPlanning :
|
|
91
|
+
const planResult = await (uiTarsModelVersion ? uiTarsPlanning : plan)(param.userInstruction, {
|
|
394
92
|
context: uiContext,
|
|
395
93
|
actionContext,
|
|
396
94
|
interfaceType: this.interface.interfaceType,
|
|
@@ -409,13 +107,7 @@ class TaskExecutor {
|
|
|
409
107
|
if (sleep) {
|
|
410
108
|
const timeNow = Date.now();
|
|
411
109
|
const timeRemaining = sleep - (timeNow - startTime);
|
|
412
|
-
if (timeRemaining > 0) finalActions.push(
|
|
413
|
-
type: 'Sleep',
|
|
414
|
-
param: {
|
|
415
|
-
timeMs: timeRemaining
|
|
416
|
-
},
|
|
417
|
-
locate: null
|
|
418
|
-
});
|
|
110
|
+
if (timeRemaining > 0) finalActions.push(this.sleepPlan(timeRemaining));
|
|
419
111
|
}
|
|
420
112
|
if (0 === finalActions.length) assert(!more_actions_needed_by_instruction || sleep, error ? `Failed to plan: ${error}` : 'No plan found');
|
|
421
113
|
return {
|
|
@@ -435,16 +127,13 @@ class TaskExecutor {
|
|
|
435
127
|
return task;
|
|
436
128
|
}
|
|
437
129
|
async runPlans(title, plans, modelConfig) {
|
|
438
|
-
const
|
|
439
|
-
onTaskStart: this.onTaskStartCallback
|
|
440
|
-
});
|
|
130
|
+
const session = this.createExecutionSession(title);
|
|
441
131
|
const { tasks } = await this.convertPlanToExecutable(plans, modelConfig);
|
|
442
|
-
await
|
|
443
|
-
const result = await taskExecutor.flush();
|
|
132
|
+
const result = await session.appendAndRun(tasks);
|
|
444
133
|
const { output } = result;
|
|
445
134
|
return {
|
|
446
135
|
output,
|
|
447
|
-
|
|
136
|
+
runner: session.getRunner()
|
|
448
137
|
};
|
|
449
138
|
}
|
|
450
139
|
getReplanningCycleLimit(isVlmUiTars) {
|
|
@@ -452,38 +141,38 @@ class TaskExecutor {
|
|
|
452
141
|
}
|
|
453
142
|
async action(userPrompt, modelConfig, actionContext, cacheable) {
|
|
454
143
|
this.conversationHistory.reset();
|
|
455
|
-
const
|
|
456
|
-
|
|
457
|
-
});
|
|
144
|
+
const session = this.createExecutionSession(taskTitleStr('Action', userPrompt));
|
|
145
|
+
const runner = session.getRunner();
|
|
458
146
|
let replanCount = 0;
|
|
459
147
|
const yamlFlow = [];
|
|
460
148
|
const replanningCycleLimit = this.getReplanningCycleLimit('vlm-ui-tars' === modelConfig.vlMode);
|
|
461
149
|
while(true){
|
|
462
150
|
if (replanCount > replanningCycleLimit) {
|
|
463
151
|
const errorMsg = `Replanning ${replanningCycleLimit} times, which is more than the limit, please split the task into multiple steps`;
|
|
464
|
-
return
|
|
152
|
+
return session.appendErrorPlan(errorMsg);
|
|
465
153
|
}
|
|
466
154
|
const planningTask = this.createPlanningTask(userPrompt, actionContext, modelConfig);
|
|
467
|
-
await
|
|
468
|
-
const result = await taskExecutor.flush();
|
|
155
|
+
const result = await session.appendAndRun(planningTask);
|
|
469
156
|
const planResult = null == result ? void 0 : result.output;
|
|
470
|
-
if (
|
|
157
|
+
if (session.isInErrorState()) return {
|
|
471
158
|
output: planResult,
|
|
472
|
-
|
|
159
|
+
runner
|
|
473
160
|
};
|
|
474
161
|
const plans = planResult.actions || [];
|
|
475
162
|
yamlFlow.push(...planResult.yamlFlow || []);
|
|
476
163
|
let executables;
|
|
477
164
|
try {
|
|
478
|
-
executables = await this.convertPlanToExecutable(plans, modelConfig,
|
|
479
|
-
|
|
165
|
+
executables = await this.convertPlanToExecutable(plans, modelConfig, {
|
|
166
|
+
cacheable,
|
|
167
|
+
subTask: true
|
|
168
|
+
});
|
|
169
|
+
await session.appendAndRun(executables.tasks);
|
|
480
170
|
} catch (error) {
|
|
481
|
-
return
|
|
171
|
+
return session.appendErrorPlan(`Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
|
|
482
172
|
}
|
|
483
|
-
|
|
484
|
-
if (taskExecutor.isInErrorState()) return {
|
|
173
|
+
if (session.isInErrorState()) return {
|
|
485
174
|
output: void 0,
|
|
486
|
-
|
|
175
|
+
runner
|
|
487
176
|
};
|
|
488
177
|
if (!planResult.more_actions_needed_by_instruction) break;
|
|
489
178
|
replanCount++;
|
|
@@ -492,7 +181,7 @@ class TaskExecutor {
|
|
|
492
181
|
output: {
|
|
493
182
|
yamlFlow
|
|
494
183
|
},
|
|
495
|
-
|
|
184
|
+
runner
|
|
496
185
|
};
|
|
497
186
|
}
|
|
498
187
|
createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt) {
|
|
@@ -508,23 +197,15 @@ class TaskExecutor {
|
|
|
508
197
|
},
|
|
509
198
|
executor: async (param, taskContext)=>{
|
|
510
199
|
const { task } = taskContext;
|
|
511
|
-
let
|
|
512
|
-
const
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
const uiContext = await this.insight.contextRetrieverFn('extract');
|
|
518
|
-
task.uiContext = uiContext;
|
|
519
|
-
const recordItem = {
|
|
520
|
-
type: 'screenshot',
|
|
521
|
-
ts: shotTime,
|
|
522
|
-
screenshot: uiContext.screenshotBase64,
|
|
523
|
-
timing: 'before Extract'
|
|
200
|
+
let queryDump;
|
|
201
|
+
const applyDump = (dump)=>{
|
|
202
|
+
queryDump = dump;
|
|
203
|
+
task.log = {
|
|
204
|
+
dump
|
|
205
|
+
};
|
|
524
206
|
};
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
];
|
|
207
|
+
const uiContext = taskContext.uiContext;
|
|
208
|
+
assert(uiContext, 'uiContext is required for Query task');
|
|
528
209
|
const ifTypeRestricted = 'Query' !== type;
|
|
529
210
|
let demandInput = demand;
|
|
530
211
|
let keyOfResult = 'result';
|
|
@@ -537,7 +218,15 @@ class TaskExecutor {
|
|
|
537
218
|
} else if (ifTypeRestricted) demandInput = {
|
|
538
219
|
[keyOfResult]: `${type}, ${demand}`
|
|
539
220
|
};
|
|
540
|
-
|
|
221
|
+
let extractResult;
|
|
222
|
+
try {
|
|
223
|
+
extractResult = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
|
|
224
|
+
} catch (error) {
|
|
225
|
+
if (error instanceof InsightError) applyDump(error.dump);
|
|
226
|
+
throw error;
|
|
227
|
+
}
|
|
228
|
+
const { data, usage, thought, dump } = extractResult;
|
|
229
|
+
applyDump(dump);
|
|
541
230
|
let outputResult = data;
|
|
542
231
|
if (ifTypeRestricted) if ('string' == typeof data) outputResult = data;
|
|
543
232
|
else {
|
|
@@ -546,7 +235,7 @@ class TaskExecutor {
|
|
|
546
235
|
}
|
|
547
236
|
return {
|
|
548
237
|
output: outputResult,
|
|
549
|
-
log:
|
|
238
|
+
log: queryDump,
|
|
550
239
|
usage,
|
|
551
240
|
thought
|
|
552
241
|
};
|
|
@@ -555,57 +244,36 @@ class TaskExecutor {
|
|
|
555
244
|
return queryTask;
|
|
556
245
|
}
|
|
557
246
|
async createTypeQueryExecution(type, demand, modelConfig, opt, multimodalPrompt) {
|
|
558
|
-
const
|
|
559
|
-
onTaskStart: this.onTaskStartCallback
|
|
560
|
-
});
|
|
247
|
+
const session = this.createExecutionSession(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)));
|
|
561
248
|
const queryTask = await this.createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt);
|
|
562
|
-
await
|
|
563
|
-
const result = await taskExecutor.flush();
|
|
249
|
+
const result = await session.appendAndRun(queryTask);
|
|
564
250
|
if (!result) throw new Error('result of taskExecutor.flush() is undefined in function createTypeQueryTask');
|
|
565
251
|
const { output, thought } = result;
|
|
566
252
|
return {
|
|
567
253
|
output,
|
|
568
254
|
thought,
|
|
569
|
-
|
|
255
|
+
runner: session.getRunner()
|
|
570
256
|
};
|
|
571
257
|
}
|
|
572
|
-
|
|
573
|
-
const errorPlan = {
|
|
574
|
-
type: 'Error',
|
|
575
|
-
param: {
|
|
576
|
-
thought: errorMsg
|
|
577
|
-
},
|
|
578
|
-
locate: null
|
|
579
|
-
};
|
|
580
|
-
const { tasks } = await this.convertPlanToExecutable([
|
|
581
|
-
errorPlan
|
|
582
|
-
], modelConfig);
|
|
583
|
-
await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
|
|
584
|
-
await taskExecutor.flush();
|
|
258
|
+
sleepPlan(timeMs) {
|
|
585
259
|
return {
|
|
586
|
-
output: void 0,
|
|
587
|
-
executor: taskExecutor
|
|
588
|
-
};
|
|
589
|
-
}
|
|
590
|
-
async taskForSleep(timeMs, modelConfig) {
|
|
591
|
-
const sleepPlan = {
|
|
592
260
|
type: 'Sleep',
|
|
593
261
|
param: {
|
|
594
262
|
timeMs
|
|
595
263
|
},
|
|
596
264
|
locate: null
|
|
597
265
|
};
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
266
|
+
}
|
|
267
|
+
async taskForSleep(timeMs, _modelConfig) {
|
|
268
|
+
return this.taskBuilder.createSleepTask({
|
|
269
|
+
timeMs
|
|
270
|
+
});
|
|
602
271
|
}
|
|
603
272
|
async waitFor(assertion, opt, modelConfig) {
|
|
604
273
|
const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
|
|
605
274
|
const description = `waitFor: ${textPrompt}`;
|
|
606
|
-
const
|
|
607
|
-
|
|
608
|
-
});
|
|
275
|
+
const session = this.createExecutionSession(taskTitleStr('WaitFor', description));
|
|
276
|
+
const runner = session.getRunner();
|
|
609
277
|
const { timeoutMs, checkIntervalMs } = opt;
|
|
610
278
|
assert(assertion, 'No assertion for waitFor');
|
|
611
279
|
assert(timeoutMs, 'No timeoutMs for waitFor');
|
|
@@ -619,26 +287,28 @@ class TaskExecutor {
|
|
|
619
287
|
const queryTask = await this.createTypeQueryTask('WaitFor', textPrompt, modelConfig, {
|
|
620
288
|
doNotThrowError: true
|
|
621
289
|
}, multimodalPrompt);
|
|
622
|
-
await
|
|
623
|
-
const result = await taskExecutor.flush();
|
|
290
|
+
const result = await session.appendAndRun(queryTask);
|
|
624
291
|
if (null == result ? void 0 : result.output) return {
|
|
625
292
|
output: void 0,
|
|
626
|
-
|
|
293
|
+
runner
|
|
627
294
|
};
|
|
628
295
|
errorThought = (null == result ? void 0 : result.thought) || !result && `No result from assertion: ${textPrompt}` || `unknown error when waiting for assertion: ${textPrompt}`;
|
|
629
296
|
const now = Date.now();
|
|
630
297
|
if (now - startTime < checkIntervalMs) {
|
|
631
298
|
const timeRemaining = checkIntervalMs - (now - startTime);
|
|
632
|
-
const sleepTask =
|
|
633
|
-
|
|
299
|
+
const sleepTask = this.taskBuilder.createSleepTask({
|
|
300
|
+
timeMs: timeRemaining
|
|
301
|
+
});
|
|
302
|
+
await session.append(sleepTask);
|
|
634
303
|
}
|
|
635
304
|
}
|
|
636
|
-
return
|
|
305
|
+
return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
|
|
637
306
|
}
|
|
638
307
|
constructor(interfaceInstance, insight, opts){
|
|
639
308
|
_define_property(this, "interface", void 0);
|
|
640
309
|
_define_property(this, "insight", void 0);
|
|
641
310
|
_define_property(this, "taskCache", void 0);
|
|
311
|
+
_define_property(this, "taskBuilder", void 0);
|
|
642
312
|
_define_property(this, "conversationHistory", void 0);
|
|
643
313
|
_define_property(this, "onTaskStartCallback", void 0);
|
|
644
314
|
_define_property(this, "replanningCycleLimit", void 0);
|
|
@@ -648,6 +318,11 @@ class TaskExecutor {
|
|
|
648
318
|
this.onTaskStartCallback = null == opts ? void 0 : opts.onTaskStart;
|
|
649
319
|
this.replanningCycleLimit = opts.replanningCycleLimit;
|
|
650
320
|
this.conversationHistory = new ConversationHistory();
|
|
321
|
+
this.taskBuilder = new TaskBuilder({
|
|
322
|
+
interfaceInstance,
|
|
323
|
+
insight,
|
|
324
|
+
taskCache: opts.taskCache
|
|
325
|
+
});
|
|
651
326
|
}
|
|
652
327
|
}
|
|
653
328
|
export { TaskExecutor, locatePlanForLocate };
|