@midscene/core 0.30.5 → 1.0.1-beta-20251021060907.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +41 -33
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/task-builder.mjs +303 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/tasks.mjs +68 -391
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/ui-utils.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +6 -6
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/common.mjs +1 -15
- package/dist/es/ai-model/common.mjs.map +1 -1
- package/dist/es/ai-model/inspect.mjs +2 -3
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +6 -24
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +3 -204
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +101 -231
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/index.mjs +3 -2
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/insight/index.mjs +18 -19
- package/dist/es/insight/index.mjs.map +1 -1
- package/dist/es/insight/utils.mjs +3 -3
- package/dist/es/insight/utils.mjs.map +1 -1
- package/dist/es/report.mjs.map +1 -1
- package/dist/es/{ai-model/action-executor.mjs → task-runner.mjs} +69 -10
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/types.mjs +18 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +2 -2
- package/dist/es/yaml/player.mjs +18 -14
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/lib/agent/agent.js +41 -33
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/task-builder.js +340 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/tasks.js +68 -391
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/ui-utils.js.map +1 -1
- package/dist/lib/agent/utils.js +6 -6
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/common.js +2 -19
- package/dist/lib/ai-model/common.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +1 -2
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +5 -23
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +2 -206
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +236 -384
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/index.js +9 -5
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/insight/index.js +17 -18
- package/dist/lib/insight/index.js.map +1 -1
- package/dist/lib/insight/utils.js +5 -5
- package/dist/lib/insight/utils.js.map +1 -1
- package/dist/lib/report.js.map +1 -1
- package/dist/lib/{ai-model/action-executor.js → task-runner.js} +71 -12
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/types.js +22 -1
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/lib/yaml/player.js +18 -14
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/types/agent/agent.d.ts +16 -0
- package/dist/types/agent/execution-session.d.ts +27 -0
- package/dist/types/agent/task-builder.d.ts +24 -0
- package/dist/types/agent/tasks.d.ts +8 -11
- package/dist/types/agent/ui-utils.d.ts +2 -2
- package/dist/types/agent/utils.d.ts +5 -2
- package/dist/types/ai-model/common.d.ts +0 -1
- package/dist/types/ai-model/prompt/llm-locator.d.ts +0 -2
- package/dist/types/index.d.ts +4 -3
- package/dist/types/insight/index.d.ts +5 -10
- package/dist/types/insight/utils.d.ts +2 -2
- package/dist/types/{ai-model/action-executor.d.ts → task-runner.d.ts} +14 -3
- package/dist/types/types.d.ts +47 -4
- package/dist/types/yaml.d.ts +3 -1
- package/package.json +4 -7
- package/dist/es/ai-model/action-executor.mjs.map +0 -1
- package/dist/lib/ai-model/action-executor.js.map +0 -1
package/dist/es/agent/tasks.mjs
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import { ConversationHistory,
|
|
2
|
-
import {
|
|
3
|
-
import { sleep as external_utils_mjs_sleep } from "../utils.mjs";
|
|
1
|
+
import { ConversationHistory, plan, uiTarsPlanning } from "../ai-model/index.mjs";
|
|
2
|
+
import { InsightError } from "../types.mjs";
|
|
4
3
|
import { MIDSCENE_REPLANNING_CYCLE_LIMIT, globalConfigManager } from "@midscene/shared/env";
|
|
5
4
|
import { getDebug } from "@midscene/shared/logger";
|
|
6
5
|
import { assert } from "@midscene/shared/utils";
|
|
6
|
+
import { ExecutionSession } from "./execution-session.mjs";
|
|
7
|
+
import { TaskBuilder, locatePlanForLocate } from "./task-builder.mjs";
|
|
7
8
|
import { taskTitleStr } from "./ui-utils.mjs";
|
|
8
|
-
import {
|
|
9
|
+
import { parsePrompt } from "./utils.mjs";
|
|
9
10
|
function _define_property(obj, key, value) {
|
|
10
11
|
if (key in obj) Object.defineProperty(obj, key, {
|
|
11
12
|
value: value,
|
|
@@ -19,325 +20,23 @@ function _define_property(obj, key, value) {
|
|
|
19
20
|
const debug = getDebug('device-task-executor');
|
|
20
21
|
const defaultReplanningCycleLimit = 10;
|
|
21
22
|
const defaultVlmUiTarsReplanningCycleLimit = 40;
|
|
22
|
-
function locatePlanForLocate(param) {
|
|
23
|
-
const locate = 'string' == typeof param ? {
|
|
24
|
-
prompt: param
|
|
25
|
-
} : param;
|
|
26
|
-
const locatePlan = {
|
|
27
|
-
type: 'Locate',
|
|
28
|
-
locate,
|
|
29
|
-
param: locate,
|
|
30
|
-
thought: ''
|
|
31
|
-
};
|
|
32
|
-
return locatePlan;
|
|
33
|
-
}
|
|
34
23
|
class TaskExecutor {
|
|
35
24
|
get page() {
|
|
36
25
|
return this.interface;
|
|
37
26
|
}
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
43
|
-
screenshot: base64,
|
|
44
|
-
timing
|
|
45
|
-
};
|
|
46
|
-
return item;
|
|
47
|
-
}
|
|
48
|
-
prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
|
|
49
|
-
const taskWithScreenshot = {
|
|
50
|
-
...taskApply,
|
|
51
|
-
executor: async (param, context, ...args)=>{
|
|
52
|
-
const recorder = [];
|
|
53
|
-
const { task } = context;
|
|
54
|
-
task.recorder = recorder;
|
|
55
|
-
const shot = await this.recordScreenshot(`before ${task.type}`);
|
|
56
|
-
recorder.push(shot);
|
|
57
|
-
const result = await taskApply.executor(param, context, ...args);
|
|
58
|
-
if (appendAfterExecution) {
|
|
59
|
-
const shot2 = await this.recordScreenshot('after Action');
|
|
60
|
-
recorder.push(shot2);
|
|
61
|
-
}
|
|
62
|
-
return result;
|
|
63
|
-
}
|
|
64
|
-
};
|
|
65
|
-
return taskWithScreenshot;
|
|
27
|
+
createExecutionSession(title, options) {
|
|
28
|
+
return new ExecutionSession(title, ()=>Promise.resolve(this.insight.contextRetrieverFn()), {
|
|
29
|
+
onTaskStart: this.onTaskStartCallback,
|
|
30
|
+
tasks: null == options ? void 0 : options.tasks
|
|
31
|
+
});
|
|
66
32
|
}
|
|
67
33
|
async convertPlanToExecutable(plans, modelConfig, cacheable) {
|
|
68
|
-
|
|
69
|
-
|
|
70
|
-
if ('string' == typeof detailedLocateParam) detailedLocateParam = {
|
|
71
|
-
prompt: detailedLocateParam
|
|
72
|
-
};
|
|
73
|
-
if (void 0 !== cacheable) detailedLocateParam = {
|
|
74
|
-
...detailedLocateParam,
|
|
75
|
-
cacheable
|
|
76
|
-
};
|
|
77
|
-
const taskFind = {
|
|
78
|
-
type: 'Insight',
|
|
79
|
-
subType: 'Locate',
|
|
80
|
-
param: detailedLocateParam,
|
|
81
|
-
thought: plan.thought,
|
|
82
|
-
executor: async (param, taskContext)=>{
|
|
83
|
-
var _this_taskCache, _locateCacheRecord_cacheContent;
|
|
84
|
-
const { task } = taskContext;
|
|
85
|
-
assert((null == param ? void 0 : param.prompt) || (null == param ? void 0 : param.id) || (null == param ? void 0 : param.bbox), `No prompt or id or position or bbox to locate, param=${JSON.stringify(param)}`);
|
|
86
|
-
let insightDump;
|
|
87
|
-
let usage;
|
|
88
|
-
const dumpCollector = (dump)=>{
|
|
89
|
-
var _dump_taskInfo, _dump_taskInfo1;
|
|
90
|
-
insightDump = dump;
|
|
91
|
-
usage = null == dump ? void 0 : null == (_dump_taskInfo = dump.taskInfo) ? void 0 : _dump_taskInfo.usage;
|
|
92
|
-
task.log = {
|
|
93
|
-
dump: insightDump
|
|
94
|
-
};
|
|
95
|
-
task.usage = usage;
|
|
96
|
-
if (null == dump ? void 0 : null == (_dump_taskInfo1 = dump.taskInfo) ? void 0 : _dump_taskInfo1.searchAreaUsage) task.searchAreaUsage = dump.taskInfo.searchAreaUsage;
|
|
97
|
-
};
|
|
98
|
-
this.insight.onceDumpUpdatedFn = dumpCollector;
|
|
99
|
-
const shotTime = Date.now();
|
|
100
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
101
|
-
task.uiContext = uiContext;
|
|
102
|
-
const recordItem = {
|
|
103
|
-
type: 'screenshot',
|
|
104
|
-
ts: shotTime,
|
|
105
|
-
screenshot: uiContext.screenshotBase64,
|
|
106
|
-
timing: 'before Insight'
|
|
107
|
-
};
|
|
108
|
-
task.recorder = [
|
|
109
|
-
recordItem
|
|
110
|
-
];
|
|
111
|
-
const elementFromXpath = param.xpath && this.interface.getElementInfoByXpath ? await this.interface.getElementInfoByXpath(param.xpath) : void 0;
|
|
112
|
-
const userExpectedPathHitFlag = !!elementFromXpath;
|
|
113
|
-
const cachePrompt = param.prompt;
|
|
114
|
-
const locateCacheRecord = null == (_this_taskCache = this.taskCache) ? void 0 : _this_taskCache.matchLocateCache(cachePrompt);
|
|
115
|
-
const cacheEntry = null == locateCacheRecord ? void 0 : null == (_locateCacheRecord_cacheContent = locateCacheRecord.cacheContent) ? void 0 : _locateCacheRecord_cacheContent.cache;
|
|
116
|
-
const elementFromCache = userExpectedPathHitFlag ? null : await matchElementFromCache(this, cacheEntry, cachePrompt, param.cacheable);
|
|
117
|
-
const cacheHitFlag = !!elementFromCache;
|
|
118
|
-
const elementFromPlan = userExpectedPathHitFlag || cacheHitFlag ? void 0 : matchElementFromPlan(param, uiContext.tree);
|
|
119
|
-
const planHitFlag = !!elementFromPlan;
|
|
120
|
-
const elementFromAiLocate = userExpectedPathHitFlag || cacheHitFlag || planHitFlag ? void 0 : (await this.insight.locate(param, {
|
|
121
|
-
context: uiContext
|
|
122
|
-
}, modelConfig)).element;
|
|
123
|
-
const aiLocateHitFlag = !!elementFromAiLocate;
|
|
124
|
-
const element = elementFromXpath || elementFromCache || elementFromPlan || elementFromAiLocate;
|
|
125
|
-
let currentCacheEntry;
|
|
126
|
-
if (element && this.taskCache && !cacheHitFlag && (null == param ? void 0 : param.cacheable) !== false) if (this.interface.cacheFeatureForRect) try {
|
|
127
|
-
const feature = await this.interface.cacheFeatureForRect(element.rect, void 0 !== element.isOrderSensitive ? {
|
|
128
|
-
_orderSensitive: element.isOrderSensitive
|
|
129
|
-
} : void 0);
|
|
130
|
-
if (feature && Object.keys(feature).length > 0) {
|
|
131
|
-
debug('update cache, prompt: %s, cache: %o', cachePrompt, feature);
|
|
132
|
-
currentCacheEntry = feature;
|
|
133
|
-
this.taskCache.updateOrAppendCacheRecord({
|
|
134
|
-
type: 'locate',
|
|
135
|
-
prompt: cachePrompt,
|
|
136
|
-
cache: feature
|
|
137
|
-
}, locateCacheRecord);
|
|
138
|
-
} else debug('no cache data returned, skip cache update, prompt: %s', cachePrompt);
|
|
139
|
-
} catch (error) {
|
|
140
|
-
debug('cacheFeatureForRect failed: %s', error);
|
|
141
|
-
}
|
|
142
|
-
else debug('cacheFeatureForRect is not supported, skip cache update');
|
|
143
|
-
if (!element) throw new Error(`Element not found: ${param.prompt}`);
|
|
144
|
-
let hitBy;
|
|
145
|
-
if (userExpectedPathHitFlag) hitBy = {
|
|
146
|
-
from: 'User expected path',
|
|
147
|
-
context: {
|
|
148
|
-
xpath: param.xpath
|
|
149
|
-
}
|
|
150
|
-
};
|
|
151
|
-
else if (cacheHitFlag) hitBy = {
|
|
152
|
-
from: 'Cache',
|
|
153
|
-
context: {
|
|
154
|
-
cacheEntry,
|
|
155
|
-
cacheToSave: currentCacheEntry
|
|
156
|
-
}
|
|
157
|
-
};
|
|
158
|
-
else if (planHitFlag) hitBy = {
|
|
159
|
-
from: 'Planning',
|
|
160
|
-
context: {
|
|
161
|
-
id: null == elementFromPlan ? void 0 : elementFromPlan.id,
|
|
162
|
-
bbox: null == elementFromPlan ? void 0 : elementFromPlan.bbox
|
|
163
|
-
}
|
|
164
|
-
};
|
|
165
|
-
else if (aiLocateHitFlag) hitBy = {
|
|
166
|
-
from: 'AI model',
|
|
167
|
-
context: {
|
|
168
|
-
prompt: param.prompt
|
|
169
|
-
}
|
|
170
|
-
};
|
|
171
|
-
null == onResult || onResult(element);
|
|
172
|
-
return {
|
|
173
|
-
output: {
|
|
174
|
-
element
|
|
175
|
-
},
|
|
176
|
-
uiContext,
|
|
177
|
-
hitBy
|
|
178
|
-
};
|
|
179
|
-
}
|
|
180
|
-
};
|
|
181
|
-
return taskFind;
|
|
182
|
-
};
|
|
183
|
-
for (const plan of plans)if ('Locate' === plan.type) {
|
|
184
|
-
var _plan_locate, _plan_locate1;
|
|
185
|
-
if (!plan.locate || null === plan.locate || (null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.id) === null || (null == (_plan_locate1 = plan.locate) ? void 0 : _plan_locate1.id) === 'null') {
|
|
186
|
-
debug('Locate action with id is null, will be ignored', plan);
|
|
187
|
-
continue;
|
|
188
|
-
}
|
|
189
|
-
const taskLocate = taskForLocatePlan(plan, plan.locate);
|
|
190
|
-
tasks.push(taskLocate);
|
|
191
|
-
} else if ('Error' === plan.type) {
|
|
192
|
-
var _plan_param;
|
|
193
|
-
const taskActionError = {
|
|
194
|
-
type: 'Action',
|
|
195
|
-
subType: 'Error',
|
|
196
|
-
param: plan.param,
|
|
197
|
-
thought: plan.thought || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought),
|
|
198
|
-
locate: plan.locate,
|
|
199
|
-
executor: async ()=>{
|
|
200
|
-
var _plan_param;
|
|
201
|
-
throw new Error((null == plan ? void 0 : plan.thought) || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought) || 'error without thought');
|
|
202
|
-
}
|
|
203
|
-
};
|
|
204
|
-
tasks.push(taskActionError);
|
|
205
|
-
} else if ('Finished' === plan.type) {
|
|
206
|
-
const taskActionFinished = {
|
|
207
|
-
type: 'Action',
|
|
208
|
-
subType: 'Finished',
|
|
209
|
-
param: null,
|
|
210
|
-
thought: plan.thought,
|
|
211
|
-
locate: plan.locate,
|
|
212
|
-
executor: async (param)=>{}
|
|
213
|
-
};
|
|
214
|
-
tasks.push(taskActionFinished);
|
|
215
|
-
} else if ('Sleep' === plan.type) {
|
|
216
|
-
const taskActionSleep = {
|
|
217
|
-
type: 'Action',
|
|
218
|
-
subType: 'Sleep',
|
|
219
|
-
param: plan.param,
|
|
220
|
-
thought: plan.thought,
|
|
221
|
-
locate: plan.locate,
|
|
222
|
-
executor: async (taskParam)=>{
|
|
223
|
-
await external_utils_mjs_sleep((null == taskParam ? void 0 : taskParam.timeMs) || 3000);
|
|
224
|
-
}
|
|
225
|
-
};
|
|
226
|
-
tasks.push(taskActionSleep);
|
|
227
|
-
} else {
|
|
228
|
-
const planType = plan.type;
|
|
229
|
-
const actionSpace = await this.interface.actionSpace();
|
|
230
|
-
const action = actionSpace.find((action)=>action.name === planType);
|
|
231
|
-
const param = plan.param;
|
|
232
|
-
if (!action) throw new Error(`Action type '${planType}' not found`);
|
|
233
|
-
const locateFields = action ? findAllMidsceneLocatorField(action.paramSchema) : [];
|
|
234
|
-
const requiredLocateFields = action ? findAllMidsceneLocatorField(action.paramSchema, true) : [];
|
|
235
|
-
locateFields.forEach((field)=>{
|
|
236
|
-
if (param[field]) {
|
|
237
|
-
const locatePlan = locatePlanForLocate(param[field]);
|
|
238
|
-
debug('will prepend locate param for field', `action.type=${planType}`, `param=${JSON.stringify(param[field])}`, `locatePlan=${JSON.stringify(locatePlan)}`);
|
|
239
|
-
const locateTask = taskForLocatePlan(locatePlan, param[field], (result)=>{
|
|
240
|
-
param[field] = result;
|
|
241
|
-
});
|
|
242
|
-
tasks.push(locateTask);
|
|
243
|
-
} else {
|
|
244
|
-
assert(!requiredLocateFields.includes(field), `Required locate field '${field}' is not provided for action ${planType}`);
|
|
245
|
-
debug(`field '${field}' is not provided for action ${planType}`);
|
|
246
|
-
}
|
|
247
|
-
});
|
|
248
|
-
const task = {
|
|
249
|
-
type: 'Action',
|
|
250
|
-
subType: planType,
|
|
251
|
-
thought: plan.thought,
|
|
252
|
-
param: plan.param,
|
|
253
|
-
executor: async (param, context)=>{
|
|
254
|
-
var _context_element;
|
|
255
|
-
debug('executing action', planType, param, `context.element.center: ${null == (_context_element = context.element) ? void 0 : _context_element.center}`);
|
|
256
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
257
|
-
context.task.uiContext = uiContext;
|
|
258
|
-
requiredLocateFields.forEach((field)=>{
|
|
259
|
-
assert(param[field], `field '${field}' is required for action ${planType} but not provided. Cannot execute action ${planType}.`);
|
|
260
|
-
});
|
|
261
|
-
try {
|
|
262
|
-
await Promise.all([
|
|
263
|
-
(async ()=>{
|
|
264
|
-
if (this.interface.beforeInvokeAction) {
|
|
265
|
-
debug('will call "beforeInvokeAction" for interface');
|
|
266
|
-
await this.interface.beforeInvokeAction(action.name, param);
|
|
267
|
-
debug('called "beforeInvokeAction" for interface');
|
|
268
|
-
}
|
|
269
|
-
})(),
|
|
270
|
-
external_utils_mjs_sleep(200)
|
|
271
|
-
]);
|
|
272
|
-
} catch (originalError) {
|
|
273
|
-
const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
|
|
274
|
-
throw new Error(`error in running beforeInvokeAction for ${action.name}: ${originalMessage}`, {
|
|
275
|
-
cause: originalError
|
|
276
|
-
});
|
|
277
|
-
}
|
|
278
|
-
if (action.paramSchema) try {
|
|
279
|
-
param = parseActionParam(param, action.paramSchema);
|
|
280
|
-
} catch (error) {
|
|
281
|
-
throw new Error(`Invalid parameters for action ${action.name}: ${error.message}\nParameters: ${JSON.stringify(param)}`, {
|
|
282
|
-
cause: error
|
|
283
|
-
});
|
|
284
|
-
}
|
|
285
|
-
debug('calling action', action.name);
|
|
286
|
-
const actionFn = action.call.bind(this.interface);
|
|
287
|
-
await actionFn(param, context);
|
|
288
|
-
debug('called action', action.name);
|
|
289
|
-
try {
|
|
290
|
-
if (this.interface.afterInvokeAction) {
|
|
291
|
-
debug('will call "afterInvokeAction" for interface');
|
|
292
|
-
await this.interface.afterInvokeAction(action.name, param);
|
|
293
|
-
debug('called "afterInvokeAction" for interface');
|
|
294
|
-
}
|
|
295
|
-
} catch (originalError) {
|
|
296
|
-
const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
|
|
297
|
-
throw new Error(`error in running afterInvokeAction for ${action.name}: ${originalMessage}`, {
|
|
298
|
-
cause: originalError
|
|
299
|
-
});
|
|
300
|
-
}
|
|
301
|
-
return {
|
|
302
|
-
output: {
|
|
303
|
-
success: true,
|
|
304
|
-
action: planType,
|
|
305
|
-
param: param
|
|
306
|
-
}
|
|
307
|
-
};
|
|
308
|
-
}
|
|
309
|
-
};
|
|
310
|
-
tasks.push(task);
|
|
311
|
-
}
|
|
312
|
-
const wrappedTasks = tasks.map((task, index)=>{
|
|
313
|
-
if ('Action' === task.type) return this.prependExecutorWithScreenshot(task, index === tasks.length - 1);
|
|
314
|
-
return task;
|
|
34
|
+
return this.taskBuilder.build(plans, modelConfig, {
|
|
35
|
+
cacheable
|
|
315
36
|
});
|
|
316
|
-
return {
|
|
317
|
-
tasks: wrappedTasks
|
|
318
|
-
};
|
|
319
|
-
}
|
|
320
|
-
async setupPlanningContext(executorContext) {
|
|
321
|
-
const shotTime = Date.now();
|
|
322
|
-
const uiContext = await this.insight.contextRetrieverFn('locate');
|
|
323
|
-
const recordItem = {
|
|
324
|
-
type: 'screenshot',
|
|
325
|
-
ts: shotTime,
|
|
326
|
-
screenshot: uiContext.screenshotBase64,
|
|
327
|
-
timing: 'before Planning'
|
|
328
|
-
};
|
|
329
|
-
executorContext.task.recorder = [
|
|
330
|
-
recordItem
|
|
331
|
-
];
|
|
332
|
-
executorContext.task.uiContext = uiContext;
|
|
333
|
-
return {
|
|
334
|
-
uiContext
|
|
335
|
-
};
|
|
336
37
|
}
|
|
337
38
|
async loadYamlFlowAsPlanning(userInstruction, yamlString) {
|
|
338
|
-
const
|
|
339
|
-
onTaskStart: this.onTaskStartCallback
|
|
340
|
-
});
|
|
39
|
+
const session = this.createExecutionSession(taskTitleStr('Action', userInstruction));
|
|
341
40
|
const task = {
|
|
342
41
|
type: 'Planning',
|
|
343
42
|
subType: 'LoadYaml',
|
|
@@ -346,7 +45,8 @@ class TaskExecutor {
|
|
|
346
45
|
userInstruction
|
|
347
46
|
},
|
|
348
47
|
executor: async (param, executorContext)=>{
|
|
349
|
-
|
|
48
|
+
const { uiContext } = executorContext;
|
|
49
|
+
assert(uiContext, 'uiContext is required for Planning task');
|
|
350
50
|
return {
|
|
351
51
|
output: {
|
|
352
52
|
actions: [],
|
|
@@ -366,10 +66,9 @@ class TaskExecutor {
|
|
|
366
66
|
};
|
|
367
67
|
}
|
|
368
68
|
};
|
|
369
|
-
await
|
|
370
|
-
await taskExecutor.flush();
|
|
69
|
+
await session.appendAndRun(task);
|
|
371
70
|
return {
|
|
372
|
-
|
|
71
|
+
runner: session.getRunner()
|
|
373
72
|
};
|
|
374
73
|
}
|
|
375
74
|
createPlanningTask(userInstruction, actionContext, modelConfig) {
|
|
@@ -382,7 +81,8 @@ class TaskExecutor {
|
|
|
382
81
|
},
|
|
383
82
|
executor: async (param, executorContext)=>{
|
|
384
83
|
const startTime = Date.now();
|
|
385
|
-
const { uiContext } =
|
|
84
|
+
const { uiContext } = executorContext;
|
|
85
|
+
assert(uiContext, 'uiContext is required for Planning task');
|
|
386
86
|
const { vlMode } = modelConfig;
|
|
387
87
|
const uiTarsModelVersion = 'vlm-ui-tars' === vlMode ? modelConfig.uiTarsModelVersion : void 0;
|
|
388
88
|
assert(this.interface.actionSpace, 'actionSpace for device is not implemented');
|
|
@@ -390,7 +90,7 @@ class TaskExecutor {
|
|
|
390
90
|
debug('actionSpace for this interface is:', actionSpace.map((action)=>action.name).join(', '));
|
|
391
91
|
assert(Array.isArray(actionSpace), 'actionSpace must be an array');
|
|
392
92
|
if (0 === actionSpace.length) console.warn(`ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`);
|
|
393
|
-
const planResult = await (uiTarsModelVersion ? uiTarsPlanning :
|
|
93
|
+
const planResult = await (uiTarsModelVersion ? uiTarsPlanning : plan)(param.userInstruction, {
|
|
394
94
|
context: uiContext,
|
|
395
95
|
actionContext,
|
|
396
96
|
interfaceType: this.interface.interfaceType,
|
|
@@ -435,16 +135,13 @@ class TaskExecutor {
|
|
|
435
135
|
return task;
|
|
436
136
|
}
|
|
437
137
|
async runPlans(title, plans, modelConfig) {
|
|
438
|
-
const
|
|
439
|
-
onTaskStart: this.onTaskStartCallback
|
|
440
|
-
});
|
|
138
|
+
const session = this.createExecutionSession(title);
|
|
441
139
|
const { tasks } = await this.convertPlanToExecutable(plans, modelConfig);
|
|
442
|
-
await
|
|
443
|
-
const result = await taskExecutor.flush();
|
|
140
|
+
const result = await session.appendAndRun(tasks);
|
|
444
141
|
const { output } = result;
|
|
445
142
|
return {
|
|
446
143
|
output,
|
|
447
|
-
|
|
144
|
+
runner: session.getRunner()
|
|
448
145
|
};
|
|
449
146
|
}
|
|
450
147
|
getReplanningCycleLimit(isVlmUiTars) {
|
|
@@ -452,38 +149,35 @@ class TaskExecutor {
|
|
|
452
149
|
}
|
|
453
150
|
async action(userPrompt, modelConfig, actionContext, cacheable) {
|
|
454
151
|
this.conversationHistory.reset();
|
|
455
|
-
const
|
|
456
|
-
|
|
457
|
-
});
|
|
152
|
+
const session = this.createExecutionSession(taskTitleStr('Action', userPrompt));
|
|
153
|
+
const runner = session.getRunner();
|
|
458
154
|
let replanCount = 0;
|
|
459
155
|
const yamlFlow = [];
|
|
460
156
|
const replanningCycleLimit = this.getReplanningCycleLimit('vlm-ui-tars' === modelConfig.vlMode);
|
|
461
157
|
while(true){
|
|
462
158
|
if (replanCount > replanningCycleLimit) {
|
|
463
159
|
const errorMsg = `Replanning ${replanningCycleLimit} times, which is more than the limit, please split the task into multiple steps`;
|
|
464
|
-
return
|
|
160
|
+
return session.appendErrorPlan(errorMsg);
|
|
465
161
|
}
|
|
466
162
|
const planningTask = this.createPlanningTask(userPrompt, actionContext, modelConfig);
|
|
467
|
-
await
|
|
468
|
-
const result = await taskExecutor.flush();
|
|
163
|
+
const result = await session.appendAndRun(planningTask);
|
|
469
164
|
const planResult = null == result ? void 0 : result.output;
|
|
470
|
-
if (
|
|
165
|
+
if (session.isInErrorState()) return {
|
|
471
166
|
output: planResult,
|
|
472
|
-
|
|
167
|
+
runner
|
|
473
168
|
};
|
|
474
169
|
const plans = planResult.actions || [];
|
|
475
170
|
yamlFlow.push(...planResult.yamlFlow || []);
|
|
476
171
|
let executables;
|
|
477
172
|
try {
|
|
478
173
|
executables = await this.convertPlanToExecutable(plans, modelConfig, cacheable);
|
|
479
|
-
|
|
174
|
+
await session.appendAndRun(executables.tasks);
|
|
480
175
|
} catch (error) {
|
|
481
|
-
return
|
|
176
|
+
return session.appendErrorPlan(`Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
|
|
482
177
|
}
|
|
483
|
-
|
|
484
|
-
if (taskExecutor.isInErrorState()) return {
|
|
178
|
+
if (session.isInErrorState()) return {
|
|
485
179
|
output: void 0,
|
|
486
|
-
|
|
180
|
+
runner
|
|
487
181
|
};
|
|
488
182
|
if (!planResult.more_actions_needed_by_instruction) break;
|
|
489
183
|
replanCount++;
|
|
@@ -492,7 +186,7 @@ class TaskExecutor {
|
|
|
492
186
|
output: {
|
|
493
187
|
yamlFlow
|
|
494
188
|
},
|
|
495
|
-
|
|
189
|
+
runner
|
|
496
190
|
};
|
|
497
191
|
}
|
|
498
192
|
createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt) {
|
|
@@ -508,23 +202,15 @@ class TaskExecutor {
|
|
|
508
202
|
},
|
|
509
203
|
executor: async (param, taskContext)=>{
|
|
510
204
|
const { task } = taskContext;
|
|
511
|
-
let
|
|
512
|
-
const
|
|
513
|
-
|
|
514
|
-
|
|
515
|
-
|
|
516
|
-
|
|
517
|
-
const uiContext = await this.insight.contextRetrieverFn('extract');
|
|
518
|
-
task.uiContext = uiContext;
|
|
519
|
-
const recordItem = {
|
|
520
|
-
type: 'screenshot',
|
|
521
|
-
ts: shotTime,
|
|
522
|
-
screenshot: uiContext.screenshotBase64,
|
|
523
|
-
timing: 'before Extract'
|
|
205
|
+
let queryDump;
|
|
206
|
+
const applyDump = (dump)=>{
|
|
207
|
+
queryDump = dump;
|
|
208
|
+
task.log = {
|
|
209
|
+
dump
|
|
210
|
+
};
|
|
524
211
|
};
|
|
525
|
-
|
|
526
|
-
|
|
527
|
-
];
|
|
212
|
+
const uiContext = taskContext.uiContext;
|
|
213
|
+
assert(uiContext, 'uiContext is required for Query task');
|
|
528
214
|
const ifTypeRestricted = 'Query' !== type;
|
|
529
215
|
let demandInput = demand;
|
|
530
216
|
let keyOfResult = 'result';
|
|
@@ -537,7 +223,15 @@ class TaskExecutor {
|
|
|
537
223
|
} else if (ifTypeRestricted) demandInput = {
|
|
538
224
|
[keyOfResult]: `${type}, ${demand}`
|
|
539
225
|
};
|
|
540
|
-
|
|
226
|
+
let extractResult;
|
|
227
|
+
try {
|
|
228
|
+
extractResult = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
|
|
229
|
+
} catch (error) {
|
|
230
|
+
if (error instanceof InsightError) applyDump(error.dump);
|
|
231
|
+
throw error;
|
|
232
|
+
}
|
|
233
|
+
const { data, usage, thought, dump } = extractResult;
|
|
234
|
+
applyDump(dump);
|
|
541
235
|
let outputResult = data;
|
|
542
236
|
if (ifTypeRestricted) if ('string' == typeof data) outputResult = data;
|
|
543
237
|
else {
|
|
@@ -546,7 +240,7 @@ class TaskExecutor {
|
|
|
546
240
|
}
|
|
547
241
|
return {
|
|
548
242
|
output: outputResult,
|
|
549
|
-
log:
|
|
243
|
+
log: queryDump,
|
|
550
244
|
usage,
|
|
551
245
|
thought
|
|
552
246
|
};
|
|
@@ -555,36 +249,15 @@ class TaskExecutor {
|
|
|
555
249
|
return queryTask;
|
|
556
250
|
}
|
|
557
251
|
async createTypeQueryExecution(type, demand, modelConfig, opt, multimodalPrompt) {
|
|
558
|
-
const
|
|
559
|
-
onTaskStart: this.onTaskStartCallback
|
|
560
|
-
});
|
|
252
|
+
const session = this.createExecutionSession(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)));
|
|
561
253
|
const queryTask = await this.createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt);
|
|
562
|
-
await
|
|
563
|
-
const result = await taskExecutor.flush();
|
|
254
|
+
const result = await session.appendAndRun(queryTask);
|
|
564
255
|
if (!result) throw new Error('result of taskExecutor.flush() is undefined in function createTypeQueryTask');
|
|
565
256
|
const { output, thought } = result;
|
|
566
257
|
return {
|
|
567
258
|
output,
|
|
568
259
|
thought,
|
|
569
|
-
|
|
570
|
-
};
|
|
571
|
-
}
|
|
572
|
-
async appendErrorPlan(taskExecutor, errorMsg, modelConfig) {
|
|
573
|
-
const errorPlan = {
|
|
574
|
-
type: 'Error',
|
|
575
|
-
param: {
|
|
576
|
-
thought: errorMsg
|
|
577
|
-
},
|
|
578
|
-
locate: null
|
|
579
|
-
};
|
|
580
|
-
const { tasks } = await this.convertPlanToExecutable([
|
|
581
|
-
errorPlan
|
|
582
|
-
], modelConfig);
|
|
583
|
-
await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
|
|
584
|
-
await taskExecutor.flush();
|
|
585
|
-
return {
|
|
586
|
-
output: void 0,
|
|
587
|
-
executor: taskExecutor
|
|
260
|
+
runner: session.getRunner()
|
|
588
261
|
};
|
|
589
262
|
}
|
|
590
263
|
async taskForSleep(timeMs, modelConfig) {
|
|
@@ -598,14 +271,13 @@ class TaskExecutor {
|
|
|
598
271
|
const { tasks: sleepTasks } = await this.convertPlanToExecutable([
|
|
599
272
|
sleepPlan
|
|
600
273
|
], modelConfig);
|
|
601
|
-
return
|
|
274
|
+
return sleepTasks[0];
|
|
602
275
|
}
|
|
603
276
|
async waitFor(assertion, opt, modelConfig) {
|
|
604
277
|
const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
|
|
605
278
|
const description = `waitFor: ${textPrompt}`;
|
|
606
|
-
const
|
|
607
|
-
|
|
608
|
-
});
|
|
279
|
+
const session = this.createExecutionSession(taskTitleStr('WaitFor', description));
|
|
280
|
+
const runner = session.getRunner();
|
|
609
281
|
const { timeoutMs, checkIntervalMs } = opt;
|
|
610
282
|
assert(assertion, 'No assertion for waitFor');
|
|
611
283
|
assert(timeoutMs, 'No timeoutMs for waitFor');
|
|
@@ -619,26 +291,26 @@ class TaskExecutor {
|
|
|
619
291
|
const queryTask = await this.createTypeQueryTask('WaitFor', textPrompt, modelConfig, {
|
|
620
292
|
doNotThrowError: true
|
|
621
293
|
}, multimodalPrompt);
|
|
622
|
-
await
|
|
623
|
-
const result = await taskExecutor.flush();
|
|
294
|
+
const result = await session.appendAndRun(queryTask);
|
|
624
295
|
if (null == result ? void 0 : result.output) return {
|
|
625
296
|
output: void 0,
|
|
626
|
-
|
|
297
|
+
runner
|
|
627
298
|
};
|
|
628
299
|
errorThought = (null == result ? void 0 : result.thought) || !result && `No result from assertion: ${textPrompt}` || `unknown error when waiting for assertion: ${textPrompt}`;
|
|
629
300
|
const now = Date.now();
|
|
630
301
|
if (now - startTime < checkIntervalMs) {
|
|
631
302
|
const timeRemaining = checkIntervalMs - (now - startTime);
|
|
632
303
|
const sleepTask = await this.taskForSleep(timeRemaining, modelConfig);
|
|
633
|
-
await
|
|
304
|
+
await session.append(sleepTask);
|
|
634
305
|
}
|
|
635
306
|
}
|
|
636
|
-
return
|
|
307
|
+
return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
|
|
637
308
|
}
|
|
638
309
|
constructor(interfaceInstance, insight, opts){
|
|
639
310
|
_define_property(this, "interface", void 0);
|
|
640
311
|
_define_property(this, "insight", void 0);
|
|
641
312
|
_define_property(this, "taskCache", void 0);
|
|
313
|
+
_define_property(this, "taskBuilder", void 0);
|
|
642
314
|
_define_property(this, "conversationHistory", void 0);
|
|
643
315
|
_define_property(this, "onTaskStartCallback", void 0);
|
|
644
316
|
_define_property(this, "replanningCycleLimit", void 0);
|
|
@@ -648,6 +320,11 @@ class TaskExecutor {
|
|
|
648
320
|
this.onTaskStartCallback = null == opts ? void 0 : opts.onTaskStart;
|
|
649
321
|
this.replanningCycleLimit = opts.replanningCycleLimit;
|
|
650
322
|
this.conversationHistory = new ConversationHistory();
|
|
323
|
+
this.taskBuilder = new TaskBuilder({
|
|
324
|
+
interfaceInstance,
|
|
325
|
+
insight,
|
|
326
|
+
taskCache: opts.taskCache
|
|
327
|
+
});
|
|
651
328
|
}
|
|
652
329
|
}
|
|
653
330
|
export { TaskExecutor, locatePlanForLocate };
|