@midscene/core 0.30.6-beta-20251022093704.0 → 1.0.1-beta-20251022061922.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (89) hide show
  1. package/dist/es/agent/agent.mjs +41 -33
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/execution-session.mjs +41 -0
  4. package/dist/es/agent/execution-session.mjs.map +1 -0
  5. package/dist/es/agent/task-builder.mjs +315 -0
  6. package/dist/es/agent/task-builder.mjs.map +1 -0
  7. package/dist/es/agent/tasks.mjs +80 -405
  8. package/dist/es/agent/tasks.mjs.map +1 -1
  9. package/dist/es/agent/ui-utils.mjs.map +1 -1
  10. package/dist/es/agent/utils.mjs +6 -6
  11. package/dist/es/agent/utils.mjs.map +1 -1
  12. package/dist/es/ai-model/common.mjs +1 -15
  13. package/dist/es/ai-model/common.mjs.map +1 -1
  14. package/dist/es/ai-model/inspect.mjs +2 -3
  15. package/dist/es/ai-model/inspect.mjs.map +1 -1
  16. package/dist/es/ai-model/llm-planning.mjs +11 -30
  17. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  18. package/dist/es/ai-model/prompt/llm-locator.mjs +3 -204
  19. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  20. package/dist/es/ai-model/service-caller/index.mjs +101 -231
  21. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  22. package/dist/es/index.mjs +3 -2
  23. package/dist/es/index.mjs.map +1 -1
  24. package/dist/es/insight/index.mjs +18 -19
  25. package/dist/es/insight/index.mjs.map +1 -1
  26. package/dist/es/insight/utils.mjs +3 -3
  27. package/dist/es/insight/utils.mjs.map +1 -1
  28. package/dist/es/report.mjs.map +1 -1
  29. package/dist/es/{ai-model/action-executor.mjs → task-runner.mjs} +81 -10
  30. package/dist/es/task-runner.mjs.map +1 -0
  31. package/dist/es/types.mjs +18 -1
  32. package/dist/es/types.mjs.map +1 -1
  33. package/dist/es/utils.mjs +2 -2
  34. package/dist/es/yaml/player.mjs +18 -14
  35. package/dist/es/yaml/player.mjs.map +1 -1
  36. package/dist/lib/agent/agent.js +41 -33
  37. package/dist/lib/agent/agent.js.map +1 -1
  38. package/dist/lib/agent/execution-session.js +75 -0
  39. package/dist/lib/agent/execution-session.js.map +1 -0
  40. package/dist/lib/agent/task-builder.js +352 -0
  41. package/dist/lib/agent/task-builder.js.map +1 -0
  42. package/dist/lib/agent/tasks.js +80 -405
  43. package/dist/lib/agent/tasks.js.map +1 -1
  44. package/dist/lib/agent/ui-utils.js.map +1 -1
  45. package/dist/lib/agent/utils.js +6 -6
  46. package/dist/lib/agent/utils.js.map +1 -1
  47. package/dist/lib/ai-model/common.js +2 -19
  48. package/dist/lib/ai-model/common.js.map +1 -1
  49. package/dist/lib/ai-model/inspect.js +1 -2
  50. package/dist/lib/ai-model/inspect.js.map +1 -1
  51. package/dist/lib/ai-model/llm-planning.js +10 -29
  52. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  53. package/dist/lib/ai-model/prompt/llm-locator.js +2 -206
  54. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  55. package/dist/lib/ai-model/service-caller/index.js +236 -384
  56. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  57. package/dist/lib/index.js +9 -5
  58. package/dist/lib/index.js.map +1 -1
  59. package/dist/lib/insight/index.js +17 -18
  60. package/dist/lib/insight/index.js.map +1 -1
  61. package/dist/lib/insight/utils.js +5 -5
  62. package/dist/lib/insight/utils.js.map +1 -1
  63. package/dist/lib/report.js.map +1 -1
  64. package/dist/lib/{ai-model/action-executor.js → task-runner.js} +83 -12
  65. package/dist/lib/task-runner.js.map +1 -0
  66. package/dist/lib/types.js +22 -1
  67. package/dist/lib/types.js.map +1 -1
  68. package/dist/lib/utils.js +2 -2
  69. package/dist/lib/yaml/player.js +18 -14
  70. package/dist/lib/yaml/player.js.map +1 -1
  71. package/dist/types/agent/agent.d.ts +16 -0
  72. package/dist/types/agent/execution-session.d.ts +27 -0
  73. package/dist/types/agent/task-builder.d.ts +34 -0
  74. package/dist/types/agent/tasks.d.ts +14 -13
  75. package/dist/types/agent/ui-utils.d.ts +2 -2
  76. package/dist/types/agent/utils.d.ts +6 -2
  77. package/dist/types/ai-model/common.d.ts +0 -1
  78. package/dist/types/ai-model/prompt/llm-locator.d.ts +0 -2
  79. package/dist/types/device/index.d.ts +20 -20
  80. package/dist/types/index.d.ts +4 -3
  81. package/dist/types/insight/index.d.ts +5 -10
  82. package/dist/types/insight/utils.d.ts +2 -2
  83. package/dist/types/task-runner.d.ts +31 -0
  84. package/dist/types/types.d.ts +53 -14
  85. package/dist/types/yaml.d.ts +3 -1
  86. package/package.json +4 -7
  87. package/dist/es/ai-model/action-executor.mjs.map +0 -1
  88. package/dist/lib/ai-model/action-executor.js.map +0 -1
  89. package/dist/types/ai-model/action-executor.d.ts +0 -19
@@ -1,11 +1,12 @@
1
- import { ConversationHistory, findAllMidsceneLocatorField, parseActionParam, plan as index_mjs_plan, uiTarsPlanning } from "../ai-model/index.mjs";
2
- import { Executor } from "../ai-model/action-executor.mjs";
3
- import { sleep as external_utils_mjs_sleep } from "../utils.mjs";
1
+ import { ConversationHistory, plan, uiTarsPlanning } from "../ai-model/index.mjs";
2
+ import { InsightError } from "../types.mjs";
4
3
  import { MIDSCENE_REPLANNING_CYCLE_LIMIT, globalConfigManager } from "@midscene/shared/env";
5
4
  import { getDebug } from "@midscene/shared/logger";
6
5
  import { assert } from "@midscene/shared/utils";
6
+ import { ExecutionSession } from "./execution-session.mjs";
7
+ import { TaskBuilder, locatePlanForLocate } from "./task-builder.mjs";
7
8
  import { taskTitleStr } from "./ui-utils.mjs";
8
- import { matchElementFromCache, matchElementFromPlan, parsePrompt } from "./utils.mjs";
9
+ import { parsePrompt } from "./utils.mjs";
9
10
  function _define_property(obj, key, value) {
10
11
  if (key in obj) Object.defineProperty(obj, key, {
11
12
  value: value,
@@ -19,325 +20,21 @@ function _define_property(obj, key, value) {
19
20
  const debug = getDebug('device-task-executor');
20
21
  const defaultReplanningCycleLimit = 10;
21
22
  const defaultVlmUiTarsReplanningCycleLimit = 40;
22
- function locatePlanForLocate(param) {
23
- const locate = 'string' == typeof param ? {
24
- prompt: param
25
- } : param;
26
- const locatePlan = {
27
- type: 'Locate',
28
- locate,
29
- param: locate,
30
- thought: ''
31
- };
32
- return locatePlan;
33
- }
34
23
  class TaskExecutor {
35
24
  get page() {
36
25
  return this.interface;
37
26
  }
38
- async recordScreenshot(timing) {
39
- const base64 = await this.interface.screenshotBase64();
40
- const item = {
41
- type: 'screenshot',
42
- ts: Date.now(),
43
- screenshot: base64,
44
- timing
45
- };
46
- return item;
47
- }
48
- prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
49
- const taskWithScreenshot = {
50
- ...taskApply,
51
- executor: async (param, context, ...args)=>{
52
- const recorder = [];
53
- const { task } = context;
54
- task.recorder = recorder;
55
- const shot = await this.recordScreenshot(`before ${task.type}`);
56
- recorder.push(shot);
57
- const result = await taskApply.executor(param, context, ...args);
58
- if (appendAfterExecution) {
59
- const shot2 = await this.recordScreenshot('after Action');
60
- recorder.push(shot2);
61
- }
62
- return result;
63
- }
64
- };
65
- return taskWithScreenshot;
66
- }
67
- async convertPlanToExecutable(plans, modelConfig, cacheable) {
68
- const tasks = [];
69
- const taskForLocatePlan = (plan, detailedLocateParam, onResult)=>{
70
- if ('string' == typeof detailedLocateParam) detailedLocateParam = {
71
- prompt: detailedLocateParam
72
- };
73
- if (void 0 !== cacheable) detailedLocateParam = {
74
- ...detailedLocateParam,
75
- cacheable
76
- };
77
- const taskFind = {
78
- type: 'Insight',
79
- subType: 'Locate',
80
- param: detailedLocateParam,
81
- thought: plan.thought,
82
- executor: async (param, taskContext)=>{
83
- var _this_taskCache, _locateCacheRecord_cacheContent;
84
- const { task } = taskContext;
85
- assert((null == param ? void 0 : param.prompt) || (null == param ? void 0 : param.id) || (null == param ? void 0 : param.bbox), `No prompt or id or position or bbox to locate, param=${JSON.stringify(param)}`);
86
- let insightDump;
87
- let usage;
88
- const dumpCollector = (dump)=>{
89
- var _dump_taskInfo, _dump_taskInfo1;
90
- insightDump = dump;
91
- usage = null == dump ? void 0 : null == (_dump_taskInfo = dump.taskInfo) ? void 0 : _dump_taskInfo.usage;
92
- task.log = {
93
- dump: insightDump
94
- };
95
- task.usage = usage;
96
- if (null == dump ? void 0 : null == (_dump_taskInfo1 = dump.taskInfo) ? void 0 : _dump_taskInfo1.searchAreaUsage) task.searchAreaUsage = dump.taskInfo.searchAreaUsage;
97
- };
98
- this.insight.onceDumpUpdatedFn = dumpCollector;
99
- const shotTime = Date.now();
100
- const uiContext = await this.insight.contextRetrieverFn('locate');
101
- task.uiContext = uiContext;
102
- const recordItem = {
103
- type: 'screenshot',
104
- ts: shotTime,
105
- screenshot: uiContext.screenshotBase64,
106
- timing: 'before Insight'
107
- };
108
- task.recorder = [
109
- recordItem
110
- ];
111
- const elementFromXpath = param.xpath && this.interface.getElementInfoByXpath ? await this.interface.getElementInfoByXpath(param.xpath) : void 0;
112
- const userExpectedPathHitFlag = !!elementFromXpath;
113
- const cachePrompt = param.prompt;
114
- const locateCacheRecord = null == (_this_taskCache = this.taskCache) ? void 0 : _this_taskCache.matchLocateCache(cachePrompt);
115
- const cacheEntry = null == locateCacheRecord ? void 0 : null == (_locateCacheRecord_cacheContent = locateCacheRecord.cacheContent) ? void 0 : _locateCacheRecord_cacheContent.cache;
116
- const elementFromCache = userExpectedPathHitFlag ? null : await matchElementFromCache(this, cacheEntry, cachePrompt, param.cacheable);
117
- const cacheHitFlag = !!elementFromCache;
118
- const elementFromPlan = userExpectedPathHitFlag || cacheHitFlag ? void 0 : matchElementFromPlan(param, uiContext.tree);
119
- const planHitFlag = !!elementFromPlan;
120
- const elementFromAiLocate = userExpectedPathHitFlag || cacheHitFlag || planHitFlag ? void 0 : (await this.insight.locate(param, {
121
- context: uiContext
122
- }, modelConfig)).element;
123
- const aiLocateHitFlag = !!elementFromAiLocate;
124
- const element = elementFromXpath || elementFromCache || elementFromPlan || elementFromAiLocate;
125
- let currentCacheEntry;
126
- if (element && this.taskCache && !cacheHitFlag && (null == param ? void 0 : param.cacheable) !== false) if (this.interface.cacheFeatureForRect) try {
127
- const feature = await this.interface.cacheFeatureForRect(element.rect, void 0 !== element.isOrderSensitive ? {
128
- _orderSensitive: element.isOrderSensitive
129
- } : void 0);
130
- if (feature && Object.keys(feature).length > 0) {
131
- debug('update cache, prompt: %s, cache: %o', cachePrompt, feature);
132
- currentCacheEntry = feature;
133
- this.taskCache.updateOrAppendCacheRecord({
134
- type: 'locate',
135
- prompt: cachePrompt,
136
- cache: feature
137
- }, locateCacheRecord);
138
- } else debug('no cache data returned, skip cache update, prompt: %s', cachePrompt);
139
- } catch (error) {
140
- debug('cacheFeatureForRect failed: %s', error);
141
- }
142
- else debug('cacheFeatureForRect is not supported, skip cache update');
143
- if (!element) throw new Error(`Element not found: ${param.prompt}`);
144
- let hitBy;
145
- if (userExpectedPathHitFlag) hitBy = {
146
- from: 'User expected path',
147
- context: {
148
- xpath: param.xpath
149
- }
150
- };
151
- else if (cacheHitFlag) hitBy = {
152
- from: 'Cache',
153
- context: {
154
- cacheEntry,
155
- cacheToSave: currentCacheEntry
156
- }
157
- };
158
- else if (planHitFlag) hitBy = {
159
- from: 'Planning',
160
- context: {
161
- id: null == elementFromPlan ? void 0 : elementFromPlan.id,
162
- bbox: null == elementFromPlan ? void 0 : elementFromPlan.bbox
163
- }
164
- };
165
- else if (aiLocateHitFlag) hitBy = {
166
- from: 'AI model',
167
- context: {
168
- prompt: param.prompt
169
- }
170
- };
171
- null == onResult || onResult(element);
172
- return {
173
- output: {
174
- element
175
- },
176
- uiContext,
177
- hitBy
178
- };
179
- }
180
- };
181
- return taskFind;
182
- };
183
- for (const plan of plans)if ('Locate' === plan.type) {
184
- var _plan_locate, _plan_locate1;
185
- if (!plan.locate || null === plan.locate || (null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.id) === null || (null == (_plan_locate1 = plan.locate) ? void 0 : _plan_locate1.id) === 'null') {
186
- debug('Locate action with id is null, will be ignored', plan);
187
- continue;
188
- }
189
- const taskLocate = taskForLocatePlan(plan, plan.locate);
190
- tasks.push(taskLocate);
191
- } else if ('Error' === plan.type) {
192
- var _plan_param;
193
- const taskActionError = {
194
- type: 'Action',
195
- subType: 'Error',
196
- param: plan.param,
197
- thought: plan.thought || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought),
198
- locate: plan.locate,
199
- executor: async ()=>{
200
- var _plan_param;
201
- throw new Error((null == plan ? void 0 : plan.thought) || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought) || 'error without thought');
202
- }
203
- };
204
- tasks.push(taskActionError);
205
- } else if ('Finished' === plan.type) {
206
- const taskActionFinished = {
207
- type: 'Action',
208
- subType: 'Finished',
209
- param: null,
210
- thought: plan.thought,
211
- locate: plan.locate,
212
- executor: async (param)=>{}
213
- };
214
- tasks.push(taskActionFinished);
215
- } else if ('Sleep' === plan.type) {
216
- const taskActionSleep = {
217
- type: 'Action',
218
- subType: 'Sleep',
219
- param: plan.param,
220
- thought: plan.thought,
221
- locate: plan.locate,
222
- executor: async (taskParam)=>{
223
- await external_utils_mjs_sleep((null == taskParam ? void 0 : taskParam.timeMs) || 3000);
224
- }
225
- };
226
- tasks.push(taskActionSleep);
227
- } else {
228
- const planType = plan.type;
229
- const actionSpace = await this.interface.actionSpace();
230
- const action = actionSpace.find((action)=>action.name === planType);
231
- const param = plan.param;
232
- if (!action) throw new Error(`Action type '${planType}' not found`);
233
- const locateFields = action ? findAllMidsceneLocatorField(action.paramSchema) : [];
234
- const requiredLocateFields = action ? findAllMidsceneLocatorField(action.paramSchema, true) : [];
235
- locateFields.forEach((field)=>{
236
- if (param[field]) {
237
- const locatePlan = locatePlanForLocate(param[field]);
238
- debug('will prepend locate param for field', `action.type=${planType}`, `param=${JSON.stringify(param[field])}`, `locatePlan=${JSON.stringify(locatePlan)}`);
239
- const locateTask = taskForLocatePlan(locatePlan, param[field], (result)=>{
240
- param[field] = result;
241
- });
242
- tasks.push(locateTask);
243
- } else {
244
- assert(!requiredLocateFields.includes(field), `Required locate field '${field}' is not provided for action ${planType}`);
245
- debug(`field '${field}' is not provided for action ${planType}`);
246
- }
247
- });
248
- const task = {
249
- type: 'Action',
250
- subType: planType,
251
- thought: plan.thought,
252
- param: plan.param,
253
- executor: async (param, context)=>{
254
- var _context_element;
255
- debug('executing action', planType, param, `context.element.center: ${null == (_context_element = context.element) ? void 0 : _context_element.center}`);
256
- const uiContext = await this.insight.contextRetrieverFn('locate');
257
- context.task.uiContext = uiContext;
258
- requiredLocateFields.forEach((field)=>{
259
- assert(param[field], `field '${field}' is required for action ${planType} but not provided. Cannot execute action ${planType}.`);
260
- });
261
- try {
262
- await Promise.all([
263
- (async ()=>{
264
- if (this.interface.beforeInvokeAction) {
265
- debug('will call "beforeInvokeAction" for interface');
266
- await this.interface.beforeInvokeAction(action.name, param);
267
- debug('called "beforeInvokeAction" for interface');
268
- }
269
- })(),
270
- external_utils_mjs_sleep(200)
271
- ]);
272
- } catch (originalError) {
273
- const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
274
- throw new Error(`error in running beforeInvokeAction for ${action.name}: ${originalMessage}`, {
275
- cause: originalError
276
- });
277
- }
278
- if (action.paramSchema) try {
279
- param = parseActionParam(param, action.paramSchema);
280
- } catch (error) {
281
- throw new Error(`Invalid parameters for action ${action.name}: ${error.message}\nParameters: ${JSON.stringify(param)}`, {
282
- cause: error
283
- });
284
- }
285
- debug('calling action', action.name);
286
- const actionFn = action.call.bind(this.interface);
287
- await actionFn(param, context);
288
- debug('called action', action.name);
289
- try {
290
- if (this.interface.afterInvokeAction) {
291
- debug('will call "afterInvokeAction" for interface');
292
- await this.interface.afterInvokeAction(action.name, param);
293
- debug('called "afterInvokeAction" for interface');
294
- }
295
- } catch (originalError) {
296
- const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
297
- throw new Error(`error in running afterInvokeAction for ${action.name}: ${originalMessage}`, {
298
- cause: originalError
299
- });
300
- }
301
- return {
302
- output: {
303
- success: true,
304
- action: planType,
305
- param: param
306
- }
307
- };
308
- }
309
- };
310
- tasks.push(task);
311
- }
312
- const wrappedTasks = tasks.map((task, index)=>{
313
- if ('Action' === task.type) return this.prependExecutorWithScreenshot(task, index === tasks.length - 1);
314
- return task;
27
+ createExecutionSession(title, options) {
28
+ return new ExecutionSession(title, ()=>Promise.resolve(this.insight.contextRetrieverFn()), {
29
+ onTaskStart: this.onTaskStartCallback,
30
+ tasks: null == options ? void 0 : options.tasks
315
31
  });
316
- return {
317
- tasks: wrappedTasks
318
- };
319
32
  }
320
- async setupPlanningContext(executorContext) {
321
- const shotTime = Date.now();
322
- const uiContext = await this.insight.contextRetrieverFn('locate');
323
- const recordItem = {
324
- type: 'screenshot',
325
- ts: shotTime,
326
- screenshot: uiContext.screenshotBase64,
327
- timing: 'before Planning'
328
- };
329
- executorContext.task.recorder = [
330
- recordItem
331
- ];
332
- executorContext.task.uiContext = uiContext;
333
- return {
334
- uiContext
335
- };
33
+ async convertPlanToExecutable(plans, modelConfig, options) {
34
+ return this.taskBuilder.build(plans, modelConfig, options);
336
35
  }
337
36
  async loadYamlFlowAsPlanning(userInstruction, yamlString) {
338
- const taskExecutor = new Executor(taskTitleStr('Action', userInstruction), {
339
- onTaskStart: this.onTaskStartCallback
340
- });
37
+ const session = this.createExecutionSession(taskTitleStr('Action', userInstruction));
341
38
  const task = {
342
39
  type: 'Planning',
343
40
  subType: 'LoadYaml',
@@ -346,7 +43,8 @@ class TaskExecutor {
346
43
  userInstruction
347
44
  },
348
45
  executor: async (param, executorContext)=>{
349
- await this.setupPlanningContext(executorContext);
46
+ const { uiContext } = executorContext;
47
+ assert(uiContext, 'uiContext is required for Planning task');
350
48
  return {
351
49
  output: {
352
50
  actions: [],
@@ -366,10 +64,9 @@ class TaskExecutor {
366
64
  };
367
65
  }
368
66
  };
369
- await taskExecutor.append(task);
370
- await taskExecutor.flush();
67
+ await session.appendAndRun(task);
371
68
  return {
372
- executor: taskExecutor
69
+ runner: session.getRunner()
373
70
  };
374
71
  }
375
72
  createPlanningTask(userInstruction, actionContext, modelConfig) {
@@ -382,7 +79,8 @@ class TaskExecutor {
382
79
  },
383
80
  executor: async (param, executorContext)=>{
384
81
  const startTime = Date.now();
385
- const { uiContext } = await this.setupPlanningContext(executorContext);
82
+ const { uiContext } = executorContext;
83
+ assert(uiContext, 'uiContext is required for Planning task');
386
84
  const { vlMode } = modelConfig;
387
85
  const uiTarsModelVersion = 'vlm-ui-tars' === vlMode ? modelConfig.uiTarsModelVersion : void 0;
388
86
  assert(this.interface.actionSpace, 'actionSpace for device is not implemented');
@@ -390,7 +88,7 @@ class TaskExecutor {
390
88
  debug('actionSpace for this interface is:', actionSpace.map((action)=>action.name).join(', '));
391
89
  assert(Array.isArray(actionSpace), 'actionSpace must be an array');
392
90
  if (0 === actionSpace.length) console.warn(`ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`);
393
- const planResult = await (uiTarsModelVersion ? uiTarsPlanning : index_mjs_plan)(param.userInstruction, {
91
+ const planResult = await (uiTarsModelVersion ? uiTarsPlanning : plan)(param.userInstruction, {
394
92
  context: uiContext,
395
93
  actionContext,
396
94
  interfaceType: this.interface.interfaceType,
@@ -409,13 +107,7 @@ class TaskExecutor {
409
107
  if (sleep) {
410
108
  const timeNow = Date.now();
411
109
  const timeRemaining = sleep - (timeNow - startTime);
412
- if (timeRemaining > 0) finalActions.push({
413
- type: 'Sleep',
414
- param: {
415
- timeMs: timeRemaining
416
- },
417
- locate: null
418
- });
110
+ if (timeRemaining > 0) finalActions.push(this.sleepPlan(timeRemaining));
419
111
  }
420
112
  if (0 === finalActions.length) assert(!more_actions_needed_by_instruction || sleep, error ? `Failed to plan: ${error}` : 'No plan found');
421
113
  return {
@@ -435,16 +127,13 @@ class TaskExecutor {
435
127
  return task;
436
128
  }
437
129
  async runPlans(title, plans, modelConfig) {
438
- const taskExecutor = new Executor(title, {
439
- onTaskStart: this.onTaskStartCallback
440
- });
130
+ const session = this.createExecutionSession(title);
441
131
  const { tasks } = await this.convertPlanToExecutable(plans, modelConfig);
442
- await taskExecutor.append(tasks);
443
- const result = await taskExecutor.flush();
132
+ const result = await session.appendAndRun(tasks);
444
133
  const { output } = result;
445
134
  return {
446
135
  output,
447
- executor: taskExecutor
136
+ runner: session.getRunner()
448
137
  };
449
138
  }
450
139
  getReplanningCycleLimit(isVlmUiTars) {
@@ -452,38 +141,38 @@ class TaskExecutor {
452
141
  }
453
142
  async action(userPrompt, modelConfig, actionContext, cacheable) {
454
143
  this.conversationHistory.reset();
455
- const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
456
- onTaskStart: this.onTaskStartCallback
457
- });
144
+ const session = this.createExecutionSession(taskTitleStr('Action', userPrompt));
145
+ const runner = session.getRunner();
458
146
  let replanCount = 0;
459
147
  const yamlFlow = [];
460
148
  const replanningCycleLimit = this.getReplanningCycleLimit('vlm-ui-tars' === modelConfig.vlMode);
461
149
  while(true){
462
150
  if (replanCount > replanningCycleLimit) {
463
151
  const errorMsg = `Replanning ${replanningCycleLimit} times, which is more than the limit, please split the task into multiple steps`;
464
- return this.appendErrorPlan(taskExecutor, errorMsg, modelConfig);
152
+ return session.appendErrorPlan(errorMsg);
465
153
  }
466
154
  const planningTask = this.createPlanningTask(userPrompt, actionContext, modelConfig);
467
- await taskExecutor.append(planningTask);
468
- const result = await taskExecutor.flush();
155
+ const result = await session.appendAndRun(planningTask);
469
156
  const planResult = null == result ? void 0 : result.output;
470
- if (taskExecutor.isInErrorState()) return {
157
+ if (session.isInErrorState()) return {
471
158
  output: planResult,
472
- executor: taskExecutor
159
+ runner
473
160
  };
474
161
  const plans = planResult.actions || [];
475
162
  yamlFlow.push(...planResult.yamlFlow || []);
476
163
  let executables;
477
164
  try {
478
- executables = await this.convertPlanToExecutable(plans, modelConfig, cacheable);
479
- taskExecutor.append(executables.tasks);
165
+ executables = await this.convertPlanToExecutable(plans, modelConfig, {
166
+ cacheable,
167
+ subTask: true
168
+ });
169
+ await session.appendAndRun(executables.tasks);
480
170
  } catch (error) {
481
- return this.appendErrorPlan(taskExecutor, `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`, modelConfig);
171
+ return session.appendErrorPlan(`Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
482
172
  }
483
- await taskExecutor.flush();
484
- if (taskExecutor.isInErrorState()) return {
173
+ if (session.isInErrorState()) return {
485
174
  output: void 0,
486
- executor: taskExecutor
175
+ runner
487
176
  };
488
177
  if (!planResult.more_actions_needed_by_instruction) break;
489
178
  replanCount++;
@@ -492,7 +181,7 @@ class TaskExecutor {
492
181
  output: {
493
182
  yamlFlow
494
183
  },
495
- executor: taskExecutor
184
+ runner
496
185
  };
497
186
  }
498
187
  createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt) {
@@ -508,23 +197,15 @@ class TaskExecutor {
508
197
  },
509
198
  executor: async (param, taskContext)=>{
510
199
  const { task } = taskContext;
511
- let insightDump;
512
- const dumpCollector = (dump)=>{
513
- insightDump = dump;
514
- };
515
- this.insight.onceDumpUpdatedFn = dumpCollector;
516
- const shotTime = Date.now();
517
- const uiContext = await this.insight.contextRetrieverFn('extract');
518
- task.uiContext = uiContext;
519
- const recordItem = {
520
- type: 'screenshot',
521
- ts: shotTime,
522
- screenshot: uiContext.screenshotBase64,
523
- timing: 'before Extract'
200
+ let queryDump;
201
+ const applyDump = (dump)=>{
202
+ queryDump = dump;
203
+ task.log = {
204
+ dump
205
+ };
524
206
  };
525
- task.recorder = [
526
- recordItem
527
- ];
207
+ const uiContext = taskContext.uiContext;
208
+ assert(uiContext, 'uiContext is required for Query task');
528
209
  const ifTypeRestricted = 'Query' !== type;
529
210
  let demandInput = demand;
530
211
  let keyOfResult = 'result';
@@ -537,7 +218,15 @@ class TaskExecutor {
537
218
  } else if (ifTypeRestricted) demandInput = {
538
219
  [keyOfResult]: `${type}, ${demand}`
539
220
  };
540
- const { data, usage, thought } = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
221
+ let extractResult;
222
+ try {
223
+ extractResult = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
224
+ } catch (error) {
225
+ if (error instanceof InsightError) applyDump(error.dump);
226
+ throw error;
227
+ }
228
+ const { data, usage, thought, dump } = extractResult;
229
+ applyDump(dump);
541
230
  let outputResult = data;
542
231
  if (ifTypeRestricted) if ('string' == typeof data) outputResult = data;
543
232
  else {
@@ -546,7 +235,7 @@ class TaskExecutor {
546
235
  }
547
236
  return {
548
237
  output: outputResult,
549
- log: insightDump,
238
+ log: queryDump,
550
239
  usage,
551
240
  thought
552
241
  };
@@ -555,57 +244,36 @@ class TaskExecutor {
555
244
  return queryTask;
556
245
  }
557
246
  async createTypeQueryExecution(type, demand, modelConfig, opt, multimodalPrompt) {
558
- const taskExecutor = new Executor(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)), {
559
- onTaskStart: this.onTaskStartCallback
560
- });
247
+ const session = this.createExecutionSession(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)));
561
248
  const queryTask = await this.createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt);
562
- await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
563
- const result = await taskExecutor.flush();
249
+ const result = await session.appendAndRun(queryTask);
564
250
  if (!result) throw new Error('result of taskExecutor.flush() is undefined in function createTypeQueryTask');
565
251
  const { output, thought } = result;
566
252
  return {
567
253
  output,
568
254
  thought,
569
- executor: taskExecutor
255
+ runner: session.getRunner()
570
256
  };
571
257
  }
572
- async appendErrorPlan(taskExecutor, errorMsg, modelConfig) {
573
- const errorPlan = {
574
- type: 'Error',
575
- param: {
576
- thought: errorMsg
577
- },
578
- locate: null
579
- };
580
- const { tasks } = await this.convertPlanToExecutable([
581
- errorPlan
582
- ], modelConfig);
583
- await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
584
- await taskExecutor.flush();
258
+ sleepPlan(timeMs) {
585
259
  return {
586
- output: void 0,
587
- executor: taskExecutor
588
- };
589
- }
590
- async taskForSleep(timeMs, modelConfig) {
591
- const sleepPlan = {
592
260
  type: 'Sleep',
593
261
  param: {
594
262
  timeMs
595
263
  },
596
264
  locate: null
597
265
  };
598
- const { tasks: sleepTasks } = await this.convertPlanToExecutable([
599
- sleepPlan
600
- ], modelConfig);
601
- return this.prependExecutorWithScreenshot(sleepTasks[0]);
266
+ }
267
+ async taskForSleep(timeMs, _modelConfig) {
268
+ return this.taskBuilder.createSleepTask({
269
+ timeMs
270
+ });
602
271
  }
603
272
  async waitFor(assertion, opt, modelConfig) {
604
273
  const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
605
274
  const description = `waitFor: ${textPrompt}`;
606
- const taskExecutor = new Executor(taskTitleStr('WaitFor', description), {
607
- onTaskStart: this.onTaskStartCallback
608
- });
275
+ const session = this.createExecutionSession(taskTitleStr('WaitFor', description));
276
+ const runner = session.getRunner();
609
277
  const { timeoutMs, checkIntervalMs } = opt;
610
278
  assert(assertion, 'No assertion for waitFor');
611
279
  assert(timeoutMs, 'No timeoutMs for waitFor');
@@ -619,26 +287,28 @@ class TaskExecutor {
619
287
  const queryTask = await this.createTypeQueryTask('WaitFor', textPrompt, modelConfig, {
620
288
  doNotThrowError: true
621
289
  }, multimodalPrompt);
622
- await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
623
- const result = await taskExecutor.flush();
290
+ const result = await session.appendAndRun(queryTask);
624
291
  if (null == result ? void 0 : result.output) return {
625
292
  output: void 0,
626
- executor: taskExecutor
293
+ runner
627
294
  };
628
295
  errorThought = (null == result ? void 0 : result.thought) || !result && `No result from assertion: ${textPrompt}` || `unknown error when waiting for assertion: ${textPrompt}`;
629
296
  const now = Date.now();
630
297
  if (now - startTime < checkIntervalMs) {
631
298
  const timeRemaining = checkIntervalMs - (now - startTime);
632
- const sleepTask = await this.taskForSleep(timeRemaining, modelConfig);
633
- await taskExecutor.append(sleepTask);
299
+ const sleepTask = this.taskBuilder.createSleepTask({
300
+ timeMs: timeRemaining
301
+ });
302
+ await session.append(sleepTask);
634
303
  }
635
304
  }
636
- return this.appendErrorPlan(taskExecutor, `waitFor timeout: ${errorThought}`, modelConfig);
305
+ return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
637
306
  }
638
307
  constructor(interfaceInstance, insight, opts){
639
308
  _define_property(this, "interface", void 0);
640
309
  _define_property(this, "insight", void 0);
641
310
  _define_property(this, "taskCache", void 0);
311
+ _define_property(this, "taskBuilder", void 0);
642
312
  _define_property(this, "conversationHistory", void 0);
643
313
  _define_property(this, "onTaskStartCallback", void 0);
644
314
  _define_property(this, "replanningCycleLimit", void 0);
@@ -648,6 +318,11 @@ class TaskExecutor {
648
318
  this.onTaskStartCallback = null == opts ? void 0 : opts.onTaskStart;
649
319
  this.replanningCycleLimit = opts.replanningCycleLimit;
650
320
  this.conversationHistory = new ConversationHistory();
321
+ this.taskBuilder = new TaskBuilder({
322
+ interfaceInstance,
323
+ insight,
324
+ taskCache: opts.taskCache
325
+ });
651
326
  }
652
327
  }
653
328
  export { TaskExecutor, locatePlanForLocate };