@midscene/core 0.30.5 → 1.0.1-beta-20251021060907.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (87)
  1. package/dist/es/agent/agent.mjs +41 -33
  2. package/dist/es/agent/agent.mjs.map +1 -1
  3. package/dist/es/agent/execution-session.mjs +41 -0
  4. package/dist/es/agent/execution-session.mjs.map +1 -0
  5. package/dist/es/agent/task-builder.mjs +303 -0
  6. package/dist/es/agent/task-builder.mjs.map +1 -0
  7. package/dist/es/agent/tasks.mjs +68 -391
  8. package/dist/es/agent/tasks.mjs.map +1 -1
  9. package/dist/es/agent/ui-utils.mjs.map +1 -1
  10. package/dist/es/agent/utils.mjs +6 -6
  11. package/dist/es/agent/utils.mjs.map +1 -1
  12. package/dist/es/ai-model/common.mjs +1 -15
  13. package/dist/es/ai-model/common.mjs.map +1 -1
  14. package/dist/es/ai-model/inspect.mjs +2 -3
  15. package/dist/es/ai-model/inspect.mjs.map +1 -1
  16. package/dist/es/ai-model/llm-planning.mjs +6 -24
  17. package/dist/es/ai-model/llm-planning.mjs.map +1 -1
  18. package/dist/es/ai-model/prompt/llm-locator.mjs +3 -204
  19. package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
  20. package/dist/es/ai-model/service-caller/index.mjs +101 -231
  21. package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
  22. package/dist/es/index.mjs +3 -2
  23. package/dist/es/index.mjs.map +1 -1
  24. package/dist/es/insight/index.mjs +18 -19
  25. package/dist/es/insight/index.mjs.map +1 -1
  26. package/dist/es/insight/utils.mjs +3 -3
  27. package/dist/es/insight/utils.mjs.map +1 -1
  28. package/dist/es/report.mjs.map +1 -1
  29. package/dist/es/{ai-model/action-executor.mjs → task-runner.mjs} +69 -10
  30. package/dist/es/task-runner.mjs.map +1 -0
  31. package/dist/es/types.mjs +18 -1
  32. package/dist/es/types.mjs.map +1 -1
  33. package/dist/es/utils.mjs +2 -2
  34. package/dist/es/yaml/player.mjs +18 -14
  35. package/dist/es/yaml/player.mjs.map +1 -1
  36. package/dist/lib/agent/agent.js +41 -33
  37. package/dist/lib/agent/agent.js.map +1 -1
  38. package/dist/lib/agent/execution-session.js +75 -0
  39. package/dist/lib/agent/execution-session.js.map +1 -0
  40. package/dist/lib/agent/task-builder.js +340 -0
  41. package/dist/lib/agent/task-builder.js.map +1 -0
  42. package/dist/lib/agent/tasks.js +68 -391
  43. package/dist/lib/agent/tasks.js.map +1 -1
  44. package/dist/lib/agent/ui-utils.js.map +1 -1
  45. package/dist/lib/agent/utils.js +6 -6
  46. package/dist/lib/agent/utils.js.map +1 -1
  47. package/dist/lib/ai-model/common.js +2 -19
  48. package/dist/lib/ai-model/common.js.map +1 -1
  49. package/dist/lib/ai-model/inspect.js +1 -2
  50. package/dist/lib/ai-model/inspect.js.map +1 -1
  51. package/dist/lib/ai-model/llm-planning.js +5 -23
  52. package/dist/lib/ai-model/llm-planning.js.map +1 -1
  53. package/dist/lib/ai-model/prompt/llm-locator.js +2 -206
  54. package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
  55. package/dist/lib/ai-model/service-caller/index.js +236 -384
  56. package/dist/lib/ai-model/service-caller/index.js.map +1 -1
  57. package/dist/lib/index.js +9 -5
  58. package/dist/lib/index.js.map +1 -1
  59. package/dist/lib/insight/index.js +17 -18
  60. package/dist/lib/insight/index.js.map +1 -1
  61. package/dist/lib/insight/utils.js +5 -5
  62. package/dist/lib/insight/utils.js.map +1 -1
  63. package/dist/lib/report.js.map +1 -1
  64. package/dist/lib/{ai-model/action-executor.js → task-runner.js} +71 -12
  65. package/dist/lib/task-runner.js.map +1 -0
  66. package/dist/lib/types.js +22 -1
  67. package/dist/lib/types.js.map +1 -1
  68. package/dist/lib/utils.js +2 -2
  69. package/dist/lib/yaml/player.js +18 -14
  70. package/dist/lib/yaml/player.js.map +1 -1
  71. package/dist/types/agent/agent.d.ts +16 -0
  72. package/dist/types/agent/execution-session.d.ts +27 -0
  73. package/dist/types/agent/task-builder.d.ts +24 -0
  74. package/dist/types/agent/tasks.d.ts +8 -11
  75. package/dist/types/agent/ui-utils.d.ts +2 -2
  76. package/dist/types/agent/utils.d.ts +5 -2
  77. package/dist/types/ai-model/common.d.ts +0 -1
  78. package/dist/types/ai-model/prompt/llm-locator.d.ts +0 -2
  79. package/dist/types/index.d.ts +4 -3
  80. package/dist/types/insight/index.d.ts +5 -10
  81. package/dist/types/insight/utils.d.ts +2 -2
  82. package/dist/types/{ai-model/action-executor.d.ts → task-runner.d.ts} +14 -3
  83. package/dist/types/types.d.ts +47 -4
  84. package/dist/types/yaml.d.ts +3 -1
  85. package/package.json +4 -7
  86. package/dist/es/ai-model/action-executor.mjs.map +0 -1
  87. package/dist/lib/ai-model/action-executor.js.map +0 -1
@@ -1,11 +1,12 @@
1
- import { ConversationHistory, findAllMidsceneLocatorField, parseActionParam, plan as index_mjs_plan, uiTarsPlanning } from "../ai-model/index.mjs";
2
- import { Executor } from "../ai-model/action-executor.mjs";
3
- import { sleep as external_utils_mjs_sleep } from "../utils.mjs";
1
+ import { ConversationHistory, plan, uiTarsPlanning } from "../ai-model/index.mjs";
2
+ import { InsightError } from "../types.mjs";
4
3
  import { MIDSCENE_REPLANNING_CYCLE_LIMIT, globalConfigManager } from "@midscene/shared/env";
5
4
  import { getDebug } from "@midscene/shared/logger";
6
5
  import { assert } from "@midscene/shared/utils";
6
+ import { ExecutionSession } from "./execution-session.mjs";
7
+ import { TaskBuilder, locatePlanForLocate } from "./task-builder.mjs";
7
8
  import { taskTitleStr } from "./ui-utils.mjs";
8
- import { matchElementFromCache, matchElementFromPlan, parsePrompt } from "./utils.mjs";
9
+ import { parsePrompt } from "./utils.mjs";
9
10
  function _define_property(obj, key, value) {
10
11
  if (key in obj) Object.defineProperty(obj, key, {
11
12
  value: value,
@@ -19,325 +20,23 @@ function _define_property(obj, key, value) {
19
20
  const debug = getDebug('device-task-executor');
20
21
  const defaultReplanningCycleLimit = 10;
21
22
  const defaultVlmUiTarsReplanningCycleLimit = 40;
22
- function locatePlanForLocate(param) {
23
- const locate = 'string' == typeof param ? {
24
- prompt: param
25
- } : param;
26
- const locatePlan = {
27
- type: 'Locate',
28
- locate,
29
- param: locate,
30
- thought: ''
31
- };
32
- return locatePlan;
33
- }
34
23
  class TaskExecutor {
35
24
  get page() {
36
25
  return this.interface;
37
26
  }
38
- async recordScreenshot(timing) {
39
- const base64 = await this.interface.screenshotBase64();
40
- const item = {
41
- type: 'screenshot',
42
- ts: Date.now(),
43
- screenshot: base64,
44
- timing
45
- };
46
- return item;
47
- }
48
- prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
49
- const taskWithScreenshot = {
50
- ...taskApply,
51
- executor: async (param, context, ...args)=>{
52
- const recorder = [];
53
- const { task } = context;
54
- task.recorder = recorder;
55
- const shot = await this.recordScreenshot(`before ${task.type}`);
56
- recorder.push(shot);
57
- const result = await taskApply.executor(param, context, ...args);
58
- if (appendAfterExecution) {
59
- const shot2 = await this.recordScreenshot('after Action');
60
- recorder.push(shot2);
61
- }
62
- return result;
63
- }
64
- };
65
- return taskWithScreenshot;
27
+ createExecutionSession(title, options) {
28
+ return new ExecutionSession(title, ()=>Promise.resolve(this.insight.contextRetrieverFn()), {
29
+ onTaskStart: this.onTaskStartCallback,
30
+ tasks: null == options ? void 0 : options.tasks
31
+ });
66
32
  }
67
33
  async convertPlanToExecutable(plans, modelConfig, cacheable) {
68
- const tasks = [];
69
- const taskForLocatePlan = (plan, detailedLocateParam, onResult)=>{
70
- if ('string' == typeof detailedLocateParam) detailedLocateParam = {
71
- prompt: detailedLocateParam
72
- };
73
- if (void 0 !== cacheable) detailedLocateParam = {
74
- ...detailedLocateParam,
75
- cacheable
76
- };
77
- const taskFind = {
78
- type: 'Insight',
79
- subType: 'Locate',
80
- param: detailedLocateParam,
81
- thought: plan.thought,
82
- executor: async (param, taskContext)=>{
83
- var _this_taskCache, _locateCacheRecord_cacheContent;
84
- const { task } = taskContext;
85
- assert((null == param ? void 0 : param.prompt) || (null == param ? void 0 : param.id) || (null == param ? void 0 : param.bbox), `No prompt or id or position or bbox to locate, param=${JSON.stringify(param)}`);
86
- let insightDump;
87
- let usage;
88
- const dumpCollector = (dump)=>{
89
- var _dump_taskInfo, _dump_taskInfo1;
90
- insightDump = dump;
91
- usage = null == dump ? void 0 : null == (_dump_taskInfo = dump.taskInfo) ? void 0 : _dump_taskInfo.usage;
92
- task.log = {
93
- dump: insightDump
94
- };
95
- task.usage = usage;
96
- if (null == dump ? void 0 : null == (_dump_taskInfo1 = dump.taskInfo) ? void 0 : _dump_taskInfo1.searchAreaUsage) task.searchAreaUsage = dump.taskInfo.searchAreaUsage;
97
- };
98
- this.insight.onceDumpUpdatedFn = dumpCollector;
99
- const shotTime = Date.now();
100
- const uiContext = await this.insight.contextRetrieverFn('locate');
101
- task.uiContext = uiContext;
102
- const recordItem = {
103
- type: 'screenshot',
104
- ts: shotTime,
105
- screenshot: uiContext.screenshotBase64,
106
- timing: 'before Insight'
107
- };
108
- task.recorder = [
109
- recordItem
110
- ];
111
- const elementFromXpath = param.xpath && this.interface.getElementInfoByXpath ? await this.interface.getElementInfoByXpath(param.xpath) : void 0;
112
- const userExpectedPathHitFlag = !!elementFromXpath;
113
- const cachePrompt = param.prompt;
114
- const locateCacheRecord = null == (_this_taskCache = this.taskCache) ? void 0 : _this_taskCache.matchLocateCache(cachePrompt);
115
- const cacheEntry = null == locateCacheRecord ? void 0 : null == (_locateCacheRecord_cacheContent = locateCacheRecord.cacheContent) ? void 0 : _locateCacheRecord_cacheContent.cache;
116
- const elementFromCache = userExpectedPathHitFlag ? null : await matchElementFromCache(this, cacheEntry, cachePrompt, param.cacheable);
117
- const cacheHitFlag = !!elementFromCache;
118
- const elementFromPlan = userExpectedPathHitFlag || cacheHitFlag ? void 0 : matchElementFromPlan(param, uiContext.tree);
119
- const planHitFlag = !!elementFromPlan;
120
- const elementFromAiLocate = userExpectedPathHitFlag || cacheHitFlag || planHitFlag ? void 0 : (await this.insight.locate(param, {
121
- context: uiContext
122
- }, modelConfig)).element;
123
- const aiLocateHitFlag = !!elementFromAiLocate;
124
- const element = elementFromXpath || elementFromCache || elementFromPlan || elementFromAiLocate;
125
- let currentCacheEntry;
126
- if (element && this.taskCache && !cacheHitFlag && (null == param ? void 0 : param.cacheable) !== false) if (this.interface.cacheFeatureForRect) try {
127
- const feature = await this.interface.cacheFeatureForRect(element.rect, void 0 !== element.isOrderSensitive ? {
128
- _orderSensitive: element.isOrderSensitive
129
- } : void 0);
130
- if (feature && Object.keys(feature).length > 0) {
131
- debug('update cache, prompt: %s, cache: %o', cachePrompt, feature);
132
- currentCacheEntry = feature;
133
- this.taskCache.updateOrAppendCacheRecord({
134
- type: 'locate',
135
- prompt: cachePrompt,
136
- cache: feature
137
- }, locateCacheRecord);
138
- } else debug('no cache data returned, skip cache update, prompt: %s', cachePrompt);
139
- } catch (error) {
140
- debug('cacheFeatureForRect failed: %s', error);
141
- }
142
- else debug('cacheFeatureForRect is not supported, skip cache update');
143
- if (!element) throw new Error(`Element not found: ${param.prompt}`);
144
- let hitBy;
145
- if (userExpectedPathHitFlag) hitBy = {
146
- from: 'User expected path',
147
- context: {
148
- xpath: param.xpath
149
- }
150
- };
151
- else if (cacheHitFlag) hitBy = {
152
- from: 'Cache',
153
- context: {
154
- cacheEntry,
155
- cacheToSave: currentCacheEntry
156
- }
157
- };
158
- else if (planHitFlag) hitBy = {
159
- from: 'Planning',
160
- context: {
161
- id: null == elementFromPlan ? void 0 : elementFromPlan.id,
162
- bbox: null == elementFromPlan ? void 0 : elementFromPlan.bbox
163
- }
164
- };
165
- else if (aiLocateHitFlag) hitBy = {
166
- from: 'AI model',
167
- context: {
168
- prompt: param.prompt
169
- }
170
- };
171
- null == onResult || onResult(element);
172
- return {
173
- output: {
174
- element
175
- },
176
- uiContext,
177
- hitBy
178
- };
179
- }
180
- };
181
- return taskFind;
182
- };
183
- for (const plan of plans)if ('Locate' === plan.type) {
184
- var _plan_locate, _plan_locate1;
185
- if (!plan.locate || null === plan.locate || (null == (_plan_locate = plan.locate) ? void 0 : _plan_locate.id) === null || (null == (_plan_locate1 = plan.locate) ? void 0 : _plan_locate1.id) === 'null') {
186
- debug('Locate action with id is null, will be ignored', plan);
187
- continue;
188
- }
189
- const taskLocate = taskForLocatePlan(plan, plan.locate);
190
- tasks.push(taskLocate);
191
- } else if ('Error' === plan.type) {
192
- var _plan_param;
193
- const taskActionError = {
194
- type: 'Action',
195
- subType: 'Error',
196
- param: plan.param,
197
- thought: plan.thought || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought),
198
- locate: plan.locate,
199
- executor: async ()=>{
200
- var _plan_param;
201
- throw new Error((null == plan ? void 0 : plan.thought) || (null == (_plan_param = plan.param) ? void 0 : _plan_param.thought) || 'error without thought');
202
- }
203
- };
204
- tasks.push(taskActionError);
205
- } else if ('Finished' === plan.type) {
206
- const taskActionFinished = {
207
- type: 'Action',
208
- subType: 'Finished',
209
- param: null,
210
- thought: plan.thought,
211
- locate: plan.locate,
212
- executor: async (param)=>{}
213
- };
214
- tasks.push(taskActionFinished);
215
- } else if ('Sleep' === plan.type) {
216
- const taskActionSleep = {
217
- type: 'Action',
218
- subType: 'Sleep',
219
- param: plan.param,
220
- thought: plan.thought,
221
- locate: plan.locate,
222
- executor: async (taskParam)=>{
223
- await external_utils_mjs_sleep((null == taskParam ? void 0 : taskParam.timeMs) || 3000);
224
- }
225
- };
226
- tasks.push(taskActionSleep);
227
- } else {
228
- const planType = plan.type;
229
- const actionSpace = await this.interface.actionSpace();
230
- const action = actionSpace.find((action)=>action.name === planType);
231
- const param = plan.param;
232
- if (!action) throw new Error(`Action type '${planType}' not found`);
233
- const locateFields = action ? findAllMidsceneLocatorField(action.paramSchema) : [];
234
- const requiredLocateFields = action ? findAllMidsceneLocatorField(action.paramSchema, true) : [];
235
- locateFields.forEach((field)=>{
236
- if (param[field]) {
237
- const locatePlan = locatePlanForLocate(param[field]);
238
- debug('will prepend locate param for field', `action.type=${planType}`, `param=${JSON.stringify(param[field])}`, `locatePlan=${JSON.stringify(locatePlan)}`);
239
- const locateTask = taskForLocatePlan(locatePlan, param[field], (result)=>{
240
- param[field] = result;
241
- });
242
- tasks.push(locateTask);
243
- } else {
244
- assert(!requiredLocateFields.includes(field), `Required locate field '${field}' is not provided for action ${planType}`);
245
- debug(`field '${field}' is not provided for action ${planType}`);
246
- }
247
- });
248
- const task = {
249
- type: 'Action',
250
- subType: planType,
251
- thought: plan.thought,
252
- param: plan.param,
253
- executor: async (param, context)=>{
254
- var _context_element;
255
- debug('executing action', planType, param, `context.element.center: ${null == (_context_element = context.element) ? void 0 : _context_element.center}`);
256
- const uiContext = await this.insight.contextRetrieverFn('locate');
257
- context.task.uiContext = uiContext;
258
- requiredLocateFields.forEach((field)=>{
259
- assert(param[field], `field '${field}' is required for action ${planType} but not provided. Cannot execute action ${planType}.`);
260
- });
261
- try {
262
- await Promise.all([
263
- (async ()=>{
264
- if (this.interface.beforeInvokeAction) {
265
- debug('will call "beforeInvokeAction" for interface');
266
- await this.interface.beforeInvokeAction(action.name, param);
267
- debug('called "beforeInvokeAction" for interface');
268
- }
269
- })(),
270
- external_utils_mjs_sleep(200)
271
- ]);
272
- } catch (originalError) {
273
- const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
274
- throw new Error(`error in running beforeInvokeAction for ${action.name}: ${originalMessage}`, {
275
- cause: originalError
276
- });
277
- }
278
- if (action.paramSchema) try {
279
- param = parseActionParam(param, action.paramSchema);
280
- } catch (error) {
281
- throw new Error(`Invalid parameters for action ${action.name}: ${error.message}\nParameters: ${JSON.stringify(param)}`, {
282
- cause: error
283
- });
284
- }
285
- debug('calling action', action.name);
286
- const actionFn = action.call.bind(this.interface);
287
- await actionFn(param, context);
288
- debug('called action', action.name);
289
- try {
290
- if (this.interface.afterInvokeAction) {
291
- debug('will call "afterInvokeAction" for interface');
292
- await this.interface.afterInvokeAction(action.name, param);
293
- debug('called "afterInvokeAction" for interface');
294
- }
295
- } catch (originalError) {
296
- const originalMessage = (null == originalError ? void 0 : originalError.message) || String(originalError);
297
- throw new Error(`error in running afterInvokeAction for ${action.name}: ${originalMessage}`, {
298
- cause: originalError
299
- });
300
- }
301
- return {
302
- output: {
303
- success: true,
304
- action: planType,
305
- param: param
306
- }
307
- };
308
- }
309
- };
310
- tasks.push(task);
311
- }
312
- const wrappedTasks = tasks.map((task, index)=>{
313
- if ('Action' === task.type) return this.prependExecutorWithScreenshot(task, index === tasks.length - 1);
314
- return task;
34
+ return this.taskBuilder.build(plans, modelConfig, {
35
+ cacheable
315
36
  });
316
- return {
317
- tasks: wrappedTasks
318
- };
319
- }
320
- async setupPlanningContext(executorContext) {
321
- const shotTime = Date.now();
322
- const uiContext = await this.insight.contextRetrieverFn('locate');
323
- const recordItem = {
324
- type: 'screenshot',
325
- ts: shotTime,
326
- screenshot: uiContext.screenshotBase64,
327
- timing: 'before Planning'
328
- };
329
- executorContext.task.recorder = [
330
- recordItem
331
- ];
332
- executorContext.task.uiContext = uiContext;
333
- return {
334
- uiContext
335
- };
336
37
  }
337
38
  async loadYamlFlowAsPlanning(userInstruction, yamlString) {
338
- const taskExecutor = new Executor(taskTitleStr('Action', userInstruction), {
339
- onTaskStart: this.onTaskStartCallback
340
- });
39
+ const session = this.createExecutionSession(taskTitleStr('Action', userInstruction));
341
40
  const task = {
342
41
  type: 'Planning',
343
42
  subType: 'LoadYaml',
@@ -346,7 +45,8 @@ class TaskExecutor {
346
45
  userInstruction
347
46
  },
348
47
  executor: async (param, executorContext)=>{
349
- await this.setupPlanningContext(executorContext);
48
+ const { uiContext } = executorContext;
49
+ assert(uiContext, 'uiContext is required for Planning task');
350
50
  return {
351
51
  output: {
352
52
  actions: [],
@@ -366,10 +66,9 @@ class TaskExecutor {
366
66
  };
367
67
  }
368
68
  };
369
- await taskExecutor.append(task);
370
- await taskExecutor.flush();
69
+ await session.appendAndRun(task);
371
70
  return {
372
- executor: taskExecutor
71
+ runner: session.getRunner()
373
72
  };
374
73
  }
375
74
  createPlanningTask(userInstruction, actionContext, modelConfig) {
@@ -382,7 +81,8 @@ class TaskExecutor {
382
81
  },
383
82
  executor: async (param, executorContext)=>{
384
83
  const startTime = Date.now();
385
- const { uiContext } = await this.setupPlanningContext(executorContext);
84
+ const { uiContext } = executorContext;
85
+ assert(uiContext, 'uiContext is required for Planning task');
386
86
  const { vlMode } = modelConfig;
387
87
  const uiTarsModelVersion = 'vlm-ui-tars' === vlMode ? modelConfig.uiTarsModelVersion : void 0;
388
88
  assert(this.interface.actionSpace, 'actionSpace for device is not implemented');
@@ -390,7 +90,7 @@ class TaskExecutor {
390
90
  debug('actionSpace for this interface is:', actionSpace.map((action)=>action.name).join(', '));
391
91
  assert(Array.isArray(actionSpace), 'actionSpace must be an array');
392
92
  if (0 === actionSpace.length) console.warn(`ActionSpace for ${this.interface.interfaceType} is empty. This may lead to unexpected behavior.`);
393
- const planResult = await (uiTarsModelVersion ? uiTarsPlanning : index_mjs_plan)(param.userInstruction, {
93
+ const planResult = await (uiTarsModelVersion ? uiTarsPlanning : plan)(param.userInstruction, {
394
94
  context: uiContext,
395
95
  actionContext,
396
96
  interfaceType: this.interface.interfaceType,
@@ -435,16 +135,13 @@ class TaskExecutor {
435
135
  return task;
436
136
  }
437
137
  async runPlans(title, plans, modelConfig) {
438
- const taskExecutor = new Executor(title, {
439
- onTaskStart: this.onTaskStartCallback
440
- });
138
+ const session = this.createExecutionSession(title);
441
139
  const { tasks } = await this.convertPlanToExecutable(plans, modelConfig);
442
- await taskExecutor.append(tasks);
443
- const result = await taskExecutor.flush();
140
+ const result = await session.appendAndRun(tasks);
444
141
  const { output } = result;
445
142
  return {
446
143
  output,
447
- executor: taskExecutor
144
+ runner: session.getRunner()
448
145
  };
449
146
  }
450
147
  getReplanningCycleLimit(isVlmUiTars) {
@@ -452,38 +149,35 @@ class TaskExecutor {
452
149
  }
453
150
  async action(userPrompt, modelConfig, actionContext, cacheable) {
454
151
  this.conversationHistory.reset();
455
- const taskExecutor = new Executor(taskTitleStr('Action', userPrompt), {
456
- onTaskStart: this.onTaskStartCallback
457
- });
152
+ const session = this.createExecutionSession(taskTitleStr('Action', userPrompt));
153
+ const runner = session.getRunner();
458
154
  let replanCount = 0;
459
155
  const yamlFlow = [];
460
156
  const replanningCycleLimit = this.getReplanningCycleLimit('vlm-ui-tars' === modelConfig.vlMode);
461
157
  while(true){
462
158
  if (replanCount > replanningCycleLimit) {
463
159
  const errorMsg = `Replanning ${replanningCycleLimit} times, which is more than the limit, please split the task into multiple steps`;
464
- return this.appendErrorPlan(taskExecutor, errorMsg, modelConfig);
160
+ return session.appendErrorPlan(errorMsg);
465
161
  }
466
162
  const planningTask = this.createPlanningTask(userPrompt, actionContext, modelConfig);
467
- await taskExecutor.append(planningTask);
468
- const result = await taskExecutor.flush();
163
+ const result = await session.appendAndRun(planningTask);
469
164
  const planResult = null == result ? void 0 : result.output;
470
- if (taskExecutor.isInErrorState()) return {
165
+ if (session.isInErrorState()) return {
471
166
  output: planResult,
472
- executor: taskExecutor
167
+ runner
473
168
  };
474
169
  const plans = planResult.actions || [];
475
170
  yamlFlow.push(...planResult.yamlFlow || []);
476
171
  let executables;
477
172
  try {
478
173
  executables = await this.convertPlanToExecutable(plans, modelConfig, cacheable);
479
- taskExecutor.append(executables.tasks);
174
+ await session.appendAndRun(executables.tasks);
480
175
  } catch (error) {
481
- return this.appendErrorPlan(taskExecutor, `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`, modelConfig);
176
+ return session.appendErrorPlan(`Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(plans)}`);
482
177
  }
483
- await taskExecutor.flush();
484
- if (taskExecutor.isInErrorState()) return {
178
+ if (session.isInErrorState()) return {
485
179
  output: void 0,
486
- executor: taskExecutor
180
+ runner
487
181
  };
488
182
  if (!planResult.more_actions_needed_by_instruction) break;
489
183
  replanCount++;
@@ -492,7 +186,7 @@ class TaskExecutor {
492
186
  output: {
493
187
  yamlFlow
494
188
  },
495
- executor: taskExecutor
189
+ runner
496
190
  };
497
191
  }
498
192
  createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt) {
@@ -508,23 +202,15 @@ class TaskExecutor {
508
202
  },
509
203
  executor: async (param, taskContext)=>{
510
204
  const { task } = taskContext;
511
- let insightDump;
512
- const dumpCollector = (dump)=>{
513
- insightDump = dump;
514
- };
515
- this.insight.onceDumpUpdatedFn = dumpCollector;
516
- const shotTime = Date.now();
517
- const uiContext = await this.insight.contextRetrieverFn('extract');
518
- task.uiContext = uiContext;
519
- const recordItem = {
520
- type: 'screenshot',
521
- ts: shotTime,
522
- screenshot: uiContext.screenshotBase64,
523
- timing: 'before Extract'
205
+ let queryDump;
206
+ const applyDump = (dump)=>{
207
+ queryDump = dump;
208
+ task.log = {
209
+ dump
210
+ };
524
211
  };
525
- task.recorder = [
526
- recordItem
527
- ];
212
+ const uiContext = taskContext.uiContext;
213
+ assert(uiContext, 'uiContext is required for Query task');
528
214
  const ifTypeRestricted = 'Query' !== type;
529
215
  let demandInput = demand;
530
216
  let keyOfResult = 'result';
@@ -537,7 +223,15 @@ class TaskExecutor {
537
223
  } else if (ifTypeRestricted) demandInput = {
538
224
  [keyOfResult]: `${type}, ${demand}`
539
225
  };
540
- const { data, usage, thought } = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
226
+ let extractResult;
227
+ try {
228
+ extractResult = await this.insight.extract(demandInput, modelConfig, opt, multimodalPrompt);
229
+ } catch (error) {
230
+ if (error instanceof InsightError) applyDump(error.dump);
231
+ throw error;
232
+ }
233
+ const { data, usage, thought, dump } = extractResult;
234
+ applyDump(dump);
541
235
  let outputResult = data;
542
236
  if (ifTypeRestricted) if ('string' == typeof data) outputResult = data;
543
237
  else {
@@ -546,7 +240,7 @@ class TaskExecutor {
546
240
  }
547
241
  return {
548
242
  output: outputResult,
549
- log: insightDump,
243
+ log: queryDump,
550
244
  usage,
551
245
  thought
552
246
  };
@@ -555,36 +249,15 @@ class TaskExecutor {
555
249
  return queryTask;
556
250
  }
557
251
  async createTypeQueryExecution(type, demand, modelConfig, opt, multimodalPrompt) {
558
- const taskExecutor = new Executor(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)), {
559
- onTaskStart: this.onTaskStartCallback
560
- });
252
+ const session = this.createExecutionSession(taskTitleStr(type, 'string' == typeof demand ? demand : JSON.stringify(demand)));
561
253
  const queryTask = await this.createTypeQueryTask(type, demand, modelConfig, opt, multimodalPrompt);
562
- await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
563
- const result = await taskExecutor.flush();
254
+ const result = await session.appendAndRun(queryTask);
564
255
  if (!result) throw new Error('result of taskExecutor.flush() is undefined in function createTypeQueryTask');
565
256
  const { output, thought } = result;
566
257
  return {
567
258
  output,
568
259
  thought,
569
- executor: taskExecutor
570
- };
571
- }
572
- async appendErrorPlan(taskExecutor, errorMsg, modelConfig) {
573
- const errorPlan = {
574
- type: 'Error',
575
- param: {
576
- thought: errorMsg
577
- },
578
- locate: null
579
- };
580
- const { tasks } = await this.convertPlanToExecutable([
581
- errorPlan
582
- ], modelConfig);
583
- await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
584
- await taskExecutor.flush();
585
- return {
586
- output: void 0,
587
- executor: taskExecutor
260
+ runner: session.getRunner()
588
261
  };
589
262
  }
590
263
  async taskForSleep(timeMs, modelConfig) {
@@ -598,14 +271,13 @@ class TaskExecutor {
598
271
  const { tasks: sleepTasks } = await this.convertPlanToExecutable([
599
272
  sleepPlan
600
273
  ], modelConfig);
601
- return this.prependExecutorWithScreenshot(sleepTasks[0]);
274
+ return sleepTasks[0];
602
275
  }
603
276
  async waitFor(assertion, opt, modelConfig) {
604
277
  const { textPrompt, multimodalPrompt } = parsePrompt(assertion);
605
278
  const description = `waitFor: ${textPrompt}`;
606
- const taskExecutor = new Executor(taskTitleStr('WaitFor', description), {
607
- onTaskStart: this.onTaskStartCallback
608
- });
279
+ const session = this.createExecutionSession(taskTitleStr('WaitFor', description));
280
+ const runner = session.getRunner();
609
281
  const { timeoutMs, checkIntervalMs } = opt;
610
282
  assert(assertion, 'No assertion for waitFor');
611
283
  assert(timeoutMs, 'No timeoutMs for waitFor');
@@ -619,26 +291,26 @@ class TaskExecutor {
619
291
  const queryTask = await this.createTypeQueryTask('WaitFor', textPrompt, modelConfig, {
620
292
  doNotThrowError: true
621
293
  }, multimodalPrompt);
622
- await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
623
- const result = await taskExecutor.flush();
294
+ const result = await session.appendAndRun(queryTask);
624
295
  if (null == result ? void 0 : result.output) return {
625
296
  output: void 0,
626
- executor: taskExecutor
297
+ runner
627
298
  };
628
299
  errorThought = (null == result ? void 0 : result.thought) || !result && `No result from assertion: ${textPrompt}` || `unknown error when waiting for assertion: ${textPrompt}`;
629
300
  const now = Date.now();
630
301
  if (now - startTime < checkIntervalMs) {
631
302
  const timeRemaining = checkIntervalMs - (now - startTime);
632
303
  const sleepTask = await this.taskForSleep(timeRemaining, modelConfig);
633
- await taskExecutor.append(sleepTask);
304
+ await session.append(sleepTask);
634
305
  }
635
306
  }
636
- return this.appendErrorPlan(taskExecutor, `waitFor timeout: ${errorThought}`, modelConfig);
307
+ return session.appendErrorPlan(`waitFor timeout: ${errorThought}`);
637
308
  }
638
309
  constructor(interfaceInstance, insight, opts){
639
310
  _define_property(this, "interface", void 0);
640
311
  _define_property(this, "insight", void 0);
641
312
  _define_property(this, "taskCache", void 0);
313
+ _define_property(this, "taskBuilder", void 0);
642
314
  _define_property(this, "conversationHistory", void 0);
643
315
  _define_property(this, "onTaskStartCallback", void 0);
644
316
  _define_property(this, "replanningCycleLimit", void 0);
@@ -648,6 +320,11 @@ class TaskExecutor {
648
320
  this.onTaskStartCallback = null == opts ? void 0 : opts.onTaskStart;
649
321
  this.replanningCycleLimit = opts.replanningCycleLimit;
650
322
  this.conversationHistory = new ConversationHistory();
323
+ this.taskBuilder = new TaskBuilder({
324
+ interfaceInstance,
325
+ insight,
326
+ taskCache: opts.taskCache
327
+ });
651
328
  }
652
329
  }
653
330
  export { TaskExecutor, locatePlanForLocate };