misoai-web 1.0.1 → 1.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (82) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +8 -8
  3. package/bin/midscene-playground +2 -2
  4. package/package.json +23 -24
  5. package/dist/es/agent.js +0 -2451
  6. package/dist/es/agent.js.map +0 -1
  7. package/dist/es/bridge-mode-browser.js +0 -908
  8. package/dist/es/bridge-mode-browser.js.map +0 -1
  9. package/dist/es/bridge-mode.js +0 -2812
  10. package/dist/es/bridge-mode.js.map +0 -1
  11. package/dist/es/chrome-extension.js +0 -3152
  12. package/dist/es/chrome-extension.js.map +0 -1
  13. package/dist/es/index.js +0 -3052
  14. package/dist/es/index.js.map +0 -1
  15. package/dist/es/midscene-playground.js +0 -2781
  16. package/dist/es/midscene-playground.js.map +0 -1
  17. package/dist/es/midscene-server.js +0 -247
  18. package/dist/es/midscene-server.js.map +0 -1
  19. package/dist/es/playground.js +0 -2552
  20. package/dist/es/playground.js.map +0 -1
  21. package/dist/es/playwright-report.js +0 -120
  22. package/dist/es/playwright-report.js.map +0 -1
  23. package/dist/es/playwright.js +0 -2997
  24. package/dist/es/playwright.js.map +0 -1
  25. package/dist/es/puppeteer-agent-launcher.js +0 -2947
  26. package/dist/es/puppeteer-agent-launcher.js.map +0 -1
  27. package/dist/es/puppeteer.js +0 -2794
  28. package/dist/es/puppeteer.js.map +0 -1
  29. package/dist/es/ui-utils.js +0 -106
  30. package/dist/es/ui-utils.js.map +0 -1
  31. package/dist/es/utils.js +0 -197
  32. package/dist/es/utils.js.map +0 -1
  33. package/dist/es/yaml.js +0 -333
  34. package/dist/es/yaml.js.map +0 -1
  35. package/dist/lib/agent.js +0 -2466
  36. package/dist/lib/agent.js.map +0 -1
  37. package/dist/lib/bridge-mode-browser.js +0 -942
  38. package/dist/lib/bridge-mode-browser.js.map +0 -1
  39. package/dist/lib/bridge-mode.js +0 -2832
  40. package/dist/lib/bridge-mode.js.map +0 -1
  41. package/dist/lib/chrome-extension.js +0 -3169
  42. package/dist/lib/chrome-extension.js.map +0 -1
  43. package/dist/lib/index.js +0 -3071
  44. package/dist/lib/index.js.map +0 -1
  45. package/dist/lib/midscene-playground.js +0 -2785
  46. package/dist/lib/midscene-playground.js.map +0 -1
  47. package/dist/lib/midscene-server.js +0 -273
  48. package/dist/lib/midscene-server.js.map +0 -1
  49. package/dist/lib/playground.js +0 -2571
  50. package/dist/lib/playground.js.map +0 -1
  51. package/dist/lib/playwright-report.js +0 -148
  52. package/dist/lib/playwright-report.js.map +0 -1
  53. package/dist/lib/playwright.js +0 -3017
  54. package/dist/lib/playwright.js.map +0 -1
  55. package/dist/lib/puppeteer-agent-launcher.js +0 -2963
  56. package/dist/lib/puppeteer-agent-launcher.js.map +0 -1
  57. package/dist/lib/puppeteer.js +0 -2808
  58. package/dist/lib/puppeteer.js.map +0 -1
  59. package/dist/lib/ui-utils.js +0 -137
  60. package/dist/lib/ui-utils.js.map +0 -1
  61. package/dist/lib/utils.js +0 -235
  62. package/dist/lib/utils.js.map +0 -1
  63. package/dist/lib/yaml.js +0 -372
  64. package/dist/lib/yaml.js.map +0 -1
  65. package/dist/types/agent.d.ts +0 -254
  66. package/dist/types/bridge-mode-browser.d.ts +0 -9
  67. package/dist/types/bridge-mode.d.ts +0 -40
  68. package/dist/types/browser-d447695b.d.ts +0 -37
  69. package/dist/types/chrome-extension.d.ts +0 -18
  70. package/dist/types/index.d.ts +0 -16
  71. package/dist/types/midscene-playground.d.ts +0 -2
  72. package/dist/types/midscene-server.d.ts +0 -31
  73. package/dist/types/page-b8ada1f3.d.ts +0 -322
  74. package/dist/types/playground.d.ts +0 -17
  75. package/dist/types/playwright-report.d.ts +0 -11
  76. package/dist/types/playwright.d.ts +0 -87
  77. package/dist/types/puppeteer-agent-launcher.d.ts +0 -40
  78. package/dist/types/puppeteer.d.ts +0 -17
  79. package/dist/types/ui-utils.d.ts +0 -14
  80. package/dist/types/utils-badc824e.d.ts +0 -34
  81. package/dist/types/utils.d.ts +0 -8
  82. package/dist/types/yaml.d.ts +0 -15
@@ -1,3152 +0,0 @@
1
- var __accessCheck = (obj, member, msg) => {
2
- if (!member.has(obj))
3
- throw TypeError("Cannot " + msg);
4
- };
5
- var __privateGet = (obj, member, getter) => {
6
- __accessCheck(obj, member, "read from private field");
7
- return getter ? getter.call(obj) : member.get(obj);
8
- };
9
- var __privateAdd = (obj, member, value) => {
10
- if (member.has(obj))
11
- throw TypeError("Cannot add the same private member more than once");
12
- member instanceof WeakSet ? member.add(obj) : member.set(obj, value);
13
- };
14
- var __privateSet = (obj, member, value, setter) => {
15
- __accessCheck(obj, member, "write to private field");
16
- setter ? setter.call(obj, value) : member.set(obj, value);
17
- return value;
18
- };
19
- var __privateMethod = (obj, member, method) => {
20
- __accessCheck(obj, member, "access private method");
21
- return method;
22
- };
23
-
24
- // src/common/utils.ts
25
- import { elementByPositionWithElementInfo } from "misoai-core/ai-model";
26
- import { uploadTestInfoToServer } from "misoai-core/utils";
27
- import { MIDSCENE_REPORT_TAG_NAME, getAIConfig } from "misoai-shared/env";
28
- import {
29
- generateElementByPosition,
30
- getNodeFromCacheList,
31
- traverseTree,
32
- treeToList
33
- } from "misoai-shared/extractor";
34
- import { resizeImgBase64 } from "misoai-shared/img";
35
- import { assert, logMsg, uuid } from "misoai-shared/utils";
36
- import dayjs from "dayjs";
37
-
38
- // src/web-element.ts
39
- var WebElementInfo = class {
40
- constructor({
41
- content,
42
- rect,
43
- // page,
44
- locator,
45
- id,
46
- attributes,
47
- indexId,
48
- xpaths
49
- }) {
50
- this.content = content;
51
- this.rect = rect;
52
- this.center = [
53
- Math.floor(rect.left + rect.width / 2),
54
- Math.floor(rect.top + rect.height / 2)
55
- ];
56
- this.locator = locator;
57
- this.id = id;
58
- this.attributes = attributes;
59
- this.indexId = indexId;
60
- this.xpaths = xpaths;
61
- }
62
- };
63
-
64
- // src/common/utils.ts
65
- async function parseContextFromWebPage(page, _opt) {
66
- assert(page, "page is required");
67
- if (page._forceUsePageContext) {
68
- return await page._forceUsePageContext();
69
- }
70
- const url = await page.url();
71
- uploadTestInfoToServer({ testUrl: url });
72
- let screenshotBase64;
73
- let tree;
74
- await Promise.all([
75
- page.screenshotBase64().then((base64) => {
76
- screenshotBase64 = base64;
77
- }),
78
- page.getElementsNodeTree().then(async (treeRoot) => {
79
- tree = treeRoot;
80
- })
81
- ]);
82
- const webTree = traverseTree(tree, (elementInfo) => {
83
- const { rect, id, content, attributes, locator, indexId } = elementInfo;
84
- return new WebElementInfo({
85
- rect,
86
- locator,
87
- id,
88
- content,
89
- attributes,
90
- indexId
91
- });
92
- });
93
- assert(screenshotBase64, "screenshotBase64 is required");
94
- const elementsInfo = treeToList(webTree);
95
- const size = await page.size();
96
- if (size.dpr && size.dpr > 1) {
97
- screenshotBase64 = await resizeImgBase64(screenshotBase64, {
98
- width: size.width,
99
- height: size.height
100
- });
101
- }
102
- return {
103
- content: elementsInfo,
104
- tree: webTree,
105
- size,
106
- screenshotBase64,
107
- url
108
- };
109
- }
110
- function reportFileName(tag = "web") {
111
- const reportTagName = getAIConfig(MIDSCENE_REPORT_TAG_NAME);
112
- const dateTimeInFileName = dayjs().format("YYYY-MM-DD_HH-mm-ss");
113
- const uniqueId = uuid().substring(0, 8);
114
- return `${reportTagName || tag}-${dateTimeInFileName}-${uniqueId}`;
115
- }
116
- function printReportMsg(filepath) {
117
- logMsg(`Midscene - report file updated: ${filepath}`);
118
- }
119
- var ERROR_CODE_NOT_IMPLEMENTED_AS_DESIGNED = "NOT_IMPLEMENTED_AS_DESIGNED";
120
- function replaceIllegalPathCharsAndSpace(str) {
121
- return str.replace(/[/\\:*?"<>| ]/g, "-");
122
- }
123
- function matchElementFromPlan(planLocateParam, tree) {
124
- if (!planLocateParam) {
125
- return void 0;
126
- }
127
- if (planLocateParam.id) {
128
- return getNodeFromCacheList(planLocateParam.id);
129
- }
130
- if (planLocateParam.bbox) {
131
- const centerPosition = {
132
- x: Math.floor((planLocateParam.bbox[0] + planLocateParam.bbox[2]) / 2),
133
- y: Math.floor((planLocateParam.bbox[1] + planLocateParam.bbox[3]) / 2)
134
- };
135
- let element = elementByPositionWithElementInfo(tree, centerPosition);
136
- if (!element) {
137
- element = generateElementByPosition(centerPosition);
138
- }
139
- return element;
140
- }
141
- return void 0;
142
- }
143
-
144
- // src/common/agent.ts
145
- import { Insight } from "misoai-core";
146
- import yaml4 from "js-yaml";
147
-
148
- // src/yaml/player.ts
149
- import { existsSync, mkdirSync, writeFileSync } from "fs";
150
- import { dirname, join, resolve } from "path";
151
- import { assert as assert2, ifInBrowser } from "misoai-shared/utils";
152
- import { getMidsceneRunSubDir } from "misoai-shared/common";
153
- var ScriptPlayer = class {
154
- constructor(script, setupAgent, onTaskStatusChange) {
155
- this.script = script;
156
- this.setupAgent = setupAgent;
157
- this.onTaskStatusChange = onTaskStatusChange;
158
- this.taskStatusList = [];
159
- this.status = "init";
160
- this.unnamedResultIndex = 0;
161
- this.pageAgent = null;
162
- this.result = {};
163
- if (ifInBrowser) {
164
- this.output = void 0;
165
- } else if (script.target?.output) {
166
- this.output = resolve(process.cwd(), script.target.output);
167
- } else {
168
- this.output = join(getMidsceneRunSubDir("output"), `${process.pid}.json`);
169
- }
170
- this.taskStatusList = (script.tasks || []).map((task, taskIndex) => ({
171
- ...task,
172
- index: taskIndex,
173
- status: "init",
174
- totalSteps: task.flow?.length || 0
175
- }));
176
- }
177
- setResult(key, value) {
178
- const keyToUse = key || this.unnamedResultIndex++;
179
- if (this.result[keyToUse]) {
180
- console.warn(`result key ${keyToUse} already exists, will overwrite`);
181
- }
182
- this.result[keyToUse] = value;
183
- this.flushResult();
184
- }
185
- setPlayerStatus(status, error) {
186
- this.status = status;
187
- this.errorInSetup = error;
188
- }
189
- notifyCurrentTaskStatusChange(taskIndex) {
190
- const taskIndexToNotify = typeof taskIndex === "number" ? taskIndex : this.currentTaskIndex;
191
- if (typeof taskIndexToNotify !== "number") {
192
- return;
193
- }
194
- const taskStatus = this.taskStatusList[taskIndexToNotify];
195
- if (this.onTaskStatusChange) {
196
- this.onTaskStatusChange(taskStatus);
197
- }
198
- }
199
- async setTaskStatus(index, statusValue, error) {
200
- this.taskStatusList[index].status = statusValue;
201
- if (error) {
202
- this.taskStatusList[index].error = error;
203
- }
204
- this.notifyCurrentTaskStatusChange(index);
205
- }
206
- setTaskIndex(taskIndex) {
207
- this.currentTaskIndex = taskIndex;
208
- }
209
- flushResult() {
210
- if (Object.keys(this.result).length && this.output) {
211
- const output = resolve(process.cwd(), this.output);
212
- const outputDir = dirname(output);
213
- if (!existsSync(outputDir)) {
214
- mkdirSync(outputDir, { recursive: true });
215
- }
216
- writeFileSync(output, JSON.stringify(this.result, void 0, 2));
217
- }
218
- }
219
- async playTask(taskStatus, agent) {
220
- const { flow } = taskStatus;
221
- assert2(flow, "missing flow in task");
222
- for (const flowItemIndex in flow) {
223
- const currentStep = Number.parseInt(flowItemIndex, 10);
224
- taskStatus.currentStep = currentStep;
225
- const flowItem = flow[flowItemIndex];
226
- if ("aiAction" in flowItem || "ai" in flowItem) {
227
- const actionTask = flowItem;
228
- const prompt = actionTask.aiAction || actionTask.ai;
229
- assert2(prompt, "missing prompt for ai (aiAction)");
230
- assert2(
231
- typeof prompt === "string",
232
- "prompt for aiAction must be a string"
233
- );
234
- await agent.aiAction(prompt);
235
- } else if ("aiAssert" in flowItem) {
236
- const assertTask = flowItem;
237
- const prompt = assertTask.aiAssert;
238
- assert2(prompt, "missing prompt for aiAssert");
239
- assert2(
240
- typeof prompt === "string",
241
- "prompt for aiAssert must be a string"
242
- );
243
- await agent.aiAssert(prompt);
244
- } else if ("aiQuery" in flowItem) {
245
- const queryTask = flowItem;
246
- const prompt = queryTask.aiQuery;
247
- assert2(prompt, "missing prompt for aiQuery");
248
- assert2(
249
- typeof prompt === "string",
250
- "prompt for aiQuery must be a string"
251
- );
252
- const queryResult = await agent.aiQuery(prompt);
253
- this.setResult(queryTask.name, queryResult);
254
- } else if ("aiNumber" in flowItem) {
255
- const numberTask = flowItem;
256
- const prompt = numberTask.aiNumber;
257
- assert2(prompt, "missing prompt for number");
258
- assert2(
259
- typeof prompt === "string",
260
- "prompt for number must be a string"
261
- );
262
- const numberResult = await agent.aiNumber(prompt);
263
- this.setResult(numberTask.name, numberResult);
264
- } else if ("aiString" in flowItem) {
265
- const stringTask = flowItem;
266
- const prompt = stringTask.aiString;
267
- assert2(prompt, "missing prompt for string");
268
- assert2(
269
- typeof prompt === "string",
270
- "prompt for string must be a string"
271
- );
272
- const stringResult = await agent.aiString(prompt);
273
- this.setResult(stringTask.name, stringResult);
274
- } else if ("aiBoolean" in flowItem) {
275
- const booleanTask = flowItem;
276
- const prompt = booleanTask.aiBoolean;
277
- assert2(prompt, "missing prompt for boolean");
278
- assert2(
279
- typeof prompt === "string",
280
- "prompt for boolean must be a string"
281
- );
282
- const booleanResult = await agent.aiBoolean(prompt);
283
- this.setResult(booleanTask.name, booleanResult);
284
- } else if ("aiLocate" in flowItem) {
285
- const locateTask = flowItem;
286
- const prompt = locateTask.aiLocate;
287
- assert2(prompt, "missing prompt for aiLocate");
288
- assert2(
289
- typeof prompt === "string",
290
- "prompt for aiLocate must be a string"
291
- );
292
- const locateResult = await agent.aiLocate(prompt);
293
- this.setResult(locateTask.name, locateResult);
294
- } else if ("aiWaitFor" in flowItem) {
295
- const waitForTask = flowItem;
296
- const prompt = waitForTask.aiWaitFor;
297
- assert2(prompt, "missing prompt for aiWaitFor");
298
- assert2(
299
- typeof prompt === "string",
300
- "prompt for aiWaitFor must be a string"
301
- );
302
- const timeout = waitForTask.timeout;
303
- await agent.aiWaitFor(prompt, { timeoutMs: timeout });
304
- } else if ("sleep" in flowItem) {
305
- const sleepTask = flowItem;
306
- const ms = sleepTask.sleep;
307
- let msNumber = ms;
308
- if (typeof ms === "string") {
309
- msNumber = Number.parseInt(ms, 10);
310
- }
311
- assert2(
312
- msNumber && msNumber > 0,
313
- `ms for sleep must be greater than 0, but got ${ms}`
314
- );
315
- await new Promise((resolve2) => setTimeout(resolve2, msNumber));
316
- } else if ("aiTap" in flowItem) {
317
- const tapTask = flowItem;
318
- await agent.aiTap(tapTask.aiTap, tapTask);
319
- } else if ("aiHover" in flowItem) {
320
- const hoverTask = flowItem;
321
- await agent.aiHover(hoverTask.aiHover, hoverTask);
322
- } else if ("aiInput" in flowItem) {
323
- const inputTask = flowItem;
324
- await agent.aiInput(inputTask.aiInput, inputTask.locate, inputTask);
325
- } else if ("aiKeyboardPress" in flowItem) {
326
- const keyboardPressTask = flowItem;
327
- await agent.aiKeyboardPress(
328
- keyboardPressTask.aiKeyboardPress,
329
- keyboardPressTask.locate,
330
- keyboardPressTask
331
- );
332
- } else if ("aiScroll" in flowItem) {
333
- const scrollTask = flowItem;
334
- await agent.aiScroll(scrollTask, scrollTask.locate, scrollTask);
335
- } else if ("javascript" in flowItem) {
336
- const evaluateJavaScriptTask = flowItem;
337
- const result = await agent.evaluateJavaScript(
338
- evaluateJavaScriptTask.javascript
339
- );
340
- this.setResult(evaluateJavaScriptTask.name, result);
341
- } else {
342
- throw new Error(`unknown flowItem: ${JSON.stringify(flowItem)}`);
343
- }
344
- }
345
- this.reportFile = agent.reportFile;
346
- }
347
- async run() {
348
- const { target, web, android, tasks } = this.script;
349
- const webEnv = web || target;
350
- const androidEnv = android;
351
- const platform = webEnv || androidEnv;
352
- this.setPlayerStatus("running");
353
- let agent = null;
354
- let freeFn = [];
355
- try {
356
- const { agent: newAgent, freeFn: newFreeFn } = await this.setupAgent(
357
- platform
358
- );
359
- agent = newAgent;
360
- const originalOnTaskStartTip = agent.onTaskStartTip;
361
- agent.onTaskStartTip = (tip) => {
362
- if (this.status === "running") {
363
- this.agentStatusTip = tip;
364
- }
365
- originalOnTaskStartTip?.(tip);
366
- };
367
- freeFn = [
368
- ...newFreeFn || [],
369
- {
370
- name: "restore-agent-onTaskStartTip",
371
- fn: () => {
372
- if (agent) {
373
- agent.onTaskStartTip = originalOnTaskStartTip;
374
- }
375
- }
376
- }
377
- ];
378
- } catch (e) {
379
- this.setPlayerStatus("error", e);
380
- return;
381
- }
382
- this.pageAgent = agent;
383
- let taskIndex = 0;
384
- this.setPlayerStatus("running");
385
- let errorFlag = false;
386
- while (taskIndex < tasks.length) {
387
- const taskStatus = this.taskStatusList[taskIndex];
388
- this.setTaskStatus(taskIndex, "running");
389
- this.setTaskIndex(taskIndex);
390
- try {
391
- await this.playTask(taskStatus, this.pageAgent);
392
- this.setTaskStatus(taskIndex, "done");
393
- } catch (e) {
394
- this.setTaskStatus(taskIndex, "error", e);
395
- if (taskStatus.continueOnError) {
396
- } else {
397
- this.reportFile = agent.reportFile;
398
- errorFlag = true;
399
- break;
400
- }
401
- }
402
- this.reportFile = agent.reportFile;
403
- taskIndex++;
404
- }
405
- if (errorFlag) {
406
- this.setPlayerStatus("error");
407
- } else {
408
- this.setPlayerStatus("done");
409
- }
410
- this.agentStatusTip = "";
411
- for (const fn of freeFn) {
412
- try {
413
- await fn.fn();
414
- } catch (e) {
415
- }
416
- }
417
- }
418
- };
419
-
420
- // src/yaml/builder.ts
421
- import yaml from "js-yaml";
422
-
423
- // src/yaml/utils.ts
424
- import { assert as assert3 } from "misoai-shared/utils";
425
- import yaml2 from "js-yaml";
426
- function interpolateEnvVars(content) {
427
- return content.replace(/\$\{([^}]+)\}/g, (_, envVar) => {
428
- const value = process.env[envVar.trim()];
429
- if (value === void 0) {
430
- throw new Error(`Environment variable "${envVar.trim()}" is not defined`);
431
- }
432
- return value;
433
- });
434
- }
435
- function parseYamlScript(content, filePath, ignoreCheckingTarget) {
436
- const interpolatedContent = interpolateEnvVars(content);
437
- const obj = yaml2.load(interpolatedContent);
438
- const pathTip = filePath ? `, failed to load ${filePath}` : "";
439
- const android = typeof obj.android !== "undefined" ? Object.assign({}, obj.android || {}) : void 0;
440
- const webConfig = obj.web || obj.target;
441
- const web = typeof webConfig !== "undefined" ? Object.assign({}, webConfig || {}) : void 0;
442
- if (!ignoreCheckingTarget) {
443
- assert3(
444
- web || android,
445
- `at least one of "target", "web", or "android" properties is required in yaml script${pathTip}`
446
- );
447
- assert3(
448
- web && !android || !web && android,
449
- `only one of "target", "web", or "android" properties is allowed in yaml script${pathTip}`
450
- );
451
- if (web || android) {
452
- assert3(
453
- typeof web === "object" || typeof android === "object",
454
- `property "target/web/android" must be an object${pathTip}`
455
- );
456
- }
457
- }
458
- assert3(obj.tasks, `property "tasks" is required in yaml script ${pathTip}`);
459
- assert3(
460
- Array.isArray(obj.tasks),
461
- `property "tasks" must be an array in yaml script, but got ${obj.tasks}`
462
- );
463
- return obj;
464
- }
465
-
466
- // src/common/agent.ts
467
- import {
468
- groupedActionDumpFileExt,
469
- reportHTMLContent,
470
- stringifyDumpData,
471
- writeLogFile
472
- } from "misoai-core/utils";
473
- import {
474
- DEFAULT_WAIT_FOR_NAVIGATION_TIMEOUT,
475
- DEFAULT_WAIT_FOR_NETWORK_IDLE_TIMEOUT
476
- } from "misoai-shared/constants";
477
- import { getAIConfigInBoolean, vlLocateMode } from "misoai-shared/env";
478
- import { getDebug as getDebug4 } from "misoai-shared/logger";
479
- import { assert as assert7 } from "misoai-shared/utils";
480
-
481
- // src/common/tasks.ts
482
- import {
483
- Executor,
484
- plan
485
- } from "misoai-core";
486
- import {
487
- elementByPositionWithElementInfo as elementByPositionWithElementInfo2,
488
- resizeImageForUiTars,
489
- vlmPlanning
490
- } from "misoai-core/ai-model";
491
- import { sleep } from "misoai-core/utils";
492
- import { NodeType } from "misoai-shared/constants";
493
- import { getElementInfosScriptContent } from "misoai-shared/fs";
494
- import { getDebug } from "misoai-shared/logger";
495
- import { assert as assert4 } from "misoai-shared/utils";
496
-
497
- // src/common/ui-utils.ts
498
- function typeStr(task) {
499
- return task.subType && task.subType !== "Plan" ? `${task.type} / ${task.subType || ""}` : task.type;
500
- }
501
- function getKeyCommands(value) {
502
- const keys = Array.isArray(value) ? value : [value];
503
- return keys.reduce((acc, k) => {
504
- const includeMeta = keys.includes("Meta") || keys.includes("Control");
505
- if (includeMeta && (k === "a" || k === "A")) {
506
- return acc.concat([{ key: k, command: "SelectAll" }]);
507
- }
508
- if (includeMeta && (k === "c" || k === "C")) {
509
- return acc.concat([{ key: k, command: "Copy" }]);
510
- }
511
- if (includeMeta && (k === "v" || k === "V")) {
512
- return acc.concat([{ key: k, command: "Paste" }]);
513
- }
514
- return acc.concat([{ key: k }]);
515
- }, []);
516
- }
517
- function locateParamStr(locate) {
518
- if (!locate) {
519
- return "";
520
- }
521
- if (typeof locate === "string") {
522
- return locate;
523
- }
524
- return locate.prompt;
525
- }
526
- function scrollParamStr(scrollParam) {
527
- if (!scrollParam) {
528
- return "";
529
- }
530
- return `${scrollParam.direction || "down"}, ${scrollParam.scrollType || "once"}, ${scrollParam.distance || "distance-not-set"}`;
531
- }
532
- function taskTitleStr(type, prompt) {
533
- if (prompt) {
534
- return `${type} - ${prompt}`;
535
- }
536
- return type;
537
- }
538
- function paramStr(task) {
539
- let value;
540
- if (task.type === "Planning") {
541
- value = task?.param?.userInstruction;
542
- }
543
- if (task.type === "Insight") {
544
- value = task?.param?.prompt || task?.param?.id || task?.param?.dataDemand || task?.param?.assertion;
545
- }
546
- if (task.type === "Action") {
547
- const locate = task?.locate;
548
- const locateStr = locate ? locateParamStr(locate) : "";
549
- value = task.thought || "";
550
- if (typeof task?.param?.timeMs === "number") {
551
- value = `${task?.param?.timeMs}ms`;
552
- } else if (typeof task?.param?.scrollType === "string") {
553
- value = scrollParamStr(task?.param);
554
- } else if (typeof task?.param?.value !== "undefined") {
555
- value = task?.param?.value;
556
- }
557
- if (locateStr) {
558
- if (value) {
559
- value = `${locateStr} - ${value}`;
560
- } else {
561
- value = locateStr;
562
- }
563
- }
564
- }
565
- if (typeof value === "undefined")
566
- return "";
567
- return typeof value === "string" ? value : JSON.stringify(value, void 0, 2);
568
- }
569
- var limitOpenNewTabScript = `
570
- if (!window.__MIDSCENE_NEW_TAB_INTERCEPTOR_INITIALIZED__) {
571
- window.__MIDSCENE_NEW_TAB_INTERCEPTOR_INITIALIZED__ = true;
572
-
573
- // Intercept the window.open method (only once)
574
- window.open = function(url) {
575
- console.log('Blocked window.open:', url);
576
- window.location.href = url;
577
- return null;
578
- };
579
-
580
- // Block all a tag clicks with target="_blank" (only once)
581
- document.addEventListener('click', function(e) {
582
- const target = e.target.closest('a');
583
- if (target && target.target === '_blank') {
584
- e.preventDefault();
585
- console.log('Blocked new tab:', target.href);
586
- window.location.href = target.href;
587
- target.removeAttribute('target');
588
- }
589
- }, true);
590
- }
591
- `;
592
-
593
- // src/common/tasks.ts
594
- var debug = getDebug("page-task-executor");
595
- var replanningCountLimit = 10;
596
- var isAndroidPage = (page) => {
597
- return page.pageType === "android";
598
- };
599
- var PageTaskExecutor = class {
600
- constructor(page, insight, opts) {
601
- this.conversationHistory = [];
602
- this.page = page;
603
- this.insight = insight;
604
- this.taskCache = opts.taskCache;
605
- this.onTaskStartCallback = opts?.onTaskStart;
606
- }
607
- async recordScreenshot(timing) {
608
- const base64 = await this.page.screenshotBase64();
609
- const item = {
610
- type: "screenshot",
611
- ts: Date.now(),
612
- screenshot: base64,
613
- timing
614
- };
615
- return item;
616
- }
617
- async getElementXpath(pageContext, element) {
618
- let elementId = element?.id;
619
- if (element?.attributes?.nodeType === NodeType.POSITION) {
620
- await this.insight.contextRetrieverFn("locate");
621
- const info = elementByPositionWithElementInfo2(
622
- pageContext.tree,
623
- {
624
- x: element.center[0],
625
- y: element.center[1]
626
- },
627
- {
628
- requireStrictDistance: false,
629
- filterPositionElements: true
630
- }
631
- );
632
- if (info?.id) {
633
- elementId = info.id;
634
- }
635
- }
636
- if (!elementId) {
637
- return void 0;
638
- }
639
- try {
640
- const elementInfosScriptContent = getElementInfosScriptContent();
641
- const result = await this.page.evaluateJavaScript?.(
642
- `${elementInfosScriptContent}midscene_element_inspector.getXpathsById('${elementId}')`
643
- );
644
- return result;
645
- } catch (error) {
646
- debug("getXpathsById error: ", error);
647
- }
648
- }
649
- prependExecutorWithScreenshot(taskApply, appendAfterExecution = false) {
650
- const taskWithScreenshot = {
651
- ...taskApply,
652
- executor: async (param, context, ...args) => {
653
- const recorder = [];
654
- const { task } = context;
655
- task.recorder = recorder;
656
- const shot = await this.recordScreenshot(`before ${task.type}`);
657
- recorder.push(shot);
658
- const result = await taskApply.executor(param, context, ...args);
659
- if (taskApply.type === "Action") {
660
- await Promise.all([
661
- (async () => {
662
- await sleep(100);
663
- if (this.page.waitUntilNetworkIdle) {
664
- try {
665
- await this.page.waitUntilNetworkIdle();
666
- } catch (error) {
667
- }
668
- }
669
- })(),
670
- sleep(200)
671
- ]);
672
- }
673
- if (appendAfterExecution) {
674
- const shot2 = await this.recordScreenshot("after Action");
675
- recorder.push(shot2);
676
- }
677
- return result;
678
- }
679
- };
680
- return taskWithScreenshot;
681
- }
682
- async convertPlanToExecutable(plans) {
683
- const tasks = [];
684
- plans.forEach((plan2) => {
685
- if (plan2.type === "Locate") {
686
- if (plan2.locate === null || plan2.locate?.id === null || plan2.locate?.id === "null") {
687
- return;
688
- }
689
- const taskFind = {
690
- type: "Insight",
691
- subType: "Locate",
692
- param: plan2.locate || void 0,
693
- thought: plan2.thought,
694
- locate: plan2.locate,
695
- executor: async (param, taskContext) => {
696
- const { task } = taskContext;
697
- assert4(
698
- param?.prompt || param?.id || param?.bbox,
699
- "No prompt or id or position or bbox to locate"
700
- );
701
- let insightDump;
702
- let usage;
703
- const dumpCollector = (dump) => {
704
- insightDump = dump;
705
- usage = dump?.taskInfo?.usage;
706
- task.log = {
707
- dump: insightDump
708
- };
709
- task.usage = usage;
710
- };
711
- this.insight.onceDumpUpdatedFn = dumpCollector;
712
- const shotTime = Date.now();
713
- const pageContext = await this.insight.contextRetrieverFn("locate");
714
- task.pageContext = pageContext;
715
- const recordItem = {
716
- type: "screenshot",
717
- ts: shotTime,
718
- screenshot: pageContext.screenshotBase64,
719
- timing: "before locate"
720
- };
721
- task.recorder = [recordItem];
722
- let cacheHitFlag = false;
723
- const cachePrompt = param.prompt;
724
- const locateCacheRecord = this.taskCache?.matchLocateCache(cachePrompt);
725
- const xpaths = locateCacheRecord?.cacheContent?.xpaths;
726
- let elementFromCache = null;
727
- try {
728
- if (xpaths?.length && this.taskCache?.isCacheResultUsed && param?.cacheable !== false) {
729
- const elementInfosScriptContent = getElementInfosScriptContent();
730
- const element2 = await this.page.evaluateJavaScript?.(
731
- `${elementInfosScriptContent}midscene_element_inspector.getElementInfoByXpath('${xpaths[0]}')`
732
- );
733
- if (element2?.id) {
734
- elementFromCache = element2;
735
- debug("cache hit, prompt: %s", cachePrompt);
736
- cacheHitFlag = true;
737
- debug(
738
- "found a new new element with same xpath, xpath: %s, id: %s",
739
- xpaths[0],
740
- element2?.id
741
- );
742
- }
743
- }
744
- } catch (error) {
745
- debug("get element info by xpath error: ", error);
746
- }
747
- const startTime = Date.now();
748
- const element = elementFromCache || // try to match element from cache
749
- matchElementFromPlan(param, pageContext.tree) || // try to match element from plan
750
- (await this.insight.locate(param, {
751
- context: pageContext
752
- })).element;
753
- const aiCost = Date.now() - startTime;
754
- if (element && this.taskCache && !cacheHitFlag && param?.cacheable !== false) {
755
- const elementXpaths = await this.getElementXpath(
756
- pageContext,
757
- element
758
- );
759
- if (elementXpaths) {
760
- this.taskCache.updateOrAppendCacheRecord(
761
- {
762
- type: "locate",
763
- prompt: cachePrompt,
764
- xpaths: elementXpaths
765
- },
766
- locateCacheRecord
767
- );
768
- } else {
769
- debug("no xpaths found, will not update cache", cachePrompt);
770
- }
771
- }
772
- if (!element) {
773
- throw new Error(`Element not found: ${param.prompt}`);
774
- }
775
- return {
776
- output: {
777
- element
778
- },
779
- pageContext,
780
- cache: {
781
- hit: cacheHitFlag
782
- },
783
- aiCost
784
- };
785
- }
786
- };
787
- tasks.push(taskFind);
788
- } else if (plan2.type === "Assert" || plan2.type === "AssertWithoutThrow") {
789
- const assertPlan = plan2;
790
- const taskAssert = {
791
- type: "Insight",
792
- subType: "Assert",
793
- param: assertPlan.param,
794
- thought: assertPlan.thought,
795
- locate: assertPlan.locate,
796
- executor: async (param, taskContext) => {
797
- const { task } = taskContext;
798
- let insightDump;
799
- const dumpCollector = (dump) => {
800
- insightDump = dump;
801
- };
802
- this.insight.onceDumpUpdatedFn = dumpCollector;
803
- const assertion = await this.insight.assert(
804
- assertPlan.param.assertion
805
- );
806
- if (!assertion.pass) {
807
- if (plan2.type === "Assert") {
808
- task.output = assertion;
809
- task.log = {
810
- dump: insightDump
811
- };
812
- throw new Error(
813
- assertion.thought || "Assertion failed without reason"
814
- );
815
- }
816
- task.error = assertion.thought;
817
- }
818
- return {
819
- output: assertion,
820
- log: {
821
- dump: insightDump
822
- },
823
- usage: assertion.usage
824
- };
825
- }
826
- };
827
- tasks.push(taskAssert);
828
- } else if (plan2.type === "Input") {
829
- const taskActionInput = {
830
- type: "Action",
831
- subType: "Input",
832
- param: plan2.param,
833
- thought: plan2.thought,
834
- locate: plan2.locate,
835
- executor: async (taskParam, { element }) => {
836
- if (element) {
837
- await this.page.clearInput(element);
838
- if (!taskParam || !taskParam.value) {
839
- return;
840
- }
841
- await this.page.keyboard.type(taskParam.value);
842
- } else {
843
- await this.page.keyboard.type(taskParam.value);
844
- }
845
- }
846
- };
847
- tasks.push(taskActionInput);
848
- } else if (plan2.type === "KeyboardPress") {
849
- const taskActionKeyboardPress = {
850
- type: "Action",
851
- subType: "KeyboardPress",
852
- param: plan2.param,
853
- thought: plan2.thought,
854
- locate: plan2.locate,
855
- executor: async (taskParam) => {
856
- const keys = getKeyCommands(taskParam.value);
857
- await this.page.keyboard.press(keys);
858
- }
859
- };
860
- tasks.push(taskActionKeyboardPress);
861
- } else if (plan2.type === "Tap") {
862
- const taskActionTap = {
863
- type: "Action",
864
- subType: "Tap",
865
- thought: plan2.thought,
866
- locate: plan2.locate,
867
- executor: async (param, { element }) => {
868
- assert4(element, "Element not found, cannot tap");
869
- await this.page.mouse.click(element.center[0], element.center[1]);
870
- }
871
- };
872
- tasks.push(taskActionTap);
873
- } else if (plan2.type === "Drag") {
874
- const taskActionDrag = {
875
- type: "Action",
876
- subType: "Drag",
877
- param: plan2.param,
878
- thought: plan2.thought,
879
- locate: plan2.locate,
880
- executor: async (taskParam) => {
881
- assert4(
882
- taskParam?.start_box && taskParam?.end_box,
883
- "No start_box or end_box to drag"
884
- );
885
- await this.page.mouse.drag(taskParam.start_box, taskParam.end_box);
886
- }
887
- };
888
- tasks.push(taskActionDrag);
889
- } else if (plan2.type === "Hover") {
890
- const taskActionHover = {
891
- type: "Action",
892
- subType: "Hover",
893
- thought: plan2.thought,
894
- locate: plan2.locate,
895
- executor: async (param, { element }) => {
896
- assert4(element, "Element not found, cannot hover");
897
- await this.page.mouse.move(element.center[0], element.center[1]);
898
- }
899
- };
900
- tasks.push(taskActionHover);
901
- } else if (plan2.type === "Scroll") {
902
- const taskActionScroll = {
903
- type: "Action",
904
- subType: "Scroll",
905
- param: plan2.param,
906
- thought: plan2.thought,
907
- locate: plan2.locate,
908
- executor: async (taskParam, { element }) => {
909
- const startingPoint = element ? {
910
- left: element.center[0],
911
- top: element.center[1]
912
- } : void 0;
913
- const scrollToEventName = taskParam?.scrollType;
914
- if (scrollToEventName === "untilTop") {
915
- await this.page.scrollUntilTop(startingPoint);
916
- } else if (scrollToEventName === "untilBottom") {
917
- await this.page.scrollUntilBottom(startingPoint);
918
- } else if (scrollToEventName === "untilRight") {
919
- await this.page.scrollUntilRight(startingPoint);
920
- } else if (scrollToEventName === "untilLeft") {
921
- await this.page.scrollUntilLeft(startingPoint);
922
- } else if (scrollToEventName === "once" || !scrollToEventName) {
923
- if (taskParam?.direction === "down" || !taskParam || !taskParam.direction) {
924
- await this.page.scrollDown(
925
- taskParam?.distance || void 0,
926
- startingPoint
927
- );
928
- } else if (taskParam.direction === "up") {
929
- await this.page.scrollUp(
930
- taskParam.distance || void 0,
931
- startingPoint
932
- );
933
- } else if (taskParam.direction === "left") {
934
- await this.page.scrollLeft(
935
- taskParam.distance || void 0,
936
- startingPoint
937
- );
938
- } else if (taskParam.direction === "right") {
939
- await this.page.scrollRight(
940
- taskParam.distance || void 0,
941
- startingPoint
942
- );
943
- } else {
944
- throw new Error(
945
- `Unknown scroll direction: ${taskParam.direction}`
946
- );
947
- }
948
- await sleep(500);
949
- } else {
950
- throw new Error(
951
- `Unknown scroll event type: ${scrollToEventName}, taskParam: ${JSON.stringify(
952
- taskParam
953
- )}`
954
- );
955
- }
956
- }
957
- };
958
- tasks.push(taskActionScroll);
959
- } else if (plan2.type === "Sleep") {
960
- const taskActionSleep = {
961
- type: "Action",
962
- subType: "Sleep",
963
- param: plan2.param,
964
- thought: plan2.thought,
965
- locate: plan2.locate,
966
- executor: async (taskParam) => {
967
- await sleep(taskParam?.timeMs || 3e3);
968
- }
969
- };
970
- tasks.push(taskActionSleep);
971
- } else if (plan2.type === "Error") {
972
- const taskActionError = {
973
- type: "Action",
974
- subType: "Error",
975
- param: plan2.param,
976
- thought: plan2.thought || plan2.param?.thought,
977
- locate: plan2.locate,
978
- executor: async () => {
979
- throw new Error(
980
- plan2?.thought || plan2.param?.thought || "error without thought"
981
- );
982
- }
983
- };
984
- tasks.push(taskActionError);
985
- } else if (plan2.type === "ExpectedFalsyCondition") {
986
- const taskActionFalsyConditionStatement = {
987
- type: "Action",
988
- subType: "ExpectedFalsyCondition",
989
- param: null,
990
- thought: plan2.param?.reason,
991
- locate: plan2.locate,
992
- executor: async () => {
993
- }
994
- };
995
- tasks.push(taskActionFalsyConditionStatement);
996
- } else if (plan2.type === "Finished") {
997
- const taskActionFinished = {
998
- type: "Action",
999
- subType: "Finished",
1000
- param: null,
1001
- thought: plan2.thought,
1002
- locate: plan2.locate,
1003
- executor: async (param) => {
1004
- }
1005
- };
1006
- tasks.push(taskActionFinished);
1007
- } else if (plan2.type === "AndroidHomeButton") {
1008
- const taskActionAndroidHomeButton = {
1009
- type: "Action",
1010
- subType: "AndroidHomeButton",
1011
- param: null,
1012
- thought: plan2.thought,
1013
- locate: plan2.locate,
1014
- executor: async (param) => {
1015
- assert4(
1016
- isAndroidPage(this.page),
1017
- "Cannot use home button on non-Android devices"
1018
- );
1019
- await this.page.home();
1020
- }
1021
- };
1022
- tasks.push(taskActionAndroidHomeButton);
1023
- } else if (plan2.type === "AndroidBackButton") {
1024
- const taskActionAndroidBackButton = {
1025
- type: "Action",
1026
- subType: "AndroidBackButton",
1027
- param: null,
1028
- thought: plan2.thought,
1029
- locate: plan2.locate,
1030
- executor: async (param) => {
1031
- assert4(
1032
- isAndroidPage(this.page),
1033
- "Cannot use back button on non-Android devices"
1034
- );
1035
- await this.page.back();
1036
- }
1037
- };
1038
- tasks.push(taskActionAndroidBackButton);
1039
- } else if (plan2.type === "AndroidRecentAppsButton") {
1040
- const taskActionAndroidRecentAppsButton = {
1041
- type: "Action",
1042
- subType: "AndroidRecentAppsButton",
1043
- param: null,
1044
- thought: plan2.thought,
1045
- locate: plan2.locate,
1046
- executor: async (param) => {
1047
- assert4(
1048
- isAndroidPage(this.page),
1049
- "Cannot use recent apps button on non-Android devices"
1050
- );
1051
- await this.page.recentApps();
1052
- }
1053
- };
1054
- tasks.push(taskActionAndroidRecentAppsButton);
1055
- } else {
1056
- throw new Error(`Unknown or unsupported task type: ${plan2.type}`);
1057
- }
1058
- });
1059
- const wrappedTasks = tasks.map(
1060
- (task, index) => {
1061
- if (task.type === "Action") {
1062
- return this.prependExecutorWithScreenshot(
1063
- task,
1064
- index === tasks.length - 1
1065
- );
1066
- }
1067
- return task;
1068
- }
1069
- );
1070
- return {
1071
- tasks: wrappedTasks
1072
- };
1073
- }
1074
- async setupPlanningContext(executorContext) {
1075
- const shotTime = Date.now();
1076
- const pageContext = await this.insight.contextRetrieverFn("locate");
1077
- const recordItem = {
1078
- type: "screenshot",
1079
- ts: shotTime,
1080
- screenshot: pageContext.screenshotBase64,
1081
- timing: "before planning"
1082
- };
1083
- executorContext.task.recorder = [recordItem];
1084
- executorContext.task.pageContext = pageContext;
1085
- return {
1086
- pageContext
1087
- };
1088
- }
1089
- async loadYamlFlowAsPlanning(userInstruction, yamlString) {
1090
- const taskExecutor = new Executor(taskTitleStr("Action", userInstruction), {
1091
- onTaskStart: this.onTaskStartCallback
1092
- });
1093
- const task = {
1094
- type: "Planning",
1095
- subType: "LoadYaml",
1096
- locate: null,
1097
- param: {
1098
- userInstruction
1099
- },
1100
- executor: async (param, executorContext) => {
1101
- await this.setupPlanningContext(executorContext);
1102
- return {
1103
- output: {
1104
- actions: [],
1105
- more_actions_needed_by_instruction: false,
1106
- log: "",
1107
- yamlString
1108
- },
1109
- cache: {
1110
- hit: true
1111
- }
1112
- };
1113
- }
1114
- };
1115
- await taskExecutor.append(task);
1116
- await taskExecutor.flush();
1117
- return {
1118
- executor: taskExecutor
1119
- };
1120
- }
1121
- planningTaskFromPrompt(userInstruction, log, actionContext) {
1122
- const task = {
1123
- type: "Planning",
1124
- subType: "Plan",
1125
- locate: null,
1126
- param: {
1127
- userInstruction,
1128
- log
1129
- },
1130
- executor: async (param, executorContext) => {
1131
- const startTime = Date.now();
1132
- const { pageContext } = await this.setupPlanningContext(executorContext);
1133
- const planResult = await plan(param.userInstruction, {
1134
- context: pageContext,
1135
- log: param.log,
1136
- actionContext,
1137
- pageType: this.page.pageType
1138
- });
1139
- const {
1140
- actions,
1141
- log: log2,
1142
- more_actions_needed_by_instruction,
1143
- error,
1144
- usage,
1145
- rawResponse,
1146
- sleep: sleep3
1147
- } = planResult;
1148
- executorContext.task.log = {
1149
- rawResponse
1150
- };
1151
- executorContext.task.usage = usage;
1152
- let stopCollecting = false;
1153
- let bboxCollected = false;
1154
- let planParsingError = "";
1155
- const finalActions = (actions || []).reduce(
1156
- (acc, planningAction) => {
1157
- if (stopCollecting) {
1158
- return acc;
1159
- }
1160
- if (planningAction.locate) {
1161
- if (bboxCollected && planningAction.locate.bbox) {
1162
- delete planningAction.locate.bbox;
1163
- }
1164
- if (planningAction.locate.bbox) {
1165
- bboxCollected = true;
1166
- }
1167
- acc.push({
1168
- type: "Locate",
1169
- locate: planningAction.locate,
1170
- param: null,
1171
- thought: planningAction.locate.prompt
1172
- });
1173
- } else if (["Tap", "Hover", "Input"].includes(planningAction.type)) {
1174
- planParsingError = `invalid planning response: ${JSON.stringify(planningAction)}`;
1175
- stopCollecting = true;
1176
- return acc;
1177
- }
1178
- acc.push(planningAction);
1179
- return acc;
1180
- },
1181
- []
1182
- );
1183
- if (sleep3) {
1184
- const timeNow = Date.now();
1185
- const timeRemaining = sleep3 - (timeNow - startTime);
1186
- if (timeRemaining > 0) {
1187
- finalActions.push({
1188
- type: "Sleep",
1189
- param: {
1190
- timeMs: timeRemaining
1191
- },
1192
- locate: null
1193
- });
1194
- }
1195
- }
1196
- if (finalActions.length === 0) {
1197
- assert4(
1198
- !more_actions_needed_by_instruction || sleep3,
1199
- error ? `Failed to plan: ${error}` : planParsingError || "No plan found"
1200
- );
1201
- }
1202
- return {
1203
- output: {
1204
- actions: finalActions,
1205
- more_actions_needed_by_instruction,
1206
- log: log2,
1207
- yamlFlow: planResult.yamlFlow
1208
- },
1209
- cache: {
1210
- hit: false
1211
- },
1212
- pageContext
1213
- };
1214
- }
1215
- };
1216
- return task;
1217
- }
1218
- planningTaskToGoal(userInstruction) {
1219
- const task = {
1220
- type: "Planning",
1221
- subType: "Plan",
1222
- locate: null,
1223
- param: {
1224
- userInstruction
1225
- },
1226
- executor: async (param, executorContext) => {
1227
- const { pageContext } = await this.setupPlanningContext(executorContext);
1228
- const imagePayload = await resizeImageForUiTars(
1229
- pageContext.screenshotBase64,
1230
- pageContext.size
1231
- );
1232
- this.appendConversationHistory({
1233
- role: "user",
1234
- content: [
1235
- {
1236
- type: "image_url",
1237
- image_url: {
1238
- url: imagePayload
1239
- }
1240
- }
1241
- ]
1242
- });
1243
- const startTime = Date.now();
1244
- const planResult = await vlmPlanning({
1245
- userInstruction: param.userInstruction,
1246
- conversationHistory: this.conversationHistory,
1247
- size: pageContext.size
1248
- });
1249
- const aiCost = Date.now() - startTime;
1250
- const { actions, action_summary } = planResult;
1251
- this.appendConversationHistory({
1252
- role: "assistant",
1253
- content: action_summary
1254
- });
1255
- return {
1256
- output: {
1257
- actions,
1258
- thought: actions[0]?.thought,
1259
- actionType: actions[0].type,
1260
- more_actions_needed_by_instruction: true,
1261
- log: "",
1262
- yamlFlow: planResult.yamlFlow
1263
- },
1264
- cache: {
1265
- hit: false
1266
- },
1267
- aiCost
1268
- };
1269
- }
1270
- };
1271
- return task;
1272
- }
1273
- async runPlans(title, plans) {
1274
- const taskExecutor = new Executor(title, {
1275
- onTaskStart: this.onTaskStartCallback
1276
- });
1277
- const { tasks } = await this.convertPlanToExecutable(plans);
1278
- await taskExecutor.append(tasks);
1279
- const result = await taskExecutor.flush();
1280
- return {
1281
- output: result,
1282
- executor: taskExecutor
1283
- };
1284
- }
1285
- async action(userPrompt, actionContext) {
1286
- const taskExecutor = new Executor(taskTitleStr("Action", userPrompt), {
1287
- onTaskStart: this.onTaskStartCallback
1288
- });
1289
- let planningTask = this.planningTaskFromPrompt(userPrompt, void 0, actionContext);
1290
- let replanCount = 0;
1291
- const logList = [];
1292
- const yamlFlow = [];
1293
- while (planningTask) {
1294
- if (replanCount > replanningCountLimit) {
1295
- const errorMsg = "Replanning too many times, please split the task into multiple steps";
1296
- return this.appendErrorPlan(taskExecutor, errorMsg);
1297
- }
1298
- await taskExecutor.append(planningTask);
1299
- const planResult = await taskExecutor.flush();
1300
- if (taskExecutor.isInErrorState()) {
1301
- return {
1302
- output: planResult,
1303
- executor: taskExecutor
1304
- };
1305
- }
1306
- const plans = planResult.actions || [];
1307
- yamlFlow.push(...planResult.yamlFlow || []);
1308
- let executables;
1309
- try {
1310
- executables = await this.convertPlanToExecutable(plans);
1311
- taskExecutor.append(executables.tasks);
1312
- } catch (error) {
1313
- return this.appendErrorPlan(
1314
- taskExecutor,
1315
- `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(
1316
- plans
1317
- )}`
1318
- );
1319
- }
1320
- await taskExecutor.flush();
1321
- if (taskExecutor.isInErrorState()) {
1322
- return {
1323
- output: void 0,
1324
- executor: taskExecutor
1325
- };
1326
- }
1327
- if (planResult?.log) {
1328
- logList.push(planResult.log);
1329
- }
1330
- if (!planResult.more_actions_needed_by_instruction) {
1331
- planningTask = null;
1332
- break;
1333
- }
1334
- planningTask = this.planningTaskFromPrompt(
1335
- userPrompt,
1336
- logList.length > 0 ? `- ${logList.join("\n- ")}` : void 0,
1337
- actionContext
1338
- );
1339
- replanCount++;
1340
- }
1341
- return {
1342
- output: {
1343
- yamlFlow
1344
- },
1345
- executor: taskExecutor
1346
- };
1347
- }
1348
- async actionToGoal(userPrompt) {
1349
- const taskExecutor = new Executor(taskTitleStr("Action", userPrompt), {
1350
- onTaskStart: this.onTaskStartCallback
1351
- });
1352
- this.conversationHistory = [];
1353
- const isCompleted = false;
1354
- let currentActionNumber = 0;
1355
- const maxActionNumber = 40;
1356
- const yamlFlow = [];
1357
- while (!isCompleted && currentActionNumber < maxActionNumber) {
1358
- currentActionNumber++;
1359
- const planningTask = this.planningTaskToGoal(userPrompt);
1360
- await taskExecutor.append(planningTask);
1361
- const output = await taskExecutor.flush();
1362
- if (taskExecutor.isInErrorState()) {
1363
- return {
1364
- output: void 0,
1365
- executor: taskExecutor
1366
- };
1367
- }
1368
- const plans = output.actions;
1369
- yamlFlow.push(...output.yamlFlow || []);
1370
- let executables;
1371
- try {
1372
- executables = await this.convertPlanToExecutable(plans);
1373
- taskExecutor.append(executables.tasks);
1374
- } catch (error) {
1375
- return this.appendErrorPlan(
1376
- taskExecutor,
1377
- `Error converting plans to executable tasks: ${error}, plans: ${JSON.stringify(
1378
- plans
1379
- )}`
1380
- );
1381
- }
1382
- await taskExecutor.flush();
1383
- if (taskExecutor.isInErrorState()) {
1384
- return {
1385
- output: void 0,
1386
- executor: taskExecutor
1387
- };
1388
- }
1389
- if (plans[0].type === "Finished") {
1390
- break;
1391
- }
1392
- }
1393
- return {
1394
- output: {
1395
- yamlFlow
1396
- },
1397
- executor: taskExecutor
1398
- };
1399
- }
1400
- async createTypeQueryTask(type, demand) {
1401
- const taskExecutor = new Executor(
1402
- taskTitleStr(
1403
- type,
1404
- typeof demand === "string" ? demand : JSON.stringify(demand)
1405
- ),
1406
- {
1407
- onTaskStart: this.onTaskStartCallback
1408
- }
1409
- );
1410
- const queryTask = {
1411
- type: "Insight",
1412
- subType: type,
1413
- locate: null,
1414
- param: {
1415
- dataDemand: demand
1416
- // for user param presentation in report right sidebar
1417
- },
1418
- executor: async (param) => {
1419
- let insightDump;
1420
- const dumpCollector = (dump) => {
1421
- insightDump = dump;
1422
- };
1423
- this.insight.onceDumpUpdatedFn = dumpCollector;
1424
- const ifTypeRestricted = type !== "Query";
1425
- let demandInput = demand;
1426
- if (ifTypeRestricted) {
1427
- demandInput = {
1428
- result: `${type}, ${demand}`
1429
- };
1430
- }
1431
- const { data, usage } = await this.insight.extract(demandInput);
1432
- let outputResult = data;
1433
- if (ifTypeRestricted) {
1434
- assert4(data?.result !== void 0, "No result in query data");
1435
- outputResult = data.result;
1436
- }
1437
- return {
1438
- output: outputResult,
1439
- log: { dump: insightDump },
1440
- usage
1441
- };
1442
- }
1443
- };
1444
- await taskExecutor.append(this.prependExecutorWithScreenshot(queryTask));
1445
- const output = await taskExecutor.flush();
1446
- return {
1447
- output,
1448
- executor: taskExecutor
1449
- };
1450
- }
1451
- async query(demand) {
1452
- return this.createTypeQueryTask("Query", demand);
1453
- }
1454
- async boolean(prompt) {
1455
- return this.createTypeQueryTask("Boolean", prompt);
1456
- }
1457
- async number(prompt) {
1458
- return this.createTypeQueryTask("Number", prompt);
1459
- }
1460
- async string(prompt) {
1461
- return this.createTypeQueryTask("String", prompt);
1462
- }
1463
- async assert(assertion) {
1464
- const description = `assert: ${assertion}`;
1465
- const taskExecutor = new Executor(taskTitleStr("Assert", description), {
1466
- onTaskStart: this.onTaskStartCallback
1467
- });
1468
- const assertionPlan = {
1469
- type: "Assert",
1470
- param: {
1471
- assertion
1472
- },
1473
- locate: null
1474
- };
1475
- const { tasks } = await this.convertPlanToExecutable([assertionPlan]);
1476
- await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
1477
- const output = await taskExecutor.flush();
1478
- return {
1479
- output,
1480
- executor: taskExecutor
1481
- };
1482
- }
1483
- /**
1484
- * Append a message to the conversation history
1485
- * For user messages with images:
1486
- * - Keep max 4 user image messages in history
1487
- * - Remove oldest user image message when limit reached
1488
- * For assistant messages:
1489
- * - Simply append to history
1490
- * @param conversationHistory Message to append
1491
- */
1492
- appendConversationHistory(conversationHistory) {
1493
- if (conversationHistory.role === "user") {
1494
- const userImgItems = this.conversationHistory.filter(
1495
- (item) => item.role === "user"
1496
- );
1497
- if (userImgItems.length >= 4 && conversationHistory.role === "user") {
1498
- const firstUserImgIndex = this.conversationHistory.findIndex(
1499
- (item) => item.role === "user"
1500
- );
1501
- if (firstUserImgIndex >= 0) {
1502
- this.conversationHistory.splice(firstUserImgIndex, 1);
1503
- }
1504
- }
1505
- }
1506
- this.conversationHistory.push(conversationHistory);
1507
- }
1508
- async appendErrorPlan(taskExecutor, errorMsg) {
1509
- const errorPlan = {
1510
- type: "Error",
1511
- param: {
1512
- thought: errorMsg
1513
- },
1514
- locate: null
1515
- };
1516
- const { tasks } = await this.convertPlanToExecutable([errorPlan]);
1517
- await taskExecutor.append(this.prependExecutorWithScreenshot(tasks[0]));
1518
- await taskExecutor.flush();
1519
- return {
1520
- output: void 0,
1521
- executor: taskExecutor
1522
- };
1523
- }
1524
- async waitFor(assertion, opt) {
1525
- const description = `waitFor: ${assertion}`;
1526
- const taskExecutor = new Executor(taskTitleStr("WaitFor", description), {
1527
- onTaskStart: this.onTaskStartCallback
1528
- });
1529
- const { timeoutMs, checkIntervalMs } = opt;
1530
- assert4(assertion, "No assertion for waitFor");
1531
- assert4(timeoutMs, "No timeoutMs for waitFor");
1532
- assert4(checkIntervalMs, "No checkIntervalMs for waitFor");
1533
- const overallStartTime = Date.now();
1534
- let startTime = Date.now();
1535
- let errorThought = "";
1536
- while (Date.now() - overallStartTime < timeoutMs) {
1537
- startTime = Date.now();
1538
- const assertPlan = {
1539
- type: "AssertWithoutThrow",
1540
- param: {
1541
- assertion
1542
- },
1543
- locate: null
1544
- };
1545
- const { tasks: assertTasks } = await this.convertPlanToExecutable([
1546
- assertPlan
1547
- ]);
1548
- await taskExecutor.append(
1549
- this.prependExecutorWithScreenshot(assertTasks[0])
1550
- );
1551
- const output = await taskExecutor.flush();
1552
- if (output?.pass) {
1553
- return {
1554
- output: void 0,
1555
- executor: taskExecutor
1556
- };
1557
- }
1558
- errorThought = output?.thought || `unknown error when waiting for assertion: ${assertion}`;
1559
- const now = Date.now();
1560
- if (now - startTime < checkIntervalMs) {
1561
- const timeRemaining = checkIntervalMs - (now - startTime);
1562
- const sleepPlan = {
1563
- type: "Sleep",
1564
- param: {
1565
- timeMs: timeRemaining
1566
- },
1567
- locate: null
1568
- };
1569
- const { tasks: sleepTasks } = await this.convertPlanToExecutable([
1570
- sleepPlan
1571
- ]);
1572
- await taskExecutor.append(
1573
- this.prependExecutorWithScreenshot(sleepTasks[0])
1574
- );
1575
- await taskExecutor.flush();
1576
- }
1577
- }
1578
- return this.appendErrorPlan(
1579
- taskExecutor,
1580
- `waitFor timeout: ${errorThought}`
1581
- );
1582
- }
1583
- };
1584
-
1585
- // src/common/plan-builder.ts
1586
- import { getDebug as getDebug2 } from "misoai-shared/logger";
1587
- import { assert as assert5 } from "misoai-shared/utils";
1588
- var debug2 = getDebug2("plan-builder");
1589
- function buildPlans(type, locateParam, param) {
1590
- let returnPlans = [];
1591
- const locatePlan = locateParam ? {
1592
- type: "Locate",
1593
- locate: locateParam,
1594
- param: locateParam,
1595
- thought: ""
1596
- } : null;
1597
- if (type === "Tap" || type === "Hover") {
1598
- assert5(locateParam, `missing locate info for action "${type}"`);
1599
- assert5(locatePlan, `missing locate info for action "${type}"`);
1600
- const tapPlan = {
1601
- type,
1602
- param: null,
1603
- thought: "",
1604
- locate: locateParam
1605
- };
1606
- returnPlans = [locatePlan, tapPlan];
1607
- }
1608
- if (type === "Input" || type === "KeyboardPress") {
1609
- if (type === "Input") {
1610
- assert5(locateParam, `missing locate info for action "${type}"`);
1611
- }
1612
- assert5(param, `missing param for action "${type}"`);
1613
- const inputPlan = {
1614
- type,
1615
- param,
1616
- thought: "",
1617
- locate: locateParam
1618
- };
1619
- if (locatePlan) {
1620
- returnPlans = [locatePlan, inputPlan];
1621
- } else {
1622
- returnPlans = [inputPlan];
1623
- }
1624
- }
1625
- if (type === "Scroll") {
1626
- assert5(param, `missing param for action "${type}"`);
1627
- const scrollPlan = {
1628
- type,
1629
- param,
1630
- thought: "",
1631
- locate: locateParam
1632
- };
1633
- if (locatePlan) {
1634
- returnPlans = [locatePlan, scrollPlan];
1635
- } else {
1636
- returnPlans = [scrollPlan];
1637
- }
1638
- }
1639
- if (type === "Sleep") {
1640
- assert5(param, `missing param for action "${type}"`);
1641
- const sleepPlan = {
1642
- type,
1643
- param,
1644
- thought: "",
1645
- locate: null
1646
- };
1647
- returnPlans = [sleepPlan];
1648
- }
1649
- if (type === "Locate") {
1650
- assert5(locateParam, `missing locate info for action "${type}"`);
1651
- const locatePlan2 = {
1652
- type,
1653
- param: locateParam,
1654
- locate: locateParam,
1655
- thought: ""
1656
- };
1657
- returnPlans = [locatePlan2];
1658
- }
1659
- if (returnPlans) {
1660
- debug2("buildPlans", returnPlans);
1661
- return returnPlans;
1662
- }
1663
- throw new Error(`Not supported type: ${type}`);
1664
- }
1665
-
1666
- // src/common/task-cache.ts
1667
- import assert6 from "assert";
1668
- import { existsSync as existsSync2, readFileSync, writeFileSync as writeFileSync2 } from "fs";
1669
- import { join as join2 } from "path";
1670
- import { getMidsceneRunSubDir as getMidsceneRunSubDir2 } from "misoai-shared/common";
1671
- import { getDebug as getDebug3 } from "misoai-shared/logger";
1672
- import { ifInBrowser as ifInBrowser2 } from "misoai-shared/utils";
1673
- import yaml3 from "js-yaml";
1674
- import semver from "semver";
1675
-
1676
- // package.json
1677
- var version = "1.0.0";
1678
-
1679
- // src/common/task-cache.ts
1680
- var debug3 = getDebug3("cache");
1681
- var lowestSupportedMidsceneVersion = "0.16.10";
1682
- var cacheFileExt = ".cache.yaml";
1683
/**
 * YAML-file-backed cache of "plan" and "locate" records for one cache id.
 *
 * Records loaded at construction time can each be matched once per instance;
 * records appended later are written to the file but are not considered by
 * `matchCache` on the same instance.
 */
var TaskCache = class {
  /**
   * @param cacheId Required id; sanitized with `replaceIllegalPathCharsAndSpace`.
   * @param isCacheResultUsed Stored on the instance; also gates the legacy `.json` warning.
   * @param cacheFilePath Optional explicit cache file path; ignored when `ifInBrowser2` is truthy.
   */
  constructor(cacheId, isCacheResultUsed, cacheFilePath) {
    // Keys of records that have already been handed out by matchCache.
    this.matchedCacheIndices = new Set();
    assert6(cacheId, "cacheId is required");
    this.cacheId = replaceIllegalPathCharsAndSpace(cacheId);
    if (ifInBrowser2) {
      this.cacheFilePath = void 0;
    } else {
      this.cacheFilePath = cacheFilePath || join2(getMidsceneRunSubDir2("cache"), `${this.cacheId}${cacheFileExt}`);
    }
    this.isCacheResultUsed = isCacheResultUsed;

    const loadedCache = this.cacheFilePath ? this.loadCacheFromFile() : void 0;
    this.cache = loadedCache || {
      midsceneVersion: version,
      cacheId: this.cacheId,
      caches: []
    };
    // Only records present at construction time are eligible for matching.
    this.cacheOriginalLength = this.cache.caches.length;
  }

  /**
   * Finds the first not-yet-matched record with the given type and prompt.
   *
   * @returns `{ cacheContent, updateFn }` for the matched record, or `undefined`.
   *   `updateFn(cb)` calls `cb` with the record and then flushes the cache to file.
   */
  matchCache(prompt, type) {
    for (let index = 0; index < this.cacheOriginalLength; index++) {
      const record = this.cache.caches[index];
      const usageKey = `${type}:${prompt}:${index}`;
      if (record.type !== type || record.prompt !== prompt || this.matchedCacheIndices.has(usageKey)) {
        continue;
      }
      this.matchedCacheIndices.add(usageKey);
      debug3(
        "cache found and marked as used, type: %s, prompt: %s, index: %d",
        type,
        prompt,
        index
      );
      const updateFn = (cb) => {
        debug3(
          "will call updateFn to update cache, type: %s, prompt: %s, index: %d",
          type,
          prompt,
          index
        );
        cb(record);
        debug3(
          "cache updated, will flush to file, type: %s, prompt: %s, index: %d",
          type,
          prompt,
          index
        );
        this.flushCacheToFile();
      };
      return { cacheContent: record, updateFn };
    }
    debug3("no unused cache found, type: %s, prompt: %s", type, prompt);
    return void 0;
  }

  /** Matches a record of type "plan". */
  matchPlanCache(prompt) {
    return this.matchCache(prompt, "plan");
  }

  /** Matches a record of type "locate". */
  matchLocateCache(prompt) {
    return this.matchCache(prompt, "locate");
  }

  /** Appends a record and flushes the cache to file. */
  appendCache(cache) {
    debug3("will append cache", cache);
    this.cache.caches.push(cache);
    this.flushCacheToFile();
  }

  /**
   * Loads the cache content from `cacheFilePath`.
   *
   * @returns The parsed content with `midsceneVersion` overwritten by the current
   *   version, or `undefined` when the file is missing, a legacy `.json` cache
   *   is present (and cache results are used), the stored version is too old,
   *   or reading/parsing fails.
   */
  loadCacheFromFile() {
    const cacheFile = this.cacheFilePath;
    assert6(cacheFile, "cache file path is required");
    if (!existsSync2(cacheFile)) {
      debug3("no cache file found, path: %s", cacheFile);
      return void 0;
    }
    const legacyJsonFile = cacheFile.replace(cacheFileExt, ".json");
    const legacyFilePresent = existsSync2(legacyJsonFile);
    if (legacyFilePresent && this.isCacheResultUsed) {
      console.warn(
        `An outdated cache file from an earlier version of Midscene has been detected. Since version 0.17, we have implemented an improved caching strategy. Please delete the old file located at: ${legacyJsonFile}.`
      );
      return void 0;
    }
    try {
      const fileText = readFileSync(cacheFile, "utf8");
      const parsed = yaml3.load(fileText);
      if (!version) {
        debug3("no midscene version info, will not read cache from file");
        return void 0;
      }
      const storedVersion = parsed.midsceneVersion;
      if (semver.lt(storedVersion, lowestSupportedMidsceneVersion) && !storedVersion.includes("beta")) {
        console.warn(
          `You are using an old version of Midscene cache file, and we cannot match any info from it. Starting from Midscene v0.17, we changed our strategy to use xpath for cache info, providing better performance.
Please delete the existing cache and rebuild it. Sorry for the inconvenience.
cache file: ${cacheFile}`
        );
        return void 0;
      }
      debug3(
        "cache loaded from file, path: %s, cache version: %s, record length: %s",
        cacheFile,
        parsed.midsceneVersion,
        parsed.caches.length
      );
      parsed.midsceneVersion = version;
      return parsed;
    } catch (err) {
      // Any read/parse/shape failure is treated as "no cache".
      debug3(
        "cache file exists but load failed, path: %s, error: %s",
        cacheFile,
        err
      );
      return void 0;
    }
  }

  /** Writes the in-memory cache to `cacheFilePath` as YAML; failures are only logged. */
  flushCacheToFile() {
    if (!version) {
      debug3("no midscene version info, will not write cache to file");
      return;
    }
    const targetPath = this.cacheFilePath;
    if (!targetPath) {
      debug3("no cache file path, will not write cache to file");
      return;
    }
    try {
      writeFileSync2(targetPath, yaml3.dump(this.cache));
    } catch (err) {
      debug3(
        "write cache to file failed, path: %s, error: %s",
        this.cacheFilePath,
        err
      );
    }
  }

  /**
   * Updates `cachedRecord` in place from `newRecord`, or appends `newRecord`
   * when there is no cached record.
   *
   * "plan" records copy `yamlWorkflow`; all other records copy `xpaths`.
   */
  updateOrAppendCacheRecord(newRecord, cachedRecord) {
    if (!cachedRecord) {
      this.appendCache(newRecord);
      return;
    }
    if (newRecord.type === "plan") {
      cachedRecord.updateFn((cache) => {
        cache.yamlWorkflow = newRecord.yamlWorkflow;
      });
      return;
    }
    cachedRecord.updateFn((cache) => {
      cache.xpaths = newRecord.xpaths;
    });
  }
};
1834
-
1835
- // src/common/agent.ts
1836
- var debug4 = getDebug4("web-integration");
1837
/**
 * Euclidean distance between two `[x, y]` points, rounded to the nearest integer.
 */
var distanceOfTwoPoints = (p1, p2) => {
  const deltaX = p1[0] - p2[0];
  const deltaY = p1[1] - p2[1];
  const squaredDistance = deltaX ** 2 + deltaY ** 2;
  return Math.round(Math.sqrt(squaredDistance));
};
1842
/**
 * Tells whether an `[x, y]` point lies inside `rect` (`left`, `top`, `width`,
 * `height`); points on the edges count as inside.
 */
var includedInRect = (point, rect) => {
  const [x, y] = point;
  const right = rect.left + rect.width;
  const bottom = rect.top + rect.height;
  const withinHorizontal = x >= rect.left && x <= right;
  const withinVertical = y >= rect.top && y <= bottom;
  return withinHorizontal && withinVertical;
};
1847
- var PageAgent = class {
1848
- constructor(page, opts) {
1849
- /**
1850
- * If true, the agent will not perform any actions
1851
- */
1852
- this.dryMode = false;
1853
- this.page = page;
1854
- this.opts = Object.assign(
1855
- {
1856
- generateReport: true,
1857
- autoPrintReportMsg: true,
1858
- groupName: "Midscene Report",
1859
- groupDescription: ""
1860
- },
1861
- opts || {}
1862
- );
1863
- if (this.page.pageType === "puppeteer" || this.page.pageType === "playwright") {
1864
- this.page.waitForNavigationTimeout = this.opts.waitForNavigationTimeout || DEFAULT_WAIT_FOR_NAVIGATION_TIMEOUT;
1865
- this.page.waitForNetworkIdleTimeout = this.opts.waitForNetworkIdleTimeout || DEFAULT_WAIT_FOR_NETWORK_IDLE_TIMEOUT;
1866
- }
1867
- this.onTaskStartTip = this.opts.onTaskStartTip;
1868
- this.insight = new Insight(
1869
- async (action) => {
1870
- return this.getUIContext(action);
1871
- }
1872
- );
1873
- if (opts?.cacheId && this.page.pageType !== "android") {
1874
- this.taskCache = new TaskCache(
1875
- opts.cacheId,
1876
- getAIConfigInBoolean("MIDSCENE_CACHE")
1877
- // if we should use cache to match the element
1878
- );
1879
- }
1880
- this.taskExecutor = new PageTaskExecutor(this.page, this.insight, {
1881
- taskCache: this.taskCache,
1882
- onTaskStart: this.callbackOnTaskStartTip.bind(this)
1883
- });
1884
- this.dump = this.resetDump();
1885
- this.reportFileName = reportFileName(
1886
- opts?.testId || this.page.pageType || "web"
1887
- );
1888
- }
1889
- async getUIContext(action) {
1890
- if (action && (action === "extract" || action === "assert" || action === "captcha")) {
1891
- return await parseContextFromWebPage(this.page, {
1892
- ignoreMarker: true
1893
- });
1894
- }
1895
- return await parseContextFromWebPage(this.page, {
1896
- ignoreMarker: !!vlLocateMode()
1897
- });
1898
- }
1899
- // Helper method to call the insight.captcha method
1900
- async _callInsightCaptcha(options) {
1901
- const context = await this.getUIContext();
1902
- if (this.page.url) {
1903
- const url = await this.page.url();
1904
- context.url = url;
1905
- }
1906
- return this.insight.captcha(context, options);
1907
- }
1908
- async setAIActionContext(prompt) {
1909
- this.opts.aiActionContext = prompt;
1910
- }
1911
- resetDump() {
1912
- this.dump = {
1913
- groupName: this.opts.groupName,
1914
- groupDescription: this.opts.groupDescription,
1915
- executions: []
1916
- };
1917
- return this.dump;
1918
- }
1919
- appendExecutionDump(execution) {
1920
- const currentDump = this.dump;
1921
- currentDump.executions.push(execution);
1922
- }
1923
- dumpDataString() {
1924
- this.dump.groupName = this.opts.groupName;
1925
- this.dump.groupDescription = this.opts.groupDescription;
1926
- return stringifyDumpData(this.dump);
1927
- }
1928
- reportHTMLString() {
1929
- return reportHTMLContent(this.dumpDataString());
1930
- }
1931
- writeOutActionDumps() {
1932
- const { generateReport, autoPrintReportMsg } = this.opts;
1933
- this.reportFile = writeLogFile({
1934
- fileName: this.reportFileName,
1935
- fileExt: groupedActionDumpFileExt,
1936
- fileContent: this.dumpDataString(),
1937
- type: "dump",
1938
- generateReport
1939
- });
1940
- debug4("writeOutActionDumps", this.reportFile);
1941
- if (generateReport && autoPrintReportMsg && this.reportFile) {
1942
- printReportMsg(this.reportFile);
1943
- }
1944
- }
1945
- async callbackOnTaskStartTip(task) {
1946
- const param = paramStr(task);
1947
- const tip = param ? `${typeStr(task)} - ${param}` : typeStr(task);
1948
- if (this.onTaskStartTip) {
1949
- await this.onTaskStartTip(tip);
1950
- }
1951
- }
1952
- afterTaskRunning(executor, doNotThrowError = false) {
1953
- this.appendExecutionDump(executor.dump());
1954
- this.writeOutActionDumps();
1955
- if (executor.isInErrorState() && !doNotThrowError) {
1956
- const errorTask = executor.latestErrorTask();
1957
- throw new Error(`${errorTask?.error}`);
1958
- }
1959
- const lastTask = executor.tasks[executor.tasks.length - 1];
1960
- const allThoughts = executor.tasks.filter((task) => task.thought).map((task) => task.thought);
1961
- const allLocates = executor.tasks.filter((task) => task.locate).map((task) => task.locate);
1962
- const allPlans = executor.tasks.filter((task) => task.param?.plans).map((task) => task.param?.plans);
1963
- const planningTasks = executor.tasks.filter((task) => task.type === "Planning");
1964
- const insightTasks = executor.tasks.filter((task) => task.type === "Insight");
1965
- const actionTasks = executor.tasks.filter((task) => task.type === "Action");
1966
- const planning = planningTasks.length > 0 ? {
1967
- type: "Planning",
1968
- description: `Planning for task execution`,
1969
- steps: planningTasks.map((task) => task.thought || "Planning step")
1970
- } : void 0;
1971
- const insight = insightTasks.length > 0 ? {
1972
- type: "Insight",
1973
- description: `Insight for task execution`,
1974
- elements: insightTasks.map((task) => task.thought || "Insight element")
1975
- } : void 0;
1976
- const action = actionTasks.length > 0 ? {
1977
- type: "Action",
1978
- description: `Action for task execution`,
1979
- result: lastTask?.output
1980
- } : void 0;
1981
- const actionDetails = executor.tasks.map((task) => ({
1982
- type: task.type,
1983
- subType: task.subType,
1984
- status: task.status,
1985
- thought: task.thought
1986
- }));
1987
- const metadata = {
1988
- status: lastTask?.status,
1989
- start: lastTask?.timing?.start,
1990
- end: lastTask?.timing?.end,
1991
- totalTime: lastTask?.timing?.cost,
1992
- cache: lastTask?.cache,
1993
- usage: lastTask?.usage,
1994
- thought: allThoughts.length > 0 ? allThoughts.join("\n") : lastTask?.thought,
1995
- locate: allLocates.length > 0 ? allLocates : lastTask?.locate,
1996
- plan: allPlans.length > 0 ? allPlans : lastTask?.param?.plans,
1997
- // Add planning, insight, and action information
1998
- planning,
1999
- insight,
2000
- action,
2001
- actionDetails,
2002
- // Include raw tasks for debugging
2003
- tasks: executor.tasks.map((task) => ({
2004
- type: task.type,
2005
- subType: task.subType,
2006
- status: task.status,
2007
- thought: task.thought,
2008
- locate: task.locate,
2009
- timing: task.timing,
2010
- usage: task.usage,
2011
- cache: task.cache,
2012
- error: task.error
2013
- }))
2014
- };
2015
- return metadata;
2016
- }
2017
- buildDetailedLocateParam(locatePrompt, opt) {
2018
- assert7(locatePrompt, "missing locate prompt");
2019
- if (typeof opt === "object") {
2020
- const prompt = opt.prompt || locatePrompt;
2021
- const deepThink = opt.deepThink || false;
2022
- const cacheable = opt.cacheable || true;
2023
- return {
2024
- prompt,
2025
- deepThink,
2026
- cacheable
2027
- };
2028
- }
2029
- return {
2030
- prompt: locatePrompt
2031
- };
2032
- }
2033
- async aiTap(locatePrompt, opt) {
2034
- const detailedLocateParam = this.buildDetailedLocateParam(
2035
- locatePrompt,
2036
- opt
2037
- );
2038
- const plans = buildPlans("Tap", detailedLocateParam);
2039
- const { executor, output } = await this.taskExecutor.runPlans(
2040
- taskTitleStr("Tap", locateParamStr(detailedLocateParam)),
2041
- plans
2042
- );
2043
- const metadata = this.afterTaskRunning(executor);
2044
- return {
2045
- result: output,
2046
- metadata
2047
- };
2048
- }
2049
- async aiHover(locatePrompt, opt) {
2050
- const detailedLocateParam = this.buildDetailedLocateParam(
2051
- locatePrompt,
2052
- opt
2053
- );
2054
- const plans = buildPlans("Hover", detailedLocateParam);
2055
- const { executor, output } = await this.taskExecutor.runPlans(
2056
- taskTitleStr("Hover", locateParamStr(detailedLocateParam)),
2057
- plans
2058
- );
2059
- const metadata = this.afterTaskRunning(executor);
2060
- return {
2061
- result: output,
2062
- metadata
2063
- };
2064
- }
2065
- async aiInput(value, locatePrompt, opt) {
2066
- assert7(
2067
- typeof value === "string",
2068
- "input value must be a string, use empty string if you want to clear the input"
2069
- );
2070
- assert7(locatePrompt, "missing locate prompt for input");
2071
- const detailedLocateParam = this.buildDetailedLocateParam(
2072
- locatePrompt,
2073
- opt
2074
- );
2075
- const plans = buildPlans("Input", detailedLocateParam, {
2076
- value
2077
- });
2078
- const { executor, output } = await this.taskExecutor.runPlans(
2079
- taskTitleStr("Input", locateParamStr(detailedLocateParam)),
2080
- plans
2081
- );
2082
- const metadata = this.afterTaskRunning(executor);
2083
- return {
2084
- result: output,
2085
- metadata
2086
- };
2087
- }
2088
- async aiKeyboardPress(keyName, locatePrompt, opt) {
2089
- assert7(keyName, "missing keyName for keyboard press");
2090
- const detailedLocateParam = locatePrompt ? this.buildDetailedLocateParam(locatePrompt, opt) : void 0;
2091
- const plans = buildPlans("KeyboardPress", detailedLocateParam, {
2092
- value: keyName
2093
- });
2094
- const { executor, output } = await this.taskExecutor.runPlans(
2095
- taskTitleStr("KeyboardPress", locateParamStr(detailedLocateParam)),
2096
- plans
2097
- );
2098
- const metadata = this.afterTaskRunning(executor);
2099
- return {
2100
- result: output,
2101
- metadata
2102
- };
2103
- }
2104
- async aiScroll(scrollParam, locatePrompt, opt) {
2105
- const detailedLocateParam = locatePrompt ? this.buildDetailedLocateParam(locatePrompt, opt) : void 0;
2106
- const plans = buildPlans("Scroll", detailedLocateParam, scrollParam);
2107
- const paramInTitle = locatePrompt ? `${locateParamStr(detailedLocateParam)} - ${scrollParamStr(scrollParam)}` : scrollParamStr(scrollParam);
2108
- const { executor, output } = await this.taskExecutor.runPlans(
2109
- taskTitleStr("Scroll", paramInTitle),
2110
- plans
2111
- );
2112
- const metadata = this.afterTaskRunning(executor);
2113
- return {
2114
- result: output,
2115
- metadata
2116
- };
2117
- }
2118
- async aiAction(taskPrompt, opt) {
2119
- const cacheable = opt?.cacheable;
2120
- const isVlmUiTars = vlLocateMode() === "vlm-ui-tars";
2121
- const matchedCache = isVlmUiTars || cacheable === false ? void 0 : this.taskCache?.matchPlanCache(taskPrompt);
2122
- if (matchedCache && this.taskCache?.isCacheResultUsed) {
2123
- const { executor: executor2 } = await this.taskExecutor.loadYamlFlowAsPlanning(
2124
- taskPrompt,
2125
- matchedCache.cacheContent?.yamlWorkflow
2126
- );
2127
- const metadata2 = this.afterTaskRunning(executor2);
2128
- debug4("matched cache, will call .runYaml to run the action");
2129
- const yaml5 = matchedCache.cacheContent?.yamlWorkflow;
2130
- const result = await this.runYaml(yaml5);
2131
- return {
2132
- result: result.result,
2133
- metadata: metadata2
2134
- };
2135
- }
2136
- const { output, executor } = await (isVlmUiTars ? this.taskExecutor.actionToGoal(taskPrompt) : this.taskExecutor.action(taskPrompt, this.opts.aiActionContext));
2137
- if (this.taskCache && output?.yamlFlow && cacheable !== false) {
2138
- const yamlContent = {
2139
- tasks: [
2140
- {
2141
- name: taskPrompt,
2142
- flow: output.yamlFlow
2143
- }
2144
- ]
2145
- };
2146
- const yamlFlowStr = yaml4.dump(yamlContent);
2147
- this.taskCache.updateOrAppendCacheRecord(
2148
- {
2149
- type: "plan",
2150
- prompt: taskPrompt,
2151
- yamlWorkflow: yamlFlowStr
2152
- },
2153
- matchedCache
2154
- );
2155
- }
2156
- const metadata = this.afterTaskRunning(executor);
2157
- return {
2158
- result: output,
2159
- metadata
2160
- };
2161
- }
2162
- async aiQuery(demand) {
2163
- const { output, executor } = await this.taskExecutor.query(demand);
2164
- const metadata = this.afterTaskRunning(executor);
2165
- return {
2166
- result: output,
2167
- metadata
2168
- };
2169
- }
2170
- async aiBoolean(prompt) {
2171
- const { output, executor } = await this.taskExecutor.boolean(prompt);
2172
- const metadata = this.afterTaskRunning(executor);
2173
- return {
2174
- result: output,
2175
- metadata
2176
- };
2177
- }
2178
- async aiNumber(prompt) {
2179
- const { output, executor } = await this.taskExecutor.number(prompt);
2180
- const metadata = this.afterTaskRunning(executor);
2181
- return {
2182
- result: output,
2183
- metadata
2184
- };
2185
- }
2186
- async aiString(prompt) {
2187
- const { output, executor } = await this.taskExecutor.string(prompt);
2188
- const metadata = this.afterTaskRunning(executor);
2189
- return {
2190
- result: output,
2191
- metadata
2192
- };
2193
- }
2194
- async describeElementAtPoint(center, opt) {
2195
- const { verifyPrompt = true, retryLimit = 3 } = opt || {};
2196
- let success = false;
2197
- let retryCount = 0;
2198
- let resultPrompt = "";
2199
- let deepThink = opt?.deepThink || false;
2200
- let verifyResult;
2201
- while (!success && retryCount < retryLimit) {
2202
- if (retryCount >= 2) {
2203
- deepThink = true;
2204
- }
2205
- debug4(
2206
- "aiDescribe",
2207
- center,
2208
- "verifyPrompt",
2209
- verifyPrompt,
2210
- "retryCount",
2211
- retryCount,
2212
- "deepThink",
2213
- deepThink
2214
- );
2215
- const text = await this.insight.describe(center, { deepThink });
2216
- debug4("aiDescribe text", text);
2217
- assert7(text.description, `failed to describe element at [${center}]`);
2218
- resultPrompt = text.description;
2219
- verifyResult = await this.verifyLocator(
2220
- resultPrompt,
2221
- deepThink ? { deepThink: true } : void 0,
2222
- center,
2223
- opt
2224
- );
2225
- if (verifyResult.pass) {
2226
- success = true;
2227
- } else {
2228
- retryCount++;
2229
- }
2230
- }
2231
- return {
2232
- prompt: resultPrompt,
2233
- deepThink,
2234
- verifyResult
2235
- };
2236
- }
2237
- async verifyLocator(prompt, locateOpt, expectCenter, verifyLocateOption) {
2238
- debug4("verifyLocator", prompt, locateOpt, expectCenter, verifyLocateOption);
2239
- const locateResult = await this.aiLocate(prompt, locateOpt);
2240
- const { center: verifyCenter, rect: verifyRect } = locateResult.result;
2241
- const distance = distanceOfTwoPoints(expectCenter, verifyCenter);
2242
- const included = includedInRect(expectCenter, verifyRect);
2243
- const pass = distance <= (verifyLocateOption?.centerDistanceThreshold || 20) || included;
2244
- const verifyResult = {
2245
- pass,
2246
- rect: verifyRect,
2247
- center: verifyCenter,
2248
- centerDistance: distance
2249
- };
2250
- debug4("aiDescribe verifyResult", verifyResult);
2251
- return verifyResult;
2252
- }
2253
- async aiLocate(prompt, opt) {
2254
- const detailedLocateParam = this.buildDetailedLocateParam(prompt, opt);
2255
- const plans = buildPlans("Locate", detailedLocateParam);
2256
- const { executor, output } = await this.taskExecutor.runPlans(
2257
- taskTitleStr("Locate", locateParamStr(detailedLocateParam)),
2258
- plans
2259
- );
2260
- const metadata = this.afterTaskRunning(executor);
2261
- const { element } = output;
2262
- const result = {
2263
- rect: element?.rect,
2264
- center: element?.center
2265
- };
2266
- return {
2267
- result,
2268
- metadata
2269
- };
2270
- }
2271
- async aiAssert(assertion, msg, opt) {
2272
- let currentUrl = "";
2273
- if (this.page.url) {
2274
- try {
2275
- currentUrl = await this.page.url();
2276
- } catch (e) {
2277
- }
2278
- }
2279
- const assertionWithContext = currentUrl ? `For the page at URL "${currentUrl}", ${assertion}` : assertion;
2280
- const { output, executor } = await this.taskExecutor.assert(assertionWithContext);
2281
- const metadata = this.afterTaskRunning(executor, true);
2282
- if (output && opt?.keepRawResponse) {
2283
- return {
2284
- result: output,
2285
- metadata
2286
- };
2287
- }
2288
- if (!output?.pass) {
2289
- const errMsg = msg || `Assertion failed: ${assertion}`;
2290
- const reasonMsg = `Reason: ${output?.thought || executor.latestErrorTask()?.error || "(no_reason)"}`;
2291
- throw new Error(`${errMsg}
2292
- ${reasonMsg}`);
2293
- }
2294
- return {
2295
- result: true,
2296
- metadata
2297
- };
2298
- }
2299
- async aiCaptcha(options) {
2300
- const { deepThink = false, autoDetectComplexity = true } = options || {};
2301
- let shouldUseDeepThink = deepThink;
2302
- if (autoDetectComplexity && !deepThink) {
2303
- const context = await this.getUIContext();
2304
- const { screenshotBase64 } = context;
2305
- try {
2306
- const complexityAnalysisPrompt = `
2307
- Analyze this screenshot and determine if it contains a complex CAPTCHA that would benefit from deep thinking.
2308
- A complex CAPTCHA typically has one or more of these characteristics:
2309
- - Distorted or overlapping text that is hard to read
2310
- - Multiple images that need to be selected based on a specific criteria
2311
- - Puzzles that require spatial reasoning
2312
- - Multiple steps or verification methods
2313
- - Small or hard-to-distinguish elements
2314
-
2315
- Return only "complex" or "simple" based on your analysis.
2316
- `;
2317
- const complexityMsgs = [
2318
- { role: "system", content: "You are an AI assistant that analyzes screenshots to determine CAPTCHA complexity." },
2319
- {
2320
- role: "user",
2321
- content: [
2322
- {
2323
- type: "image_url",
2324
- image_url: {
2325
- url: screenshotBase64,
2326
- detail: "high"
2327
- }
2328
- },
2329
- {
2330
- type: "text",
2331
- text: complexityAnalysisPrompt
2332
- }
2333
- ]
2334
- }
2335
- ];
2336
- const complexityResult = await this.insight.aiVendorFn(
2337
- complexityMsgs,
2338
- { type: "extract_data" }
2339
- );
2340
- const responseText = typeof complexityResult.content === "string" ? complexityResult.content.toLowerCase() : JSON.stringify(complexityResult.content).toLowerCase();
2341
- shouldUseDeepThink = responseText.includes("complex");
2342
- debug4("CAPTCHA complexity analysis:", responseText, "Using deep think:", shouldUseDeepThink);
2343
- } catch (error) {
2344
- debug4("Failed to analyze CAPTCHA complexity:", error);
2345
- }
2346
- }
2347
- const captchaResponse = await this._callInsightCaptcha({
2348
- deepThink: shouldUseDeepThink
2349
- });
2350
- const captchaResult = captchaResponse.content;
2351
- const usage = captchaResponse.usage;
2352
- const actualDeepThink = captchaResponse.deepThink || false;
2353
- if (captchaResult.captchaType === "text") {
2354
- for (const action of captchaResult.actions) {
2355
- if (action.type === "click" && action.target) {
2356
- await this.aiTap(action.target, { deepThink: shouldUseDeepThink });
2357
- } else if (action.type === "input" && action.value) {
2358
- if (action.target) {
2359
- await this.aiInput(action.value, action.target, { deepThink: shouldUseDeepThink });
2360
- }
2361
- } else if (action.type === "verify" && action.target) {
2362
- await this.aiTap(action.target, { deepThink: shouldUseDeepThink });
2363
- }
2364
- }
2365
- } else if (captchaResult.captchaType === "image") {
2366
- for (const action of captchaResult.actions) {
2367
- if (action.type === "click") {
2368
- if (action.coordinates) {
2369
- const x = action.coordinates[0];
2370
- const y = action.coordinates[1];
2371
- await this.aiTap(`element at coordinates (${x}, ${y})`, { deepThink: shouldUseDeepThink });
2372
- } else if (action.target) {
2373
- await this.aiTap(action.target, { deepThink: shouldUseDeepThink });
2374
- }
2375
- } else if (action.type === "verify" && action.target) {
2376
- await this.aiTap(action.target, { deepThink: shouldUseDeepThink });
2377
- }
2378
- }
2379
- }
2380
- await new Promise((resolve2) => setTimeout(resolve2, 3e3));
2381
- const metadata = {
2382
- status: "finished",
2383
- usage,
2384
- thought: captchaResult.thought
2385
- };
2386
- metadata.deepThink = actualDeepThink;
2387
- if (autoDetectComplexity && !deepThink) {
2388
- metadata.autoDetectedComplexity = shouldUseDeepThink;
2389
- }
2390
- return {
2391
- result: captchaResult,
2392
- metadata
2393
- };
2394
- }
2395
- async aiWaitFor(assertion, opt) {
2396
- const startTime = Date.now();
2397
- const { executor } = await this.taskExecutor.waitFor(assertion, {
2398
- timeoutMs: opt?.timeoutMs || 15 * 1e3,
2399
- checkIntervalMs: opt?.checkIntervalMs || 3 * 1e3,
2400
- assertion
2401
- });
2402
- const metadata = {
2403
- status: executor.isInErrorState() ? "failed" : "finished",
2404
- start: startTime,
2405
- end: Date.now(),
2406
- totalTime: Date.now() - startTime,
2407
- thought: executor.latestErrorTask()?.thought,
2408
- actionDetails: executor.tasks.map((task) => ({
2409
- type: task.type,
2410
- subType: task.subType,
2411
- status: task.status,
2412
- thought: task.thought
2413
- }))
2414
- };
2415
- this.appendExecutionDump(executor.dump());
2416
- this.writeOutActionDumps();
2417
- if (executor.isInErrorState()) {
2418
- const errorTask = executor.latestErrorTask();
2419
- throw new Error(`${errorTask?.error}
2420
- ${errorTask?.errorStack}`);
2421
- }
2422
- return {
2423
- result: true,
2424
- // Successfully waited
2425
- metadata
2426
- };
2427
- }
2428
- async ai(taskPrompt, type = "action", options) {
2429
- if (type === "action") {
2430
- return this.aiAction(taskPrompt);
2431
- }
2432
- if (type === "query") {
2433
- return this.aiQuery(taskPrompt);
2434
- }
2435
- if (type === "assert") {
2436
- return this.aiAssert(taskPrompt);
2437
- }
2438
- if (type === "tap") {
2439
- return this.aiTap(taskPrompt, options);
2440
- }
2441
- if (type === "captcha") {
2442
- return this.aiCaptcha(options);
2443
- }
2444
- throw new Error(
2445
- `Unknown type: ${type}, only support 'action', 'query', 'assert', 'tap', 'captcha'`
2446
- );
2447
- }
2448
- async runYaml(yamlScriptContent) {
2449
- const startTime = Date.now();
2450
- const script = parseYamlScript(yamlScriptContent, "yaml", true);
2451
- const player = new ScriptPlayer(script, async () => {
2452
- return { agent: this, freeFn: [] };
2453
- });
2454
- await player.run();
2455
- const endTime = Date.now();
2456
- const metadata = {
2457
- status: player.status,
2458
- start: startTime,
2459
- end: endTime,
2460
- totalTime: endTime - startTime,
2461
- tasks: player.taskStatusList.map((task) => ({
2462
- type: "yaml-task",
2463
- subType: task.name,
2464
- status: task.status,
2465
- error: task.error?.message
2466
- }))
2467
- };
2468
- if (player.status === "error") {
2469
- const errors = player.taskStatusList.filter((task) => task.status === "error").map((task) => {
2470
- return `task - ${task.name}: ${task.error?.message}`;
2471
- }).join("\n");
2472
- throw new Error(`Error(s) occurred in running yaml script:
2473
- ${errors}`);
2474
- }
2475
- return {
2476
- result: player.result,
2477
- metadata
2478
- };
2479
- }
2480
- async evaluateJavaScript(script) {
2481
- assert7(
2482
- this.page.evaluateJavaScript,
2483
- "evaluateJavaScript is not supported in current agent"
2484
- );
2485
- if (this.page.evaluateJavaScript) {
2486
- return this.page.evaluateJavaScript(script);
2487
- }
2488
- throw new Error("evaluateJavaScript is not supported in current agent");
2489
- }
2490
- async destroy() {
2491
- await this.page.destroy();
2492
- }
2493
- };
2494
-
2495
- // src/chrome-extension/agent.ts
2496
/**
 * PageAgent flavour used with the Chrome-extension proxy page. It adds no
 * behaviour of its own; the inherited constructor already accepts `(page, opts)`,
 * so no pass-through constructor (and no lint suppression) is needed.
 */
var ChromeExtensionProxyPageAgent = class extends PageAgent {
};
2502
-
2503
- // src/chrome-extension/page.ts
2504
- import { treeToList as treeToList2 } from "misoai-shared/extractor";
2505
- import { assert as assert9 } from "misoai-shared/utils";
2506
-
2507
- // src/chrome-extension/cdpInput.ts
2508
- import {
2509
- _keyDefinitions
2510
- } from "misoai-shared/keyboard-layout";
2511
- import { assert as assert8 } from "misoai-shared/utils";
2512
- var _pressedKeys, _client, _modifierBit, modifierBit_fn, _keyDescriptionForString, keyDescriptionForString_fn;
2513
- var CdpKeyboard = class {
2514
- constructor(client) {
2515
- __privateAdd(this, _modifierBit);
2516
- __privateAdd(this, _keyDescriptionForString);
2517
- __privateAdd(this, _pressedKeys, /* @__PURE__ */ new Set());
2518
- __privateAdd(this, _client, void 0);
2519
- this._modifiers = 0;
2520
- __privateSet(this, _client, client);
2521
- }
2522
- updateClient(client) {
2523
- __privateSet(this, _client, client);
2524
- }
2525
- async down(key, options = {
2526
- text: void 0,
2527
- commands: []
2528
- }) {
2529
- const description = __privateMethod(this, _keyDescriptionForString, keyDescriptionForString_fn).call(this, key);
2530
- const autoRepeat = __privateGet(this, _pressedKeys).has(description.code);
2531
- __privateGet(this, _pressedKeys).add(description.code);
2532
- this._modifiers |= __privateMethod(this, _modifierBit, modifierBit_fn).call(this, description.key);
2533
- const text = options.text === void 0 ? description.text : options.text;
2534
- await __privateGet(this, _client).send("Input.dispatchKeyEvent", {
2535
- type: text ? "keyDown" : "rawKeyDown",
2536
- modifiers: this._modifiers,
2537
- windowsVirtualKeyCode: description.keyCode,
2538
- code: description.code,
2539
- key: description.key,
2540
- text,
2541
- unmodifiedText: text,
2542
- autoRepeat,
2543
- location: description.location,
2544
- isKeypad: description.location === 3,
2545
- commands: options.commands
2546
- });
2547
- }
2548
- async up(key) {
2549
- const description = __privateMethod(this, _keyDescriptionForString, keyDescriptionForString_fn).call(this, key);
2550
- this._modifiers &= ~__privateMethod(this, _modifierBit, modifierBit_fn).call(this, description.key);
2551
- __privateGet(this, _pressedKeys).delete(description.code);
2552
- await __privateGet(this, _client).send("Input.dispatchKeyEvent", {
2553
- type: "keyUp",
2554
- modifiers: this._modifiers,
2555
- key: description.key,
2556
- windowsVirtualKeyCode: description.keyCode,
2557
- code: description.code,
2558
- location: description.location
2559
- });
2560
- }
2561
- async sendCharacter(char) {
2562
- await __privateGet(this, _client).send("Input.insertText", { text: char });
2563
- }
2564
- charIsKey(char) {
2565
- return !!_keyDefinitions[char];
2566
- }
2567
- async type(text, options = {}) {
2568
- const delay = options.delay || void 0;
2569
- for (const char of text) {
2570
- if (this.charIsKey(char)) {
2571
- await this.press(char, { delay });
2572
- } else {
2573
- if (delay) {
2574
- await new Promise((f) => {
2575
- return setTimeout(f, delay);
2576
- });
2577
- }
2578
- await this.sendCharacter(char);
2579
- }
2580
- }
2581
- }
2582
- async press(key, options = {}) {
2583
- const { delay = null } = options;
2584
- const keys = Array.isArray(key) ? key : [key];
2585
- for (const k of keys) {
2586
- await this.down(k, options);
2587
- }
2588
- if (delay) {
2589
- await new Promise((f) => {
2590
- return setTimeout(f, options.delay);
2591
- });
2592
- }
2593
- for (const k of [...keys].reverse()) {
2594
- await this.up(k);
2595
- }
2596
- }
2597
- };
2598
- _pressedKeys = new WeakMap();
2599
- _client = new WeakMap();
2600
- _modifierBit = new WeakSet();
2601
/**
 * Maps a modifier key name to its bit in the keyboard's `_modifiers` mask:
 * Alt = 1, Control = 2, Meta = 4, Shift = 8; any other key yields 0.
 */
modifierBit_fn = function(key) {
  switch (key) {
    case "Alt":
      return 1;
    case "Control":
      return 2;
    case "Meta":
      return 4;
    case "Shift":
      return 8;
    default:
      return 0;
  }
};
2616
- _keyDescriptionForString = new WeakSet();
2617
- keyDescriptionForString_fn = function(keyString) {
2618
- const shift = this._modifiers & 8;
2619
- const description = {
2620
- key: "",
2621
- keyCode: 0,
2622
- code: "",
2623
- text: "",
2624
- location: 0
2625
- };
2626
- const definition = _keyDefinitions[keyString];
2627
- assert8(definition, `Unknown key: "${keyString}"`);
2628
- if (definition.key) {
2629
- description.key = definition.key;
2630
- }
2631
- if (shift && definition.shiftKey) {
2632
- description.key = definition.shiftKey;
2633
- }
2634
- if (definition.keyCode) {
2635
- description.keyCode = definition.keyCode;
2636
- }
2637
- if (shift && definition.shiftKeyCode) {
2638
- description.keyCode = definition.shiftKeyCode;
2639
- }
2640
- if (definition.code) {
2641
- description.code = definition.code;
2642
- }
2643
- if (definition.location) {
2644
- description.location = definition.location;
2645
- }
2646
- if (description.key.length === 1) {
2647
- description.text = description.key;
2648
- }
2649
- if (definition.text) {
2650
- description.text = definition.text;
2651
- }
2652
- if (shift && definition.shiftText) {
2653
- description.text = definition.shiftText;
2654
- }
2655
- if (this._modifiers & ~8) {
2656
- description.text = "";
2657
- }
2658
- return description;
2659
- };
2660
-
2661
- // src/chrome-extension/dynamic-scripts.ts
2662
- import fs from "fs";
2663
- import { ifInBrowser as ifInBrowser3 } from "misoai-shared/utils";
2664
- var scriptFileContentCache = null;
2665
/**
 * Returns the source text of `scripts/htmlElement.js`, loading it at most once.
 * In a browser context the file is fetched; otherwise it is read through `fs`.
 */
var getHtmlElementScript = async () => {
  // Serve from cache first so the extension URL is only resolved when needed.
  if (scriptFileContentCache) {
    return scriptFileContentCache;
  }
  const scriptFileToRetrieve = chrome.runtime.getURL("scripts/htmlElement.js");
  if (ifInBrowser3) {
    const response = await fetch(scriptFileToRetrieve);
    scriptFileContentCache = await response.text();
  } else {
    // Cache the fs result too, so repeated calls do not hit the disk again.
    scriptFileContentCache = fs.readFileSync(scriptFileToRetrieve, "utf8");
  }
  return scriptFileContentCache;
};
2676
- var waterFlowScriptFileContentCache = null;
2677
/**
 * Returns the source text of `scripts/water-flow.js`, loading it at most once.
 * In a browser context the file is fetched; otherwise it is read through `fs`.
 */
var injectWaterFlowAnimation = async () => {
  // Serve from cache first so the extension URL is only resolved when needed.
  if (waterFlowScriptFileContentCache) {
    return waterFlowScriptFileContentCache;
  }
  const waterFlowScriptFileToRetrieve = chrome.runtime.getURL(
    "scripts/water-flow.js"
  );
  if (ifInBrowser3) {
    const response = await fetch(waterFlowScriptFileToRetrieve);
    waterFlowScriptFileContentCache = await response.text();
  } else {
    // Cache the fs result too, so repeated calls do not hit the disk again.
    waterFlowScriptFileContentCache = fs.readFileSync(waterFlowScriptFileToRetrieve, "utf8");
  }
  return waterFlowScriptFileContentCache;
};
2690
- var stopWaterFlowScriptFileContentCache = null;
2691
/**
 * Returns the source text of `scripts/stop-water-flow.js`, loading it at most once.
 * In a browser context the file is fetched; otherwise it is read through `fs`.
 */
var injectStopWaterFlowAnimation = async () => {
  // Serve from cache first so the extension URL is only resolved when needed.
  if (stopWaterFlowScriptFileContentCache) {
    return stopWaterFlowScriptFileContentCache;
  }
  const stopWaterFlowScriptFileToRetrieve = chrome.runtime.getURL(
    "scripts/stop-water-flow.js"
  );
  if (ifInBrowser3) {
    const response = await fetch(stopWaterFlowScriptFileToRetrieve);
    stopWaterFlowScriptFileContentCache = await response.text();
  } else {
    // Cache the fs result too, so repeated calls do not hit the disk again.
    stopWaterFlowScriptFileContentCache = fs.readFileSync(stopWaterFlowScriptFileToRetrieve, "utf8");
  }
  return stopWaterFlowScriptFileContentCache;
};
2704
-
2705
- // src/chrome-extension/page.ts
2706
/** Returns a promise that settles after `ms` milliseconds (via `setTimeout`). */
function sleep2(ms) {
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
2709
- var ChromeExtensionProxyPage = class {
2710
- constructor(forceSameTabNavigation) {
2711
- this.pageType = "chrome-extension-proxy";
2712
- this.version = "1.0.0";
2713
- this.activeTabId = null;
2714
- this.tabIdOfDebuggerAttached = null;
2715
- this.attachingDebugger = null;
2716
- this.destroyed = false;
2717
- this.latestMouseX = 100;
2718
- this.latestMouseY = 100;
2719
- this.mouse = {
2720
- click: async (x, y) => {
2721
- await this.mouse.move(x, y);
2722
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2723
- type: "mousePressed",
2724
- x,
2725
- y,
2726
- button: "left",
2727
- clickCount: 1
2728
- });
2729
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2730
- type: "mouseReleased",
2731
- x,
2732
- y,
2733
- button: "left",
2734
- clickCount: 1
2735
- });
2736
- },
2737
- wheel: async (deltaX, deltaY, startX, startY) => {
2738
- const finalX = startX || this.latestMouseX;
2739
- const finalY = startY || this.latestMouseY;
2740
- await this.showMousePointer(finalX, finalY);
2741
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2742
- type: "mouseWheel",
2743
- x: finalX,
2744
- y: finalY,
2745
- deltaX,
2746
- deltaY
2747
- });
2748
- this.latestMouseX = finalX;
2749
- this.latestMouseY = finalY;
2750
- },
2751
- move: async (x, y) => {
2752
- await this.showMousePointer(x, y);
2753
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2754
- type: "mouseMoved",
2755
- x,
2756
- y
2757
- });
2758
- this.latestMouseX = x;
2759
- this.latestMouseY = y;
2760
- },
2761
- drag: async (from, to) => {
2762
- await this.mouse.move(from.x, from.y);
2763
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2764
- type: "mousePressed",
2765
- x: from.x,
2766
- y: from.y,
2767
- button: "left",
2768
- clickCount: 1
2769
- });
2770
- await this.mouse.move(to.x, to.y);
2771
- await this.sendCommandToDebugger("Input.dispatchMouseEvent", {
2772
- type: "mouseReleased",
2773
- x: to.x,
2774
- y: to.y,
2775
- button: "left",
2776
- clickCount: 1
2777
- });
2778
- }
2779
- };
2780
- this.keyboard = {
2781
- type: async (text) => {
2782
- const cdpKeyboard = new CdpKeyboard({
2783
- send: this.sendCommandToDebugger.bind(this)
2784
- });
2785
- await cdpKeyboard.type(text, { delay: 0 });
2786
- },
2787
- press: async (action) => {
2788
- const cdpKeyboard = new CdpKeyboard({
2789
- send: this.sendCommandToDebugger.bind(this)
2790
- });
2791
- const keys = Array.isArray(action) ? action : [action];
2792
- for (const k of keys) {
2793
- const commands = k.command ? [k.command] : [];
2794
- await cdpKeyboard.down(k.key, { commands });
2795
- }
2796
- for (const k of [...keys].reverse()) {
2797
- await cdpKeyboard.up(k.key);
2798
- }
2799
- }
2800
- };
2801
- this.forceSameTabNavigation = forceSameTabNavigation;
2802
- }
2803
- async setActiveTabId(tabId) {
2804
- if (this.activeTabId) {
2805
- throw new Error(
2806
- `Active tab id is already set, which is ${this.activeTabId}, cannot set it to ${tabId}`
2807
- );
2808
- }
2809
- await chrome.tabs.update(tabId, { active: true });
2810
- this.activeTabId = tabId;
2811
- }
2812
- async getActiveTabId() {
2813
- return this.activeTabId;
2814
- }
2815
- /**
2816
- * Get a list of current tabs
2817
- * @returns {Promise<Array<{id: number, title: string, url: string}>>}
2818
- */
2819
- async getBrowserTabList() {
2820
- const tabs = await chrome.tabs.query({ currentWindow: true });
2821
- return tabs.map((tab) => ({
2822
- id: `${tab.id}`,
2823
- title: tab.title,
2824
- url: tab.url,
2825
- currentActiveTab: tab.active
2826
- })).filter((tab) => tab.id && tab.title && tab.url);
2827
- }
2828
- async getTabIdOrConnectToCurrentTab() {
2829
- if (this.activeTabId) {
2830
- return this.activeTabId;
2831
- }
2832
- const tabId = await chrome.tabs.query({ active: true, currentWindow: true }).then((tabs) => tabs[0]?.id);
2833
- this.activeTabId = tabId || 0;
2834
- return this.activeTabId;
2835
- }
2836
- async attachDebugger() {
2837
- assert9(!this.destroyed, "Page is destroyed");
2838
- if (this.attachingDebugger) {
2839
- await this.attachingDebugger;
2840
- return;
2841
- }
2842
- this.attachingDebugger = (async () => {
2843
- const url = await this.url();
2844
- let error = null;
2845
- if (url.startsWith("chrome://")) {
2846
- throw new Error(
2847
- "Cannot attach debugger to chrome:// pages, please use Midscene in a normal page with http://, https:// or file://"
2848
- );
2849
- }
2850
- try {
2851
- const currentTabId = await this.getTabIdOrConnectToCurrentTab();
2852
- if (this.tabIdOfDebuggerAttached === currentTabId) {
2853
- return;
2854
- }
2855
- if (this.tabIdOfDebuggerAttached && this.tabIdOfDebuggerAttached !== currentTabId) {
2856
- console.log(
2857
- "detach the previous tab",
2858
- this.tabIdOfDebuggerAttached,
2859
- "->",
2860
- currentTabId
2861
- );
2862
- try {
2863
- await this.detachDebugger(this.tabIdOfDebuggerAttached);
2864
- } catch (error2) {
2865
- console.error("Failed to detach debugger", error2);
2866
- }
2867
- }
2868
- console.log("attaching debugger", currentTabId);
2869
- await chrome.debugger.attach({ tabId: currentTabId }, "1.3");
2870
- await sleep2(500);
2871
- this.tabIdOfDebuggerAttached = currentTabId;
2872
- await this.enableWaterFlowAnimation();
2873
- } catch (e) {
2874
- console.error("Failed to attach debugger", e);
2875
- error = e;
2876
- } finally {
2877
- this.attachingDebugger = null;
2878
- }
2879
- if (error) {
2880
- throw error;
2881
- }
2882
- })();
2883
- await this.attachingDebugger;
2884
- }
2885
- async showMousePointer(x, y) {
2886
- const pointerScript = `(() => {
2887
- if(typeof window.midsceneWaterFlowAnimation !== 'undefined') {
2888
- window.midsceneWaterFlowAnimation.enable();
2889
- window.midsceneWaterFlowAnimation.showMousePointer(${x}, ${y});
2890
- } else {
2891
- console.log('midsceneWaterFlowAnimation is not defined');
2892
- }
2893
- })()`;
2894
- await this.sendCommandToDebugger("Runtime.evaluate", {
2895
- expression: `${pointerScript}`
2896
- });
2897
- }
2898
- async hideMousePointer() {
2899
- await this.sendCommandToDebugger("Runtime.evaluate", {
2900
- expression: `(() => {
2901
- if(typeof window.midsceneWaterFlowAnimation !== 'undefined') {
2902
- window.midsceneWaterFlowAnimation.hideMousePointer();
2903
- }
2904
- })()`
2905
- });
2906
- }
2907
- async detachDebugger(tabId) {
2908
- const tabIdToDetach = tabId || this.tabIdOfDebuggerAttached;
2909
- console.log("detaching debugger", tabIdToDetach);
2910
- if (!tabIdToDetach) {
2911
- console.warn("No tab id to detach");
2912
- return;
2913
- }
2914
- try {
2915
- await this.disableWaterFlowAnimation(tabIdToDetach);
2916
- await sleep2(200);
2917
- } catch (error) {
2918
- console.warn("Failed to disable water flow animation", error);
2919
- }
2920
- try {
2921
- await chrome.debugger.detach({ tabId: tabIdToDetach });
2922
- } catch (error) {
2923
- console.warn("Failed to detach debugger", error);
2924
- }
2925
- this.tabIdOfDebuggerAttached = null;
2926
- }
2927
- async enableWaterFlowAnimation() {
2928
- if (this.forceSameTabNavigation) {
2929
- await chrome.debugger.sendCommand(
2930
- { tabId: this.tabIdOfDebuggerAttached },
2931
- "Runtime.evaluate",
2932
- {
2933
- expression: limitOpenNewTabScript
2934
- }
2935
- );
2936
- }
2937
- const script = await injectWaterFlowAnimation();
2938
- await chrome.debugger.sendCommand(
2939
- { tabId: this.tabIdOfDebuggerAttached },
2940
- "Runtime.evaluate",
2941
- {
2942
- expression: script
2943
- }
2944
- );
2945
- }
2946
- async disableWaterFlowAnimation(tabId) {
2947
- const script = await injectStopWaterFlowAnimation();
2948
- await chrome.debugger.sendCommand({ tabId }, "Runtime.evaluate", {
2949
- expression: script
2950
- });
2951
- }
2952
- async sendCommandToDebugger(command, params) {
2953
- await this.attachDebugger();
2954
- assert9(this.tabIdOfDebuggerAttached, "Debugger is not attached");
2955
- this.enableWaterFlowAnimation();
2956
- return await chrome.debugger.sendCommand(
2957
- { tabId: this.tabIdOfDebuggerAttached },
2958
- command,
2959
- params
2960
- );
2961
- }
2962
- async getPageContentByCDP() {
2963
- const script = await getHtmlElementScript();
2964
- await this.sendCommandToDebugger("Runtime.evaluate", {
2965
- expression: script
2966
- });
2967
- const expression = () => {
2968
- return {
2969
- tree: window.midscene_element_inspector.webExtractNodeTree(),
2970
- size: {
2971
- width: document.documentElement.clientWidth,
2972
- height: document.documentElement.clientHeight,
2973
- dpr: window.devicePixelRatio
2974
- }
2975
- };
2976
- };
2977
- const returnValue = await this.sendCommandToDebugger("Runtime.evaluate", {
2978
- expression: `(${expression.toString()})()`,
2979
- returnByValue: true
2980
- });
2981
- if (!returnValue.result.value) {
2982
- const errorDescription = returnValue.exceptionDetails?.exception?.description || "";
2983
- if (!errorDescription) {
2984
- console.error("returnValue from cdp", returnValue);
2985
- }
2986
- throw new Error(
2987
- `Failed to get page content from page, error: ${errorDescription}`
2988
- );
2989
- }
2990
- return returnValue.result.value;
2991
- }
2992
- async evaluateJavaScript(script) {
2993
- return this.sendCommandToDebugger("Runtime.evaluate", {
2994
- expression: script
2995
- });
2996
- }
2997
- // current implementation is wait until domReadyState is complete
2998
- async waitUntilNetworkIdle() {
2999
- const timeout = 1e4;
3000
- const startTime = Date.now();
3001
- let lastReadyState = "";
3002
- while (Date.now() - startTime < timeout) {
3003
- const result = await this.sendCommandToDebugger("Runtime.evaluate", {
3004
- expression: "document.readyState"
3005
- });
3006
- lastReadyState = result.result.value;
3007
- if (lastReadyState === "complete") {
3008
- await new Promise((resolve2) => setTimeout(resolve2, 300));
3009
- return;
3010
- }
3011
- await new Promise((resolve2) => setTimeout(resolve2, 300));
3012
- }
3013
- throw new Error(
3014
- `Failed to wait until network idle, last readyState: ${lastReadyState}`
3015
- );
3016
- }
3017
- async getElementsInfo() {
3018
- const tree = await this.getElementsNodeTree();
3019
- return treeToList2(tree);
3020
- }
3021
- async getElementsNodeTree() {
3022
- await this.hideMousePointer();
3023
- const content = await this.getPageContentByCDP();
3024
- if (content?.size) {
3025
- this.viewportSize = content.size;
3026
- }
3027
- return content?.tree || { node: null, children: [] };
3028
- }
3029
- async size() {
3030
- if (this.viewportSize)
3031
- return this.viewportSize;
3032
- const content = await this.getPageContentByCDP();
3033
- return content.size;
3034
- }
3035
- async screenshotBase64() {
3036
- await this.hideMousePointer();
3037
- const base64 = await this.sendCommandToDebugger("Page.captureScreenshot", {
3038
- format: "jpeg",
3039
- quality: 90
3040
- });
3041
- return `data:image/jpeg;base64,${base64.data}`;
3042
- }
3043
- async url() {
3044
- const tabId = await this.getTabIdOrConnectToCurrentTab();
3045
- const url = await chrome.tabs.get(tabId).then((tab) => tab.url);
3046
- return url || "";
3047
- }
3048
- async scrollUntilTop(startingPoint) {
3049
- if (startingPoint) {
3050
- await this.mouse.move(startingPoint.left, startingPoint.top);
3051
- }
3052
- return this.mouse.wheel(0, -9999999);
3053
- }
3054
- async scrollUntilBottom(startingPoint) {
3055
- if (startingPoint) {
3056
- await this.mouse.move(startingPoint.left, startingPoint.top);
3057
- }
3058
- return this.mouse.wheel(0, 9999999);
3059
- }
3060
- async scrollUntilLeft(startingPoint) {
3061
- if (startingPoint) {
3062
- await this.mouse.move(startingPoint.left, startingPoint.top);
3063
- }
3064
- return this.mouse.wheel(-9999999, 0);
3065
- }
3066
- async scrollUntilRight(startingPoint) {
3067
- if (startingPoint) {
3068
- await this.mouse.move(startingPoint.left, startingPoint.top);
3069
- }
3070
- return this.mouse.wheel(9999999, 0);
3071
- }
3072
- async scrollUp(distance, startingPoint) {
3073
- const { height } = await this.size();
3074
- const scrollDistance = distance || height * 0.7;
3075
- return this.mouse.wheel(
3076
- 0,
3077
- -scrollDistance,
3078
- startingPoint?.left,
3079
- startingPoint?.top
3080
- );
3081
- }
3082
- async scrollDown(distance, startingPoint) {
3083
- const { height } = await this.size();
3084
- const scrollDistance = distance || height * 0.7;
3085
- return this.mouse.wheel(
3086
- 0,
3087
- scrollDistance,
3088
- startingPoint?.left,
3089
- startingPoint?.top
3090
- );
3091
- }
3092
- async scrollLeft(distance, startingPoint) {
3093
- const { width } = await this.size();
3094
- const scrollDistance = distance || width * 0.7;
3095
- return this.mouse.wheel(
3096
- -scrollDistance,
3097
- 0,
3098
- startingPoint?.left,
3099
- startingPoint?.top
3100
- );
3101
- }
3102
- async scrollRight(distance, startingPoint) {
3103
- const { width } = await this.size();
3104
- const scrollDistance = distance || width * 0.7;
3105
- return this.mouse.wheel(
3106
- scrollDistance,
3107
- 0,
3108
- startingPoint?.left,
3109
- startingPoint?.top
3110
- );
3111
- }
3112
- async clearInput(element) {
3113
- if (!element) {
3114
- console.warn("No element to clear input");
3115
- return;
3116
- }
3117
- await this.mouse.click(element.center[0], element.center[1]);
3118
- await this.sendCommandToDebugger("Input.dispatchKeyEvent", {
3119
- type: "keyDown",
3120
- commands: ["selectAll"]
3121
- });
3122
- await this.sendCommandToDebugger("Input.dispatchKeyEvent", {
3123
- type: "keyUp",
3124
- commands: ["selectAll"]
3125
- });
3126
- await sleep2(100);
3127
- await this.keyboard.press({
3128
- key: "Backspace"
3129
- });
3130
- }
3131
- async destroy() {
3132
- this.activeTabId = null;
3133
- await this.detachDebugger();
3134
- this.destroyed = true;
3135
- }
3136
- };
3137
-
3138
- // src/chrome-extension/index.ts
3139
- import { overrideAIConfig } from "misoai-shared/env";
3140
- export {
3141
- ChromeExtensionProxyPage,
3142
- ChromeExtensionProxyPageAgent,
3143
- ERROR_CODE_NOT_IMPLEMENTED_AS_DESIGNED,
3144
- overrideAIConfig
3145
- };
3146
- /**
3147
- * @license
3148
- * Copyright 2017 Google Inc.
3149
- * SPDX-License-Identifier: Apache-2.0
3150
- */
3151
-
3152
- //# sourceMappingURL=chrome-extension.js.map