@midscene/web 0.12.7 → 0.12.8-beta-20250317112356.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (53) hide show
  1. package/dist/es/appium.js +249 -77
  2. package/dist/es/appium.js.map +1 -1
  3. package/dist/es/bridge-mode-browser.js +3 -3
  4. package/dist/es/bridge-mode-browser.js.map +1 -1
  5. package/dist/es/bridge-mode.js +253 -81
  6. package/dist/es/bridge-mode.js.map +1 -1
  7. package/dist/es/chrome-extension.js +252 -80
  8. package/dist/es/chrome-extension.js.map +1 -1
  9. package/dist/es/index.js +302 -95
  10. package/dist/es/index.js.map +1 -1
  11. package/dist/es/midscene-playground.js +252 -80
  12. package/dist/es/midscene-playground.js.map +1 -1
  13. package/dist/es/playground.js +246 -74
  14. package/dist/es/playground.js.map +1 -1
  15. package/dist/es/playwright.js +298 -91
  16. package/dist/es/playwright.js.map +1 -1
  17. package/dist/es/puppeteer.js +307 -100
  18. package/dist/es/puppeteer.js.map +1 -1
  19. package/dist/lib/appium.js +257 -84
  20. package/dist/lib/appium.js.map +1 -1
  21. package/dist/lib/bridge-mode-browser.js +3 -3
  22. package/dist/lib/bridge-mode-browser.js.map +1 -1
  23. package/dist/lib/bridge-mode.js +259 -86
  24. package/dist/lib/bridge-mode.js.map +1 -1
  25. package/dist/lib/chrome-extension.js +260 -87
  26. package/dist/lib/chrome-extension.js.map +1 -1
  27. package/dist/lib/index.js +312 -104
  28. package/dist/lib/index.js.map +1 -1
  29. package/dist/lib/midscene-playground.js +259 -86
  30. package/dist/lib/midscene-playground.js.map +1 -1
  31. package/dist/lib/playground.js +250 -77
  32. package/dist/lib/playground.js.map +1 -1
  33. package/dist/lib/playwright.js +305 -97
  34. package/dist/lib/playwright.js.map +1 -1
  35. package/dist/lib/puppeteer.js +314 -106
  36. package/dist/lib/puppeteer.js.map +1 -1
  37. package/dist/types/{agent-ae5d08b8.d.ts → agent-91413f27.d.ts} +15 -9
  38. package/dist/types/appium.d.ts +3 -3
  39. package/dist/types/bridge-mode-browser.d.ts +3 -3
  40. package/dist/types/bridge-mode.d.ts +4 -4
  41. package/dist/types/{browser-98c36453.d.ts → browser-fd1cecbf.d.ts} +1 -1
  42. package/dist/types/chrome-extension.d.ts +4 -4
  43. package/dist/types/index.d.ts +4 -4
  44. package/dist/types/{page-b934ce61.d.ts → page-76bf46bb.d.ts} +2 -2
  45. package/dist/types/playground.d.ts +4 -4
  46. package/dist/types/playwright.d.ts +3 -3
  47. package/dist/types/puppeteer.d.ts +3 -3
  48. package/dist/types/{utils-e00799b2.d.ts → utils-406f2cf3.d.ts} +1 -1
  49. package/dist/types/utils.d.ts +2 -2
  50. package/dist/types/yaml.d.ts +4 -4
  51. package/iife-script/htmlElement.js +0 -21
  52. package/iife-script/htmlElementDebug.js +0 -21
  53. package/package.json +3 -3
package/dist/es/appium.js CHANGED
@@ -219,10 +219,9 @@ function parseYamlScript(content, filePath, ignoreCheckingTarget) {
219
219
 
220
220
  // src/common/agent.ts
221
221
  import {
222
- MATCH_BY_POSITION,
223
222
  MIDSCENE_USE_VLM_UI_TARS as MIDSCENE_USE_VLM_UI_TARS2,
224
223
  getAIConfig as getAIConfig3,
225
- getAIConfigInBoolean as getAIConfigInBoolean2
224
+ vlLocateMode
226
225
  } from "@midscene/core/env";
227
226
  import {
228
227
  groupedActionDumpFileExt,
@@ -274,12 +273,12 @@ var TaskCache = class {
274
273
  tasks: newCacheGroup
275
274
  });
276
275
  return {
277
- readCache: (pageContext, type, actionPrompt) => {
276
+ matchCache: (pageContext, type, actionPrompt) => {
278
277
  if (index === -1) {
279
278
  return false;
280
279
  }
281
280
  if (type === "plan") {
282
- return this.readCache(
281
+ return this.matchCache(
283
282
  pageContext,
284
283
  type,
285
284
  actionPrompt,
@@ -287,14 +286,14 @@ var TaskCache = class {
287
286
  );
288
287
  }
289
288
  if (type === "ui-tars-plan") {
290
- return this.readCache(
289
+ return this.matchCache(
291
290
  pageContext,
292
291
  type,
293
292
  actionPrompt,
294
293
  aiTasks[index].tasks
295
294
  );
296
295
  }
297
- return this.readCache(
296
+ return this.matchCache(
298
297
  pageContext,
299
298
  type,
300
299
  actionPrompt,
@@ -303,11 +302,16 @@ var TaskCache = class {
303
302
  },
304
303
  saveCache: (cache) => {
305
304
  newCacheGroup.push(cache);
305
+ debug(
306
+ "saving cache to file, type: %s, cacheId: %s",
307
+ cache.type,
308
+ this.cacheId
309
+ );
306
310
  this.writeCacheToFile();
307
311
  }
308
312
  };
309
313
  }
310
- readCache(pageContext, type, userPrompt, cacheGroup) {
314
+ matchCache(pageContext, type, userPrompt, cacheGroup) {
311
315
  debug(
312
316
  "will read cache, type: %s, prompt: %s, cacheGroupLength: %s",
313
317
  type,
@@ -327,17 +331,20 @@ var TaskCache = class {
327
331
  userPrompt,
328
332
  taskRes?.response
329
333
  );
330
- if (taskRes?.type === "locate" && !taskRes.response?.elements.every((element) => {
331
- const findIndex = pageContext.content.findIndex(
332
- (contentElement) => contentElement.id === element.id
334
+ if (taskRes?.type === "locate") {
335
+ const id = taskRes.response?.elements[0].id;
336
+ if (!id) {
337
+ debug("no id in cached response");
338
+ return false;
339
+ }
340
+ const foundInContext = pageContext.content.find(
341
+ (contentElement) => contentElement.id === id
333
342
  );
334
- if (findIndex === -1) {
343
+ if (!foundInContext) {
344
+ debug("cannot match element with same id in current page");
335
345
  return false;
336
346
  }
337
- return true;
338
- })) {
339
- debug("cannot find element with same id in current page");
340
- return false;
347
+ return taskRes.response;
341
348
  }
342
349
  if (taskRes && taskRes.type === type && taskRes.prompt === userPrompt) {
343
350
  const contextEqual = this.pageContextEqual(
@@ -384,7 +391,10 @@ var TaskCache = class {
384
391
  return void 0;
385
392
  }
386
393
  const cacheFile = join2(getLogDirByType("cache"), `${this.cacheId}.json`);
387
- if (getAIConfigInBoolean("MIDSCENE_CACHE") && existsSync2(cacheFile)) {
394
+ if (!getAIConfigInBoolean("MIDSCENE_CACHE")) {
395
+ return void 0;
396
+ }
397
+ if (existsSync2(cacheFile)) {
388
398
  try {
389
399
  const data = readFileSync(cacheFile, "utf8");
390
400
  const jsonData = JSON.parse(data);
@@ -396,16 +406,28 @@ var TaskCache = class {
396
406
  if (jsonDataPkgVersion[0] !== midscenePkgInfoPkgVersion[0] || jsonDataPkgVersion[1] !== midscenePkgInfoPkgVersion[1]) {
397
407
  return void 0;
398
408
  }
409
+ debug("read cache from file: %s", cacheFile);
399
410
  return jsonData;
400
411
  } catch (err) {
412
+ debug(
413
+ "cache file exists but parse failed, path: %s, error: %s",
414
+ cacheFile,
415
+ err
416
+ );
401
417
  return void 0;
402
418
  }
403
419
  }
420
+ debug("no cache file found, path: %s", cacheFile);
404
421
  return void 0;
405
422
  }
406
423
  writeCacheToFile() {
407
424
  const midscenePkgInfo = getRunningPkgInfo();
408
- if (!midscenePkgInfo || !this.cacheId) {
425
+ if (!midscenePkgInfo) {
426
+ debug("no midscene pkg info, will not write cache to file");
427
+ return;
428
+ }
429
+ if (!this.cacheId) {
430
+ debug("no cache id, will not write cache to file");
409
431
  return;
410
432
  }
411
433
  if (!ifInBrowser) {
@@ -549,7 +571,7 @@ var PageTaskExecutor = class {
549
571
  executor: async (param, taskContext) => {
550
572
  const { task } = taskContext;
551
573
  assert3(
552
- param?.prompt || param?.id || param?.position || param?.bbox,
574
+ param?.prompt || param?.id || param?.bbox,
553
575
  "No prompt or id or position or bbox to locate"
554
576
  );
555
577
  let insightDump;
@@ -567,40 +589,45 @@ var PageTaskExecutor = class {
567
589
  screenshot: pageContext.screenshotBase64,
568
590
  timing: "before locate"
569
591
  };
570
- const locateCache = cacheGroup?.readCache(
592
+ const cachePrompt = `${param.prompt} @ ${param.searchArea || ""}`;
593
+ const locateCache = cacheGroup?.matchCache(
571
594
  pageContext,
572
595
  "locate",
573
- param.prompt
596
+ cachePrompt
574
597
  );
575
- let locateResult;
576
- const callAI = this.insight.aiVendorFn;
598
+ const idInCache = locateCache?.elements?.[0]?.id;
599
+ let cacheHitFlag = false;
600
+ let quickAnswerId = param?.id;
601
+ if (!quickAnswerId && idInCache) {
602
+ quickAnswerId = idInCache;
603
+ }
577
604
  const quickAnswer = {
578
- id: param?.id,
579
- position: param?.position,
605
+ id: quickAnswerId,
580
606
  bbox: param?.bbox
581
607
  };
582
608
  const startTime = Date.now();
583
- const element = await this.insight.locate(param.prompt, {
584
- quickAnswer,
585
- callAI: async (...message) => {
586
- if (locateCache) {
587
- locateResult = locateCache;
588
- return Promise.resolve({ content: locateCache });
589
- }
590
- const { content: aiResult, usage: usage2 } = await callAI(...message);
591
- return { content: aiResult, usage: usage2 };
592
- }
609
+ const element = await this.insight.locate(param, {
610
+ quickAnswer
593
611
  });
594
612
  const aiCost = Date.now() - startTime;
595
- if (locateResult) {
613
+ if (element && element.id === quickAnswerId) {
614
+ cacheHitFlag = true;
615
+ }
616
+ if (element) {
596
617
  cacheGroup?.saveCache({
597
618
  type: "locate",
598
619
  pageContext: {
599
620
  url: pageContext.url,
600
621
  size: pageContext.size
601
622
  },
602
- prompt: param.prompt,
603
- response: locateResult
623
+ prompt: cachePrompt,
624
+ response: {
625
+ elements: [
626
+ {
627
+ id: element.id
628
+ }
629
+ ]
630
+ }
604
631
  });
605
632
  }
606
633
  if (!element) {
@@ -618,7 +645,7 @@ var PageTaskExecutor = class {
618
645
  dump: insightDump
619
646
  },
620
647
  cache: {
621
- hit: Boolean(locateCache)
648
+ hit: cacheHitFlag
622
649
  },
623
650
  recorder: [recordItem],
624
651
  aiCost,
@@ -884,13 +911,22 @@ var PageTaskExecutor = class {
884
911
  };
885
912
  executorContext.task.recorder = [recordItem];
886
913
  executorContext.task.pageContext = pageContext;
887
- const planCache = cacheGroup.readCache(
914
+ const cachePrompt = `${param.userInstruction} @ ${param.log || ""}`;
915
+ const planCache = cacheGroup.matchCache(
888
916
  pageContext,
889
917
  "plan",
890
- param.userInstruction
918
+ cachePrompt
891
919
  );
892
920
  let planResult;
893
921
  if (planCache) {
922
+ if ("actions" in planCache && Array.isArray(planCache.actions)) {
923
+ planCache.actions = planCache.actions.map((action) => {
924
+ if (action.locate) {
925
+ delete action.locate.bbox;
926
+ }
927
+ return action;
928
+ });
929
+ }
894
930
  planResult = planCache;
895
931
  } else {
896
932
  planResult = await plan(param.userInstruction, {
@@ -953,7 +989,7 @@ var PageTaskExecutor = class {
953
989
  }
954
990
  if (finalActions.length === 0) {
955
991
  assert3(
956
- !more_actions_needed_by_instruction,
992
+ !more_actions_needed_by_instruction || sleep2,
957
993
  error ? `Failed to plan: ${error}` : planParsingError || "No plan found"
958
994
  );
959
995
  }
@@ -963,7 +999,7 @@ var PageTaskExecutor = class {
963
999
  url: pageContext.url,
964
1000
  size: pageContext.size
965
1001
  },
966
- prompt: userInstruction,
1002
+ prompt: cachePrompt,
967
1003
  response: planResult
968
1004
  });
969
1005
  return {
@@ -1014,7 +1050,7 @@ var PageTaskExecutor = class {
1014
1050
  ]
1015
1051
  });
1016
1052
  const startTime = Date.now();
1017
- const planCache = cacheGroup.readCache(
1053
+ const planCache = cacheGroup.matchCache(
1018
1054
  pageContext,
1019
1055
  "ui-tars-plan",
1020
1056
  userInstruction
@@ -1061,6 +1097,18 @@ var PageTaskExecutor = class {
1061
1097
  };
1062
1098
  return task;
1063
1099
  }
1100
+ async runPlans(title, plans, options) {
1101
+ const taskExecutor = new Executor(title, {
1102
+ onTaskStart: options?.onTaskStart
1103
+ });
1104
+ const { tasks } = await this.convertPlanToExecutable(plans);
1105
+ await taskExecutor.append(tasks);
1106
+ const result = await taskExecutor.flush();
1107
+ return {
1108
+ output: result,
1109
+ executor: taskExecutor
1110
+ };
1111
+ }
1064
1112
  async action(userPrompt, options) {
1065
1113
  const taskExecutor = new Executor(userPrompt, {
1066
1114
  onTaskStart: options?.onTaskStart
@@ -1355,6 +1403,75 @@ var WebElementInfo = class {
1355
1403
  }
1356
1404
  };
1357
1405
 
1406
+ // src/common/plan-builder.ts
1407
+ import { assert as assert4, getDebug as getDebug2 } from "@midscene/shared/utils";
1408
+ var debug2 = getDebug2("plan-builder");
1409
+ function buildPlans(type, locate, param) {
1410
+ let returnPlans = [];
1411
+ const locatePlan = locate ? {
1412
+ type: "Locate",
1413
+ locate,
1414
+ param: locate,
1415
+ thought: ""
1416
+ } : null;
1417
+ if (type === "Tap" || type === "Hover") {
1418
+ assert4(locatePlan, `missing locate info for action "${type}"`);
1419
+ const tapPlan = {
1420
+ type,
1421
+ param: null,
1422
+ thought: "",
1423
+ locate
1424
+ };
1425
+ returnPlans = [locatePlan, tapPlan];
1426
+ }
1427
+ if (type === "Input" || type === "KeyboardPress") {
1428
+ if (type === "Input") {
1429
+ assert4(locatePlan, `missing locate info for action "${type}"`);
1430
+ }
1431
+ assert4(param, `missing param for action "${type}"`);
1432
+ const inputPlan = {
1433
+ type,
1434
+ param,
1435
+ thought: "",
1436
+ locate
1437
+ };
1438
+ if (locatePlan) {
1439
+ returnPlans = [locatePlan, inputPlan];
1440
+ } else {
1441
+ returnPlans = [inputPlan];
1442
+ }
1443
+ }
1444
+ if (type === "Scroll") {
1445
+ assert4(param, `missing param for action "${type}"`);
1446
+ const scrollPlan = {
1447
+ type,
1448
+ param,
1449
+ thought: "",
1450
+ locate
1451
+ };
1452
+ if (locatePlan) {
1453
+ returnPlans = [locatePlan, scrollPlan];
1454
+ } else {
1455
+ returnPlans = [scrollPlan];
1456
+ }
1457
+ }
1458
+ if (type === "Sleep") {
1459
+ assert4(param, `missing param for action "${type}"`);
1460
+ const sleepPlan = {
1461
+ type,
1462
+ param,
1463
+ thought: "",
1464
+ locate: null
1465
+ };
1466
+ returnPlans = [sleepPlan];
1467
+ }
1468
+ if (returnPlans) {
1469
+ debug2("buildPlans", returnPlans);
1470
+ return returnPlans;
1471
+ }
1472
+ throw new Error(`Not supported type: ${type}`);
1473
+ }
1474
+
1358
1475
  // src/common/utils.ts
1359
1476
  import {
1360
1477
  MIDSCENE_REPORT_TAG_NAME,
@@ -1365,10 +1482,10 @@ import { uploadTestInfoToServer } from "@midscene/core/utils";
1365
1482
  import { NodeType } from "@midscene/shared/constants";
1366
1483
  import { traverseTree, treeToList } from "@midscene/shared/extractor";
1367
1484
  import { compositeElementInfoImg, resizeImgBase64 } from "@midscene/shared/img";
1368
- import { assert as assert4, uuid } from "@midscene/shared/utils";
1485
+ import { assert as assert5, uuid } from "@midscene/shared/utils";
1369
1486
  import dayjs from "dayjs";
1370
1487
  async function parseContextFromWebPage(page, _opt) {
1371
- assert4(page, "page is required");
1488
+ assert5(page, "page is required");
1372
1489
  if (page._forceUsePageContext) {
1373
1490
  return await page._forceUsePageContext();
1374
1491
  }
@@ -1396,7 +1513,7 @@ async function parseContextFromWebPage(page, _opt) {
1396
1513
  });
1397
1514
  });
1398
1515
  const elementsInfo = treeToList(webTree);
1399
- assert4(screenshotBase64, "screenshotBase64 is required");
1516
+ assert5(screenshotBase64, "screenshotBase64 is required");
1400
1517
  const elementsPositionInfoWithoutText = elementsInfo.filter(
1401
1518
  (elementInfo) => {
1402
1519
  if (elementInfo.attributes.nodeType === NodeType.TEXT) {
@@ -1488,7 +1605,7 @@ var PageAgent = class {
1488
1605
  });
1489
1606
  }
1490
1607
  return await parseContextFromWebPage(this.page, {
1491
- ignoreMarker: getAIConfigInBoolean2(MATCH_BY_POSITION)
1608
+ ignoreMarker: !!vlLocateMode()
1492
1609
  });
1493
1610
  }
1494
1611
  resetDump() {
@@ -1535,50 +1652,105 @@ var PageAgent = class {
1535
1652
  }
1536
1653
  }
1537
1654
  }
1538
- async aiAction(taskPrompt) {
1539
- if (getAIConfig3(MIDSCENE_USE_VLM_UI_TARS2)) {
1540
- const { executor } = await this.taskExecutor.actionToGoal(taskPrompt, {
1541
- onTaskStart: this.callbackOnTaskStartTip.bind(this)
1542
- });
1543
- this.appendExecutionDump(executor.dump());
1544
- this.writeOutActionDumps();
1545
- if (executor.isInErrorState()) {
1546
- const errorTask = executor.latestErrorTask();
1547
- throw new Error(`${errorTask?.error}
1655
+ afterTaskRunning(executor, doNotThrowError = false) {
1656
+ this.appendExecutionDump(executor.dump());
1657
+ this.writeOutActionDumps();
1658
+ if (executor.isInErrorState() && !doNotThrowError) {
1659
+ const errorTask = executor.latestErrorTask();
1660
+ throw new Error(`${errorTask?.error}
1548
1661
  ${errorTask?.errorStack}`);
1662
+ }
1663
+ }
1664
+ async aiTap(targetPrompt, searchArea) {
1665
+ const plans = buildPlans("Tap", {
1666
+ prompt: targetPrompt,
1667
+ searchArea
1668
+ });
1669
+ const { executor, output } = await this.taskExecutor.runPlans(
1670
+ `Tap ${targetPrompt}`,
1671
+ plans
1672
+ );
1673
+ this.afterTaskRunning(executor);
1674
+ return output;
1675
+ }
1676
+ async aiHover(taskPrompt) {
1677
+ const plans = buildPlans("Hover", {
1678
+ prompt: taskPrompt
1679
+ });
1680
+ const { executor, output } = await this.taskExecutor.runPlans(
1681
+ `Hover ${taskPrompt}`,
1682
+ plans
1683
+ );
1684
+ this.afterTaskRunning(executor);
1685
+ return output;
1686
+ }
1687
+ async aiInput(where, value) {
1688
+ const plans = buildPlans(
1689
+ "Input",
1690
+ {
1691
+ prompt: where
1692
+ },
1693
+ {
1694
+ value
1549
1695
  }
1550
- } else {
1551
- const { executor } = await this.taskExecutor.action(taskPrompt, {
1552
- onTaskStart: this.callbackOnTaskStartTip.bind(this)
1553
- });
1554
- this.appendExecutionDump(executor.dump());
1555
- this.writeOutActionDumps();
1556
- if (executor.isInErrorState()) {
1557
- const errorTask = executor.latestErrorTask();
1558
- throw new Error(`${errorTask?.error}
1559
- ${errorTask?.errorStack}`);
1696
+ );
1697
+ const { executor, output } = await this.taskExecutor.runPlans(
1698
+ `Input ${where} - ${value}`,
1699
+ plans
1700
+ );
1701
+ this.afterTaskRunning(executor);
1702
+ return output;
1703
+ }
1704
+ async aiKeyboardPress(where, value) {
1705
+ const plans = buildPlans(
1706
+ "KeyboardPress",
1707
+ {
1708
+ prompt: where
1709
+ },
1710
+ {
1711
+ value
1560
1712
  }
1561
- }
1713
+ );
1714
+ const { executor, output } = await this.taskExecutor.runPlans(
1715
+ `KeyboardPress ${where} - ${value}`,
1716
+ plans
1717
+ );
1718
+ this.afterTaskRunning(executor);
1719
+ return output;
1720
+ }
1721
+ // async aiScroll(where: string, param: PlanningActionParamScroll) {
1722
+ // const plans = buildPlans(
1723
+ // 'Scroll',
1724
+ // {
1725
+ // prompt: where,
1726
+ // },
1727
+ // param,
1728
+ // );
1729
+ // const { executor, output } = await this.taskExecutor.runPlans(
1730
+ // `Scroll ${where} - ${paramStr(param)}`,
1731
+ // plans,
1732
+ // );
1733
+ // }
1734
+ async aiAction(taskPrompt) {
1735
+ const { executor } = await (getAIConfig3(MIDSCENE_USE_VLM_UI_TARS2) ? this.taskExecutor.actionToGoal(taskPrompt, {
1736
+ onTaskStart: this.callbackOnTaskStartTip.bind(this)
1737
+ }) : this.taskExecutor.action(taskPrompt, {
1738
+ onTaskStart: this.callbackOnTaskStartTip.bind(this)
1739
+ }));
1740
+ this.afterTaskRunning(executor);
1562
1741
  }
1563
1742
  async aiQuery(demand) {
1564
1743
  const { output, executor } = await this.taskExecutor.query(demand, {
1565
1744
  onTaskStart: this.callbackOnTaskStartTip.bind(this)
1566
1745
  });
1567
- this.appendExecutionDump(executor.dump());
1568
- this.writeOutActionDumps();
1569
- if (executor.isInErrorState()) {
1570
- const errorTask = executor.latestErrorTask();
1571
- throw new Error(`${errorTask?.error}
1572
- ${errorTask?.errorStack}`);
1573
- }
1746
+ this.afterTaskRunning(executor);
1574
1747
  return output;
1575
1748
  }
1576
1749
  async aiAssert(assertion, msg, opt) {
1577
1750
  const { output, executor } = await this.taskExecutor.assert(assertion, {
1578
1751
  onTaskStart: this.callbackOnTaskStartTip.bind(this)
1579
1752
  });
1580
- this.appendExecutionDump(executor.dump());
1581
- this.writeOutActionDumps();
1753
+ this.afterTaskRunning(executor, true);
1582
1754
  if (output && opt?.keepRawResponse) {
1583
1755
  return output;
1584
1756
  }