@midscene/core 0.17.2-beta-20250521031635.0 → 0.17.2-beta-20250521131112.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-ZKT2DWJO.js → chunk-GHP3FR4O.js} +3 -3
  4. package/dist/es/{chunk-OINLEVDF.js → chunk-K2IXQ5O2.js} +132 -82
  5. package/dist/es/chunk-K2IXQ5O2.js.map +1 -0
  6. package/dist/es/index.d.ts +8 -5
  7. package/dist/es/index.js +85 -2
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
  10. package/dist/es/tree.d.ts +1 -1
  11. package/dist/es/{types-cbcbeb4e.d.ts → types-01381369.d.ts} +26 -2
  12. package/dist/es/utils.d.ts +1 -1
  13. package/dist/es/utils.js +1 -1
  14. package/dist/lib/ai-model.d.ts +3 -3
  15. package/dist/lib/ai-model.js +2 -2
  16. package/dist/lib/{chunk-ZKT2DWJO.js → chunk-GHP3FR4O.js} +3 -3
  17. package/dist/lib/{chunk-OINLEVDF.js → chunk-K2IXQ5O2.js} +140 -90
  18. package/dist/lib/chunk-K2IXQ5O2.js.map +1 -0
  19. package/dist/lib/index.d.ts +8 -5
  20. package/dist/lib/index.js +93 -10
  21. package/dist/lib/index.js.map +1 -1
  22. package/dist/lib/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
  23. package/dist/lib/tree.d.ts +1 -1
  24. package/dist/{types/types-cbcbeb4e.d.ts → lib/types-01381369.d.ts} +26 -2
  25. package/dist/lib/utils.d.ts +1 -1
  26. package/dist/lib/utils.js +2 -2
  27. package/dist/types/ai-model.d.ts +3 -3
  28. package/dist/types/index.d.ts +8 -5
  29. package/dist/types/{llm-planning-26db5c81.d.ts → llm-planning-3bdabecb.d.ts} +4 -2
  30. package/dist/types/tree.d.ts +1 -1
  31. package/dist/{lib/types-cbcbeb4e.d.ts → types/types-01381369.d.ts} +26 -2
  32. package/dist/types/utils.d.ts +1 -1
  33. package/package.json +2 -2
  34. package/dist/es/chunk-OINLEVDF.js.map +0 -1
  35. package/dist/lib/chunk-OINLEVDF.js.map +0 -1
  36. package/dist/es/{chunk-ZKT2DWJO.js.map → chunk-GHP3FR4O.js.map} +0 -0
  37. package/dist/lib/{chunk-ZKT2DWJO.js.map → chunk-GHP3FR4O.js.map} +0 -0
@@ -28,8 +28,6 @@ var _identity = require('@azure/identity');
28
28
 
29
29
 
30
30
 
31
-
32
-
33
31
 
34
32
 
35
33
 
@@ -310,53 +308,9 @@ function buildYamlFlowFromPlans(plans, sleep) {
310
308
  return flow;
311
309
  }
312
310
 
313
- // src/ai-model/prompt/ui-tars-planning.ts
314
- function getTimeZoneInfo() {
315
- const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
316
- const offset = -(/* @__PURE__ */ new Date()).getTimezoneOffset() / 60;
317
- return {
318
- timezone: `UTC${offset >= 0 ? "+" : ""}${offset}`,
319
- isChina: timeZone === "Asia/Shanghai"
320
- };
321
- }
322
- function getLanguage() {
323
- return getTimeZoneInfo().isChina ? "Chinese" : "English";
324
- }
325
- function getUiTarsPlanningPrompt() {
326
- const language2 = getLanguage();
327
- return `
328
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
329
-
330
- ## Output Format
331
- \`\`\`
332
- Thought: ...
333
- Action: ...
334
- \`\`\`
335
-
336
- ## Action Space
337
-
338
- click(start_box='[x1, y1, x2, y2]')
339
- left_double(start_box='[x1, y1, x2, y2]')
340
- right_single(start_box='[x1, y1, x2, y2]')
341
- drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
342
- hotkey(key='')
343
- type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
344
- scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
345
- wait() #Sleep for 5s and take a screenshot to check for any changes.
346
- finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
347
-
348
-
349
- ## Note
350
- - Use ${language2} in \`Thought\` part.
351
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
352
-
353
- ## User Instruction
354
- `;
355
- }
356
- var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
357
-
358
311
  // src/ai-model/prompt/assertion.ts
359
- var language = getTimeZoneInfo().isChina ? "Chinese" : "English";
312
+
313
+ var preferredLanguage = _env.getPreferredLanguage.call(void 0, );
360
314
  var defaultAssertionPrompt = "You are a senior testing engineer. User will give an assertion and a screenshot of a page. By carefully viewing the screenshot, please tell whether the assertion is truthy.";
361
315
  var defaultAssertionResponseJsonFormat = `Return in the following JSON format:
362
316
  {
@@ -373,7 +327,7 @@ var uiTarsAssertionResponseJsonFormat = `## Output Json String Format
373
327
 
374
328
  ## Rules **MUST** follow
375
329
  - Make sure to return **only** the JSON, with **no additional** text or explanations.
376
- - Use ${language} in \`thought\` part.
330
+ - Use ${preferredLanguage} in \`thought\` part.
377
331
  - You **MUST** strictly follow up the **Output Json String Format**.`;
378
332
  function systemPromptToAssert(model) {
379
333
  return `${defaultAssertionPrompt}
@@ -1331,10 +1285,10 @@ async function call(messages, AIActionTypeValue, responseFormat) {
1331
1285
  let content;
1332
1286
  let usage;
1333
1287
  const commonConfig = {
1334
- temperature: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS) ? 0 : 0.1,
1288
+ temperature: _env.vlLocateMode.call(void 0, ) === "vlm-ui-tars" ? 0 : 0.1,
1335
1289
  stream: false,
1336
1290
  max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
1337
- ..._env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_QWEN_VL) ? {
1291
+ ..._env.vlLocateMode.call(void 0, ) === "qwen-vl" ? {
1338
1292
  vl_high_resolution_images: true
1339
1293
  } : {}
1340
1294
  };
@@ -1412,12 +1366,13 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
1412
1366
  case 1 /* INSPECT_ELEMENT */:
1413
1367
  responseFormat = locatorSchema;
1414
1368
  break;
1415
- case 2 /* EXTRACT_DATA */:
1416
- responseFormat = { type: "json_object" /* JSON */ };
1417
- break;
1418
1369
  case 3 /* PLAN */:
1419
1370
  responseFormat = planSchema;
1420
1371
  break;
1372
+ case 2 /* EXTRACT_DATA */:
1373
+ case 4 /* DESCRIBE_ELEMENT */:
1374
+ responseFormat = { type: "json_object" /* JSON */ };
1375
+ break;
1421
1376
  }
1422
1377
  }
1423
1378
  if (model === "gpt-4o-2024-05-13") {
@@ -1493,30 +1448,89 @@ function systemPromptToExtract() {
1493
1448
  return `
1494
1449
  You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
1495
1450
 
1496
- The user will give you a screenshot, the contents of it (optional), and some data requirements in DATA_DEMAND. You need to extract the data according to the DATA_DEMAND.
1451
+ The user will give you a screenshot, the contents of it (optional), and some data requirements in <DATA_DEMAND>. You need to extract the data according to the <DATA_DEMAND>.
1452
+
1453
+ If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1497
1454
 
1498
1455
  Return in the following JSON format:
1499
1456
  {
1500
1457
  data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
1501
1458
  errors: [], // string[], error message if any
1502
1459
  }
1503
- `;
1460
+
1461
+ # Example 1
1462
+ For example, if the DATA_DEMAND is:
1463
+
1464
+ <DATA_DEMAND>
1465
+ {
1466
+ "name": "name shows on the left panel, string",
1467
+ "age": "age shows on the right panel, number",
1468
+ "isAdmin": "if the user is admin, boolean"
1504
1469
  }
1505
- var extractDataPrompt = new (0, _prompts.PromptTemplate)({
1506
- template: `
1507
- pageDescription: {pageDescription}
1470
+ </DATA_DEMAND>
1508
1471
 
1509
- Extract the following data and place it in the \`data\` field. If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1510
- DATA_DEMAND start:
1511
- =====================================
1512
- {dataKeys}
1472
+ By viewing the screenshot and page contents, you can extract the following data:
1473
+
1474
+ {
1475
+ data: {
1476
+ name: "John",
1477
+ age: 30,
1478
+ isAdmin: true
1479
+ },
1480
+ }
1481
+
1482
+ # Example 2
1483
+ If the DATA_DEMAND is:
1484
+
1485
+ <DATA_DEMAND>
1486
+ the todo items list, string[]
1487
+ </DATA_DEMAND>
1488
+
1489
+ By viewing the screenshot and page contents, you can extract the following data:
1490
+
1491
+ {
1492
+ data: ["todo 1", "todo 2", "todo 3"],
1493
+ }
1494
+
1495
+ # Example 3
1496
+ If the DATA_DEMAND is:
1497
+
1498
+ <DATA_DEMAND>
1499
+ the page title, string
1500
+ </DATA_DEMAND>
1501
+
1502
+ By viewing the screenshot and page contents, you can extract the following data:
1503
+
1504
+ {
1505
+ data: "todo list",
1506
+ }
1513
1507
 
1508
+ `;
1509
+ }
1510
+ var extractDataQueryPrompt = async (pageDescription, dataQuery) => {
1511
+ let dataQueryText = "";
1512
+ if (typeof dataQuery === "string") {
1513
+ dataQueryText = dataQuery;
1514
+ } else {
1515
+ dataQueryText = JSON.stringify(dataQuery, null, 2);
1516
+ }
1517
+ const extractDataPrompt = new (0, _prompts.PromptTemplate)({
1518
+ template: `
1519
+ <PageDescription>
1520
+ {pageDescription}
1521
+ </PageDescription>
1522
+
1523
+ <DATA_DEMAND>
1514
1524
  {dataQuery}
1515
- =====================================
1516
- DATA_DEMAND ends.
1525
+ </DATA_DEMAND>
1517
1526
  `,
1518
- inputVariables: ["pageDescription", "dataKeys", "dataQuery"]
1519
- });
1527
+ inputVariables: ["pageDescription", "dataQuery"]
1528
+ });
1529
+ return await extractDataPrompt.format({
1530
+ pageDescription,
1531
+ dataQuery: dataQueryText
1532
+ });
1533
+ };
1520
1534
 
1521
1535
  // src/ai-model/prompt/llm-section-locator.ts
1522
1536
 
@@ -1601,6 +1615,14 @@ async function AiLocateElement(options) {
1601
1615
  context.size
1602
1616
  );
1603
1617
  }
1618
+ let referenceImagePayload;
1619
+ if (_optionalChain([options, 'access', _32 => _32.referenceImage, 'optionalAccess', _33 => _33.rect]) && options.referenceImage.base64) {
1620
+ referenceImagePayload = await _img.cropByRect.call(void 0,
1621
+ options.referenceImage.base64,
1622
+ options.referenceImage.rect,
1623
+ _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_QWEN_VL)
1624
+ );
1625
+ }
1604
1626
  const msgs = [
1605
1627
  { role: "system", content: systemPrompt },
1606
1628
  {
@@ -1630,10 +1652,10 @@ async function AiLocateElement(options) {
1630
1652
  if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
1631
1653
  resRect = adaptBboxToRect(
1632
1654
  res.content.bbox,
1633
- _optionalChain([options, 'access', _32 => _32.searchConfig, 'optionalAccess', _33 => _33.rect, 'optionalAccess', _34 => _34.width]) || context.size.width,
1634
- _optionalChain([options, 'access', _35 => _35.searchConfig, 'optionalAccess', _36 => _36.rect, 'optionalAccess', _37 => _37.height]) || context.size.height,
1635
- _optionalChain([options, 'access', _38 => _38.searchConfig, 'optionalAccess', _39 => _39.rect, 'optionalAccess', _40 => _40.left]),
1636
- _optionalChain([options, 'access', _41 => _41.searchConfig, 'optionalAccess', _42 => _42.rect, 'optionalAccess', _43 => _43.top])
1655
+ _optionalChain([options, 'access', _34 => _34.searchConfig, 'optionalAccess', _35 => _35.rect, 'optionalAccess', _36 => _36.width]) || context.size.width,
1656
+ _optionalChain([options, 'access', _37 => _37.searchConfig, 'optionalAccess', _38 => _38.rect, 'optionalAccess', _39 => _39.height]) || context.size.height,
1657
+ _optionalChain([options, 'access', _40 => _40.searchConfig, 'optionalAccess', _41 => _41.rect, 'optionalAccess', _42 => _42.left]),
1658
+ _optionalChain([options, 'access', _43 => _43.searchConfig, 'optionalAccess', _44 => _44.rect, 'optionalAccess', _45 => _45.top])
1637
1659
  );
1638
1660
  debugInspect("resRect", resRect);
1639
1661
  const rectCenter = {
@@ -1652,7 +1674,7 @@ async function AiLocateElement(options) {
1652
1674
  }
1653
1675
  } catch (e) {
1654
1676
  const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
1655
- if (!errors || _optionalChain([errors, 'optionalAccess', _44 => _44.length]) === 0) {
1677
+ if (!errors || _optionalChain([errors, 'optionalAccess', _46 => _46.length]) === 0) {
1656
1678
  errors = [msg];
1657
1679
  } else {
1658
1680
  errors.push(`(${msg})`);
@@ -1743,20 +1765,10 @@ async function AiExtractElementInfo(options) {
1743
1765
  context,
1744
1766
  liteContextConfig
1745
1767
  );
1746
- let dataKeys = "";
1747
- let dataQueryText = "";
1748
- if (typeof dataQuery === "string") {
1749
- dataKeys = "";
1750
- dataQueryText = dataQuery;
1751
- } else {
1752
- dataKeys = `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}`;
1753
- dataQueryText = JSON.stringify(dataQuery, null, 2);
1754
- }
1755
- const extractDataPromptText = await extractDataPrompt.format({
1756
- pageDescription: description,
1757
- dataKeys,
1758
- dataQuery: dataQueryText
1759
- });
1768
+ const extractDataPromptText = await extractDataQueryPrompt(
1769
+ description,
1770
+ dataQuery
1771
+ );
1760
1772
  const msgs = [
1761
1773
  { role: "system", content: systemPrompt },
1762
1774
  {
@@ -1884,7 +1896,7 @@ async function plan(userInstruction, opts) {
1884
1896
  const { content, usage } = await call2(msgs, 3 /* PLAN */);
1885
1897
  const rawResponse = JSON.stringify(content, void 0, 2);
1886
1898
  const planFromAI = content;
1887
- const actions = (_optionalChain([planFromAI, 'access', _45 => _45.action, 'optionalAccess', _46 => _46.type]) ? [planFromAI.action] : planFromAI.actions) || [];
1899
+ const actions = (_optionalChain([planFromAI, 'access', _47 => _47.action, 'optionalAccess', _48 => _48.type]) ? [planFromAI.action] : planFromAI.actions) || [];
1888
1900
  const returnValue = {
1889
1901
  ...planFromAI,
1890
1902
  actions,
@@ -1911,7 +1923,7 @@ async function plan(userInstruction, opts) {
1911
1923
  _utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
1912
1924
  } else {
1913
1925
  actions.forEach((action) => {
1914
- if (_optionalChain([action, 'access', _47 => _47.locate, 'optionalAccess', _48 => _48.id])) {
1926
+ if (_optionalChain([action, 'access', _49 => _49.locate, 'optionalAccess', _50 => _50.id])) {
1915
1927
  const element = elementById(action.locate.id);
1916
1928
  if (element) {
1917
1929
  action.locate.id = element.id;
@@ -1939,6 +1951,43 @@ var _keyboardlayout = require('@midscene/shared/keyboard-layout');
1939
1951
 
1940
1952
 
1941
1953
  var _actionparser = require('@ui-tars/action-parser');
1954
+
1955
+ // src/ai-model/prompt/ui-tars-planning.ts
1956
+
1957
+ function getUiTarsPlanningPrompt() {
1958
+ const preferredLanguage2 = _env.getPreferredLanguage.call(void 0, );
1959
+ return `
1960
+ You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
1961
+
1962
+ ## Output Format
1963
+ \`\`\`
1964
+ Thought: ...
1965
+ Action: ...
1966
+ \`\`\`
1967
+
1968
+ ## Action Space
1969
+
1970
+ click(start_box='[x1, y1, x2, y2]')
1971
+ left_double(start_box='[x1, y1, x2, y2]')
1972
+ right_single(start_box='[x1, y1, x2, y2]')
1973
+ drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
1974
+ hotkey(key='')
1975
+ type(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format. If you want to submit your input, use \\n at the end of content.
1976
+ scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
1977
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
1978
+ finished(content='xxx') # Use escape characters \\', \\", and \\n in content part to ensure we can parse the content in normal python string format.
1979
+
1980
+
1981
+ ## Note
1982
+ - Use ${preferredLanguage2} in \`Thought\` part.
1983
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
1984
+
1985
+ ## User Instruction
1986
+ `;
1987
+ }
1988
+ var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
1989
+
1990
+ // src/ai-model/ui-tars-planning.ts
1942
1991
  var debug = _logger.getDebug.call(void 0, "ui-tars-planning");
1943
1992
  var bboxSize = 10;
1944
1993
  var pointToBbox = (point, width, height) => {
@@ -2159,6 +2208,7 @@ async function resizeImageForUiTars(imageBase64, size) {
2159
2208
 
2160
2209
 
2161
2210
 
2162
- exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2163
2211
 
2164
- //# sourceMappingURL=chunk-OINLEVDF.js.map
2212
+ exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2213
+
2214
+ //# sourceMappingURL=chunk-K2IXQ5O2.js.map