@midscene/core 0.9.2 → 0.10.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -9,7 +9,7 @@
9
9
 
10
10
 
11
11
 
12
- var _chunkP2MMY6CZjs = require('./chunk-P2MMY6CZ.js');
12
+ var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
13
13
  require('./chunk-JP3JBDZS.js');
14
14
  require('./chunk-YSQDPG26.js');
15
15
 
@@ -23,4 +23,4 @@ require('./chunk-YSQDPG26.js');
23
23
 
24
24
 
25
25
 
26
- exports.AiAssert = _chunkP2MMY6CZjs.AiAssert; exports.AiExtractElementInfo = _chunkP2MMY6CZjs.AiExtractElementInfo; exports.AiInspectElement = _chunkP2MMY6CZjs.AiInspectElement; exports.callAiFn = _chunkP2MMY6CZjs.callAiFn; exports.callToGetJSONObject = _chunkP2MMY6CZjs.callToGetJSONObject; exports.describeUserPage = _chunkP2MMY6CZjs.describeUserPage; exports.plan = _chunkP2MMY6CZjs.plan; exports.systemPromptToLocateElement = _chunkP2MMY6CZjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunkP2MMY6CZjs.transformElementPositionToId; exports.vlmPlanning = _chunkP2MMY6CZjs.vlmPlanning;
26
+ exports.AiAssert = _chunkCERQVVPJjs.AiAssert; exports.AiExtractElementInfo = _chunkCERQVVPJjs.AiExtractElementInfo; exports.AiInspectElement = _chunkCERQVVPJjs.AiInspectElement; exports.callAiFn = _chunkCERQVVPJjs.callAiFn; exports.callToGetJSONObject = _chunkCERQVVPJjs.callToGetJSONObject; exports.describeUserPage = _chunkCERQVVPJjs.describeUserPage; exports.plan = _chunkCERQVVPJjs.plan; exports.systemPromptToLocateElement = _chunkCERQVVPJjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId; exports.vlmPlanning = _chunkCERQVVPJjs.vlmPlanning;
@@ -1109,32 +1109,192 @@ async function callAiFn(msgs, AIActionTypeValue) {
1109
1109
  // src/ai-model/prompt/llm-locator.ts
1110
1110
  var _prompts = require('@langchain/core/prompts');
1111
1111
 
1112
+ // src/ai-model/prompt/ui-tars-planning.ts
1113
+ function getTimeZoneInfo() {
1114
+ const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
1115
+ const offset = -(/* @__PURE__ */ new Date()).getTimezoneOffset() / 60;
1116
+ return {
1117
+ timezone: `UTC${offset >= 0 ? "+" : ""}${offset}`,
1118
+ isChina: timeZone === "Asia/Shanghai"
1119
+ };
1120
+ }
1121
+ var language = getTimeZoneInfo().isChina ? "Chinese" : "English";
1122
+ var uiTarsPlanningPrompt = `
1123
+ You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
1124
+
1125
+ ## Output Format
1126
+ \`\`\`
1127
+ Thought: ...
1128
+ Action: ...
1129
+ \`\`\`
1130
+
1131
+ ## Action Space
1132
+ click(start_box='[x1, y1, x2, y2]')
1133
+ left_double(start_box='[x1, y1, x2, y2]')
1134
+ right_single(start_box='[x1, y1, x2, y2]')
1135
+ drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
1136
+ hotkey(key='')
1137
+ type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
1138
+ scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
1139
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
1140
+ finished()
1141
+ call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
1142
+
1143
+ ## Note
1144
+ - Use ${language} in \`Thought\` part.
1145
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
1146
+
1147
+ ## User Instruction
1148
+ `;
1149
+ var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
1150
+ function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
1151
+ let reflection = null;
1152
+ let thought = null;
1153
+ let actionStr = "";
1154
+ text = text.trim();
1155
+ if (mode === "bc") {
1156
+ if (text.startsWith("Thought:")) {
1157
+ const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
1158
+ if (thoughtMatch) {
1159
+ thought = thoughtMatch[1].trim();
1160
+ }
1161
+ } else if (text.startsWith("Reflection:")) {
1162
+ const reflectionMatch = text.match(
1163
+ /Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
1164
+ );
1165
+ if (reflectionMatch) {
1166
+ thought = reflectionMatch[2].trim();
1167
+ reflection = reflectionMatch[1].trim();
1168
+ }
1169
+ } else if (text.startsWith("Action_Summary:")) {
1170
+ const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
1171
+ if (summaryMatch) {
1172
+ thought = summaryMatch[1].trim();
1173
+ }
1174
+ }
1175
+ if (!text.includes("Action:")) {
1176
+ actionStr = text;
1177
+ } else {
1178
+ const actionParts = text.split("Action:");
1179
+ actionStr = actionParts[actionParts.length - 1];
1180
+ }
1181
+ } else if (mode === "o1") {
1182
+ const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
1183
+ const actionSummaryMatch = text.match(
1184
+ /\nAction_Summary:\s*(.*?)\s*Action:/
1185
+ );
1186
+ const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
1187
+ const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
1188
+ const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
1189
+ const actionContent = actionMatch ? actionMatch[1] : null;
1190
+ thought = `${thoughtContent}
1191
+ <Action_Summary>
1192
+ ${actionSummaryContent}`;
1193
+ actionStr = actionContent || "";
1194
+ }
1195
+ const allActions = actionStr.split("\n\n");
1196
+ const actions = [];
1197
+ for (const rawStr of allActions) {
1198
+ const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
1199
+ if (!actionInstance) {
1200
+ console.log(`Action can't parse: ${rawStr}`);
1201
+ continue;
1202
+ }
1203
+ const actionType = actionInstance.function;
1204
+ const params = actionInstance.args;
1205
+ const actionInputs = {};
1206
+ for (const [paramName, param] of Object.entries(params)) {
1207
+ if (!param)
1208
+ continue;
1209
+ const trimmedParam = param.trim();
1210
+ actionInputs[paramName.trim()] = trimmedParam;
1211
+ if (paramName.includes("start_box") || paramName.includes("end_box")) {
1212
+ const oriBox = trimmedParam;
1213
+ const numbers = oriBox.replace(/[()]/g, "").split(",");
1214
+ const floatNumbers = numbers.map(
1215
+ (num) => Number.parseFloat(num) / factor
1216
+ );
1217
+ if (floatNumbers.length === 2) {
1218
+ floatNumbers.push(floatNumbers[0], floatNumbers[1]);
1219
+ }
1220
+ actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
1221
+ }
1222
+ }
1223
+ if (actionType === "finished") {
1224
+ actions.push({
1225
+ reflection,
1226
+ thought,
1227
+ action_type: "finished",
1228
+ action_inputs: {}
1229
+ });
1230
+ } else {
1231
+ actions.push({
1232
+ reflection,
1233
+ thought,
1234
+ action_type: actionType,
1235
+ action_inputs: actionInputs
1236
+ });
1237
+ }
1238
+ }
1239
+ return actions;
1240
+ }
1241
+ function parseAction(actionStr) {
1242
+ try {
1243
+ const functionPattern = /^(\w+)\((.*)\)$/;
1244
+ const match = actionStr.trim().match(functionPattern);
1245
+ if (!match) {
1246
+ throw new Error("Not a function call");
1247
+ }
1248
+ const [_, functionName, argsStr] = match;
1249
+ const kwargs = {};
1250
+ if (argsStr.trim()) {
1251
+ const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
1252
+ for (const pair of argPairs) {
1253
+ const [key, ...valueParts] = pair.split("=");
1254
+ if (!key)
1255
+ continue;
1256
+ const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
1257
+ kwargs[key.trim()] = value;
1258
+ }
1259
+ }
1260
+ return {
1261
+ function: functionName,
1262
+ args: kwargs
1263
+ };
1264
+ } catch (e) {
1265
+ console.error(`Failed to parse action '${actionStr}': ${e}`);
1266
+ return null;
1267
+ }
1268
+ }
1269
+
1112
1270
  // src/ai-model/prompt/ui-tars-locator.ts
1113
1271
  function systemPromptToLocateElementPosition() {
1114
1272
  return `
1115
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
1116
-
1117
- ## Output Format
1118
- \`\`\`
1119
- Action_Summary: ...
1120
- Action: ...
1121
- \`\`\`
1122
-
1123
- ## Action Space
1124
- click(start_box='[x1, y1, x2, y2]')
1125
- long_press(start_box='[x1, y1, x2, y2]', time='')
1126
- type(content='')
1127
- scroll(direction='down or up or right or left')
1128
- open_app(app_name='')
1129
- navigate_back()
1130
- navigate_home()
1131
- WAIT()
1132
- finished() # Submit the task regardless of whether it succeeds or fails.
1133
-
1134
- ## Note
1135
- - Use Chinese in \`Action_Summary\` part.
1136
-
1137
- ## User Instruction
1273
+ You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
1274
+
1275
+ ## Output Format
1276
+ \`\`\`
1277
+ Thought: ...
1278
+ Action: ...
1279
+ \`\`\`
1280
+
1281
+ ## Action Space
1282
+ click(start_box='[x1, y1, x2, y2]')
1283
+ left_double(start_box='[x1, y1, x2, y2]')
1284
+ right_single(start_box='[x1, y1, x2, y2]')
1285
+ drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
1286
+ hotkey(key='')
1287
+ type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
1288
+ scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
1289
+ wait() #Sleep for 5s and take a screenshot to check for any changes.
1290
+ finished()
1291
+ call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
1292
+
1293
+ ## Note
1294
+ - Use ${language} in \`Thought\` part.
1295
+ - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
1296
+
1297
+ ## User Instruction
1138
1298
  `;
1139
1299
  }
1140
1300
 
@@ -1360,8 +1520,6 @@ var _img = require('@midscene/shared/img');
1360
1520
  var _constants = require('@midscene/shared/constants');
1361
1521
 
1362
1522
  var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
1363
- var contextFormatIntro = `
1364
- The user will give you a screenshot and some of the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app. If some text is shown on screenshot but not introduced by the JSON description, use the information you see on screenshot.`;
1365
1523
  function systemPromptToExtract() {
1366
1524
  return `
1367
1525
  You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
@@ -1397,7 +1555,6 @@ DATA_DEMAND start:
1397
1555
  {dataKeys}
1398
1556
 
1399
1557
  {dataQuery}
1400
-
1401
1558
  =====================================
1402
1559
  DATA_DEMAND ends.
1403
1560
  `,
@@ -1406,14 +1563,12 @@ DATA_DEMAND ends.
1406
1563
  function systemPromptToAssert() {
1407
1564
  return `
1408
1565
  ${characteristic}
1409
- ${contextFormatIntro}
1410
-
1411
- Based on the information you get, Return assertion judgment:
1566
+ User will give an assertion, and some information about the page. Based on the information you get, tell whether the assertion is truthy.
1412
1567
 
1413
1568
  Return in the following JSON format:
1414
1569
  {
1415
1570
  thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
1416
- pass: true, // true or false, whether the assertion is passed
1571
+ pass: true, // true or false, whether the assertion is truthy
1417
1572
  }
1418
1573
  `;
1419
1574
  }
@@ -1454,7 +1609,7 @@ function truncateText(text, maxLength = 100) {
1454
1609
  function elementByPositionWithElementInfo(elementsInfo, position) {
1455
1610
  _assert2.default.call(void 0, typeof position !== "undefined", "position is required for query");
1456
1611
  const matchingElements = elementsInfo.filter((item) => {
1457
- return item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
1612
+ return item.attributes.nodeType !== _constants.NodeType.CONTAINER && item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
1458
1613
  });
1459
1614
  if (matchingElements.length === 0) {
1460
1615
  return void 0;
@@ -1710,7 +1865,7 @@ The JSON format is as follows:
1710
1865
  {{
1711
1866
  "actions": [
1712
1867
  {{
1713
- "thought": "Reasons for generating this task, and why this task is feasible on this page",
1868
+ "thought": "Reasons for generating this task, and why this task is feasible on this page.", // Use the same language as the user's instruction.
1714
1869
  "type": "Tap",
1715
1870
  "param": null,
1716
1871
  "locate": {sample} | null,
@@ -1718,8 +1873,8 @@ The JSON format is as follows:
1718
1873
  // ... more actions
1719
1874
  ],
1720
1875
  "taskWillBeAccomplished": boolean,
1721
- "furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null,
1722
- "error"?: string
1876
+ "furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
1877
+ "error"?: string // Use the same language as the user's instruction.
1723
1878
  }}
1724
1879
  Here is an example of how to decompose a task:
1725
1880
 
@@ -2416,13 +2571,10 @@ async function AiAssert(options) {
2416
2571
  {
2417
2572
  type: "text",
2418
2573
  text: `
2419
- pageDescription:
2420
-
2421
- ${description}
2422
- Here is the description of the assertion. Just go ahead:
2423
- =====================================
2424
- ${assertion}
2425
- =====================================
2574
+ Here is the description of the assertion. Just go ahead:
2575
+ =====================================
2576
+ ${assertion}
2577
+ =====================================
2426
2578
  `
2427
2579
  }
2428
2580
  ]
@@ -2485,156 +2637,6 @@ async function plan(userPrompt, opts) {
2485
2637
  return planFromAI;
2486
2638
  }
2487
2639
 
2488
- // src/ai-model/prompt/ui-tars-planning.ts
2489
- var uiTarsPlanningPrompt = `
2490
- You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
2491
-
2492
- ## Output Format
2493
-
2494
- \`\`\`
2495
- Thought: ...
2496
- Action: ...
2497
- \`\`\`
2498
-
2499
- ## Action Space
2500
- click(start_box='[x1, y1, x2, y2]')
2501
- left_double(start_box='[x1, y1, x2, y2]')
2502
- right_single(start_box='[x1, y1, x2, y2]')
2503
- drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
2504
- hotkey(key='')
2505
- type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
2506
- scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
2507
- wait() #Sleep for 5s and take a screenshot to check for any changes.
2508
- finished()
2509
- call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
2510
-
2511
- ## Note
2512
- - Use Chinese in \`Thought\` part.
2513
- - Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
2514
-
2515
- ## User Instruction
2516
- `;
2517
- var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
2518
- function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
2519
- let reflection = null;
2520
- let thought = null;
2521
- let actionStr = "";
2522
- text = text.trim();
2523
- if (mode === "bc") {
2524
- if (text.startsWith("Thought:")) {
2525
- const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
2526
- if (thoughtMatch) {
2527
- thought = thoughtMatch[1].trim();
2528
- }
2529
- } else if (text.startsWith("Reflection:")) {
2530
- const reflectionMatch = text.match(
2531
- /Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
2532
- );
2533
- if (reflectionMatch) {
2534
- thought = reflectionMatch[2].trim();
2535
- reflection = reflectionMatch[1].trim();
2536
- }
2537
- } else if (text.startsWith("Action_Summary:")) {
2538
- const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
2539
- if (summaryMatch) {
2540
- thought = summaryMatch[1].trim();
2541
- }
2542
- }
2543
- if (!text.includes("Action:")) {
2544
- actionStr = text;
2545
- } else {
2546
- const actionParts = text.split("Action:");
2547
- actionStr = actionParts[actionParts.length - 1];
2548
- }
2549
- } else if (mode === "o1") {
2550
- const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
2551
- const actionSummaryMatch = text.match(
2552
- /\nAction_Summary:\s*(.*?)\s*Action:/
2553
- );
2554
- const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
2555
- const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
2556
- const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
2557
- const actionContent = actionMatch ? actionMatch[1] : null;
2558
- thought = `${thoughtContent}
2559
- <Action_Summary>
2560
- ${actionSummaryContent}`;
2561
- actionStr = actionContent || "";
2562
- }
2563
- const allActions = actionStr.split("\n\n");
2564
- const actions = [];
2565
- for (const rawStr of allActions) {
2566
- const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
2567
- if (!actionInstance) {
2568
- console.log(`Action can't parse: ${rawStr}`);
2569
- continue;
2570
- }
2571
- const actionType = actionInstance.function;
2572
- const params = actionInstance.args;
2573
- const actionInputs = {};
2574
- for (const [paramName, param] of Object.entries(params)) {
2575
- if (!param)
2576
- continue;
2577
- const trimmedParam = param.trim();
2578
- actionInputs[paramName.trim()] = trimmedParam;
2579
- if (paramName.includes("start_box") || paramName.includes("end_box")) {
2580
- const oriBox = trimmedParam;
2581
- const numbers = oriBox.replace(/[()]/g, "").split(",");
2582
- const floatNumbers = numbers.map(
2583
- (num) => Number.parseFloat(num) / factor
2584
- );
2585
- if (floatNumbers.length === 2) {
2586
- floatNumbers.push(floatNumbers[0], floatNumbers[1]);
2587
- }
2588
- actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
2589
- }
2590
- }
2591
- if (actionType === "finished") {
2592
- actions.push({
2593
- reflection,
2594
- thought,
2595
- action_type: "finished",
2596
- action_inputs: {}
2597
- });
2598
- } else {
2599
- actions.push({
2600
- reflection,
2601
- thought,
2602
- action_type: actionType,
2603
- action_inputs: actionInputs
2604
- });
2605
- }
2606
- }
2607
- return actions;
2608
- }
2609
- function parseAction(actionStr) {
2610
- try {
2611
- const functionPattern = /^(\w+)\((.*)\)$/;
2612
- const match = actionStr.trim().match(functionPattern);
2613
- if (!match) {
2614
- throw new Error("Not a function call");
2615
- }
2616
- const [_, functionName, argsStr] = match;
2617
- const kwargs = {};
2618
- if (argsStr.trim()) {
2619
- const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
2620
- for (const pair of argPairs) {
2621
- const [key, ...valueParts] = pair.split("=");
2622
- if (!key)
2623
- continue;
2624
- const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
2625
- kwargs[key.trim()] = value;
2626
- }
2627
- }
2628
- return {
2629
- function: functionName,
2630
- args: kwargs
2631
- };
2632
- } catch (e) {
2633
- console.error(`Failed to parse action '${actionStr}': ${e}`);
2634
- return null;
2635
- }
2636
- }
2637
-
2638
2640
  // src/ai-model/ui-tars-planning.ts
2639
2641
  function capitalize(str) {
2640
2642
  return str.charAt(0).toUpperCase() + str.slice(1);
@@ -184,7 +184,7 @@ function stringifyDumpData(data, indents) {
184
184
  return JSON.stringify(data, replacerForPageObject, indents);
185
185
  }
186
186
  function getVersion() {
187
- return "0.9.2";
187
+ return "0.10.0";
188
188
  }
189
189
  function debugLog(...message) {
190
190
  const debugMode = _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_DEBUG_MODE);
package/dist/lib/index.js CHANGED
@@ -6,7 +6,7 @@
6
6
 
7
7
 
8
8
 
9
- var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
9
+ var _chunkMN5JVUKUjs = require('./chunk-MN5JVUKU.js');
10
10
 
11
11
 
12
12
 
@@ -17,7 +17,8 @@ var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
17
17
 
18
18
 
19
19
 
20
- var _chunkP2MMY6CZjs = require('./chunk-P2MMY6CZ.js');
20
+ var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
21
+
21
22
 
22
23
 
23
24
 
@@ -168,7 +169,7 @@ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
168
169
  }
169
170
  dump() {
170
171
  const dumpData = {
171
- sdkVersion: _chunk6MKLXHAYjs.getVersion.call(void 0, ),
172
+ sdkVersion: _chunkMN5JVUKUjs.getVersion.call(void 0, ),
172
173
  model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
173
174
  logTime: Date.now(),
174
175
  name: this.name,
@@ -191,16 +192,17 @@ var logFileName = "";
191
192
  var logContent = [];
192
193
  var logIdIndexMap = {};
193
194
  var { pid } = process;
194
- var logFileExt = _chunk6MKLXHAYjs.insightDumpFileExt;
195
+ var logFileExt = _chunkMN5JVUKUjs.insightDumpFileExt;
195
196
  var ifInBrowser = typeof window !== "undefined";
196
197
  function writeInsightDump(data, logId, dumpSubscriber) {
197
- const logDir = _chunk6MKLXHAYjs.getLogDir.call(void 0, );
198
+ const logDir = _chunkMN5JVUKUjs.getLogDir.call(void 0, );
198
199
  _assert2.default.call(void 0, logDir, "logDir should be set before writing dump file");
199
200
  const id = logId || _utils.uuid.call(void 0, );
200
201
  const baseData = {
201
- sdkVersion: _chunk6MKLXHAYjs.getVersion.call(void 0, ),
202
+ sdkVersion: _chunkMN5JVUKUjs.getVersion.call(void 0, ),
202
203
  logTime: Date.now(),
203
- model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || ""
204
+ model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
205
+ model_description: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_USE_VLM_UI_TARS) ? "vlm-ui-tars enabled" : ""
204
206
  };
205
207
  const finalData = {
206
208
  logId: id,
@@ -208,7 +210,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
208
210
  ...data
209
211
  };
210
212
  dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
211
- const dataString = _chunk6MKLXHAYjs.stringifyDumpData.call(void 0, finalData, 2);
213
+ const dataString = _chunkMN5JVUKUjs.stringifyDumpData.call(void 0, finalData, 2);
212
214
  if (typeof logIdIndexMap[id] === "number") {
213
215
  logContent[logIdIndexMap[id]] = dataString;
214
216
  } else {
@@ -222,7 +224,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
222
224
  logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
223
225
  }
224
226
  }
225
- _chunk6MKLXHAYjs.writeLogFile.call(void 0, {
227
+ _chunkMN5JVUKUjs.writeLogFile.call(void 0, {
226
228
  fileName: logFileName,
227
229
  fileExt: logFileExt,
228
230
  fileContent: `[
@@ -237,7 +239,7 @@ ${logContent.join(",\n")}
237
239
  // src/insight/index.ts
238
240
  var Insight = class {
239
241
  constructor(context, opt) {
240
- this.aiVendorFn = _chunkP2MMY6CZjs.callAiFn;
242
+ this.aiVendorFn = _chunkCERQVVPJjs.callAiFn;
241
243
  _assert2.default.call(void 0, context, "context is required for Insight");
242
244
  if (typeof context === "function") {
243
245
  this.contextRetrieverFn = context;
@@ -263,7 +265,7 @@ var Insight = class {
263
265
  this.onceDumpUpdatedFn = void 0;
264
266
  const context = await this.contextRetrieverFn("locate");
265
267
  const startTime = Date.now();
266
- const { parseResult, elementById, rawResponse, usage } = await _chunkP2MMY6CZjs.AiInspectElement.call(void 0, {
268
+ const { parseResult, elementById, rawResponse, usage } = await _chunkCERQVVPJjs.AiInspectElement.call(void 0, {
267
269
  callAI: callAI || this.aiVendorFn,
268
270
  context,
269
271
  multi: Boolean(multi),
@@ -346,7 +348,7 @@ ${parseResult.errors.join("\n")}`;
346
348
  this.onceDumpUpdatedFn = void 0;
347
349
  const context = await this.contextRetrieverFn("extract");
348
350
  const startTime = Date.now();
349
- const { parseResult, elementById } = await _chunkP2MMY6CZjs.AiExtractElementInfo.call(void 0, {
351
+ const { parseResult, elementById } = await _chunkCERQVVPJjs.AiExtractElementInfo.call(void 0, {
350
352
  context,
351
353
  dataQuery: dataDemand
352
354
  });
@@ -400,7 +402,7 @@ ${parseResult.errors.join("\n")}`;
400
402
  this.onceDumpUpdatedFn = void 0;
401
403
  const context = await this.contextRetrieverFn("assert");
402
404
  const startTime = Date.now();
403
- const assertResult = await _chunkP2MMY6CZjs.AiAssert.call(void 0, {
405
+ const assertResult = await _chunkCERQVVPJjs.AiAssert.call(void 0, {
404
406
  assertion,
405
407
  context
406
408
  });
@@ -448,4 +450,4 @@ var src_default = Insight;
448
450
 
449
451
 
450
452
 
451
- exports.AIResponseFormat = _chunkP2MMY6CZjs.AIResponseFormat; exports.BaseElement = _chunkP2MMY6CZjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunkP2MMY6CZjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunk6MKLXHAYjs.getLogDirByType; exports.getVersion = _chunk6MKLXHAYjs.getVersion; exports.plan = _chunkP2MMY6CZjs.plan; exports.setLogDir = _chunk6MKLXHAYjs.setLogDir; exports.transformElementPositionToId = _chunkP2MMY6CZjs.transformElementPositionToId;
453
+ exports.AIResponseFormat = _chunkCERQVVPJjs.AIResponseFormat; exports.BaseElement = _chunkCERQVVPJjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunkCERQVVPJjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunkMN5JVUKUjs.getLogDirByType; exports.getVersion = _chunkMN5JVUKUjs.getVersion; exports.plan = _chunkCERQVVPJjs.plan; exports.setLogDir = _chunkMN5JVUKUjs.setLogDir; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId;
@@ -1,8 +1,8 @@
1
- import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-c4bec333.js';
1
+ import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-64c4d87b.js';
2
2
  import { ChatCompletionMessageParam } from 'openai/resources';
3
3
  export { ChatCompletionMessageParam } from 'openai/resources';
4
- import { A as AIActionType } from './llm-planning-7247f4e9.js';
5
- export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-7247f4e9.js';
4
+ import { A as AIActionType } from './llm-planning-ca109221.js';
5
+ export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
6
6
  import '@midscene/shared/constants';
7
7
 
8
8
  declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
@@ -1,7 +1,7 @@
1
- import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-c4bec333.js';
2
- export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-c4bec333.js';
3
- import { c as callAiFn } from './llm-planning-7247f4e9.js';
4
- export { p as plan, t as transformElementPositionToId } from './llm-planning-7247f4e9.js';
1
+ import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-64c4d87b.js';
2
+ export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-64c4d87b.js';
3
+ import { c as callAiFn } from './llm-planning-ca109221.js';
4
+ export { p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
5
5
  export { getLogDirByType, getVersion, setLogDir } from './utils.js';
6
6
  import '@midscene/shared/constants';
7
7
  import 'openai/resources';
@@ -1,4 +1,4 @@
1
- import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-c4bec333.js';
1
+ import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-64c4d87b.js';
2
2
  import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
3
3
 
4
4
  type AIArgs = [
@@ -101,6 +101,7 @@ interface DumpMeta {
101
101
  sdkVersion: string;
102
102
  logTime: number;
103
103
  model_name: string;
104
+ model_description?: string;
104
105
  }
105
106
  interface ReportDumpWithAttributes {
106
107
  dumpString: string;
@@ -1,4 +1,4 @@
1
- import { r as ReportDumpWithAttributes, R as Rect } from './types-c4bec333.js';
1
+ import { r as ReportDumpWithAttributes, R as Rect } from './types-64c4d87b.js';
2
2
  import '@midscene/shared/constants';
3
3
  import 'openai/resources';
4
4
 
package/dist/lib/utils.js CHANGED
@@ -16,7 +16,7 @@
16
16
 
17
17
 
18
18
 
19
- var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
19
+ var _chunkMN5JVUKUjs = require('./chunk-MN5JVUKU.js');
20
20
  require('./chunk-JP3JBDZS.js');
21
21
  require('./chunk-YSQDPG26.js');
22
22
 
@@ -37,4 +37,4 @@ require('./chunk-YSQDPG26.js');
37
37
 
38
38
 
39
39
 
40
- exports.getLogDir = _chunk6MKLXHAYjs.getLogDir; exports.getLogDirByType = _chunk6MKLXHAYjs.getLogDirByType; exports.getTmpDir = _chunk6MKLXHAYjs.getTmpDir; exports.getTmpFile = _chunk6MKLXHAYjs.getTmpFile; exports.getVersion = _chunk6MKLXHAYjs.getVersion; exports.groupedActionDumpFileExt = _chunk6MKLXHAYjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunk6MKLXHAYjs.insightDumpFileExt; exports.overlapped = _chunk6MKLXHAYjs.overlapped; exports.replaceStringWithFirstAppearance = _chunk6MKLXHAYjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunk6MKLXHAYjs.replacerForPageObject; exports.reportHTMLContent = _chunk6MKLXHAYjs.reportHTMLContent; exports.setLogDir = _chunk6MKLXHAYjs.setLogDir; exports.sleep = _chunk6MKLXHAYjs.sleep; exports.stringifyDumpData = _chunk6MKLXHAYjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunk6MKLXHAYjs.uploadTestInfoToServer; exports.writeDumpReport = _chunk6MKLXHAYjs.writeDumpReport; exports.writeLogFile = _chunk6MKLXHAYjs.writeLogFile;
40
+ exports.getLogDir = _chunkMN5JVUKUjs.getLogDir; exports.getLogDirByType = _chunkMN5JVUKUjs.getLogDirByType; exports.getTmpDir = _chunkMN5JVUKUjs.getTmpDir; exports.getTmpFile = _chunkMN5JVUKUjs.getTmpFile; exports.getVersion = _chunkMN5JVUKUjs.getVersion; exports.groupedActionDumpFileExt = _chunkMN5JVUKUjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunkMN5JVUKUjs.insightDumpFileExt; exports.overlapped = _chunkMN5JVUKUjs.overlapped; exports.replaceStringWithFirstAppearance = _chunkMN5JVUKUjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunkMN5JVUKUjs.replacerForPageObject; exports.reportHTMLContent = _chunkMN5JVUKUjs.reportHTMLContent; exports.setLogDir = _chunkMN5JVUKUjs.setLogDir; exports.sleep = _chunkMN5JVUKUjs.sleep; exports.stringifyDumpData = _chunkMN5JVUKUjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunkMN5JVUKUjs.uploadTestInfoToServer; exports.writeDumpReport = _chunkMN5JVUKUjs.writeDumpReport; exports.writeLogFile = _chunkMN5JVUKUjs.writeLogFile;