@midscene/core 0.9.2 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-P2MMY6CZ.js → chunk-CERQVVPJ.js} +193 -191
- package/dist/lib/{chunk-6MKLXHAY.js → chunk-MN5JVUKU.js} +1 -1
- package/dist/lib/index.js +16 -14
- package/dist/lib/types/ai-model.d.ts +3 -3
- package/dist/lib/types/index.d.ts +4 -4
- package/dist/lib/types/{llm-planning-7247f4e9.d.ts → llm-planning-ca109221.d.ts} +1 -1
- package/dist/lib/types/{types-c4bec333.d.ts → types-64c4d87b.d.ts} +1 -0
- package/dist/lib/types/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/package.json +3 -2
- package/report/index.html +2 -2
package/dist/lib/ai-model.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
var _chunkP2MMY6CZjs = require('./chunk-P2MMY6CZ.js');
|
|
12
|
+
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
13
13
|
require('./chunk-JP3JBDZS.js');
|
|
14
14
|
require('./chunk-YSQDPG26.js');
|
|
15
15
|
|
|
@@ -23,4 +23,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
exports.AiAssert =
|
|
26
|
+
exports.AiAssert = _chunkCERQVVPJjs.AiAssert; exports.AiExtractElementInfo = _chunkCERQVVPJjs.AiExtractElementInfo; exports.AiInspectElement = _chunkCERQVVPJjs.AiInspectElement; exports.callAiFn = _chunkCERQVVPJjs.callAiFn; exports.callToGetJSONObject = _chunkCERQVVPJjs.callToGetJSONObject; exports.describeUserPage = _chunkCERQVVPJjs.describeUserPage; exports.plan = _chunkCERQVVPJjs.plan; exports.systemPromptToLocateElement = _chunkCERQVVPJjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId; exports.vlmPlanning = _chunkCERQVVPJjs.vlmPlanning;
|
|
package/dist/lib/{chunk-P2MMY6CZ.js → chunk-CERQVVPJ.js}
CHANGED
|
@@ -1109,32 +1109,192 @@ async function callAiFn(msgs, AIActionTypeValue) {
|
|
|
1109
1109
|
// src/ai-model/prompt/llm-locator.ts
|
|
1110
1110
|
var _prompts = require('@langchain/core/prompts');
|
|
1111
1111
|
|
|
1112
|
+
// src/ai-model/prompt/ui-tars-planning.ts
|
|
1113
|
+
function getTimeZoneInfo() {
|
|
1114
|
+
const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
|
|
1115
|
+
const offset = -(/* @__PURE__ */ new Date()).getTimezoneOffset() / 60;
|
|
1116
|
+
return {
|
|
1117
|
+
timezone: `UTC${offset >= 0 ? "+" : ""}${offset}`,
|
|
1118
|
+
isChina: timeZone === "Asia/Shanghai"
|
|
1119
|
+
};
|
|
1120
|
+
}
|
|
1121
|
+
var language = getTimeZoneInfo().isChina ? "Chinese" : "English";
|
|
1122
|
+
var uiTarsPlanningPrompt = `
|
|
1123
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
1124
|
+
|
|
1125
|
+
## Output Format
|
|
1126
|
+
\`\`\`
|
|
1127
|
+
Thought: ...
|
|
1128
|
+
Action: ...
|
|
1129
|
+
\`\`\`
|
|
1130
|
+
|
|
1131
|
+
## Action Space
|
|
1132
|
+
click(start_box='[x1, y1, x2, y2]')
|
|
1133
|
+
left_double(start_box='[x1, y1, x2, y2]')
|
|
1134
|
+
right_single(start_box='[x1, y1, x2, y2]')
|
|
1135
|
+
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
1136
|
+
hotkey(key='')
|
|
1137
|
+
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
1138
|
+
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
1139
|
+
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
1140
|
+
finished()
|
|
1141
|
+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
1142
|
+
|
|
1143
|
+
## Note
|
|
1144
|
+
- Use ${language} in \`Thought\` part.
|
|
1145
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
1146
|
+
|
|
1147
|
+
## User Instruction
|
|
1148
|
+
`;
|
|
1149
|
+
var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
|
|
1150
|
+
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
|
|
1151
|
+
let reflection = null;
|
|
1152
|
+
let thought = null;
|
|
1153
|
+
let actionStr = "";
|
|
1154
|
+
text = text.trim();
|
|
1155
|
+
if (mode === "bc") {
|
|
1156
|
+
if (text.startsWith("Thought:")) {
|
|
1157
|
+
const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
|
|
1158
|
+
if (thoughtMatch) {
|
|
1159
|
+
thought = thoughtMatch[1].trim();
|
|
1160
|
+
}
|
|
1161
|
+
} else if (text.startsWith("Reflection:")) {
|
|
1162
|
+
const reflectionMatch = text.match(
|
|
1163
|
+
/Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
|
|
1164
|
+
);
|
|
1165
|
+
if (reflectionMatch) {
|
|
1166
|
+
thought = reflectionMatch[2].trim();
|
|
1167
|
+
reflection = reflectionMatch[1].trim();
|
|
1168
|
+
}
|
|
1169
|
+
} else if (text.startsWith("Action_Summary:")) {
|
|
1170
|
+
const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
|
|
1171
|
+
if (summaryMatch) {
|
|
1172
|
+
thought = summaryMatch[1].trim();
|
|
1173
|
+
}
|
|
1174
|
+
}
|
|
1175
|
+
if (!text.includes("Action:")) {
|
|
1176
|
+
actionStr = text;
|
|
1177
|
+
} else {
|
|
1178
|
+
const actionParts = text.split("Action:");
|
|
1179
|
+
actionStr = actionParts[actionParts.length - 1];
|
|
1180
|
+
}
|
|
1181
|
+
} else if (mode === "o1") {
|
|
1182
|
+
const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
|
|
1183
|
+
const actionSummaryMatch = text.match(
|
|
1184
|
+
/\nAction_Summary:\s*(.*?)\s*Action:/
|
|
1185
|
+
);
|
|
1186
|
+
const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
|
|
1187
|
+
const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
|
|
1188
|
+
const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
|
|
1189
|
+
const actionContent = actionMatch ? actionMatch[1] : null;
|
|
1190
|
+
thought = `${thoughtContent}
|
|
1191
|
+
<Action_Summary>
|
|
1192
|
+
${actionSummaryContent}`;
|
|
1193
|
+
actionStr = actionContent || "";
|
|
1194
|
+
}
|
|
1195
|
+
const allActions = actionStr.split("\n\n");
|
|
1196
|
+
const actions = [];
|
|
1197
|
+
for (const rawStr of allActions) {
|
|
1198
|
+
const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
|
|
1199
|
+
if (!actionInstance) {
|
|
1200
|
+
console.log(`Action can't parse: ${rawStr}`);
|
|
1201
|
+
continue;
|
|
1202
|
+
}
|
|
1203
|
+
const actionType = actionInstance.function;
|
|
1204
|
+
const params = actionInstance.args;
|
|
1205
|
+
const actionInputs = {};
|
|
1206
|
+
for (const [paramName, param] of Object.entries(params)) {
|
|
1207
|
+
if (!param)
|
|
1208
|
+
continue;
|
|
1209
|
+
const trimmedParam = param.trim();
|
|
1210
|
+
actionInputs[paramName.trim()] = trimmedParam;
|
|
1211
|
+
if (paramName.includes("start_box") || paramName.includes("end_box")) {
|
|
1212
|
+
const oriBox = trimmedParam;
|
|
1213
|
+
const numbers = oriBox.replace(/[()]/g, "").split(",");
|
|
1214
|
+
const floatNumbers = numbers.map(
|
|
1215
|
+
(num) => Number.parseFloat(num) / factor
|
|
1216
|
+
);
|
|
1217
|
+
if (floatNumbers.length === 2) {
|
|
1218
|
+
floatNumbers.push(floatNumbers[0], floatNumbers[1]);
|
|
1219
|
+
}
|
|
1220
|
+
actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
|
|
1221
|
+
}
|
|
1222
|
+
}
|
|
1223
|
+
if (actionType === "finished") {
|
|
1224
|
+
actions.push({
|
|
1225
|
+
reflection,
|
|
1226
|
+
thought,
|
|
1227
|
+
action_type: "finished",
|
|
1228
|
+
action_inputs: {}
|
|
1229
|
+
});
|
|
1230
|
+
} else {
|
|
1231
|
+
actions.push({
|
|
1232
|
+
reflection,
|
|
1233
|
+
thought,
|
|
1234
|
+
action_type: actionType,
|
|
1235
|
+
action_inputs: actionInputs
|
|
1236
|
+
});
|
|
1237
|
+
}
|
|
1238
|
+
}
|
|
1239
|
+
return actions;
|
|
1240
|
+
}
|
|
1241
|
+
function parseAction(actionStr) {
|
|
1242
|
+
try {
|
|
1243
|
+
const functionPattern = /^(\w+)\((.*)\)$/;
|
|
1244
|
+
const match = actionStr.trim().match(functionPattern);
|
|
1245
|
+
if (!match) {
|
|
1246
|
+
throw new Error("Not a function call");
|
|
1247
|
+
}
|
|
1248
|
+
const [_, functionName, argsStr] = match;
|
|
1249
|
+
const kwargs = {};
|
|
1250
|
+
if (argsStr.trim()) {
|
|
1251
|
+
const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
|
|
1252
|
+
for (const pair of argPairs) {
|
|
1253
|
+
const [key, ...valueParts] = pair.split("=");
|
|
1254
|
+
if (!key)
|
|
1255
|
+
continue;
|
|
1256
|
+
const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
|
|
1257
|
+
kwargs[key.trim()] = value;
|
|
1258
|
+
}
|
|
1259
|
+
}
|
|
1260
|
+
return {
|
|
1261
|
+
function: functionName,
|
|
1262
|
+
args: kwargs
|
|
1263
|
+
};
|
|
1264
|
+
} catch (e) {
|
|
1265
|
+
console.error(`Failed to parse action '${actionStr}': ${e}`);
|
|
1266
|
+
return null;
|
|
1267
|
+
}
|
|
1268
|
+
}
|
|
1269
|
+
|
|
1112
1270
|
// src/ai-model/prompt/ui-tars-locator.ts
|
|
1113
1271
|
function systemPromptToLocateElementPosition() {
|
|
1114
1272
|
return `
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1273
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
1274
|
+
|
|
1275
|
+
## Output Format
|
|
1276
|
+
\`\`\`
|
|
1277
|
+
Thought: ...
|
|
1278
|
+
Action: ...
|
|
1279
|
+
\`\`\`
|
|
1280
|
+
|
|
1281
|
+
## Action Space
|
|
1282
|
+
click(start_box='[x1, y1, x2, y2]')
|
|
1283
|
+
left_double(start_box='[x1, y1, x2, y2]')
|
|
1284
|
+
right_single(start_box='[x1, y1, x2, y2]')
|
|
1285
|
+
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
1286
|
+
hotkey(key='')
|
|
1287
|
+
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
1288
|
+
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
1289
|
+
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
1290
|
+
finished()
|
|
1291
|
+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
1292
|
+
|
|
1293
|
+
## Note
|
|
1294
|
+
- Use ${language} in \`Thought\` part.
|
|
1295
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
1296
|
+
|
|
1297
|
+
## User Instruction
|
|
1138
1298
|
`;
|
|
1139
1299
|
}
|
|
1140
1300
|
|
|
@@ -1360,8 +1520,6 @@ var _img = require('@midscene/shared/img');
|
|
|
1360
1520
|
var _constants = require('@midscene/shared/constants');
|
|
1361
1521
|
|
|
1362
1522
|
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
1363
|
-
var contextFormatIntro = `
|
|
1364
|
-
The user will give you a screenshot and some of the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app. If some text is shown on screenshot but not introduced by the JSON description, use the information you see on screenshot.`;
|
|
1365
1523
|
function systemPromptToExtract() {
|
|
1366
1524
|
return `
|
|
1367
1525
|
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
@@ -1397,7 +1555,6 @@ DATA_DEMAND start:
|
|
|
1397
1555
|
{dataKeys}
|
|
1398
1556
|
|
|
1399
1557
|
{dataQuery}
|
|
1400
|
-
|
|
1401
1558
|
=====================================
|
|
1402
1559
|
DATA_DEMAND ends.
|
|
1403
1560
|
`,
|
|
@@ -1406,14 +1563,12 @@ DATA_DEMAND ends.
|
|
|
1406
1563
|
function systemPromptToAssert() {
|
|
1407
1564
|
return `
|
|
1408
1565
|
${characteristic}
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
Based on the information you get, Return assertion judgment:
|
|
1566
|
+
User will give an assertion, and some information about the page. Based on the information you get, tell whether the assertion is truthy.
|
|
1412
1567
|
|
|
1413
1568
|
Return in the following JSON format:
|
|
1414
1569
|
{
|
|
1415
1570
|
thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
|
|
1416
|
-
pass: true, // true or false, whether the assertion is
|
|
1571
|
+
pass: true, // true or false, whether the assertion is truthy
|
|
1417
1572
|
}
|
|
1418
1573
|
`;
|
|
1419
1574
|
}
|
|
@@ -1454,7 +1609,7 @@ function truncateText(text, maxLength = 100) {
|
|
|
1454
1609
|
function elementByPositionWithElementInfo(elementsInfo, position) {
|
|
1455
1610
|
_assert2.default.call(void 0, typeof position !== "undefined", "position is required for query");
|
|
1456
1611
|
const matchingElements = elementsInfo.filter((item) => {
|
|
1457
|
-
return item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
|
|
1612
|
+
return item.attributes.nodeType !== _constants.NodeType.CONTAINER && item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
|
|
1458
1613
|
});
|
|
1459
1614
|
if (matchingElements.length === 0) {
|
|
1460
1615
|
return void 0;
|
|
@@ -1710,7 +1865,7 @@ The JSON format is as follows:
|
|
|
1710
1865
|
{{
|
|
1711
1866
|
"actions": [
|
|
1712
1867
|
{{
|
|
1713
|
-
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
1868
|
+
"thought": "Reasons for generating this task, and why this task is feasible on this page.", // Use the same language as the user's instruction.
|
|
1714
1869
|
"type": "Tap",
|
|
1715
1870
|
"param": null,
|
|
1716
1871
|
"locate": {sample} | null,
|
|
@@ -1718,8 +1873,8 @@ The JSON format is as follows:
|
|
|
1718
1873
|
// ... more actions
|
|
1719
1874
|
],
|
|
1720
1875
|
"taskWillBeAccomplished": boolean,
|
|
1721
|
-
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null,
|
|
1722
|
-
"error"?: string
|
|
1876
|
+
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
|
|
1877
|
+
"error"?: string // Use the same language as the user's instruction.
|
|
1723
1878
|
}}
|
|
1724
1879
|
Here is an example of how to decompose a task:
|
|
1725
1880
|
|
|
@@ -2416,13 +2571,10 @@ async function AiAssert(options) {
|
|
|
2416
2571
|
{
|
|
2417
2572
|
type: "text",
|
|
2418
2573
|
text: `
|
|
2419
|
-
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
=====================================
|
|
2424
|
-
${assertion}
|
|
2425
|
-
=====================================
|
|
2574
|
+
Here is the description of the assertion. Just go ahead:
|
|
2575
|
+
=====================================
|
|
2576
|
+
${assertion}
|
|
2577
|
+
=====================================
|
|
2426
2578
|
`
|
|
2427
2579
|
}
|
|
2428
2580
|
]
|
|
@@ -2485,156 +2637,6 @@ async function plan(userPrompt, opts) {
|
|
|
2485
2637
|
return planFromAI;
|
|
2486
2638
|
}
|
|
2487
2639
|
|
|
2488
|
-
// src/ai-model/prompt/ui-tars-planning.ts
|
|
2489
|
-
var uiTarsPlanningPrompt = `
|
|
2490
|
-
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
2491
|
-
|
|
2492
|
-
## Output Format
|
|
2493
|
-
|
|
2494
|
-
\`\`\`
|
|
2495
|
-
Thought: ...
|
|
2496
|
-
Action: ...
|
|
2497
|
-
\`\`\`
|
|
2498
|
-
|
|
2499
|
-
## Action Space
|
|
2500
|
-
click(start_box='[x1, y1, x2, y2]')
|
|
2501
|
-
left_double(start_box='[x1, y1, x2, y2]')
|
|
2502
|
-
right_single(start_box='[x1, y1, x2, y2]')
|
|
2503
|
-
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
2504
|
-
hotkey(key='')
|
|
2505
|
-
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
2506
|
-
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
2507
|
-
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
2508
|
-
finished()
|
|
2509
|
-
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
2510
|
-
|
|
2511
|
-
## Note
|
|
2512
|
-
- Use Chinese in \`Thought\` part.
|
|
2513
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
2514
|
-
|
|
2515
|
-
## User Instruction
|
|
2516
|
-
`;
|
|
2517
|
-
var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
|
|
2518
|
-
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
|
|
2519
|
-
let reflection = null;
|
|
2520
|
-
let thought = null;
|
|
2521
|
-
let actionStr = "";
|
|
2522
|
-
text = text.trim();
|
|
2523
|
-
if (mode === "bc") {
|
|
2524
|
-
if (text.startsWith("Thought:")) {
|
|
2525
|
-
const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
|
|
2526
|
-
if (thoughtMatch) {
|
|
2527
|
-
thought = thoughtMatch[1].trim();
|
|
2528
|
-
}
|
|
2529
|
-
} else if (text.startsWith("Reflection:")) {
|
|
2530
|
-
const reflectionMatch = text.match(
|
|
2531
|
-
/Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
|
|
2532
|
-
);
|
|
2533
|
-
if (reflectionMatch) {
|
|
2534
|
-
thought = reflectionMatch[2].trim();
|
|
2535
|
-
reflection = reflectionMatch[1].trim();
|
|
2536
|
-
}
|
|
2537
|
-
} else if (text.startsWith("Action_Summary:")) {
|
|
2538
|
-
const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
|
|
2539
|
-
if (summaryMatch) {
|
|
2540
|
-
thought = summaryMatch[1].trim();
|
|
2541
|
-
}
|
|
2542
|
-
}
|
|
2543
|
-
if (!text.includes("Action:")) {
|
|
2544
|
-
actionStr = text;
|
|
2545
|
-
} else {
|
|
2546
|
-
const actionParts = text.split("Action:");
|
|
2547
|
-
actionStr = actionParts[actionParts.length - 1];
|
|
2548
|
-
}
|
|
2549
|
-
} else if (mode === "o1") {
|
|
2550
|
-
const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
|
|
2551
|
-
const actionSummaryMatch = text.match(
|
|
2552
|
-
/\nAction_Summary:\s*(.*?)\s*Action:/
|
|
2553
|
-
);
|
|
2554
|
-
const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
|
|
2555
|
-
const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
|
|
2556
|
-
const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
|
|
2557
|
-
const actionContent = actionMatch ? actionMatch[1] : null;
|
|
2558
|
-
thought = `${thoughtContent}
|
|
2559
|
-
<Action_Summary>
|
|
2560
|
-
${actionSummaryContent}`;
|
|
2561
|
-
actionStr = actionContent || "";
|
|
2562
|
-
}
|
|
2563
|
-
const allActions = actionStr.split("\n\n");
|
|
2564
|
-
const actions = [];
|
|
2565
|
-
for (const rawStr of allActions) {
|
|
2566
|
-
const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
|
|
2567
|
-
if (!actionInstance) {
|
|
2568
|
-
console.log(`Action can't parse: ${rawStr}`);
|
|
2569
|
-
continue;
|
|
2570
|
-
}
|
|
2571
|
-
const actionType = actionInstance.function;
|
|
2572
|
-
const params = actionInstance.args;
|
|
2573
|
-
const actionInputs = {};
|
|
2574
|
-
for (const [paramName, param] of Object.entries(params)) {
|
|
2575
|
-
if (!param)
|
|
2576
|
-
continue;
|
|
2577
|
-
const trimmedParam = param.trim();
|
|
2578
|
-
actionInputs[paramName.trim()] = trimmedParam;
|
|
2579
|
-
if (paramName.includes("start_box") || paramName.includes("end_box")) {
|
|
2580
|
-
const oriBox = trimmedParam;
|
|
2581
|
-
const numbers = oriBox.replace(/[()]/g, "").split(",");
|
|
2582
|
-
const floatNumbers = numbers.map(
|
|
2583
|
-
(num) => Number.parseFloat(num) / factor
|
|
2584
|
-
);
|
|
2585
|
-
if (floatNumbers.length === 2) {
|
|
2586
|
-
floatNumbers.push(floatNumbers[0], floatNumbers[1]);
|
|
2587
|
-
}
|
|
2588
|
-
actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
|
|
2589
|
-
}
|
|
2590
|
-
}
|
|
2591
|
-
if (actionType === "finished") {
|
|
2592
|
-
actions.push({
|
|
2593
|
-
reflection,
|
|
2594
|
-
thought,
|
|
2595
|
-
action_type: "finished",
|
|
2596
|
-
action_inputs: {}
|
|
2597
|
-
});
|
|
2598
|
-
} else {
|
|
2599
|
-
actions.push({
|
|
2600
|
-
reflection,
|
|
2601
|
-
thought,
|
|
2602
|
-
action_type: actionType,
|
|
2603
|
-
action_inputs: actionInputs
|
|
2604
|
-
});
|
|
2605
|
-
}
|
|
2606
|
-
}
|
|
2607
|
-
return actions;
|
|
2608
|
-
}
|
|
2609
|
-
function parseAction(actionStr) {
|
|
2610
|
-
try {
|
|
2611
|
-
const functionPattern = /^(\w+)\((.*)\)$/;
|
|
2612
|
-
const match = actionStr.trim().match(functionPattern);
|
|
2613
|
-
if (!match) {
|
|
2614
|
-
throw new Error("Not a function call");
|
|
2615
|
-
}
|
|
2616
|
-
const [_, functionName, argsStr] = match;
|
|
2617
|
-
const kwargs = {};
|
|
2618
|
-
if (argsStr.trim()) {
|
|
2619
|
-
const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
|
|
2620
|
-
for (const pair of argPairs) {
|
|
2621
|
-
const [key, ...valueParts] = pair.split("=");
|
|
2622
|
-
if (!key)
|
|
2623
|
-
continue;
|
|
2624
|
-
const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
|
|
2625
|
-
kwargs[key.trim()] = value;
|
|
2626
|
-
}
|
|
2627
|
-
}
|
|
2628
|
-
return {
|
|
2629
|
-
function: functionName,
|
|
2630
|
-
args: kwargs
|
|
2631
|
-
};
|
|
2632
|
-
} catch (e) {
|
|
2633
|
-
console.error(`Failed to parse action '${actionStr}': ${e}`);
|
|
2634
|
-
return null;
|
|
2635
|
-
}
|
|
2636
|
-
}
|
|
2637
|
-
|
|
2638
2640
|
// src/ai-model/ui-tars-planning.ts
|
|
2639
2641
|
function capitalize(str) {
|
|
2640
2642
|
return str.charAt(0).toUpperCase() + str.slice(1);
|
|
package/dist/lib/{chunk-6MKLXHAY.js → chunk-MN5JVUKU.js}
CHANGED
|
@@ -184,7 +184,7 @@ function stringifyDumpData(data, indents) {
|
|
|
184
184
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
185
185
|
}
|
|
186
186
|
function getVersion() {
|
|
187
|
-
return "0.9.2";
|
|
187
|
+
return "0.10.0";
|
|
188
188
|
}
|
|
189
189
|
function debugLog(...message) {
|
|
190
190
|
const debugMode = _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_DEBUG_MODE);
|
package/dist/lib/index.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
|
|
9
|
+
var _chunkMN5JVUKUjs = require('./chunk-MN5JVUKU.js');
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
|
|
@@ -17,7 +17,8 @@ var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
var _chunkP2MMY6CZjs = require('./chunk-P2MMY6CZ.js');
|
|
20
|
+
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
21
|
+
|
|
21
22
|
|
|
22
23
|
|
|
23
24
|
|
|
@@ -168,7 +169,7 @@ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
|
|
|
168
169
|
}
|
|
169
170
|
dump() {
|
|
170
171
|
const dumpData = {
|
|
171
|
-
sdkVersion: _chunk6MKLXHAYjs.getVersion.call(void 0, ),
|
|
172
|
+
sdkVersion: _chunkMN5JVUKUjs.getVersion.call(void 0, ),
|
|
172
173
|
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
|
|
173
174
|
logTime: Date.now(),
|
|
174
175
|
name: this.name,
|
|
@@ -191,16 +192,17 @@ var logFileName = "";
|
|
|
191
192
|
var logContent = [];
|
|
192
193
|
var logIdIndexMap = {};
|
|
193
194
|
var { pid } = process;
|
|
194
|
-
var logFileExt = _chunk6MKLXHAYjs.insightDumpFileExt;
|
|
195
|
+
var logFileExt = _chunkMN5JVUKUjs.insightDumpFileExt;
|
|
195
196
|
var ifInBrowser = typeof window !== "undefined";
|
|
196
197
|
function writeInsightDump(data, logId, dumpSubscriber) {
|
|
197
|
-
const logDir = _chunk6MKLXHAYjs.getLogDir.call(void 0, );
|
|
198
|
+
const logDir = _chunkMN5JVUKUjs.getLogDir.call(void 0, );
|
|
198
199
|
_assert2.default.call(void 0, logDir, "logDir should be set before writing dump file");
|
|
199
200
|
const id = logId || _utils.uuid.call(void 0, );
|
|
200
201
|
const baseData = {
|
|
201
|
-
sdkVersion: _chunk6MKLXHAYjs.getVersion.call(void 0, ),
|
|
202
|
+
sdkVersion: _chunkMN5JVUKUjs.getVersion.call(void 0, ),
|
|
202
203
|
logTime: Date.now(),
|
|
203
|
-
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || ""
|
|
204
|
+
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
|
|
205
|
+
model_description: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_USE_VLM_UI_TARS) ? "vlm-ui-tars enabled" : ""
|
|
204
206
|
};
|
|
205
207
|
const finalData = {
|
|
206
208
|
logId: id,
|
|
@@ -208,7 +210,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
208
210
|
...data
|
|
209
211
|
};
|
|
210
212
|
dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
|
|
211
|
-
const dataString = _chunk6MKLXHAYjs.stringifyDumpData.call(void 0, finalData, 2);
|
|
213
|
+
const dataString = _chunkMN5JVUKUjs.stringifyDumpData.call(void 0, finalData, 2);
|
|
212
214
|
if (typeof logIdIndexMap[id] === "number") {
|
|
213
215
|
logContent[logIdIndexMap[id]] = dataString;
|
|
214
216
|
} else {
|
|
@@ -222,7 +224,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
222
224
|
logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
|
|
223
225
|
}
|
|
224
226
|
}
|
|
225
|
-
|
|
227
|
+
_chunkMN5JVUKUjs.writeLogFile.call(void 0, {
|
|
226
228
|
fileName: logFileName,
|
|
227
229
|
fileExt: logFileExt,
|
|
228
230
|
fileContent: `[
|
|
@@ -237,7 +239,7 @@ ${logContent.join(",\n")}
|
|
|
237
239
|
// src/insight/index.ts
|
|
238
240
|
var Insight = class {
|
|
239
241
|
constructor(context, opt) {
|
|
240
|
-
this.aiVendorFn = _chunkP2MMY6CZjs.callAiFn;
|
|
242
|
+
this.aiVendorFn = _chunkCERQVVPJjs.callAiFn;
|
|
241
243
|
_assert2.default.call(void 0, context, "context is required for Insight");
|
|
242
244
|
if (typeof context === "function") {
|
|
243
245
|
this.contextRetrieverFn = context;
|
|
@@ -263,7 +265,7 @@ var Insight = class {
|
|
|
263
265
|
this.onceDumpUpdatedFn = void 0;
|
|
264
266
|
const context = await this.contextRetrieverFn("locate");
|
|
265
267
|
const startTime = Date.now();
|
|
266
|
-
const { parseResult, elementById, rawResponse, usage } = await _chunkP2MMY6CZjs.AiInspectElement.call(void 0, {
|
|
268
|
+
const { parseResult, elementById, rawResponse, usage } = await _chunkCERQVVPJjs.AiInspectElement.call(void 0, {
|
|
267
269
|
callAI: callAI || this.aiVendorFn,
|
|
268
270
|
context,
|
|
269
271
|
multi: Boolean(multi),
|
|
@@ -346,7 +348,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
346
348
|
this.onceDumpUpdatedFn = void 0;
|
|
347
349
|
const context = await this.contextRetrieverFn("extract");
|
|
348
350
|
const startTime = Date.now();
|
|
349
|
-
const { parseResult, elementById } = await _chunkP2MMY6CZjs.AiExtractElementInfo.call(void 0, {
|
|
351
|
+
const { parseResult, elementById } = await _chunkCERQVVPJjs.AiExtractElementInfo.call(void 0, {
|
|
350
352
|
context,
|
|
351
353
|
dataQuery: dataDemand
|
|
352
354
|
});
|
|
@@ -400,7 +402,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
400
402
|
this.onceDumpUpdatedFn = void 0;
|
|
401
403
|
const context = await this.contextRetrieverFn("assert");
|
|
402
404
|
const startTime = Date.now();
|
|
403
|
-
const assertResult = await _chunkP2MMY6CZjs.AiAssert.call(void 0, {
|
|
405
|
+
const assertResult = await _chunkCERQVVPJjs.AiAssert.call(void 0, {
|
|
404
406
|
assertion,
|
|
405
407
|
context
|
|
406
408
|
});
|
|
@@ -448,4 +450,4 @@ var src_default = Insight;
|
|
|
448
450
|
|
|
449
451
|
|
|
450
452
|
|
|
451
|
-
exports.AIResponseFormat =
|
|
453
|
+
exports.AIResponseFormat = _chunkCERQVVPJjs.AIResponseFormat; exports.BaseElement = _chunkCERQVVPJjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunkCERQVVPJjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunkMN5JVUKUjs.getLogDirByType; exports.getVersion = _chunkMN5JVUKUjs.getVersion; exports.plan = _chunkCERQVVPJjs.plan; exports.setLogDir = _chunkMN5JVUKUjs.setLogDir; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId;
|
|
package/dist/lib/types/ai-model.d.ts
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-c4bec333.js';
|
|
1
|
+
import { g as AIUsageInfo, B as BaseElement, U as UIContext, y as PlanningAction } from './types-64c4d87b.js';
|
|
2
2
|
import { ChatCompletionMessageParam } from 'openai/resources';
|
|
3
3
|
export { ChatCompletionMessageParam } from 'openai/resources';
|
|
4
|
-
import { A as AIActionType } from './llm-planning-7247f4e9.js';
|
|
5
|
-
export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-7247f4e9.js';
|
|
4
|
+
import { A as AIActionType } from './llm-planning-ca109221.js';
|
|
5
|
+
export { d as AiAssert, b as AiExtractElementInfo, a as AiInspectElement, c as callAiFn, p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
|
|
6
6
|
import '@midscene/shared/constants';
|
|
7
7
|
|
|
8
8
|
declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[], AIActionTypeValue: AIActionType): Promise<{
|
|
package/dist/lib/types/index.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-c4bec333.js';
|
|
2
|
-
export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-c4bec333.js';
|
|
3
|
-
import { c as callAiFn } from './llm-planning-7247f4e9.js';
|
|
4
|
-
export { p as plan, t as transformElementPositionToId } from './llm-planning-7247f4e9.js';
|
|
1
|
+
import { E as ExecutionTask, a as ExecutionTaskProgressOptions, b as ExecutionTaskApply, c as ExecutionDump, B as BaseElement, U as UIContext, I as InsightAction, D as DumpSubscriber, d as InsightOptions, e as InsightTaskInfo, f as InsightAssertionResponse, A as AISingleElementResponse } from './types-64c4d87b.js';
|
|
2
|
+
export { n as AIAssertionResponse, k as AIElementIdResponse, l as AIElementResponse, h as AIResponseFormat, m as AISectionParseResponse, i as AISingleElementResponseById, j as AISingleElementResponseByPosition, g as AIUsageInfo, w as AgentAssertOpt, v as AgentWaitForOpt, X as BaseAgentParserOpt, C as CallAIFn, W as Color, q as DumpMeta, u as ElementById, o as EnsureObject, _ as ExecutionRecorderItem, ag as ExecutionTaskAction, af as ExecutionTaskActionApply, ae as ExecutionTaskInsightAssertion, ad as ExecutionTaskInsightAssertionApply, ac as ExecutionTaskInsightAssertionParam, a5 as ExecutionTaskInsightDumpLog, a7 as ExecutionTaskInsightLocate, a6 as ExecutionTaskInsightLocateApply, a4 as ExecutionTaskInsightLocateOutput, a3 as ExecutionTaskInsightLocateParam, ab as ExecutionTaskInsightQuery, aa as ExecutionTaskInsightQueryApply, a9 as ExecutionTaskInsightQueryOutput, a8 as ExecutionTaskInsightQueryParam, ai as ExecutionTaskPlanning, ah as ExecutionTaskPlanningApply, a2 as ExecutionTaskReturn, $ as ExecutionTaskType, a0 as ExecutorContext, aj as GroupedActionDump, s as InsightDump, p as InsightExtractParam, L as LiteUISection, O as OnTaskStartTip, t as PartialInsightDumpFromSDK, z as PlanningAIResponse, y as PlanningAction, N as PlanningActionParamAssert, T as PlanningActionParamError, J as PlanningActionParamHover, K as PlanningActionParamInputOrKeyPress, G as PlanningActionParamPlan, M as PlanningActionParamScroll, Q as PlanningActionParamSleep, H as PlanningActionParamTap, V as PlanningActionParamWaitFor, F as PlanningFurtherPlan, x as PlanningLocateParam, Z as PlaywrightParserOpt, P as Point, Y as PuppeteerParserOpt, R as Rect, r as ReportDumpWithAttributes, S as Size, a1 as TaskCacheInfo } from './types-64c4d87b.js';
|
|
3
|
+
import { c as callAiFn } from './llm-planning-ca109221.js';
|
|
4
|
+
export { p as plan, t as transformElementPositionToId } from './llm-planning-ca109221.js';
|
|
5
5
|
export { getLogDirByType, getVersion, setLogDir } from './utils.js';
|
|
6
6
|
import '@midscene/shared/constants';
|
|
7
7
|
import 'openai/resources';
|
|
package/dist/lib/types/{llm-planning-7247f4e9.d.ts → llm-planning-ca109221.d.ts}
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-c4bec333.js';
|
|
1
|
+
import { g as AIUsageInfo, l as AIElementResponse, B as BaseElement, U as UIContext, A as AISingleElementResponse, j as AISingleElementResponseByPosition, u as ElementById, m as AISectionParseResponse, n as AIAssertionResponse, z as PlanningAIResponse } from './types-64c4d87b.js';
|
|
2
2
|
import { ChatCompletionSystemMessageParam, ChatCompletionUserMessageParam } from 'openai/resources';
|
|
3
3
|
|
|
4
4
|
type AIArgs = [
|
package/dist/lib/utils.js
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
|
|
19
|
+
var _chunkMN5JVUKUjs = require('./chunk-MN5JVUKU.js');
|
|
20
20
|
require('./chunk-JP3JBDZS.js');
|
|
21
21
|
require('./chunk-YSQDPG26.js');
|
|
22
22
|
|
|
@@ -37,4 +37,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
exports.getLogDir =
|
|
40
|
+
exports.getLogDir = _chunkMN5JVUKUjs.getLogDir; exports.getLogDirByType = _chunkMN5JVUKUjs.getLogDirByType; exports.getTmpDir = _chunkMN5JVUKUjs.getTmpDir; exports.getTmpFile = _chunkMN5JVUKUjs.getTmpFile; exports.getVersion = _chunkMN5JVUKUjs.getVersion; exports.groupedActionDumpFileExt = _chunkMN5JVUKUjs.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunkMN5JVUKUjs.insightDumpFileExt; exports.overlapped = _chunkMN5JVUKUjs.overlapped; exports.replaceStringWithFirstAppearance = _chunkMN5JVUKUjs.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunkMN5JVUKUjs.replacerForPageObject; exports.reportHTMLContent = _chunkMN5JVUKUjs.reportHTMLContent; exports.setLogDir = _chunkMN5JVUKUjs.setLogDir; exports.sleep = _chunkMN5JVUKUjs.sleep; exports.stringifyDumpData = _chunkMN5JVUKUjs.stringifyDumpData; exports.uploadTestInfoToServer = _chunkMN5JVUKUjs.uploadTestInfoToServer; exports.writeDumpReport = _chunkMN5JVUKUjs.writeDumpReport; exports.writeLogFile = _chunkMN5JVUKUjs.writeLogFile;
|