@midscene/core 0.9.2 → 0.9.3-beta-20250116143806.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/lib/ai-model.js
CHANGED
|
@@ -9,7 +9,7 @@
|
|
|
9
9
|
|
|
10
10
|
|
|
11
11
|
|
|
12
|
-
var
|
|
12
|
+
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
13
13
|
require('./chunk-JP3JBDZS.js');
|
|
14
14
|
require('./chunk-YSQDPG26.js');
|
|
15
15
|
|
|
@@ -23,4 +23,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
|
|
26
|
-
exports.AiAssert =
|
|
26
|
+
exports.AiAssert = _chunkCERQVVPJjs.AiAssert; exports.AiExtractElementInfo = _chunkCERQVVPJjs.AiExtractElementInfo; exports.AiInspectElement = _chunkCERQVVPJjs.AiInspectElement; exports.callAiFn = _chunkCERQVVPJjs.callAiFn; exports.callToGetJSONObject = _chunkCERQVVPJjs.callToGetJSONObject; exports.describeUserPage = _chunkCERQVVPJjs.describeUserPage; exports.plan = _chunkCERQVVPJjs.plan; exports.systemPromptToLocateElement = _chunkCERQVVPJjs.systemPromptToLocateElement; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId; exports.vlmPlanning = _chunkCERQVVPJjs.vlmPlanning;
|
|
@@ -184,7 +184,7 @@ function stringifyDumpData(data, indents) {
|
|
|
184
184
|
return JSON.stringify(data, replacerForPageObject, indents);
|
|
185
185
|
}
|
|
186
186
|
function getVersion() {
|
|
187
|
-
return "0.9.
|
|
187
|
+
return "0.9.3-beta-20250116143806.0";
|
|
188
188
|
}
|
|
189
189
|
function debugLog(...message) {
|
|
190
190
|
const debugMode = _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_DEBUG_MODE);
|
|
@@ -1109,32 +1109,192 @@ async function callAiFn(msgs, AIActionTypeValue) {
|
|
|
1109
1109
|
// src/ai-model/prompt/llm-locator.ts
|
|
1110
1110
|
var _prompts = require('@langchain/core/prompts');
|
|
1111
1111
|
|
|
1112
|
+
// src/ai-model/prompt/ui-tars-planning.ts
|
|
1113
|
+
/**
 * Describes the time zone of the machine this code is running on.
 *
 * The offset is formatted from whole minutes rather than fractional hours, so
 * zones that are not on an hour boundary render as "UTC+5:30" instead of
 * "UTC+5.5". Whole-hour zones keep the short form ("UTC+8", "UTC-5", "UTC+0").
 *
 * @returns {{ timezone: string, isChina: boolean }} `timezone` is the current
 *   UTC offset label; `isChina` is true only when the resolved IANA zone name
 *   is exactly "Asia/Shanghai".
 */
function getTimeZoneInfo() {
  const timeZone = Intl.DateTimeFormat().resolvedOptions().timeZone;
  // getTimezoneOffset() reports minutes *behind* UTC, hence the negation.
  const offsetMinutes = -(/* @__PURE__ */ new Date()).getTimezoneOffset();
  const sign = offsetMinutes >= 0 ? "+" : "-";
  const absoluteMinutes = Math.abs(offsetMinutes);
  const hours = Math.trunc(absoluteMinutes / 60);
  const minutes = absoluteMinutes % 60;
  const minuteSuffix = minutes === 0 ? "" : `:${String(minutes).padStart(2, "0")}`;
  return {
    timezone: `UTC${sign}${hours}${minuteSuffix}`,
    isChina: timeZone === "Asia/Shanghai"
  };
}
|
|
1121
|
+
var language = getTimeZoneInfo().isChina ? "Chinese" : "English";
|
|
1122
|
+
var uiTarsPlanningPrompt = `
|
|
1123
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
1124
|
+
|
|
1125
|
+
## Output Format
|
|
1126
|
+
\`\`\`
|
|
1127
|
+
Thought: ...
|
|
1128
|
+
Action: ...
|
|
1129
|
+
\`\`\`
|
|
1130
|
+
|
|
1131
|
+
## Action Space
|
|
1132
|
+
click(start_box='[x1, y1, x2, y2]')
|
|
1133
|
+
left_double(start_box='[x1, y1, x2, y2]')
|
|
1134
|
+
right_single(start_box='[x1, y1, x2, y2]')
|
|
1135
|
+
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
1136
|
+
hotkey(key='')
|
|
1137
|
+
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
1138
|
+
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
1139
|
+
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
1140
|
+
finished()
|
|
1141
|
+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
1142
|
+
|
|
1143
|
+
## Note
|
|
1144
|
+
- Use ${language} in \`Thought\` part.
|
|
1145
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
1146
|
+
|
|
1147
|
+
## User Instruction
|
|
1148
|
+
`;
|
|
1149
|
+
/**
 * Removes every "Reflection: ..." section from a model prediction and trims
 * the remainder. A section runs from "Reflection:" up to (not including) the
 * next "Action_Summary:" or "Action:" marker, or to the end of the string.
 *
 * @param {string} prediction Raw model output.
 * @returns {string} The prediction without its reflection sections.
 */
var getSummary = (prediction) => {
  const reflectionSection = /Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g;
  const withoutReflection = prediction.replace(reflectionSection, "");
  return withoutReflection.trim();
};
|
|
1150
|
+
/**
 * Converts a raw UI-TARS style model reply into a list of structured actions.
 *
 * In "bc" mode the reply may start with "Thought:", "Reflection:" (followed by
 * "Action_Summary:") or "Action_Summary:"; the action calls are whatever
 * follows the last "Action:" marker, or the whole reply when there is none.
 * In "o1" mode the thought is read from <Thought>…</Thought>, the summary from
 * "Action_Summary:" and the calls from "Action: … </Output>".
 *
 * Calls are separated by a blank line. Each `start_box` / `end_box` argument is
 * turned into a JSON array string of coordinates divided by `factor`; a
 * two-number point is duplicated into a four-number box. Both the bracketed
 * form shown in the prompt ('[x1, y1, x2, y2]') and the parenthesised form
 * ('(x, y)') are accepted. Calls that cannot be parsed are logged and skipped.
 *
 * @param {string} text Raw model reply.
 * @param {number} [factor=1e3] Divisor applied to every box coordinate.
 * @param {string} [mode="bc"] Reply layout, "bc" or "o1".
 * @returns {Array<{reflection: (string|null), thought: (string|null), action_type: string, action_inputs: Object}>}
 *   One entry per successfully parsed call, in reply order.
 */
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
  const reply = text.trim();
  let reflection = null;
  let thought = null;
  let actionStr = "";

  if (mode === "bc") {
    // All three header patterns use the dotAll flag: each section is normally
    // followed by a newline, and sections may themselves span several lines.
    if (reply.startsWith("Thought:")) {
      const thoughtMatch = reply.match(/Thought: (.+?)(?=\s*Action:|$)/s);
      if (thoughtMatch) {
        thought = thoughtMatch[1].trim();
      }
    } else if (reply.startsWith("Reflection:")) {
      const reflectionMatch = reply.match(
        /Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/s
      );
      if (reflectionMatch) {
        thought = reflectionMatch[2].trim();
        reflection = reflectionMatch[1].trim();
      }
    } else if (reply.startsWith("Action_Summary:")) {
      const summaryMatch = reply.match(/Action_Summary: (.+?)(?=\s*Action:|$)/s);
      if (summaryMatch) {
        thought = summaryMatch[1].trim();
      }
    }
    if (reply.includes("Action:")) {
      const actionParts = reply.split("Action:");
      actionStr = actionParts[actionParts.length - 1];
    } else {
      actionStr = reply;
    }
  } else if (mode === "o1") {
    const thoughtMatch = reply.match(/<Thought>\s*(.*?)\s*<\/Thought>/s);
    const actionSummaryMatch = reply.match(/\nAction_Summary:\s*(.*?)\s*Action:/s);
    const actionMatch = reply.match(/\nAction:\s*(.*?)\s*<\/Output>/s);
    const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
    const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
    const actionContent = actionMatch ? actionMatch[1] : null;
    thought = `${thoughtContent}
<Action_Summary>
${actionSummaryContent}`;
    actionStr = actionContent || "";
  }

  const actions = [];
  for (const rawStr of actionStr.split("\n\n")) {
    // Real newlines inside a call are rewritten to the two characters "\n" so
    // the single-line call pattern in parseAction can match.
    const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
    if (!actionInstance) {
      console.log(`Action can't parse: ${rawStr}`);
      continue;
    }
    const actionType = actionInstance.function;
    const actionInputs = {};
    for (const [paramName, param] of Object.entries(actionInstance.args)) {
      if (!param) {
        continue;
      }
      const inputName = paramName.trim();
      const trimmedParam = param.trim();
      actionInputs[inputName] = trimmedParam;
      if (paramName.includes("start_box") || paramName.includes("end_box")) {
        // Strip both () and [] wrappers; leaving "[" in place made the first
        // coordinate parse as NaN, which JSON.stringify turns into null.
        const floatNumbers = trimmedParam
          .replace(/[()[\]]/g, "")
          .split(",")
          .map((num) => Number.parseFloat(num) / factor);
        if (floatNumbers.length === 2) {
          floatNumbers.push(floatNumbers[0], floatNumbers[1]);
        }
        actionInputs[inputName] = JSON.stringify(floatNumbers);
      }
    }
    actions.push({
      reflection,
      thought,
      action_type: actionType,
      // "finished" never carries inputs, whatever the model wrote.
      action_inputs: actionType === "finished" ? {} : actionInputs
    });
  }
  return actions;
}

/**
 * Parses one call of the form `name(key='value', other="value")`.
 *
 * Arguments are split on commas that are outside single- or double-quoted
 * values; backslash-escaped characters inside a quoted value do not end it.
 * Surrounding quotes are removed from each value and `\'` / `\"` are unescaped.
 * Other escapes (such as the two-character "\n") are left as written.
 *
 * @param {string} actionStr A single call on one line.
 * @returns {{ function: string, args: Object<string, string> } | null} The call
 *   name and its keyword arguments, or null (after logging) when `actionStr`
 *   is not a call.
 */
function parseAction(actionStr) {
  try {
    const match = actionStr.trim().match(/^(\w+)\((.*)\)$/);
    if (!match) {
      console.error(`Failed to parse action '${actionStr}': Error: Not a function call`);
      return null;
    }
    const [, functionName, argsStr] = match;
    const kwargs = {};
    if (argsStr.trim()) {
      // Quoted runs are consumed whole; a stray unbalanced quote falls through
      // to the last alternative and is treated as an ordinary character.
      const argPairs = argsStr.match(
        /(?:[^,'"]|'(?:\\.|[^'\\])*'|"(?:\\.|[^"\\])*"|['"])+/g
      ) || [];
      for (const pair of argPairs) {
        const [key, ...valueParts] = pair.split("=");
        const name = key.trim();
        if (!name) {
          continue;
        }
        kwargs[name] = valueParts
          .join("=")
          .trim()
          .replace(/^['"]|['"]$/g, "")
          .replace(/\\(['"])/g, "$1");
      }
    }
    return {
      function: functionName,
      args: kwargs
    };
  } catch (e) {
    // Reached only for unexpected input such as a non-string argument.
    console.error(`Failed to parse action '${actionStr}': ${e}`);
    return null;
  }
}
|
|
1269
|
+
|
|
1112
1270
|
// src/ai-model/prompt/ui-tars-locator.ts
|
|
1113
1271
|
function systemPromptToLocateElementPosition() {
|
|
1114
1272
|
return `
|
|
1115
|
-
|
|
1116
|
-
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
|
|
1130
|
-
|
|
1131
|
-
|
|
1132
|
-
|
|
1133
|
-
|
|
1134
|
-
|
|
1135
|
-
|
|
1136
|
-
|
|
1137
|
-
|
|
1273
|
+
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
1274
|
+
|
|
1275
|
+
## Output Format
|
|
1276
|
+
\`\`\`
|
|
1277
|
+
Thought: ...
|
|
1278
|
+
Action: ...
|
|
1279
|
+
\`\`\`
|
|
1280
|
+
|
|
1281
|
+
## Action Space
|
|
1282
|
+
click(start_box='[x1, y1, x2, y2]')
|
|
1283
|
+
left_double(start_box='[x1, y1, x2, y2]')
|
|
1284
|
+
right_single(start_box='[x1, y1, x2, y2]')
|
|
1285
|
+
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
1286
|
+
hotkey(key='')
|
|
1287
|
+
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
1288
|
+
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
1289
|
+
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
1290
|
+
finished()
|
|
1291
|
+
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
1292
|
+
|
|
1293
|
+
## Note
|
|
1294
|
+
- Use ${language} in \`Thought\` part.
|
|
1295
|
+
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
1296
|
+
|
|
1297
|
+
## User Instruction
|
|
1138
1298
|
`;
|
|
1139
1299
|
}
|
|
1140
1300
|
|
|
@@ -1360,8 +1520,6 @@ var _img = require('@midscene/shared/img');
|
|
|
1360
1520
|
var _constants = require('@midscene/shared/constants');
|
|
1361
1521
|
|
|
1362
1522
|
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
1363
|
-
var contextFormatIntro = `
|
|
1364
|
-
The user will give you a screenshot and some of the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app. If some text is shown on screenshot but not introduced by the JSON description, use the information you see on screenshot.`;
|
|
1365
1523
|
function systemPromptToExtract() {
|
|
1366
1524
|
return `
|
|
1367
1525
|
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
@@ -1397,7 +1555,6 @@ DATA_DEMAND start:
|
|
|
1397
1555
|
{dataKeys}
|
|
1398
1556
|
|
|
1399
1557
|
{dataQuery}
|
|
1400
|
-
|
|
1401
1558
|
=====================================
|
|
1402
1559
|
DATA_DEMAND ends.
|
|
1403
1560
|
`,
|
|
@@ -1406,14 +1563,12 @@ DATA_DEMAND ends.
|
|
|
1406
1563
|
function systemPromptToAssert() {
|
|
1407
1564
|
return `
|
|
1408
1565
|
${characteristic}
|
|
1409
|
-
|
|
1410
|
-
|
|
1411
|
-
Based on the information you get, Return assertion judgment:
|
|
1566
|
+
User will give an assertion, and some information about the page. Based on the information you get, tell whether the assertion is truthy.
|
|
1412
1567
|
|
|
1413
1568
|
Return in the following JSON format:
|
|
1414
1569
|
{
|
|
1415
1570
|
thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
|
|
1416
|
-
pass: true, // true or false, whether the assertion is
|
|
1571
|
+
pass: true, // true or false, whether the assertion is truthy
|
|
1417
1572
|
}
|
|
1418
1573
|
`;
|
|
1419
1574
|
}
|
|
@@ -1454,7 +1609,7 @@ function truncateText(text, maxLength = 100) {
|
|
|
1454
1609
|
function elementByPositionWithElementInfo(elementsInfo, position) {
|
|
1455
1610
|
_assert2.default.call(void 0, typeof position !== "undefined", "position is required for query");
|
|
1456
1611
|
const matchingElements = elementsInfo.filter((item) => {
|
|
1457
|
-
return item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
|
|
1612
|
+
return item.attributes.nodeType !== _constants.NodeType.CONTAINER && item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height;
|
|
1458
1613
|
});
|
|
1459
1614
|
if (matchingElements.length === 0) {
|
|
1460
1615
|
return void 0;
|
|
@@ -1710,7 +1865,7 @@ The JSON format is as follows:
|
|
|
1710
1865
|
{{
|
|
1711
1866
|
"actions": [
|
|
1712
1867
|
{{
|
|
1713
|
-
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
1868
|
+
"thought": "Reasons for generating this task, and why this task is feasible on this page.", // Use the same language as the user's instruction.
|
|
1714
1869
|
"type": "Tap",
|
|
1715
1870
|
"param": null,
|
|
1716
1871
|
"locate": {sample} | null,
|
|
@@ -1718,8 +1873,8 @@ The JSON format is as follows:
|
|
|
1718
1873
|
// ... more actions
|
|
1719
1874
|
],
|
|
1720
1875
|
"taskWillBeAccomplished": boolean,
|
|
1721
|
-
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null,
|
|
1722
|
-
"error"?: string
|
|
1876
|
+
"furtherPlan": {{ "whatHaveDone": string, "whatToDoNext": string }} | null, // Use the same language as the user's instruction.
|
|
1877
|
+
"error"?: string // Use the same language as the user's instruction.
|
|
1723
1878
|
}}
|
|
1724
1879
|
Here is an example of how to decompose a task:
|
|
1725
1880
|
|
|
@@ -2416,13 +2571,10 @@ async function AiAssert(options) {
|
|
|
2416
2571
|
{
|
|
2417
2572
|
type: "text",
|
|
2418
2573
|
text: `
|
|
2419
|
-
|
|
2420
|
-
|
|
2421
|
-
|
|
2422
|
-
|
|
2423
|
-
=====================================
|
|
2424
|
-
${assertion}
|
|
2425
|
-
=====================================
|
|
2574
|
+
Here is the description of the assertion. Just go ahead:
|
|
2575
|
+
=====================================
|
|
2576
|
+
${assertion}
|
|
2577
|
+
=====================================
|
|
2426
2578
|
`
|
|
2427
2579
|
}
|
|
2428
2580
|
]
|
|
@@ -2485,156 +2637,6 @@ async function plan(userPrompt, opts) {
|
|
|
2485
2637
|
return planFromAI;
|
|
2486
2638
|
}
|
|
2487
2639
|
|
|
2488
|
-
// src/ai-model/prompt/ui-tars-planning.ts
|
|
2489
|
-
var uiTarsPlanningPrompt = `
|
|
2490
|
-
You are a GUI agent. You are given a task and your action history, with screenshots. You need to perform the next action to complete the task.
|
|
2491
|
-
|
|
2492
|
-
## Output Format
|
|
2493
|
-
|
|
2494
|
-
\`\`\`
|
|
2495
|
-
Thought: ...
|
|
2496
|
-
Action: ...
|
|
2497
|
-
\`\`\`
|
|
2498
|
-
|
|
2499
|
-
## Action Space
|
|
2500
|
-
click(start_box='[x1, y1, x2, y2]')
|
|
2501
|
-
left_double(start_box='[x1, y1, x2, y2]')
|
|
2502
|
-
right_single(start_box='[x1, y1, x2, y2]')
|
|
2503
|
-
drag(start_box='[x1, y1, x2, y2]', end_box='[x3, y3, x4, y4]')
|
|
2504
|
-
hotkey(key='')
|
|
2505
|
-
type(content='') #If you want to submit your input, use "\\n" at the end of \`content\`.
|
|
2506
|
-
scroll(start_box='[x1, y1, x2, y2]', direction='down or up or right or left')
|
|
2507
|
-
wait() #Sleep for 5s and take a screenshot to check for any changes.
|
|
2508
|
-
finished()
|
|
2509
|
-
call_user() # Submit the task and call the user when the task is unsolvable, or when you need the user's help.
|
|
2510
|
-
|
|
2511
|
-
## Note
|
|
2512
|
-
- Use Chinese in \`Thought\` part.
|
|
2513
|
-
- Write a small plan and finally summarize your next action (with its target element) in one sentence in \`Thought\` part.
|
|
2514
|
-
|
|
2515
|
-
## User Instruction
|
|
2516
|
-
`;
|
|
2517
|
-
var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
|
|
2518
|
-
function parseActionFromVlm(text, factor = 1e3, mode = "bc") {
|
|
2519
|
-
let reflection = null;
|
|
2520
|
-
let thought = null;
|
|
2521
|
-
let actionStr = "";
|
|
2522
|
-
text = text.trim();
|
|
2523
|
-
if (mode === "bc") {
|
|
2524
|
-
if (text.startsWith("Thought:")) {
|
|
2525
|
-
const thoughtMatch = text.match(/Thought: (.+?)(?=\s*Action:|$)/s);
|
|
2526
|
-
if (thoughtMatch) {
|
|
2527
|
-
thought = thoughtMatch[1].trim();
|
|
2528
|
-
}
|
|
2529
|
-
} else if (text.startsWith("Reflection:")) {
|
|
2530
|
-
const reflectionMatch = text.match(
|
|
2531
|
-
/Reflection: (.+?)Action_Summary: (.+?)(?=\s*Action:|$)/
|
|
2532
|
-
);
|
|
2533
|
-
if (reflectionMatch) {
|
|
2534
|
-
thought = reflectionMatch[2].trim();
|
|
2535
|
-
reflection = reflectionMatch[1].trim();
|
|
2536
|
-
}
|
|
2537
|
-
} else if (text.startsWith("Action_Summary:")) {
|
|
2538
|
-
const summaryMatch = text.match(/Action_Summary: (.+?)(?=\s*Action:|$)/);
|
|
2539
|
-
if (summaryMatch) {
|
|
2540
|
-
thought = summaryMatch[1].trim();
|
|
2541
|
-
}
|
|
2542
|
-
}
|
|
2543
|
-
if (!text.includes("Action:")) {
|
|
2544
|
-
actionStr = text;
|
|
2545
|
-
} else {
|
|
2546
|
-
const actionParts = text.split("Action:");
|
|
2547
|
-
actionStr = actionParts[actionParts.length - 1];
|
|
2548
|
-
}
|
|
2549
|
-
} else if (mode === "o1") {
|
|
2550
|
-
const thoughtMatch = text.match(/<Thought>\s*(.*?)\s*<\/Thought>/);
|
|
2551
|
-
const actionSummaryMatch = text.match(
|
|
2552
|
-
/\nAction_Summary:\s*(.*?)\s*Action:/
|
|
2553
|
-
);
|
|
2554
|
-
const actionMatch = text.match(/\nAction:\s*(.*?)\s*<\/Output>/);
|
|
2555
|
-
const thoughtContent = thoughtMatch ? thoughtMatch[1] : null;
|
|
2556
|
-
const actionSummaryContent = actionSummaryMatch ? actionSummaryMatch[1] : null;
|
|
2557
|
-
const actionContent = actionMatch ? actionMatch[1] : null;
|
|
2558
|
-
thought = `${thoughtContent}
|
|
2559
|
-
<Action_Summary>
|
|
2560
|
-
${actionSummaryContent}`;
|
|
2561
|
-
actionStr = actionContent || "";
|
|
2562
|
-
}
|
|
2563
|
-
const allActions = actionStr.split("\n\n");
|
|
2564
|
-
const actions = [];
|
|
2565
|
-
for (const rawStr of allActions) {
|
|
2566
|
-
const actionInstance = parseAction(rawStr.replace(/\n/g, "\\n").trim());
|
|
2567
|
-
if (!actionInstance) {
|
|
2568
|
-
console.log(`Action can't parse: ${rawStr}`);
|
|
2569
|
-
continue;
|
|
2570
|
-
}
|
|
2571
|
-
const actionType = actionInstance.function;
|
|
2572
|
-
const params = actionInstance.args;
|
|
2573
|
-
const actionInputs = {};
|
|
2574
|
-
for (const [paramName, param] of Object.entries(params)) {
|
|
2575
|
-
if (!param)
|
|
2576
|
-
continue;
|
|
2577
|
-
const trimmedParam = param.trim();
|
|
2578
|
-
actionInputs[paramName.trim()] = trimmedParam;
|
|
2579
|
-
if (paramName.includes("start_box") || paramName.includes("end_box")) {
|
|
2580
|
-
const oriBox = trimmedParam;
|
|
2581
|
-
const numbers = oriBox.replace(/[()]/g, "").split(",");
|
|
2582
|
-
const floatNumbers = numbers.map(
|
|
2583
|
-
(num) => Number.parseFloat(num) / factor
|
|
2584
|
-
);
|
|
2585
|
-
if (floatNumbers.length === 2) {
|
|
2586
|
-
floatNumbers.push(floatNumbers[0], floatNumbers[1]);
|
|
2587
|
-
}
|
|
2588
|
-
actionInputs[paramName.trim()] = JSON.stringify(floatNumbers);
|
|
2589
|
-
}
|
|
2590
|
-
}
|
|
2591
|
-
if (actionType === "finished") {
|
|
2592
|
-
actions.push({
|
|
2593
|
-
reflection,
|
|
2594
|
-
thought,
|
|
2595
|
-
action_type: "finished",
|
|
2596
|
-
action_inputs: {}
|
|
2597
|
-
});
|
|
2598
|
-
} else {
|
|
2599
|
-
actions.push({
|
|
2600
|
-
reflection,
|
|
2601
|
-
thought,
|
|
2602
|
-
action_type: actionType,
|
|
2603
|
-
action_inputs: actionInputs
|
|
2604
|
-
});
|
|
2605
|
-
}
|
|
2606
|
-
}
|
|
2607
|
-
return actions;
|
|
2608
|
-
}
|
|
2609
|
-
function parseAction(actionStr) {
|
|
2610
|
-
try {
|
|
2611
|
-
const functionPattern = /^(\w+)\((.*)\)$/;
|
|
2612
|
-
const match = actionStr.trim().match(functionPattern);
|
|
2613
|
-
if (!match) {
|
|
2614
|
-
throw new Error("Not a function call");
|
|
2615
|
-
}
|
|
2616
|
-
const [_, functionName, argsStr] = match;
|
|
2617
|
-
const kwargs = {};
|
|
2618
|
-
if (argsStr.trim()) {
|
|
2619
|
-
const argPairs = argsStr.match(/([^,']|'[^']*')+/g) || [];
|
|
2620
|
-
for (const pair of argPairs) {
|
|
2621
|
-
const [key, ...valueParts] = pair.split("=");
|
|
2622
|
-
if (!key)
|
|
2623
|
-
continue;
|
|
2624
|
-
const value = valueParts.join("=").trim().replace(/^['"]|['"]$/g, "");
|
|
2625
|
-
kwargs[key.trim()] = value;
|
|
2626
|
-
}
|
|
2627
|
-
}
|
|
2628
|
-
return {
|
|
2629
|
-
function: functionName,
|
|
2630
|
-
args: kwargs
|
|
2631
|
-
};
|
|
2632
|
-
} catch (e) {
|
|
2633
|
-
console.error(`Failed to parse action '${actionStr}': ${e}`);
|
|
2634
|
-
return null;
|
|
2635
|
-
}
|
|
2636
|
-
}
|
|
2637
|
-
|
|
2638
2640
|
// src/ai-model/ui-tars-planning.ts
|
|
2639
2641
|
function capitalize(str) {
|
|
2640
2642
|
return str.charAt(0).toUpperCase() + str.slice(1);
|
package/dist/lib/index.js
CHANGED
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
|
|
7
7
|
|
|
8
8
|
|
|
9
|
-
var
|
|
9
|
+
var _chunk57PVXCD2js = require('./chunk-57PVXCD2.js');
|
|
10
10
|
|
|
11
11
|
|
|
12
12
|
|
|
@@ -17,7 +17,7 @@ var _chunk6MKLXHAYjs = require('./chunk-6MKLXHAY.js');
|
|
|
17
17
|
|
|
18
18
|
|
|
19
19
|
|
|
20
|
-
var
|
|
20
|
+
var _chunkCERQVVPJjs = require('./chunk-CERQVVPJ.js');
|
|
21
21
|
|
|
22
22
|
|
|
23
23
|
|
|
@@ -168,7 +168,7 @@ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
|
|
|
168
168
|
}
|
|
169
169
|
dump() {
|
|
170
170
|
const dumpData = {
|
|
171
|
-
sdkVersion:
|
|
171
|
+
sdkVersion: _chunk57PVXCD2js.getVersion.call(void 0, ),
|
|
172
172
|
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || "",
|
|
173
173
|
logTime: Date.now(),
|
|
174
174
|
name: this.name,
|
|
@@ -191,14 +191,14 @@ var logFileName = "";
|
|
|
191
191
|
var logContent = [];
|
|
192
192
|
var logIdIndexMap = {};
|
|
193
193
|
var { pid } = process;
|
|
194
|
-
var logFileExt =
|
|
194
|
+
var logFileExt = _chunk57PVXCD2js.insightDumpFileExt;
|
|
195
195
|
var ifInBrowser = typeof window !== "undefined";
|
|
196
196
|
function writeInsightDump(data, logId, dumpSubscriber) {
|
|
197
|
-
const logDir =
|
|
197
|
+
const logDir = _chunk57PVXCD2js.getLogDir.call(void 0, );
|
|
198
198
|
_assert2.default.call(void 0, logDir, "logDir should be set before writing dump file");
|
|
199
199
|
const id = logId || _utils.uuid.call(void 0, );
|
|
200
200
|
const baseData = {
|
|
201
|
-
sdkVersion:
|
|
201
|
+
sdkVersion: _chunk57PVXCD2js.getVersion.call(void 0, ),
|
|
202
202
|
logTime: Date.now(),
|
|
203
203
|
model_name: _chunkJP3JBDZSjs.getAIConfig.call(void 0, _chunkJP3JBDZSjs.MIDSCENE_MODEL_NAME) || ""
|
|
204
204
|
};
|
|
@@ -208,7 +208,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
208
208
|
...data
|
|
209
209
|
};
|
|
210
210
|
dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
|
|
211
|
-
const dataString =
|
|
211
|
+
const dataString = _chunk57PVXCD2js.stringifyDumpData.call(void 0, finalData, 2);
|
|
212
212
|
if (typeof logIdIndexMap[id] === "number") {
|
|
213
213
|
logContent[logIdIndexMap[id]] = dataString;
|
|
214
214
|
} else {
|
|
@@ -222,7 +222,7 @@ function writeInsightDump(data, logId, dumpSubscriber) {
|
|
|
222
222
|
logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
|
|
223
223
|
}
|
|
224
224
|
}
|
|
225
|
-
|
|
225
|
+
_chunk57PVXCD2js.writeLogFile.call(void 0, {
|
|
226
226
|
fileName: logFileName,
|
|
227
227
|
fileExt: logFileExt,
|
|
228
228
|
fileContent: `[
|
|
@@ -237,7 +237,7 @@ ${logContent.join(",\n")}
|
|
|
237
237
|
// src/insight/index.ts
|
|
238
238
|
var Insight = class {
|
|
239
239
|
constructor(context, opt) {
|
|
240
|
-
this.aiVendorFn =
|
|
240
|
+
this.aiVendorFn = _chunkCERQVVPJjs.callAiFn;
|
|
241
241
|
_assert2.default.call(void 0, context, "context is required for Insight");
|
|
242
242
|
if (typeof context === "function") {
|
|
243
243
|
this.contextRetrieverFn = context;
|
|
@@ -263,7 +263,7 @@ var Insight = class {
|
|
|
263
263
|
this.onceDumpUpdatedFn = void 0;
|
|
264
264
|
const context = await this.contextRetrieverFn("locate");
|
|
265
265
|
const startTime = Date.now();
|
|
266
|
-
const { parseResult, elementById, rawResponse, usage } = await
|
|
266
|
+
const { parseResult, elementById, rawResponse, usage } = await _chunkCERQVVPJjs.AiInspectElement.call(void 0, {
|
|
267
267
|
callAI: callAI || this.aiVendorFn,
|
|
268
268
|
context,
|
|
269
269
|
multi: Boolean(multi),
|
|
@@ -346,7 +346,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
346
346
|
this.onceDumpUpdatedFn = void 0;
|
|
347
347
|
const context = await this.contextRetrieverFn("extract");
|
|
348
348
|
const startTime = Date.now();
|
|
349
|
-
const { parseResult, elementById } = await
|
|
349
|
+
const { parseResult, elementById } = await _chunkCERQVVPJjs.AiExtractElementInfo.call(void 0, {
|
|
350
350
|
context,
|
|
351
351
|
dataQuery: dataDemand
|
|
352
352
|
});
|
|
@@ -400,7 +400,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
400
400
|
this.onceDumpUpdatedFn = void 0;
|
|
401
401
|
const context = await this.contextRetrieverFn("assert");
|
|
402
402
|
const startTime = Date.now();
|
|
403
|
-
const assertResult = await
|
|
403
|
+
const assertResult = await _chunkCERQVVPJjs.AiAssert.call(void 0, {
|
|
404
404
|
assertion,
|
|
405
405
|
context
|
|
406
406
|
});
|
|
@@ -448,4 +448,4 @@ var src_default = Insight;
|
|
|
448
448
|
|
|
449
449
|
|
|
450
450
|
|
|
451
|
-
exports.AIResponseFormat =
|
|
451
|
+
exports.AIResponseFormat = _chunkCERQVVPJjs.AIResponseFormat; exports.BaseElement = _chunkCERQVVPJjs.BaseElement; exports.Executor = Executor; exports.Insight = Insight; exports.UIContext = _chunkCERQVVPJjs.UIContext; exports.default = src_default; exports.getLogDirByType = _chunk57PVXCD2js.getLogDirByType; exports.getVersion = _chunk57PVXCD2js.getVersion; exports.plan = _chunkCERQVVPJjs.plan; exports.setLogDir = _chunk57PVXCD2js.setLogDir; exports.transformElementPositionToId = _chunkCERQVVPJjs.transformElementPositionToId;
|
package/dist/lib/utils.js
CHANGED
|
@@ -16,7 +16,7 @@
|
|
|
16
16
|
|
|
17
17
|
|
|
18
18
|
|
|
19
|
-
var
|
|
19
|
+
var _chunk57PVXCD2js = require('./chunk-57PVXCD2.js');
|
|
20
20
|
require('./chunk-JP3JBDZS.js');
|
|
21
21
|
require('./chunk-YSQDPG26.js');
|
|
22
22
|
|
|
@@ -37,4 +37,4 @@ require('./chunk-YSQDPG26.js');
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
|
|
40
|
-
exports.getLogDir =
|
|
40
|
+
exports.getLogDir = _chunk57PVXCD2js.getLogDir; exports.getLogDirByType = _chunk57PVXCD2js.getLogDirByType; exports.getTmpDir = _chunk57PVXCD2js.getTmpDir; exports.getTmpFile = _chunk57PVXCD2js.getTmpFile; exports.getVersion = _chunk57PVXCD2js.getVersion; exports.groupedActionDumpFileExt = _chunk57PVXCD2js.groupedActionDumpFileExt; exports.insightDumpFileExt = _chunk57PVXCD2js.insightDumpFileExt; exports.overlapped = _chunk57PVXCD2js.overlapped; exports.replaceStringWithFirstAppearance = _chunk57PVXCD2js.replaceStringWithFirstAppearance; exports.replacerForPageObject = _chunk57PVXCD2js.replacerForPageObject; exports.reportHTMLContent = _chunk57PVXCD2js.reportHTMLContent; exports.setLogDir = _chunk57PVXCD2js.setLogDir; exports.sleep = _chunk57PVXCD2js.sleep; exports.stringifyDumpData = _chunk57PVXCD2js.stringifyDumpData; exports.uploadTestInfoToServer = _chunk57PVXCD2js.uploadTestInfoToServer; exports.writeDumpReport = _chunk57PVXCD2js.writeDumpReport; exports.writeLogFile = _chunk57PVXCD2js.writeLogFile;
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@midscene/core",
|
|
3
3
|
"description": "Automate browser actions, extract data, and perform assertions using AI. It offers JavaScript SDK, Chrome extension, and support for scripting in YAML. See https://midscenejs.com/ for details.",
|
|
4
|
-
"version": "0.9.
|
|
4
|
+
"version": "0.9.3-beta-20250116143806.0",
|
|
5
5
|
"repository": "https://github.com/web-infra-dev/midscene",
|
|
6
6
|
"homepage": "https://midscenejs.com/",
|
|
7
7
|
"jsnext:source": "./src/index.ts",
|
|
@@ -41,7 +41,7 @@
|
|
|
41
41
|
"@langchain/core": "0.3.26",
|
|
42
42
|
"socks-proxy-agent": "8.0.4",
|
|
43
43
|
"openai": "4.57.1",
|
|
44
|
-
"@midscene/shared": "0.9.
|
|
44
|
+
"@midscene/shared": "0.9.3-beta-20250116143806.0"
|
|
45
45
|
},
|
|
46
46
|
"devDependencies": {
|
|
47
47
|
"@modern-js/module-tools": "2.60.6",
|
|
@@ -71,6 +71,7 @@
|
|
|
71
71
|
"test:ai": "AITEST=true npm run test",
|
|
72
72
|
"computer": "TEST_COMPUTER=true npm run test:ai -- tests/ai/evaluate/computer.test.ts",
|
|
73
73
|
"evaluate": "npm run test:ai -- tests/ai/evaluate/inspect.test.ts",
|
|
74
|
+
"evaluate:assertion": "npm run test:ai -- tests/ai/evaluate/assertion.test.ts",
|
|
74
75
|
"prompt": "npm run test:ai -- tests/ai/parse-action.test.ts",
|
|
75
76
|
"evaluate:update": "UPDATE_AI_DATA=true npm run test:ai -- tests/ai/evaluate/inspect.test.ts"
|
|
76
77
|
}
|