@midscene/core 0.1.4 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +1 -1
- package/README.md +7 -0
- package/dist/es/ai-model.js +52 -30
- package/dist/es/image.js +16 -10
- package/dist/es/index.js +594 -476
- package/dist/es/utils.js +14 -9
- package/dist/lib/ai-model.js +58 -42
- package/dist/lib/image.js +24 -24
- package/dist/lib/index.js +611 -499
- package/dist/lib/utils.js +32 -33
- package/dist/types/ai-model.d.ts +3 -5
- package/dist/types/image.d.ts +1 -1
- package/dist/types/index.d.ts +19 -19
- package/dist/types/{types-1f7912d5.d.ts → types-2c92867c.d.ts} +33 -15
- package/dist/types/{util-3a13ce3d.d.ts → util-3931e76e.d.ts} +1 -1
- package/dist/types/utils.d.ts +1 -1
- package/package.json +5 -4
package/dist/es/index.js
CHANGED
|
@@ -1008,41 +1008,21 @@ var require_dist = __commonJS({
|
|
|
1008
1008
|
}
|
|
1009
1009
|
});
|
|
1010
1010
|
|
|
1011
|
-
// src/
|
|
1012
|
-
import
|
|
1013
|
-
|
|
1014
|
-
// src/ai-model/prompt/util.ts
|
|
1015
|
-
import assert3 from "assert";
|
|
1016
|
-
|
|
1017
|
-
// src/image/info.ts
|
|
1018
|
-
import assert from "assert";
|
|
1019
|
-
import { Buffer as Buffer2 } from "buffer";
|
|
1020
|
-
import { readFileSync } from "fs";
|
|
1021
|
-
import Sharp from "sharp";
|
|
1022
|
-
async function imageInfo(image) {
|
|
1023
|
-
const { width, height } = await Sharp(image).metadata();
|
|
1024
|
-
assert(width && height, `invalid image: ${image}`);
|
|
1025
|
-
return { width, height };
|
|
1026
|
-
}
|
|
1027
|
-
async function imageInfoOfBase64(imageBase64) {
|
|
1028
|
-
const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
|
|
1029
|
-
return imageInfo(Buffer2.from(base64Data, "base64"));
|
|
1030
|
-
}
|
|
1031
|
-
|
|
1032
|
-
// src/image/transform.ts
|
|
1033
|
-
import { Buffer as Buffer3 } from "buffer";
|
|
1034
|
-
import Sharp2 from "sharp";
|
|
1035
|
-
|
|
1036
|
-
// src/image/visualization.ts
|
|
1037
|
-
import { Buffer as Buffer4 } from "buffer";
|
|
1038
|
-
import Sharp3 from "sharp";
|
|
1011
|
+
// src/action/executor.ts
|
|
1012
|
+
import assert2 from "assert";
|
|
1039
1013
|
|
|
1040
1014
|
// src/utils.ts
|
|
1015
|
+
import assert from "assert";
|
|
1016
|
+
import { randomUUID } from "crypto";
|
|
1017
|
+
import {
|
|
1018
|
+
copyFileSync,
|
|
1019
|
+
existsSync,
|
|
1020
|
+
mkdirSync,
|
|
1021
|
+
readFileSync,
|
|
1022
|
+
writeFileSync
|
|
1023
|
+
} from "fs";
|
|
1041
1024
|
import { tmpdir } from "os";
|
|
1042
1025
|
import { basename, join } from "path";
|
|
1043
|
-
import { copyFileSync, existsSync, mkdirSync, readFileSync as readFileSync2, writeFileSync } from "fs";
|
|
1044
|
-
import { randomUUID } from "crypto";
|
|
1045
|
-
import assert2 from "assert";
|
|
1046
1026
|
var pkg;
|
|
1047
1027
|
function getPkgInfo() {
|
|
1048
1028
|
if (pkg) {
|
|
@@ -1055,15 +1035,14 @@ function getPkgInfo() {
|
|
|
1055
1035
|
pkgJsonFile = join(__dirname, "../../../package.json");
|
|
1056
1036
|
}
|
|
1057
1037
|
if (pkgJsonFile) {
|
|
1058
|
-
const { name, version } = JSON.parse(
|
|
1038
|
+
const { name, version } = JSON.parse(readFileSync(pkgJsonFile, "utf-8"));
|
|
1059
1039
|
pkg = { name, version };
|
|
1060
1040
|
return pkg;
|
|
1061
|
-
} else {
|
|
1062
|
-
return {
|
|
1063
|
-
name: "midscene-unknown-page-name",
|
|
1064
|
-
version: "0.0.0"
|
|
1065
|
-
};
|
|
1066
1041
|
}
|
|
1042
|
+
return {
|
|
1043
|
+
name: "midscene-unknown-page-name",
|
|
1044
|
+
version: "0.0.0"
|
|
1045
|
+
};
|
|
1067
1046
|
}
|
|
1068
1047
|
var logDir = join(process.cwd(), "./midscene_run/");
|
|
1069
1048
|
var logEnvReady = false;
|
|
@@ -1084,18 +1063,18 @@ function writeDumpFile(opts) {
|
|
|
1084
1063
|
mkdirSync(targetDir, { recursive: true });
|
|
1085
1064
|
}
|
|
1086
1065
|
if (!logEnvReady) {
|
|
1087
|
-
|
|
1066
|
+
assert(targetDir, "logDir should be set before writing dump file");
|
|
1088
1067
|
const gitIgnorePath = join(targetDir, "../../.gitignore");
|
|
1089
1068
|
let gitIgnoreContent = "";
|
|
1090
1069
|
if (existsSync(gitIgnorePath)) {
|
|
1091
|
-
gitIgnoreContent =
|
|
1070
|
+
gitIgnoreContent = readFileSync(gitIgnorePath, "utf-8");
|
|
1092
1071
|
}
|
|
1093
1072
|
const logDirName = basename(logDir);
|
|
1094
1073
|
if (!gitIgnoreContent.includes(`${logDirName}/`)) {
|
|
1095
1074
|
writeFileSync(
|
|
1096
1075
|
gitIgnorePath,
|
|
1097
1076
|
`${gitIgnoreContent}
|
|
1098
|
-
#
|
|
1077
|
+
# Midscene.js dump files
|
|
1099
1078
|
${logDirName}/report
|
|
1100
1079
|
${logDirName}/dump-logger
|
|
1101
1080
|
`,
|
|
@@ -1112,286 +1091,157 @@ ${logDirName}/dump-logger
|
|
|
1112
1091
|
return filePath;
|
|
1113
1092
|
}
|
|
1114
1093
|
|
|
1115
|
-
// src/
|
|
1116
|
-
var
|
|
1117
|
-
|
|
1118
|
-
|
|
1119
|
-
|
|
1120
|
-
|
|
1121
|
-
|
|
1122
|
-
|
|
1123
|
-
|
|
1124
|
-
|
|
1125
|
-
|
|
1126
|
-
|
|
1127
|
-
|
|
1128
|
-
|
|
1129
|
-
* Analyze the page in a top-to-bottom and left-to-right order.
|
|
1130
|
-
* The evidence indicates a separate section, for example
|
|
1131
|
-
- The background color of certain parts of the page changes.
|
|
1132
|
-
- A section of a page includes a title.
|
|
1133
|
-
* Provide the following data for each of the UI section you found.
|
|
1134
|
-
{
|
|
1135
|
-
"name": "name of the section",
|
|
1136
|
-
"description": "briefly summarize the key content or usage of this section.",
|
|
1137
|
-
"sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
|
|
1138
|
-
"textIds": ["5", "6", "7"], // ids of all text elements in this section
|
|
1094
|
+
// src/action/executor.ts
|
|
1095
|
+
var Executor = class {
|
|
1096
|
+
constructor(name, description, tasks) {
|
|
1097
|
+
__publicField(this, "name");
|
|
1098
|
+
__publicField(this, "description");
|
|
1099
|
+
__publicField(this, "tasks");
|
|
1100
|
+
// status of executor
|
|
1101
|
+
__publicField(this, "status");
|
|
1102
|
+
__publicField(this, "errorMsg");
|
|
1103
|
+
__publicField(this, "dumpFileName");
|
|
1104
|
+
this.status = tasks && tasks.length > 0 ? "pending" : "init";
|
|
1105
|
+
this.name = name;
|
|
1106
|
+
this.description = description;
|
|
1107
|
+
this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
|
|
1139
1108
|
}
|
|
1140
|
-
|
|
1141
|
-
|
|
1142
|
-
|
|
1143
|
-
|
|
1144
|
-
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
|
|
1145
|
-
* There may be some special commands in DATA_DEMAND, please pay extra attention
|
|
1146
|
-
- ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
|
|
1147
|
-
function promptsOfSectionQuery(constraints) {
|
|
1148
|
-
if (!constraints.length) {
|
|
1149
|
-
return "";
|
|
1109
|
+
markTaskAsPending(task) {
|
|
1110
|
+
return __spreadValues({
|
|
1111
|
+
status: "pending"
|
|
1112
|
+
}, task);
|
|
1150
1113
|
}
|
|
1151
|
-
|
|
1152
|
-
|
|
1153
|
-
|
|
1154
|
-
|
|
1155
|
-
"either `name` or `description` is required to define a section constraint"
|
|
1114
|
+
async append(task) {
|
|
1115
|
+
assert2(
|
|
1116
|
+
this.status !== "error",
|
|
1117
|
+
"executor is in error state, cannot append task"
|
|
1156
1118
|
);
|
|
1157
|
-
|
|
1158
|
-
|
|
1159
|
-
|
|
1160
|
-
|
|
1161
|
-
return basic;
|
|
1162
|
-
};
|
|
1163
|
-
return `${instruction}
|
|
1164
|
-
${constraints.map(singleSection).join("\n")}`;
|
|
1165
|
-
}
|
|
1166
|
-
function systemPromptToExtract(dataQuery, sections) {
|
|
1167
|
-
const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
|
|
1168
|
-
const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
|
|
1169
|
-
const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
|
|
1170
|
-
return `
|
|
1171
|
-
${characteristic}
|
|
1172
|
-
${contextFormatIntro}
|
|
1173
|
-
|
|
1174
|
-
You have the following skills:
|
|
1175
|
-
${allSectionNames.length ? skillSegment : ""}
|
|
1176
|
-
${skillExtractData}
|
|
1177
|
-
|
|
1178
|
-
Now, do the following jobs:
|
|
1179
|
-
${sectionFindingPrompt}
|
|
1180
|
-
Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
|
|
1181
|
-
DATA_DEMAND start:
|
|
1182
|
-
${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
|
|
1183
|
-
${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
|
|
1184
|
-
DATA_DEMAND ends.
|
|
1185
|
-
|
|
1186
|
-
Return in the following JSON format:
|
|
1187
|
-
{
|
|
1188
|
-
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
1189
|
-
${sectionReturnFormat}
|
|
1190
|
-
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
|
|
1191
|
-
errors?: [], // string[], error message if any
|
|
1192
|
-
}
|
|
1193
|
-
`;
|
|
1194
|
-
}
|
|
1195
|
-
function describeSize(size) {
|
|
1196
|
-
return `${size.width} x ${size.height}`;
|
|
1197
|
-
}
|
|
1198
|
-
function truncateText(text) {
|
|
1199
|
-
const maxLength = 50;
|
|
1200
|
-
if (text && text.length > maxLength) {
|
|
1201
|
-
return `${text.slice(0, maxLength)}...`;
|
|
1202
|
-
}
|
|
1203
|
-
return text;
|
|
1204
|
-
}
|
|
1205
|
-
async function describeUserPage(context) {
|
|
1206
|
-
const { screenshotBase64 } = context;
|
|
1207
|
-
let width;
|
|
1208
|
-
let height;
|
|
1209
|
-
if (context.size) {
|
|
1210
|
-
({ width, height } = context.size);
|
|
1211
|
-
} else {
|
|
1212
|
-
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
1213
|
-
({ width, height } = imgSize);
|
|
1214
|
-
}
|
|
1215
|
-
const elementsInfo = context.content;
|
|
1216
|
-
const idElementMap = {};
|
|
1217
|
-
elementsInfo.forEach((item) => {
|
|
1218
|
-
idElementMap[item.id] = item;
|
|
1219
|
-
return __spreadValues({}, item);
|
|
1220
|
-
});
|
|
1221
|
-
const elementInfosDescription = cropfieldInformation(elementsInfo);
|
|
1222
|
-
return {
|
|
1223
|
-
description: `
|
|
1224
|
-
{
|
|
1225
|
-
// The size of the page
|
|
1226
|
-
"pageSize": ${describeSize({ width, height })},
|
|
1227
|
-
|
|
1228
|
-
|
|
1229
|
-
// json description of the element
|
|
1230
|
-
"elementInfos": ${JSON.stringify(elementInfosDescription)}
|
|
1231
|
-
}`,
|
|
1232
|
-
elementById(id) {
|
|
1233
|
-
assert3(typeof id !== "undefined", "id is required for query");
|
|
1234
|
-
const item = idElementMap[`${id}`];
|
|
1235
|
-
return item;
|
|
1236
|
-
}
|
|
1237
|
-
};
|
|
1238
|
-
}
|
|
1239
|
-
function cropfieldInformation(elementsInfo) {
|
|
1240
|
-
const elementInfosDescription = elementsInfo.map((item) => {
|
|
1241
|
-
const { id, attributes = {}, rect, content } = item;
|
|
1242
|
-
const tailorContent = truncateText(content);
|
|
1243
|
-
const tailorAttributes = Object.keys(attributes).reduce((res, currentKey) => {
|
|
1244
|
-
const attributeVal = attributes[currentKey];
|
|
1245
|
-
res[currentKey] = truncateText(attributeVal);
|
|
1246
|
-
return res;
|
|
1247
|
-
}, {});
|
|
1248
|
-
return {
|
|
1249
|
-
id,
|
|
1250
|
-
attributes: tailorAttributes,
|
|
1251
|
-
rect,
|
|
1252
|
-
content: tailorContent
|
|
1253
|
-
};
|
|
1254
|
-
});
|
|
1255
|
-
return JSON.stringify(elementInfosDescription);
|
|
1256
|
-
}
|
|
1257
|
-
function retrieveElement(prompt, opt) {
|
|
1258
|
-
if (opt == null ? void 0 : opt.multi) {
|
|
1259
|
-
return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
|
|
1260
|
-
}
|
|
1261
|
-
return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
|
|
1262
|
-
}
|
|
1263
|
-
function ifElementTypeResponse(response) {
|
|
1264
|
-
if (typeof response !== "string") {
|
|
1265
|
-
return false;
|
|
1266
|
-
}
|
|
1267
|
-
return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
|
|
1268
|
-
}
|
|
1269
|
-
function splitElementResponse(response) {
|
|
1270
|
-
const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
|
|
1271
|
-
if (response.startsWith(oneElementSplitter)) {
|
|
1272
|
-
const id = response.slice(oneElementSplitter.length);
|
|
1273
|
-
if (id.indexOf(",") >= 0) {
|
|
1274
|
-
console.warn(`unexpected comma in one element response: ${id}`);
|
|
1275
|
-
}
|
|
1276
|
-
return id ? id : null;
|
|
1277
|
-
}
|
|
1278
|
-
const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
|
|
1279
|
-
if (response.startsWith(elementsSplitter)) {
|
|
1280
|
-
const idsString = response.slice(elementsSplitter.length);
|
|
1281
|
-
if (!idsString) {
|
|
1282
|
-
return [];
|
|
1119
|
+
if (Array.isArray(task)) {
|
|
1120
|
+
this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
|
|
1121
|
+
} else {
|
|
1122
|
+
this.tasks.push(this.markTaskAsPending(task));
|
|
1283
1123
|
}
|
|
1284
|
-
|
|
1285
|
-
|
|
1286
|
-
return null;
|
|
1287
|
-
}
|
|
1288
|
-
function retrieveSection(prompt) {
|
|
1289
|
-
return `${SECTION_MATCHER_FLAG}${prompt}`;
|
|
1290
|
-
}
|
|
1291
|
-
function extractSectionQuery(input) {
|
|
1292
|
-
if (typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG)) {
|
|
1293
|
-
return input.slice(SECTION_MATCHER_FLAG.length);
|
|
1294
|
-
}
|
|
1295
|
-
return false;
|
|
1296
|
-
}
|
|
1297
|
-
|
|
1298
|
-
// src/insight/utils.ts
|
|
1299
|
-
import { existsSync as existsSync2 } from "fs";
|
|
1300
|
-
import { join as join2 } from "path";
|
|
1301
|
-
import { randomUUID as randomUUID2 } from "crypto";
|
|
1302
|
-
import assert4 from "assert";
|
|
1303
|
-
var logFileName = "";
|
|
1304
|
-
var logContent = [];
|
|
1305
|
-
var logIdIndexMap = {};
|
|
1306
|
-
var { pid } = process;
|
|
1307
|
-
var logFileExt = insightDumpFileExt;
|
|
1308
|
-
function writeInsightDump(data, logId, dumpSubscriber) {
|
|
1309
|
-
const logDir2 = getDumpDir();
|
|
1310
|
-
assert4(logDir2, "logDir should be set before writing dump file");
|
|
1311
|
-
const id = logId || randomUUID2();
|
|
1312
|
-
const baseData = {
|
|
1313
|
-
sdkVersion: getPkgInfo().version,
|
|
1314
|
-
logTime: Date.now()
|
|
1315
|
-
};
|
|
1316
|
-
const finalData = __spreadValues(__spreadValues({
|
|
1317
|
-
logId: id
|
|
1318
|
-
}, baseData), data);
|
|
1319
|
-
dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
|
|
1320
|
-
if (!logFileName) {
|
|
1321
|
-
logFileName = `pid_${pid}_${baseData.logTime}`;
|
|
1322
|
-
while (existsSync2(join2(logDir2, `${logFileName}.${logFileExt}`))) {
|
|
1323
|
-
logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
|
|
1124
|
+
if (this.status !== "running") {
|
|
1125
|
+
this.status = "pending";
|
|
1324
1126
|
}
|
|
1325
1127
|
}
|
|
1326
|
-
|
|
1327
|
-
|
|
1328
|
-
|
|
1329
|
-
|
|
1330
|
-
|
|
1331
|
-
logIdIndexMap[id] = length - 1;
|
|
1332
|
-
}
|
|
1333
|
-
writeDumpFile({
|
|
1334
|
-
fileName: logFileName,
|
|
1335
|
-
fileExt: logFileExt,
|
|
1336
|
-
fileContent: `[
|
|
1337
|
-
${logContent.join(",\n")}
|
|
1338
|
-
]`
|
|
1339
|
-
});
|
|
1340
|
-
return id;
|
|
1341
|
-
}
|
|
1342
|
-
function idsIntoElements(ids, elementById) {
|
|
1343
|
-
return ids.reduce((acc, id) => {
|
|
1344
|
-
const element = elementById(id);
|
|
1345
|
-
if (element) {
|
|
1346
|
-
acc.push(element);
|
|
1347
|
-
} else {
|
|
1348
|
-
console.warn(`element not found by id: ${id}`);
|
|
1128
|
+
async flush() {
|
|
1129
|
+
if (this.status === "init" && this.tasks.length > 0) {
|
|
1130
|
+
console.warn(
|
|
1131
|
+
"illegal state for executor, status is init but tasks are not empty"
|
|
1132
|
+
);
|
|
1349
1133
|
}
|
|
1350
|
-
|
|
1351
|
-
|
|
1352
|
-
|
|
1353
|
-
|
|
1354
|
-
|
|
1355
|
-
|
|
1356
|
-
|
|
1357
|
-
|
|
1358
|
-
data[key] = elementsById(value);
|
|
1359
|
-
} else if (Array.isArray(value)) {
|
|
1360
|
-
const newValue = value.map((id) => ifMeet(id) ? elementsById(id) : id);
|
|
1361
|
-
data[key] = newValue;
|
|
1134
|
+
assert2(this.status !== "running", "executor is already running");
|
|
1135
|
+
assert2(this.status !== "completed", "executor is already completed");
|
|
1136
|
+
assert2(this.status !== "error", "executor is in error state");
|
|
1137
|
+
const nextPendingIndex = this.tasks.findIndex(
|
|
1138
|
+
(task) => task.status === "pending"
|
|
1139
|
+
);
|
|
1140
|
+
if (nextPendingIndex < 0) {
|
|
1141
|
+
return;
|
|
1362
1142
|
}
|
|
1363
|
-
|
|
1364
|
-
|
|
1365
|
-
|
|
1366
|
-
|
|
1367
|
-
|
|
1368
|
-
|
|
1369
|
-
|
|
1370
|
-
|
|
1371
|
-
|
|
1372
|
-
|
|
1373
|
-
|
|
1374
|
-
|
|
1375
|
-
|
|
1376
|
-
|
|
1377
|
-
|
|
1378
|
-
|
|
1379
|
-
|
|
1380
|
-
|
|
1381
|
-
|
|
1382
|
-
|
|
1383
|
-
|
|
1384
|
-
|
|
1385
|
-
|
|
1386
|
-
|
|
1387
|
-
|
|
1388
|
-
|
|
1389
|
-
|
|
1390
|
-
|
|
1143
|
+
this.status = "running";
|
|
1144
|
+
let taskIndex = nextPendingIndex;
|
|
1145
|
+
let successfullyCompleted = true;
|
|
1146
|
+
let errorMsg = "";
|
|
1147
|
+
let previousFindOutput;
|
|
1148
|
+
while (taskIndex < this.tasks.length) {
|
|
1149
|
+
const task = this.tasks[taskIndex];
|
|
1150
|
+
assert2(
|
|
1151
|
+
task.status === "pending",
|
|
1152
|
+
`task status should be pending, but got: ${task.status}`
|
|
1153
|
+
);
|
|
1154
|
+
task.timing = {
|
|
1155
|
+
start: Date.now()
|
|
1156
|
+
};
|
|
1157
|
+
try {
|
|
1158
|
+
task.status = "running";
|
|
1159
|
+
assert2(
|
|
1160
|
+
["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
|
|
1161
|
+
`unsupported task type: ${task.type}`
|
|
1162
|
+
);
|
|
1163
|
+
const { executor, param } = task;
|
|
1164
|
+
assert2(executor, `executor is required for task type: ${task.type}`);
|
|
1165
|
+
let returnValue;
|
|
1166
|
+
const executorContext = {
|
|
1167
|
+
task,
|
|
1168
|
+
element: previousFindOutput == null ? void 0 : previousFindOutput.element
|
|
1169
|
+
};
|
|
1170
|
+
if (task.type === "Insight") {
|
|
1171
|
+
assert2(
|
|
1172
|
+
task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
|
|
1173
|
+
`unsupported insight subType: ${task.subType}`
|
|
1174
|
+
);
|
|
1175
|
+
returnValue = await task.executor(param, executorContext);
|
|
1176
|
+
if (task.subType === "Locate") {
|
|
1177
|
+
previousFindOutput = returnValue == null ? void 0 : returnValue.output;
|
|
1178
|
+
}
|
|
1179
|
+
} else if (task.type === "Action" || task.type === "Planning") {
|
|
1180
|
+
returnValue = await task.executor(param, executorContext);
|
|
1181
|
+
} else {
|
|
1182
|
+
console.warn(
|
|
1183
|
+
`unsupported task type: ${task.type}, will try to execute it directly`
|
|
1184
|
+
);
|
|
1185
|
+
returnValue = await task.executor(param, executorContext);
|
|
1186
|
+
}
|
|
1187
|
+
Object.assign(task, returnValue);
|
|
1188
|
+
task.status = "success";
|
|
1189
|
+
task.timing.end = Date.now();
|
|
1190
|
+
task.timing.cost = task.timing.end - task.timing.start;
|
|
1191
|
+
taskIndex++;
|
|
1192
|
+
} catch (e) {
|
|
1193
|
+
successfullyCompleted = false;
|
|
1194
|
+
task.status = "fail";
|
|
1195
|
+
errorMsg = `${e == null ? void 0 : e.message}
|
|
1196
|
+
${e == null ? void 0 : e.stack}`;
|
|
1197
|
+
task.error = errorMsg;
|
|
1198
|
+
task.timing.end = Date.now();
|
|
1199
|
+
task.timing.cost = task.timing.end - task.timing.start;
|
|
1200
|
+
this.errorMsg = errorMsg;
|
|
1201
|
+
break;
|
|
1202
|
+
}
|
|
1203
|
+
}
|
|
1204
|
+
for (let i = taskIndex + 1; i < this.tasks.length; i++) {
|
|
1205
|
+
this.tasks[i].status = "cancelled";
|
|
1206
|
+
}
|
|
1207
|
+
if (successfullyCompleted) {
|
|
1208
|
+
this.status = "completed";
|
|
1209
|
+
if (this.tasks.length) {
|
|
1210
|
+
return this.tasks[this.tasks.length - 1].output;
|
|
1211
|
+
}
|
|
1212
|
+
} else {
|
|
1213
|
+
this.status = "error";
|
|
1214
|
+
throw new Error(`executor failed: ${errorMsg}`);
|
|
1215
|
+
}
|
|
1216
|
+
}
|
|
1217
|
+
dump() {
|
|
1218
|
+
const dumpData = {
|
|
1219
|
+
sdkVersion: getPkgInfo().version,
|
|
1220
|
+
logTime: Date.now(),
|
|
1221
|
+
name: this.name,
|
|
1222
|
+
description: this.description,
|
|
1223
|
+
tasks: this.tasks
|
|
1224
|
+
};
|
|
1225
|
+
return dumpData;
|
|
1226
|
+
}
|
|
1227
|
+
};
|
|
1228
|
+
|
|
1229
|
+
// src/insight/index.ts
|
|
1230
|
+
import assert8 from "assert";
|
|
1391
1231
|
|
|
1392
1232
|
// src/ai-model/openai.ts
|
|
1393
|
-
import
|
|
1394
|
-
|
|
1233
|
+
import assert3 from "assert";
|
|
1234
|
+
|
|
1235
|
+
// src/types.ts
|
|
1236
|
+
var BaseElement = class {
|
|
1237
|
+
};
|
|
1238
|
+
var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
|
|
1239
|
+
AIResponseFormat2["JSON"] = "json_object";
|
|
1240
|
+
AIResponseFormat2["TEXT"] = "text";
|
|
1241
|
+
return AIResponseFormat2;
|
|
1242
|
+
})(AIResponseFormat || {});
|
|
1243
|
+
var UIContext = class {
|
|
1244
|
+
};
|
|
1395
1245
|
|
|
1396
1246
|
// ../../node_modules/.pnpm/langsmith@0.1.36_openai@4.47.1/node_modules/langsmith/dist/traceable.js
|
|
1397
1247
|
import { AsyncLocalStorage } from "async_hooks";
|
|
@@ -4605,23 +4455,13 @@ var wrapOpenAI = (openai, options) => {
|
|
|
4605
4455
|
return openai;
|
|
4606
4456
|
};
|
|
4607
4457
|
|
|
4608
|
-
// src/types.ts
|
|
4609
|
-
var BaseElement = class {
|
|
4610
|
-
};
|
|
4611
|
-
var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
|
|
4612
|
-
AIResponseFormat2["JSON"] = "json_object";
|
|
4613
|
-
AIResponseFormat2["TEXT"] = "text";
|
|
4614
|
-
return AIResponseFormat2;
|
|
4615
|
-
})(AIResponseFormat || {});
|
|
4616
|
-
var UIContext = class {
|
|
4617
|
-
};
|
|
4618
|
-
|
|
4619
4458
|
// src/ai-model/openai.ts
|
|
4459
|
+
import OpenAI from "openai";
|
|
4620
4460
|
var envConfigKey = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
|
|
4621
4461
|
var envModelKey = "MIDSCENE_MODEL_NAME";
|
|
4622
4462
|
var envSmithDebug = "MIDSCENE_LANGSMITH_DEBUG";
|
|
4623
4463
|
var extraConfig = {};
|
|
4624
|
-
if (typeof process.env[envConfigKey] === "string") {
|
|
4464
|
+
if (typeof process.env[envConfigKey] === "string" && process.env[envConfigKey]) {
|
|
4625
4465
|
console.log("config for openai loaded");
|
|
4626
4466
|
extraConfig = JSON.parse(process.env[envConfigKey]);
|
|
4627
4467
|
}
|
|
@@ -4647,12 +4487,12 @@ async function call(messages, responseFormat) {
|
|
|
4647
4487
|
response_format: { type: responseFormat }
|
|
4648
4488
|
});
|
|
4649
4489
|
const { content } = completion.choices[0].message;
|
|
4650
|
-
|
|
4490
|
+
assert3(content, "empty content");
|
|
4651
4491
|
return content;
|
|
4652
4492
|
}
|
|
4653
4493
|
async function callToGetJSONObject(messages) {
|
|
4654
4494
|
const response = await call(messages, "json_object" /* JSON */);
|
|
4655
|
-
|
|
4495
|
+
assert3(response, "empty response");
|
|
4656
4496
|
return JSON.parse(response);
|
|
4657
4497
|
}
|
|
4658
4498
|
|
|
@@ -4690,9 +4530,9 @@ You are an expert in software page image (2D) and page element text analysis.
|
|
|
4690
4530
|
"elements": [
|
|
4691
4531
|
// If no matching elements are found, return an empty array []
|
|
4692
4532
|
{
|
|
4693
|
-
"reason": "
|
|
4694
|
-
"text": "
|
|
4695
|
-
"id": "
|
|
4533
|
+
"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
|
|
4534
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4535
|
+
"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID of elementInfo
|
|
4696
4536
|
}
|
|
4697
4537
|
// More elements...
|
|
4698
4538
|
],
|
|
@@ -4787,9 +4627,244 @@ Output Example:
|
|
|
4787
4627
|
`;
|
|
4788
4628
|
}
|
|
4789
4629
|
|
|
4630
|
+
// src/ai-model/prompt/util.ts
|
|
4631
|
+
import assert5 from "assert";
|
|
4632
|
+
|
|
4633
|
+
// src/image/info.ts
|
|
4634
|
+
import assert4 from "assert";
|
|
4635
|
+
import { Buffer as Buffer2 } from "buffer";
|
|
4636
|
+
import { readFileSync as readFileSync2 } from "fs";
|
|
4637
|
+
import Sharp from "sharp";
|
|
4638
|
+
async function imageInfo(image) {
|
|
4639
|
+
const { width, height } = await Sharp(image).metadata();
|
|
4640
|
+
assert4(width && height, `invalid image: ${image}`);
|
|
4641
|
+
return { width, height };
|
|
4642
|
+
}
|
|
4643
|
+
async function imageInfoOfBase64(imageBase64) {
|
|
4644
|
+
const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
|
|
4645
|
+
return imageInfo(Buffer2.from(base64Data, "base64"));
|
|
4646
|
+
}
|
|
4647
|
+
|
|
4648
|
+
// src/image/transform.ts
|
|
4649
|
+
import { Buffer as Buffer3 } from "buffer";
|
|
4650
|
+
import Sharp2 from "sharp";
|
|
4651
|
+
|
|
4652
|
+
// src/image/visualization.ts
|
|
4653
|
+
import { Buffer as Buffer4 } from "buffer";
|
|
4654
|
+
import Sharp3 from "sharp";
|
|
4655
|
+
|
|
4656
|
+
// src/ai-model/prompt/util.ts
|
|
4657
|
+
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
4658
|
+
var contextFormatIntro = `
|
|
4659
|
+
The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
|
|
4660
|
+
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
|
|
4661
|
+
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
|
|
4662
|
+
var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
|
|
4663
|
+
var skillSegment = `skill name: segment_a_web_page
|
|
4664
|
+
skill content:
|
|
4665
|
+
Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
|
|
4666
|
+
Some general rules for segmentation:
|
|
4667
|
+
* Each section should NOT overlap with each other.
|
|
4668
|
+
* Each text should only belong to one section.
|
|
4669
|
+
* [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
|
|
4670
|
+
* Analyze the page in a top-to-bottom and left-to-right order.
|
|
4671
|
+
* The evidence indicates a separate section, for example
|
|
4672
|
+
- The background color of certain parts of the page changes.
|
|
4673
|
+
- A section of a page includes a title.
|
|
4674
|
+
* Provide the following data for each of the UI section you found.
|
|
4675
|
+
{
|
|
4676
|
+
"name": "name of the section",
|
|
4677
|
+
"description": "briefly summarize the key content or usage of this section.",
|
|
4678
|
+
"sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
|
|
4679
|
+
"textIds": ["5", "6", "7"], // ids of all text elements in this section
|
|
4680
|
+
}
|
|
4681
|
+
`;
|
|
4682
|
+
var skillExtractData = `skill name: extract_data_from_UI
|
|
4683
|
+
related input: DATA_DEMAND
|
|
4684
|
+
skill content:
|
|
4685
|
+
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
|
|
4686
|
+
* There may be some special commands in DATA_DEMAND, please pay extra attention
|
|
4687
|
+
- ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
|
|
4688
|
+
function promptsOfSectionQuery(constraints) {
|
|
4689
|
+
if (!constraints.length) {
|
|
4690
|
+
return "";
|
|
4691
|
+
}
|
|
4692
|
+
const instruction = "Use your segment_a_web_page skill to find the following section(s)";
|
|
4693
|
+
const singleSection = (c) => {
|
|
4694
|
+
assert5(
|
|
4695
|
+
c.name || c.description,
|
|
4696
|
+
"either `name` or `description` is required to define a section constraint"
|
|
4697
|
+
);
|
|
4698
|
+
const number = "One section";
|
|
4699
|
+
const name = c.name ? `named \`${c.name}\`` : "";
|
|
4700
|
+
const description = c.description ? `, usage or criteria : ${c.description}` : "";
|
|
4701
|
+
const basic = `* ${number} ${name}${description}`;
|
|
4702
|
+
return basic;
|
|
4703
|
+
};
|
|
4704
|
+
return `${instruction}
|
|
4705
|
+
${constraints.map(singleSection).join("\n")}`;
|
|
4706
|
+
}
|
|
4707
|
+
function systemPromptToExtract(dataQuery, sections) {
|
|
4708
|
+
const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
|
|
4709
|
+
const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
|
|
4710
|
+
const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
|
|
4711
|
+
return `
|
|
4712
|
+
${characteristic}
|
|
4713
|
+
${contextFormatIntro}
|
|
4714
|
+
|
|
4715
|
+
You have the following skills:
|
|
4716
|
+
${allSectionNames.length ? skillSegment : ""}
|
|
4717
|
+
${skillExtractData}
|
|
4718
|
+
|
|
4719
|
+
Now, do the following jobs:
|
|
4720
|
+
${sectionFindingPrompt}
|
|
4721
|
+
Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
|
|
4722
|
+
DATA_DEMAND start:
|
|
4723
|
+
${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
|
|
4724
|
+
${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
|
|
4725
|
+
DATA_DEMAND ends.
|
|
4726
|
+
|
|
4727
|
+
Return in the following JSON format:
|
|
4728
|
+
{
|
|
4729
|
+
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
4730
|
+
${sectionReturnFormat}
|
|
4731
|
+
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
|
|
4732
|
+
errors?: [], // string[], error message if any
|
|
4733
|
+
}
|
|
4734
|
+
`;
|
|
4735
|
+
}
|
|
4736
|
+
function systemPromptToAssert(assertion) {
|
|
4737
|
+
return `
|
|
4738
|
+
${characteristic}
|
|
4739
|
+
${contextFormatIntro}
|
|
4740
|
+
|
|
4741
|
+
Based on the information you get, assert the following:
|
|
4742
|
+
${assertion}
|
|
4743
|
+
|
|
4744
|
+
Return in the following JSON format:
|
|
4745
|
+
{
|
|
4746
|
+
thought: string, // string, the thought of the assertion
|
|
4747
|
+
pass: true, // true or false, whether the assertion is passed
|
|
4748
|
+
}
|
|
4749
|
+
`;
|
|
4750
|
+
}
|
|
4751
|
+
function describeSize(size) {
|
|
4752
|
+
return `${size.width} x ${size.height}`;
|
|
4753
|
+
}
|
|
4754
|
+
/**
 * Shorten a value to at most `maxLength` UTF-16 units and append "...".
 *
 * Falsy values and values that are not longer than `maxLength` are returned
 * untouched. For strings the cut never lands between the two halves of a
 * surrogate pair, so the result does not end in a lone surrogate.
 *
 * @param {*} text - Usually a string; anything else is returned as-is unless
 *   it has a `length` greater than `maxLength`.
 * @param {number} [maxLength=50] - Maximum number of units kept before "...".
 * @returns {*} The original value, or the shortened string.
 */
function truncateText(text, maxLength = 50) {
  if (!text || !(text.length > maxLength)) {
    return text;
  }
  let end = maxLength;
  if (typeof text === "string" && end > 0) {
    const lastKept = text.charCodeAt(end - 1);
    const firstDropped = text.charCodeAt(end);
    const splitsPair =
      lastKept >= 0xd800 && lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 && firstDropped <= 0xdfff;
    if (splitsPair) {
      // Drop the high surrogate too rather than emit half a character.
      end -= 1;
    }
  }
  return `${text.slice(0, end)}...`;
}
|
|
4761
|
+
/**
 * Describe the page for the model and provide an id -> element lookup.
 *
 * The page size comes from `context.size` when present; otherwise it is read
 * from the screenshot via `imageInfoOfBase64`.
 *
 * @param {object} context - Page context with `screenshotBase64`, an optional
 *   `size` ({ width, height }) and `content` (array of elements carrying `id`).
 * @returns {Promise<{description: string, elementById: function(*): *}>}
 *   `description` is the text block handed to the model; `elementById(id)`
 *   returns the element from `context.content` with that id, or `undefined`.
 */
async function describeUserPage(context) {
  const { screenshotBase64 } = context;
  const { width, height } = context.size
    ? context.size
    : await imageInfoOfBase64(screenshotBase64);
  const elementsInfo = context.content;
  // A Map (not a plain object) so ids such as "constructor" or "__proto__"
  // coming back from the model can never resolve to Object.prototype members.
  const idElementMap = new Map();
  for (const item of elementsInfo) {
    idElementMap.set(String(item.id), item);
  }
  const elementInfos = cropfieldInformation(elementsInfo);
  // cropfieldInformation already serializes its result; stringifying again
  // would embed an escaped JSON string instead of the JSON array itself.
  const elementInfosJson =
    typeof elementInfos === "string" ? elementInfos : JSON.stringify(elementInfos);
  return {
    description: `
{
// The size of the page
"pageSize": ${describeSize({ width, height })},


// json description of the element
"elementInfos": ${elementInfosJson}
}`,
    elementById(id) {
      assert5(typeof id !== "undefined", "id is required for query");
      return idElementMap.get(String(id));
    }
  };
}
|
|
4795
|
+
/**
 * Produce the compact JSON listing of elements that is sent to the model.
 *
 * Only `id`, `attributes`, `rect` and `content` are kept for each element;
 * `content` and every attribute value are passed through `truncateText`.
 *
 * @param {Array<object>} elementsInfo - Elements to describe.
 * @returns {string} JSON text of the reduced element list.
 */
function cropfieldInformation(elementsInfo) {
  const shortenAttributes = (source) => {
    const shortened = {};
    for (const name of Object.keys(source)) {
      shortened[name] = truncateText(source[name]);
    }
    return shortened;
  };
  const describeElement = (element) => {
    const { id, attributes = {}, rect, content } = element;
    const shortContent = truncateText(content);
    return {
      id,
      attributes: shortenAttributes(attributes),
      rect,
      content: shortContent
    };
  };
  return JSON.stringify(elementsInfo.map((element) => describeElement(element)));
}
|
|
4818
|
+
/**
 * Build the "follow <locator prefix>: <prompt>" directive for an element query.
 *
 * @param {string} prompt - Description of the element(s) to locate.
 * @param {{multi?: boolean}} [opt] - When `multi` is truthy the multi-element
 *   prefix is used; otherwise the single-element prefix.
 * @returns {string} The directive string.
 */
function retrieveElement(prompt, opt) {
  const wantsMany = opt == null ? false : Boolean(opt.multi);
  const prefix = wantsMany ? ELEMENTS_LOCATOR_PREFIX : ONE_ELEMENT_LOCATOR_PREFIX;
  return `follow ${prefix}: ${prompt}`;
}
|
|
4824
|
+
/**
 * Tell whether a value is a string that starts with one of the element
 * locator prefixes.
 *
 * @param {*} response - Value to inspect.
 * @returns {boolean} `true` only for strings beginning with
 *   `ONE_ELEMENT_LOCATOR_PREFIX` or `ELEMENTS_LOCATOR_PREFIX`.
 */
function ifElementTypeResponse(response) {
  if (typeof response === "string") {
    const prefixes = [ONE_ELEMENT_LOCATOR_PREFIX, ELEMENTS_LOCATOR_PREFIX];
    return prefixes.some((prefix) => response.startsWith(prefix));
  }
  return false;
}
|
|
4830
|
+
/**
 * Extract the element id(s) from a locator-style response string.
 *
 * Surrounding whitespace is removed from every id, and empty fragments are
 * dropped from multi-element responses, so a reply such as "a, b" yields ids
 * that can be looked up.
 *
 * @param {string} response - Text expected to start with "<prefix>/".
 * @returns {string|string[]|null} A single id (or `null` if it is empty) for
 *   the one-element prefix, an array of ids for the multi-element prefix, and
 *   `null` when neither prefix matches.
 */
function splitElementResponse(response) {
  const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
  if (response.startsWith(oneElementSplitter)) {
    const id = response.slice(oneElementSplitter.length).trim();
    if (id.includes(",")) {
      console.warn(`unexpected comma in one element response: ${id}`);
    }
    return id ? id : null;
  }
  const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
  if (response.startsWith(elementsSplitter)) {
    return response
      .slice(elementsSplitter.length)
      .split(",")
      .map((id) => id.trim())
      .filter((id) => id.length > 0);
  }
  return null;
}
|
|
4849
|
+
/**
 * Mark a prompt as a section query by prefixing it with `SECTION_MATCHER_FLAG`.
 *
 * @param {string} prompt - Section description.
 * @returns {string} The flagged prompt.
 */
function retrieveSection(prompt) {
  const flaggedPrompt = `${SECTION_MATCHER_FLAG}${prompt}`;
  return flaggedPrompt;
}
|
|
4852
|
+
/**
 * Recover the section query from a value flagged with `SECTION_MATCHER_FLAG`.
 *
 * @param {*} input - Candidate value.
 * @returns {string|false} The text after the flag, or `false` when `input` is
 *   not a string carrying the flag.
 */
function extractSectionQuery(input) {
  const isFlagged = typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG);
  return isFlagged ? input.slice(SECTION_MATCHER_FLAG.length) : false;
}
|
|
4858
|
+
|
|
4790
4859
|
// src/ai-model/inspect.ts
|
|
4860
|
+
import assert6 from "assert";
|
|
4791
4861
|
async function AiInspectElement(options) {
|
|
4792
|
-
const {
|
|
4862
|
+
const {
|
|
4863
|
+
context,
|
|
4864
|
+
multi,
|
|
4865
|
+
findElementDescription,
|
|
4866
|
+
callAI = callToGetJSONObject
|
|
4867
|
+
} = options;
|
|
4793
4868
|
const { screenshotBase64 } = context;
|
|
4794
4869
|
const { description, elementById } = await describeUserPage(context);
|
|
4795
4870
|
const systemPrompt = systemPromptToFindElement(findElementDescription, multi);
|
|
@@ -4815,12 +4890,16 @@ async function AiInspectElement(options) {
|
|
|
4815
4890
|
const parseResult = await callAI(msgs);
|
|
4816
4891
|
return {
|
|
4817
4892
|
parseResult,
|
|
4818
|
-
elementById
|
|
4819
|
-
systemPrompt
|
|
4893
|
+
elementById
|
|
4820
4894
|
};
|
|
4821
4895
|
}
|
|
4822
4896
|
async function AiExtractElementInfo(options) {
|
|
4823
|
-
const {
|
|
4897
|
+
const {
|
|
4898
|
+
dataQuery,
|
|
4899
|
+
sectionConstraints,
|
|
4900
|
+
context,
|
|
4901
|
+
callAI = callToGetJSONObject
|
|
4902
|
+
} = options;
|
|
4824
4903
|
const systemPrompt = systemPromptToExtract(dataQuery, sectionConstraints);
|
|
4825
4904
|
const { screenshotBase64 } = context;
|
|
4826
4905
|
const { description, elementById } = await describeUserPage(context);
|
|
@@ -4845,18 +4924,137 @@ async function AiExtractElementInfo(options) {
|
|
|
4845
4924
|
const parseResult = await callAI(msgs);
|
|
4846
4925
|
return {
|
|
4847
4926
|
parseResult,
|
|
4848
|
-
elementById
|
|
4849
|
-
|
|
4927
|
+
elementById
|
|
4928
|
+
};
|
|
4929
|
+
}
|
|
4930
|
+
/**
 * Ask the model whether an assertion holds for the given page context.
 *
 * Sends the assertion system prompt plus one user message containing the
 * screenshot and the textual page description, and returns the model reply.
 *
 * @param {object} options
 * @param {string} options.assertion - Non-empty statement to verify.
 * @param {object} options.context - Page context; `screenshotBase64` is sent
 *   as the image and the whole object is passed to `describeUserPage`.
 * @param {function(Array): Promise<*>} [options.callAI=callToGetJSONObject] -
 *   Function that sends the messages and resolves with the parsed reply.
 * @returns {Promise<*>} Whatever `callAI` resolves with.
 * @throws {AssertionError} When `assertion` is not a non-empty string.
 */
async function AiAssert(options) {
  const { assertion, context, callAI = callToGetJSONObject } = options;
  // Check the type as well as truthiness so the message matches the check.
  assert6(
    typeof assertion === "string" && assertion.length > 0,
    "assertion should be a string"
  );
  const systemPrompt = systemPromptToAssert(assertion);
  const { screenshotBase64 } = context;
  const { description } = await describeUserPage(context);
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [
        {
          type: "image_url",
          image_url: {
            url: screenshotBase64
          }
        },
        {
          type: "text",
          text: description
        }
      ]
    }
  ];
  const assertResult = await callAI(msgs);
  return assertResult;
}
|
|
4957
|
+
|
|
4958
|
+
// src/insight/utils.ts
|
|
4959
|
+
import assert7 from "assert";
|
|
4960
|
+
import { randomUUID as randomUUID2 } from "crypto";
|
|
4961
|
+
import { existsSync as existsSync2 } from "fs";
|
|
4962
|
+
import { join as join2 } from "path";
|
|
4963
|
+
// Base name (without extension) of the dump file; stays empty until
// writeInsightDump picks a name on its first call.
var logFileName = "";
// Serialized (JSON text) dump entries, in the order they were first written.
var logContent = [];
// Maps a logId to the index of its entry in logContent, so writing the same
// logId again replaces the earlier entry instead of appending a new one.
var logIdIndexMap = {};
// Id of the current process; part of the dump file name.
var { pid } = process;
// Extension used for the dump file.
var logFileExt = insightDumpFileExt;
|
|
4968
|
+
/**
 * Record one insight dump entry and rewrite the dump file with all entries.
 *
 * The entry is `data` extended with `logId`, `sdkVersion` and `logTime`.
 * Entries are kept in module state (`logContent` / `logIdIndexMap`); an entry
 * written again under the same id replaces the earlier one. The file name is
 * chosen once, on the first call, and reused afterwards.
 *
 * @param {object} data - Dump payload; its keys override the base fields.
 * @param {string} [logId] - Id of the entry; a random UUID is used if falsy.
 * @param {function(object): void} [dumpSubscriber] - Called with the final
 *   entry before it is stored.
 * @returns {string} The id under which the entry was stored.
 * @throws {AssertionError} When no dump directory is configured.
 */
function writeInsightDump(data, logId, dumpSubscriber) {
  const logDir2 = getDumpDir();
  assert7(logDir2, "logDir should be set before writing dump file");
  const id = logId || randomUUID2();
  const baseData = {
    sdkVersion: getPkgInfo().version,
    logTime: Date.now()
  };
  const finalData = __spreadValues(__spreadValues({
    logId: id
  }, baseData), data);
  if (dumpSubscriber != null) {
    dumpSubscriber(finalData);
  }
  if (!logFileName) {
    const baseName = `pid_${pid}_${baseData.logTime}`;
    logFileName = baseName;
    // On a name clash keep the same "pid_" scheme and add a random suffix.
    while (existsSync2(join2(logDir2, `${logFileName}.${logFileExt}`))) {
      logFileName = `${baseName}-${Math.random()}`;
    }
  }
  const dataString = JSON.stringify(finalData, null, 2);
  if (typeof logIdIndexMap[id] === "number") {
    logContent[logIdIndexMap[id]] = dataString;
  } else {
    const length = logContent.push(dataString);
    logIdIndexMap[id] = length - 1;
  }
  writeDumpFile({
    fileName: logFileName,
    fileExt: logFileExt,
    fileContent: `[
${logContent.join(",\n")}
]`
  });
  return id;
}
|
|
5002
|
+
/**
 * Resolve a list of ids to elements, skipping ids that cannot be resolved.
 *
 * @param {Array<*>} ids - Ids to look up, in order.
 * @param {function(*): *} elementById - Lookup returning the element or a
 *   falsy value when the id is unknown.
 * @returns {Array<*>} The elements found, in the order of `ids`. A warning is
 *   logged for every id without a match.
 */
function idsIntoElements(ids, elementById) {
  const resolved = [];
  ids.forEach((id) => {
    const match = elementById(id);
    if (!match) {
      console.warn(`element not found by id: ${id}`);
      return;
    }
    resolved.push(match);
  });
  return resolved;
}
|
|
5013
|
+
/**
 * Replace id references found at the first level of `data`, in place.
 *
 * A string value accepted by `ifMeet` is replaced by `elementsById(value)`.
 * For an array value, each item accepted by `ifMeet` is replaced the same way
 * and the other items are kept. All other values are left alone.
 *
 * @param {object} data - Object whose own enumerable keys are scanned; mutated.
 * @param {function(*): boolean} ifMeet - Decides whether a value is an id.
 * @param {function(*): *} elementsById - Resolves an id to its replacement.
 * @returns {object} The same `data` object.
 */
function shallowExpandIds(data, ifMeet, elementsById) {
  const expandItem = (item) => (ifMeet(item) ? elementsById(item) : item);
  for (const key of Object.keys(data)) {
    const current = data[key];
    if (typeof current === "string" && ifMeet(current)) {
      data[key] = elementsById(current);
      continue;
    }
    if (Array.isArray(current)) {
      data[key] = current.map((item) => expandItem(item));
    }
  }
  return data;
}
|
|
5026
|
+
/**
 * Expand a "lite" section (which references texts by id) into a full section.
 *
 * `textIds` is resolved through `idsIntoElements`; the resulting elements
 * become `content`, and `rect` is the bounding box of their rects. All other
 * fields of `liteSection` are copied over; `textIds` itself is dropped.
 *
 * @param {object} liteSection - Section with an optional `textIds` array.
 * @param {function(*): *} elementById - Lookup used to resolve each id.
 * @returns {object} Section with `content` and `rect`. When no text resolves,
 *   `rect` is `{ left: -1, top: -1, width: 0, height: 0 }`.
 */
function expandLiteSection(liteSection, elementById) {
  const { textIds } = liteSection;
  const remainingFields = __objRest(liteSection, ["textIds"]);
  // The model may omit textIds; treat that as "no texts" instead of crashing.
  const texts = idsIntoElements(Array.isArray(textIds) ? textIds : [], elementById);
  let sectionRect = { left: -1, top: -1, width: 0, height: 0 };
  if (texts.length > 0) {
    // Start from +/-Infinity rather than a -1 sentinel, so coordinates that
    // are -1 or lower are handled like any other value.
    let leftMost = Infinity;
    let topMost = Infinity;
    let rightMost = -Infinity;
    let bottomMost = -Infinity;
    for (const { rect } of texts) {
      leftMost = Math.min(leftMost, rect.left);
      topMost = Math.min(topMost, rect.top);
      rightMost = Math.max(rightMost, rect.left + rect.width);
      bottomMost = Math.max(bottomMost, rect.top + rect.height);
    }
    sectionRect = {
      left: leftMost,
      top: topMost,
      width: rightMost - leftMost,
      height: bottomMost - topMost
    };
  }
  return __spreadProps(__spreadValues({}, remainingFields), {
    content: texts,
    rect: sectionRect
  });
}
|
|
4852
5051
|
|
|
4853
5052
|
// src/insight/index.ts
|
|
4854
5053
|
/**
 * Comparator ordering items top-to-bottom, then left-to-right.
 *
 * @param {{rect: {top: number, left: number}}} a
 * @param {{rect: {top: number, left: number}}} b
 * @returns {number} Difference of `rect.top`, or of `rect.left` when the tops
 *   are equal.
 */
var sortByOrder = (a, b) => {
  const verticalGap = a.rect.top - b.rect.top;
  return verticalGap !== 0 ? verticalGap : a.rect.left - b.rect.left;
};
|
|
4861
5059
|
var Insight = class {
|
|
4862
5060
|
constructor(context, opt) {
|
|
@@ -4864,7 +5062,7 @@ var Insight = class {
|
|
|
4864
5062
|
__publicField(this, "aiVendorFn", callToGetJSONObject);
|
|
4865
5063
|
__publicField(this, "onceDumpUpdatedFn");
|
|
4866
5064
|
__publicField(this, "taskInfo");
|
|
4867
|
-
|
|
5065
|
+
assert8(context, "context is required for Insight");
|
|
4868
5066
|
if (typeof context === "function") {
|
|
4869
5067
|
this.contextRetrieverFn = context;
|
|
4870
5068
|
} else {
|
|
@@ -4880,12 +5078,12 @@ var Insight = class {
|
|
|
4880
5078
|
async locate(queryPrompt, opt) {
|
|
4881
5079
|
var _a;
|
|
4882
5080
|
const { callAI = this.aiVendorFn, multi = false } = opt || {};
|
|
4883
|
-
|
|
5081
|
+
assert8(queryPrompt, "query is required for located");
|
|
4884
5082
|
const dumpSubscriber = this.onceDumpUpdatedFn;
|
|
4885
5083
|
this.onceDumpUpdatedFn = void 0;
|
|
4886
5084
|
const context = await this.contextRetrieverFn();
|
|
4887
5085
|
const startTime = Date.now();
|
|
4888
|
-
const { parseResult,
|
|
5086
|
+
const { parseResult, elementById } = await AiInspectElement({
|
|
4889
5087
|
callAI,
|
|
4890
5088
|
context,
|
|
4891
5089
|
multi: Boolean(multi),
|
|
@@ -4894,8 +5092,7 @@ var Insight = class {
|
|
|
4894
5092
|
const timeCost = Date.now() - startTime;
|
|
4895
5093
|
const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
|
|
4896
5094
|
durationMs: timeCost,
|
|
4897
|
-
rawResponse: JSON.stringify(parseResult)
|
|
4898
|
-
systemPrompt
|
|
5095
|
+
rawResponse: JSON.stringify(parseResult)
|
|
4899
5096
|
});
|
|
4900
5097
|
let errorLog;
|
|
4901
5098
|
if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
|
|
@@ -4923,7 +5120,9 @@ ${parseResult.errors.join("\n")}`;
|
|
|
4923
5120
|
parseResult.elements.forEach((item) => {
|
|
4924
5121
|
const element = elementById(item.id);
|
|
4925
5122
|
if (!element) {
|
|
4926
|
-
console.warn(
|
|
5123
|
+
console.warn(
|
|
5124
|
+
`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
|
|
5125
|
+
);
|
|
4927
5126
|
return;
|
|
4928
5127
|
}
|
|
4929
5128
|
elements.push(element);
|
|
@@ -4937,20 +5136,23 @@ ${parseResult.errors.join("\n")}`;
|
|
|
4937
5136
|
);
|
|
4938
5137
|
if (opt == null ? void 0 : opt.multi) {
|
|
4939
5138
|
return elements;
|
|
4940
|
-
}
|
|
4941
|
-
|
|
5139
|
+
}
|
|
5140
|
+
if (elements.length >= 2) {
|
|
5141
|
+
console.warn(
|
|
5142
|
+
`locate: multiple elements found, return the first one. (query: ${queryPrompt})`
|
|
5143
|
+
);
|
|
4942
5144
|
return elements[0];
|
|
4943
|
-
}
|
|
5145
|
+
}
|
|
5146
|
+
if (elements.length === 1) {
|
|
4944
5147
|
return elements[0];
|
|
4945
|
-
} else {
|
|
4946
|
-
return null;
|
|
4947
5148
|
}
|
|
5149
|
+
return null;
|
|
4948
5150
|
}
|
|
4949
5151
|
async extract(dataDemand) {
|
|
4950
5152
|
var _a;
|
|
4951
5153
|
let dataQuery = {};
|
|
4952
5154
|
const sectionQueryMap = {};
|
|
4953
|
-
|
|
5155
|
+
assert8(
|
|
4954
5156
|
typeof dataDemand === "object" || typeof dataDemand === "string",
|
|
4955
5157
|
`dataDemand should be object or string, but get ${typeof dataDemand}`
|
|
4956
5158
|
);
|
|
@@ -4979,7 +5181,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
4979
5181
|
});
|
|
4980
5182
|
const context = await this.contextRetrieverFn();
|
|
4981
5183
|
const startTime = Date.now();
|
|
4982
|
-
const { parseResult,
|
|
5184
|
+
const { parseResult, elementById } = await AiExtractElementInfo({
|
|
4983
5185
|
context,
|
|
4984
5186
|
dataQuery,
|
|
4985
5187
|
sectionConstraints,
|
|
@@ -4988,8 +5190,7 @@ ${parseResult.errors.join("\n")}`;
|
|
|
4988
5190
|
const timeCost = Date.now() - startTime;
|
|
4989
5191
|
const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
|
|
4990
5192
|
durationMs: timeCost,
|
|
4991
|
-
rawResponse: JSON.stringify(parseResult)
|
|
4992
|
-
systemPrompt
|
|
5193
|
+
rawResponse: JSON.stringify(parseResult)
|
|
4993
5194
|
});
|
|
4994
5195
|
let errorLog;
|
|
4995
5196
|
if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
|
|
@@ -5014,7 +5215,10 @@ ${parseResult.errors.join("\n")}`;
|
|
|
5014
5215
|
throw new Error(errorLog);
|
|
5015
5216
|
}
|
|
5016
5217
|
const sectionsArr = (parseResult.sections || []).map((liteSection) => {
|
|
5017
|
-
const section = expandLiteSection(
|
|
5218
|
+
const section = expandLiteSection(
|
|
5219
|
+
liteSection,
|
|
5220
|
+
(id) => elementById(id)
|
|
5221
|
+
);
|
|
5018
5222
|
return section;
|
|
5019
5223
|
}).sort(sortByOrder);
|
|
5020
5224
|
const sectionMap = sectionsArr.reduce((acc, section) => {
|
|
@@ -5038,7 +5242,8 @@ ${parseResult.errors.join("\n")}`;
|
|
|
5038
5242
|
const idList = splitElementResponse(id);
|
|
5039
5243
|
if (typeof idList === "string") {
|
|
5040
5244
|
return elementById(idList);
|
|
5041
|
-
}
|
|
5245
|
+
}
|
|
5246
|
+
if (Array.isArray(idList)) {
|
|
5042
5247
|
return idsIntoElements(idList, elementById);
|
|
5043
5248
|
}
|
|
5044
5249
|
return idList;
|
|
@@ -5055,133 +5260,46 @@ ${parseResult.errors.join("\n")}`;
|
|
|
5055
5260
|
);
|
|
5056
5261
|
return mergedData;
|
|
5057
5262
|
}
|
|
5058
|
-
|
|
5059
|
-
|
|
5060
|
-
|
|
5061
|
-
|
|
5062
|
-
|
|
5063
|
-
};
|
|
5064
|
-
}
|
|
5065
|
-
};
|
|
5066
|
-
|
|
5067
|
-
// src/action/executor.ts
|
|
5068
|
-
import assert7 from "assert";
|
|
5069
|
-
var Executor = class {
|
|
5070
|
-
constructor(name, description, tasks) {
|
|
5071
|
-
__publicField(this, "name");
|
|
5072
|
-
__publicField(this, "description");
|
|
5073
|
-
__publicField(this, "tasks");
|
|
5074
|
-
// status of executor
|
|
5075
|
-
__publicField(this, "status");
|
|
5076
|
-
__publicField(this, "errorMsg");
|
|
5077
|
-
__publicField(this, "dumpFileName");
|
|
5078
|
-
this.status = tasks && tasks.length > 0 ? "pending" : "init";
|
|
5079
|
-
this.name = name;
|
|
5080
|
-
this.description = description;
|
|
5081
|
-
this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
|
|
5082
|
-
}
|
|
5083
|
-
markTaskAsPending(task) {
|
|
5084
|
-
return __spreadValues({
|
|
5085
|
-
status: "pending"
|
|
5086
|
-
}, task);
|
|
5087
|
-
}
|
|
5088
|
-
async append(task) {
|
|
5089
|
-
assert7(this.status !== "error", "executor is in error state, cannot append task");
|
|
5090
|
-
if (Array.isArray(task)) {
|
|
5091
|
-
this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
|
|
5092
|
-
} else {
|
|
5093
|
-
this.tasks.push(this.markTaskAsPending(task));
|
|
5094
|
-
}
|
|
5095
|
-
if (this.status !== "running") {
|
|
5096
|
-
this.status = "pending";
|
|
5097
|
-
}
|
|
5098
|
-
}
|
|
5099
|
-
async flush() {
|
|
5100
|
-
if (this.status === "init" && this.tasks.length > 0) {
|
|
5101
|
-
console.warn("illegal state for executor, status is init but tasks are not empty");
|
|
5102
|
-
}
|
|
5103
|
-
assert7(this.status !== "running", "executor is already running");
|
|
5104
|
-
assert7(this.status !== "completed", "executor is already completed");
|
|
5105
|
-
assert7(this.status !== "error", "executor is in error state");
|
|
5106
|
-
const nextPendingIndex = this.tasks.findIndex((task) => task.status === "pending");
|
|
5107
|
-
if (nextPendingIndex < 0) {
|
|
5108
|
-
return;
|
|
5109
|
-
}
|
|
5110
|
-
this.status = "running";
|
|
5111
|
-
let taskIndex = nextPendingIndex;
|
|
5112
|
-
let successfullyCompleted = true;
|
|
5113
|
-
let errorMsg = "";
|
|
5114
|
-
let previousFindOutput;
|
|
5115
|
-
while (taskIndex < this.tasks.length) {
|
|
5116
|
-
const task = this.tasks[taskIndex];
|
|
5117
|
-
assert7(task.status === "pending", `task status should be pending, but got: ${task.status}`);
|
|
5118
|
-
task.timing = {
|
|
5119
|
-
start: Date.now()
|
|
5120
|
-
};
|
|
5121
|
-
try {
|
|
5122
|
-
task.status = "running";
|
|
5123
|
-
assert7(
|
|
5124
|
-
["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
|
|
5125
|
-
`unsupported task type: ${task.type}`
|
|
5126
|
-
);
|
|
5127
|
-
const { executor, param } = task;
|
|
5128
|
-
assert7(executor, `executor is required for task type: ${task.type}`);
|
|
5129
|
-
let returnValue;
|
|
5130
|
-
const executorContext = {
|
|
5131
|
-
task,
|
|
5132
|
-
element: previousFindOutput == null ? void 0 : previousFindOutput.element
|
|
5133
|
-
};
|
|
5134
|
-
if (task.type === "Insight") {
|
|
5135
|
-
assert7(
|
|
5136
|
-
task.subType === "Locate" || task.subType === "Query",
|
|
5137
|
-
`unsupported insight subType: ${task.subType}`
|
|
5138
|
-
);
|
|
5139
|
-
returnValue = await task.executor(param, executorContext);
|
|
5140
|
-
if (task.subType === "Locate") {
|
|
5141
|
-
previousFindOutput = returnValue == null ? void 0 : returnValue.output;
|
|
5142
|
-
}
|
|
5143
|
-
} else if (task.type === "Action" || task.type === "Planning") {
|
|
5144
|
-
returnValue = await task.executor(param, executorContext);
|
|
5145
|
-
} else {
|
|
5146
|
-
console.warn(`unsupported task type: ${task.type}, will try to execute it directly`);
|
|
5147
|
-
returnValue = await task.executor(param, executorContext);
|
|
5148
|
-
}
|
|
5149
|
-
Object.assign(task, returnValue);
|
|
5150
|
-
task.status = "success";
|
|
5151
|
-
task.timing.end = Date.now();
|
|
5152
|
-
task.timing.cost = task.timing.end - task.timing.start;
|
|
5153
|
-
taskIndex++;
|
|
5154
|
-
} catch (e) {
|
|
5155
|
-
successfullyCompleted = false;
|
|
5156
|
-
task.status = "fail";
|
|
5157
|
-
errorMsg = `${e == null ? void 0 : e.message}
|
|
5158
|
-
${e == null ? void 0 : e.stack}`;
|
|
5159
|
-
task.error = errorMsg;
|
|
5160
|
-
task.timing.end = Date.now();
|
|
5161
|
-
task.timing.cost = task.timing.end - task.timing.start;
|
|
5162
|
-
this.errorMsg = errorMsg;
|
|
5163
|
-
break;
|
|
5164
|
-
}
|
|
5165
|
-
}
|
|
5166
|
-
for (let i = taskIndex + 1; i < this.tasks.length; i++) {
|
|
5167
|
-
this.tasks[i].status = "cancelled";
|
|
5168
|
-
}
|
|
5169
|
-
if (successfullyCompleted) {
|
|
5170
|
-
this.status = "completed";
|
|
5171
|
-
} else {
|
|
5172
|
-
this.status = "error";
|
|
5173
|
-
throw new Error(`executor failed: ${errorMsg}`);
|
|
5263
|
+
async assert(assertion) {
|
|
5264
|
+
if (typeof assertion !== "string") {
|
|
5265
|
+
throw new Error(
|
|
5266
|
+
"This is the assert method for Midscene, the first argument should be a string. If you want to use the assert method from Node.js, please import it from the Node.js assert module."
|
|
5267
|
+
);
|
|
5174
5268
|
}
|
|
5175
|
-
|
|
5176
|
-
|
|
5269
|
+
const dumpSubscriber = this.onceDumpUpdatedFn;
|
|
5270
|
+
this.onceDumpUpdatedFn = void 0;
|
|
5271
|
+
const context = await this.contextRetrieverFn();
|
|
5272
|
+
const startTime = Date.now();
|
|
5273
|
+
const assertResult = await AiAssert({
|
|
5274
|
+
assertion,
|
|
5275
|
+
callAI: this.aiVendorFn,
|
|
5276
|
+
context
|
|
5277
|
+
});
|
|
5278
|
+
const timeCost = Date.now() - startTime;
|
|
5279
|
+
const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
|
|
5280
|
+
durationMs: timeCost,
|
|
5281
|
+
rawResponse: JSON.stringify(assertResult)
|
|
5282
|
+
});
|
|
5283
|
+
const { thought, pass } = assertResult;
|
|
5177
5284
|
const dumpData = {
|
|
5178
|
-
|
|
5179
|
-
|
|
5180
|
-
|
|
5181
|
-
|
|
5182
|
-
|
|
5285
|
+
type: "assert",
|
|
5286
|
+
context,
|
|
5287
|
+
userQuery: {
|
|
5288
|
+
assertion
|
|
5289
|
+
},
|
|
5290
|
+
matchedSection: [],
|
|
5291
|
+
matchedElement: [],
|
|
5292
|
+
data: null,
|
|
5293
|
+
taskInfo,
|
|
5294
|
+
assertionPass: pass,
|
|
5295
|
+
assertionThought: thought,
|
|
5296
|
+
error: pass ? void 0 : thought
|
|
5297
|
+
};
|
|
5298
|
+
writeInsightDump(dumpData, void 0, dumpSubscriber);
|
|
5299
|
+
return {
|
|
5300
|
+
pass,
|
|
5301
|
+
thought
|
|
5183
5302
|
};
|
|
5184
|
-
return dumpData;
|
|
5185
5303
|
}
|
|
5186
5304
|
};
|
|
5187
5305
|
|