@midscene/core 0.1.4 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/lib/index.js CHANGED
@@ -1029,64 +1029,37 @@ __export(src_exports, {
1029
1029
  });
1030
1030
  module.exports = __toCommonJS(src_exports);
1031
1031
 
1032
- // src/insight/index.ts
1033
- var import_assert5 = __toESM(require("assert"));
1034
-
1035
- // src/ai-model/prompt/util.ts
1036
- var import_assert2 = __toESM(require("assert"));
1032
+ // src/action/executor.ts
1033
+ var import_node_assert2 = __toESM(require("assert"));
1037
1034
 
1038
- // src/image/info.ts
1035
+ // src/utils.ts
1039
1036
  var import_node_assert = __toESM(require("assert"));
1040
- var import_node_buffer = require("buffer");
1037
+ var import_node_crypto = require("crypto");
1041
1038
  var import_node_fs = require("fs");
1042
- var import_sharp = __toESM(require("sharp"));
1043
- async function imageInfo(image) {
1044
- const { width, height } = await (0, import_sharp.default)(image).metadata();
1045
- (0, import_node_assert.default)(width && height, `invalid image: ${image}`);
1046
- return { width, height };
1047
- }
1048
- async function imageInfoOfBase64(imageBase64) {
1049
- const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
1050
- return imageInfo(import_node_buffer.Buffer.from(base64Data, "base64"));
1051
- }
1052
-
1053
- // src/image/transform.ts
1054
- var import_node_buffer2 = require("buffer");
1055
- var import_sharp2 = __toESM(require("sharp"));
1056
-
1057
- // src/image/visualization.ts
1058
- var import_buffer = require("buffer");
1059
- var import_sharp3 = __toESM(require("sharp"));
1060
-
1061
- // src/utils.ts
1062
- var import_os = require("os");
1063
- var import_path = require("path");
1064
- var import_fs = require("fs");
1065
- var import_crypto = require("crypto");
1066
- var import_assert = __toESM(require("assert"));
1039
+ var import_node_os = require("os");
1040
+ var import_node_path = require("path");
1067
1041
  var pkg;
1068
1042
  function getPkgInfo() {
1069
1043
  if (pkg) {
1070
1044
  return pkg;
1071
1045
  }
1072
1046
  let pkgJsonFile = "";
1073
- if ((0, import_fs.existsSync)((0, import_path.join)(__dirname, "../package.json"))) {
1074
- pkgJsonFile = (0, import_path.join)(__dirname, "../package.json");
1075
- } else if ((0, import_fs.existsSync)((0, import_path.join)(__dirname, "../../../package.json"))) {
1076
- pkgJsonFile = (0, import_path.join)(__dirname, "../../../package.json");
1047
+ if ((0, import_node_fs.existsSync)((0, import_node_path.join)(__dirname, "../package.json"))) {
1048
+ pkgJsonFile = (0, import_node_path.join)(__dirname, "../package.json");
1049
+ } else if ((0, import_node_fs.existsSync)((0, import_node_path.join)(__dirname, "../../../package.json"))) {
1050
+ pkgJsonFile = (0, import_node_path.join)(__dirname, "../../../package.json");
1077
1051
  }
1078
1052
  if (pkgJsonFile) {
1079
- const { name, version } = JSON.parse((0, import_fs.readFileSync)(pkgJsonFile, "utf-8"));
1053
+ const { name, version } = JSON.parse((0, import_node_fs.readFileSync)(pkgJsonFile, "utf-8"));
1080
1054
  pkg = { name, version };
1081
1055
  return pkg;
1082
- } else {
1083
- return {
1084
- name: "midscene-unknown-page-name",
1085
- version: "0.0.0"
1086
- };
1087
1056
  }
1057
+ return {
1058
+ name: "midscene-unknown-page-name",
1059
+ version: "0.0.0"
1060
+ };
1088
1061
  }
1089
- var logDir = (0, import_path.join)(process.cwd(), "./midscene_run/");
1062
+ var logDir = (0, import_node_path.join)(process.cwd(), "./midscene_run/");
1090
1063
  var logEnvReady = false;
1091
1064
  var insightDumpFileExt = "insight-dump.json";
1092
1065
  function getDumpDir() {
@@ -1096,27 +1069,27 @@ function setDumpDir(dir) {
1096
1069
  logDir = dir;
1097
1070
  }
1098
1071
  function getDumpDirPath(type) {
1099
- return (0, import_path.join)(getDumpDir(), type);
1072
+ return (0, import_node_path.join)(getDumpDir(), type);
1100
1073
  }
1101
1074
  function writeDumpFile(opts) {
1102
1075
  const { fileName, fileExt, fileContent, type = "dump" } = opts;
1103
1076
  const targetDir = getDumpDirPath(type);
1104
- if (!(0, import_fs.existsSync)(targetDir)) {
1105
- (0, import_fs.mkdirSync)(targetDir, { recursive: true });
1077
+ if (!(0, import_node_fs.existsSync)(targetDir)) {
1078
+ (0, import_node_fs.mkdirSync)(targetDir, { recursive: true });
1106
1079
  }
1107
1080
  if (!logEnvReady) {
1108
- (0, import_assert.default)(targetDir, "logDir should be set before writing dump file");
1109
- const gitIgnorePath = (0, import_path.join)(targetDir, "../../.gitignore");
1081
+ (0, import_node_assert.default)(targetDir, "logDir should be set before writing dump file");
1082
+ const gitIgnorePath = (0, import_node_path.join)(targetDir, "../../.gitignore");
1110
1083
  let gitIgnoreContent = "";
1111
- if ((0, import_fs.existsSync)(gitIgnorePath)) {
1112
- gitIgnoreContent = (0, import_fs.readFileSync)(gitIgnorePath, "utf-8");
1084
+ if ((0, import_node_fs.existsSync)(gitIgnorePath)) {
1085
+ gitIgnoreContent = (0, import_node_fs.readFileSync)(gitIgnorePath, "utf-8");
1113
1086
  }
1114
- const logDirName = (0, import_path.basename)(logDir);
1087
+ const logDirName = (0, import_node_path.basename)(logDir);
1115
1088
  if (!gitIgnoreContent.includes(`${logDirName}/`)) {
1116
- (0, import_fs.writeFileSync)(
1089
+ (0, import_node_fs.writeFileSync)(
1117
1090
  gitIgnorePath,
1118
1091
  `${gitIgnoreContent}
1119
- # MidScene.js dump files
1092
+ # Midscene.js dump files
1120
1093
  ${logDirName}/report
1121
1094
  ${logDirName}/dump-logger
1122
1095
  `,
@@ -1125,305 +1098,176 @@ ${logDirName}/dump-logger
1125
1098
  }
1126
1099
  logEnvReady = true;
1127
1100
  }
1128
- const filePath = (0, import_path.join)(targetDir, `${fileName}.${fileExt}`);
1129
- (0, import_fs.writeFileSync)(filePath, fileContent);
1101
+ const filePath = (0, import_node_path.join)(targetDir, `${fileName}.${fileExt}`);
1102
+ (0, import_node_fs.writeFileSync)(filePath, fileContent);
1130
1103
  if (type === "dump") {
1131
- (0, import_fs.copyFileSync)(filePath, (0, import_path.join)(targetDir, `latest.${fileExt}`));
1104
+ (0, import_node_fs.copyFileSync)(filePath, (0, import_node_path.join)(targetDir, `latest.${fileExt}`));
1132
1105
  }
1133
1106
  return filePath;
1134
1107
  }
1135
1108
 
1136
- // src/ai-model/prompt/util.ts
1137
- var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
1138
- var contextFormatIntro = `
1139
- The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
1140
- var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
1141
- var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
1142
- var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
1143
- var skillSegment = `skill name: segment_a_web_page
1144
- skill content:
1145
- Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
1146
- Some general rules for segmentation:
1147
- * Each section should NOT overlap with each other.
1148
- * Each text should only belong to one section.
1149
- * [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
1150
- * Analyze the page in a top-to-bottom and left-to-right order.
1151
- * The evidence indicates a separate section, for example
1152
- - The background color of certain parts of the page changes.
1153
- - A section of a page includes a title.
1154
- * Provide the following data for each of the UI section you found.
1155
- {
1156
- "name": "name of the section",
1157
- "description": "briefly summarize the key content or usage of this section.",
1158
- "sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
1159
- "textIds": ["5", "6", "7"], // ids of all text elements in this section
1109
+ // src/action/executor.ts
1110
+ var Executor = class {
1111
+ constructor(name, description, tasks) {
1112
+ __publicField(this, "name");
1113
+ __publicField(this, "description");
1114
+ __publicField(this, "tasks");
1115
+ // status of executor
1116
+ __publicField(this, "status");
1117
+ __publicField(this, "errorMsg");
1118
+ __publicField(this, "dumpFileName");
1119
+ this.status = tasks && tasks.length > 0 ? "pending" : "init";
1120
+ this.name = name;
1121
+ this.description = description;
1122
+ this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
1160
1123
  }
1161
- `;
1162
- var skillExtractData = `skill name: extract_data_from_UI
1163
- related input: DATA_DEMAND
1164
- skill content:
1165
- * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
1166
- * There may be some special commands in DATA_DEMAND, please pay extra attention
1167
- - ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
1168
- function promptsOfSectionQuery(constraints) {
1169
- if (!constraints.length) {
1170
- return "";
1124
+ markTaskAsPending(task) {
1125
+ return __spreadValues({
1126
+ status: "pending"
1127
+ }, task);
1171
1128
  }
1172
- const instruction = "Use your segment_a_web_page skill to find the following section(s)";
1173
- const singleSection = (c) => {
1174
- (0, import_assert2.default)(
1175
- c.name || c.description,
1176
- "either `name` or `description` is required to define a section constraint"
1129
+ async append(task) {
1130
+ (0, import_node_assert2.default)(
1131
+ this.status !== "error",
1132
+ "executor is in error state, cannot append task"
1177
1133
  );
1178
- const number = "One section";
1179
- const name = c.name ? `named \`${c.name}\`` : "";
1180
- const description = c.description ? `, usage or criteria : ${c.description}` : "";
1181
- const basic = `* ${number} ${name}${description}`;
1182
- return basic;
1183
- };
1184
- return `${instruction}
1185
- ${constraints.map(singleSection).join("\n")}`;
1186
- }
1187
- function systemPromptToExtract(dataQuery, sections) {
1188
- const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
1189
- const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
1190
- const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
1191
- return `
1192
- ${characteristic}
1193
- ${contextFormatIntro}
1194
-
1195
- You have the following skills:
1196
- ${allSectionNames.length ? skillSegment : ""}
1197
- ${skillExtractData}
1198
-
1199
- Now, do the following jobs:
1200
- ${sectionFindingPrompt}
1201
- Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
1202
- DATA_DEMAND start:
1203
- ${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
1204
- ${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
1205
- DATA_DEMAND ends.
1206
-
1207
- Return in the following JSON format:
1208
- {
1209
- language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
1210
- ${sectionReturnFormat}
1211
- data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
1212
- errors?: [], // string[], error message if any
1213
- }
1214
- `;
1215
- }
1216
- function describeSize(size) {
1217
- return `${size.width} x ${size.height}`;
1218
- }
1219
- function truncateText(text) {
1220
- const maxLength = 50;
1221
- if (text && text.length > maxLength) {
1222
- return `${text.slice(0, maxLength)}...`;
1223
- }
1224
- return text;
1225
- }
1226
- async function describeUserPage(context) {
1227
- const { screenshotBase64 } = context;
1228
- let width;
1229
- let height;
1230
- if (context.size) {
1231
- ({ width, height } = context.size);
1232
- } else {
1233
- const imgSize = await imageInfoOfBase64(screenshotBase64);
1234
- ({ width, height } = imgSize);
1235
- }
1236
- const elementsInfo = context.content;
1237
- const idElementMap = {};
1238
- elementsInfo.forEach((item) => {
1239
- idElementMap[item.id] = item;
1240
- return __spreadValues({}, item);
1241
- });
1242
- const elementInfosDescription = cropfieldInformation(elementsInfo);
1243
- return {
1244
- description: `
1245
- {
1246
- // The size of the page
1247
- "pageSize": ${describeSize({ width, height })},
1248
-
1249
-
1250
- // json description of the element
1251
- "elementInfos": ${JSON.stringify(elementInfosDescription)}
1252
- }`,
1253
- elementById(id) {
1254
- (0, import_assert2.default)(typeof id !== "undefined", "id is required for query");
1255
- const item = idElementMap[`${id}`];
1256
- return item;
1257
- }
1258
- };
1259
- }
1260
- function cropfieldInformation(elementsInfo) {
1261
- const elementInfosDescription = elementsInfo.map((item) => {
1262
- const { id, attributes = {}, rect, content } = item;
1263
- const tailorContent = truncateText(content);
1264
- const tailorAttributes = Object.keys(attributes).reduce((res, currentKey) => {
1265
- const attributeVal = attributes[currentKey];
1266
- res[currentKey] = truncateText(attributeVal);
1267
- return res;
1268
- }, {});
1269
- return {
1270
- id,
1271
- attributes: tailorAttributes,
1272
- rect,
1273
- content: tailorContent
1274
- };
1275
- });
1276
- return JSON.stringify(elementInfosDescription);
1277
- }
1278
- function retrieveElement(prompt, opt) {
1279
- if (opt == null ? void 0 : opt.multi) {
1280
- return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
1281
- }
1282
- return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
1283
- }
1284
- function ifElementTypeResponse(response) {
1285
- if (typeof response !== "string") {
1286
- return false;
1287
- }
1288
- return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
1289
- }
1290
- function splitElementResponse(response) {
1291
- const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
1292
- if (response.startsWith(oneElementSplitter)) {
1293
- const id = response.slice(oneElementSplitter.length);
1294
- if (id.indexOf(",") >= 0) {
1295
- console.warn(`unexpected comma in one element response: ${id}`);
1296
- }
1297
- return id ? id : null;
1298
- }
1299
- const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
1300
- if (response.startsWith(elementsSplitter)) {
1301
- const idsString = response.slice(elementsSplitter.length);
1302
- if (!idsString) {
1303
- return [];
1134
+ if (Array.isArray(task)) {
1135
+ this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
1136
+ } else {
1137
+ this.tasks.push(this.markTaskAsPending(task));
1304
1138
  }
1305
- return idsString.split(",");
1306
- }
1307
- return null;
1308
- }
1309
- function retrieveSection(prompt) {
1310
- return `${SECTION_MATCHER_FLAG}${prompt}`;
1311
- }
1312
- function extractSectionQuery(input) {
1313
- if (typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG)) {
1314
- return input.slice(SECTION_MATCHER_FLAG.length);
1315
- }
1316
- return false;
1317
- }
1318
-
1319
- // src/insight/utils.ts
1320
- var import_fs2 = require("fs");
1321
- var import_path2 = require("path");
1322
- var import_crypto2 = require("crypto");
1323
- var import_assert3 = __toESM(require("assert"));
1324
- var logFileName = "";
1325
- var logContent = [];
1326
- var logIdIndexMap = {};
1327
- var { pid } = process;
1328
- var logFileExt = insightDumpFileExt;
1329
- function writeInsightDump(data, logId, dumpSubscriber) {
1330
- const logDir2 = getDumpDir();
1331
- (0, import_assert3.default)(logDir2, "logDir should be set before writing dump file");
1332
- const id = logId || (0, import_crypto2.randomUUID)();
1333
- const baseData = {
1334
- sdkVersion: getPkgInfo().version,
1335
- logTime: Date.now()
1336
- };
1337
- const finalData = __spreadValues(__spreadValues({
1338
- logId: id
1339
- }, baseData), data);
1340
- dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
1341
- if (!logFileName) {
1342
- logFileName = `pid_${pid}_${baseData.logTime}`;
1343
- while ((0, import_fs2.existsSync)((0, import_path2.join)(logDir2, `${logFileName}.${logFileExt}`))) {
1344
- logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
1139
+ if (this.status !== "running") {
1140
+ this.status = "pending";
1345
1141
  }
1346
1142
  }
1347
- const dataString = JSON.stringify(finalData, null, 2);
1348
- if (typeof logIdIndexMap[id] === "number") {
1349
- logContent[logIdIndexMap[id]] = dataString;
1350
- } else {
1351
- const length = logContent.push(dataString);
1352
- logIdIndexMap[id] = length - 1;
1353
- }
1354
- writeDumpFile({
1355
- fileName: logFileName,
1356
- fileExt: logFileExt,
1357
- fileContent: `[
1358
- ${logContent.join(",\n")}
1359
- ]`
1360
- });
1361
- return id;
1362
- }
1363
- function idsIntoElements(ids, elementById) {
1364
- return ids.reduce((acc, id) => {
1365
- const element = elementById(id);
1366
- if (element) {
1367
- acc.push(element);
1368
- } else {
1369
- console.warn(`element not found by id: ${id}`);
1143
+ async flush() {
1144
+ if (this.status === "init" && this.tasks.length > 0) {
1145
+ console.warn(
1146
+ "illegal state for executor, status is init but tasks are not empty"
1147
+ );
1370
1148
  }
1371
- return acc;
1372
- }, []);
1373
- }
1374
- function shallowExpandIds(data, ifMeet, elementsById) {
1375
- const keys = Object.keys(data);
1376
- keys.forEach((key) => {
1377
- const value = data[key];
1378
- if (typeof value === "string" && ifMeet(value)) {
1379
- data[key] = elementsById(value);
1380
- } else if (Array.isArray(value)) {
1381
- const newValue = value.map((id) => ifMeet(id) ? elementsById(id) : id);
1382
- data[key] = newValue;
1149
+ (0, import_node_assert2.default)(this.status !== "running", "executor is already running");
1150
+ (0, import_node_assert2.default)(this.status !== "completed", "executor is already completed");
1151
+ (0, import_node_assert2.default)(this.status !== "error", "executor is in error state");
1152
+ const nextPendingIndex = this.tasks.findIndex(
1153
+ (task) => task.status === "pending"
1154
+ );
1155
+ if (nextPendingIndex < 0) {
1156
+ return;
1383
1157
  }
1384
- });
1385
- return data;
1386
- }
1387
- function expandLiteSection(liteSection, elementById) {
1388
- const _a = liteSection, { textIds } = _a, remainingFields = __objRest(_a, ["textIds"]);
1389
- const texts = idsIntoElements(textIds, elementById);
1390
- let leftMost = -1;
1391
- let topMost = -1;
1392
- let rightMost = -1;
1393
- let bottomMost = -1;
1394
- texts.forEach((text) => {
1395
- leftMost = leftMost === -1 ? text.rect.left : Math.min(leftMost, text.rect.left);
1396
- topMost = topMost === -1 ? text.rect.top : Math.min(topMost, text.rect.top);
1397
- rightMost = Math.max(rightMost, text.rect.left + text.rect.width);
1398
- bottomMost = Math.max(bottomMost, text.rect.top + text.rect.height);
1399
- });
1400
- const sectionRect = {
1401
- left: leftMost,
1402
- top: topMost,
1403
- width: rightMost - leftMost,
1404
- height: bottomMost - topMost
1405
- };
1406
- const section = __spreadProps(__spreadValues({}, remainingFields), {
1407
- content: texts,
1408
- rect: sectionRect
1409
- });
1410
- return section;
1411
- }
1158
+ this.status = "running";
1159
+ let taskIndex = nextPendingIndex;
1160
+ let successfullyCompleted = true;
1161
+ let errorMsg = "";
1162
+ let previousFindOutput;
1163
+ while (taskIndex < this.tasks.length) {
1164
+ const task = this.tasks[taskIndex];
1165
+ (0, import_node_assert2.default)(
1166
+ task.status === "pending",
1167
+ `task status should be pending, but got: ${task.status}`
1168
+ );
1169
+ task.timing = {
1170
+ start: Date.now()
1171
+ };
1172
+ try {
1173
+ task.status = "running";
1174
+ (0, import_node_assert2.default)(
1175
+ ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
1176
+ `unsupported task type: ${task.type}`
1177
+ );
1178
+ const { executor, param } = task;
1179
+ (0, import_node_assert2.default)(executor, `executor is required for task type: ${task.type}`);
1180
+ let returnValue;
1181
+ const executorContext = {
1182
+ task,
1183
+ element: previousFindOutput == null ? void 0 : previousFindOutput.element
1184
+ };
1185
+ if (task.type === "Insight") {
1186
+ (0, import_node_assert2.default)(
1187
+ task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
1188
+ `unsupported insight subType: ${task.subType}`
1189
+ );
1190
+ returnValue = await task.executor(param, executorContext);
1191
+ if (task.subType === "Locate") {
1192
+ previousFindOutput = returnValue == null ? void 0 : returnValue.output;
1193
+ }
1194
+ } else if (task.type === "Action" || task.type === "Planning") {
1195
+ returnValue = await task.executor(param, executorContext);
1196
+ } else {
1197
+ console.warn(
1198
+ `unsupported task type: ${task.type}, will try to execute it directly`
1199
+ );
1200
+ returnValue = await task.executor(param, executorContext);
1201
+ }
1202
+ Object.assign(task, returnValue);
1203
+ task.status = "success";
1204
+ task.timing.end = Date.now();
1205
+ task.timing.cost = task.timing.end - task.timing.start;
1206
+ taskIndex++;
1207
+ } catch (e) {
1208
+ successfullyCompleted = false;
1209
+ task.status = "fail";
1210
+ errorMsg = `${e == null ? void 0 : e.message}
1211
+ ${e == null ? void 0 : e.stack}`;
1212
+ task.error = errorMsg;
1213
+ task.timing.end = Date.now();
1214
+ task.timing.cost = task.timing.end - task.timing.start;
1215
+ this.errorMsg = errorMsg;
1216
+ break;
1217
+ }
1218
+ }
1219
+ for (let i = taskIndex + 1; i < this.tasks.length; i++) {
1220
+ this.tasks[i].status = "cancelled";
1221
+ }
1222
+ if (successfullyCompleted) {
1223
+ this.status = "completed";
1224
+ if (this.tasks.length) {
1225
+ return this.tasks[this.tasks.length - 1].output;
1226
+ }
1227
+ } else {
1228
+ this.status = "error";
1229
+ throw new Error(`executor failed: ${errorMsg}`);
1230
+ }
1231
+ }
1232
+ dump() {
1233
+ const dumpData = {
1234
+ sdkVersion: getPkgInfo().version,
1235
+ logTime: Date.now(),
1236
+ name: this.name,
1237
+ description: this.description,
1238
+ tasks: this.tasks
1239
+ };
1240
+ return dumpData;
1241
+ }
1242
+ };
1243
+
1244
+ // src/insight/index.ts
1245
+ var import_node_assert8 = __toESM(require("assert"));
1412
1246
 
1413
1247
  // src/ai-model/openai.ts
1414
- var import_assert4 = __toESM(require("assert"));
1415
- var import_openai = __toESM(require("openai"));
1248
+ var import_node_assert3 = __toESM(require("assert"));
1249
+
1250
+ // src/types.ts
1251
+ var BaseElement = class {
1252
+ };
1253
+ var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
1254
+ AIResponseFormat2["JSON"] = "json_object";
1255
+ AIResponseFormat2["TEXT"] = "text";
1256
+ return AIResponseFormat2;
1257
+ })(AIResponseFormat || {});
1258
+ var UIContext = class {
1259
+ };
1416
1260
 
1417
1261
  // ../../node_modules/.pnpm/langsmith@0.1.36_openai@4.47.1/node_modules/langsmith/dist/traceable.js
1418
1262
  var import_node_async_hooks = require("async_hooks");
1419
1263
 
1420
1264
  // ../../node_modules/.pnpm/uuid@9.0.1/node_modules/uuid/dist/esm-node/rng.js
1421
- var import_crypto3 = __toESM(require("crypto"));
1265
+ var import_crypto = __toESM(require("crypto"));
1422
1266
  var rnds8Pool = new Uint8Array(256);
1423
1267
  var poolPtr = rnds8Pool.length;
1424
1268
  function rng() {
1425
1269
  if (poolPtr > rnds8Pool.length - 16) {
1426
- import_crypto3.default.randomFillSync(rnds8Pool);
1270
+ import_crypto.default.randomFillSync(rnds8Pool);
1427
1271
  poolPtr = 0;
1428
1272
  }
1429
1273
  return rnds8Pool.slice(poolPtr, poolPtr += 16);
@@ -1448,9 +1292,9 @@ function unsafeStringify(arr, offset = 0) {
1448
1292
  }
1449
1293
 
1450
1294
  // ../../node_modules/.pnpm/uuid@9.0.1/node_modules/uuid/dist/esm-node/native.js
1451
- var import_crypto4 = __toESM(require("crypto"));
1295
+ var import_crypto2 = __toESM(require("crypto"));
1452
1296
  var native_default = {
1453
- randomUUID: import_crypto4.default.randomUUID
1297
+ randomUUID: import_crypto2.default.randomUUID
1454
1298
  };
1455
1299
 
1456
1300
  // ../../node_modules/.pnpm/uuid@9.0.1/node_modules/uuid/dist/esm-node/v4.js
@@ -4626,23 +4470,13 @@ var wrapOpenAI = (openai, options) => {
4626
4470
  return openai;
4627
4471
  };
4628
4472
 
4629
- // src/types.ts
4630
- var BaseElement = class {
4631
- };
4632
- var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
4633
- AIResponseFormat2["JSON"] = "json_object";
4634
- AIResponseFormat2["TEXT"] = "text";
4635
- return AIResponseFormat2;
4636
- })(AIResponseFormat || {});
4637
- var UIContext = class {
4638
- };
4639
-
4640
4473
  // src/ai-model/openai.ts
4474
+ var import_openai = __toESM(require("openai"));
4641
4475
  var envConfigKey = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
4642
4476
  var envModelKey = "MIDSCENE_MODEL_NAME";
4643
4477
  var envSmithDebug = "MIDSCENE_LANGSMITH_DEBUG";
4644
4478
  var extraConfig = {};
4645
- if (typeof process.env[envConfigKey] === "string") {
4479
+ if (typeof process.env[envConfigKey] === "string" && process.env[envConfigKey]) {
4646
4480
  console.log("config for openai loaded");
4647
4481
  extraConfig = JSON.parse(process.env[envConfigKey]);
4648
4482
  }
@@ -4668,12 +4502,12 @@ async function call(messages, responseFormat) {
4668
4502
  response_format: { type: responseFormat }
4669
4503
  });
4670
4504
  const { content } = completion.choices[0].message;
4671
- (0, import_assert4.default)(content, "empty content");
4505
+ (0, import_node_assert3.default)(content, "empty content");
4672
4506
  return content;
4673
4507
  }
4674
4508
  async function callToGetJSONObject(messages) {
4675
4509
  const response = await call(messages, "json_object" /* JSON */);
4676
- (0, import_assert4.default)(response, "empty response");
4510
+ (0, import_node_assert3.default)(response, "empty response");
4677
4511
  return JSON.parse(response);
4678
4512
  }
4679
4513
 
@@ -4711,9 +4545,9 @@ You are an expert in software page image (2D) and page element text analysis.
4711
4545
  "elements": [
4712
4546
  // If no matching elements are found, return an empty array []
4713
4547
  {
4714
- "reason": "xxx", // The thought process for finding the element, replace xxx with your thought process
4715
- "text": "xxx", // Replace xxx with the text of elementInfo, if none, leave empty
4716
- "id": "xxx" // Replace xxx with the ID of elementInfo
4548
+ "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
4549
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4550
+ "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID of elementInfo
4717
4551
  }
4718
4552
  // More elements...
4719
4553
  ],
@@ -4803,14 +4637,249 @@ Output Example:
4803
4637
  ],
4804
4638
  "errors": []
4805
4639
  }
4806
- \`\`\`
4807
-
4808
- `;
4640
+ \`\`\`
4641
+
4642
+ `;
4643
+ }
4644
+
4645
+ // src/ai-model/prompt/util.ts
4646
+ var import_node_assert5 = __toESM(require("assert"));
4647
+
4648
+ // src/image/info.ts
4649
+ var import_node_assert4 = __toESM(require("assert"));
4650
+ var import_node_buffer = require("buffer");
4651
+ var import_node_fs2 = require("fs");
4652
+ var import_sharp = __toESM(require("sharp"));
4653
+ async function imageInfo(image) {
4654
+ const { width, height } = await (0, import_sharp.default)(image).metadata();
4655
+ (0, import_node_assert4.default)(width && height, `invalid image: ${image}`);
4656
+ return { width, height };
4657
+ }
4658
+ async function imageInfoOfBase64(imageBase64) {
4659
+ const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
4660
+ return imageInfo(import_node_buffer.Buffer.from(base64Data, "base64"));
4661
+ }
4662
+
4663
+ // src/image/transform.ts
4664
+ var import_node_buffer2 = require("buffer");
4665
+ var import_sharp2 = __toESM(require("sharp"));
4666
+
4667
+ // src/image/visualization.ts
4668
+ var import_node_buffer3 = require("buffer");
4669
+ var import_sharp3 = __toESM(require("sharp"));
4670
+
4671
+ // src/ai-model/prompt/util.ts
4672
+ var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
4673
+ var contextFormatIntro = `
4674
+ The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
4675
+ var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
4676
+ var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
4677
+ var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
4678
+ var skillSegment = `skill name: segment_a_web_page
4679
+ skill content:
4680
+ Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
4681
+ Some general rules for segmentation:
4682
+ * Each section should NOT overlap with each other.
4683
+ * Each text should only belong to one section.
4684
+ * [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
4685
+ * Analyze the page in a top-to-bottom and left-to-right order.
4686
+ * The evidence indicates a separate section, for example
4687
+ - The background color of certain parts of the page changes.
4688
+ - A section of a page includes a title.
4689
+ * Provide the following data for each of the UI section you found.
4690
+ {
4691
+ "name": "name of the section",
4692
+ "description": "briefly summarize the key content or usage of this section.",
4693
+ "sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
4694
+ "textIds": ["5", "6", "7"], // ids of all text elements in this section
4695
+ }
4696
+ `;
4697
+ var skillExtractData = `skill name: extract_data_from_UI
4698
+ related input: DATA_DEMAND
4699
+ skill content:
4700
+ * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
4701
+ * There may be some special commands in DATA_DEMAND, please pay extra attention
4702
+ - ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
4703
+ function promptsOfSectionQuery(constraints) {
4704
+ if (!constraints.length) {
4705
+ return "";
4706
+ }
4707
+ const instruction = "Use your segment_a_web_page skill to find the following section(s)";
4708
+ const singleSection = (c) => {
4709
+ (0, import_node_assert5.default)(
4710
+ c.name || c.description,
4711
+ "either `name` or `description` is required to define a section constraint"
4712
+ );
4713
+ const number = "One section";
4714
+ const name = c.name ? `named \`${c.name}\`` : "";
4715
+ const description = c.description ? `, usage or criteria : ${c.description}` : "";
4716
+ const basic = `* ${number} ${name}${description}`;
4717
+ return basic;
4718
+ };
4719
+ return `${instruction}
4720
+ ${constraints.map(singleSection).join("\n")}`;
4721
+ }
4722
+ function systemPromptToExtract(dataQuery, sections) {
4723
+ const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
4724
+ const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
4725
+ const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
4726
+ return `
4727
+ ${characteristic}
4728
+ ${contextFormatIntro}
4729
+
4730
+ You have the following skills:
4731
+ ${allSectionNames.length ? skillSegment : ""}
4732
+ ${skillExtractData}
4733
+
4734
+ Now, do the following jobs:
4735
+ ${sectionFindingPrompt}
4736
+ Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
4737
+ DATA_DEMAND start:
4738
+ ${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
4739
+ ${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
4740
+ DATA_DEMAND ends.
4741
+
4742
+ Return in the following JSON format:
4743
+ {
4744
+ language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
4745
+ ${sectionReturnFormat}
4746
+ data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
4747
+ errors?: [], // string[], error message if any
4748
+ }
4749
+ `;
4750
+ }
4751
+ function systemPromptToAssert(assertion) {
4752
+ return `
4753
+ ${characteristic}
4754
+ ${contextFormatIntro}
4755
+
4756
+ Based on the information you get, assert the following:
4757
+ ${assertion}
4758
+
4759
+ Return in the following JSON format:
4760
+ {
4761
+ thought: string, // string, the thought of the assertion
4762
+ pass: true, // true or false, whether the assertion is passed
4763
+ }
4764
+ `;
4765
+ }
4766
+ function describeSize(size) {
4767
+ return `${size.width} x ${size.height}`;
4768
+ }
4769
+ function truncateText(text) {
4770
+ const maxLength = 50;
4771
+ if (text && text.length > maxLength) {
4772
+ return `${text.slice(0, maxLength)}...`;
4773
+ }
4774
+ return text;
4775
+ }
4776
+ async function describeUserPage(context) {
4777
+ const { screenshotBase64 } = context;
4778
+ let width;
4779
+ let height;
4780
+ if (context.size) {
4781
+ ({ width, height } = context.size);
4782
+ } else {
4783
+ const imgSize = await imageInfoOfBase64(screenshotBase64);
4784
+ ({ width, height } = imgSize);
4785
+ }
4786
+ const elementsInfo = context.content;
4787
+ const idElementMap = {};
4788
+ elementsInfo.forEach((item) => {
4789
+ idElementMap[item.id] = item;
4790
+ return __spreadValues({}, item);
4791
+ });
4792
+ const elementInfosDescription = cropfieldInformation(elementsInfo);
4793
+ return {
4794
+ description: `
4795
+ {
4796
+ // The size of the page
4797
+ "pageSize": ${describeSize({ width, height })},
4798
+
4799
+
4800
+ // json description of the element
4801
+ "elementInfos": ${JSON.stringify(elementInfosDescription)}
4802
+ }`,
4803
+ elementById(id) {
4804
+ (0, import_node_assert5.default)(typeof id !== "undefined", "id is required for query");
4805
+ const item = idElementMap[`${id}`];
4806
+ return item;
4807
+ }
4808
+ };
4809
+ }
4810
+ function cropfieldInformation(elementsInfo) {
4811
+ const elementInfosDescription = elementsInfo.map(
4812
+ (item) => {
4813
+ const { id, attributes = {}, rect, content } = item;
4814
+ const tailorContent = truncateText(content);
4815
+ const tailorAttributes = Object.keys(attributes).reduce(
4816
+ (res, currentKey) => {
4817
+ const attributeVal = attributes[currentKey];
4818
+ res[currentKey] = truncateText(attributeVal);
4819
+ return res;
4820
+ },
4821
+ {}
4822
+ );
4823
+ return {
4824
+ id,
4825
+ attributes: tailorAttributes,
4826
+ rect,
4827
+ content: tailorContent
4828
+ };
4829
+ }
4830
+ );
4831
+ return JSON.stringify(elementInfosDescription);
4832
+ }
4833
+ function retrieveElement(prompt, opt) {
4834
+ if (opt == null ? void 0 : opt.multi) {
4835
+ return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
4836
+ }
4837
+ return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
4838
+ }
4839
+ function ifElementTypeResponse(response) {
4840
+ if (typeof response !== "string") {
4841
+ return false;
4842
+ }
4843
+ return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
4844
+ }
4845
+ function splitElementResponse(response) {
4846
+ const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
4847
+ if (response.startsWith(oneElementSplitter)) {
4848
+ const id = response.slice(oneElementSplitter.length);
4849
+ if (id.indexOf(",") >= 0) {
4850
+ console.warn(`unexpected comma in one element response: ${id}`);
4851
+ }
4852
+ return id ? id : null;
4853
+ }
4854
+ const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
4855
+ if (response.startsWith(elementsSplitter)) {
4856
+ const idsString = response.slice(elementsSplitter.length);
4857
+ if (!idsString) {
4858
+ return [];
4859
+ }
4860
+ return idsString.split(",");
4861
+ }
4862
+ return null;
4863
+ }
4864
+ function retrieveSection(prompt) {
4865
+ return `${SECTION_MATCHER_FLAG}${prompt}`;
4866
+ }
4867
+ function extractSectionQuery(input) {
4868
+ if (typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG)) {
4869
+ return input.slice(SECTION_MATCHER_FLAG.length);
4870
+ }
4871
+ return false;
4809
4872
  }
4810
4873
 
4811
4874
  // src/ai-model/inspect.ts
4875
+ var import_node_assert6 = __toESM(require("assert"));
4812
4876
  async function AiInspectElement(options) {
4813
- const { context, multi, findElementDescription, callAI = callToGetJSONObject } = options;
4877
+ const {
4878
+ context,
4879
+ multi,
4880
+ findElementDescription,
4881
+ callAI = callToGetJSONObject
4882
+ } = options;
4814
4883
  const { screenshotBase64 } = context;
4815
4884
  const { description, elementById } = await describeUserPage(context);
4816
4885
  const systemPrompt = systemPromptToFindElement(findElementDescription, multi);
@@ -4836,12 +4905,16 @@ async function AiInspectElement(options) {
4836
4905
  const parseResult = await callAI(msgs);
4837
4906
  return {
4838
4907
  parseResult,
4839
- elementById,
4840
- systemPrompt
4908
+ elementById
4841
4909
  };
4842
4910
  }
4843
4911
  async function AiExtractElementInfo(options) {
4844
- const { dataQuery, sectionConstraints, context, callAI = callToGetJSONObject } = options;
4912
+ const {
4913
+ dataQuery,
4914
+ sectionConstraints,
4915
+ context,
4916
+ callAI = callToGetJSONObject
4917
+ } = options;
4845
4918
  const systemPrompt = systemPromptToExtract(dataQuery, sectionConstraints);
4846
4919
  const { screenshotBase64 } = context;
4847
4920
  const { description, elementById } = await describeUserPage(context);
@@ -4866,18 +4939,137 @@ async function AiExtractElementInfo(options) {
4866
4939
  const parseResult = await callAI(msgs);
4867
4940
  return {
4868
4941
  parseResult,
4869
- elementById,
4870
- systemPrompt
4942
+ elementById
4943
+ };
4944
+ }
4945
+ async function AiAssert(options) {
4946
+ const { assertion, context, callAI = callToGetJSONObject } = options;
4947
+ (0, import_node_assert6.default)(assertion, "assertion should be a string");
4948
+ const systemPrompt = systemPromptToAssert(assertion);
4949
+ const { screenshotBase64 } = context;
4950
+ const { description, elementById } = await describeUserPage(context);
4951
+ const msgs = [
4952
+ { role: "system", content: systemPrompt },
4953
+ {
4954
+ role: "user",
4955
+ content: [
4956
+ {
4957
+ type: "image_url",
4958
+ image_url: {
4959
+ url: screenshotBase64
4960
+ }
4961
+ },
4962
+ {
4963
+ type: "text",
4964
+ text: description
4965
+ }
4966
+ ]
4967
+ }
4968
+ ];
4969
+ const assertResult = await callAI(msgs);
4970
+ return assertResult;
4971
+ }
4972
+
4973
+ // src/insight/utils.ts
4974
+ var import_node_assert7 = __toESM(require("assert"));
4975
+ var import_node_crypto2 = require("crypto");
4976
+ var import_node_fs3 = require("fs");
4977
+ var import_node_path2 = require("path");
4978
+ var logFileName = "";
4979
+ var logContent = [];
4980
+ var logIdIndexMap = {};
4981
+ var { pid } = process;
4982
+ var logFileExt = insightDumpFileExt;
4983
+ function writeInsightDump(data, logId, dumpSubscriber) {
4984
+ const logDir2 = getDumpDir();
4985
+ (0, import_node_assert7.default)(logDir2, "logDir should be set before writing dump file");
4986
+ const id = logId || (0, import_node_crypto2.randomUUID)();
4987
+ const baseData = {
4988
+ sdkVersion: getPkgInfo().version,
4989
+ logTime: Date.now()
4990
+ };
4991
+ const finalData = __spreadValues(__spreadValues({
4992
+ logId: id
4993
+ }, baseData), data);
4994
+ dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
4995
+ if (!logFileName) {
4996
+ logFileName = `pid_${pid}_${baseData.logTime}`;
4997
+ while ((0, import_node_fs3.existsSync)((0, import_node_path2.join)(logDir2, `${logFileName}.${logFileExt}`))) {
4998
+ logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
4999
+ }
5000
+ }
5001
+ const dataString = JSON.stringify(finalData, null, 2);
5002
+ if (typeof logIdIndexMap[id] === "number") {
5003
+ logContent[logIdIndexMap[id]] = dataString;
5004
+ } else {
5005
+ const length = logContent.push(dataString);
5006
+ logIdIndexMap[id] = length - 1;
5007
+ }
5008
+ writeDumpFile({
5009
+ fileName: logFileName,
5010
+ fileExt: logFileExt,
5011
+ fileContent: `[
5012
+ ${logContent.join(",\n")}
5013
+ ]`
5014
+ });
5015
+ return id;
5016
+ }
5017
+ function idsIntoElements(ids, elementById) {
5018
+ return ids.reduce((acc, id) => {
5019
+ const element = elementById(id);
5020
+ if (element) {
5021
+ acc.push(element);
5022
+ } else {
5023
+ console.warn(`element not found by id: ${id}`);
5024
+ }
5025
+ return acc;
5026
+ }, []);
5027
+ }
5028
+ function shallowExpandIds(data, ifMeet, elementsById) {
5029
+ const keys = Object.keys(data);
5030
+ keys.forEach((key) => {
5031
+ const value = data[key];
5032
+ if (typeof value === "string" && ifMeet(value)) {
5033
+ data[key] = elementsById(value);
5034
+ } else if (Array.isArray(value)) {
5035
+ const newValue = value.map((id) => ifMeet(id) ? elementsById(id) : id);
5036
+ data[key] = newValue;
5037
+ }
5038
+ });
5039
+ return data;
5040
+ }
5041
+ function expandLiteSection(liteSection, elementById) {
5042
+ const _a = liteSection, { textIds } = _a, remainingFields = __objRest(_a, ["textIds"]);
5043
+ const texts = idsIntoElements(textIds, elementById);
5044
+ let leftMost = -1;
5045
+ let topMost = -1;
5046
+ let rightMost = -1;
5047
+ let bottomMost = -1;
5048
+ texts.forEach((text) => {
5049
+ leftMost = leftMost === -1 ? text.rect.left : Math.min(leftMost, text.rect.left);
5050
+ topMost = topMost === -1 ? text.rect.top : Math.min(topMost, text.rect.top);
5051
+ rightMost = Math.max(rightMost, text.rect.left + text.rect.width);
5052
+ bottomMost = Math.max(bottomMost, text.rect.top + text.rect.height);
5053
+ });
5054
+ const sectionRect = {
5055
+ left: leftMost,
5056
+ top: topMost,
5057
+ width: rightMost - leftMost,
5058
+ height: bottomMost - topMost
4871
5059
  };
5060
+ const section = __spreadProps(__spreadValues({}, remainingFields), {
5061
+ content: texts,
5062
+ rect: sectionRect
5063
+ });
5064
+ return section;
4872
5065
  }
4873
5066
 
4874
5067
  // src/insight/index.ts
4875
5068
  var sortByOrder = (a, b) => {
4876
5069
  if (a.rect.top - b.rect.top !== 0) {
4877
5070
  return a.rect.top - b.rect.top;
4878
- } else {
4879
- return a.rect.left - b.rect.left;
4880
5071
  }
5072
+ return a.rect.left - b.rect.left;
4881
5073
  };
4882
5074
  var Insight = class {
4883
5075
  constructor(context, opt) {
@@ -4885,7 +5077,7 @@ var Insight = class {
4885
5077
  __publicField(this, "aiVendorFn", callToGetJSONObject);
4886
5078
  __publicField(this, "onceDumpUpdatedFn");
4887
5079
  __publicField(this, "taskInfo");
4888
- (0, import_assert5.default)(context, "context is required for Insight");
5080
+ (0, import_node_assert8.default)(context, "context is required for Insight");
4889
5081
  if (typeof context === "function") {
4890
5082
  this.contextRetrieverFn = context;
4891
5083
  } else {
@@ -4901,12 +5093,12 @@ var Insight = class {
4901
5093
  async locate(queryPrompt, opt) {
4902
5094
  var _a;
4903
5095
  const { callAI = this.aiVendorFn, multi = false } = opt || {};
4904
- (0, import_assert5.default)(queryPrompt, "query is required for located");
5096
+ (0, import_node_assert8.default)(queryPrompt, "query is required for located");
4905
5097
  const dumpSubscriber = this.onceDumpUpdatedFn;
4906
5098
  this.onceDumpUpdatedFn = void 0;
4907
5099
  const context = await this.contextRetrieverFn();
4908
5100
  const startTime = Date.now();
4909
- const { parseResult, systemPrompt, elementById } = await AiInspectElement({
5101
+ const { parseResult, elementById } = await AiInspectElement({
4910
5102
  callAI,
4911
5103
  context,
4912
5104
  multi: Boolean(multi),
@@ -4915,8 +5107,7 @@ var Insight = class {
4915
5107
  const timeCost = Date.now() - startTime;
4916
5108
  const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
4917
5109
  durationMs: timeCost,
4918
- rawResponse: JSON.stringify(parseResult),
4919
- systemPrompt
5110
+ rawResponse: JSON.stringify(parseResult)
4920
5111
  });
4921
5112
  let errorLog;
4922
5113
  if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
@@ -4944,7 +5135,9 @@ ${parseResult.errors.join("\n")}`;
4944
5135
  parseResult.elements.forEach((item) => {
4945
5136
  const element = elementById(item.id);
4946
5137
  if (!element) {
4947
- console.warn(`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
5138
+ console.warn(
5139
+ `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
5140
+ );
4948
5141
  return;
4949
5142
  }
4950
5143
  elements.push(element);
@@ -4958,20 +5151,23 @@ ${parseResult.errors.join("\n")}`;
4958
5151
  );
4959
5152
  if (opt == null ? void 0 : opt.multi) {
4960
5153
  return elements;
4961
- } else if (elements.length >= 2) {
4962
- console.warn(`locate: multiple elements found, return the first one. (query: ${queryPrompt})`);
5154
+ }
5155
+ if (elements.length >= 2) {
5156
+ console.warn(
5157
+ `locate: multiple elements found, return the first one. (query: ${queryPrompt})`
5158
+ );
4963
5159
  return elements[0];
4964
- } else if (elements.length === 1) {
5160
+ }
5161
+ if (elements.length === 1) {
4965
5162
  return elements[0];
4966
- } else {
4967
- return null;
4968
5163
  }
5164
+ return null;
4969
5165
  }
4970
5166
  async extract(dataDemand) {
4971
5167
  var _a;
4972
5168
  let dataQuery = {};
4973
5169
  const sectionQueryMap = {};
4974
- (0, import_assert5.default)(
5170
+ (0, import_node_assert8.default)(
4975
5171
  typeof dataDemand === "object" || typeof dataDemand === "string",
4976
5172
  `dataDemand should be object or string, but get ${typeof dataDemand}`
4977
5173
  );
@@ -5000,7 +5196,7 @@ ${parseResult.errors.join("\n")}`;
5000
5196
  });
5001
5197
  const context = await this.contextRetrieverFn();
5002
5198
  const startTime = Date.now();
5003
- const { parseResult, systemPrompt, elementById } = await AiExtractElementInfo({
5199
+ const { parseResult, elementById } = await AiExtractElementInfo({
5004
5200
  context,
5005
5201
  dataQuery,
5006
5202
  sectionConstraints,
@@ -5009,8 +5205,7 @@ ${parseResult.errors.join("\n")}`;
5009
5205
  const timeCost = Date.now() - startTime;
5010
5206
  const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
5011
5207
  durationMs: timeCost,
5012
- rawResponse: JSON.stringify(parseResult),
5013
- systemPrompt
5208
+ rawResponse: JSON.stringify(parseResult)
5014
5209
  });
5015
5210
  let errorLog;
5016
5211
  if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
@@ -5035,7 +5230,10 @@ ${parseResult.errors.join("\n")}`;
5035
5230
  throw new Error(errorLog);
5036
5231
  }
5037
5232
  const sectionsArr = (parseResult.sections || []).map((liteSection) => {
5038
- const section = expandLiteSection(liteSection, (id) => elementById(id));
5233
+ const section = expandLiteSection(
5234
+ liteSection,
5235
+ (id) => elementById(id)
5236
+ );
5039
5237
  return section;
5040
5238
  }).sort(sortByOrder);
5041
5239
  const sectionMap = sectionsArr.reduce((acc, section) => {
@@ -5059,7 +5257,8 @@ ${parseResult.errors.join("\n")}`;
5059
5257
  const idList = splitElementResponse(id);
5060
5258
  if (typeof idList === "string") {
5061
5259
  return elementById(idList);
5062
- } else if (Array.isArray(idList)) {
5260
+ }
5261
+ if (Array.isArray(idList)) {
5063
5262
  return idsIntoElements(idList, elementById);
5064
5263
  }
5065
5264
  return idList;
@@ -5076,133 +5275,46 @@ ${parseResult.errors.join("\n")}`;
5076
5275
  );
5077
5276
  return mergedData;
5078
5277
  }
5079
- setAiVendorFn(aiVendorFn) {
5080
- const origin = this.aiVendorFn;
5081
- this.aiVendorFn = aiVendorFn;
5082
- return () => {
5083
- this.aiVendorFn = origin;
5084
- };
5085
- }
5086
- };
5087
-
5088
- // src/action/executor.ts
5089
- var import_assert6 = __toESM(require("assert"));
5090
- var Executor = class {
5091
- constructor(name, description, tasks) {
5092
- __publicField(this, "name");
5093
- __publicField(this, "description");
5094
- __publicField(this, "tasks");
5095
- // status of executor
5096
- __publicField(this, "status");
5097
- __publicField(this, "errorMsg");
5098
- __publicField(this, "dumpFileName");
5099
- this.status = tasks && tasks.length > 0 ? "pending" : "init";
5100
- this.name = name;
5101
- this.description = description;
5102
- this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
5103
- }
5104
- markTaskAsPending(task) {
5105
- return __spreadValues({
5106
- status: "pending"
5107
- }, task);
5108
- }
5109
- async append(task) {
5110
- (0, import_assert6.default)(this.status !== "error", "executor is in error state, cannot append task");
5111
- if (Array.isArray(task)) {
5112
- this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
5113
- } else {
5114
- this.tasks.push(this.markTaskAsPending(task));
5115
- }
5116
- if (this.status !== "running") {
5117
- this.status = "pending";
5118
- }
5119
- }
5120
- async flush() {
5121
- if (this.status === "init" && this.tasks.length > 0) {
5122
- console.warn("illegal state for executor, status is init but tasks are not empty");
5123
- }
5124
- (0, import_assert6.default)(this.status !== "running", "executor is already running");
5125
- (0, import_assert6.default)(this.status !== "completed", "executor is already completed");
5126
- (0, import_assert6.default)(this.status !== "error", "executor is in error state");
5127
- const nextPendingIndex = this.tasks.findIndex((task) => task.status === "pending");
5128
- if (nextPendingIndex < 0) {
5129
- return;
5130
- }
5131
- this.status = "running";
5132
- let taskIndex = nextPendingIndex;
5133
- let successfullyCompleted = true;
5134
- let errorMsg = "";
5135
- let previousFindOutput;
5136
- while (taskIndex < this.tasks.length) {
5137
- const task = this.tasks[taskIndex];
5138
- (0, import_assert6.default)(task.status === "pending", `task status should be pending, but got: ${task.status}`);
5139
- task.timing = {
5140
- start: Date.now()
5141
- };
5142
- try {
5143
- task.status = "running";
5144
- (0, import_assert6.default)(
5145
- ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
5146
- `unsupported task type: ${task.type}`
5147
- );
5148
- const { executor, param } = task;
5149
- (0, import_assert6.default)(executor, `executor is required for task type: ${task.type}`);
5150
- let returnValue;
5151
- const executorContext = {
5152
- task,
5153
- element: previousFindOutput == null ? void 0 : previousFindOutput.element
5154
- };
5155
- if (task.type === "Insight") {
5156
- (0, import_assert6.default)(
5157
- task.subType === "Locate" || task.subType === "Query",
5158
- `unsupported insight subType: ${task.subType}`
5159
- );
5160
- returnValue = await task.executor(param, executorContext);
5161
- if (task.subType === "Locate") {
5162
- previousFindOutput = returnValue == null ? void 0 : returnValue.output;
5163
- }
5164
- } else if (task.type === "Action" || task.type === "Planning") {
5165
- returnValue = await task.executor(param, executorContext);
5166
- } else {
5167
- console.warn(`unsupported task type: ${task.type}, will try to execute it directly`);
5168
- returnValue = await task.executor(param, executorContext);
5169
- }
5170
- Object.assign(task, returnValue);
5171
- task.status = "success";
5172
- task.timing.end = Date.now();
5173
- task.timing.cost = task.timing.end - task.timing.start;
5174
- taskIndex++;
5175
- } catch (e) {
5176
- successfullyCompleted = false;
5177
- task.status = "fail";
5178
- errorMsg = `${e == null ? void 0 : e.message}
5179
- ${e == null ? void 0 : e.stack}`;
5180
- task.error = errorMsg;
5181
- task.timing.end = Date.now();
5182
- task.timing.cost = task.timing.end - task.timing.start;
5183
- this.errorMsg = errorMsg;
5184
- break;
5185
- }
5186
- }
5187
- for (let i = taskIndex + 1; i < this.tasks.length; i++) {
5188
- this.tasks[i].status = "cancelled";
5189
- }
5190
- if (successfullyCompleted) {
5191
- this.status = "completed";
5192
- } else {
5193
- this.status = "error";
5194
- throw new Error(`executor failed: ${errorMsg}`);
5278
+ async assert(assertion) {
5279
+ if (typeof assertion !== "string") {
5280
+ throw new Error(
5281
+ "This is the assert method for Midscene, the first argument should be a string. If you want to use the assert method from Node.js, please import it from the Node.js assert module."
5282
+ );
5195
5283
  }
5196
- }
5197
- dump() {
5284
+ const dumpSubscriber = this.onceDumpUpdatedFn;
5285
+ this.onceDumpUpdatedFn = void 0;
5286
+ const context = await this.contextRetrieverFn();
5287
+ const startTime = Date.now();
5288
+ const assertResult = await AiAssert({
5289
+ assertion,
5290
+ callAI: this.aiVendorFn,
5291
+ context
5292
+ });
5293
+ const timeCost = Date.now() - startTime;
5294
+ const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
5295
+ durationMs: timeCost,
5296
+ rawResponse: JSON.stringify(assertResult)
5297
+ });
5298
+ const { thought, pass } = assertResult;
5198
5299
  const dumpData = {
5199
- sdkVersion: getPkgInfo().version,
5200
- logTime: Date.now(),
5201
- name: this.name,
5202
- description: this.description,
5203
- tasks: this.tasks
5300
+ type: "assert",
5301
+ context,
5302
+ userQuery: {
5303
+ assertion
5304
+ },
5305
+ matchedSection: [],
5306
+ matchedElement: [],
5307
+ data: null,
5308
+ taskInfo,
5309
+ assertionPass: pass,
5310
+ assertionThought: thought,
5311
+ error: pass ? void 0 : thought
5312
+ };
5313
+ writeInsightDump(dumpData, void 0, dumpSubscriber);
5314
+ return {
5315
+ pass,
5316
+ thought
5204
5317
  };
5205
- return dumpData;
5206
5318
  }
5207
5319
  };
5208
5320