@midscene/core 0.1.4 → 0.2.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/dist/es/index.js CHANGED
@@ -1008,41 +1008,21 @@ var require_dist = __commonJS({
1008
1008
  }
1009
1009
  });
1010
1010
 
1011
- // src/insight/index.ts
1012
- import assert6 from "assert";
1013
-
1014
- // src/ai-model/prompt/util.ts
1015
- import assert3 from "assert";
1016
-
1017
- // src/image/info.ts
1018
- import assert from "assert";
1019
- import { Buffer as Buffer2 } from "buffer";
1020
- import { readFileSync } from "fs";
1021
- import Sharp from "sharp";
1022
- async function imageInfo(image) {
1023
- const { width, height } = await Sharp(image).metadata();
1024
- assert(width && height, `invalid image: ${image}`);
1025
- return { width, height };
1026
- }
1027
- async function imageInfoOfBase64(imageBase64) {
1028
- const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
1029
- return imageInfo(Buffer2.from(base64Data, "base64"));
1030
- }
1031
-
1032
- // src/image/transform.ts
1033
- import { Buffer as Buffer3 } from "buffer";
1034
- import Sharp2 from "sharp";
1035
-
1036
- // src/image/visualization.ts
1037
- import { Buffer as Buffer4 } from "buffer";
1038
- import Sharp3 from "sharp";
1011
+ // src/action/executor.ts
1012
+ import assert2 from "assert";
1039
1013
 
1040
1014
  // src/utils.ts
1015
+ import assert from "assert";
1016
+ import { randomUUID } from "crypto";
1017
+ import {
1018
+ copyFileSync,
1019
+ existsSync,
1020
+ mkdirSync,
1021
+ readFileSync,
1022
+ writeFileSync
1023
+ } from "fs";
1041
1024
  import { tmpdir } from "os";
1042
1025
  import { basename, join } from "path";
1043
- import { copyFileSync, existsSync, mkdirSync, readFileSync as readFileSync2, writeFileSync } from "fs";
1044
- import { randomUUID } from "crypto";
1045
- import assert2 from "assert";
1046
1026
  var pkg;
1047
1027
  function getPkgInfo() {
1048
1028
  if (pkg) {
@@ -1055,15 +1035,14 @@ function getPkgInfo() {
1055
1035
  pkgJsonFile = join(__dirname, "../../../package.json");
1056
1036
  }
1057
1037
  if (pkgJsonFile) {
1058
- const { name, version } = JSON.parse(readFileSync2(pkgJsonFile, "utf-8"));
1038
+ const { name, version } = JSON.parse(readFileSync(pkgJsonFile, "utf-8"));
1059
1039
  pkg = { name, version };
1060
1040
  return pkg;
1061
- } else {
1062
- return {
1063
- name: "midscene-unknown-page-name",
1064
- version: "0.0.0"
1065
- };
1066
1041
  }
1042
+ return {
1043
+ name: "midscene-unknown-page-name",
1044
+ version: "0.0.0"
1045
+ };
1067
1046
  }
1068
1047
  var logDir = join(process.cwd(), "./midscene_run/");
1069
1048
  var logEnvReady = false;
@@ -1084,18 +1063,18 @@ function writeDumpFile(opts) {
1084
1063
  mkdirSync(targetDir, { recursive: true });
1085
1064
  }
1086
1065
  if (!logEnvReady) {
1087
- assert2(targetDir, "logDir should be set before writing dump file");
1066
+ assert(targetDir, "logDir should be set before writing dump file");
1088
1067
  const gitIgnorePath = join(targetDir, "../../.gitignore");
1089
1068
  let gitIgnoreContent = "";
1090
1069
  if (existsSync(gitIgnorePath)) {
1091
- gitIgnoreContent = readFileSync2(gitIgnorePath, "utf-8");
1070
+ gitIgnoreContent = readFileSync(gitIgnorePath, "utf-8");
1092
1071
  }
1093
1072
  const logDirName = basename(logDir);
1094
1073
  if (!gitIgnoreContent.includes(`${logDirName}/`)) {
1095
1074
  writeFileSync(
1096
1075
  gitIgnorePath,
1097
1076
  `${gitIgnoreContent}
1098
- # MidScene.js dump files
1077
+ # Midscene.js dump files
1099
1078
  ${logDirName}/report
1100
1079
  ${logDirName}/dump-logger
1101
1080
  `,
@@ -1112,286 +1091,157 @@ ${logDirName}/dump-logger
1112
1091
  return filePath;
1113
1092
  }
1114
1093
 
1115
- // src/ai-model/prompt/util.ts
1116
- var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
1117
- var contextFormatIntro = `
1118
- The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
1119
- var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
1120
- var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
1121
- var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
1122
- var skillSegment = `skill name: segment_a_web_page
1123
- skill content:
1124
- Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
1125
- Some general rules for segmentation:
1126
- * Each section should NOT overlap with each other.
1127
- * Each text should only belong to one section.
1128
- * [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
1129
- * Analyze the page in a top-to-bottom and left-to-right order.
1130
- * The evidence indicates a separate section, for example
1131
- - The background color of certain parts of the page changes.
1132
- - A section of a page includes a title.
1133
- * Provide the following data for each of the UI section you found.
1134
- {
1135
- "name": "name of the section",
1136
- "description": "briefly summarize the key content or usage of this section.",
1137
- "sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
1138
- "textIds": ["5", "6", "7"], // ids of all text elements in this section
1094
+ // src/action/executor.ts
1095
+ var Executor = class {
1096
+ constructor(name, description, tasks) {
1097
+ __publicField(this, "name");
1098
+ __publicField(this, "description");
1099
+ __publicField(this, "tasks");
1100
+ // status of executor
1101
+ __publicField(this, "status");
1102
+ __publicField(this, "errorMsg");
1103
+ __publicField(this, "dumpFileName");
1104
+ this.status = tasks && tasks.length > 0 ? "pending" : "init";
1105
+ this.name = name;
1106
+ this.description = description;
1107
+ this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
1139
1108
  }
1140
- `;
1141
- var skillExtractData = `skill name: extract_data_from_UI
1142
- related input: DATA_DEMAND
1143
- skill content:
1144
- * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
1145
- * There may be some special commands in DATA_DEMAND, please pay extra attention
1146
- - ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
1147
- function promptsOfSectionQuery(constraints) {
1148
- if (!constraints.length) {
1149
- return "";
1109
+ markTaskAsPending(task) {
1110
+ return __spreadValues({
1111
+ status: "pending"
1112
+ }, task);
1150
1113
  }
1151
- const instruction = "Use your segment_a_web_page skill to find the following section(s)";
1152
- const singleSection = (c) => {
1153
- assert3(
1154
- c.name || c.description,
1155
- "either `name` or `description` is required to define a section constraint"
1114
+ async append(task) {
1115
+ assert2(
1116
+ this.status !== "error",
1117
+ "executor is in error state, cannot append task"
1156
1118
  );
1157
- const number = "One section";
1158
- const name = c.name ? `named \`${c.name}\`` : "";
1159
- const description = c.description ? `, usage or criteria : ${c.description}` : "";
1160
- const basic = `* ${number} ${name}${description}`;
1161
- return basic;
1162
- };
1163
- return `${instruction}
1164
- ${constraints.map(singleSection).join("\n")}`;
1165
- }
1166
- function systemPromptToExtract(dataQuery, sections) {
1167
- const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
1168
- const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
1169
- const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
1170
- return `
1171
- ${characteristic}
1172
- ${contextFormatIntro}
1173
-
1174
- You have the following skills:
1175
- ${allSectionNames.length ? skillSegment : ""}
1176
- ${skillExtractData}
1177
-
1178
- Now, do the following jobs:
1179
- ${sectionFindingPrompt}
1180
- Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
1181
- DATA_DEMAND start:
1182
- ${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
1183
- ${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
1184
- DATA_DEMAND ends.
1185
-
1186
- Return in the following JSON format:
1187
- {
1188
- language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
1189
- ${sectionReturnFormat}
1190
- data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
1191
- errors?: [], // string[], error message if any
1192
- }
1193
- `;
1194
- }
1195
- function describeSize(size) {
1196
- return `${size.width} x ${size.height}`;
1197
- }
1198
- function truncateText(text) {
1199
- const maxLength = 50;
1200
- if (text && text.length > maxLength) {
1201
- return `${text.slice(0, maxLength)}...`;
1202
- }
1203
- return text;
1204
- }
1205
- async function describeUserPage(context) {
1206
- const { screenshotBase64 } = context;
1207
- let width;
1208
- let height;
1209
- if (context.size) {
1210
- ({ width, height } = context.size);
1211
- } else {
1212
- const imgSize = await imageInfoOfBase64(screenshotBase64);
1213
- ({ width, height } = imgSize);
1214
- }
1215
- const elementsInfo = context.content;
1216
- const idElementMap = {};
1217
- elementsInfo.forEach((item) => {
1218
- idElementMap[item.id] = item;
1219
- return __spreadValues({}, item);
1220
- });
1221
- const elementInfosDescription = cropfieldInformation(elementsInfo);
1222
- return {
1223
- description: `
1224
- {
1225
- // The size of the page
1226
- "pageSize": ${describeSize({ width, height })},
1227
-
1228
-
1229
- // json description of the element
1230
- "elementInfos": ${JSON.stringify(elementInfosDescription)}
1231
- }`,
1232
- elementById(id) {
1233
- assert3(typeof id !== "undefined", "id is required for query");
1234
- const item = idElementMap[`${id}`];
1235
- return item;
1236
- }
1237
- };
1238
- }
1239
- function cropfieldInformation(elementsInfo) {
1240
- const elementInfosDescription = elementsInfo.map((item) => {
1241
- const { id, attributes = {}, rect, content } = item;
1242
- const tailorContent = truncateText(content);
1243
- const tailorAttributes = Object.keys(attributes).reduce((res, currentKey) => {
1244
- const attributeVal = attributes[currentKey];
1245
- res[currentKey] = truncateText(attributeVal);
1246
- return res;
1247
- }, {});
1248
- return {
1249
- id,
1250
- attributes: tailorAttributes,
1251
- rect,
1252
- content: tailorContent
1253
- };
1254
- });
1255
- return JSON.stringify(elementInfosDescription);
1256
- }
1257
- function retrieveElement(prompt, opt) {
1258
- if (opt == null ? void 0 : opt.multi) {
1259
- return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
1260
- }
1261
- return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
1262
- }
1263
- function ifElementTypeResponse(response) {
1264
- if (typeof response !== "string") {
1265
- return false;
1266
- }
1267
- return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
1268
- }
1269
- function splitElementResponse(response) {
1270
- const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
1271
- if (response.startsWith(oneElementSplitter)) {
1272
- const id = response.slice(oneElementSplitter.length);
1273
- if (id.indexOf(",") >= 0) {
1274
- console.warn(`unexpected comma in one element response: ${id}`);
1275
- }
1276
- return id ? id : null;
1277
- }
1278
- const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
1279
- if (response.startsWith(elementsSplitter)) {
1280
- const idsString = response.slice(elementsSplitter.length);
1281
- if (!idsString) {
1282
- return [];
1119
+ if (Array.isArray(task)) {
1120
+ this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
1121
+ } else {
1122
+ this.tasks.push(this.markTaskAsPending(task));
1283
1123
  }
1284
- return idsString.split(",");
1285
- }
1286
- return null;
1287
- }
1288
- function retrieveSection(prompt) {
1289
- return `${SECTION_MATCHER_FLAG}${prompt}`;
1290
- }
1291
- function extractSectionQuery(input) {
1292
- if (typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG)) {
1293
- return input.slice(SECTION_MATCHER_FLAG.length);
1294
- }
1295
- return false;
1296
- }
1297
-
1298
- // src/insight/utils.ts
1299
- import { existsSync as existsSync2 } from "fs";
1300
- import { join as join2 } from "path";
1301
- import { randomUUID as randomUUID2 } from "crypto";
1302
- import assert4 from "assert";
1303
- var logFileName = "";
1304
- var logContent = [];
1305
- var logIdIndexMap = {};
1306
- var { pid } = process;
1307
- var logFileExt = insightDumpFileExt;
1308
- function writeInsightDump(data, logId, dumpSubscriber) {
1309
- const logDir2 = getDumpDir();
1310
- assert4(logDir2, "logDir should be set before writing dump file");
1311
- const id = logId || randomUUID2();
1312
- const baseData = {
1313
- sdkVersion: getPkgInfo().version,
1314
- logTime: Date.now()
1315
- };
1316
- const finalData = __spreadValues(__spreadValues({
1317
- logId: id
1318
- }, baseData), data);
1319
- dumpSubscriber == null ? void 0 : dumpSubscriber(finalData);
1320
- if (!logFileName) {
1321
- logFileName = `pid_${pid}_${baseData.logTime}`;
1322
- while (existsSync2(join2(logDir2, `${logFileName}.${logFileExt}`))) {
1323
- logFileName = `${pid}_${baseData.logTime}-${Math.random()}`;
1124
+ if (this.status !== "running") {
1125
+ this.status = "pending";
1324
1126
  }
1325
1127
  }
1326
- const dataString = JSON.stringify(finalData, null, 2);
1327
- if (typeof logIdIndexMap[id] === "number") {
1328
- logContent[logIdIndexMap[id]] = dataString;
1329
- } else {
1330
- const length = logContent.push(dataString);
1331
- logIdIndexMap[id] = length - 1;
1332
- }
1333
- writeDumpFile({
1334
- fileName: logFileName,
1335
- fileExt: logFileExt,
1336
- fileContent: `[
1337
- ${logContent.join(",\n")}
1338
- ]`
1339
- });
1340
- return id;
1341
- }
1342
- function idsIntoElements(ids, elementById) {
1343
- return ids.reduce((acc, id) => {
1344
- const element = elementById(id);
1345
- if (element) {
1346
- acc.push(element);
1347
- } else {
1348
- console.warn(`element not found by id: ${id}`);
1128
+ async flush() {
1129
+ if (this.status === "init" && this.tasks.length > 0) {
1130
+ console.warn(
1131
+ "illegal state for executor, status is init but tasks are not empty"
1132
+ );
1349
1133
  }
1350
- return acc;
1351
- }, []);
1352
- }
1353
- function shallowExpandIds(data, ifMeet, elementsById) {
1354
- const keys = Object.keys(data);
1355
- keys.forEach((key) => {
1356
- const value = data[key];
1357
- if (typeof value === "string" && ifMeet(value)) {
1358
- data[key] = elementsById(value);
1359
- } else if (Array.isArray(value)) {
1360
- const newValue = value.map((id) => ifMeet(id) ? elementsById(id) : id);
1361
- data[key] = newValue;
1134
+ assert2(this.status !== "running", "executor is already running");
1135
+ assert2(this.status !== "completed", "executor is already completed");
1136
+ assert2(this.status !== "error", "executor is in error state");
1137
+ const nextPendingIndex = this.tasks.findIndex(
1138
+ (task) => task.status === "pending"
1139
+ );
1140
+ if (nextPendingIndex < 0) {
1141
+ return;
1362
1142
  }
1363
- });
1364
- return data;
1365
- }
1366
- function expandLiteSection(liteSection, elementById) {
1367
- const _a = liteSection, { textIds } = _a, remainingFields = __objRest(_a, ["textIds"]);
1368
- const texts = idsIntoElements(textIds, elementById);
1369
- let leftMost = -1;
1370
- let topMost = -1;
1371
- let rightMost = -1;
1372
- let bottomMost = -1;
1373
- texts.forEach((text) => {
1374
- leftMost = leftMost === -1 ? text.rect.left : Math.min(leftMost, text.rect.left);
1375
- topMost = topMost === -1 ? text.rect.top : Math.min(topMost, text.rect.top);
1376
- rightMost = Math.max(rightMost, text.rect.left + text.rect.width);
1377
- bottomMost = Math.max(bottomMost, text.rect.top + text.rect.height);
1378
- });
1379
- const sectionRect = {
1380
- left: leftMost,
1381
- top: topMost,
1382
- width: rightMost - leftMost,
1383
- height: bottomMost - topMost
1384
- };
1385
- const section = __spreadProps(__spreadValues({}, remainingFields), {
1386
- content: texts,
1387
- rect: sectionRect
1388
- });
1389
- return section;
1390
- }
1143
+ this.status = "running";
1144
+ let taskIndex = nextPendingIndex;
1145
+ let successfullyCompleted = true;
1146
+ let errorMsg = "";
1147
+ let previousFindOutput;
1148
+ while (taskIndex < this.tasks.length) {
1149
+ const task = this.tasks[taskIndex];
1150
+ assert2(
1151
+ task.status === "pending",
1152
+ `task status should be pending, but got: ${task.status}`
1153
+ );
1154
+ task.timing = {
1155
+ start: Date.now()
1156
+ };
1157
+ try {
1158
+ task.status = "running";
1159
+ assert2(
1160
+ ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
1161
+ `unsupported task type: ${task.type}`
1162
+ );
1163
+ const { executor, param } = task;
1164
+ assert2(executor, `executor is required for task type: ${task.type}`);
1165
+ let returnValue;
1166
+ const executorContext = {
1167
+ task,
1168
+ element: previousFindOutput == null ? void 0 : previousFindOutput.element
1169
+ };
1170
+ if (task.type === "Insight") {
1171
+ assert2(
1172
+ task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
1173
+ `unsupported insight subType: ${task.subType}`
1174
+ );
1175
+ returnValue = await task.executor(param, executorContext);
1176
+ if (task.subType === "Locate") {
1177
+ previousFindOutput = returnValue == null ? void 0 : returnValue.output;
1178
+ }
1179
+ } else if (task.type === "Action" || task.type === "Planning") {
1180
+ returnValue = await task.executor(param, executorContext);
1181
+ } else {
1182
+ console.warn(
1183
+ `unsupported task type: ${task.type}, will try to execute it directly`
1184
+ );
1185
+ returnValue = await task.executor(param, executorContext);
1186
+ }
1187
+ Object.assign(task, returnValue);
1188
+ task.status = "success";
1189
+ task.timing.end = Date.now();
1190
+ task.timing.cost = task.timing.end - task.timing.start;
1191
+ taskIndex++;
1192
+ } catch (e) {
1193
+ successfullyCompleted = false;
1194
+ task.status = "fail";
1195
+ errorMsg = `${e == null ? void 0 : e.message}
1196
+ ${e == null ? void 0 : e.stack}`;
1197
+ task.error = errorMsg;
1198
+ task.timing.end = Date.now();
1199
+ task.timing.cost = task.timing.end - task.timing.start;
1200
+ this.errorMsg = errorMsg;
1201
+ break;
1202
+ }
1203
+ }
1204
+ for (let i = taskIndex + 1; i < this.tasks.length; i++) {
1205
+ this.tasks[i].status = "cancelled";
1206
+ }
1207
+ if (successfullyCompleted) {
1208
+ this.status = "completed";
1209
+ if (this.tasks.length) {
1210
+ return this.tasks[this.tasks.length - 1].output;
1211
+ }
1212
+ } else {
1213
+ this.status = "error";
1214
+ throw new Error(`executor failed: ${errorMsg}`);
1215
+ }
1216
+ }
1217
+ dump() {
1218
+ const dumpData = {
1219
+ sdkVersion: getPkgInfo().version,
1220
+ logTime: Date.now(),
1221
+ name: this.name,
1222
+ description: this.description,
1223
+ tasks: this.tasks
1224
+ };
1225
+ return dumpData;
1226
+ }
1227
+ };
1228
+
1229
+ // src/insight/index.ts
1230
+ import assert8 from "assert";
1391
1231
 
1392
1232
  // src/ai-model/openai.ts
1393
- import assert5 from "assert";
1394
- import OpenAI from "openai";
1233
+ import assert3 from "assert";
1234
+
1235
+ // src/types.ts
1236
+ var BaseElement = class {
1237
+ };
1238
+ var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
1239
+ AIResponseFormat2["JSON"] = "json_object";
1240
+ AIResponseFormat2["TEXT"] = "text";
1241
+ return AIResponseFormat2;
1242
+ })(AIResponseFormat || {});
1243
+ var UIContext = class {
1244
+ };
1395
1245
 
1396
1246
  // ../../node_modules/.pnpm/langsmith@0.1.36_openai@4.47.1/node_modules/langsmith/dist/traceable.js
1397
1247
  import { AsyncLocalStorage } from "async_hooks";
@@ -4605,23 +4455,13 @@ var wrapOpenAI = (openai, options) => {
4605
4455
  return openai;
4606
4456
  };
4607
4457
 
4608
- // src/types.ts
4609
- var BaseElement = class {
4610
- };
4611
- var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
4612
- AIResponseFormat2["JSON"] = "json_object";
4613
- AIResponseFormat2["TEXT"] = "text";
4614
- return AIResponseFormat2;
4615
- })(AIResponseFormat || {});
4616
- var UIContext = class {
4617
- };
4618
-
4619
4458
  // src/ai-model/openai.ts
4459
+ import OpenAI from "openai";
4620
4460
  var envConfigKey = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
4621
4461
  var envModelKey = "MIDSCENE_MODEL_NAME";
4622
4462
  var envSmithDebug = "MIDSCENE_LANGSMITH_DEBUG";
4623
4463
  var extraConfig = {};
4624
- if (typeof process.env[envConfigKey] === "string") {
4464
+ if (typeof process.env[envConfigKey] === "string" && process.env[envConfigKey]) {
4625
4465
  console.log("config for openai loaded");
4626
4466
  extraConfig = JSON.parse(process.env[envConfigKey]);
4627
4467
  }
@@ -4647,12 +4487,12 @@ async function call(messages, responseFormat) {
4647
4487
  response_format: { type: responseFormat }
4648
4488
  });
4649
4489
  const { content } = completion.choices[0].message;
4650
- assert5(content, "empty content");
4490
+ assert3(content, "empty content");
4651
4491
  return content;
4652
4492
  }
4653
4493
  async function callToGetJSONObject(messages) {
4654
4494
  const response = await call(messages, "json_object" /* JSON */);
4655
- assert5(response, "empty response");
4495
+ assert3(response, "empty response");
4656
4496
  return JSON.parse(response);
4657
4497
  }
4658
4498
 
@@ -4690,9 +4530,9 @@ You are an expert in software page image (2D) and page element text analysis.
4690
4530
  "elements": [
4691
4531
  // If no matching elements are found, return an empty array []
4692
4532
  {
4693
- "reason": "xxx", // The thought process for finding the element, replace xxx with your thought process
4694
- "text": "xxx", // Replace xxx with the text of elementInfo, if none, leave empty
4695
- "id": "xxx" // Replace xxx with the ID of elementInfo
4533
+ "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
4534
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4535
+ "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID of elementInfo
4696
4536
  }
4697
4537
  // More elements...
4698
4538
  ],
@@ -4787,9 +4627,244 @@ Output Example:
4787
4627
  `;
4788
4628
  }
4789
4629
 
4630
+ // src/ai-model/prompt/util.ts
4631
+ import assert5 from "assert";
4632
+
4633
+ // src/image/info.ts
4634
+ import assert4 from "assert";
4635
+ import { Buffer as Buffer2 } from "buffer";
4636
+ import { readFileSync as readFileSync2 } from "fs";
4637
+ import Sharp from "sharp";
4638
+ async function imageInfo(image) {
4639
+ const { width, height } = await Sharp(image).metadata();
4640
+ assert4(width && height, `invalid image: ${image}`);
4641
+ return { width, height };
4642
+ }
4643
+ async function imageInfoOfBase64(imageBase64) {
4644
+ const base64Data = imageBase64.replace(/^data:image\/\w+;base64,/, "");
4645
+ return imageInfo(Buffer2.from(base64Data, "base64"));
4646
+ }
4647
+
4648
+ // src/image/transform.ts
4649
+ import { Buffer as Buffer3 } from "buffer";
4650
+ import Sharp2 from "sharp";
4651
+
4652
+ // src/image/visualization.ts
4653
+ import { Buffer as Buffer4 } from "buffer";
4654
+ import Sharp3 from "sharp";
4655
+
4656
+ // src/ai-model/prompt/util.ts
4657
+ var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
4658
+ var contextFormatIntro = `
4659
+ The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
4660
+ var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
4661
+ var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
4662
+ var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
4663
+ var skillSegment = `skill name: segment_a_web_page
4664
+ skill content:
4665
+ Based on the functions and content of various elements on the page, segment the screenshot into different sections like navigation bar, product list, news area, etc.
4666
+ Some general rules for segmentation:
4667
+ * Each section should NOT overlap with each other.
4668
+ * Each text should only belong to one section.
4669
+ * [IMPORTANT] Whether the content visually appears to belong to different sections is a significant factor in segmenting the page.
4670
+ * Analyze the page in a top-to-bottom and left-to-right order.
4671
+ * The evidence indicates a separate section, for example
4672
+ - The background color of certain parts of the page changes.
4673
+ - A section of a page includes a title.
4674
+ * Provide the following data for each of the UI section you found.
4675
+ {
4676
+ "name": "name of the section",
4677
+ "description": "briefly summarize the key content or usage of this section.",
4678
+ "sectionCharacteristics": "In view of the need to distinguish this section from the surrounding sections, explain the characteristics and how to define boundaries and what precautions to take.",
4679
+ "textIds": ["5", "6", "7"], // ids of all text elements in this section
4680
+ }
4681
+ `;
4682
+ var skillExtractData = `skill name: extract_data_from_UI
4683
+ related input: DATA_DEMAND
4684
+ skill content:
4685
+ * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
4686
+ * There may be some special commands in DATA_DEMAND, please pay extra attention
4687
+ - ${ONE_ELEMENT_LOCATOR_PREFIX} and ${ELEMENTS_LOCATOR_PREFIX}: if you see a description that mentions the keyword ${ONE_ELEMENT_LOCATOR_PREFIX} or ${ELEMENTS_LOCATOR_PREFIX}(e.g. follow ${ONE_ELEMENT_LOCATOR_PREFIX} : i want to find ...), it means user wants to locate a specific element meets the description. Return in this way: prefix + the id / comma-separated ids, for example: ${ONE_ELEMENT_LOCATOR_PREFIX}/1 , ${ELEMENTS_LOCATOR_PREFIX}/1,2,3 . If not found, keep the prefix and leave the suffix empty, like ${ONE_ELEMENT_LOCATOR_PREFIX}/ .`;
4688
+ function promptsOfSectionQuery(constraints) {
4689
+ if (!constraints.length) {
4690
+ return "";
4691
+ }
4692
+ const instruction = "Use your segment_a_web_page skill to find the following section(s)";
4693
+ const singleSection = (c) => {
4694
+ assert5(
4695
+ c.name || c.description,
4696
+ "either `name` or `description` is required to define a section constraint"
4697
+ );
4698
+ const number = "One section";
4699
+ const name = c.name ? `named \`${c.name}\`` : "";
4700
+ const description = c.description ? `, usage or criteria : ${c.description}` : "";
4701
+ const basic = `* ${number} ${name}${description}`;
4702
+ return basic;
4703
+ };
4704
+ return `${instruction}
4705
+ ${constraints.map(singleSection).join("\n")}`;
4706
+ }
4707
+ function systemPromptToExtract(dataQuery, sections) {
4708
+ const allSectionNames = (sections == null ? void 0 : sections.filter((c) => c.name).map((c) => c.name || "")) || [];
4709
+ const sectionFindingPrompt = promptsOfSectionQuery(sections || []);
4710
+ const sectionReturnFormat = allSectionNames.length ? " sections: [], // detailed information of each section from segment_a_web_page skill" : "";
4711
+ return `
4712
+ ${characteristic}
4713
+ ${contextFormatIntro}
4714
+
4715
+ You have the following skills:
4716
+ ${allSectionNames.length ? skillSegment : ""}
4717
+ ${skillExtractData}
4718
+
4719
+ Now, do the following jobs:
4720
+ ${sectionFindingPrompt}
4721
+ Use your extract_data_from_UI skill to find the following data, placing it in the \`data\` field
4722
+ DATA_DEMAND start:
4723
+ ${typeof dataQuery === "object" ? `return in key-value style object, keys are ${Object.keys(dataQuery).join(",")}` : ""};
4724
+ ${typeof dataQuery === "string" ? dataQuery : JSON.stringify(dataQuery, null, 2)}
4725
+ DATA_DEMAND ends.
4726
+
4727
+ Return in the following JSON format:
4728
+ {
4729
+ language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
4730
+ ${sectionReturnFormat}
4731
+ data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
4732
+ errors?: [], // string[], error message if any
4733
+ }
4734
+ `;
4735
+ }
4736
+ function systemPromptToAssert(assertion) {
4737
+ return `
4738
+ ${characteristic}
4739
+ ${contextFormatIntro}
4740
+
4741
+ Based on the information you get, assert the following:
4742
+ ${assertion}
4743
+
4744
+ Return in the following JSON format:
4745
+ {
4746
+ thought: string, // string, the thought of the assertion
4747
+ pass: true, // true or false, whether the assertion is passed
4748
+ }
4749
+ `;
4750
+ }
4751
+ function describeSize(size) {
4752
+ return `${size.width} x ${size.height}`;
4753
+ }
4754
+ function truncateText(text) {
4755
+ const maxLength = 50;
4756
+ if (text && text.length > maxLength) {
4757
+ return `${text.slice(0, maxLength)}...`;
4758
+ }
4759
+ return text;
4760
+ }
4761
+ async function describeUserPage(context) {
4762
+ const { screenshotBase64 } = context;
4763
+ let width;
4764
+ let height;
4765
+ if (context.size) {
4766
+ ({ width, height } = context.size);
4767
+ } else {
4768
+ const imgSize = await imageInfoOfBase64(screenshotBase64);
4769
+ ({ width, height } = imgSize);
4770
+ }
4771
+ const elementsInfo = context.content;
4772
+ const idElementMap = {};
4773
+ elementsInfo.forEach((item) => {
4774
+ idElementMap[item.id] = item;
4775
+ return __spreadValues({}, item);
4776
+ });
4777
+ const elementInfosDescription = cropfieldInformation(elementsInfo);
4778
+ return {
4779
+ description: `
4780
+ {
4781
+ // The size of the page
4782
+ "pageSize": ${describeSize({ width, height })},
4783
+
4784
+
4785
+ // json description of the element
4786
+ "elementInfos": ${JSON.stringify(elementInfosDescription)}
4787
+ }`,
4788
+ elementById(id) {
4789
+ assert5(typeof id !== "undefined", "id is required for query");
4790
+ const item = idElementMap[`${id}`];
4791
+ return item;
4792
+ }
4793
+ };
4794
+ }
4795
+ function cropfieldInformation(elementsInfo) {
4796
+ const elementInfosDescription = elementsInfo.map(
4797
+ (item) => {
4798
+ const { id, attributes = {}, rect, content } = item;
4799
+ const tailorContent = truncateText(content);
4800
+ const tailorAttributes = Object.keys(attributes).reduce(
4801
+ (res, currentKey) => {
4802
+ const attributeVal = attributes[currentKey];
4803
+ res[currentKey] = truncateText(attributeVal);
4804
+ return res;
4805
+ },
4806
+ {}
4807
+ );
4808
+ return {
4809
+ id,
4810
+ attributes: tailorAttributes,
4811
+ rect,
4812
+ content: tailorContent
4813
+ };
4814
+ }
4815
+ );
4816
+ return JSON.stringify(elementInfosDescription);
4817
+ }
4818
+ function retrieveElement(prompt, opt) {
4819
+ if (opt == null ? void 0 : opt.multi) {
4820
+ return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
4821
+ }
4822
+ return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
4823
+ }
4824
+ function ifElementTypeResponse(response) {
4825
+ if (typeof response !== "string") {
4826
+ return false;
4827
+ }
4828
+ return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
4829
+ }
4830
+ function splitElementResponse(response) {
4831
+ const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
4832
+ if (response.startsWith(oneElementSplitter)) {
4833
+ const id = response.slice(oneElementSplitter.length);
4834
+ if (id.indexOf(",") >= 0) {
4835
+ console.warn(`unexpected comma in one element response: ${id}`);
4836
+ }
4837
+ return id ? id : null;
4838
+ }
4839
+ const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
4840
+ if (response.startsWith(elementsSplitter)) {
4841
+ const idsString = response.slice(elementsSplitter.length);
4842
+ if (!idsString) {
4843
+ return [];
4844
+ }
4845
+ return idsString.split(",");
4846
+ }
4847
+ return null;
4848
+ }
4849
+ function retrieveSection(prompt) {
4850
+ return `${SECTION_MATCHER_FLAG}${prompt}`;
4851
+ }
4852
+ function extractSectionQuery(input) {
4853
+ if (typeof input === "string" && input.startsWith(SECTION_MATCHER_FLAG)) {
4854
+ return input.slice(SECTION_MATCHER_FLAG.length);
4855
+ }
4856
+ return false;
4857
+ }
4858
+
4790
4859
  // src/ai-model/inspect.ts
4860
+ import assert6 from "assert";
4791
4861
  async function AiInspectElement(options) {
4792
- const { context, multi, findElementDescription, callAI = callToGetJSONObject } = options;
4862
+ const {
4863
+ context,
4864
+ multi,
4865
+ findElementDescription,
4866
+ callAI = callToGetJSONObject
4867
+ } = options;
4793
4868
  const { screenshotBase64 } = context;
4794
4869
  const { description, elementById } = await describeUserPage(context);
4795
4870
  const systemPrompt = systemPromptToFindElement(findElementDescription, multi);
@@ -4815,12 +4890,16 @@ async function AiInspectElement(options) {
4815
4890
  const parseResult = await callAI(msgs);
4816
4891
  return {
4817
4892
  parseResult,
4818
- elementById,
4819
- systemPrompt
4893
+ elementById
4820
4894
  };
4821
4895
  }
4822
4896
  async function AiExtractElementInfo(options) {
4823
- const { dataQuery, sectionConstraints, context, callAI = callToGetJSONObject } = options;
4897
+ const {
4898
+ dataQuery,
4899
+ sectionConstraints,
4900
+ context,
4901
+ callAI = callToGetJSONObject
4902
+ } = options;
4824
4903
  const systemPrompt = systemPromptToExtract(dataQuery, sectionConstraints);
4825
4904
  const { screenshotBase64 } = context;
4826
4905
  const { description, elementById } = await describeUserPage(context);
@@ -4845,18 +4924,137 @@ async function AiExtractElementInfo(options) {
4845
4924
  const parseResult = await callAI(msgs);
4846
4925
  return {
4847
4926
  parseResult,
4848
- elementById,
4849
- systemPrompt
4927
+ elementById
4928
+ };
4929
+ }
4930
/**
 * Ask the AI model whether a natural-language assertion holds for the page.
 *
 * Sends a system prompt built from the assertion, followed by one user message
 * holding the screenshot and the textual page description.
 *
 * @param {object} options
 * @param {string} options.assertion - Non-empty statement to verify.
 * @param {object} options.context - Page context; `screenshotBase64` is read
 *   here and the whole object is passed to `describeUserPage`.
 * @param {Function} [options.callAI=callToGetJSONObject] - Receives the message
 *   list and resolves with the model's answer.
 * @returns {Promise<*>} Whatever `callAI` resolves with (the caller in this
 *   file reads `pass` and `thought` from it).
 * @throws {AssertionError} When `assertion` is not a non-empty string.
 */
async function AiAssert(options) {
  const { assertion, context, callAI = callToGetJSONObject } = options;
  // The message promises a string, so check the type and not only truthiness.
  assert6(
    typeof assertion === "string" && assertion.length > 0,
    "assertion should be a string"
  );
  const systemPrompt = systemPromptToAssert(assertion);
  const { screenshotBase64 } = context;
  const { description } = await describeUserPage(context);
  const msgs = [
    { role: "system", content: systemPrompt },
    {
      role: "user",
      content: [
        {
          type: "image_url",
          image_url: {
            url: screenshotBase64
          }
        },
        {
          type: "text",
          text: description
        }
      ]
    }
  ];
  return callAI(msgs);
}
4957
+
4958
+ // src/insight/utils.ts
4959
+ import assert7 from "assert";
4960
+ import { randomUUID as randomUUID2 } from "crypto";
4961
+ import { existsSync as existsSync2 } from "fs";
4962
+ import { join as join2 } from "path";
4963
// Module-level state shared by every writeInsightDump call in this process.

// Base name of the dump file; empty until the first dump picks one.
var logFileName = "";
// Serialized dump entries, in the order their log ids were first seen.
var logContent = [];
// Maps a log id to the index of its entry inside logContent.
var logIdIndexMap = {};
// Current process id, embedded in the dump file name.
var pid = process.pid;
// Extension used for the dump file.
var logFileExt = insightDumpFileExt;
4968
/**
 * Record one insight dump entry and rewrite the dump file for this process.
 *
 * The entry is `data` merged over `{ logId, sdkVersion, logTime }`. Entries are
 * kept in module-level state: writing again with a known log id replaces that
 * entry, otherwise the entry is appended. After every call the full list is
 * handed to `writeDumpFile` as a JSON array.
 *
 * @param {object} data - Dump payload; its keys override the base fields.
 * @param {string} [logId] - Id of an entry to update; a random UUID is
 *   generated when omitted or empty.
 * @param {Function} [dumpSubscriber] - Called with the merged entry before the
 *   file is written.
 * @returns {string} The log id used for this entry.
 * @throws {AssertionError} When `getDumpDir()` returns a falsy value.
 */
function writeInsightDump(data, logId, dumpSubscriber) {
  const logDir2 = getDumpDir();
  assert7(logDir2, "logDir should be set before writing dump file");
  const id = logId || randomUUID2();
  const baseData = {
    sdkVersion: getPkgInfo().version,
    logTime: Date.now()
  };
  const finalData = __spreadValues(__spreadValues({
    logId: id
  }, baseData), data);
  if (dumpSubscriber != null) {
    dumpSubscriber(finalData);
  }
  if (!logFileName) {
    // The file name is chosen once per process. On a collision a random suffix
    // is appended while the `pid_` prefix is kept, so every dump file follows
    // the same naming scheme.
    const baseName = `pid_${pid}_${baseData.logTime}`;
    logFileName = baseName;
    while (existsSync2(join2(logDir2, `${logFileName}.${logFileExt}`))) {
      logFileName = `${baseName}-${Math.random()}`;
    }
  }
  const dataString = JSON.stringify(finalData, null, 2);
  if (typeof logIdIndexMap[id] === "number") {
    // Known id: overwrite the earlier entry in place.
    logContent[logIdIndexMap[id]] = dataString;
  } else {
    const length = logContent.push(dataString);
    logIdIndexMap[id] = length - 1;
  }
  writeDumpFile({
    fileName: logFileName,
    fileExt: logFileExt,
    fileContent: `[\n${logContent.join(",\n")}\n]`
  });
  return id;
}
5002
/**
 * Resolve a list of element ids into the elements they refer to.
 *
 * Ids with no matching element are skipped with a warning, so the result may
 * be shorter than the input. A missing list (null/undefined — for example when
 * the model response omits the field) is treated as empty instead of throwing.
 *
 * @param {Array<string>|null|undefined} ids - Element ids to resolve.
 * @param {Function} elementById - Lookup returning the element for an id, or a
 *   falsy value when there is none.
 * @returns {Array} The resolved elements, in input order.
 */
function idsIntoElements(ids, elementById) {
  if (ids == null) {
    return [];
  }
  const elements = [];
  ids.forEach((id) => {
    const element = elementById(id);
    if (element) {
      elements.push(element);
    } else {
      console.warn(`element not found by id: ${id}`);
    }
  });
  return elements;
}
5013
/**
 * Replace id placeholders found at the top level of `data` with resolved values.
 *
 * Only one level is inspected: a string property that satisfies `ifMeet` is
 * replaced by `elementsById(value)`, and each item of an array property is
 * replaced the same way when it satisfies `ifMeet`. Other properties are left
 * alone. `data` is modified in place.
 *
 * @param {object} data - Object whose own enumerable properties are scanned.
 * @param {Function} ifMeet - Predicate deciding whether a value is a placeholder.
 * @param {Function} elementsById - Resolver called with each matching value.
 * @returns {object} The same `data` object.
 */
function shallowExpandIds(data, ifMeet, elementsById) {
  const expandItem = (item) => ifMeet(item) ? elementsById(item) : item;
  for (const key of Object.keys(data)) {
    const current = data[key];
    if (Array.isArray(current)) {
      data[key] = current.map((item) => expandItem(item));
      continue;
    }
    if (typeof current === "string" && ifMeet(current)) {
      data[key] = elementsById(current);
    }
  }
  return data;
}
5026
/**
 * Expand a lightweight section (text ids only) into a full section object.
 *
 * The `textIds` are resolved into elements through `idsIntoElements`, and the
 * section rectangle is the bounding box of those elements' `rect`s. Every other
 * field of `liteSection` is copied through unchanged.
 *
 * The bounding box is built from true minima/maxima, so elements with negative
 * coordinates are handled correctly. When no text resolves, the rectangle is
 * `{ left: -1, top: -1, width: 0, height: 0 }`.
 *
 * @param {object} liteSection - Section carrying `textIds` plus arbitrary fields.
 * @param {Function} elementById - Lookup passed on to `idsIntoElements`.
 * @returns {object} The remaining fields plus `content` (resolved elements)
 *   and `rect` (bounding box).
 */
function expandLiteSection(liteSection, elementById) {
  const _a = liteSection, { textIds } = _a, remainingFields = __objRest(_a, ["textIds"]);
  const texts = idsIntoElements(textIds, elementById);
  let leftMost = Infinity;
  let topMost = Infinity;
  let rightMost = -Infinity;
  let bottomMost = -Infinity;
  texts.forEach((text) => {
    const { left, top, width, height } = text.rect;
    leftMost = Math.min(leftMost, left);
    topMost = Math.min(topMost, top);
    rightMost = Math.max(rightMost, left + width);
    bottomMost = Math.max(bottomMost, top + height);
  });
  // With nothing resolved there is no box to measure; use the fixed placeholder.
  const sectionRect = texts.length > 0 ? {
    left: leftMost,
    top: topMost,
    width: rightMost - leftMost,
    height: bottomMost - topMost
  } : {
    left: -1,
    top: -1,
    width: 0,
    height: 0
  };
  const section = __spreadProps(__spreadValues({}, remainingFields), {
    content: texts,
    rect: sectionRect
  });
  return section;
}
4852
5051
 
4853
5052
  // src/insight/index.ts
4854
5053
  var sortByOrder = (a, b) => {
4855
5054
  if (a.rect.top - b.rect.top !== 0) {
4856
5055
  return a.rect.top - b.rect.top;
4857
- } else {
4858
- return a.rect.left - b.rect.left;
4859
5056
  }
5057
+ return a.rect.left - b.rect.left;
4860
5058
  };
4861
5059
  var Insight = class {
4862
5060
  constructor(context, opt) {
@@ -4864,7 +5062,7 @@ var Insight = class {
4864
5062
  __publicField(this, "aiVendorFn", callToGetJSONObject);
4865
5063
  __publicField(this, "onceDumpUpdatedFn");
4866
5064
  __publicField(this, "taskInfo");
4867
- assert6(context, "context is required for Insight");
5065
+ assert8(context, "context is required for Insight");
4868
5066
  if (typeof context === "function") {
4869
5067
  this.contextRetrieverFn = context;
4870
5068
  } else {
@@ -4880,12 +5078,12 @@ var Insight = class {
4880
5078
  async locate(queryPrompt, opt) {
4881
5079
  var _a;
4882
5080
  const { callAI = this.aiVendorFn, multi = false } = opt || {};
4883
- assert6(queryPrompt, "query is required for located");
5081
+ assert8(queryPrompt, "query is required for located");
4884
5082
  const dumpSubscriber = this.onceDumpUpdatedFn;
4885
5083
  this.onceDumpUpdatedFn = void 0;
4886
5084
  const context = await this.contextRetrieverFn();
4887
5085
  const startTime = Date.now();
4888
- const { parseResult, systemPrompt, elementById } = await AiInspectElement({
5086
+ const { parseResult, elementById } = await AiInspectElement({
4889
5087
  callAI,
4890
5088
  context,
4891
5089
  multi: Boolean(multi),
@@ -4894,8 +5092,7 @@ var Insight = class {
4894
5092
  const timeCost = Date.now() - startTime;
4895
5093
  const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
4896
5094
  durationMs: timeCost,
4897
- rawResponse: JSON.stringify(parseResult),
4898
- systemPrompt
5095
+ rawResponse: JSON.stringify(parseResult)
4899
5096
  });
4900
5097
  let errorLog;
4901
5098
  if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
@@ -4923,7 +5120,9 @@ ${parseResult.errors.join("\n")}`;
4923
5120
  parseResult.elements.forEach((item) => {
4924
5121
  const element = elementById(item.id);
4925
5122
  if (!element) {
4926
- console.warn(`locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`);
5123
+ console.warn(
5124
+ `locate: cannot find element id=${item.id}. Maybe an unstable response from AI model`
5125
+ );
4927
5126
  return;
4928
5127
  }
4929
5128
  elements.push(element);
@@ -4937,20 +5136,23 @@ ${parseResult.errors.join("\n")}`;
4937
5136
  );
4938
5137
  if (opt == null ? void 0 : opt.multi) {
4939
5138
  return elements;
4940
- } else if (elements.length >= 2) {
4941
- console.warn(`locate: multiple elements found, return the first one. (query: ${queryPrompt})`);
5139
+ }
5140
+ if (elements.length >= 2) {
5141
+ console.warn(
5142
+ `locate: multiple elements found, return the first one. (query: ${queryPrompt})`
5143
+ );
4942
5144
  return elements[0];
4943
- } else if (elements.length === 1) {
5145
+ }
5146
+ if (elements.length === 1) {
4944
5147
  return elements[0];
4945
- } else {
4946
- return null;
4947
5148
  }
5149
+ return null;
4948
5150
  }
4949
5151
  async extract(dataDemand) {
4950
5152
  var _a;
4951
5153
  let dataQuery = {};
4952
5154
  const sectionQueryMap = {};
4953
- assert6(
5155
+ assert8(
4954
5156
  typeof dataDemand === "object" || typeof dataDemand === "string",
4955
5157
  `dataDemand should be object or string, but get ${typeof dataDemand}`
4956
5158
  );
@@ -4979,7 +5181,7 @@ ${parseResult.errors.join("\n")}`;
4979
5181
  });
4980
5182
  const context = await this.contextRetrieverFn();
4981
5183
  const startTime = Date.now();
4982
- const { parseResult, systemPrompt, elementById } = await AiExtractElementInfo({
5184
+ const { parseResult, elementById } = await AiExtractElementInfo({
4983
5185
  context,
4984
5186
  dataQuery,
4985
5187
  sectionConstraints,
@@ -4988,8 +5190,7 @@ ${parseResult.errors.join("\n")}`;
4988
5190
  const timeCost = Date.now() - startTime;
4989
5191
  const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
4990
5192
  durationMs: timeCost,
4991
- rawResponse: JSON.stringify(parseResult),
4992
- systemPrompt
5193
+ rawResponse: JSON.stringify(parseResult)
4993
5194
  });
4994
5195
  let errorLog;
4995
5196
  if ((_a = parseResult.errors) == null ? void 0 : _a.length) {
@@ -5014,7 +5215,10 @@ ${parseResult.errors.join("\n")}`;
5014
5215
  throw new Error(errorLog);
5015
5216
  }
5016
5217
  const sectionsArr = (parseResult.sections || []).map((liteSection) => {
5017
- const section = expandLiteSection(liteSection, (id) => elementById(id));
5218
+ const section = expandLiteSection(
5219
+ liteSection,
5220
+ (id) => elementById(id)
5221
+ );
5018
5222
  return section;
5019
5223
  }).sort(sortByOrder);
5020
5224
  const sectionMap = sectionsArr.reduce((acc, section) => {
@@ -5038,7 +5242,8 @@ ${parseResult.errors.join("\n")}`;
5038
5242
  const idList = splitElementResponse(id);
5039
5243
  if (typeof idList === "string") {
5040
5244
  return elementById(idList);
5041
- } else if (Array.isArray(idList)) {
5245
+ }
5246
+ if (Array.isArray(idList)) {
5042
5247
  return idsIntoElements(idList, elementById);
5043
5248
  }
5044
5249
  return idList;
@@ -5055,133 +5260,46 @@ ${parseResult.errors.join("\n")}`;
5055
5260
  );
5056
5261
  return mergedData;
5057
5262
  }
5058
- setAiVendorFn(aiVendorFn) {
5059
- const origin = this.aiVendorFn;
5060
- this.aiVendorFn = aiVendorFn;
5061
- return () => {
5062
- this.aiVendorFn = origin;
5063
- };
5064
- }
5065
- };
5066
-
5067
- // src/action/executor.ts
5068
- import assert7 from "assert";
5069
- var Executor = class {
5070
- constructor(name, description, tasks) {
5071
- __publicField(this, "name");
5072
- __publicField(this, "description");
5073
- __publicField(this, "tasks");
5074
- // status of executor
5075
- __publicField(this, "status");
5076
- __publicField(this, "errorMsg");
5077
- __publicField(this, "dumpFileName");
5078
- this.status = tasks && tasks.length > 0 ? "pending" : "init";
5079
- this.name = name;
5080
- this.description = description;
5081
- this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
5082
- }
5083
- markTaskAsPending(task) {
5084
- return __spreadValues({
5085
- status: "pending"
5086
- }, task);
5087
- }
5088
- async append(task) {
5089
- assert7(this.status !== "error", "executor is in error state, cannot append task");
5090
- if (Array.isArray(task)) {
5091
- this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
5092
- } else {
5093
- this.tasks.push(this.markTaskAsPending(task));
5094
- }
5095
- if (this.status !== "running") {
5096
- this.status = "pending";
5097
- }
5098
- }
5099
- async flush() {
5100
- if (this.status === "init" && this.tasks.length > 0) {
5101
- console.warn("illegal state for executor, status is init but tasks are not empty");
5102
- }
5103
- assert7(this.status !== "running", "executor is already running");
5104
- assert7(this.status !== "completed", "executor is already completed");
5105
- assert7(this.status !== "error", "executor is in error state");
5106
- const nextPendingIndex = this.tasks.findIndex((task) => task.status === "pending");
5107
- if (nextPendingIndex < 0) {
5108
- return;
5109
- }
5110
- this.status = "running";
5111
- let taskIndex = nextPendingIndex;
5112
- let successfullyCompleted = true;
5113
- let errorMsg = "";
5114
- let previousFindOutput;
5115
- while (taskIndex < this.tasks.length) {
5116
- const task = this.tasks[taskIndex];
5117
- assert7(task.status === "pending", `task status should be pending, but got: ${task.status}`);
5118
- task.timing = {
5119
- start: Date.now()
5120
- };
5121
- try {
5122
- task.status = "running";
5123
- assert7(
5124
- ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
5125
- `unsupported task type: ${task.type}`
5126
- );
5127
- const { executor, param } = task;
5128
- assert7(executor, `executor is required for task type: ${task.type}`);
5129
- let returnValue;
5130
- const executorContext = {
5131
- task,
5132
- element: previousFindOutput == null ? void 0 : previousFindOutput.element
5133
- };
5134
- if (task.type === "Insight") {
5135
- assert7(
5136
- task.subType === "Locate" || task.subType === "Query",
5137
- `unsupported insight subType: ${task.subType}`
5138
- );
5139
- returnValue = await task.executor(param, executorContext);
5140
- if (task.subType === "Locate") {
5141
- previousFindOutput = returnValue == null ? void 0 : returnValue.output;
5142
- }
5143
- } else if (task.type === "Action" || task.type === "Planning") {
5144
- returnValue = await task.executor(param, executorContext);
5145
- } else {
5146
- console.warn(`unsupported task type: ${task.type}, will try to execute it directly`);
5147
- returnValue = await task.executor(param, executorContext);
5148
- }
5149
- Object.assign(task, returnValue);
5150
- task.status = "success";
5151
- task.timing.end = Date.now();
5152
- task.timing.cost = task.timing.end - task.timing.start;
5153
- taskIndex++;
5154
- } catch (e) {
5155
- successfullyCompleted = false;
5156
- task.status = "fail";
5157
- errorMsg = `${e == null ? void 0 : e.message}
5158
- ${e == null ? void 0 : e.stack}`;
5159
- task.error = errorMsg;
5160
- task.timing.end = Date.now();
5161
- task.timing.cost = task.timing.end - task.timing.start;
5162
- this.errorMsg = errorMsg;
5163
- break;
5164
- }
5165
- }
5166
- for (let i = taskIndex + 1; i < this.tasks.length; i++) {
5167
- this.tasks[i].status = "cancelled";
5168
- }
5169
- if (successfullyCompleted) {
5170
- this.status = "completed";
5171
- } else {
5172
- this.status = "error";
5173
- throw new Error(`executor failed: ${errorMsg}`);
5263
+ async assert(assertion) {
5264
+ if (typeof assertion !== "string") {
5265
+ throw new Error(
5266
+ "This is the assert method for Midscene, the first argument should be a string. If you want to use the assert method from Node.js, please import it from the Node.js assert module."
5267
+ );
5174
5268
  }
5175
- }
5176
- dump() {
5269
+ const dumpSubscriber = this.onceDumpUpdatedFn;
5270
+ this.onceDumpUpdatedFn = void 0;
5271
+ const context = await this.contextRetrieverFn();
5272
+ const startTime = Date.now();
5273
+ const assertResult = await AiAssert({
5274
+ assertion,
5275
+ callAI: this.aiVendorFn,
5276
+ context
5277
+ });
5278
+ const timeCost = Date.now() - startTime;
5279
+ const taskInfo = __spreadProps(__spreadValues({}, this.taskInfo ? this.taskInfo : {}), {
5280
+ durationMs: timeCost,
5281
+ rawResponse: JSON.stringify(assertResult)
5282
+ });
5283
+ const { thought, pass } = assertResult;
5177
5284
  const dumpData = {
5178
- sdkVersion: getPkgInfo().version,
5179
- logTime: Date.now(),
5180
- name: this.name,
5181
- description: this.description,
5182
- tasks: this.tasks
5285
+ type: "assert",
5286
+ context,
5287
+ userQuery: {
5288
+ assertion
5289
+ },
5290
+ matchedSection: [],
5291
+ matchedElement: [],
5292
+ data: null,
5293
+ taskInfo,
5294
+ assertionPass: pass,
5295
+ assertionThought: thought,
5296
+ error: pass ? void 0 : thought
5297
+ };
5298
+ writeInsightDump(dumpData, void 0, dumpSubscriber);
5299
+ return {
5300
+ pass,
5301
+ thought
5183
5302
  };
5184
- return dumpData;
5185
5303
  }
5186
5304
  };
5187
5305