@midscene/core 0.8.4 → 0.8.5-beta-20241126063126.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/lib/ai-model.js +147 -92
- package/dist/lib/env.js +102 -0
- package/dist/lib/index.js +904 -843
- package/dist/lib/types/ai-model.d.ts +7 -3
- package/dist/lib/types/env.d.ts +48 -0
- package/dist/lib/types/{index-690c2a06.d.ts → index-41db6188.d.ts} +3 -36
- package/dist/lib/types/index.d.ts +6 -5
- package/dist/lib/types/{types-29994b1b.d.ts → types-0d8eeece.d.ts} +3 -1
- package/dist/lib/types/utils.d.ts +3 -3
- package/dist/lib/utils.js +15 -30
- package/package.json +6 -2
- package/report/index.html +2 -2
package/dist/lib/index.js
CHANGED
|
@@ -4292,6 +4292,7 @@ __export(src_exports, {
|
|
|
4292
4292
|
default: () => src_default,
|
|
4293
4293
|
getAIConfig: () => getAIConfig,
|
|
4294
4294
|
getElement: () => getElement,
|
|
4295
|
+
getLogDirByType: () => getLogDirByType,
|
|
4295
4296
|
getSection: () => getSection,
|
|
4296
4297
|
getVersion: () => getVersion,
|
|
4297
4298
|
overrideAIConfig: () => overrideAIConfig,
|
|
@@ -4302,220 +4303,562 @@ __export(src_exports, {
|
|
|
4302
4303
|
module.exports = __toCommonJS(src_exports);
|
|
4303
4304
|
|
|
4304
4305
|
// src/action/executor.ts
|
|
4305
|
-
var
|
|
4306
|
-
|
|
4307
|
-
// src/ai-model/openai/index.ts
|
|
4308
|
-
var import_node_assert3 = __toESM(require("assert"));
|
|
4306
|
+
var import_node_assert2 = __toESM(require("assert"));
|
|
4309
4307
|
|
|
4310
|
-
// src/
|
|
4311
|
-
var
|
|
4308
|
+
// src/env.ts
|
|
4309
|
+
// Names of the configuration keys Midscene understands. Each constant holds
// the name of the matching environment variable.
var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
var OPENAI_API_KEY = "OPENAI_API_KEY";
var OPENAI_BASE_URL = "OPENAI_BASE_URL";
var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
var MIDSCENE_CACHE = "MIDSCENE_CACHE";
var MATCH_BY_POSITION = "MATCH_BY_POSITION";
var MIDSCENE_REPORT_TAG_NAME = "MIDSCENE_REPORT_TAG_NAME";

/**
 * Reads every supported key from `process.env` at call time.
 *
 * @returns {Object} one property per supported key, in a fixed order; a
 *   variable that is unset or empty is reported as `undefined`.
 */
var allConfigFromEnv = () => {
  // The order of this list is the property order of the returned object.
  const orderedKeys = [
    MIDSCENE_OPENAI_INIT_CONFIG_JSON,
    MIDSCENE_MODEL_NAME,
    MIDSCENE_DEBUG_MODE,
    MIDSCENE_LANGSMITH_DEBUG,
    MIDSCENE_DEBUG_AI_PROFILE,
    MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG,
    OPENAI_API_KEY,
    OPENAI_BASE_URL,
    MIDSCENE_MODEL_TEXT_ONLY,
    OPENAI_USE_AZURE,
    MIDSCENE_CACHE,
    MATCH_BY_POSITION,
    MIDSCENE_REPORT_TAG_NAME
  ];
  const snapshot = {};
  for (const key of orderedKeys) {
    const raw = process.env[key];
    snapshot[key] = raw ? raw : void 0;
  }
  return snapshot;
};
|
|
4313
|
-
var
|
|
4314
|
-
|
|
4315
|
-
|
|
4316
|
-
|
|
4317
|
-
}
|
|
4318
|
-
|
|
4339
|
+
// Programmatic overrides; a key set here wins over the environment.
var userConfig = {};

/**
 * Looks up one configuration value.
 *
 * @param configKey name of the configuration key.
 * @returns the override for the key when one is defined, otherwise the value
 *   currently reported by allConfigFromEnv().
 */
var getAIConfig = (configKey) => {
  const overridden = userConfig[configKey];
  return overridden !== void 0 ? overridden : allConfigFromEnv()[configKey];
};

/**
 * @returns {Object} the environment configuration with every override
 *   layered on top of it.
 */
var allAIConfig = () => {
  const fromEnv = allConfigFromEnv();
  return { ...fromEnv, ...userConfig };
};

/**
 * Installs programmatic overrides.
 *
 * @param newConfig overrides to install; the object is copied, not retained.
 * @param extendMode when truthy the new keys are merged into the existing
 *   overrides; otherwise the existing overrides are discarded first.
 */
var overrideAIConfig = (newConfig, extendMode) => {
  const base = extendMode ? userConfig : {};
  userConfig = { ...base, ...newConfig };
};
|
|
4320
4352
|
|
|
4321
|
-
// src/
|
|
4322
|
-
var import_utils = require("@midscene/shared/utils");
|
|
4323
|
-
var import_openai5 = __toESM(require("openai"));
|
|
4324
|
-
|
|
4325
|
-
// src/ai-model/coze/index.ts
|
|
4353
|
+
// src/utils.ts
|
|
4326
4354
|
var import_node_assert = __toESM(require("assert"));
|
|
4327
|
-
var
|
|
4328
|
-
var
|
|
4329
|
-
var
|
|
4330
|
-
var
|
|
4331
|
-
var
|
|
4332
|
-
|
|
4333
|
-
|
|
4334
|
-
|
|
4335
|
-
|
|
4355
|
+
var import_node_child_process = require("child_process");
|
|
4356
|
+
var import_node_fs = require("fs");
|
|
4357
|
+
var import_node_os = require("os");
|
|
4358
|
+
var import_node_path = require("path");
|
|
4359
|
+
var import_fs = require("@midscene/shared/fs");
|
|
4360
|
+
var import_utils = require("@midscene/shared/utils");
|
|
4361
|
+
// Root directory for log output; starts as "midscene_run/" under the working
// directory the process was started in.
var logDir = import_node_path.join(process.cwd(), "./midscene_run/");
// Becomes true once writeLogFile has completed its one-time .gitignore step.
var logEnvReady = false;
var insightDumpFileExt = "insight-dump.json";

/**
 * @returns {string} the directory currently used as the log root.
 */
function getLogDir() {
  const currentRoot = logDir;
  return currentRoot;
}
|
|
4337
|
-
|
|
4338
|
-
|
|
4339
|
-
|
|
4340
|
-
|
|
4341
|
-
|
|
4342
|
-
|
|
4343
|
-
|
|
4344
|
-
"Content-Type": "application/json",
|
|
4345
|
-
Accept: "*/*",
|
|
4346
|
-
Host: "api.coze.com",
|
|
4347
|
-
Connection: "keep-alive"
|
|
4348
|
-
},
|
|
4349
|
-
body: JSON.stringify({
|
|
4350
|
-
conversation_id: "123",
|
|
4351
|
-
bot_id: botId,
|
|
4352
|
-
user: "29032201862555",
|
|
4353
|
-
query,
|
|
4354
|
-
meta_data: {
|
|
4355
|
-
img: imgs.map((imgPath) => {
|
|
4356
|
-
return {
|
|
4357
|
-
url: imgPath
|
|
4358
|
-
};
|
|
4359
|
-
})
|
|
4360
|
-
},
|
|
4361
|
-
stream: false
|
|
4362
|
-
})
|
|
4363
|
-
});
|
|
4364
|
-
if (!completion.ok) {
|
|
4365
|
-
console.error("CozeAI reponse error", completion);
|
|
4366
|
-
throw new Error("Network response was not ok");
|
|
4367
|
-
}
|
|
4368
|
-
const aiResponse = await completion.json();
|
|
4369
|
-
if (aiResponse.code !== 0) {
|
|
4370
|
-
console.error("CozeAI error response", aiResponse.msg);
|
|
4371
|
-
throw new Error(`CozeAI error response ${aiResponse.msg}`);
|
|
4367
|
+
/**
 * Replaces the log root directory; later getLogDir() calls return `dir`.
 *
 * @param dir the new log root.
 */
function setLogDir(dir) {
  logDir = dir;
}
|
|
4370
|
+
/**
 * Resolves the sub-directory of the log root used for one kind of output,
 * creating it (and any missing parents) when it does not exist yet.
 *
 * @param {string} type sub-directory name appended to the log root.
 * @returns {string} path of the sub-directory.
 */
function getLogDirByType(type) {
  const typedDir = import_node_path.join(getLogDir(), type);
  if (import_node_fs.existsSync(typedDir)) {
    return typedDir;
  }
  import_node_fs.mkdirSync(typedDir, { recursive: true });
  return typedDir;
}
|
|
4377
|
+
// Cached report template; filled on first successful lookup.
var reportTpl = null;

/**
 * Returns the HTML report template, caching it after the first lookup.
 *
 * In a browser the template is taken from `window.midscene_report_tpl`, and
 * an assertion fails when none is available. Elsewhere it is read from
 * "../../report/index.html" relative to this module, falling back to
 * "../report/index.html" when the first path does not exist.
 *
 * @returns {string} the template text.
 */
function getReportTpl() {
  if (import_utils.ifInBrowser) {
    if (!reportTpl) {
      const injected = window.midscene_report_tpl;
      if (injected) {
        reportTpl = injected;
      }
    }
    import_node_assert.default(
      reportTpl,
      "reportTpl should be set before writing report in browser"
    );
    return reportTpl;
  }
  if (reportTpl) {
    return reportTpl;
  }
  const primaryPath = import_node_path.join(__dirname, "../../report/index.html");
  const templatePath = import_node_fs.existsSync(primaryPath)
    ? primaryPath
    : import_node_path.join(__dirname, "../report/index.html");
  reportTpl = import_node_fs.readFileSync(templatePath, "utf-8");
  return reportTpl;
}
|
|
4386
|
-
function
|
|
4387
|
-
|
|
4388
|
-
|
|
4389
|
-
if (
|
|
4390
|
-
|
|
4391
|
-
|
|
4392
|
-
|
|
4393
|
-
|
|
4398
|
+
/**
 * Builds the report HTML by substituting the `{{dump}}` slot of the report
 * template (together with the whitespace around it) with one or more
 * `<script type="midscene_web_dump">` elements.
 *
 * @param dumpData one of:
 *   - `undefined` or an empty array: a single empty script element is emitted;
 *   - a string: emitted verbatim as the content of a single script element;
 *   - an array of `{ dumpString, attributes }`: one script element per entry,
 *     with every attribute value passed through `encodeURIComponent`.
 * @returns {string} the template with the slot filled in.
 */
function reportHTMLContent(dumpData) {
  const tpl = getReportTpl();
  const dumpSlot = /\s+{{dump}}\s+/;
  let injected;
  if (typeof dumpData === "undefined" || Array.isArray(dumpData) && dumpData.length === 0) {
    injected = `<script type="midscene_web_dump" type="application/json"></script>`;
  } else if (typeof dumpData === "string") {
    injected = `<script type="midscene_web_dump" type="application/json">${dumpData}</script>`;
  } else {
    const dumps = dumpData.map(({ dumpString, attributes }) => {
      const attributesArr = Object.keys(attributes || {}).map((key) => {
        return `${key}="${encodeURIComponent(attributes[key])}"`;
      });
      return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(" ")}
>${dumpString}
</script>`;
    });
    injected = dumps.join("\n");
  }
  // A replacer function is used on purpose: when the replacement is given as
  // a string, sequences such as "$&", "$$", "$`" and "$'" inside the dump are
  // expanded by String.prototype.replace and corrupt the embedded data.
  return tpl.replace(dumpSlot, () => injected);
}
|
|
4414
|
-
|
|
4415
|
-
|
|
4416
|
-
|
|
4417
|
-
|
|
4418
|
-
if (preferOpenAIModel(useModel)) {
|
|
4419
|
-
const parseResult = await callToGetJSONObject(msgs, AIActionTypeValue);
|
|
4420
|
-
return parseResult;
|
|
4426
|
+
/**
 * Renders `dumpData` into the report template and writes the result to
 * "<fileName>.html" inside the "report" log directory.
 *
 * @param {string} fileName report file name without extension.
 * @param dumpData data forwarded unchanged to reportHTMLContent().
 * @returns {string|null} path of the written report, or `null` when running
 *   in a browser or when no package info is found for this module's directory.
 */
function writeDumpReport(fileName, dumpData) {
  const inBrowser = import_utils.ifInBrowser;
  if (inBrowser) {
    console.log("will not write report in browser");
    return null;
  }
  const pkgInfo = import_fs.getRunningPkgInfo(__dirname);
  if (!pkgInfo) {
    console.warn("midscenePkgInfo not found, will not write report");
    return null;
  }
  // The report directory is resolved (and created) before rendering.
  const reportDir = getLogDirByType("report");
  const targetFile = import_node_path.join(reportDir, `${fileName}.html`);
  const html = reportHTMLContent(dumpData);
  import_node_fs.writeFileSync(targetFile, html);
  return targetFile;
}
|
|
4448
|
-
function
|
|
4449
|
-
|
|
4450
|
-
|
|
4451
|
-
|
|
4452
|
-
|
|
4453
|
-
|
|
4454
|
-
|
|
4441
|
+
/**
 * Writes one log artifact to "<log root>/<type>/<fileName>.<fileExt>".
 *
 * On the first call in which the log environment is not yet marked ready,
 * the ".gitignore" two levels above the target directory is checked and,
 * unless it already mentions "<log dir name>/", entries for the report, dump
 * and tmp sub-directories are appended to it.
 *
 * @param opts `{ fileName, fileExt, fileContent, type = "dump", generateReport }`.
 * @returns the written file path; the result of writeDumpReport() when
 *   `opts.generateReport` is truthy; or "/mock/report.html" in a browser,
 *   where nothing is written.
 */
function writeLogFile(opts) {
  if (import_utils.ifInBrowser) {
    return "/mock/report.html";
  }
  const fileName = opts.fileName;
  const fileExt = opts.fileExt;
  const fileContent = opts.fileContent;
  const type = opts.type === void 0 ? "dump" : opts.type;
  const targetDir = getLogDirByType(type);
  if (!logEnvReady) {
    import_node_assert.default(targetDir, "logDir should be set before writing dump file");
    const ignoreFile = import_node_path.join(targetDir, "../../.gitignore");
    const existingRules = import_node_fs.existsSync(ignoreFile)
      ? import_node_fs.readFileSync(ignoreFile, "utf-8")
      : "";
    const logDirName = import_node_path.basename(logDir);
    const alreadyIgnored = existingRules.includes(`${logDirName}/`);
    if (!alreadyIgnored) {
      import_node_fs.writeFileSync(
        ignoreFile,
        `${existingRules}
# Midscene.js dump files
${logDirName}/report
${logDirName}/dump
${logDirName}/tmp
`,
        "utf-8"
      );
    }
    logEnvReady = true;
  }
  const filePath = import_node_path.join(targetDir, `${fileName}.${fileExt}`);
  // fileName may contain path separators, so the parent may not exist yet.
  const parentDir = import_node_path.dirname(filePath);
  if (!import_node_fs.existsSync(parentDir)) {
    import_node_fs.mkdirSync(parentDir, { recursive: true });
  }
  import_node_fs.writeFileSync(filePath, fileContent);
  return opts.generateReport ? writeDumpReport(fileName, fileContent) : filePath;
}
|
|
4459
|
-
|
|
4460
|
-
|
|
4461
|
-
|
|
4462
|
-
|
|
4463
|
-
return systemPromptToFindElementPosition();
|
|
4480
|
+
/**
 * JSON.stringify replacer that swaps any value whose constructor is named
 * "Page" or "Browser" for a short placeholder string.
 *
 * @param key property name being serialized (unused).
 * @param value property value being serialized.
 * @returns the placeholder for Page/Browser values, otherwise `value` itself.
 */
function replacerForPageObject(key, value) {
  if (!value) {
    return value;
  }
  const ctor = value.constructor;
  const ctorName = ctor == null ? void 0 : ctor.name;
  switch (ctorName) {
    case "Page":
      return "[Page object]";
    case "Browser":
      return "[Browser object]";
    default:
      return value;
  }
}
|
|
4490
|
+
/**
 * Serializes dump data to JSON using replacerForPageObject as the replacer.
 *
 * @param data value to serialize.
 * @param indents forwarded to JSON.stringify as its `space` argument.
 * @returns {string} the JSON text.
 */
function stringifyDumpData(data, indents) {
  const serialized = JSON.stringify(data, replacerForPageObject, indents);
  return serialized;
}
|
|
4493
|
+
/**
 * @returns {string} the package version string embedded in this bundle.
 */
function getVersion() {
  const bundledVersion = "0.8.5-beta-20241126063126.0";
  return bundledVersion;
}
|
|
4472
4496
|
|
|
4473
|
-
|
|
4474
|
-
|
|
4475
|
-
|
|
4476
|
-
|
|
4497
|
+
// src/action/executor.ts
|
|
4498
|
+
/**
 * Runs a list of tasks strictly in order and records the outcome on each task.
 *
 * Executor status values used here: "init" (no tasks), "pending", "running",
 * "completed" and "error". Task status values: "pending", "running",
 * "finished", "failed" and "cancelled".
 */
var Executor = class {
  /**
   * @param name stored as `this.name`.
   * @param description stored as `this.description`.
   * @param tasks optional initial tasks; when non-empty the executor starts
   *   in the "pending" state instead of "init".
   */
  constructor(name, description, tasks) {
    __publicField(this, "name");
    __publicField(this, "description");
    __publicField(this, "tasks");
    // status of executor
    __publicField(this, "status");
    this.status = tasks && tasks.length > 0 ? "pending" : "init";
    this.name = name;
    this.description = description;
    this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
  }
  /**
   * Returns a shallow copy of `task` whose status defaults to "pending".
   * A `status` already present on `task` is kept, because the spread comes last.
   */
  markTaskAsPending(task) {
    return {
      status: "pending",
      ...task
    };
  }
  /**
   * Queues one task or an array of tasks.
   * Throws (assertion) when the executor is in the "error" state. Unless the
   * executor is currently running, its status becomes "pending".
   */
  async append(task) {
    var _a, _b;
    (0, import_node_assert2.default)(
      this.status !== "error",
      `executor is in error state, cannot append task
error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
    );
    if (Array.isArray(task)) {
      this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
    } else {
      this.tasks.push(this.markTaskAsPending(task));
    }
    if (this.status !== "running") {
      this.status = "pending";
    }
  }
  /**
   * Executes every task from the first pending one onwards.
   *
   * The output of the most recent "Locate" insight is handed to later tasks
   * as `context.element`. The first failing task stops the run: it is marked
   * "failed", every task after it "cancelled", and the executor "error".
   *
   * @returns the `output` of the last task reached (the failed task on
   *   error), or `undefined` when there was nothing pending.
   */
  async flush() {
    if (this.status === "init" && this.tasks.length > 0) {
      console.warn(
        "illegal state for executor, status is init but tasks are not empty"
      );
    }
    (0, import_node_assert2.default)(this.status !== "running", "executor is already running");
    (0, import_node_assert2.default)(this.status !== "completed", "executor is already completed");
    (0, import_node_assert2.default)(this.status !== "error", "executor is in error state");
    const nextPendingIndex = this.tasks.findIndex(
      (task) => task.status === "pending"
    );
    if (nextPendingIndex < 0) {
      return;
    }
    this.status = "running";
    let taskIndex = nextPendingIndex;
    let successfullyCompleted = true;
    let previousFindOutput;
    while (taskIndex < this.tasks.length) {
      const task = this.tasks[taskIndex];
      (0, import_node_assert2.default)(
        task.status === "pending",
        `task status should be pending, but got: ${task.status}`
      );
      task.timing = {
        start: Date.now()
      };
      try {
        task.status = "running";
        (0, import_node_assert2.default)(
          ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
          `unsupported task type: ${task.type}`
        );
        const { executor, param } = task;
        (0, import_node_assert2.default)(executor, `executor is required for task type: ${task.type}`);
        const executorContext = {
          task,
          element: previousFindOutput == null ? void 0 : previousFindOutput.element
        };
        const isInsight = task.type === "Insight";
        if (isInsight) {
          (0, import_node_assert2.default)(
            task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
            `unsupported insight subType: ${task.subType}`
          );
        }
        // The type assertion above admits only the three supported types, and
        // all of them are executed the same way.
        const returnValue = await task.executor(param, executorContext);
        if (isInsight && task.subType === "Locate") {
          previousFindOutput = returnValue == null ? void 0 : returnValue.output;
        }
        Object.assign(task, returnValue);
        task.status = "finished";
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        taskIndex++;
      } catch (e) {
        successfullyCompleted = false;
        task.error = (e == null ? void 0 : e.message) || "error-without-message";
        // `e` may be null/undefined (`throw null`); reading `.stack` unguarded
        // would throw from inside this handler and leave the executor stuck
        // in the "running" state.
        task.errorStack = e == null ? void 0 : e.stack;
        task.status = "failed";
        task.timing.end = Date.now();
        task.timing.cost = task.timing.end - task.timing.start;
        break;
      }
    }
    // After a failure taskIndex still points at the failed task, so this
    // cancels everything queued behind it; after success it is a no-op.
    for (let i = taskIndex + 1; i < this.tasks.length; i++) {
      this.tasks[i].status = "cancelled";
    }
    if (successfullyCompleted) {
      this.status = "completed";
    } else {
      this.status = "error";
    }
    if (this.tasks.length) {
      const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
      return this.tasks[outputIndex].output;
    }
  }
  /** @returns {boolean} whether the executor status is "error". */
  isInErrorState() {
    return this.status === "error";
  }
  /**
   * @returns the first task whose status is "failed", or `null` when the
   *   executor is not in the "error" state or no such task exists.
   */
  latestErrorTask() {
    if (this.status !== "error") {
      return null;
    }
    const errorTaskIndex = this.tasks.findIndex(
      (task) => task.status === "failed"
    );
    if (errorTaskIndex >= 0) {
      return this.tasks[errorTaskIndex];
    }
    return null;
  }
  /**
   * @returns a snapshot object with the SDK version, configured model name
   *   (empty string when unset), current time, and this executor's name,
   *   description and task list (the live array, not a copy).
   */
  dump() {
    const dumpData = {
      sdkVersion: getVersion(),
      model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
      logTime: Date.now(),
      name: this.name,
      description: this.description,
      tasks: this.tasks
    };
    return dumpData;
  }
};
|
|
4477
4646
|
|
|
4478
|
-
|
|
4479
|
-
|
|
4480
|
-
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
|
|
4481
|
-
3. Found the required number of elements
|
|
4482
|
-
4. Return JSON data containing the selection reason and element ID.
|
|
4647
|
+
// src/insight/index.ts
|
|
4648
|
+
var import_node_assert9 = __toESM(require("assert"));
|
|
4483
4649
|
|
|
4484
|
-
|
|
4485
|
-
|
|
4486
|
-
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
|
|
4487
|
-
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
|
|
4488
|
-
- If no elements are found, the "elements" array should be empty.
|
|
4489
|
-
- The returned data must conform to the specified JSON format.
|
|
4490
|
-
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
|
|
4650
|
+
// src/ai-model/openai/index.ts
|
|
4651
|
+
var import_node_assert5 = __toESM(require("assert"));
|
|
4491
4652
|
|
|
4492
|
-
|
|
4653
|
+
// src/types.ts
|
|
4654
|
+
// Runtime value for the BaseElement type; it declares no members here.
var BaseElement = class {
};
|
|
4656
|
+
// Response formats that can be requested from the model, keyed by name.
var AIResponseFormat = {
  JSON: "json_object",
  TEXT: "text"
};
|
|
4661
|
+
// Runtime value for the UIContext type; it declares no members here.
var UIContext = class {
};
|
|
4493
4663
|
|
|
4494
|
-
|
|
4664
|
+
// src/ai-model/openai/index.ts
|
|
4665
|
+
var import_utils3 = require("@midscene/shared/utils");
|
|
4666
|
+
var import_openai2 = __toESM(require("openai"));
|
|
4495
4667
|
|
|
4496
|
-
|
|
4497
|
-
|
|
4498
|
-
|
|
4499
|
-
|
|
4500
|
-
|
|
4501
|
-
|
|
4502
|
-
|
|
4503
|
-
|
|
4504
|
-
|
|
4505
|
-
|
|
4506
|
-
]
|
|
4507
|
-
"errors": [] // Array of strings containing any error messages
|
|
4668
|
+
// src/ai-model/coze/index.ts
|
|
4669
|
+
var import_node_assert3 = __toESM(require("assert"));
|
|
4670
|
+
// Coze bot ids, captured from the environment once when this module loads;
// an unset variable yields an empty string.
var COZE_INSPECT_ELEMENT_BOT_ID = process.env.COZE_INSPECT_ELEMENT_BOT_ID || "";
var COZE_AI_ACTION_BOT_ID = process.env.COZE_AI_ACTION_BOT_ID || "";
var COZE_AI_ASSERT_BOT_ID = process.env.COZE_AI_ASSERT_BOT_ID || "";
var COZE_EXTRACT_INFO_BOT_ID = process.env.COZE_EXTRACT_INFO_BOT_ID || "";
// Name of the environment variable that holds the Coze bearer token.
var COZE_BOT_TOKEN = "COZE_BOT_TOKEN";

/**
 * Tells whether the Coze backend can be used.
 *
 * @param preferVendor optional vendor preference; any truthy value other
 *   than "coze" rules Coze out.
 * @returns `false` when another vendor is preferred. Otherwise the first
 *   unset/empty value among the five required environment variables, or the
 *   value of the last one when all are set. The result is meant to be used
 *   for its truthiness and is not normalized to a boolean.
 */
function preferCozeModel(preferVendor) {
  if (preferVendor && preferVendor !== "coze") {
    return false;
  }
  // Unlike the constants above, the environment is read at call time here.
  const requiredEnvNames = [
    COZE_BOT_TOKEN,
    "COZE_INSPECT_ELEMENT_BOT_ID",
    "COZE_AI_ACTION_BOT_ID",
    "COZE_AI_ASSERT_BOT_ID",
    "COZE_EXTRACT_INFO_BOT_ID"
  ];
  let current;
  for (const envName of requiredEnvNames) {
    current = process.env[envName];
    if (!current) {
      return current;
    }
  }
  return current;
}
|
|
4509
|
-
|
|
4510
|
-
|
|
4511
|
-
|
|
4512
|
-
|
|
4513
|
-
|
|
4514
|
-
|
|
4515
|
-
|
|
4516
|
-
|
|
4517
|
-
|
|
4518
|
-
|
|
4680
|
+
/**
 * Sends one non-streaming chat request to the Coze open API and returns the
 * first message's content parsed as JSON.
 *
 * @param options `{ query, imgs, botId }`: the prompt text, a list of image
 *   URLs (sent as `meta_data.img`), and the id of the bot to address.
 * @returns the parsed JSON content of the first response message.
 * @throws {Error} when the HTTP response is not ok, the body is empty, the
 *   API reports a non-zero `code`, the first message has no content (the raw
 *   response is attached as `cause`), or the content is not valid JSON (the
 *   parse error is attached as `cause`).
 */
async function callCozeAi(options) {
  const { query, imgs, botId } = options;
  const completion = await fetch("https://api.coze.com/open_api/v2/chat", {
    method: "POST",
    headers: {
      // The token is read from the environment on every call.
      Authorization: `Bearer ${process.env[COZE_BOT_TOKEN]}`,
      "Content-Type": "application/json",
      Accept: "*/*",
      Host: "api.coze.com",
      Connection: "keep-alive"
    },
    body: JSON.stringify({
      conversation_id: "123",
      bot_id: botId,
      user: "29032201862555",
      query,
      meta_data: {
        img: imgs.map((imgPath) => {
          return {
            url: imgPath
          };
        })
      },
      stream: false
    })
  });
  if (!completion.ok) {
    console.error("CozeAI response error", completion);
    throw new Error("Network response was not ok");
  }
  const aiResponse = await completion.json();
  // A literal `null` body would otherwise surface as a bare TypeError below.
  if (aiResponse == null) {
    throw new Error("CozeAI returned an empty response body");
  }
  if (aiResponse.code !== 0) {
    console.error("CozeAI error response", aiResponse.msg);
    throw new Error(`CozeAI error response ${aiResponse.msg}`);
  }
  const messages = aiResponse.messages;
  const firstMessage = messages ? messages[0] : void 0;
  const parseContent = firstMessage == null ? void 0 : firstMessage.content;
  if (!parseContent) {
    console.error("aiResponse", aiResponse);
    // The second argument of Error is an options bag; the response has to be
    // passed as `cause` to be kept on the error at all.
    throw new Error("aiResponse is undefined", { cause: aiResponse });
  }
  try {
    return JSON.parse(parseContent);
  } catch (err) {
    console.error("can't parse coze content", aiResponse, err);
    throw new Error("can't parse coze content", { cause: err });
  }
}
|
|
4729
|
+
/**
 * Converts one OpenAI-style user message into the `{ query, imgs }` pair
 * expected by callCozeAi.
 *
 * @param msg message with `role: "user"` and `content` that is either a
 *   string or an array of parts (`{ type: "text", text }` and
 *   `{ type: "image_url", image_url: { url } }`).
 * @returns `{ query, imgs }`: for string content the string itself and no
 *   images; for array content every text part prefixed with a newline and
 *   concatenated, plus the URLs of all image parts in order.
 * @throws {Error} when the message is missing or its role is not "user".
 */
function transformOpenAiArgsToCoze(msg) {
  const role = msg == null ? void 0 : msg.role;
  if (role !== "user") {
    // Interpolating the message object itself would only ever print
    // "[object Object]", so the offending role is reported instead.
    throw Error(`can't transform message with role "${role}" to coze args`);
  }
  if (typeof msg.content === "string") {
    return {
      query: msg.content,
      imgs: []
    };
  }
  const textParts = msg.content.filter((part) => part.type === "text");
  const imageParts = msg.content.filter((part) => part.type === "image_url");
  return {
    query: textParts.map((part) => `
${part.text}`).join(""),
    imgs: imageParts.map((part) => part.image_url.url)
  };
}
|
|
4757
|
+
|
|
4758
|
+
// src/ai-model/common.ts
|
|
4759
|
+
/**
 * Dispatches one AI request to whichever backend is available, trying
 * OpenAI first and Coze second.
 *
 * @param options `{ useModel, msgs, AIActionType }`: the optional vendor
 *   preference, the chat messages, and the numeric action type.
 * @returns the parsed result produced by the selected backend.
 * @throws {Error} when neither backend reports itself as usable.
 */
async function callAiFn(options) {
  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
  if (preferOpenAIModel(useModel)) {
    return await callToGetJSONObject(msgs, AIActionTypeValue);
  }
  if (!preferCozeModel(useModel)) {
    throw Error(
      "Cannot find Coze or OpenAI config. You should set at least one of them."
    );
  }
  // Action type -> bot id; any other action type uses the action bot.
  const botIdByActionType = new Map([
    [0 /* ASSERT */, COZE_AI_ASSERT_BOT_ID],
    [2 /* EXTRACT_DATA */, COZE_EXTRACT_INFO_BOT_ID],
    [1 /* INSPECT_ELEMENT */, COZE_INSPECT_ELEMENT_BOT_ID]
  ]);
  const botId = botIdByActionType.has(AIActionTypeValue)
    ? botIdByActionType.get(AIActionTypeValue)
    : COZE_AI_ACTION_BOT_ID;
  // Only the second message is forwarded to Coze.
  const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
  return await callCozeAi({
    ...cozeMsg,
    botId
  });
}
|
|
4791
|
+
/**
 * Reduces message parts to plain text when the text-only model setting is on.
 *
 * @param msgs array of message parts (`{ type, text?, ... }`).
 * @returns `msgs` itself when the MIDSCENE_MODEL_TEXT_ONLY setting is falsy;
 *   otherwise the `text` of every part whose type is "text", concatenated
 *   without a separator.
 */
function transformUserMessages(msgs) {
  const textOnly = Boolean(getAIConfig(MIDSCENE_MODEL_TEXT_ONLY));
  if (textOnly) {
    return msgs.reduce(
      (joined, part) => part.type === "text" ? joined + part.text : joined,
      ""
    );
  }
  return msgs;
}
|
|
4802
|
+
|
|
4803
|
+
// src/ai-model/prompt/element_inspector.ts
|
|
4804
|
+
function systemPromptToFindElement() {
|
|
4805
|
+
if (getAIConfig(MATCH_BY_POSITION)) {
|
|
4806
|
+
return systemPromptToFindElementPosition();
|
|
4807
|
+
}
|
|
4808
|
+
return `
|
|
4809
|
+
## Role:
|
|
4810
|
+
You are an expert in software page image (2D) and page element text analysis.
|
|
4811
|
+
|
|
4812
|
+
## Objective:
|
|
4813
|
+
- Identify elements in screenshots and text that match the user's description.
|
|
4814
|
+
- Return JSON data containing the selection reason and element ID.
|
|
4815
|
+
|
|
4816
|
+
## Skills:
|
|
4817
|
+
- Image analysis and recognition
|
|
4818
|
+
- Multilingual text understanding
|
|
4819
|
+
- Software UI design and testing
|
|
4820
|
+
|
|
4821
|
+
## Workflow:
|
|
4822
|
+
1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
|
|
4823
|
+
2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
|
|
4824
|
+
3. Found the required number of elements
|
|
4825
|
+
4. Return JSON data containing the selection reason and element ID.
|
|
4826
|
+
|
|
4827
|
+
## Constraints:
|
|
4828
|
+
- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
|
|
4829
|
+
- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
|
|
4830
|
+
- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
|
|
4831
|
+
- If no elements are found, the "elements" array should be empty.
|
|
4832
|
+
- The returned data must conform to the specified JSON format.
|
|
4833
|
+
- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
|
|
4834
|
+
|
|
4835
|
+
## Output Format:
|
|
4836
|
+
|
|
4837
|
+
Please return the result in JSON format as follows:
|
|
4838
|
+
|
|
4839
|
+
\`\`\`json
|
|
4840
|
+
{
|
|
4841
|
+
"elements": [
|
|
4842
|
+
// If no matching elements are found, return an empty array []
|
|
4843
|
+
{
|
|
4844
|
+
"reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
|
|
4845
|
+
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4846
|
+
"id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
|
|
4847
|
+
}
|
|
4848
|
+
// More elements...
|
|
4849
|
+
],
|
|
4850
|
+
"errors": [] // Array of strings containing any error messages
|
|
4851
|
+
}
|
|
4852
|
+
\`\`\`
|
|
4853
|
+
|
|
4854
|
+
## Example:
|
|
4855
|
+
Example 1:
|
|
4856
|
+
Input Example:
|
|
4857
|
+
\`\`\`json
|
|
4858
|
+
// Description: "Shopping cart icon in the upper right corner"
|
|
4859
|
+
{
|
|
4860
|
+
"description": "PLACEHOLDER", // Description of the target element
|
|
4861
|
+
"multi": "PLACEHOLDER", //Find the number of elements
|
|
4519
4862
|
"screenshot": "path/screenshot.png",
|
|
4520
4863
|
"text": '{
|
|
4521
4864
|
"pageSize": {
|
|
@@ -4524,7 +4867,7 @@ Input Example:
|
|
|
4524
4867
|
},
|
|
4525
4868
|
"elementInfos": [
|
|
4526
4869
|
{
|
|
4527
|
-
"id": "
|
|
4870
|
+
"id": "1231", // ID of the element
|
|
4528
4871
|
"indexId": "0", // Index of the element,The image is labeled to the left of the element
|
|
4529
4872
|
"attributes": { // Attributes of the element
|
|
4530
4873
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4540,7 +4883,7 @@ Input Example:
|
|
|
4540
4883
|
}
|
|
4541
4884
|
},
|
|
4542
4885
|
{
|
|
4543
|
-
"id": "
|
|
4886
|
+
"id": "66551", // ID of the element
|
|
4544
4887
|
"indexId": "1", // Index of the element,The image is labeled to the left of the element
|
|
4545
4888
|
"attributes": { // Attributes of the element
|
|
4546
4889
|
"nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
|
|
@@ -4557,7 +4900,7 @@ Input Example:
|
|
|
4557
4900
|
},
|
|
4558
4901
|
...
|
|
4559
4902
|
{
|
|
4560
|
-
"id": "
|
|
4903
|
+
"id": "12344",
|
|
4561
4904
|
"indexId": "2", // Index of the element,The image is labeled to the left of the element
|
|
4562
4905
|
"attributes": {
|
|
4563
4906
|
"nodeType": "TEXT Node",
|
|
@@ -4590,7 +4933,7 @@ Output Example:
|
|
|
4590
4933
|
"reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
|
|
4591
4934
|
"text": "",
|
|
4592
4935
|
// ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
|
|
4593
|
-
"id": "
|
|
4936
|
+
"id": "1231"
|
|
4594
4937
|
}
|
|
4595
4938
|
],
|
|
4596
4939
|
"errors": []
|
|
@@ -4677,6 +5020,19 @@ var findElementSchema = {
|
|
|
4677
5020
|
};
|
|
4678
5021
|
|
|
4679
5022
|
// src/ai-model/prompt/planning.ts
|
|
5023
|
+
var quickAnswerFormat = () => {
|
|
5024
|
+
const matchByPosition = getAIConfig(MATCH_BY_POSITION);
|
|
5025
|
+
const description = `
|
|
5026
|
+
${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
|
|
5027
|
+
`;
|
|
5028
|
+
const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
|
|
5029
|
+
const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
|
|
5030
|
+
return {
|
|
5031
|
+
description,
|
|
5032
|
+
format,
|
|
5033
|
+
sample
|
|
5034
|
+
};
|
|
5035
|
+
};
|
|
4680
5036
|
function systemPromptToTaskPlanning() {
|
|
4681
5037
|
return `
|
|
4682
5038
|
## Role:
|
|
@@ -4700,32 +5056,24 @@ Each action has a type and corresponding param. To be detailed:
|
|
|
4700
5056
|
* type: 'KeyboardPress', press a key
|
|
4701
5057
|
* param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
|
|
4702
5058
|
* type: 'Scroll'
|
|
4703
|
-
* param: { scrollType: 'scrollDownOneScreen'
|
|
5059
|
+
* param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
|
|
4704
5060
|
* type: 'Error'
|
|
4705
5061
|
* param: { message: string }, the error message
|
|
4706
5062
|
* type: 'Sleep'
|
|
4707
5063
|
* param: { timeMs: number }, wait for timeMs milliseconds
|
|
4708
5064
|
|
|
4709
|
-
Here is an example of how to decompose a task.
|
|
4710
|
-
When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
|
|
4711
|
-
* Locate: 'The search bar'
|
|
4712
|
-
* Input: 'Weather in Shanghai'
|
|
4713
|
-
* Sleep: 1000
|
|
4714
|
-
* KeyboardPress: 'Enter'
|
|
4715
|
-
|
|
4716
5065
|
Remember:
|
|
4717
5066
|
1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
|
|
4718
|
-
2. In most cases, you should Locate one element first, then do other actions on it. For example,
|
|
5067
|
+
2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
|
|
5068
|
+
3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
|
|
4719
5069
|
|
|
4720
|
-
|
|
5070
|
+
## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
|
|
4721
5071
|
|
|
4722
|
-
|
|
4723
|
-
|
|
4724
|
-
If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
5072
|
+
If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
|
|
4725
5073
|
{
|
|
4726
|
-
"reason": "
|
|
5074
|
+
"reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
|
|
4727
5075
|
"text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
4728
|
-
${
|
|
5076
|
+
${quickAnswerFormat().description}
|
|
4729
5077
|
}
|
|
4730
5078
|
|
|
4731
5079
|
If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
|
|
@@ -4738,33 +5086,71 @@ Please return the result in JSON format as follows:
|
|
|
4738
5086
|
actions: [ // always return in Array
|
|
4739
5087
|
{
|
|
4740
5088
|
"thought": "find out the search bar",
|
|
4741
|
-
"type": "Locate", //
|
|
4742
|
-
"param": {
|
|
5089
|
+
"type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
|
|
5090
|
+
"param": { //
|
|
4743
5091
|
"prompt": "The search bar"
|
|
4744
5092
|
},
|
|
4745
|
-
"quickAnswer": {
|
|
4746
|
-
"reason": "
|
|
4747
|
-
"text":
|
|
4748
|
-
${
|
|
5093
|
+
"quickAnswer": {
|
|
5094
|
+
"reason": "This is ...",
|
|
5095
|
+
"text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
|
|
5096
|
+
${quickAnswerFormat().format}
|
|
4749
5097
|
} | null,
|
|
4750
5098
|
},
|
|
4751
5099
|
{
|
|
4752
5100
|
"thought": "Reasons for generating this task, and why this task is feasible on this page",
|
|
4753
|
-
"type": "Tap",
|
|
4754
|
-
"param":
|
|
5101
|
+
"type": "Tap",
|
|
5102
|
+
"param": null,
|
|
4755
5103
|
},
|
|
5104
|
+
// ... more actions
|
|
5105
|
+
],
|
|
5106
|
+
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
5107
|
+
}
|
|
5108
|
+
|
|
5109
|
+
## Here is an example of how to decompose a task
|
|
5110
|
+
|
|
5111
|
+
When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
|
|
5112
|
+
|
|
5113
|
+
* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
|
|
5114
|
+
* Think and look in detail and fill all the fields in the JSON format.
|
|
5115
|
+
|
|
5116
|
+
\`\`\`json
|
|
5117
|
+
{
|
|
5118
|
+
queryLanguage: 'English',
|
|
5119
|
+
actions:[
|
|
4756
5120
|
{
|
|
4757
|
-
|
|
4758
|
-
|
|
4759
|
-
|
|
4760
|
-
|
|
5121
|
+
thought: "Locate the language switch button with the text '中文'.",
|
|
5122
|
+
type: 'Locate',
|
|
5123
|
+
param: { prompt: "The language switch button with the text '中文'" },
|
|
5124
|
+
quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
|
|
5125
|
+
reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
|
|
5126
|
+
text: '中文',
|
|
5127
|
+
${quickAnswerFormat().sample}
|
|
4761
5128
|
},
|
|
4762
|
-
"quickAnswer": null,
|
|
4763
5129
|
},
|
|
4764
|
-
|
|
5130
|
+
{
|
|
5131
|
+
thought: 'Click the language switch button to open the language options.',
|
|
5132
|
+
type: 'Tap',
|
|
5133
|
+
param: null,
|
|
5134
|
+
},
|
|
5135
|
+
{
|
|
5136
|
+
thought: 'Wait for 1 second to ensure the language options are displayed.',
|
|
5137
|
+
type: 'Sleep',
|
|
5138
|
+
param: { timeMs: 1000 },
|
|
5139
|
+
},
|
|
5140
|
+
{
|
|
5141
|
+
thought: "Locate the 'English' option in the language menu.",
|
|
5142
|
+
type: 'Locate',
|
|
5143
|
+
param: { prompt: "The 'English' option in the language menu" },
|
|
5144
|
+
quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
|
|
5145
|
+
},
|
|
5146
|
+
{
|
|
5147
|
+
thought: "Click the 'English' option to switch the language.",
|
|
5148
|
+
type: 'Tap',
|
|
5149
|
+
param: null,
|
|
5150
|
+
}
|
|
4765
5151
|
],
|
|
4766
|
-
error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
|
|
4767
5152
|
}
|
|
5153
|
+
\`\`\`
|
|
4768
5154
|
`;
|
|
4769
5155
|
}
|
|
4770
5156
|
var planSchema = {
|
|
@@ -4802,7 +5188,7 @@ var planSchema = {
|
|
|
4802
5188
|
properties: {
|
|
4803
5189
|
reason: {
|
|
4804
5190
|
type: "string",
|
|
4805
|
-
description: "Reason for finding element
|
|
5191
|
+
description: "Reason for finding this element"
|
|
4806
5192
|
},
|
|
4807
5193
|
text: {
|
|
4808
5194
|
type: "string",
|
|
@@ -4824,659 +5210,333 @@ var planSchema = {
|
|
|
4824
5210
|
},
|
|
4825
5211
|
error: {
|
|
4826
5212
|
type: ["string", "null"],
|
|
4827
|
-
description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
|
|
4828
|
-
}
|
|
4829
|
-
},
|
|
4830
|
-
required: ["queryLanguage", "actions", "error"],
|
|
4831
|
-
additionalProperties: false
|
|
4832
|
-
}
|
|
4833
|
-
}
|
|
4834
|
-
};
|
|
4835
|
-
|
|
4836
|
-
// src/ai-model/prompt/util.ts
|
|
4837
|
-
var import_node_assert2 = __toESM(require("assert"));
|
|
4838
|
-
|
|
4839
|
-
// src/image/index.ts
|
|
4840
|
-
var import_img = require("@midscene/shared/img");
|
|
4841
|
-
|
|
4842
|
-
// src/ai-model/prompt/util.ts
|
|
4843
|
-
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
4844
|
-
var contextFormatIntro = `
|
|
4845
|
-
The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
|
|
4846
|
-
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
|
|
4847
|
-
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
|
|
4848
|
-
var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
|
|
4849
|
-
function systemPromptToExtract() {
|
|
4850
|
-
return `
|
|
4851
|
-
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
4852
|
-
The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
|
|
4853
|
-
|
|
4854
|
-
You have the following skills:
|
|
4855
|
-
|
|
4856
|
-
skill name: extract_data_from_UI
|
|
4857
|
-
related input: DATA_DEMAND
|
|
4858
|
-
skill content:
|
|
4859
|
-
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
|
|
4860
|
-
* There may be some special commands in DATA_DEMAND, please pay extra attention
|
|
4861
|
-
- LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
|
|
4862
|
-
- LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
|
|
4863
|
-
|
|
4864
|
-
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
4865
|
-
|
|
4866
|
-
|
|
4867
|
-
|
|
4868
|
-
Return in the following JSON format:
|
|
4869
|
-
{
|
|
4870
|
-
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
4871
|
-
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
|
|
4872
|
-
errors: [], // string[], error message if any
|
|
4873
|
-
}
|
|
4874
|
-
`;
|
|
4875
|
-
}
|
|
4876
|
-
function systemPromptToAssert() {
|
|
4877
|
-
return `
|
|
4878
|
-
${characteristic}
|
|
4879
|
-
${contextFormatIntro}
|
|
4880
|
-
|
|
4881
|
-
Based on the information you get, Return assertion judgment:
|
|
4882
|
-
|
|
4883
|
-
Return in the following JSON format:
|
|
4884
|
-
{
|
|
4885
|
-
thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
|
|
4886
|
-
pass: true, // true or false, whether the assertion is passed
|
|
4887
|
-
}
|
|
4888
|
-
`;
|
|
4889
|
-
}
|
|
4890
|
-
var assertSchema = {
|
|
4891
|
-
type: "json_schema",
|
|
4892
|
-
json_schema: {
|
|
4893
|
-
name: "assert",
|
|
4894
|
-
strict: true,
|
|
4895
|
-
schema: {
|
|
4896
|
-
type: "object",
|
|
4897
|
-
properties: {
|
|
4898
|
-
thought: {
|
|
4899
|
-
type: "string",
|
|
4900
|
-
description: "The thought process behind the assertion"
|
|
4901
|
-
},
|
|
4902
|
-
pass: {
|
|
4903
|
-
type: "boolean",
|
|
4904
|
-
description: "Whether the assertion passed or failed"
|
|
4905
|
-
}
|
|
4906
|
-
},
|
|
4907
|
-
required: ["thought", "pass"],
|
|
4908
|
-
additionalProperties: false
|
|
4909
|
-
}
|
|
4910
|
-
}
|
|
4911
|
-
};
|
|
4912
|
-
function describeSize(size) {
|
|
4913
|
-
return `${size.width} x ${size.height}`;
|
|
4914
|
-
}
|
|
4915
|
-
function truncateText(text) {
|
|
4916
|
-
const maxLength = 50;
|
|
4917
|
-
if (text && text.length > maxLength) {
|
|
4918
|
-
return `${text.slice(0, maxLength)}...`;
|
|
4919
|
-
}
|
|
4920
|
-
return text;
|
|
4921
|
-
}
|
|
4922
|
-
function elementByPosition(elementsInfo, position) {
|
|
4923
|
-
(0, import_node_assert2.default)(typeof position !== "undefined", "position is required for query");
|
|
4924
|
-
const item = elementsInfo.find((item2) => {
|
|
4925
|
-
return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
|
|
4926
|
-
});
|
|
4927
|
-
return item;
|
|
4928
|
-
}
|
|
4929
|
-
async function describeUserPage(context) {
|
|
4930
|
-
const { screenshotBase64 } = context;
|
|
4931
|
-
let width;
|
|
4932
|
-
let height;
|
|
4933
|
-
if (context.size) {
|
|
4934
|
-
({ width, height } = context.size);
|
|
4935
|
-
} else {
|
|
4936
|
-
const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
|
|
4937
|
-
({ width, height } = imgSize);
|
|
4938
|
-
}
|
|
4939
|
-
const elementsInfo = context.content;
|
|
4940
|
-
const idElementMap = {};
|
|
4941
|
-
elementsInfo.forEach((item) => {
|
|
4942
|
-
idElementMap[item.id] = item;
|
|
4943
|
-
return { ...item };
|
|
4944
|
-
});
|
|
4945
|
-
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
4946
|
-
return {
|
|
4947
|
-
description: `
|
|
4948
|
-
{
|
|
4949
|
-
// The size of the page
|
|
4950
|
-
"pageSize": ${describeSize({ width, height })},
|
|
4951
|
-
|
|
4952
|
-
${// if match by id, use the description of the element
|
|
4953
|
-
!getAIConfig(MATCH_BY_POSITION) ? `
|
|
4954
|
-
// json description of the element
|
|
4955
|
-
"content": ${JSON.stringify(elementInfosDescription)}
|
|
4956
|
-
` : ""}
|
|
4957
|
-
}`,
|
|
4958
|
-
elementById(id) {
|
|
4959
|
-
(0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
|
|
4960
|
-
const item = idElementMap[`${id}`];
|
|
4961
|
-
return item;
|
|
4962
|
-
},
|
|
4963
|
-
elementByPosition(position) {
|
|
4964
|
-
return elementByPosition(elementsInfo, position);
|
|
4965
|
-
}
|
|
4966
|
-
};
|
|
4967
|
-
}
|
|
4968
|
-
function cropFieldInformation(elementsInfo) {
|
|
4969
|
-
const elementInfosDescription = elementsInfo.map(
|
|
4970
|
-
(item) => {
|
|
4971
|
-
const { id, attributes = {}, rect, content } = item;
|
|
4972
|
-
const tailorContent = truncateText(content);
|
|
4973
|
-
const tailorAttributes = Object.keys(attributes).reduce(
|
|
4974
|
-
(res, currentKey) => {
|
|
4975
|
-
const attributeVal = attributes[currentKey];
|
|
4976
|
-
res[currentKey] = truncateText(attributeVal);
|
|
4977
|
-
return res;
|
|
4978
|
-
},
|
|
4979
|
-
{}
|
|
4980
|
-
);
|
|
4981
|
-
return {
|
|
4982
|
-
id,
|
|
4983
|
-
markerId: item.indexId,
|
|
4984
|
-
attributes: tailorAttributes,
|
|
4985
|
-
rect,
|
|
4986
|
-
content: tailorContent
|
|
4987
|
-
};
|
|
4988
|
-
}
|
|
4989
|
-
);
|
|
4990
|
-
return JSON.stringify(elementInfosDescription);
|
|
4991
|
-
}
|
|
4992
|
-
function retrieveElement(prompt, opt) {
|
|
4993
|
-
if (opt == null ? void 0 : opt.multi) {
|
|
4994
|
-
return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
|
|
4995
|
-
}
|
|
4996
|
-
return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
|
|
4997
|
-
}
|
|
4998
|
-
function ifElementTypeResponse(response) {
|
|
4999
|
-
if (typeof response !== "string") {
|
|
5000
|
-
return false;
|
|
5001
|
-
}
|
|
5002
|
-
return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
|
|
5003
|
-
}
|
|
5004
|
-
function splitElementResponse(response) {
|
|
5005
|
-
const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
|
|
5006
|
-
if (response.startsWith(oneElementSplitter)) {
|
|
5007
|
-
const id = response.slice(oneElementSplitter.length);
|
|
5008
|
-
if (id.indexOf(",") >= 0) {
|
|
5009
|
-
console.warn(`unexpected comma in one element response: ${id}`);
|
|
5010
|
-
}
|
|
5011
|
-
return id ? id : null;
|
|
5012
|
-
}
|
|
5013
|
-
const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
|
|
5014
|
-
if (response.startsWith(elementsSplitter)) {
|
|
5015
|
-
const idsString = response.slice(elementsSplitter.length);
|
|
5016
|
-
if (!idsString) {
|
|
5017
|
-
return [];
|
|
5018
|
-
}
|
|
5019
|
-
return idsString.split(",");
|
|
5020
|
-
}
|
|
5021
|
-
return null;
|
|
5022
|
-
}
|
|
5023
|
-
function retrieveSection(prompt) {
|
|
5024
|
-
return `${SECTION_MATCHER_FLAG}${prompt}`;
|
|
5025
|
-
}
|
|
5026
|
-
|
|
5027
|
-
// src/ai-model/openai/index.ts
|
|
5028
|
-
var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
|
|
5029
|
-
var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
|
|
5030
|
-
var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
|
|
5031
|
-
var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
|
|
5032
|
-
var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
|
|
5033
|
-
var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
|
|
5034
|
-
var OPENAI_API_KEY = "OPENAI_API_KEY";
|
|
5035
|
-
var OPENAI_BASE_URL = "OPENAI_BASE_URL";
|
|
5036
|
-
var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
|
|
5037
|
-
var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
|
|
5038
|
-
var MIDSCENE_CACHE = "MIDSCENE_CACHE";
|
|
5039
|
-
var MATCH_BY_POSITION = "MATCH_BY_POSITION";
|
|
5040
|
-
var allConfigFromEnv = () => {
|
|
5041
|
-
return {
|
|
5042
|
-
[MIDSCENE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_OPENAI_INIT_CONFIG_JSON] || void 0,
|
|
5043
|
-
[MIDSCENE_MODEL_NAME]: process.env[MIDSCENE_MODEL_NAME] || void 0,
|
|
5044
|
-
[MIDSCENE_DEBUG_MODE]: process.env[MIDSCENE_DEBUG_MODE] || void 0,
|
|
5045
|
-
[MIDSCENE_LANGSMITH_DEBUG]: process.env[MIDSCENE_LANGSMITH_DEBUG] || void 0,
|
|
5046
|
-
[MIDSCENE_DEBUG_AI_PROFILE]: process.env[MIDSCENE_DEBUG_AI_PROFILE] || void 0,
|
|
5047
|
-
[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG]: process.env[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG] || void 0,
|
|
5048
|
-
[OPENAI_API_KEY]: process.env[OPENAI_API_KEY] || void 0,
|
|
5049
|
-
[OPENAI_BASE_URL]: process.env[OPENAI_BASE_URL] || void 0,
|
|
5050
|
-
[MIDSCENE_MODEL_TEXT_ONLY]: process.env[MIDSCENE_MODEL_TEXT_ONLY] || void 0,
|
|
5051
|
-
[OPENAI_USE_AZURE]: process.env[OPENAI_USE_AZURE] || void 0,
|
|
5052
|
-
[MIDSCENE_CACHE]: process.env[MIDSCENE_CACHE] || void 0,
|
|
5053
|
-
[MATCH_BY_POSITION]: process.env[MATCH_BY_POSITION] || void 0
|
|
5054
|
-
};
|
|
5055
|
-
};
|
|
5056
|
-
var userConfig = {};
|
|
5057
|
-
var getAIConfig = (configKey) => {
|
|
5058
|
-
if (typeof userConfig[configKey] !== "undefined") {
|
|
5059
|
-
return userConfig[configKey];
|
|
5060
|
-
}
|
|
5061
|
-
return allConfigFromEnv()[configKey];
|
|
5062
|
-
};
|
|
5063
|
-
var allAIConfig = () => {
|
|
5064
|
-
return { ...allConfigFromEnv(), ...userConfig };
|
|
5065
|
-
};
|
|
5066
|
-
var overrideAIConfig = (newConfig, extendMode) => {
|
|
5067
|
-
userConfig = extendMode ? { ...userConfig, ...newConfig } : { ...newConfig };
|
|
5068
|
-
};
|
|
5069
|
-
function preferOpenAIModel(preferVendor) {
|
|
5070
|
-
if (preferVendor && preferVendor !== "openAI")
|
|
5071
|
-
return false;
|
|
5072
|
-
if (getAIConfig(OPENAI_API_KEY))
|
|
5073
|
-
return true;
|
|
5074
|
-
return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
|
|
5075
|
-
}
|
|
5076
|
-
var defaultModel = "gpt-4o-2024-08-06";
|
|
5077
|
-
function getModelName() {
|
|
5078
|
-
let modelName = defaultModel;
|
|
5079
|
-
const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
|
|
5080
|
-
if (nameInConfig) {
|
|
5081
|
-
modelName = nameInConfig;
|
|
5082
|
-
}
|
|
5083
|
-
return modelName;
|
|
5084
|
-
}
|
|
5085
|
-
async function createOpenAI() {
|
|
5086
|
-
let openai;
|
|
5087
|
-
const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
|
|
5088
|
-
const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
|
|
5089
|
-
if (getAIConfig(OPENAI_USE_AZURE)) {
|
|
5090
|
-
openai = new import_openai5.AzureOpenAI({
|
|
5091
|
-
baseURL: getAIConfig(OPENAI_BASE_URL),
|
|
5092
|
-
apiKey: getAIConfig(OPENAI_API_KEY),
|
|
5093
|
-
...extraConfig,
|
|
5094
|
-
dangerouslyAllowBrowser: true
|
|
5095
|
-
});
|
|
5096
|
-
} else {
|
|
5097
|
-
openai = new import_openai5.default({
|
|
5098
|
-
baseURL: getAIConfig(OPENAI_BASE_URL),
|
|
5099
|
-
apiKey: getAIConfig(OPENAI_API_KEY),
|
|
5100
|
-
...extraConfig,
|
|
5101
|
-
dangerouslyAllowBrowser: true
|
|
5102
|
-
});
|
|
5103
|
-
}
|
|
5104
|
-
if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
|
|
5105
|
-
if (import_utils.ifInBrowser) {
|
|
5106
|
-
throw new Error("langsmith is not supported in browser");
|
|
5107
|
-
}
|
|
5108
|
-
console.log("DEBUGGING MODE: langsmith wrapper enabled");
|
|
5109
|
-
const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
|
|
5110
|
-
openai = wrapOpenAI2(openai);
|
|
5111
|
-
}
|
|
5112
|
-
return openai;
|
|
5113
|
-
}
|
|
5114
|
-
async function call(messages, responseFormat) {
|
|
5115
|
-
const openai = await createOpenAI();
|
|
5116
|
-
const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
|
|
5117
|
-
if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
|
|
5118
|
-
console.log(allAIConfig());
|
|
5119
|
-
}
|
|
5120
|
-
const startTime = Date.now();
|
|
5121
|
-
const model = getModelName();
|
|
5122
|
-
const completion = await openai.chat.completions.create({
|
|
5123
|
-
model,
|
|
5124
|
-
messages,
|
|
5125
|
-
response_format: responseFormat,
|
|
5126
|
-
temperature: 0.1,
|
|
5127
|
-
stream: false
|
|
5128
|
-
// betas: ['computer-use-2024-10-22'],
|
|
5129
|
-
});
|
|
5130
|
-
shouldPrintTiming && console.log(
|
|
5131
|
-
"Midscene - AI call",
|
|
5132
|
-
model,
|
|
5133
|
-
completion.usage,
|
|
5134
|
-
`${Date.now() - startTime}ms`
|
|
5135
|
-
);
|
|
5136
|
-
const { content } = completion.choices[0].message;
|
|
5137
|
-
(0, import_node_assert3.default)(content, "empty content");
|
|
5138
|
-
return content;
|
|
5139
|
-
}
|
|
5140
|
-
async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
5141
|
-
let responseFormat = {
|
|
5142
|
-
type: "json_object" /* JSON */
|
|
5143
|
-
};
|
|
5144
|
-
const model = getModelName();
|
|
5145
|
-
if (model === "gpt-4o-2024-08-06") {
|
|
5146
|
-
switch (AIActionTypeValue) {
|
|
5147
|
-
case 0 /* ASSERT */:
|
|
5148
|
-
responseFormat = assertSchema;
|
|
5149
|
-
break;
|
|
5150
|
-
case 1 /* INSPECT_ELEMENT */:
|
|
5151
|
-
responseFormat = findElementSchema;
|
|
5152
|
-
break;
|
|
5153
|
-
case 2 /* EXTRACT_DATA */:
|
|
5154
|
-
break;
|
|
5155
|
-
case 3 /* PLAN */:
|
|
5156
|
-
responseFormat = planSchema;
|
|
5157
|
-
break;
|
|
5158
|
-
}
|
|
5159
|
-
}
|
|
5160
|
-
if (model.startsWith("gemini")) {
|
|
5161
|
-
responseFormat = { type: "text" /* TEXT */ };
|
|
5162
|
-
}
|
|
5163
|
-
const response = await call(messages, responseFormat);
|
|
5164
|
-
(0, import_node_assert3.default)(response, "empty response");
|
|
5165
|
-
const jsonContent = extractJSONFromCodeBlock(response);
|
|
5166
|
-
try {
|
|
5167
|
-
return JSON.parse(jsonContent);
|
|
5168
|
-
} catch (e) {
|
|
5169
|
-
throw Error(`parse json error: ${jsonContent}`);
|
|
5170
|
-
}
|
|
5171
|
-
}
|
|
5172
|
-
function extractJSONFromCodeBlock(response) {
|
|
5173
|
-
const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
|
|
5174
|
-
if (jsonMatch) {
|
|
5175
|
-
return jsonMatch[1];
|
|
5176
|
-
}
|
|
5177
|
-
const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
|
|
5178
|
-
if (codeBlockMatch) {
|
|
5179
|
-
return codeBlockMatch[1];
|
|
5180
|
-
}
|
|
5181
|
-
const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
|
|
5182
|
-
if (jsonLikeMatch) {
|
|
5183
|
-
return jsonLikeMatch[0];
|
|
5213
|
+
description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
|
|
5214
|
+
}
|
|
5215
|
+
},
|
|
5216
|
+
required: ["queryLanguage", "actions", "error"],
|
|
5217
|
+
additionalProperties: false
|
|
5218
|
+
}
|
|
5184
5219
|
}
|
|
5185
|
-
|
|
5186
|
-
}
|
|
5220
|
+
};
|
|
5187
5221
|
|
|
5188
|
-
// src/
|
|
5222
|
+
// src/ai-model/prompt/util.ts
|
|
5189
5223
|
var import_node_assert4 = __toESM(require("assert"));
|
|
5190
|
-
|
|
5191
|
-
|
|
5192
|
-
var
|
|
5193
|
-
|
|
5194
|
-
|
|
5195
|
-
var
|
|
5196
|
-
var
|
|
5197
|
-
|
|
5198
|
-
var
|
|
5199
|
-
|
|
5200
|
-
|
|
5224
|
+
|
|
5225
|
+
// src/image/index.ts
|
|
5226
|
+
var import_img = require("@midscene/shared/img");
|
|
5227
|
+
|
|
5228
|
+
// src/ai-model/prompt/util.ts
|
|
5229
|
+
var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
|
|
5230
|
+
var contextFormatIntro = `
|
|
5231
|
+
The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
|
|
5232
|
+
var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
|
|
5233
|
+
var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
|
|
5234
|
+
var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
|
|
5235
|
+
function systemPromptToExtract() {
|
|
5236
|
+
return `
|
|
5237
|
+
You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
|
|
5238
|
+
The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
|
|
5239
|
+
|
|
5240
|
+
You have the following skills:
|
|
5241
|
+
|
|
5242
|
+
skill name: extract_data_from_UI
|
|
5243
|
+
related input: DATA_DEMAND
|
|
5244
|
+
skill content:
|
|
5245
|
+
* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
|
|
5246
|
+
* There may be some special commands in DATA_DEMAND, please pay extra attention
|
|
5247
|
+
- LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
|
|
5248
|
+
- LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
|
|
5249
|
+
|
|
5250
|
+
Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
|
|
5251
|
+
|
|
5252
|
+
Return in the following JSON format:
|
|
5253
|
+
{
|
|
5254
|
+
language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
|
|
5255
|
+
data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
|
|
5256
|
+
errors: [], // string[], error message if any
|
|
5201
5257
|
}
|
|
5202
|
-
|
|
5203
|
-
logDir = dir;
|
|
5258
|
+
`;
|
|
5204
5259
|
}
|
|
5205
|
-
function
|
|
5206
|
-
|
|
5207
|
-
|
|
5208
|
-
|
|
5209
|
-
|
|
5210
|
-
|
|
5260
|
+
function systemPromptToAssert() {
|
|
5261
|
+
return `
|
|
5262
|
+
${characteristic}
|
|
5263
|
+
${contextFormatIntro}
|
|
5264
|
+
|
|
5265
|
+
Based on the information you get, Return assertion judgment:
|
|
5266
|
+
|
|
5267
|
+
Return in the following JSON format:
|
|
5268
|
+
{
|
|
5269
|
+
thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
|
|
5270
|
+
pass: true, // true or false, whether the assertion is passed
|
|
5211
5271
|
}
|
|
5212
|
-
|
|
5213
|
-
|
|
5214
|
-
|
|
5215
|
-
|
|
5216
|
-
|
|
5217
|
-
|
|
5218
|
-
|
|
5219
|
-
|
|
5220
|
-
|
|
5221
|
-
|
|
5222
|
-
|
|
5223
|
-
|
|
5224
|
-
|
|
5225
|
-
|
|
5226
|
-
|
|
5227
|
-
|
|
5272
|
+
`;
|
|
5273
|
+
}
|
|
5274
|
+
var assertSchema = {
|
|
5275
|
+
type: "json_schema",
|
|
5276
|
+
json_schema: {
|
|
5277
|
+
name: "assert",
|
|
5278
|
+
strict: true,
|
|
5279
|
+
schema: {
|
|
5280
|
+
type: "object",
|
|
5281
|
+
properties: {
|
|
5282
|
+
thought: {
|
|
5283
|
+
type: "string",
|
|
5284
|
+
description: "The thought process behind the assertion"
|
|
5285
|
+
},
|
|
5286
|
+
pass: {
|
|
5287
|
+
type: "boolean",
|
|
5288
|
+
description: "Whether the assertion passed or failed"
|
|
5289
|
+
}
|
|
5290
|
+
},
|
|
5291
|
+
required: ["thought", "pass"],
|
|
5292
|
+
additionalProperties: false
|
|
5228
5293
|
}
|
|
5229
|
-
reportTpl = (0, import_node_fs.readFileSync)(reportPath, "utf-8");
|
|
5230
5294
|
}
|
|
5231
|
-
|
|
5295
|
+
};
|
|
5296
|
+
function describeSize(size) {
|
|
5297
|
+
return `${size.width} x ${size.height}`;
|
|
5232
5298
|
}
|
|
5233
|
-
function
|
|
5234
|
-
|
|
5235
|
-
|
|
5236
|
-
if (Array.isArray(dumpData) && dumpData.length === 0 || typeof dumpData === "undefined") {
|
|
5237
|
-
reportContent = tpl.replace(
|
|
5238
|
-
/\s+{{dump}}\s+/,
|
|
5239
|
-
`<script type="midscene_web_dump" type="application/json"></script>`
|
|
5240
|
-
);
|
|
5241
|
-
} else if (typeof dumpData === "string") {
|
|
5242
|
-
reportContent = tpl.replace(
|
|
5243
|
-
/\s+{{dump}}\s+/,
|
|
5244
|
-
`<script type="midscene_web_dump" type="application/json">${dumpData}</script>`
|
|
5245
|
-
);
|
|
5246
|
-
} else {
|
|
5247
|
-
const dumps = dumpData.map(({ dumpString, attributes }) => {
|
|
5248
|
-
const attributesArr = Object.keys(attributes || {}).map((key) => {
|
|
5249
|
-
return `${key}="${encodeURIComponent(attributes[key])}"`;
|
|
5250
|
-
});
|
|
5251
|
-
return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(
|
|
5252
|
-
" "
|
|
5253
|
-
)}
|
|
5254
|
-
>${dumpString}
|
|
5255
|
-
</script>`;
|
|
5256
|
-
});
|
|
5257
|
-
reportContent = tpl.replace(/\s+{{dump}}\s+/, dumps.join("\n"));
|
|
5299
|
+
function truncateText(text, maxLength = 20) {
|
|
5300
|
+
if (text && text.length > maxLength) {
|
|
5301
|
+
return `${text.slice(0, maxLength)}...`;
|
|
5258
5302
|
}
|
|
5259
|
-
return
|
|
5303
|
+
return text;
|
|
5260
5304
|
}
|
|
5261
|
-
function
|
|
5262
|
-
|
|
5263
|
-
|
|
5264
|
-
return
|
|
5265
|
-
}
|
|
5266
|
-
|
|
5267
|
-
if (!midscenePkgInfo) {
|
|
5268
|
-
console.warn("midscenePkgInfo not found, will not write report");
|
|
5269
|
-
return null;
|
|
5270
|
-
}
|
|
5271
|
-
const reportPath = (0, import_node_path.join)(getLogDirByType("report"), `${fileName}.html`);
|
|
5272
|
-
const reportContent = reportHTMLContent(dumpData);
|
|
5273
|
-
(0, import_node_fs.writeFileSync)(reportPath, reportContent);
|
|
5274
|
-
return reportPath;
|
|
5305
|
+
function elementByPosition(elementsInfo, position) {
|
|
5306
|
+
(0, import_node_assert4.default)(typeof position !== "undefined", "position is required for query");
|
|
5307
|
+
const item = elementsInfo.find((item2) => {
|
|
5308
|
+
return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
|
|
5309
|
+
});
|
|
5310
|
+
return item;
|
|
5275
5311
|
}
|
|
5276
|
-
function
|
|
5277
|
-
|
|
5278
|
-
|
|
5312
|
+
async function describeUserPage(context) {
|
|
5313
|
+
const { screenshotBase64 } = context;
|
|
5314
|
+
let width;
|
|
5315
|
+
let height;
|
|
5316
|
+
if (context.size) {
|
|
5317
|
+
({ width, height } = context.size);
|
|
5318
|
+
} else {
|
|
5319
|
+
const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
|
|
5320
|
+
({ width, height } = imgSize);
|
|
5279
5321
|
}
|
|
5280
|
-
const
|
|
5281
|
-
const
|
|
5282
|
-
|
|
5283
|
-
|
|
5284
|
-
|
|
5285
|
-
|
|
5286
|
-
|
|
5287
|
-
|
|
5322
|
+
const elementsInfo = context.content;
|
|
5323
|
+
const idElementMap = {};
|
|
5324
|
+
elementsInfo.forEach((item) => {
|
|
5325
|
+
idElementMap[item.id] = item;
|
|
5326
|
+
return { ...item };
|
|
5327
|
+
});
|
|
5328
|
+
const elementInfosDescription = cropFieldInformation(elementsInfo);
|
|
5329
|
+
return {
|
|
5330
|
+
description: `
|
|
5331
|
+
{
|
|
5332
|
+
// The size of the page
|
|
5333
|
+
"pageSize": ${describeSize({ width, height })},
|
|
5334
|
+
|
|
5335
|
+
${// if match by id, use the description of the element
|
|
5336
|
+
getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
|
|
5337
|
+
"content": ${JSON.stringify(elementInfosDescription)}
|
|
5338
|
+
`}
|
|
5339
|
+
}`,
|
|
5340
|
+
elementById(id) {
|
|
5341
|
+
(0, import_node_assert4.default)(typeof id !== "undefined", "id is required for query");
|
|
5342
|
+
const item = idElementMap[`${id}`];
|
|
5343
|
+
return item;
|
|
5344
|
+
},
|
|
5345
|
+
elementByPosition(position) {
|
|
5346
|
+
return elementByPosition(elementsInfo, position);
|
|
5288
5347
|
}
|
|
5289
|
-
|
|
5290
|
-
|
|
5291
|
-
|
|
5292
|
-
|
|
5293
|
-
|
|
5294
|
-
|
|
5295
|
-
|
|
5296
|
-
|
|
5297
|
-
|
|
5298
|
-
|
|
5348
|
+
};
|
|
5349
|
+
}
|
|
5350
|
+
function cropFieldInformation(elementsInfo) {
|
|
5351
|
+
const elementInfosDescription = elementsInfo.map(
|
|
5352
|
+
(item) => {
|
|
5353
|
+
const { id, attributes = {}, rect, content } = item;
|
|
5354
|
+
const tailorContent = truncateText(content);
|
|
5355
|
+
const tailorAttributes = Object.keys(attributes).reduce(
|
|
5356
|
+
(res, currentKey) => {
|
|
5357
|
+
const attributeVal = attributes[currentKey];
|
|
5358
|
+
if (currentKey === "style" || currentKey === "src")
|
|
5359
|
+
return res;
|
|
5360
|
+
if (currentKey === "nodeType") {
|
|
5361
|
+
res[currentKey] = attributeVal.replace(/\sNode$/, "");
|
|
5362
|
+
} else {
|
|
5363
|
+
res[currentKey] = truncateText(attributeVal);
|
|
5364
|
+
}
|
|
5365
|
+
return res;
|
|
5366
|
+
},
|
|
5367
|
+
{}
|
|
5299
5368
|
);
|
|
5369
|
+
return {
|
|
5370
|
+
id,
|
|
5371
|
+
markerId: item.indexId,
|
|
5372
|
+
attributes: tailorAttributes,
|
|
5373
|
+
rect: {
|
|
5374
|
+
left: rect.left,
|
|
5375
|
+
top: rect.top,
|
|
5376
|
+
width: rect.width,
|
|
5377
|
+
height: rect.height
|
|
5378
|
+
// remove 'zoom' if it exists
|
|
5379
|
+
},
|
|
5380
|
+
content: tailorContent
|
|
5381
|
+
};
|
|
5300
5382
|
}
|
|
5301
|
-
|
|
5302
|
-
|
|
5303
|
-
|
|
5304
|
-
|
|
5305
|
-
if (
|
|
5306
|
-
|
|
5383
|
+
);
|
|
5384
|
+
return elementInfosDescription;
|
|
5385
|
+
}
|
|
5386
|
+
function retrieveElement(prompt, opt) {
|
|
5387
|
+
if (opt == null ? void 0 : opt.multi) {
|
|
5388
|
+
return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
|
|
5307
5389
|
}
|
|
5308
|
-
|
|
5309
|
-
|
|
5310
|
-
|
|
5390
|
+
return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
|
|
5391
|
+
}
|
|
5392
|
+
function ifElementTypeResponse(response) {
|
|
5393
|
+
if (typeof response !== "string") {
|
|
5394
|
+
return false;
|
|
5311
5395
|
}
|
|
5312
|
-
return
|
|
5396
|
+
return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
|
|
5313
5397
|
}
|
|
5314
|
-
function
|
|
5315
|
-
|
|
5316
|
-
if (
|
|
5317
|
-
|
|
5398
|
+
function splitElementResponse(response) {
|
|
5399
|
+
const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
|
|
5400
|
+
if (response.startsWith(oneElementSplitter)) {
|
|
5401
|
+
const id = response.slice(oneElementSplitter.length);
|
|
5402
|
+
if (id.indexOf(",") >= 0) {
|
|
5403
|
+
console.warn(`unexpected comma in one element response: ${id}`);
|
|
5404
|
+
}
|
|
5405
|
+
return id ? id : null;
|
|
5318
5406
|
}
|
|
5319
|
-
|
|
5320
|
-
|
|
5407
|
+
const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
|
|
5408
|
+
if (response.startsWith(elementsSplitter)) {
|
|
5409
|
+
const idsString = response.slice(elementsSplitter.length);
|
|
5410
|
+
if (!idsString) {
|
|
5411
|
+
return [];
|
|
5412
|
+
}
|
|
5413
|
+
return idsString.split(",");
|
|
5321
5414
|
}
|
|
5322
|
-
return
|
|
5323
|
-
}
|
|
5324
|
-
function stringifyDumpData(data, indents) {
|
|
5325
|
-
return JSON.stringify(data, replacerForPageObject, indents);
|
|
5415
|
+
return null;
|
|
5326
5416
|
}
|
|
5327
|
-
function
|
|
5328
|
-
return
|
|
5417
|
+
function retrieveSection(prompt) {
|
|
5418
|
+
return `${SECTION_MATCHER_FLAG}${prompt}`;
|
|
5329
5419
|
}
|
|
5330
5420
|
|
|
5331
|
-
// src/
|
|
5332
|
-
|
|
5333
|
-
|
|
5334
|
-
|
|
5335
|
-
|
|
5336
|
-
|
|
5337
|
-
|
|
5338
|
-
|
|
5339
|
-
|
|
5340
|
-
|
|
5341
|
-
|
|
5342
|
-
|
|
5421
|
+
// src/ai-model/openai/index.ts
|
|
5422
|
+
function preferOpenAIModel(preferVendor) {
|
|
5423
|
+
if (preferVendor && preferVendor !== "openAI")
|
|
5424
|
+
return false;
|
|
5425
|
+
if (getAIConfig(OPENAI_API_KEY))
|
|
5426
|
+
return true;
|
|
5427
|
+
return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
|
|
5428
|
+
}
|
|
5429
|
+
var defaultModel = "gpt-4o-2024-08-06";
|
|
5430
|
+
function getModelName() {
|
|
5431
|
+
let modelName = defaultModel;
|
|
5432
|
+
const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
|
|
5433
|
+
if (nameInConfig) {
|
|
5434
|
+
modelName = nameInConfig;
|
|
5343
5435
|
}
|
|
5344
|
-
|
|
5345
|
-
|
|
5346
|
-
|
|
5347
|
-
|
|
5348
|
-
|
|
5436
|
+
return modelName;
|
|
5437
|
+
}
|
|
5438
|
+
async function createOpenAI() {
|
|
5439
|
+
let openai;
|
|
5440
|
+
const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
|
|
5441
|
+
const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
|
|
5442
|
+
if (getAIConfig(OPENAI_USE_AZURE)) {
|
|
5443
|
+
openai = new import_openai2.AzureOpenAI({
|
|
5444
|
+
baseURL: getAIConfig(OPENAI_BASE_URL),
|
|
5445
|
+
apiKey: getAIConfig(OPENAI_API_KEY),
|
|
5446
|
+
...extraConfig,
|
|
5447
|
+
dangerouslyAllowBrowser: true
|
|
5448
|
+
});
|
|
5449
|
+
} else {
|
|
5450
|
+
openai = new import_openai2.default({
|
|
5451
|
+
baseURL: getAIConfig(OPENAI_BASE_URL),
|
|
5452
|
+
apiKey: getAIConfig(OPENAI_API_KEY),
|
|
5453
|
+
...extraConfig,
|
|
5454
|
+
dangerouslyAllowBrowser: true
|
|
5455
|
+
});
|
|
5349
5456
|
}
|
|
5350
|
-
|
|
5351
|
-
(
|
|
5352
|
-
|
|
5353
|
-
"executor is in error state, cannot append task"
|
|
5354
|
-
);
|
|
5355
|
-
if (Array.isArray(task)) {
|
|
5356
|
-
this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
|
|
5357
|
-
} else {
|
|
5358
|
-
this.tasks.push(this.markTaskAsPending(task));
|
|
5359
|
-
}
|
|
5360
|
-
if (this.status !== "running") {
|
|
5361
|
-
this.status = "pending";
|
|
5457
|
+
if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
|
|
5458
|
+
if (import_utils3.ifInBrowser) {
|
|
5459
|
+
throw new Error("langsmith is not supported in browser");
|
|
5362
5460
|
}
|
|
5461
|
+
console.log("DEBUGGING MODE: langsmith wrapper enabled");
|
|
5462
|
+
const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
|
|
5463
|
+
openai = wrapOpenAI2(openai);
|
|
5363
5464
|
}
|
|
5364
|
-
|
|
5365
|
-
|
|
5366
|
-
|
|
5367
|
-
|
|
5368
|
-
|
|
5369
|
-
|
|
5370
|
-
|
|
5371
|
-
|
|
5372
|
-
|
|
5373
|
-
|
|
5374
|
-
|
|
5375
|
-
|
|
5376
|
-
|
|
5377
|
-
|
|
5378
|
-
|
|
5379
|
-
|
|
5380
|
-
|
|
5381
|
-
|
|
5382
|
-
|
|
5383
|
-
|
|
5384
|
-
|
|
5385
|
-
|
|
5386
|
-
|
|
5387
|
-
|
|
5388
|
-
|
|
5389
|
-
|
|
5390
|
-
|
|
5391
|
-
|
|
5392
|
-
|
|
5393
|
-
|
|
5394
|
-
|
|
5395
|
-
|
|
5396
|
-
|
|
5397
|
-
|
|
5398
|
-
|
|
5399
|
-
|
|
5400
|
-
|
|
5401
|
-
|
|
5402
|
-
|
|
5403
|
-
|
|
5404
|
-
|
|
5405
|
-
|
|
5406
|
-
|
|
5407
|
-
|
|
5408
|
-
|
|
5409
|
-
);
|
|
5410
|
-
returnValue = await task.executor(param, executorContext);
|
|
5411
|
-
if (task.subType === "Locate") {
|
|
5412
|
-
previousFindOutput = returnValue == null ? void 0 : returnValue.output;
|
|
5413
|
-
}
|
|
5414
|
-
} else if (task.type === "Action" || task.type === "Planning") {
|
|
5415
|
-
returnValue = await task.executor(param, executorContext);
|
|
5416
|
-
} else {
|
|
5417
|
-
console.warn(
|
|
5418
|
-
`unsupported task type: ${task.type}, will try to execute it directly`
|
|
5419
|
-
);
|
|
5420
|
-
returnValue = await task.executor(param, executorContext);
|
|
5421
|
-
}
|
|
5422
|
-
Object.assign(task, returnValue);
|
|
5423
|
-
task.status = "finished";
|
|
5424
|
-
task.timing.end = Date.now();
|
|
5425
|
-
task.timing.cost = task.timing.end - task.timing.start;
|
|
5426
|
-
taskIndex++;
|
|
5427
|
-
} catch (e) {
|
|
5428
|
-
successfullyCompleted = false;
|
|
5429
|
-
task.error = (e == null ? void 0 : e.message) || "error-without-message";
|
|
5430
|
-
task.errorStack = e.stack;
|
|
5431
|
-
task.status = "failed";
|
|
5432
|
-
task.timing.end = Date.now();
|
|
5433
|
-
task.timing.cost = task.timing.end - task.timing.start;
|
|
5465
|
+
return openai;
|
|
5466
|
+
}
|
|
5467
|
+
async function call(messages, responseFormat) {
|
|
5468
|
+
const openai = await createOpenAI();
|
|
5469
|
+
const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
|
|
5470
|
+
if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
|
|
5471
|
+
console.log(allAIConfig());
|
|
5472
|
+
}
|
|
5473
|
+
const startTime = Date.now();
|
|
5474
|
+
const model = getModelName();
|
|
5475
|
+
const completion = await openai.chat.completions.create({
|
|
5476
|
+
model,
|
|
5477
|
+
messages,
|
|
5478
|
+
response_format: responseFormat,
|
|
5479
|
+
temperature: 0.1,
|
|
5480
|
+
stream: false
|
|
5481
|
+
// betas: ['computer-use-2024-10-22'],
|
|
5482
|
+
});
|
|
5483
|
+
shouldPrintTiming && console.log(
|
|
5484
|
+
"Midscene - AI call",
|
|
5485
|
+
model,
|
|
5486
|
+
completion.usage,
|
|
5487
|
+
`${Date.now() - startTime}ms`
|
|
5488
|
+
);
|
|
5489
|
+
const { content } = completion.choices[0].message;
|
|
5490
|
+
(0, import_node_assert5.default)(content, "empty content");
|
|
5491
|
+
return content;
|
|
5492
|
+
}
|
|
5493
|
+
async function callToGetJSONObject(messages, AIActionTypeValue) {
|
|
5494
|
+
let responseFormat = {
|
|
5495
|
+
type: "json_object" /* JSON */
|
|
5496
|
+
};
|
|
5497
|
+
const model = getModelName();
|
|
5498
|
+
if (model === "gpt-4o-2024-08-06") {
|
|
5499
|
+
switch (AIActionTypeValue) {
|
|
5500
|
+
case 0 /* ASSERT */:
|
|
5501
|
+
responseFormat = assertSchema;
|
|
5502
|
+
break;
|
|
5503
|
+
case 1 /* INSPECT_ELEMENT */:
|
|
5504
|
+
responseFormat = findElementSchema;
|
|
5505
|
+
break;
|
|
5506
|
+
case 2 /* EXTRACT_DATA */:
|
|
5507
|
+
break;
|
|
5508
|
+
case 3 /* PLAN */:
|
|
5509
|
+
responseFormat = planSchema;
|
|
5434
5510
|
break;
|
|
5435
|
-
}
|
|
5436
|
-
}
|
|
5437
|
-
for (let i = taskIndex + 1; i < this.tasks.length; i++) {
|
|
5438
|
-
this.tasks[i].status = "cancelled";
|
|
5439
|
-
}
|
|
5440
|
-
if (successfullyCompleted) {
|
|
5441
|
-
this.status = "completed";
|
|
5442
|
-
} else {
|
|
5443
|
-
this.status = "error";
|
|
5444
|
-
}
|
|
5445
|
-
if (this.tasks.length) {
|
|
5446
|
-
const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
|
|
5447
|
-
return this.tasks[outputIndex].output;
|
|
5448
5511
|
}
|
|
5449
5512
|
}
|
|
5450
|
-
|
|
5451
|
-
|
|
5513
|
+
if (model.startsWith("gemini")) {
|
|
5514
|
+
responseFormat = { type: "text" /* TEXT */ };
|
|
5452
5515
|
}
|
|
5453
|
-
|
|
5454
|
-
|
|
5455
|
-
|
|
5456
|
-
|
|
5457
|
-
|
|
5458
|
-
|
|
5459
|
-
);
|
|
5460
|
-
if (errorTaskIndex >= 0) {
|
|
5461
|
-
return this.tasks[errorTaskIndex];
|
|
5462
|
-
}
|
|
5463
|
-
return null;
|
|
5516
|
+
const response = await call(messages, responseFormat);
|
|
5517
|
+
(0, import_node_assert5.default)(response, "empty response");
|
|
5518
|
+
const jsonContent = extractJSONFromCodeBlock(response);
|
|
5519
|
+
try {
|
|
5520
|
+
return JSON.parse(jsonContent);
|
|
5521
|
+
} catch (e) {
|
|
5522
|
+
throw Error(`parse json error: ${jsonContent}`);
|
|
5464
5523
|
}
|
|
5465
|
-
|
|
5466
|
-
|
|
5467
|
-
|
|
5468
|
-
|
|
5469
|
-
|
|
5470
|
-
name: this.name,
|
|
5471
|
-
description: this.description,
|
|
5472
|
-
tasks: this.tasks
|
|
5473
|
-
};
|
|
5474
|
-
return dumpData;
|
|
5524
|
+
}
|
|
5525
|
+
function extractJSONFromCodeBlock(response) {
|
|
5526
|
+
const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
|
|
5527
|
+
if (jsonMatch) {
|
|
5528
|
+
return jsonMatch[1];
|
|
5475
5529
|
}
|
|
5476
|
-
};
|
|
5477
|
-
|
|
5478
|
-
|
|
5479
|
-
|
|
5530
|
+
const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
|
|
5531
|
+
if (codeBlockMatch) {
|
|
5532
|
+
return codeBlockMatch[1];
|
|
5533
|
+
}
|
|
5534
|
+
const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
|
|
5535
|
+
if (jsonLikeMatch) {
|
|
5536
|
+
return jsonLikeMatch[0];
|
|
5537
|
+
}
|
|
5538
|
+
return response;
|
|
5539
|
+
}
|
|
5480
5540
|
|
|
5481
5541
|
// src/ai-model/inspect.ts
|
|
5482
5542
|
var import_node_assert6 = __toESM(require("assert"));
|
|
@@ -6094,6 +6154,7 @@ var src_default = Insight;
|
|
|
6094
6154
|
allAIConfig,
|
|
6095
6155
|
getAIConfig,
|
|
6096
6156
|
getElement,
|
|
6157
|
+
getLogDirByType,
|
|
6097
6158
|
getSection,
|
|
6098
6159
|
getVersion,
|
|
6099
6160
|
overrideAIConfig,
|