@midscene/core 0.8.4 → 0.8.5-beta-20241126063126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/lib/index.js CHANGED
@@ -4292,6 +4292,7 @@ __export(src_exports, {
4292
4292
  default: () => src_default,
4293
4293
  getAIConfig: () => getAIConfig,
4294
4294
  getElement: () => getElement,
4295
+ getLogDirByType: () => getLogDirByType,
4295
4296
  getSection: () => getSection,
4296
4297
  getVersion: () => getVersion,
4297
4298
  overrideAIConfig: () => overrideAIConfig,
@@ -4302,220 +4303,562 @@ __export(src_exports, {
4302
4303
  module.exports = __toCommonJS(src_exports);
4303
4304
 
4304
4305
  // src/action/executor.ts
4305
- var import_node_assert5 = __toESM(require("assert"));
4306
-
4307
- // src/ai-model/openai/index.ts
4308
- var import_node_assert3 = __toESM(require("assert"));
4306
+ var import_node_assert2 = __toESM(require("assert"));
4309
4307
 
4310
- // src/types.ts
4311
- var BaseElement = class {
4308
+ // src/env.ts
4309
+ var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
4310
+ var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
4311
+ var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
4312
+ var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
4313
+ var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
4314
+ var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
4315
+ var OPENAI_API_KEY = "OPENAI_API_KEY";
4316
+ var OPENAI_BASE_URL = "OPENAI_BASE_URL";
4317
+ var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
4318
+ var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
4319
+ var MIDSCENE_CACHE = "MIDSCENE_CACHE";
4320
+ var MATCH_BY_POSITION = "MATCH_BY_POSITION";
4321
+ var MIDSCENE_REPORT_TAG_NAME = "MIDSCENE_REPORT_TAG_NAME";
4322
+ var allConfigFromEnv = () => {
4323
+ return {
4324
+ [MIDSCENE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_OPENAI_INIT_CONFIG_JSON] || void 0,
4325
+ [MIDSCENE_MODEL_NAME]: process.env[MIDSCENE_MODEL_NAME] || void 0,
4326
+ [MIDSCENE_DEBUG_MODE]: process.env[MIDSCENE_DEBUG_MODE] || void 0,
4327
+ [MIDSCENE_LANGSMITH_DEBUG]: process.env[MIDSCENE_LANGSMITH_DEBUG] || void 0,
4328
+ [MIDSCENE_DEBUG_AI_PROFILE]: process.env[MIDSCENE_DEBUG_AI_PROFILE] || void 0,
4329
+ [MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG]: process.env[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG] || void 0,
4330
+ [OPENAI_API_KEY]: process.env[OPENAI_API_KEY] || void 0,
4331
+ [OPENAI_BASE_URL]: process.env[OPENAI_BASE_URL] || void 0,
4332
+ [MIDSCENE_MODEL_TEXT_ONLY]: process.env[MIDSCENE_MODEL_TEXT_ONLY] || void 0,
4333
+ [OPENAI_USE_AZURE]: process.env[OPENAI_USE_AZURE] || void 0,
4334
+ [MIDSCENE_CACHE]: process.env[MIDSCENE_CACHE] || void 0,
4335
+ [MATCH_BY_POSITION]: process.env[MATCH_BY_POSITION] || void 0,
4336
+ [MIDSCENE_REPORT_TAG_NAME]: process.env[MIDSCENE_REPORT_TAG_NAME] || void 0
4337
+ };
4312
4338
  };
4313
- var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
4314
- AIResponseFormat2["JSON"] = "json_object";
4315
- AIResponseFormat2["TEXT"] = "text";
4316
- return AIResponseFormat2;
4317
- })(AIResponseFormat || {});
4318
- var UIContext = class {
4339
+ var userConfig = {};
4340
+ var getAIConfig = (configKey) => {
4341
+ if (typeof userConfig[configKey] !== "undefined") {
4342
+ return userConfig[configKey];
4343
+ }
4344
+ return allConfigFromEnv()[configKey];
4345
+ };
4346
+ var allAIConfig = () => {
4347
+ return { ...allConfigFromEnv(), ...userConfig };
4348
+ };
4349
+ var overrideAIConfig = (newConfig, extendMode) => {
4350
+ userConfig = extendMode ? { ...userConfig, ...newConfig } : { ...newConfig };
4319
4351
  };
4320
4352
 
4321
- // src/ai-model/openai/index.ts
4322
- var import_utils = require("@midscene/shared/utils");
4323
- var import_openai5 = __toESM(require("openai"));
4324
-
4325
- // src/ai-model/coze/index.ts
4353
+ // src/utils.ts
4326
4354
  var import_node_assert = __toESM(require("assert"));
4327
- var COZE_INSPECT_ELEMENT_BOT_ID = process.env.COZE_INSPECT_ELEMENT_BOT_ID || "";
4328
- var COZE_AI_ACTION_BOT_ID = process.env.COZE_AI_ACTION_BOT_ID || "";
4329
- var COZE_AI_ASSERT_BOT_ID = process.env.COZE_AI_ASSERT_BOT_ID || "";
4330
- var COZE_EXTRACT_INFO_BOT_ID = process.env.COZE_EXTRACT_INFO_BOT_ID || "";
4331
- var COZE_BOT_TOKEN = "COZE_BOT_TOKEN";
4332
- function preferCozeModel(preferVendor) {
4333
- if (preferVendor && preferVendor !== "coze")
4334
- return false;
4335
- return process.env[COZE_BOT_TOKEN] && process.env.COZE_INSPECT_ELEMENT_BOT_ID && process.env.COZE_AI_ACTION_BOT_ID && process.env.COZE_AI_ASSERT_BOT_ID && process.env.COZE_EXTRACT_INFO_BOT_ID;
4355
+ var import_node_child_process = require("child_process");
4356
+ var import_node_fs = require("fs");
4357
+ var import_node_os = require("os");
4358
+ var import_node_path = require("path");
4359
+ var import_fs = require("@midscene/shared/fs");
4360
+ var import_utils = require("@midscene/shared/utils");
4361
+ var logDir = (0, import_node_path.join)(process.cwd(), "./midscene_run/");
4362
+ var logEnvReady = false;
4363
+ var insightDumpFileExt = "insight-dump.json";
4364
+ function getLogDir() {
4365
+ return logDir;
4336
4366
  }
4337
- async function callCozeAi(options) {
4338
- var _a, _b;
4339
- const { query, imgs, botId } = options;
4340
- const completion = await fetch("https://api.coze.com/open_api/v2/chat", {
4341
- method: "POST",
4342
- headers: {
4343
- Authorization: `Bearer ${process.env[COZE_BOT_TOKEN]}`,
4344
- "Content-Type": "application/json",
4345
- Accept: "*/*",
4346
- Host: "api.coze.com",
4347
- Connection: "keep-alive"
4348
- },
4349
- body: JSON.stringify({
4350
- conversation_id: "123",
4351
- bot_id: botId,
4352
- user: "29032201862555",
4353
- query,
4354
- meta_data: {
4355
- img: imgs.map((imgPath) => {
4356
- return {
4357
- url: imgPath
4358
- };
4359
- })
4360
- },
4361
- stream: false
4362
- })
4363
- });
4364
- if (!completion.ok) {
4365
- console.error("CozeAI reponse error", completion);
4366
- throw new Error("Network response was not ok");
4367
- }
4368
- const aiResponse = await completion.json();
4369
- if (aiResponse.code !== 0) {
4370
- console.error("CozeAI error response", aiResponse.msg);
4371
- throw new Error(`CozeAI error response ${aiResponse.msg}`);
4367
+ function setLogDir(dir) {
4368
+ logDir = dir;
4369
+ }
4370
+ function getLogDirByType(type) {
4371
+ const dir = (0, import_node_path.join)(getLogDir(), type);
4372
+ if (!(0, import_node_fs.existsSync)(dir)) {
4373
+ (0, import_node_fs.mkdirSync)(dir, { recursive: true });
4372
4374
  }
4373
- if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4374
- console.error("aiResponse", aiResponse);
4375
- throw new Error("aiResponse is undefined", aiResponse);
4375
+ return dir;
4376
+ }
4377
+ var reportTpl = null;
4378
+ function getReportTpl() {
4379
+ if (import_utils.ifInBrowser) {
4380
+ if (!reportTpl && window.midscene_report_tpl) {
4381
+ reportTpl = window.midscene_report_tpl;
4382
+ }
4383
+ (0, import_node_assert.default)(
4384
+ reportTpl,
4385
+ "reportTpl should be set before writing report in browser"
4386
+ );
4387
+ return reportTpl;
4376
4388
  }
4377
- const parseContent = (_b = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _b.content;
4378
- (0, import_node_assert.default)(parseContent, "empty content");
4379
- try {
4380
- return JSON.parse(parseContent);
4381
- } catch (err) {
4382
- console.error("can't parse coze content", aiResponse, err);
4383
- throw Error("can't parse coze content");
4389
+ if (!reportTpl) {
4390
+ let reportPath = (0, import_node_path.join)(__dirname, "../../report/index.html");
4391
+ if (!(0, import_node_fs.existsSync)(reportPath)) {
4392
+ reportPath = (0, import_node_path.join)(__dirname, "../report/index.html");
4393
+ }
4394
+ reportTpl = (0, import_node_fs.readFileSync)(reportPath, "utf-8");
4384
4395
  }
4396
+ return reportTpl;
4385
4397
  }
4386
- function transformOpenAiArgsToCoze(msg) {
4387
- if (msg.role !== "user")
4388
- throw Error(`can't transform ${msg} to coze args`);
4389
- if (typeof msg.content === "string") {
4390
- return {
4391
- query: msg.content,
4392
- imgs: []
4393
- };
4398
+ function reportHTMLContent(dumpData) {
4399
+ const tpl = getReportTpl();
4400
+ let reportContent;
4401
+ if (Array.isArray(dumpData) && dumpData.length === 0 || typeof dumpData === "undefined") {
4402
+ reportContent = tpl.replace(
4403
+ /\s+{{dump}}\s+/,
4404
+ `<script type="midscene_web_dump" type="application/json"></script>`
4405
+ );
4406
+ } else if (typeof dumpData === "string") {
4407
+ reportContent = tpl.replace(
4408
+ /\s+{{dump}}\s+/,
4409
+ `<script type="midscene_web_dump" type="application/json">${dumpData}</script>`
4410
+ );
4411
+ } else {
4412
+ const dumps = dumpData.map(({ dumpString, attributes }) => {
4413
+ const attributesArr = Object.keys(attributes || {}).map((key) => {
4414
+ return `${key}="${encodeURIComponent(attributes[key])}"`;
4415
+ });
4416
+ return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(
4417
+ " "
4418
+ )}
4419
+ >${dumpString}
4420
+ </script>`;
4421
+ });
4422
+ reportContent = tpl.replace(/\s+{{dump}}\s+/, dumps.join("\n"));
4394
4423
  }
4395
- return {
4396
- query: msg.content.reduce((res, next) => {
4397
- if (next.type === "text") {
4398
- res += `
4399
- ${next.text}`;
4400
- }
4401
- return res;
4402
- }, ""),
4403
- imgs: msg.content.reduce(
4404
- (res, next) => {
4405
- if (next.type === "image_url") {
4406
- res.push(next.image_url.url);
4407
- }
4408
- return res;
4409
- },
4410
- []
4411
- )
4412
- };
4424
+ return reportContent;
4413
4425
  }
4414
-
4415
- // src/ai-model/common.ts
4416
- async function callAiFn(options) {
4417
- const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
4418
- if (preferOpenAIModel(useModel)) {
4419
- const parseResult = await callToGetJSONObject(msgs, AIActionTypeValue);
4420
- return parseResult;
4426
+ function writeDumpReport(fileName, dumpData) {
4427
+ if (import_utils.ifInBrowser) {
4428
+ console.log("will not write report in browser");
4429
+ return null;
4421
4430
  }
4422
- if (preferCozeModel(useModel)) {
4423
- let botId = "";
4424
- switch (AIActionTypeValue) {
4425
- case 0 /* ASSERT */:
4426
- botId = COZE_AI_ASSERT_BOT_ID;
4427
- break;
4428
- case 2 /* EXTRACT_DATA */:
4429
- botId = COZE_EXTRACT_INFO_BOT_ID;
4430
- break;
4431
- case 1 /* INSPECT_ELEMENT */:
4432
- botId = COZE_INSPECT_ELEMENT_BOT_ID;
4433
- break;
4434
- default:
4435
- botId = COZE_AI_ACTION_BOT_ID;
4436
- }
4437
- const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
4438
- const parseResult = await callCozeAi({
4439
- ...cozeMsg,
4440
- botId
4441
- });
4442
- return parseResult;
4431
+ const midscenePkgInfo = (0, import_fs.getRunningPkgInfo)(__dirname);
4432
+ if (!midscenePkgInfo) {
4433
+ console.warn("midscenePkgInfo not found, will not write report");
4434
+ return null;
4443
4435
  }
4444
- throw Error(
4445
- "Cannot find Coze or OpenAI config. You should set at least one of them."
4446
- );
4436
+ const reportPath = (0, import_node_path.join)(getLogDirByType("report"), `${fileName}.html`);
4437
+ const reportContent = reportHTMLContent(dumpData);
4438
+ (0, import_node_fs.writeFileSync)(reportPath, reportContent);
4439
+ return reportPath;
4447
4440
  }
4448
- function transformUserMessages(msgs) {
4449
- const textOnly = Boolean(getAIConfig(MIDSCENE_MODEL_TEXT_ONLY));
4450
- if (!textOnly)
4451
- return msgs;
4452
- return msgs.reduce((res, msg) => {
4453
- if (msg.type === "text") {
4454
- res += msg.text;
4441
+ function writeLogFile(opts) {
4442
+ if (import_utils.ifInBrowser) {
4443
+ return "/mock/report.html";
4444
+ }
4445
+ const { fileName, fileExt, fileContent, type = "dump" } = opts;
4446
+ const targetDir = getLogDirByType(type);
4447
+ if (!logEnvReady) {
4448
+ (0, import_node_assert.default)(targetDir, "logDir should be set before writing dump file");
4449
+ const gitIgnorePath = (0, import_node_path.join)(targetDir, "../../.gitignore");
4450
+ let gitIgnoreContent = "";
4451
+ if ((0, import_node_fs.existsSync)(gitIgnorePath)) {
4452
+ gitIgnoreContent = (0, import_node_fs.readFileSync)(gitIgnorePath, "utf-8");
4455
4453
  }
4456
- return res;
4457
- }, "");
4454
+ const logDirName = (0, import_node_path.basename)(logDir);
4455
+ if (!gitIgnoreContent.includes(`${logDirName}/`)) {
4456
+ (0, import_node_fs.writeFileSync)(
4457
+ gitIgnorePath,
4458
+ `${gitIgnoreContent}
4459
+ # Midscene.js dump files
4460
+ ${logDirName}/report
4461
+ ${logDirName}/dump
4462
+ ${logDirName}/tmp
4463
+ `,
4464
+ "utf-8"
4465
+ );
4466
+ }
4467
+ logEnvReady = true;
4468
+ }
4469
+ const filePath = (0, import_node_path.join)(targetDir, `${fileName}.${fileExt}`);
4470
+ const outputResourceDir = (0, import_node_path.dirname)(filePath);
4471
+ if (!(0, import_node_fs.existsSync)(outputResourceDir)) {
4472
+ (0, import_node_fs.mkdirSync)(outputResourceDir, { recursive: true });
4473
+ }
4474
+ (0, import_node_fs.writeFileSync)(filePath, fileContent);
4475
+ if (opts == null ? void 0 : opts.generateReport) {
4476
+ return writeDumpReport(fileName, fileContent);
4477
+ }
4478
+ return filePath;
4458
4479
  }
4459
-
4460
- // src/ai-model/prompt/element_inspector.ts
4461
- function systemPromptToFindElement() {
4462
- if (getAIConfig(MATCH_BY_POSITION)) {
4463
- return systemPromptToFindElementPosition();
4480
+ function replacerForPageObject(key, value) {
4481
+ var _a, _b;
4482
+ if (value && ((_a = value.constructor) == null ? void 0 : _a.name) === "Page") {
4483
+ return "[Page object]";
4464
4484
  }
4465
- return `
4466
- ## Role:
4467
- You are an expert in software page image (2D) and page element text analysis.
4468
-
4469
- ## Objective:
4470
- - Identify elements in screenshots and text that match the user's description.
4471
- - Return JSON data containing the selection reason and element ID.
4485
+ if (value && ((_b = value.constructor) == null ? void 0 : _b.name) === "Browser") {
4486
+ return "[Browser object]";
4487
+ }
4488
+ return value;
4489
+ }
4490
+ function stringifyDumpData(data, indents) {
4491
+ return JSON.stringify(data, replacerForPageObject, indents);
4492
+ }
4493
+ function getVersion() {
4494
+ return "0.8.5-beta-20241126063126.0";
4495
+ }
4472
4496
 
4473
- ## Skills:
4474
- - Image analysis and recognition
4475
- - Multilingual text understanding
4476
- - Software UI design and testing
4497
+ // src/action/executor.ts
4498
+ var Executor = class {
4499
+ constructor(name, description, tasks) {
4500
+ __publicField(this, "name");
4501
+ __publicField(this, "description");
4502
+ __publicField(this, "tasks");
4503
+ // status of executor
4504
+ __publicField(this, "status");
4505
+ this.status = tasks && tasks.length > 0 ? "pending" : "init";
4506
+ this.name = name;
4507
+ this.description = description;
4508
+ this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
4509
+ }
4510
+ markTaskAsPending(task) {
4511
+ return {
4512
+ status: "pending",
4513
+ ...task
4514
+ };
4515
+ }
4516
+ async append(task) {
4517
+ var _a, _b;
4518
+ (0, import_node_assert2.default)(
4519
+ this.status !== "error",
4520
+ `executor is in error state, cannot append task
4521
+ error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
4522
+ ${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
4523
+ );
4524
+ if (Array.isArray(task)) {
4525
+ this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
4526
+ } else {
4527
+ this.tasks.push(this.markTaskAsPending(task));
4528
+ }
4529
+ if (this.status !== "running") {
4530
+ this.status = "pending";
4531
+ }
4532
+ }
4533
+ async flush() {
4534
+ if (this.status === "init" && this.tasks.length > 0) {
4535
+ console.warn(
4536
+ "illegal state for executor, status is init but tasks are not empty"
4537
+ );
4538
+ }
4539
+ (0, import_node_assert2.default)(this.status !== "running", "executor is already running");
4540
+ (0, import_node_assert2.default)(this.status !== "completed", "executor is already completed");
4541
+ (0, import_node_assert2.default)(this.status !== "error", "executor is in error state");
4542
+ const nextPendingIndex = this.tasks.findIndex(
4543
+ (task) => task.status === "pending"
4544
+ );
4545
+ if (nextPendingIndex < 0) {
4546
+ return;
4547
+ }
4548
+ this.status = "running";
4549
+ let taskIndex = nextPendingIndex;
4550
+ let successfullyCompleted = true;
4551
+ let previousFindOutput;
4552
+ while (taskIndex < this.tasks.length) {
4553
+ const task = this.tasks[taskIndex];
4554
+ (0, import_node_assert2.default)(
4555
+ task.status === "pending",
4556
+ `task status should be pending, but got: ${task.status}`
4557
+ );
4558
+ task.timing = {
4559
+ start: Date.now()
4560
+ };
4561
+ try {
4562
+ task.status = "running";
4563
+ (0, import_node_assert2.default)(
4564
+ ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
4565
+ `unsupported task type: ${task.type}`
4566
+ );
4567
+ const { executor, param } = task;
4568
+ (0, import_node_assert2.default)(executor, `executor is required for task type: ${task.type}`);
4569
+ let returnValue;
4570
+ const executorContext = {
4571
+ task,
4572
+ element: previousFindOutput == null ? void 0 : previousFindOutput.element
4573
+ };
4574
+ if (task.type === "Insight") {
4575
+ (0, import_node_assert2.default)(
4576
+ task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
4577
+ `unsupported insight subType: ${task.subType}`
4578
+ );
4579
+ returnValue = await task.executor(param, executorContext);
4580
+ if (task.subType === "Locate") {
4581
+ previousFindOutput = returnValue == null ? void 0 : returnValue.output;
4582
+ }
4583
+ } else if (task.type === "Action" || task.type === "Planning") {
4584
+ returnValue = await task.executor(param, executorContext);
4585
+ } else {
4586
+ console.warn(
4587
+ `unsupported task type: ${task.type}, will try to execute it directly`
4588
+ );
4589
+ returnValue = await task.executor(param, executorContext);
4590
+ }
4591
+ Object.assign(task, returnValue);
4592
+ task.status = "finished";
4593
+ task.timing.end = Date.now();
4594
+ task.timing.cost = task.timing.end - task.timing.start;
4595
+ taskIndex++;
4596
+ } catch (e) {
4597
+ successfullyCompleted = false;
4598
+ task.error = (e == null ? void 0 : e.message) || "error-without-message";
4599
+ task.errorStack = e.stack;
4600
+ task.status = "failed";
4601
+ task.timing.end = Date.now();
4602
+ task.timing.cost = task.timing.end - task.timing.start;
4603
+ break;
4604
+ }
4605
+ }
4606
+ for (let i = taskIndex + 1; i < this.tasks.length; i++) {
4607
+ this.tasks[i].status = "cancelled";
4608
+ }
4609
+ if (successfullyCompleted) {
4610
+ this.status = "completed";
4611
+ } else {
4612
+ this.status = "error";
4613
+ }
4614
+ if (this.tasks.length) {
4615
+ const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
4616
+ return this.tasks[outputIndex].output;
4617
+ }
4618
+ }
4619
+ isInErrorState() {
4620
+ return this.status === "error";
4621
+ }
4622
+ latestErrorTask() {
4623
+ if (this.status !== "error") {
4624
+ return null;
4625
+ }
4626
+ const errorTaskIndex = this.tasks.findIndex(
4627
+ (task) => task.status === "failed"
4628
+ );
4629
+ if (errorTaskIndex >= 0) {
4630
+ return this.tasks[errorTaskIndex];
4631
+ }
4632
+ return null;
4633
+ }
4634
+ dump() {
4635
+ const dumpData = {
4636
+ sdkVersion: getVersion(),
4637
+ model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
4638
+ logTime: Date.now(),
4639
+ name: this.name,
4640
+ description: this.description,
4641
+ tasks: this.tasks
4642
+ };
4643
+ return dumpData;
4644
+ }
4645
+ };
4477
4646
 
4478
- ## Workflow:
4479
- 1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
4480
- 2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
4481
- 3. Found the required number of elements
4482
- 4. Return JSON data containing the selection reason and element ID.
4647
+ // src/insight/index.ts
4648
+ var import_node_assert9 = __toESM(require("assert"));
4483
4649
 
4484
- ## Constraints:
4485
- - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
4486
- - Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
4487
- - Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
4488
- - If no elements are found, the "elements" array should be empty.
4489
- - The returned data must conform to the specified JSON format.
4490
- - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
4650
+ // src/ai-model/openai/index.ts
4651
+ var import_node_assert5 = __toESM(require("assert"));
4491
4652
 
4492
- ## Output Format:
4653
+ // src/types.ts
4654
+ var BaseElement = class {
4655
+ };
4656
+ var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
4657
+ AIResponseFormat2["JSON"] = "json_object";
4658
+ AIResponseFormat2["TEXT"] = "text";
4659
+ return AIResponseFormat2;
4660
+ })(AIResponseFormat || {});
4661
+ var UIContext = class {
4662
+ };
4493
4663
 
4494
- Please return the result in JSON format as follows:
4664
+ // src/ai-model/openai/index.ts
4665
+ var import_utils3 = require("@midscene/shared/utils");
4666
+ var import_openai2 = __toESM(require("openai"));
4495
4667
 
4496
- \`\`\`json
4497
- {
4498
- "elements": [
4499
- // If no matching elements are found, return an empty array []
4500
- {
4501
- "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
4502
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4503
- "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
4504
- }
4505
- // More elements...
4506
- ],
4507
- "errors": [] // Array of strings containing any error messages
4668
+ // src/ai-model/coze/index.ts
4669
+ var import_node_assert3 = __toESM(require("assert"));
4670
+ var COZE_INSPECT_ELEMENT_BOT_ID = process.env.COZE_INSPECT_ELEMENT_BOT_ID || "";
4671
+ var COZE_AI_ACTION_BOT_ID = process.env.COZE_AI_ACTION_BOT_ID || "";
4672
+ var COZE_AI_ASSERT_BOT_ID = process.env.COZE_AI_ASSERT_BOT_ID || "";
4673
+ var COZE_EXTRACT_INFO_BOT_ID = process.env.COZE_EXTRACT_INFO_BOT_ID || "";
4674
+ var COZE_BOT_TOKEN = "COZE_BOT_TOKEN";
4675
+ function preferCozeModel(preferVendor) {
4676
+ if (preferVendor && preferVendor !== "coze")
4677
+ return false;
4678
+ return process.env[COZE_BOT_TOKEN] && process.env.COZE_INSPECT_ELEMENT_BOT_ID && process.env.COZE_AI_ACTION_BOT_ID && process.env.COZE_AI_ASSERT_BOT_ID && process.env.COZE_EXTRACT_INFO_BOT_ID;
4508
4679
  }
4509
- \`\`\`
4510
-
4511
- ## Example:
4512
- Example 1:
4513
- Input Example:
4514
- \`\`\`json
4515
- // Description: "Shopping cart icon in the upper right corner"
4516
- {
4517
- "description": "PLACEHOLDER", // Description of the target element
4518
- "multi": "PLACEHOLDER", //Find the number of elements
4680
+ async function callCozeAi(options) {
4681
+ var _a, _b;
4682
+ const { query, imgs, botId } = options;
4683
+ const completion = await fetch("https://api.coze.com/open_api/v2/chat", {
4684
+ method: "POST",
4685
+ headers: {
4686
+ Authorization: `Bearer ${process.env[COZE_BOT_TOKEN]}`,
4687
+ "Content-Type": "application/json",
4688
+ Accept: "*/*",
4689
+ Host: "api.coze.com",
4690
+ Connection: "keep-alive"
4691
+ },
4692
+ body: JSON.stringify({
4693
+ conversation_id: "123",
4694
+ bot_id: botId,
4695
+ user: "29032201862555",
4696
+ query,
4697
+ meta_data: {
4698
+ img: imgs.map((imgPath) => {
4699
+ return {
4700
+ url: imgPath
4701
+ };
4702
+ })
4703
+ },
4704
+ stream: false
4705
+ })
4706
+ });
4707
+ if (!completion.ok) {
4708
+ console.error("CozeAI reponse error", completion);
4709
+ throw new Error("Network response was not ok");
4710
+ }
4711
+ const aiResponse = await completion.json();
4712
+ if (aiResponse.code !== 0) {
4713
+ console.error("CozeAI error response", aiResponse.msg);
4714
+ throw new Error(`CozeAI error response ${aiResponse.msg}`);
4715
+ }
4716
+ if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
4717
+ console.error("aiResponse", aiResponse);
4718
+ throw new Error("aiResponse is undefined", aiResponse);
4719
+ }
4720
+ const parseContent = (_b = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _b.content;
4721
+ (0, import_node_assert3.default)(parseContent, "empty content");
4722
+ try {
4723
+ return JSON.parse(parseContent);
4724
+ } catch (err) {
4725
+ console.error("can't parse coze content", aiResponse, err);
4726
+ throw Error("can't parse coze content");
4727
+ }
4728
+ }
4729
+ function transformOpenAiArgsToCoze(msg) {
4730
+ if (msg.role !== "user")
4731
+ throw Error(`can't transform ${msg} to coze args`);
4732
+ if (typeof msg.content === "string") {
4733
+ return {
4734
+ query: msg.content,
4735
+ imgs: []
4736
+ };
4737
+ }
4738
+ return {
4739
+ query: msg.content.reduce((res, next) => {
4740
+ if (next.type === "text") {
4741
+ res += `
4742
+ ${next.text}`;
4743
+ }
4744
+ return res;
4745
+ }, ""),
4746
+ imgs: msg.content.reduce(
4747
+ (res, next) => {
4748
+ if (next.type === "image_url") {
4749
+ res.push(next.image_url.url);
4750
+ }
4751
+ return res;
4752
+ },
4753
+ []
4754
+ )
4755
+ };
4756
+ }
4757
+
4758
+ // src/ai-model/common.ts
4759
+ async function callAiFn(options) {
4760
+ const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
4761
+ if (preferOpenAIModel(useModel)) {
4762
+ const parseResult = await callToGetJSONObject(msgs, AIActionTypeValue);
4763
+ return parseResult;
4764
+ }
4765
+ if (preferCozeModel(useModel)) {
4766
+ let botId = "";
4767
+ switch (AIActionTypeValue) {
4768
+ case 0 /* ASSERT */:
4769
+ botId = COZE_AI_ASSERT_BOT_ID;
4770
+ break;
4771
+ case 2 /* EXTRACT_DATA */:
4772
+ botId = COZE_EXTRACT_INFO_BOT_ID;
4773
+ break;
4774
+ case 1 /* INSPECT_ELEMENT */:
4775
+ botId = COZE_INSPECT_ELEMENT_BOT_ID;
4776
+ break;
4777
+ default:
4778
+ botId = COZE_AI_ACTION_BOT_ID;
4779
+ }
4780
+ const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
4781
+ const parseResult = await callCozeAi({
4782
+ ...cozeMsg,
4783
+ botId
4784
+ });
4785
+ return parseResult;
4786
+ }
4787
+ throw Error(
4788
+ "Cannot find Coze or OpenAI config. You should set at least one of them."
4789
+ );
4790
+ }
4791
+ function transformUserMessages(msgs) {
4792
+ const textOnly = Boolean(getAIConfig(MIDSCENE_MODEL_TEXT_ONLY));
4793
+ if (!textOnly)
4794
+ return msgs;
4795
+ return msgs.reduce((res, msg) => {
4796
+ if (msg.type === "text") {
4797
+ res += msg.text;
4798
+ }
4799
+ return res;
4800
+ }, "");
4801
+ }
4802
+
4803
+ // src/ai-model/prompt/element_inspector.ts
4804
+ function systemPromptToFindElement() {
4805
+ if (getAIConfig(MATCH_BY_POSITION)) {
4806
+ return systemPromptToFindElementPosition();
4807
+ }
4808
+ return `
4809
+ ## Role:
4810
+ You are an expert in software page image (2D) and page element text analysis.
4811
+
4812
+ ## Objective:
4813
+ - Identify elements in screenshots and text that match the user's description.
4814
+ - Return JSON data containing the selection reason and element ID.
4815
+
4816
+ ## Skills:
4817
+ - Image analysis and recognition
4818
+ - Multilingual text understanding
4819
+ - Software UI design and testing
4820
+
4821
+ ## Workflow:
4822
+ 1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
4823
+ 2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
4824
+ 3. Found the required number of elements
4825
+ 4. Return JSON data containing the selection reason and element ID.
4826
+
4827
+ ## Constraints:
4828
+ - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
4829
+ - Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
4830
+ - Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
4831
+ - If no elements are found, the "elements" array should be empty.
4832
+ - The returned data must conform to the specified JSON format.
4833
+ - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
4834
+
4835
+ ## Output Format:
4836
+
4837
+ Please return the result in JSON format as follows:
4838
+
4839
+ \`\`\`json
4840
+ {
4841
+ "elements": [
4842
+ // If no matching elements are found, return an empty array []
4843
+ {
4844
+ "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
4845
+ "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4846
+ "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
4847
+ }
4848
+ // More elements...
4849
+ ],
4850
+ "errors": [] // Array of strings containing any error messages
4851
+ }
4852
+ \`\`\`
4853
+
4854
+ ## Example:
4855
+ Example 1:
4856
+ Input Example:
4857
+ \`\`\`json
4858
+ // Description: "Shopping cart icon in the upper right corner"
4859
+ {
4860
+ "description": "PLACEHOLDER", // Description of the target element
4861
+ "multi": "PLACEHOLDER", //Find the number of elements
4519
4862
  "screenshot": "path/screenshot.png",
4520
4863
  "text": '{
4521
4864
  "pageSize": {
@@ -4524,7 +4867,7 @@ Input Example:
4524
4867
  },
4525
4868
  "elementInfos": [
4526
4869
  {
4527
- "id": "we23xsfwe", // ID of the element
4870
+ "id": "1231", // ID of the element
4528
4871
  "indexId": "0", // Index of the element,The image is labeled to the left of the element
4529
4872
  "attributes": { // Attributes of the element
4530
4873
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4540,7 +4883,7 @@ Input Example:
4540
4883
  }
4541
4884
  },
4542
4885
  {
4543
- "id": "wefew2222few2", // ID of the element
4886
+ "id": "66551", // ID of the element
4544
4887
  "indexId": "1", // Index of the element,The image is labeled to the left of the element
4545
4888
  "attributes": { // Attributes of the element
4546
4889
  "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4557,7 +4900,7 @@ Input Example:
4557
4900
  },
4558
4901
  ...
4559
4902
  {
4560
- "id": "kwekfj2323",
4903
+ "id": "12344",
4561
4904
  "indexId": "2", // Index of the element,The image is labeled to the left of the element
4562
4905
  "attributes": {
4563
4906
  "nodeType": "TEXT Node",
@@ -4590,7 +4933,7 @@ Output Example:
4590
4933
  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
4591
4934
  "text": "",
4592
4935
  // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
4593
- "id": "wefew2222few2"
4936
+ "id": "1231"
4594
4937
  }
4595
4938
  ],
4596
4939
  "errors": []
@@ -4677,6 +5020,19 @@ var findElementSchema = {
4677
5020
  };
4678
5021
 
4679
5022
  // src/ai-model/prompt/planning.ts
5023
+ var quickAnswerFormat = () => {
5024
+ const matchByPosition = getAIConfig(MATCH_BY_POSITION);
5025
+ const description = `
5026
+ ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
5027
+ `;
5028
+ const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
5029
+ const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
5030
+ return {
5031
+ description,
5032
+ format,
5033
+ sample
5034
+ };
5035
+ };
4680
5036
  function systemPromptToTaskPlanning() {
4681
5037
  return `
4682
5038
  ## Role:
@@ -4700,32 +5056,24 @@ Each action has a type and corresponding param. To be detailed:
4700
5056
  * type: 'KeyboardPress', press a key
4701
5057
  * param: { value: string }, the value to input or the key to press. Use (Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta) to represent the key.
4702
5058
  * type: 'Scroll'
4703
- * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
5059
+ * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
4704
5060
  * type: 'Error'
4705
5061
  * param: { message: string }, the error message
4706
5062
  * type: 'Sleep'
4707
5063
  * param: { timeMs: number }, wait for timeMs milliseconds
4708
5064
 
4709
- Here is an example of how to decompose a task.
4710
- When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
4711
- * Locate: 'The search bar'
4712
- * Input: 'Weather in Shanghai'
4713
- * Sleep: 1000
4714
- * KeyboardPress: 'Enter'
4715
-
4716
5065
  Remember:
4717
5066
  1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
4718
- 2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
5067
+ 2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
5068
+ 3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
4719
5069
 
4720
- If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
5070
+ ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
4721
5071
 
4722
- ## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
4723
-
4724
- If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
5072
+ If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
4725
5073
  {
4726
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
5074
+ "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
4727
5075
  "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4728
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
5076
+ ${quickAnswerFormat().description}
4729
5077
  }
4730
5078
 
4731
5079
  If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4738,33 +5086,71 @@ Please return the result in JSON format as follows:
4738
5086
  actions: [ // always return in Array
4739
5087
  {
4740
5088
  "thought": "find out the search bar",
4741
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4742
- "param": {
5089
+ "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
5090
+ "param": { //
4743
5091
  "prompt": "The search bar"
4744
5092
  },
4745
- "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
4746
- "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
4747
- "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
4748
- ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
5093
+ "quickAnswer": {
5094
+ "reason": "This is ...",
5095
+ "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
5096
+ ${quickAnswerFormat().format}
4749
5097
  } | null,
4750
5098
  },
4751
5099
  {
4752
5100
  "thought": "Reasons for generating this task, and why this task is feasible on this page",
4753
- "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
4754
- "param": any, // Parameter towards the task type
5101
+ "type": "Tap",
5102
+ "param": null,
4755
5103
  },
5104
+ // ... more actions
5105
+ ],
5106
+ error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
5107
+ }
5108
+
5109
+ ## Here is an example of how to decompose a task
5110
+
5111
+ When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
5112
+
5113
+ * The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
5114
+ * Think and look in detail and fill all the fields in the JSON format.
5115
+
5116
+ \`\`\`json
5117
+ {
5118
+ queryLanguage: 'English',
5119
+ actions:[
4756
5120
  {
4757
- "thought": "Reasons for generating this task, and why this task is feasible on this page",
4758
- "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
4759
- "param": {
4760
- "prompt": "The search bar"
5121
+ thought: "Locate the language switch button with the text '中文'.",
5122
+ type: 'Locate',
5123
+ param: { prompt: "The language switch button with the text '中文'" },
5124
+ quickAnswer: { // according to Objective 2, this action type is 'Locate', and we can find the element, so we need to give a quick answer
5125
+ reason: "It is located near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
5126
+ text: '中文',
5127
+ ${quickAnswerFormat().sample}
4761
5128
  },
4762
- "quickAnswer": null,
4763
5129
  },
4764
- // ... more actions
5130
+ {
5131
+ thought: 'Click the language switch button to open the language options.',
5132
+ type: 'Tap',
5133
+ param: null,
5134
+ },
5135
+ {
5136
+ thought: 'Wait for 1 second to ensure the language options are displayed.',
5137
+ type: 'Sleep',
5138
+ param: { timeMs: 1000 },
5139
+ },
5140
+ {
5141
+ thought: "Locate the 'English' option in the language menu.",
5142
+ type: 'Locate',
5143
+ param: { prompt: "The 'English' option in the language menu" },
5144
+ quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
5145
+ },
5146
+ {
5147
+ thought: "Click the 'English' option to switch the language.",
5148
+ type: 'Tap',
5149
+ param: null,
5150
+ }
4765
5151
  ],
4766
- error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
4767
5152
  }
5153
+ \`\`\`
4768
5154
  `;
4769
5155
  }
4770
5156
  var planSchema = {
@@ -4802,7 +5188,7 @@ var planSchema = {
4802
5188
  properties: {
4803
5189
  reason: {
4804
5190
  type: "string",
4805
- description: "Reason for finding element 4"
5191
+ description: "Reason for finding this element"
4806
5192
  },
4807
5193
  text: {
4808
5194
  type: "string",
@@ -4824,659 +5210,333 @@ var planSchema = {
4824
5210
  },
4825
5211
  error: {
4826
5212
  type: ["string", "null"],
4827
- description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
4828
- }
4829
- },
4830
- required: ["queryLanguage", "actions", "error"],
4831
- additionalProperties: false
4832
- }
4833
- }
4834
- };
4835
-
4836
- // src/ai-model/prompt/util.ts
4837
- var import_node_assert2 = __toESM(require("assert"));
4838
-
4839
- // src/image/index.ts
4840
- var import_img = require("@midscene/shared/img");
4841
-
4842
- // src/ai-model/prompt/util.ts
4843
- var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
4844
- var contextFormatIntro = `
4845
- The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
4846
- var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
4847
- var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
4848
- var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
4849
- function systemPromptToExtract() {
4850
- return `
4851
- You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
4852
- The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
4853
-
4854
- You have the following skills:
4855
-
4856
- skill name: extract_data_from_UI
4857
- related input: DATA_DEMAND
4858
- skill content:
4859
- * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
4860
- * There may be some special commands in DATA_DEMAND, please pay extra attention
4861
- - LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
4862
- - LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
4863
-
4864
- Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
4865
-
4866
-
4867
-
4868
- Return in the following JSON format:
4869
- {
4870
- language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
4871
- data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
4872
- errors: [], // string[], error message if any
4873
- }
4874
- `;
4875
- }
4876
- function systemPromptToAssert() {
4877
- return `
4878
- ${characteristic}
4879
- ${contextFormatIntro}
4880
-
4881
- Based on the information you get, Return assertion judgment:
4882
-
4883
- Return in the following JSON format:
4884
- {
4885
- thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
4886
- pass: true, // true or false, whether the assertion is passed
4887
- }
4888
- `;
4889
- }
4890
- var assertSchema = {
4891
- type: "json_schema",
4892
- json_schema: {
4893
- name: "assert",
4894
- strict: true,
4895
- schema: {
4896
- type: "object",
4897
- properties: {
4898
- thought: {
4899
- type: "string",
4900
- description: "The thought process behind the assertion"
4901
- },
4902
- pass: {
4903
- type: "boolean",
4904
- description: "Whether the assertion passed or failed"
4905
- }
4906
- },
4907
- required: ["thought", "pass"],
4908
- additionalProperties: false
4909
- }
4910
- }
4911
- };
4912
- function describeSize(size) {
4913
- return `${size.width} x ${size.height}`;
4914
- }
4915
- function truncateText(text) {
4916
- const maxLength = 50;
4917
- if (text && text.length > maxLength) {
4918
- return `${text.slice(0, maxLength)}...`;
4919
- }
4920
- return text;
4921
- }
4922
- function elementByPosition(elementsInfo, position) {
4923
- (0, import_node_assert2.default)(typeof position !== "undefined", "position is required for query");
4924
- const item = elementsInfo.find((item2) => {
4925
- return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
4926
- });
4927
- return item;
4928
- }
4929
- async function describeUserPage(context) {
4930
- const { screenshotBase64 } = context;
4931
- let width;
4932
- let height;
4933
- if (context.size) {
4934
- ({ width, height } = context.size);
4935
- } else {
4936
- const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
4937
- ({ width, height } = imgSize);
4938
- }
4939
- const elementsInfo = context.content;
4940
- const idElementMap = {};
4941
- elementsInfo.forEach((item) => {
4942
- idElementMap[item.id] = item;
4943
- return { ...item };
4944
- });
4945
- const elementInfosDescription = cropFieldInformation(elementsInfo);
4946
- return {
4947
- description: `
4948
- {
4949
- // The size of the page
4950
- "pageSize": ${describeSize({ width, height })},
4951
-
4952
- ${// if match by id, use the description of the element
4953
- !getAIConfig(MATCH_BY_POSITION) ? `
4954
- // json description of the element
4955
- "content": ${JSON.stringify(elementInfosDescription)}
4956
- ` : ""}
4957
- }`,
4958
- elementById(id) {
4959
- (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
4960
- const item = idElementMap[`${id}`];
4961
- return item;
4962
- },
4963
- elementByPosition(position) {
4964
- return elementByPosition(elementsInfo, position);
4965
- }
4966
- };
4967
- }
4968
- function cropFieldInformation(elementsInfo) {
4969
- const elementInfosDescription = elementsInfo.map(
4970
- (item) => {
4971
- const { id, attributes = {}, rect, content } = item;
4972
- const tailorContent = truncateText(content);
4973
- const tailorAttributes = Object.keys(attributes).reduce(
4974
- (res, currentKey) => {
4975
- const attributeVal = attributes[currentKey];
4976
- res[currentKey] = truncateText(attributeVal);
4977
- return res;
4978
- },
4979
- {}
4980
- );
4981
- return {
4982
- id,
4983
- markerId: item.indexId,
4984
- attributes: tailorAttributes,
4985
- rect,
4986
- content: tailorContent
4987
- };
4988
- }
4989
- );
4990
- return JSON.stringify(elementInfosDescription);
4991
- }
4992
- function retrieveElement(prompt, opt) {
4993
- if (opt == null ? void 0 : opt.multi) {
4994
- return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
4995
- }
4996
- return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
4997
- }
4998
- function ifElementTypeResponse(response) {
4999
- if (typeof response !== "string") {
5000
- return false;
5001
- }
5002
- return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
5003
- }
5004
- function splitElementResponse(response) {
5005
- const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
5006
- if (response.startsWith(oneElementSplitter)) {
5007
- const id = response.slice(oneElementSplitter.length);
5008
- if (id.indexOf(",") >= 0) {
5009
- console.warn(`unexpected comma in one element response: ${id}`);
5010
- }
5011
- return id ? id : null;
5012
- }
5013
- const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
5014
- if (response.startsWith(elementsSplitter)) {
5015
- const idsString = response.slice(elementsSplitter.length);
5016
- if (!idsString) {
5017
- return [];
5018
- }
5019
- return idsString.split(",");
5020
- }
5021
- return null;
5022
- }
5023
- function retrieveSection(prompt) {
5024
- return `${SECTION_MATCHER_FLAG}${prompt}`;
5025
- }
5026
-
5027
- // src/ai-model/openai/index.ts
5028
- var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
5029
- var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
5030
- var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
5031
- var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
5032
- var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
5033
- var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
5034
- var OPENAI_API_KEY = "OPENAI_API_KEY";
5035
- var OPENAI_BASE_URL = "OPENAI_BASE_URL";
5036
- var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
5037
- var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
5038
- var MIDSCENE_CACHE = "MIDSCENE_CACHE";
5039
- var MATCH_BY_POSITION = "MATCH_BY_POSITION";
5040
- var allConfigFromEnv = () => {
5041
- return {
5042
- [MIDSCENE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_OPENAI_INIT_CONFIG_JSON] || void 0,
5043
- [MIDSCENE_MODEL_NAME]: process.env[MIDSCENE_MODEL_NAME] || void 0,
5044
- [MIDSCENE_DEBUG_MODE]: process.env[MIDSCENE_DEBUG_MODE] || void 0,
5045
- [MIDSCENE_LANGSMITH_DEBUG]: process.env[MIDSCENE_LANGSMITH_DEBUG] || void 0,
5046
- [MIDSCENE_DEBUG_AI_PROFILE]: process.env[MIDSCENE_DEBUG_AI_PROFILE] || void 0,
5047
- [MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG]: process.env[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG] || void 0,
5048
- [OPENAI_API_KEY]: process.env[OPENAI_API_KEY] || void 0,
5049
- [OPENAI_BASE_URL]: process.env[OPENAI_BASE_URL] || void 0,
5050
- [MIDSCENE_MODEL_TEXT_ONLY]: process.env[MIDSCENE_MODEL_TEXT_ONLY] || void 0,
5051
- [OPENAI_USE_AZURE]: process.env[OPENAI_USE_AZURE] || void 0,
5052
- [MIDSCENE_CACHE]: process.env[MIDSCENE_CACHE] || void 0,
5053
- [MATCH_BY_POSITION]: process.env[MATCH_BY_POSITION] || void 0
5054
- };
5055
- };
5056
- var userConfig = {};
5057
- var getAIConfig = (configKey) => {
5058
- if (typeof userConfig[configKey] !== "undefined") {
5059
- return userConfig[configKey];
5060
- }
5061
- return allConfigFromEnv()[configKey];
5062
- };
5063
- var allAIConfig = () => {
5064
- return { ...allConfigFromEnv(), ...userConfig };
5065
- };
5066
- var overrideAIConfig = (newConfig, extendMode) => {
5067
- userConfig = extendMode ? { ...userConfig, ...newConfig } : { ...newConfig };
5068
- };
5069
- function preferOpenAIModel(preferVendor) {
5070
- if (preferVendor && preferVendor !== "openAI")
5071
- return false;
5072
- if (getAIConfig(OPENAI_API_KEY))
5073
- return true;
5074
- return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
5075
- }
5076
- var defaultModel = "gpt-4o-2024-08-06";
5077
- function getModelName() {
5078
- let modelName = defaultModel;
5079
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
5080
- if (nameInConfig) {
5081
- modelName = nameInConfig;
5082
- }
5083
- return modelName;
5084
- }
5085
- async function createOpenAI() {
5086
- let openai;
5087
- const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
5088
- const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
5089
- if (getAIConfig(OPENAI_USE_AZURE)) {
5090
- openai = new import_openai5.AzureOpenAI({
5091
- baseURL: getAIConfig(OPENAI_BASE_URL),
5092
- apiKey: getAIConfig(OPENAI_API_KEY),
5093
- ...extraConfig,
5094
- dangerouslyAllowBrowser: true
5095
- });
5096
- } else {
5097
- openai = new import_openai5.default({
5098
- baseURL: getAIConfig(OPENAI_BASE_URL),
5099
- apiKey: getAIConfig(OPENAI_API_KEY),
5100
- ...extraConfig,
5101
- dangerouslyAllowBrowser: true
5102
- });
5103
- }
5104
- if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
5105
- if (import_utils.ifInBrowser) {
5106
- throw new Error("langsmith is not supported in browser");
5107
- }
5108
- console.log("DEBUGGING MODE: langsmith wrapper enabled");
5109
- const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
5110
- openai = wrapOpenAI2(openai);
5111
- }
5112
- return openai;
5113
- }
5114
- async function call(messages, responseFormat) {
5115
- const openai = await createOpenAI();
5116
- const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
5117
- if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
5118
- console.log(allAIConfig());
5119
- }
5120
- const startTime = Date.now();
5121
- const model = getModelName();
5122
- const completion = await openai.chat.completions.create({
5123
- model,
5124
- messages,
5125
- response_format: responseFormat,
5126
- temperature: 0.1,
5127
- stream: false
5128
- // betas: ['computer-use-2024-10-22'],
5129
- });
5130
- shouldPrintTiming && console.log(
5131
- "Midscene - AI call",
5132
- model,
5133
- completion.usage,
5134
- `${Date.now() - startTime}ms`
5135
- );
5136
- const { content } = completion.choices[0].message;
5137
- (0, import_node_assert3.default)(content, "empty content");
5138
- return content;
5139
- }
5140
- async function callToGetJSONObject(messages, AIActionTypeValue) {
5141
- let responseFormat = {
5142
- type: "json_object" /* JSON */
5143
- };
5144
- const model = getModelName();
5145
- if (model === "gpt-4o-2024-08-06") {
5146
- switch (AIActionTypeValue) {
5147
- case 0 /* ASSERT */:
5148
- responseFormat = assertSchema;
5149
- break;
5150
- case 1 /* INSPECT_ELEMENT */:
5151
- responseFormat = findElementSchema;
5152
- break;
5153
- case 2 /* EXTRACT_DATA */:
5154
- break;
5155
- case 3 /* PLAN */:
5156
- responseFormat = planSchema;
5157
- break;
5158
- }
5159
- }
5160
- if (model.startsWith("gemini")) {
5161
- responseFormat = { type: "text" /* TEXT */ };
5162
- }
5163
- const response = await call(messages, responseFormat);
5164
- (0, import_node_assert3.default)(response, "empty response");
5165
- const jsonContent = extractJSONFromCodeBlock(response);
5166
- try {
5167
- return JSON.parse(jsonContent);
5168
- } catch (e) {
5169
- throw Error(`parse json error: ${jsonContent}`);
5170
- }
5171
- }
5172
- function extractJSONFromCodeBlock(response) {
5173
- const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
5174
- if (jsonMatch) {
5175
- return jsonMatch[1];
5176
- }
5177
- const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
5178
- if (codeBlockMatch) {
5179
- return codeBlockMatch[1];
5180
- }
5181
- const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
5182
- if (jsonLikeMatch) {
5183
- return jsonLikeMatch[0];
5213
+ description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
5214
+ }
5215
+ },
5216
+ required: ["queryLanguage", "actions", "error"],
5217
+ additionalProperties: false
5218
+ }
5184
5219
  }
5185
- return response;
5186
- }
5220
+ };
5187
5221
 
5188
- // src/utils.ts
5222
+ // src/ai-model/prompt/util.ts
5189
5223
  var import_node_assert4 = __toESM(require("assert"));
5190
- var import_node_child_process = require("child_process");
5191
- var import_node_fs = require("fs");
5192
- var import_node_os = require("os");
5193
- var import_node_path = require("path");
5194
- var import_fs = require("@midscene/shared/fs");
5195
- var import_utils2 = require("@midscene/shared/utils");
5196
- var logDir = (0, import_node_path.join)(process.cwd(), "./midscene_run/");
5197
- var logEnvReady = false;
5198
- var insightDumpFileExt = "insight-dump.json";
5199
- function getLogDir() {
5200
- return logDir;
5224
+
5225
+ // src/image/index.ts
5226
+ var import_img = require("@midscene/shared/img");
5227
+
5228
+ // src/ai-model/prompt/util.ts
5229
+ var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
5230
+ var contextFormatIntro = `
5231
+ The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
5232
+ var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
5233
+ var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
5234
+ var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
5235
+ function systemPromptToExtract() {
5236
+ return `
5237
+ You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
5238
+ The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
5239
+
5240
+ You have the following skills:
5241
+
5242
+ skill name: extract_data_from_UI
5243
+ related input: DATA_DEMAND
5244
+ skill content:
5245
+ * User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
5246
+ * There may be some special commands in DATA_DEMAND, please pay extra attention
5247
+ - LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
5248
+ - LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
5249
+
5250
+ Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
5251
+
5252
+ Return in the following JSON format:
5253
+ {
5254
+ language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
5255
+ data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
5256
+ errors: [], // string[], error message if any
5201
5257
  }
5202
- function setLogDir(dir) {
5203
- logDir = dir;
5258
+ `;
5204
5259
  }
5205
- function getLogDirByType(type) {
5206
- const dir = (0, import_node_path.join)(getLogDir(), type);
5207
- if (!(0, import_node_fs.existsSync)(dir)) {
5208
- (0, import_node_fs.mkdirSync)(dir, { recursive: true });
5209
- }
5210
- return dir;
5260
+ function systemPromptToAssert() {
5261
+ return `
5262
+ ${characteristic}
5263
+ ${contextFormatIntro}
5264
+
5265
+ Based on the information you get, Return assertion judgment:
5266
+
5267
+ Return in the following JSON format:
5268
+ {
5269
+ thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
5270
+ pass: true, // true or false, whether the assertion is passed
5211
5271
  }
5212
- var reportTpl = null;
5213
- function getReportTpl() {
5214
- if (import_utils2.ifInBrowser) {
5215
- if (!reportTpl && window.midscene_report_tpl) {
5216
- reportTpl = window.midscene_report_tpl;
5217
- }
5218
- (0, import_node_assert4.default)(
5219
- reportTpl,
5220
- "reportTpl should be set before writing report in browser"
5221
- );
5222
- return reportTpl;
5223
- }
5224
- if (!reportTpl) {
5225
- let reportPath = (0, import_node_path.join)(__dirname, "../../report/index.html");
5226
- if (!(0, import_node_fs.existsSync)(reportPath)) {
5227
- reportPath = (0, import_node_path.join)(__dirname, "../report/index.html");
5272
+ `;
5273
+ }
5274
+ var assertSchema = {
5275
+ type: "json_schema",
5276
+ json_schema: {
5277
+ name: "assert",
5278
+ strict: true,
5279
+ schema: {
5280
+ type: "object",
5281
+ properties: {
5282
+ thought: {
5283
+ type: "string",
5284
+ description: "The thought process behind the assertion"
5285
+ },
5286
+ pass: {
5287
+ type: "boolean",
5288
+ description: "Whether the assertion passed or failed"
5289
+ }
5290
+ },
5291
+ required: ["thought", "pass"],
5292
+ additionalProperties: false
5228
5293
  }
5229
- reportTpl = (0, import_node_fs.readFileSync)(reportPath, "utf-8");
5230
5294
  }
5231
- return reportTpl;
5295
+ };
5296
+ function describeSize(size) {
5297
+ return `${size.width} x ${size.height}`;
5232
5298
  }
5233
- function reportHTMLContent(dumpData) {
5234
- const tpl = getReportTpl();
5235
- let reportContent;
5236
- if (Array.isArray(dumpData) && dumpData.length === 0 || typeof dumpData === "undefined") {
5237
- reportContent = tpl.replace(
5238
- /\s+{{dump}}\s+/,
5239
- `<script type="midscene_web_dump" type="application/json"></script>`
5240
- );
5241
- } else if (typeof dumpData === "string") {
5242
- reportContent = tpl.replace(
5243
- /\s+{{dump}}\s+/,
5244
- `<script type="midscene_web_dump" type="application/json">${dumpData}</script>`
5245
- );
5246
- } else {
5247
- const dumps = dumpData.map(({ dumpString, attributes }) => {
5248
- const attributesArr = Object.keys(attributes || {}).map((key) => {
5249
- return `${key}="${encodeURIComponent(attributes[key])}"`;
5250
- });
5251
- return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(
5252
- " "
5253
- )}
5254
- >${dumpString}
5255
- </script>`;
5256
- });
5257
- reportContent = tpl.replace(/\s+{{dump}}\s+/, dumps.join("\n"));
5299
+ function truncateText(text, maxLength = 20) {
5300
+ if (text && text.length > maxLength) {
5301
+ return `${text.slice(0, maxLength)}...`;
5258
5302
  }
5259
- return reportContent;
5303
+ return text;
5260
5304
  }
5261
- function writeDumpReport(fileName, dumpData) {
5262
- if (import_utils2.ifInBrowser) {
5263
- console.log("will not write report in browser");
5264
- return null;
5265
- }
5266
- const midscenePkgInfo = (0, import_fs.getRunningPkgInfo)(__dirname);
5267
- if (!midscenePkgInfo) {
5268
- console.warn("midscenePkgInfo not found, will not write report");
5269
- return null;
5270
- }
5271
- const reportPath = (0, import_node_path.join)(getLogDirByType("report"), `${fileName}.html`);
5272
- const reportContent = reportHTMLContent(dumpData);
5273
- (0, import_node_fs.writeFileSync)(reportPath, reportContent);
5274
- return reportPath;
5305
+ function elementByPosition(elementsInfo, position) {
5306
+ (0, import_node_assert4.default)(typeof position !== "undefined", "position is required for query");
5307
+ const item = elementsInfo.find((item2) => {
5308
+ return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
5309
+ });
5310
+ return item;
5275
5311
  }
5276
- function writeLogFile(opts) {
5277
- if (import_utils2.ifInBrowser) {
5278
- return "/mock/report.html";
5312
+ async function describeUserPage(context) {
5313
+ const { screenshotBase64 } = context;
5314
+ let width;
5315
+ let height;
5316
+ if (context.size) {
5317
+ ({ width, height } = context.size);
5318
+ } else {
5319
+ const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
5320
+ ({ width, height } = imgSize);
5279
5321
  }
5280
- const { fileName, fileExt, fileContent, type = "dump" } = opts;
5281
- const targetDir = getLogDirByType(type);
5282
- if (!logEnvReady) {
5283
- (0, import_node_assert4.default)(targetDir, "logDir should be set before writing dump file");
5284
- const gitIgnorePath = (0, import_node_path.join)(targetDir, "../../.gitignore");
5285
- let gitIgnoreContent = "";
5286
- if ((0, import_node_fs.existsSync)(gitIgnorePath)) {
5287
- gitIgnoreContent = (0, import_node_fs.readFileSync)(gitIgnorePath, "utf-8");
5322
+ const elementsInfo = context.content;
5323
+ const idElementMap = {};
5324
+ elementsInfo.forEach((item) => {
5325
+ idElementMap[item.id] = item;
5326
+ return { ...item };
5327
+ });
5328
+ const elementInfosDescription = cropFieldInformation(elementsInfo);
5329
+ return {
5330
+ description: `
5331
+ {
5332
+ // The size of the page
5333
+ "pageSize": ${describeSize({ width, height })},
5334
+
5335
+ ${// if match by id, use the description of the element
5336
+ getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
5337
+ "content": ${JSON.stringify(elementInfosDescription)}
5338
+ `}
5339
+ }`,
5340
+ elementById(id) {
5341
+ (0, import_node_assert4.default)(typeof id !== "undefined", "id is required for query");
5342
+ const item = idElementMap[`${id}`];
5343
+ return item;
5344
+ },
5345
+ elementByPosition(position) {
5346
+ return elementByPosition(elementsInfo, position);
5288
5347
  }
5289
- const logDirName = (0, import_node_path.basename)(logDir);
5290
- if (!gitIgnoreContent.includes(`${logDirName}/`)) {
5291
- (0, import_node_fs.writeFileSync)(
5292
- gitIgnorePath,
5293
- `${gitIgnoreContent}
5294
- # Midscene.js dump files
5295
- ${logDirName}/report
5296
- ${logDirName}/dump
5297
- `,
5298
- "utf-8"
5348
+ };
5349
+ }
5350
+ function cropFieldInformation(elementsInfo) {
5351
+ const elementInfosDescription = elementsInfo.map(
5352
+ (item) => {
5353
+ const { id, attributes = {}, rect, content } = item;
5354
+ const tailorContent = truncateText(content);
5355
+ const tailorAttributes = Object.keys(attributes).reduce(
5356
+ (res, currentKey) => {
5357
+ const attributeVal = attributes[currentKey];
5358
+ if (currentKey === "style" || currentKey === "src")
5359
+ return res;
5360
+ if (currentKey === "nodeType") {
5361
+ res[currentKey] = attributeVal.replace(/\sNode$/, "");
5362
+ } else {
5363
+ res[currentKey] = truncateText(attributeVal);
5364
+ }
5365
+ return res;
5366
+ },
5367
+ {}
5299
5368
  );
5369
+ return {
5370
+ id,
5371
+ markerId: item.indexId,
5372
+ attributes: tailorAttributes,
5373
+ rect: {
5374
+ left: rect.left,
5375
+ top: rect.top,
5376
+ width: rect.width,
5377
+ height: rect.height
5378
+ // remove 'zoom' if it exists
5379
+ },
5380
+ content: tailorContent
5381
+ };
5300
5382
  }
5301
- logEnvReady = true;
5302
- }
5303
- const filePath = (0, import_node_path.join)(targetDir, `${fileName}.${fileExt}`);
5304
- const outputResourceDir = (0, import_node_path.dirname)(filePath);
5305
- if (!(0, import_node_fs.existsSync)(outputResourceDir)) {
5306
- (0, import_node_fs.mkdirSync)(outputResourceDir, { recursive: true });
5383
+ );
5384
+ return elementInfosDescription;
5385
+ }
5386
+ function retrieveElement(prompt, opt) {
5387
+ if (opt == null ? void 0 : opt.multi) {
5388
+ return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
5307
5389
  }
5308
- (0, import_node_fs.writeFileSync)(filePath, fileContent);
5309
- if (opts == null ? void 0 : opts.generateReport) {
5310
- return writeDumpReport(fileName, fileContent);
5390
+ return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
5391
+ }
5392
+ function ifElementTypeResponse(response) {
5393
+ if (typeof response !== "string") {
5394
+ return false;
5311
5395
  }
5312
- return filePath;
5396
+ return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
5313
5397
  }
5314
- function replacerForPageObject(key, value) {
5315
- var _a, _b;
5316
- if (value && ((_a = value.constructor) == null ? void 0 : _a.name) === "Page") {
5317
- return "[Page object]";
5398
+ function splitElementResponse(response) {
5399
+ const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
5400
+ if (response.startsWith(oneElementSplitter)) {
5401
+ const id = response.slice(oneElementSplitter.length);
5402
+ if (id.indexOf(",") >= 0) {
5403
+ console.warn(`unexpected comma in one element response: ${id}`);
5404
+ }
5405
+ return id ? id : null;
5318
5406
  }
5319
- if (value && ((_b = value.constructor) == null ? void 0 : _b.name) === "Browser") {
5320
- return "[Browser object]";
5407
+ const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
5408
+ if (response.startsWith(elementsSplitter)) {
5409
+ const idsString = response.slice(elementsSplitter.length);
5410
+ if (!idsString) {
5411
+ return [];
5412
+ }
5413
+ return idsString.split(",");
5321
5414
  }
5322
- return value;
5323
- }
5324
- function stringifyDumpData(data, indents) {
5325
- return JSON.stringify(data, replacerForPageObject, indents);
5415
+ return null;
5326
5416
  }
5327
- function getVersion() {
5328
- return "0.8.4";
5417
+ function retrieveSection(prompt) {
5418
+ return `${SECTION_MATCHER_FLAG}${prompt}`;
5329
5419
  }
5330
5420
 
5331
- // src/action/executor.ts
5332
- var Executor = class {
5333
- constructor(name, description, tasks) {
5334
- __publicField(this, "name");
5335
- __publicField(this, "description");
5336
- __publicField(this, "tasks");
5337
- // status of executor
5338
- __publicField(this, "status");
5339
- this.status = tasks && tasks.length > 0 ? "pending" : "init";
5340
- this.name = name;
5341
- this.description = description;
5342
- this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
5421
+ // src/ai-model/openai/index.ts
5422
+ function preferOpenAIModel(preferVendor) {
5423
+ if (preferVendor && preferVendor !== "openAI")
5424
+ return false;
5425
+ if (getAIConfig(OPENAI_API_KEY))
5426
+ return true;
5427
+ return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
5428
+ }
5429
+ var defaultModel = "gpt-4o-2024-08-06";
5430
+ function getModelName() {
5431
+ let modelName = defaultModel;
5432
+ const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
5433
+ if (nameInConfig) {
5434
+ modelName = nameInConfig;
5343
5435
  }
5344
- markTaskAsPending(task) {
5345
- return {
5346
- status: "pending",
5347
- ...task
5348
- };
5436
+ return modelName;
5437
+ }
5438
+ async function createOpenAI() {
5439
+ let openai;
5440
+ const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
5441
+ const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
5442
+ if (getAIConfig(OPENAI_USE_AZURE)) {
5443
+ openai = new import_openai2.AzureOpenAI({
5444
+ baseURL: getAIConfig(OPENAI_BASE_URL),
5445
+ apiKey: getAIConfig(OPENAI_API_KEY),
5446
+ ...extraConfig,
5447
+ dangerouslyAllowBrowser: true
5448
+ });
5449
+ } else {
5450
+ openai = new import_openai2.default({
5451
+ baseURL: getAIConfig(OPENAI_BASE_URL),
5452
+ apiKey: getAIConfig(OPENAI_API_KEY),
5453
+ ...extraConfig,
5454
+ dangerouslyAllowBrowser: true
5455
+ });
5349
5456
  }
5350
- async append(task) {
5351
- (0, import_node_assert5.default)(
5352
- this.status !== "error",
5353
- "executor is in error state, cannot append task"
5354
- );
5355
- if (Array.isArray(task)) {
5356
- this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
5357
- } else {
5358
- this.tasks.push(this.markTaskAsPending(task));
5359
- }
5360
- if (this.status !== "running") {
5361
- this.status = "pending";
5457
+ if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
5458
+ if (import_utils3.ifInBrowser) {
5459
+ throw new Error("langsmith is not supported in browser");
5362
5460
  }
5461
+ console.log("DEBUGGING MODE: langsmith wrapper enabled");
5462
+ const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
5463
+ openai = wrapOpenAI2(openai);
5363
5464
  }
5364
- async flush() {
5365
- if (this.status === "init" && this.tasks.length > 0) {
5366
- console.warn(
5367
- "illegal state for executor, status is init but tasks are not empty"
5368
- );
5369
- }
5370
- (0, import_node_assert5.default)(this.status !== "running", "executor is already running");
5371
- (0, import_node_assert5.default)(this.status !== "completed", "executor is already completed");
5372
- (0, import_node_assert5.default)(this.status !== "error", "executor is in error state");
5373
- const nextPendingIndex = this.tasks.findIndex(
5374
- (task) => task.status === "pending"
5375
- );
5376
- if (nextPendingIndex < 0) {
5377
- return;
5378
- }
5379
- this.status = "running";
5380
- let taskIndex = nextPendingIndex;
5381
- let successfullyCompleted = true;
5382
- let previousFindOutput;
5383
- while (taskIndex < this.tasks.length) {
5384
- const task = this.tasks[taskIndex];
5385
- (0, import_node_assert5.default)(
5386
- task.status === "pending",
5387
- `task status should be pending, but got: ${task.status}`
5388
- );
5389
- task.timing = {
5390
- start: Date.now()
5391
- };
5392
- try {
5393
- task.status = "running";
5394
- (0, import_node_assert5.default)(
5395
- ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
5396
- `unsupported task type: ${task.type}`
5397
- );
5398
- const { executor, param } = task;
5399
- (0, import_node_assert5.default)(executor, `executor is required for task type: ${task.type}`);
5400
- let returnValue;
5401
- const executorContext = {
5402
- task,
5403
- element: previousFindOutput == null ? void 0 : previousFindOutput.element
5404
- };
5405
- if (task.type === "Insight") {
5406
- (0, import_node_assert5.default)(
5407
- task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
5408
- `unsupported insight subType: ${task.subType}`
5409
- );
5410
- returnValue = await task.executor(param, executorContext);
5411
- if (task.subType === "Locate") {
5412
- previousFindOutput = returnValue == null ? void 0 : returnValue.output;
5413
- }
5414
- } else if (task.type === "Action" || task.type === "Planning") {
5415
- returnValue = await task.executor(param, executorContext);
5416
- } else {
5417
- console.warn(
5418
- `unsupported task type: ${task.type}, will try to execute it directly`
5419
- );
5420
- returnValue = await task.executor(param, executorContext);
5421
- }
5422
- Object.assign(task, returnValue);
5423
- task.status = "finished";
5424
- task.timing.end = Date.now();
5425
- task.timing.cost = task.timing.end - task.timing.start;
5426
- taskIndex++;
5427
- } catch (e) {
5428
- successfullyCompleted = false;
5429
- task.error = (e == null ? void 0 : e.message) || "error-without-message";
5430
- task.errorStack = e.stack;
5431
- task.status = "failed";
5432
- task.timing.end = Date.now();
5433
- task.timing.cost = task.timing.end - task.timing.start;
5465
+ return openai;
5466
+ }
5467
+ async function call(messages, responseFormat) {
5468
+ const openai = await createOpenAI();
5469
+ const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
5470
+ if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
5471
+ console.log(allAIConfig());
5472
+ }
5473
+ const startTime = Date.now();
5474
+ const model = getModelName();
5475
+ const completion = await openai.chat.completions.create({
5476
+ model,
5477
+ messages,
5478
+ response_format: responseFormat,
5479
+ temperature: 0.1,
5480
+ stream: false
5481
+ // betas: ['computer-use-2024-10-22'],
5482
+ });
5483
+ shouldPrintTiming && console.log(
5484
+ "Midscene - AI call",
5485
+ model,
5486
+ completion.usage,
5487
+ `${Date.now() - startTime}ms`
5488
+ );
5489
+ const { content } = completion.choices[0].message;
5490
+ (0, import_node_assert5.default)(content, "empty content");
5491
+ return content;
5492
+ }
5493
+ async function callToGetJSONObject(messages, AIActionTypeValue) {
5494
+ let responseFormat = {
5495
+ type: "json_object" /* JSON */
5496
+ };
5497
+ const model = getModelName();
5498
+ if (model === "gpt-4o-2024-08-06") {
5499
+ switch (AIActionTypeValue) {
5500
+ case 0 /* ASSERT */:
5501
+ responseFormat = assertSchema;
5502
+ break;
5503
+ case 1 /* INSPECT_ELEMENT */:
5504
+ responseFormat = findElementSchema;
5505
+ break;
5506
+ case 2 /* EXTRACT_DATA */:
5507
+ break;
5508
+ case 3 /* PLAN */:
5509
+ responseFormat = planSchema;
5434
5510
  break;
5435
- }
5436
- }
5437
- for (let i = taskIndex + 1; i < this.tasks.length; i++) {
5438
- this.tasks[i].status = "cancelled";
5439
- }
5440
- if (successfullyCompleted) {
5441
- this.status = "completed";
5442
- } else {
5443
- this.status = "error";
5444
- }
5445
- if (this.tasks.length) {
5446
- const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
5447
- return this.tasks[outputIndex].output;
5448
5511
  }
5449
5512
  }
5450
- isInErrorState() {
5451
- return this.status === "error";
5513
+ if (model.startsWith("gemini")) {
5514
+ responseFormat = { type: "text" /* TEXT */ };
5452
5515
  }
5453
- latestErrorTask() {
5454
- if (this.status !== "error") {
5455
- return null;
5456
- }
5457
- const errorTaskIndex = this.tasks.findIndex(
5458
- (task) => task.status === "failed"
5459
- );
5460
- if (errorTaskIndex >= 0) {
5461
- return this.tasks[errorTaskIndex];
5462
- }
5463
- return null;
5516
+ const response = await call(messages, responseFormat);
5517
+ (0, import_node_assert5.default)(response, "empty response");
5518
+ const jsonContent = extractJSONFromCodeBlock(response);
5519
+ try {
5520
+ return JSON.parse(jsonContent);
5521
+ } catch (e) {
5522
+ throw Error(`parse json error: ${jsonContent}`);
5464
5523
  }
5465
- dump() {
5466
- const dumpData = {
5467
- sdkVersion: getVersion(),
5468
- model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
5469
- logTime: Date.now(),
5470
- name: this.name,
5471
- description: this.description,
5472
- tasks: this.tasks
5473
- };
5474
- return dumpData;
5524
+ }
5525
+ function extractJSONFromCodeBlock(response) {
5526
+ const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
5527
+ if (jsonMatch) {
5528
+ return jsonMatch[1];
5475
5529
  }
5476
- };
5477
-
5478
- // src/insight/index.ts
5479
- var import_node_assert9 = __toESM(require("assert"));
5530
+ const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
5531
+ if (codeBlockMatch) {
5532
+ return codeBlockMatch[1];
5533
+ }
5534
+ const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
5535
+ if (jsonLikeMatch) {
5536
+ return jsonLikeMatch[0];
5537
+ }
5538
+ return response;
5539
+ }
5480
5540
 
5481
5541
  // src/ai-model/inspect.ts
5482
5542
  var import_node_assert6 = __toESM(require("assert"));
@@ -6094,6 +6154,7 @@ var src_default = Insight;
6094
6154
  allAIConfig,
6095
6155
  getAIConfig,
6096
6156
  getElement,
6157
+ getLogDirByType,
6097
6158
  getSection,
6098
6159
  getVersion,
6099
6160
  overrideAIConfig,