npm - @midscene/core - Versions diffs - 0.8.4 → 0.8.5-beta-20241126063126.0 - Mend

@midscene/core 0.8.4 → 0.8.5-beta-20241126063126.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12) hide show

package/dist/lib/ai-model.js +147 -92
package/dist/lib/env.js +102 -0
package/dist/lib/index.js +904 -843
package/dist/lib/types/ai-model.d.ts +7 -3
package/dist/lib/types/env.d.ts +48 -0
package/dist/lib/types/{index-690c2a06.d.ts → index-41db6188.d.ts} +3 -36
package/dist/lib/types/index.d.ts +6 -5
package/dist/lib/types/{types-29994b1b.d.ts → types-0d8eeece.d.ts} +3 -1
package/dist/lib/types/utils.d.ts +3 -3
package/dist/lib/utils.js +15 -30
package/package.json +6 -2
package/report/index.html +2 -2

package/dist/lib/index.js CHANGED Viewed

@@ -4292,6 +4292,7 @@ __export(src_exports, {
   default: () => src_default,
   getAIConfig: () => getAIConfig,
   getElement: () => getElement,
+  getLogDirByType: () => getLogDirByType,
   getSection: () => getSection,
   getVersion: () => getVersion,
   overrideAIConfig: () => overrideAIConfig,
@@ -4302,220 +4303,562 @@ __export(src_exports, {
 module.exports = __toCommonJS(src_exports);
 // src/action/executor.ts
-var import_node_assert5 = __toESM(require("assert"));
-// src/ai-model/openai/index.ts
-var import_node_assert3 = __toESM(require("assert"));
+var import_node_assert2 = __toESM(require("assert"));
-// src/types.ts
-var BaseElement = class {
+// src/env.ts
+var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
+var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
+var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
+var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
+var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
+var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
+var OPENAI_API_KEY = "OPENAI_API_KEY";
+var OPENAI_BASE_URL = "OPENAI_BASE_URL";
+var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
+var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
+var MIDSCENE_CACHE = "MIDSCENE_CACHE";
+var MATCH_BY_POSITION = "MATCH_BY_POSITION";
+var MIDSCENE_REPORT_TAG_NAME = "MIDSCENE_REPORT_TAG_NAME";
+var allConfigFromEnv = () => {
+  return {
+    [MIDSCENE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_OPENAI_INIT_CONFIG_JSON] || void 0,
+    [MIDSCENE_MODEL_NAME]: process.env[MIDSCENE_MODEL_NAME] || void 0,
+    [MIDSCENE_DEBUG_MODE]: process.env[MIDSCENE_DEBUG_MODE] || void 0,
+    [MIDSCENE_LANGSMITH_DEBUG]: process.env[MIDSCENE_LANGSMITH_DEBUG] || void 0,
+    [MIDSCENE_DEBUG_AI_PROFILE]: process.env[MIDSCENE_DEBUG_AI_PROFILE] || void 0,
+    [MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG]: process.env[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG] || void 0,
+    [OPENAI_API_KEY]: process.env[OPENAI_API_KEY] || void 0,
+    [OPENAI_BASE_URL]: process.env[OPENAI_BASE_URL] || void 0,
+    [MIDSCENE_MODEL_TEXT_ONLY]: process.env[MIDSCENE_MODEL_TEXT_ONLY] || void 0,
+    [OPENAI_USE_AZURE]: process.env[OPENAI_USE_AZURE] || void 0,
+    [MIDSCENE_CACHE]: process.env[MIDSCENE_CACHE] || void 0,
+    [MATCH_BY_POSITION]: process.env[MATCH_BY_POSITION] || void 0,
+    [MIDSCENE_REPORT_TAG_NAME]: process.env[MIDSCENE_REPORT_TAG_NAME] || void 0
+  };
 };
-var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
-  AIResponseFormat2["JSON"] = "json_object";
-  AIResponseFormat2["TEXT"] = "text";
-  return AIResponseFormat2;
-})(AIResponseFormat || {});
-var UIContext = class {
+var userConfig = {};
+var getAIConfig = (configKey) => {
+  if (typeof userConfig[configKey] !== "undefined") {
+    return userConfig[configKey];
+  }
+  return allConfigFromEnv()[configKey];
+};
+var allAIConfig = () => {
+  return { ...allConfigFromEnv(), ...userConfig };
+};
+var overrideAIConfig = (newConfig, extendMode) => {
+  userConfig = extendMode ? { ...userConfig, ...newConfig } : { ...newConfig };
 };
-// src/ai-model/openai/index.ts
-var import_utils = require("@midscene/shared/utils");
-var import_openai5 = __toESM(require("openai"));
-// src/ai-model/coze/index.ts
+// src/utils.ts
 var import_node_assert = __toESM(require("assert"));
-var COZE_INSPECT_ELEMENT_BOT_ID = process.env.COZE_INSPECT_ELEMENT_BOT_ID || "";
-var COZE_AI_ACTION_BOT_ID = process.env.COZE_AI_ACTION_BOT_ID || "";
-var COZE_AI_ASSERT_BOT_ID = process.env.COZE_AI_ASSERT_BOT_ID || "";
-var COZE_EXTRACT_INFO_BOT_ID = process.env.COZE_EXTRACT_INFO_BOT_ID || "";
-var COZE_BOT_TOKEN = "COZE_BOT_TOKEN";
-function preferCozeModel(preferVendor) {
-  if (preferVendor && preferVendor !== "coze")
-    return false;
-  return process.env[COZE_BOT_TOKEN] && process.env.COZE_INSPECT_ELEMENT_BOT_ID && process.env.COZE_AI_ACTION_BOT_ID && process.env.COZE_AI_ASSERT_BOT_ID && process.env.COZE_EXTRACT_INFO_BOT_ID;
+var import_node_child_process = require("child_process");
+var import_node_fs = require("fs");
+var import_node_os = require("os");
+var import_node_path = require("path");
+var import_fs = require("@midscene/shared/fs");
+var import_utils = require("@midscene/shared/utils");
+var logDir = (0, import_node_path.join)(process.cwd(), "./midscene_run/");
+var logEnvReady = false;
+var insightDumpFileExt = "insight-dump.json";
+function getLogDir() {
+  return logDir;
 }
-async function callCozeAi(options) {
-  var _a, _b;
-  const { query, imgs, botId } = options;
-  const completion = await fetch("https://api.coze.com/open_api/v2/chat", {
-    method: "POST",
-    headers: {
-      Authorization: `Bearer ${process.env[COZE_BOT_TOKEN]}`,
-      "Content-Type": "application/json",
-      Accept: "*/*",
-      Host: "api.coze.com",
-      Connection: "keep-alive"
-    },
-    body: JSON.stringify({
-      conversation_id: "123",
-      bot_id: botId,
-      user: "29032201862555",
-      query,
-      meta_data: {
-        img: imgs.map((imgPath) => {
-          return {
-            url: imgPath
-          };
-        })
-      },
-      stream: false
-    })
-  });
-  if (!completion.ok) {
-    console.error("CozeAI reponse error", completion);
-    throw new Error("Network response was not ok");
-  }
-  const aiResponse = await completion.json();
-  if (aiResponse.code !== 0) {
-    console.error("CozeAI error response", aiResponse.msg);
-    throw new Error(`CozeAI error response ${aiResponse.msg}`);
+function setLogDir(dir) {
+  logDir = dir;
+}
+function getLogDirByType(type) {
+  const dir = (0, import_node_path.join)(getLogDir(), type);
+  if (!(0, import_node_fs.existsSync)(dir)) {
+    (0, import_node_fs.mkdirSync)(dir, { recursive: true });
   }
-  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
-    console.error("aiResponse", aiResponse);
-    throw new Error("aiResponse is undefined", aiResponse);
+  return dir;
+}
+var reportTpl = null;
+function getReportTpl() {
+  if (import_utils.ifInBrowser) {
+    if (!reportTpl && window.midscene_report_tpl) {
+      reportTpl = window.midscene_report_tpl;
+    }
+    (0, import_node_assert.default)(
+      reportTpl,
+      "reportTpl should be set before writing report in browser"
+    );
+    return reportTpl;
   }
-  const parseContent = (_b = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _b.content;
-  (0, import_node_assert.default)(parseContent, "empty content");
-  try {
-    return JSON.parse(parseContent);
-  } catch (err) {
-    console.error("can't parse coze content", aiResponse, err);
-    throw Error("can't parse coze content");
+  if (!reportTpl) {
+    let reportPath = (0, import_node_path.join)(__dirname, "../../report/index.html");
+    if (!(0, import_node_fs.existsSync)(reportPath)) {
+      reportPath = (0, import_node_path.join)(__dirname, "../report/index.html");
+    }
+    reportTpl = (0, import_node_fs.readFileSync)(reportPath, "utf-8");
   }
+  return reportTpl;
 }
-function transformOpenAiArgsToCoze(msg) {
-  if (msg.role !== "user")
-    throw Error(`can't transform ${msg} to coze args`);
-  if (typeof msg.content === "string") {
-    return {
-      query: msg.content,
-      imgs: []
-    };
+function reportHTMLContent(dumpData) {
+  const tpl = getReportTpl();
+  let reportContent;
+  if (Array.isArray(dumpData) && dumpData.length === 0 || typeof dumpData === "undefined") {
+    reportContent = tpl.replace(
+      /\s+{{dump}}\s+/,
+      `<script type="midscene_web_dump" type="application/json"></script>`
+    );
+  } else if (typeof dumpData === "string") {
+    reportContent = tpl.replace(
+      /\s+{{dump}}\s+/,
+      `<script type="midscene_web_dump" type="application/json">${dumpData}</script>`
+    );
+  } else {
+    const dumps = dumpData.map(({ dumpString, attributes }) => {
+      const attributesArr = Object.keys(attributes || {}).map((key) => {
+        return `${key}="${encodeURIComponent(attributes[key])}"`;
+      });
+      return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(
+        " "
+      )}
+>${dumpString}
+</script>`;
+    });
+    reportContent = tpl.replace(/\s+{{dump}}\s+/, dumps.join("\n"));
   }
-  return {
-    query: msg.content.reduce((res, next) => {
-      if (next.type === "text") {
-        res += `
-${next.text}`;
-      }
-      return res;
-    }, ""),
-    imgs: msg.content.reduce(
-      (res, next) => {
-        if (next.type === "image_url") {
-          res.push(next.image_url.url);
-        }
-        return res;
-      },
-      []
-    )
-  };
+  return reportContent;
 }
-// src/ai-model/common.ts
-async function callAiFn(options) {
-  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
-  if (preferOpenAIModel(useModel)) {
-    const parseResult = await callToGetJSONObject(msgs, AIActionTypeValue);
-    return parseResult;
+function writeDumpReport(fileName, dumpData) {
+  if (import_utils.ifInBrowser) {
+    console.log("will not write report in browser");
+    return null;
   }
-  if (preferCozeModel(useModel)) {
-    let botId = "";
-    switch (AIActionTypeValue) {
-      case 0 /* ASSERT */:
-        botId = COZE_AI_ASSERT_BOT_ID;
-        break;
-      case 2 /* EXTRACT_DATA */:
-        botId = COZE_EXTRACT_INFO_BOT_ID;
-        break;
-      case 1 /* INSPECT_ELEMENT */:
-        botId = COZE_INSPECT_ELEMENT_BOT_ID;
-        break;
-      default:
-        botId = COZE_AI_ACTION_BOT_ID;
-    }
-    const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
-    const parseResult = await callCozeAi({
-      ...cozeMsg,
-      botId
-    });
-    return parseResult;
+  const midscenePkgInfo = (0, import_fs.getRunningPkgInfo)(__dirname);
+  if (!midscenePkgInfo) {
+    console.warn("midscenePkgInfo not found, will not write report");
+    return null;
   }
-  throw Error(
-    "Cannot find Coze or OpenAI config. You should set at least one of them."
-  );
+  const reportPath = (0, import_node_path.join)(getLogDirByType("report"), `${fileName}.html`);
+  const reportContent = reportHTMLContent(dumpData);
+  (0, import_node_fs.writeFileSync)(reportPath, reportContent);
+  return reportPath;
 }
-function transformUserMessages(msgs) {
-  const textOnly = Boolean(getAIConfig(MIDSCENE_MODEL_TEXT_ONLY));
-  if (!textOnly)
-    return msgs;
-  return msgs.reduce((res, msg) => {
-    if (msg.type === "text") {
-      res += msg.text;
+function writeLogFile(opts) {
+  if (import_utils.ifInBrowser) {
+    return "/mock/report.html";
+  }
+  const { fileName, fileExt, fileContent, type = "dump" } = opts;
+  const targetDir = getLogDirByType(type);
+  if (!logEnvReady) {
+    (0, import_node_assert.default)(targetDir, "logDir should be set before writing dump file");
+    const gitIgnorePath = (0, import_node_path.join)(targetDir, "../../.gitignore");
+    let gitIgnoreContent = "";
+    if ((0, import_node_fs.existsSync)(gitIgnorePath)) {
+      gitIgnoreContent = (0, import_node_fs.readFileSync)(gitIgnorePath, "utf-8");
     }
-    return res;
-  }, "");
+    const logDirName = (0, import_node_path.basename)(logDir);
+    if (!gitIgnoreContent.includes(`${logDirName}/`)) {
+      (0, import_node_fs.writeFileSync)(
+        gitIgnorePath,
+        `${gitIgnoreContent}
+# Midscene.js dump files
+${logDirName}/report
+${logDirName}/dump
+${logDirName}/tmp
+`,
+        "utf-8"
+      );
+    }
+    logEnvReady = true;
+  }
+  const filePath = (0, import_node_path.join)(targetDir, `${fileName}.${fileExt}`);
+  const outputResourceDir = (0, import_node_path.dirname)(filePath);
+  if (!(0, import_node_fs.existsSync)(outputResourceDir)) {
+    (0, import_node_fs.mkdirSync)(outputResourceDir, { recursive: true });
+  }
+  (0, import_node_fs.writeFileSync)(filePath, fileContent);
+  if (opts == null ? void 0 : opts.generateReport) {
+    return writeDumpReport(fileName, fileContent);
+  }
+  return filePath;
 }
-// src/ai-model/prompt/element_inspector.ts
-function systemPromptToFindElement() {
-  if (getAIConfig(MATCH_BY_POSITION)) {
-    return systemPromptToFindElementPosition();
+function replacerForPageObject(key, value) {
+  var _a, _b;
+  if (value && ((_a = value.constructor) == null ? void 0 : _a.name) === "Page") {
+    return "[Page object]";
   }
-  return `
-## Role:
-You are an expert in software page image (2D) and page element text analysis.
-## Objective:
-- Identify elements in screenshots and text that match the user's description.
-- Return JSON data containing the selection reason and element ID.
+  if (value && ((_b = value.constructor) == null ? void 0 : _b.name) === "Browser") {
+    return "[Browser object]";
+  }
+  return value;
+}
+function stringifyDumpData(data, indents) {
+  return JSON.stringify(data, replacerForPageObject, indents);
+}
+function getVersion() {
+  return "0.8.5-beta-20241126063126.0";
+}
-## Skills:
-- Image analysis and recognition
-- Multilingual text understanding
-- Software UI design and testing
+// src/action/executor.ts
+var Executor = class {
+  constructor(name, description, tasks) {
+    __publicField(this, "name");
+    __publicField(this, "description");
+    __publicField(this, "tasks");
+    // status of executor
+    __publicField(this, "status");
+    this.status = tasks && tasks.length > 0 ? "pending" : "init";
+    this.name = name;
+    this.description = description;
+    this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
+  }
+  markTaskAsPending(task) {
+    return {
+      status: "pending",
+      ...task
+    };
+  }
+  async append(task) {
+    var _a, _b;
+    (0, import_node_assert2.default)(
+      this.status !== "error",
+      `executor is in error state, cannot append task
+error=${(_a = this.latestErrorTask()) == null ? void 0 : _a.error}
+${(_b = this.latestErrorTask()) == null ? void 0 : _b.errorStack}`
+    );
+    if (Array.isArray(task)) {
+      this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
+    } else {
+      this.tasks.push(this.markTaskAsPending(task));
+    }
+    if (this.status !== "running") {
+      this.status = "pending";
+    }
+  }
+  async flush() {
+    if (this.status === "init" && this.tasks.length > 0) {
+      console.warn(
+        "illegal state for executor, status is init but tasks are not empty"
+      );
+    }
+    (0, import_node_assert2.default)(this.status !== "running", "executor is already running");
+    (0, import_node_assert2.default)(this.status !== "completed", "executor is already completed");
+    (0, import_node_assert2.default)(this.status !== "error", "executor is in error state");
+    const nextPendingIndex = this.tasks.findIndex(
+      (task) => task.status === "pending"
+    );
+    if (nextPendingIndex < 0) {
+      return;
+    }
+    this.status = "running";
+    let taskIndex = nextPendingIndex;
+    let successfullyCompleted = true;
+    let previousFindOutput;
+    while (taskIndex < this.tasks.length) {
+      const task = this.tasks[taskIndex];
+      (0, import_node_assert2.default)(
+        task.status === "pending",
+        `task status should be pending, but got: ${task.status}`
+      );
+      task.timing = {
+        start: Date.now()
+      };
+      try {
+        task.status = "running";
+        (0, import_node_assert2.default)(
+          ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
+          `unsupported task type: ${task.type}`
+        );
+        const { executor, param } = task;
+        (0, import_node_assert2.default)(executor, `executor is required for task type: ${task.type}`);
+        let returnValue;
+        const executorContext = {
+          task,
+          element: previousFindOutput == null ? void 0 : previousFindOutput.element
+        };
+        if (task.type === "Insight") {
+          (0, import_node_assert2.default)(
+            task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
+            `unsupported insight subType: ${task.subType}`
+          );
+          returnValue = await task.executor(param, executorContext);
+          if (task.subType === "Locate") {
+            previousFindOutput = returnValue == null ? void 0 : returnValue.output;
+          }
+        } else if (task.type === "Action" || task.type === "Planning") {
+          returnValue = await task.executor(param, executorContext);
+        } else {
+          console.warn(
+            `unsupported task type: ${task.type}, will try to execute it directly`
+          );
+          returnValue = await task.executor(param, executorContext);
+        }
+        Object.assign(task, returnValue);
+        task.status = "finished";
+        task.timing.end = Date.now();
+        task.timing.cost = task.timing.end - task.timing.start;
+        taskIndex++;
+      } catch (e) {
+        successfullyCompleted = false;
+        task.error = (e == null ? void 0 : e.message) || "error-without-message";
+        task.errorStack = e.stack;
+        task.status = "failed";
+        task.timing.end = Date.now();
+        task.timing.cost = task.timing.end - task.timing.start;
+        break;
+      }
+    }
+    for (let i = taskIndex + 1; i < this.tasks.length; i++) {
+      this.tasks[i].status = "cancelled";
+    }
+    if (successfullyCompleted) {
+      this.status = "completed";
+    } else {
+      this.status = "error";
+    }
+    if (this.tasks.length) {
+      const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
+      return this.tasks[outputIndex].output;
+    }
+  }
+  isInErrorState() {
+    return this.status === "error";
+  }
+  latestErrorTask() {
+    if (this.status !== "error") {
+      return null;
+    }
+    const errorTaskIndex = this.tasks.findIndex(
+      (task) => task.status === "failed"
+    );
+    if (errorTaskIndex >= 0) {
+      return this.tasks[errorTaskIndex];
+    }
+    return null;
+  }
+  dump() {
+    const dumpData = {
+      sdkVersion: getVersion(),
+      model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
+      logTime: Date.now(),
+      name: this.name,
+      description: this.description,
+      tasks: this.tasks
+    };
+    return dumpData;
+  }
+};
-## Workflow:
-1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
-2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
-3. Found the required number of elements
-4. Return JSON data containing the selection reason and element ID.
+// src/insight/index.ts
+var import_node_assert9 = __toESM(require("assert"));
-## Constraints:
-- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
-- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
-- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
-- If no elements are found, the "elements" array should be empty.
-- The returned data must conform to the specified JSON format.
-- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
+// src/ai-model/openai/index.ts
+var import_node_assert5 = __toESM(require("assert"));
-## Output Format:
+// src/types.ts
+var BaseElement = class {
+};
+var AIResponseFormat = /* @__PURE__ */ ((AIResponseFormat2) => {
+  AIResponseFormat2["JSON"] = "json_object";
+  AIResponseFormat2["TEXT"] = "text";
+  return AIResponseFormat2;
+})(AIResponseFormat || {});
+var UIContext = class {
+};
-Please return the result in JSON format as follows:
+// src/ai-model/openai/index.ts
+var import_utils3 = require("@midscene/shared/utils");
+var import_openai2 = __toESM(require("openai"));
-\`\`\`json
-{
-  "elements": [
-    // If no matching elements are found, return an empty array []
-    {
-      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
-      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
-    }
-    // More elements...
-  ],
-  "errors": [] // Array of strings containing any error messages
+// src/ai-model/coze/index.ts
+var import_node_assert3 = __toESM(require("assert"));
+var COZE_INSPECT_ELEMENT_BOT_ID = process.env.COZE_INSPECT_ELEMENT_BOT_ID || "";
+var COZE_AI_ACTION_BOT_ID = process.env.COZE_AI_ACTION_BOT_ID || "";
+var COZE_AI_ASSERT_BOT_ID = process.env.COZE_AI_ASSERT_BOT_ID || "";
+var COZE_EXTRACT_INFO_BOT_ID = process.env.COZE_EXTRACT_INFO_BOT_ID || "";
+var COZE_BOT_TOKEN = "COZE_BOT_TOKEN";
+function preferCozeModel(preferVendor) {
+  if (preferVendor && preferVendor !== "coze")
+    return false;
+  return process.env[COZE_BOT_TOKEN] && process.env.COZE_INSPECT_ELEMENT_BOT_ID && process.env.COZE_AI_ACTION_BOT_ID && process.env.COZE_AI_ASSERT_BOT_ID && process.env.COZE_EXTRACT_INFO_BOT_ID;
 }
-\`\`\`
-## Example:
-Example 1:
-Input Example:
-\`\`\`json
-// Description: "Shopping cart icon in the upper right corner"
-{
-  "description": "PLACEHOLDER", // Description of the target element
-  "multi": "PLACEHOLDER", //Find the number of elements
+async function callCozeAi(options) {
+  var _a, _b;
+  const { query, imgs, botId } = options;
+  const completion = await fetch("https://api.coze.com/open_api/v2/chat", {
+    method: "POST",
+    headers: {
+      Authorization: `Bearer ${process.env[COZE_BOT_TOKEN]}`,
+      "Content-Type": "application/json",
+      Accept: "*/*",
+      Host: "api.coze.com",
+      Connection: "keep-alive"
+    },
+    body: JSON.stringify({
+      conversation_id: "123",
+      bot_id: botId,
+      user: "29032201862555",
+      query,
+      meta_data: {
+        img: imgs.map((imgPath) => {
+          return {
+            url: imgPath
+          };
+        })
+      },
+      stream: false
+    })
+  });
+  if (!completion.ok) {
+    console.error("CozeAI reponse error", completion);
+    throw new Error("Network response was not ok");
+  }
+  const aiResponse = await completion.json();
+  if (aiResponse.code !== 0) {
+    console.error("CozeAI error response", aiResponse.msg);
+    throw new Error(`CozeAI error response ${aiResponse.msg}`);
+  }
+  if (!(aiResponse == null ? void 0 : aiResponse.messages) || !((_a = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _a.content)) {
+    console.error("aiResponse", aiResponse);
+    throw new Error("aiResponse is undefined", aiResponse);
+  }
+  const parseContent = (_b = aiResponse == null ? void 0 : aiResponse.messages[0]) == null ? void 0 : _b.content;
+  (0, import_node_assert3.default)(parseContent, "empty content");
+  try {
+    return JSON.parse(parseContent);
+  } catch (err) {
+    console.error("can't parse coze content", aiResponse, err);
+    throw Error("can't parse coze content");
+  }
+}
+function transformOpenAiArgsToCoze(msg) {
+  if (msg.role !== "user")
+    throw Error(`can't transform ${msg} to coze args`);
+  if (typeof msg.content === "string") {
+    return {
+      query: msg.content,
+      imgs: []
+    };
+  }
+  return {
+    query: msg.content.reduce((res, next) => {
+      if (next.type === "text") {
+        res += `
+${next.text}`;
+      }
+      return res;
+    }, ""),
+    imgs: msg.content.reduce(
+      (res, next) => {
+        if (next.type === "image_url") {
+          res.push(next.image_url.url);
+        }
+        return res;
+      },
+      []
+    )
+  };
+}
+// src/ai-model/common.ts
+async function callAiFn(options) {
+  const { useModel, msgs, AIActionType: AIActionTypeValue } = options;
+  if (preferOpenAIModel(useModel)) {
+    const parseResult = await callToGetJSONObject(msgs, AIActionTypeValue);
+    return parseResult;
+  }
+  if (preferCozeModel(useModel)) {
+    let botId = "";
+    switch (AIActionTypeValue) {
+      case 0 /* ASSERT */:
+        botId = COZE_AI_ASSERT_BOT_ID;
+        break;
+      case 2 /* EXTRACT_DATA */:
+        botId = COZE_EXTRACT_INFO_BOT_ID;
+        break;
+      case 1 /* INSPECT_ELEMENT */:
+        botId = COZE_INSPECT_ELEMENT_BOT_ID;
+        break;
+      default:
+        botId = COZE_AI_ACTION_BOT_ID;
+    }
+    const cozeMsg = transformOpenAiArgsToCoze(msgs[1]);
+    const parseResult = await callCozeAi({
+      ...cozeMsg,
+      botId
+    });
+    return parseResult;
+  }
+  throw Error(
+    "Cannot find Coze or OpenAI config. You should set at least one of them."
+  );
+}
+function transformUserMessages(msgs) {
+  const textOnly = Boolean(getAIConfig(MIDSCENE_MODEL_TEXT_ONLY));
+  if (!textOnly)
+    return msgs;
+  return msgs.reduce((res, msg) => {
+    if (msg.type === "text") {
+      res += msg.text;
+    }
+    return res;
+  }, "");
+}
+// src/ai-model/prompt/element_inspector.ts
+function systemPromptToFindElement() {
+  if (getAIConfig(MATCH_BY_POSITION)) {
+    return systemPromptToFindElementPosition();
+  }
+  return `
+## Role:
+You are an expert in software page image (2D) and page element text analysis.
+## Objective:
+- Identify elements in screenshots and text that match the user's description.
+- Return JSON data containing the selection reason and element ID.
+## Skills:
+- Image analysis and recognition
+- Multilingual text understanding
+- Software UI design and testing
+## Workflow:
+1. Receive the user's element description, screenshot, and element description information. Note that the text may contain non-English characters (e.g., Chinese), indicating that the application may be non-English.
+2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
+3. Found the required number of elements
+4. Return JSON data containing the selection reason and element ID.
+## Constraints:
+- Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
+- Elements in the image with NodeType other than "TEXT Node" have been highlighted to identify the element among multiple non-text elements.
+- Accurately identify element information based on the user's description and return the corresponding element ID from the element description information, not extracted from the image.
+- If no elements are found, the "elements" array should be empty.
+- The returned data must conform to the specified JSON format.
+- The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
+## Output Format:
+Please return the result in JSON format as follows:
+\`\`\`json
+{
+  "elements": [
+    // If no matching elements are found, return an empty array []
+    {
+      "reason": "PLACEHOLDER", // The thought process for finding the element, replace PLACEHOLDER with your thought process
+      "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+      "id": "PLACEHOLDER" // Replace PLACEHOLDER with the ID (important: **use id not indexId, id is hash content**) of elementInfo
+    }
+    // More elements...
+  ],
+  "errors": [] // Array of strings containing any error messages
+}
+\`\`\`
+## Example:
+Example 1:
+Input Example:
+\`\`\`json
+// Description: "Shopping cart icon in the upper right corner"
+{
+  "description": "PLACEHOLDER", // Description of the target element
+  "multi": "PLACEHOLDER", //Find the number of elements
   "screenshot": "path/screenshot.png",
   "text": '{
       "pageSize": {
@@ -4524,7 +4867,7 @@ Input Example:
       },
       "elementInfos": [
         {
-          "id": "we23xsfwe", // ID of the element
+          "id": "1231", // ID of the element
           "indexId": "0", // Index of the element，The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4540,7 +4883,7 @@ Input Example:
           }
         },
         {
-          "id": "wefew2222few2", // ID of the element
+          "id": "66551", // ID of the element
           "indexId": "1", // Index of the element,The image is labeled to the left of the element
           "attributes": { // Attributes of the element
             "nodeType": "IMG Node", // Type of element, types include: TEXT Node, IMG Node, BUTTON Node, INPUT Node
@@ -4557,7 +4900,7 @@ Input Example:
         },
         ...
         {
-          "id": "kwekfj2323",
+          "id": "12344",
           "indexId": "2", // Index of the element，The image is labeled to the left of the element
           "attributes": {
             "nodeType": "TEXT Node",
@@ -4590,7 +4933,7 @@ Output Example:
       "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
       "text": "",
       // ID(**use id not indexId**) of this element, replace with actual value in practice, **use id not indexId**
-      "id": "wefew2222few2"
+      "id": "1231"
     }
   ],
   "errors": []
@@ -4677,6 +5020,19 @@ var findElementSchema = {
 };
 // src/ai-model/prompt/planning.ts
+var quickAnswerFormat = () => {
+  const matchByPosition = getAIConfig(MATCH_BY_POSITION);
+  const description = `
+  ${matchByPosition ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": string // Represents the ID of the element; replace with actual values in practice'}
+  `;
+  const format = matchByPosition ? '"position": { x: number; y: number }' : '"id": string';
+  const sample = matchByPosition ? '{"position": { x: 100, y: 200 }}' : '{"id": "14562"}';
+  return {
+    description,
+    format,
+    sample
+  };
+};
 function systemPromptToTaskPlanning() {
   return `
 ## Role:
@@ -4700,32 +5056,24 @@ Each action has a type and corresponding param. To be detailed:
 * type: 'KeyboardPress',  press a key
   * param: { value: string },  the value to input or the key to press. Use （Enter, Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta, ControlOrMeta） to represent the key.
 * type: 'Scroll'
-  * param: { scrollType: 'scrollDownOneScreen', 'scrollUpOneScreen', 'scrollUntilBottom', 'scrollUntilTop' }
+  * param: { scrollType: 'scrollDownOneScreen' | 'scrollUpOneScreen' | 'scrollUntilBottom' | 'scrollUntilTop' }
 * type: 'Error'
   * param: { message: string }, the error message
 * type: 'Sleep'
   * param: { timeMs: number }, wait for timeMs milliseconds
-Here is an example of how to decompose a task.
-When a user says 'Input "Weather in Shanghai" into the search bar, wait 1 second, hit enter', by viewing the page screenshot and description, you may decompose this task into something like this:
-* Locate: 'The search bar'
-* Input: 'Weather in Shanghai'
-* Sleep: 1000
-* KeyboardPress: 'Enter'
 Remember:
 1. The actions you composed MUST be based on the page context information you get. Instead of making up actions that are not related to the page context.
-2. In most cases, you should Locate one element first, then do other actions on it. For example, alway Find one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+2. In most cases, you should Locate one element first, then do other actions on it. For example, Locate one element, then hover on it. But if you think it's necessary to do other actions first (like global scroll, global key press), you can do that.
+3. If the planned actions are sequential and some actions may appear only after the execution of previous actions, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user's description.
-If the planned tasks are sequential and tasks may appear only after the execution of previous tasks, this is considered normal. Thoughts, prompts, and error messages should all be in the same language as the user query.
+## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field as a sibling of the \`param\` field
-## Objective 2 (sub objective, only for action with type "Locate"): Give a quick answer to the action with type "Locate" you just planned, append a \`quickAnswer\` field after the \`param\` field
-If the action type is 'Locate', provide a quick answer: Does any element meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
+If the action type is 'Locate', think about this: does any element on screen meet the description in the prompt? If so, answer with the following format, as the \`quickAnswer\` field in the output JSON:
 {
-  "reason": "Reason for finding element 4: It is located in the upper right corner, is an image type, and according to the screenshot, it is a shopping cart icon button",
+  "reason": "It is located (somewhere), is an (node type). According to the screenshot, it is a shopping cart icon button (or it's text is 'Shopping Cart')",
   "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-  ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+  ${quickAnswerFormat().description}
 }
 If there is no element meets the description in the prompt (usually because it will show up later after some interaction), the \`quickAnswer\` field should be null.
@@ -4738,33 +5086,71 @@ Please return the result in JSON format as follows:
   actions: [ // always return in Array
     {
       "thought": "find out the search bar",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
+      "type": "Locate", // type of action according to Object 1, like 'Tap' 'Hover' ...
+      "param": { //
         "prompt": "The search bar"
       },
-      "quickAnswer": { // since this action type is 'Locate', and we can find the element, so we need to give a quick answer
-        "reason": "Reason for finding element 4: It is located in the upper right corner, is an input type, and according to the screenshot, it is a search bar",
-        "text": "PLACEHOLDER", // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
-        ${getAIConfig(MATCH_BY_POSITION) ? `"position": { x: number; y: number } // Represents the position of the element; replace with actual values in practice (ensure it reflects the element's position)` : '"id": "wefew2222few2" // Represents the ID of the element; replace with actual values in practice'}
+      "quickAnswer": {
+        "reason": "This is ...",
+        "text": string, // Replace PLACEHOLDER with the text of elementInfo, if none, leave empty
+        ${quickAnswerFormat().format}
       } | null,
     },
     {
       "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Tap", // Type of action, like 'Tap' 'Hover' ...
-      "param": any, // Parameter towards the task type
+      "type": "Tap",
+      "param": null,
     },
+    // ... more actions
+  ],
+  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
+}
+## Here is an example of how to decompose a task
+When a user says 'Click the language switch button, wait 1s, click "English"', by viewing the page screenshot and description, you should consider this:
+* The main steps are: Find the switch button, tap it, sleep, find the 'English' element, and tap on it.
+* Think and look in detail and fill all the fields in the JSON format.
+\`\`\`json
+{
+  queryLanguage: 'English',
+  actions:[
     {
-      "thought": "Reasons for generating this task, and why this task is feasible on this page",
-      "type": "Locate", // Type of action, like 'Tap' 'Hover' ...
-      "param": {
-        "prompt": "The search bar"
+      thought: "Locate the language switch button with the text '中文'.",
+      type: 'Locate',
+      param: { prompt: "The language switch button with the text '中文'" },
+      quickAnswer: { // according to Objective 2,  this action type is 'Locate', and we can find the element, so we need to give a quick answer
+        reason: "It is located  near the top center, is an text node. According to the screenshot, it is a language switch button with the text '中文'.",
+        text: '中文',
+        ${quickAnswerFormat().sample}
       },
-      "quickAnswer": null,
     },
-    // ... more actions
+    {
+      thought: 'Click the language switch button to open the language options.',
+      type: 'Tap',
+      param: null,
+    },
+    {
+      thought: 'Wait for 1 second to ensure the language options are displayed.',
+      type: 'Sleep',
+      param: { timeMs: 1000 },
+    },
+    {
+      thought: "Locate the 'English' option in the language menu.",
+      type: 'Locate',
+      param: { prompt: "The 'English' option in the language menu" },
+      quickAnswer: null, // we cannot find this item in the description (it will show only after the previous interactions), so the quick answer is null here
+    },
+    {
+      thought: "Click the 'English' option to switch the language.",
+      type: 'Tap',
+      param: null,
+    }
   ],
-  error?: string, // Overall error messages. If there is any error occurs during the task planning (i.e. error in previous 'actions' array), conclude the errors again, put error messages here,
 }
+\`\`\`
 `;
 }
 var planSchema = {
@@ -4802,7 +5188,7 @@ var planSchema = {
                 properties: {
                   reason: {
                     type: "string",
-                    description: "Reason for finding element 4"
+                    description: "Reason for finding this element"
                   },
                   text: {
                     type: "string",
@@ -4824,659 +5210,333 @@ var planSchema = {
         },
         error: {
           type: ["string", "null"],
-          description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
-        }
-      },
-      required: ["queryLanguage", "actions", "error"],
-      additionalProperties: false
-    }
-  }
-};
-// src/ai-model/prompt/util.ts
-var import_node_assert2 = __toESM(require("assert"));
-// src/image/index.ts
-var import_img = require("@midscene/shared/img");
-// src/ai-model/prompt/util.ts
-var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
-var contextFormatIntro = `
-The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
-var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
-var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
-var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
-function systemPromptToExtract() {
-  return `
-You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
-The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
-You have the following skills:
-skill name: extract_data_from_UI
-related input: DATA_DEMAND
-skill content:
-* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
-* There may be some special commands in DATA_DEMAND, please pay extra attention
-  - LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
-  - LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
-Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
-Return in the following JSON format:
-{
-  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
-  data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
-  errors: [], // string[], error message if any
-}
-`;
-}
-function systemPromptToAssert() {
-  return `
-${characteristic}
-${contextFormatIntro}
-Based on the information you get, Return assertion judgment:
-Return in the following JSON format:
-{
-  thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
-  pass: true, // true or false, whether the assertion is passed
-}
-`;
-}
-var assertSchema = {
-  type: "json_schema",
-  json_schema: {
-    name: "assert",
-    strict: true,
-    schema: {
-      type: "object",
-      properties: {
-        thought: {
-          type: "string",
-          description: "The thought process behind the assertion"
-        },
-        pass: {
-          type: "boolean",
-          description: "Whether the assertion passed or failed"
-        }
-      },
-      required: ["thought", "pass"],
-      additionalProperties: false
-    }
-  }
-};
-function describeSize(size) {
-  return `${size.width} x ${size.height}`;
-}
-function truncateText(text) {
-  const maxLength = 50;
-  if (text && text.length > maxLength) {
-    return `${text.slice(0, maxLength)}...`;
-  }
-  return text;
-}
-function elementByPosition(elementsInfo, position) {
-  (0, import_node_assert2.default)(typeof position !== "undefined", "position is required for query");
-  const item = elementsInfo.find((item2) => {
-    return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
-  });
-  return item;
-}
-async function describeUserPage(context) {
-  const { screenshotBase64 } = context;
-  let width;
-  let height;
-  if (context.size) {
-    ({ width, height } = context.size);
-  } else {
-    const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
-    ({ width, height } = imgSize);
-  }
-  const elementsInfo = context.content;
-  const idElementMap = {};
-  elementsInfo.forEach((item) => {
-    idElementMap[item.id] = item;
-    return { ...item };
-  });
-  const elementInfosDescription = cropFieldInformation(elementsInfo);
-  return {
-    description: `
-    {
-      // The size of the page
-      "pageSize": ${describeSize({ width, height })},
-      ${// if match by id, use the description of the element
-    !getAIConfig(MATCH_BY_POSITION) ? `
-          // json description of the element
-          "content": ${JSON.stringify(elementInfosDescription)}
-          ` : ""}
-    }`,
-    elementById(id) {
-      (0, import_node_assert2.default)(typeof id !== "undefined", "id is required for query");
-      const item = idElementMap[`${id}`];
-      return item;
-    },
-    elementByPosition(position) {
-      return elementByPosition(elementsInfo, position);
-    }
-  };
-}
-function cropFieldInformation(elementsInfo) {
-  const elementInfosDescription = elementsInfo.map(
-    (item) => {
-      const { id, attributes = {}, rect, content } = item;
-      const tailorContent = truncateText(content);
-      const tailorAttributes = Object.keys(attributes).reduce(
-        (res, currentKey) => {
-          const attributeVal = attributes[currentKey];
-          res[currentKey] = truncateText(attributeVal);
-          return res;
-        },
-        {}
-      );
-      return {
-        id,
-        markerId: item.indexId,
-        attributes: tailorAttributes,
-        rect,
-        content: tailorContent
-      };
-    }
-  );
-  return JSON.stringify(elementInfosDescription);
-}
-function retrieveElement(prompt, opt) {
-  if (opt == null ? void 0 : opt.multi) {
-    return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
-  }
-  return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
-}
-function ifElementTypeResponse(response) {
-  if (typeof response !== "string") {
-    return false;
-  }
-  return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
-}
-function splitElementResponse(response) {
-  const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
-  if (response.startsWith(oneElementSplitter)) {
-    const id = response.slice(oneElementSplitter.length);
-    if (id.indexOf(",") >= 0) {
-      console.warn(`unexpected comma in one element response: ${id}`);
-    }
-    return id ? id : null;
-  }
-  const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
-  if (response.startsWith(elementsSplitter)) {
-    const idsString = response.slice(elementsSplitter.length);
-    if (!idsString) {
-      return [];
-    }
-    return idsString.split(",");
-  }
-  return null;
-}
-function retrieveSection(prompt) {
-  return `${SECTION_MATCHER_FLAG}${prompt}`;
-}
-// src/ai-model/openai/index.ts
-var MIDSCENE_OPENAI_INIT_CONFIG_JSON = "MIDSCENE_OPENAI_INIT_CONFIG_JSON";
-var MIDSCENE_MODEL_NAME = "MIDSCENE_MODEL_NAME";
-var MIDSCENE_LANGSMITH_DEBUG = "MIDSCENE_LANGSMITH_DEBUG";
-var MIDSCENE_DEBUG_AI_PROFILE = "MIDSCENE_DEBUG_AI_PROFILE";
-var MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG = "MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG";
-var MIDSCENE_DEBUG_MODE = "MIDSCENE_DEBUG_MODE";
-var OPENAI_API_KEY = "OPENAI_API_KEY";
-var OPENAI_BASE_URL = "OPENAI_BASE_URL";
-var MIDSCENE_MODEL_TEXT_ONLY = "MIDSCENE_MODEL_TEXT_ONLY";
-var OPENAI_USE_AZURE = "OPENAI_USE_AZURE";
-var MIDSCENE_CACHE = "MIDSCENE_CACHE";
-var MATCH_BY_POSITION = "MATCH_BY_POSITION";
-var allConfigFromEnv = () => {
-  return {
-    [MIDSCENE_OPENAI_INIT_CONFIG_JSON]: process.env[MIDSCENE_OPENAI_INIT_CONFIG_JSON] || void 0,
-    [MIDSCENE_MODEL_NAME]: process.env[MIDSCENE_MODEL_NAME] || void 0,
-    [MIDSCENE_DEBUG_MODE]: process.env[MIDSCENE_DEBUG_MODE] || void 0,
-    [MIDSCENE_LANGSMITH_DEBUG]: process.env[MIDSCENE_LANGSMITH_DEBUG] || void 0,
-    [MIDSCENE_DEBUG_AI_PROFILE]: process.env[MIDSCENE_DEBUG_AI_PROFILE] || void 0,
-    [MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG]: process.env[MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG] || void 0,
-    [OPENAI_API_KEY]: process.env[OPENAI_API_KEY] || void 0,
-    [OPENAI_BASE_URL]: process.env[OPENAI_BASE_URL] || void 0,
-    [MIDSCENE_MODEL_TEXT_ONLY]: process.env[MIDSCENE_MODEL_TEXT_ONLY] || void 0,
-    [OPENAI_USE_AZURE]: process.env[OPENAI_USE_AZURE] || void 0,
-    [MIDSCENE_CACHE]: process.env[MIDSCENE_CACHE] || void 0,
-    [MATCH_BY_POSITION]: process.env[MATCH_BY_POSITION] || void 0
-  };
-};
-var userConfig = {};
-var getAIConfig = (configKey) => {
-  if (typeof userConfig[configKey] !== "undefined") {
-    return userConfig[configKey];
-  }
-  return allConfigFromEnv()[configKey];
-};
-var allAIConfig = () => {
-  return { ...allConfigFromEnv(), ...userConfig };
-};
-var overrideAIConfig = (newConfig, extendMode) => {
-  userConfig = extendMode ? { ...userConfig, ...newConfig } : { ...newConfig };
-};
-function preferOpenAIModel(preferVendor) {
-  if (preferVendor && preferVendor !== "openAI")
-    return false;
-  if (getAIConfig(OPENAI_API_KEY))
-    return true;
-  return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
-}
-var defaultModel = "gpt-4o-2024-08-06";
-function getModelName() {
-  let modelName = defaultModel;
-  const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
-  if (nameInConfig) {
-    modelName = nameInConfig;
-  }
-  return modelName;
-}
-async function createOpenAI() {
-  let openai;
-  const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
-  const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
-  if (getAIConfig(OPENAI_USE_AZURE)) {
-    openai = new import_openai5.AzureOpenAI({
-      baseURL: getAIConfig(OPENAI_BASE_URL),
-      apiKey: getAIConfig(OPENAI_API_KEY),
-      ...extraConfig,
-      dangerouslyAllowBrowser: true
-    });
-  } else {
-    openai = new import_openai5.default({
-      baseURL: getAIConfig(OPENAI_BASE_URL),
-      apiKey: getAIConfig(OPENAI_API_KEY),
-      ...extraConfig,
-      dangerouslyAllowBrowser: true
-    });
-  }
-  if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
-    if (import_utils.ifInBrowser) {
-      throw new Error("langsmith is not supported in browser");
-    }
-    console.log("DEBUGGING MODE: langsmith wrapper enabled");
-    const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
-    openai = wrapOpenAI2(openai);
-  }
-  return openai;
-}
-async function call(messages, responseFormat) {
-  const openai = await createOpenAI();
-  const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
-  if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
-    console.log(allAIConfig());
-  }
-  const startTime = Date.now();
-  const model = getModelName();
-  const completion = await openai.chat.completions.create({
-    model,
-    messages,
-    response_format: responseFormat,
-    temperature: 0.1,
-    stream: false
-    // betas: ['computer-use-2024-10-22'],
-  });
-  shouldPrintTiming && console.log(
-    "Midscene - AI call",
-    model,
-    completion.usage,
-    `${Date.now() - startTime}ms`
-  );
-  const { content } = completion.choices[0].message;
-  (0, import_node_assert3.default)(content, "empty content");
-  return content;
-}
-async function callToGetJSONObject(messages, AIActionTypeValue) {
-  let responseFormat = {
-    type: "json_object" /* JSON */
-  };
-  const model = getModelName();
-  if (model === "gpt-4o-2024-08-06") {
-    switch (AIActionTypeValue) {
-      case 0 /* ASSERT */:
-        responseFormat = assertSchema;
-        break;
-      case 1 /* INSPECT_ELEMENT */:
-        responseFormat = findElementSchema;
-        break;
-      case 2 /* EXTRACT_DATA */:
-        break;
-      case 3 /* PLAN */:
-        responseFormat = planSchema;
-        break;
-    }
-  }
-  if (model.startsWith("gemini")) {
-    responseFormat = { type: "text" /* TEXT */ };
-  }
-  const response = await call(messages, responseFormat);
-  (0, import_node_assert3.default)(response, "empty response");
-  const jsonContent = extractJSONFromCodeBlock(response);
-  try {
-    return JSON.parse(jsonContent);
-  } catch (e) {
-    throw Error(`parse json error: ${jsonContent}`);
-  }
-}
-function extractJSONFromCodeBlock(response) {
-  const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
-  if (jsonMatch) {
-    return jsonMatch[1];
-  }
-  const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
-  if (codeBlockMatch) {
-    return codeBlockMatch[1];
-  }
-  const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
-  if (jsonLikeMatch) {
-    return jsonLikeMatch[0];
+          description: "Overall error messages. If there is any error occurs during the task planning, conclude the errors again and put error messages here"
+        }
+      },
+      required: ["queryLanguage", "actions", "error"],
+      additionalProperties: false
+    }
   }
-  return response;
-}
+};
-// src/utils.ts
+// src/ai-model/prompt/util.ts
 var import_node_assert4 = __toESM(require("assert"));
-var import_node_child_process = require("child_process");
-var import_node_fs = require("fs");
-var import_node_os = require("os");
-var import_node_path = require("path");
-var import_fs = require("@midscene/shared/fs");
-var import_utils2 = require("@midscene/shared/utils");
-var logDir = (0, import_node_path.join)(process.cwd(), "./midscene_run/");
-var logEnvReady = false;
-var insightDumpFileExt = "insight-dump.json";
-function getLogDir() {
-  return logDir;
+// src/image/index.ts
+var import_img = require("@midscene/shared/img");
+// src/ai-model/prompt/util.ts
+var characteristic = "You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.";
+var contextFormatIntro = `
+The user will give you a screenshot and the texts on it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.`;
+var ONE_ELEMENT_LOCATOR_PREFIX = "LOCATE_ONE_ELEMENT";
+var ELEMENTS_LOCATOR_PREFIX = "LOCATE_ONE_OR_MORE_ELEMENTS";
+var SECTION_MATCHER_FLAG = "SECTION_MATCHER_FLAG/";
+function systemPromptToExtract() {
+  return `
+You are a versatile professional in software UI design and testing. Your outstanding contributions will impact the user experience of billions of users.
+The user will give you a screenshot and the contents of it. There may be some none-English characters (like Chinese) on it, indicating it's an non-English app.
+You have the following skills:
+skill name: extract_data_from_UI
+related input: DATA_DEMAND
+skill content:
+* User will give you some data requirements in DATA_DEMAND. Consider the UI context, follow the user's instructions, and provide comprehensive data accordingly.
+* There may be some special commands in DATA_DEMAND, please pay extra attention
+  - LOCATE_ONE_ELEMENT and LOCATE_ONE_OR_MORE_ELEMENTS: if you see a description that mentions the keyword LOCATE_ONE_ELEMENT
+  - LOCATE_ONE_OR_MORE_ELEMENTS(e.g. follow LOCATE_ONE_ELEMENT : i want to find ...), it means user wants to locate a specific element meets the description.
+Return in this way: prefix + the id / comma-separated ids, for example: LOCATE_ONE_ELEMENT/1 , LOCATE_ONE_OR_MORE_ELEMENTS/1,2,3 . If not found, keep the prefix and leave the suffix empty, like LOCATE_ONE_ELEMENT/ .
+Return in the following JSON format:
+{
+  language: "en", // "en" or "zh", the language of the page. Use the same language to describe section name, description, and similar fields.
+  data: any, // the extracted data from extract_data_from_UI skill. Make sure both the value and scheme meet the DATA_DEMAND.
+  errors: [], // string[], error message if any
 }
-function setLogDir(dir) {
-  logDir = dir;
+`;
 }
-function getLogDirByType(type) {
-  const dir = (0, import_node_path.join)(getLogDir(), type);
-  if (!(0, import_node_fs.existsSync)(dir)) {
-    (0, import_node_fs.mkdirSync)(dir, { recursive: true });
-  }
-  return dir;
+function systemPromptToAssert() {
+  return `
+${characteristic}
+${contextFormatIntro}
+Based on the information you get, Return assertion judgment:
+Return in the following JSON format:
+{
+  thought: string, // string, the thought of the assertion. Should in the same language as the assertion.
+  pass: true, // true or false, whether the assertion is passed
 }
-var reportTpl = null;
-function getReportTpl() {
-  if (import_utils2.ifInBrowser) {
-    if (!reportTpl && window.midscene_report_tpl) {
-      reportTpl = window.midscene_report_tpl;
-    }
-    (0, import_node_assert4.default)(
-      reportTpl,
-      "reportTpl should be set before writing report in browser"
-    );
-    return reportTpl;
-  }
-  if (!reportTpl) {
-    let reportPath = (0, import_node_path.join)(__dirname, "../../report/index.html");
-    if (!(0, import_node_fs.existsSync)(reportPath)) {
-      reportPath = (0, import_node_path.join)(__dirname, "../report/index.html");
+`;
+}
+var assertSchema = {
+  type: "json_schema",
+  json_schema: {
+    name: "assert",
+    strict: true,
+    schema: {
+      type: "object",
+      properties: {
+        thought: {
+          type: "string",
+          description: "The thought process behind the assertion"
+        },
+        pass: {
+          type: "boolean",
+          description: "Whether the assertion passed or failed"
+        }
+      },
+      required: ["thought", "pass"],
+      additionalProperties: false
     }
-    reportTpl = (0, import_node_fs.readFileSync)(reportPath, "utf-8");
   }
-  return reportTpl;
+};
+function describeSize(size) {
+  return `${size.width} x ${size.height}`;
 }
-function reportHTMLContent(dumpData) {
-  const tpl = getReportTpl();
-  let reportContent;
-  if (Array.isArray(dumpData) && dumpData.length === 0 || typeof dumpData === "undefined") {
-    reportContent = tpl.replace(
-      /\s+{{dump}}\s+/,
-      `<script type="midscene_web_dump" type="application/json"></script>`
-    );
-  } else if (typeof dumpData === "string") {
-    reportContent = tpl.replace(
-      /\s+{{dump}}\s+/,
-      `<script type="midscene_web_dump" type="application/json">${dumpData}</script>`
-    );
-  } else {
-    const dumps = dumpData.map(({ dumpString, attributes }) => {
-      const attributesArr = Object.keys(attributes || {}).map((key) => {
-        return `${key}="${encodeURIComponent(attributes[key])}"`;
-      });
-      return `<script type="midscene_web_dump" type="application/json" ${attributesArr.join(
-        " "
-      )}
->${dumpString}
-</script>`;
-    });
-    reportContent = tpl.replace(/\s+{{dump}}\s+/, dumps.join("\n"));
+function truncateText(text, maxLength = 20) {
+  if (text && text.length > maxLength) {
+    return `${text.slice(0, maxLength)}...`;
   }
-  return reportContent;
+  return text;
 }
-function writeDumpReport(fileName, dumpData) {
-  if (import_utils2.ifInBrowser) {
-    console.log("will not write report in browser");
-    return null;
-  }
-  const midscenePkgInfo = (0, import_fs.getRunningPkgInfo)(__dirname);
-  if (!midscenePkgInfo) {
-    console.warn("midscenePkgInfo not found, will not write report");
-    return null;
-  }
-  const reportPath = (0, import_node_path.join)(getLogDirByType("report"), `${fileName}.html`);
-  const reportContent = reportHTMLContent(dumpData);
-  (0, import_node_fs.writeFileSync)(reportPath, reportContent);
-  return reportPath;
+function elementByPosition(elementsInfo, position) {
+  (0, import_node_assert4.default)(typeof position !== "undefined", "position is required for query");
+  const item = elementsInfo.find((item2) => {
+    return item2.rect.left <= position.x && position.x <= item2.rect.left + item2.rect.width && item2.rect.top <= position.y && position.y <= item2.rect.top + item2.rect.height;
+  });
+  return item;
 }
-function writeLogFile(opts) {
-  if (import_utils2.ifInBrowser) {
-    return "/mock/report.html";
+async function describeUserPage(context) {
+  const { screenshotBase64 } = context;
+  let width;
+  let height;
+  if (context.size) {
+    ({ width, height } = context.size);
+  } else {
+    const imgSize = await (0, import_img.imageInfoOfBase64)(screenshotBase64);
+    ({ width, height } = imgSize);
   }
-  const { fileName, fileExt, fileContent, type = "dump" } = opts;
-  const targetDir = getLogDirByType(type);
-  if (!logEnvReady) {
-    (0, import_node_assert4.default)(targetDir, "logDir should be set before writing dump file");
-    const gitIgnorePath = (0, import_node_path.join)(targetDir, "../../.gitignore");
-    let gitIgnoreContent = "";
-    if ((0, import_node_fs.existsSync)(gitIgnorePath)) {
-      gitIgnoreContent = (0, import_node_fs.readFileSync)(gitIgnorePath, "utf-8");
+  const elementsInfo = context.content;
+  const idElementMap = {};
+  elementsInfo.forEach((item) => {
+    idElementMap[item.id] = item;
+    return { ...item };
+  });
+  const elementInfosDescription = cropFieldInformation(elementsInfo);
+  return {
+    description: `
+{
+  // The size of the page
+  "pageSize": ${describeSize({ width, height })},
+  ${// if match by id, use the description of the element
+    getAIConfig(MATCH_BY_POSITION) ? "" : `// json description of the element
+  "content": ${JSON.stringify(elementInfosDescription)}
+      `}
+}`,
+    elementById(id) {
+      (0, import_node_assert4.default)(typeof id !== "undefined", "id is required for query");
+      const item = idElementMap[`${id}`];
+      return item;
+    },
+    elementByPosition(position) {
+      return elementByPosition(elementsInfo, position);
     }
-    const logDirName = (0, import_node_path.basename)(logDir);
-    if (!gitIgnoreContent.includes(`${logDirName}/`)) {
-      (0, import_node_fs.writeFileSync)(
-        gitIgnorePath,
-        `${gitIgnoreContent}
-# Midscene.js dump files
-${logDirName}/report
-${logDirName}/dump
-`,
-        "utf-8"
+  };
+}
+function cropFieldInformation(elementsInfo) {
+  const elementInfosDescription = elementsInfo.map(
+    (item) => {
+      const { id, attributes = {}, rect, content } = item;
+      const tailorContent = truncateText(content);
+      const tailorAttributes = Object.keys(attributes).reduce(
+        (res, currentKey) => {
+          const attributeVal = attributes[currentKey];
+          if (currentKey === "style" || currentKey === "src")
+            return res;
+          if (currentKey === "nodeType") {
+            res[currentKey] = attributeVal.replace(/\sNode$/, "");
+          } else {
+            res[currentKey] = truncateText(attributeVal);
+          }
+          return res;
+        },
+        {}
       );
+      return {
+        id,
+        markerId: item.indexId,
+        attributes: tailorAttributes,
+        rect: {
+          left: rect.left,
+          top: rect.top,
+          width: rect.width,
+          height: rect.height
+          // remove 'zoom' if it exists
+        },
+        content: tailorContent
+      };
     }
-    logEnvReady = true;
-  }
-  const filePath = (0, import_node_path.join)(targetDir, `${fileName}.${fileExt}`);
-  const outputResourceDir = (0, import_node_path.dirname)(filePath);
-  if (!(0, import_node_fs.existsSync)(outputResourceDir)) {
-    (0, import_node_fs.mkdirSync)(outputResourceDir, { recursive: true });
+  );
+  return elementInfosDescription;
+}
+function retrieveElement(prompt, opt) {
+  if (opt == null ? void 0 : opt.multi) {
+    return `follow ${ELEMENTS_LOCATOR_PREFIX}: ${prompt}`;
   }
-  (0, import_node_fs.writeFileSync)(filePath, fileContent);
-  if (opts == null ? void 0 : opts.generateReport) {
-    return writeDumpReport(fileName, fileContent);
+  return `follow ${ONE_ELEMENT_LOCATOR_PREFIX}: ${prompt}`;
+}
+function ifElementTypeResponse(response) {
+  if (typeof response !== "string") {
+    return false;
   }
-  return filePath;
+  return response.startsWith(ONE_ELEMENT_LOCATOR_PREFIX) || response.startsWith(ELEMENTS_LOCATOR_PREFIX);
 }
-function replacerForPageObject(key, value) {
-  var _a, _b;
-  if (value && ((_a = value.constructor) == null ? void 0 : _a.name) === "Page") {
-    return "[Page object]";
+function splitElementResponse(response) {
+  const oneElementSplitter = `${ONE_ELEMENT_LOCATOR_PREFIX}/`;
+  if (response.startsWith(oneElementSplitter)) {
+    const id = response.slice(oneElementSplitter.length);
+    if (id.indexOf(",") >= 0) {
+      console.warn(`unexpected comma in one element response: ${id}`);
+    }
+    return id ? id : null;
   }
-  if (value && ((_b = value.constructor) == null ? void 0 : _b.name) === "Browser") {
-    return "[Browser object]";
+  const elementsSplitter = `${ELEMENTS_LOCATOR_PREFIX}/`;
+  if (response.startsWith(elementsSplitter)) {
+    const idsString = response.slice(elementsSplitter.length);
+    if (!idsString) {
+      return [];
+    }
+    return idsString.split(",");
   }
-  return value;
-}
-function stringifyDumpData(data, indents) {
-  return JSON.stringify(data, replacerForPageObject, indents);
+  return null;
 }
-function getVersion() {
-  return "0.8.4";
+function retrieveSection(prompt) {
+  return `${SECTION_MATCHER_FLAG}${prompt}`;
 }
-// src/action/executor.ts
-var Executor = class {
-  constructor(name, description, tasks) {
-    __publicField(this, "name");
-    __publicField(this, "description");
-    __publicField(this, "tasks");
-    // status of executor
-    __publicField(this, "status");
-    this.status = tasks && tasks.length > 0 ? "pending" : "init";
-    this.name = name;
-    this.description = description;
-    this.tasks = (tasks || []).map((item) => this.markTaskAsPending(item));
+// src/ai-model/openai/index.ts
+function preferOpenAIModel(preferVendor) {
+  if (preferVendor && preferVendor !== "openAI")
+    return false;
+  if (getAIConfig(OPENAI_API_KEY))
+    return true;
+  return Boolean(getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON));
+}
+var defaultModel = "gpt-4o-2024-08-06";
+function getModelName() {
+  let modelName = defaultModel;
+  const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+  if (nameInConfig) {
+    modelName = nameInConfig;
   }
-  markTaskAsPending(task) {
-    return {
-      status: "pending",
-      ...task
-    };
+  return modelName;
+}
+async function createOpenAI() {
+  let openai;
+  const extraConfigString = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+  const extraConfig = extraConfigString ? JSON.parse(extraConfigString) : {};
+  if (getAIConfig(OPENAI_USE_AZURE)) {
+    openai = new import_openai2.AzureOpenAI({
+      baseURL: getAIConfig(OPENAI_BASE_URL),
+      apiKey: getAIConfig(OPENAI_API_KEY),
+      ...extraConfig,
+      dangerouslyAllowBrowser: true
+    });
+  } else {
+    openai = new import_openai2.default({
+      baseURL: getAIConfig(OPENAI_BASE_URL),
+      apiKey: getAIConfig(OPENAI_API_KEY),
+      ...extraConfig,
+      dangerouslyAllowBrowser: true
+    });
   }
-  async append(task) {
-    (0, import_node_assert5.default)(
-      this.status !== "error",
-      "executor is in error state, cannot append task"
-    );
-    if (Array.isArray(task)) {
-      this.tasks.push(...task.map((item) => this.markTaskAsPending(item)));
-    } else {
-      this.tasks.push(this.markTaskAsPending(task));
-    }
-    if (this.status !== "running") {
-      this.status = "pending";
+  if (getAIConfig(MIDSCENE_LANGSMITH_DEBUG)) {
+    if (import_utils3.ifInBrowser) {
+      throw new Error("langsmith is not supported in browser");
     }
+    console.log("DEBUGGING MODE: langsmith wrapper enabled");
+    const { wrapOpenAI: wrapOpenAI2 } = await Promise.resolve().then(() => (init_wrappers2(), wrappers_exports));
+    openai = wrapOpenAI2(openai);
   }
-  async flush() {
-    if (this.status === "init" && this.tasks.length > 0) {
-      console.warn(
-        "illegal state for executor, status is init but tasks are not empty"
-      );
-    }
-    (0, import_node_assert5.default)(this.status !== "running", "executor is already running");
-    (0, import_node_assert5.default)(this.status !== "completed", "executor is already completed");
-    (0, import_node_assert5.default)(this.status !== "error", "executor is in error state");
-    const nextPendingIndex = this.tasks.findIndex(
-      (task) => task.status === "pending"
-    );
-    if (nextPendingIndex < 0) {
-      return;
-    }
-    this.status = "running";
-    let taskIndex = nextPendingIndex;
-    let successfullyCompleted = true;
-    let previousFindOutput;
-    while (taskIndex < this.tasks.length) {
-      const task = this.tasks[taskIndex];
-      (0, import_node_assert5.default)(
-        task.status === "pending",
-        `task status should be pending, but got: ${task.status}`
-      );
-      task.timing = {
-        start: Date.now()
-      };
-      try {
-        task.status = "running";
-        (0, import_node_assert5.default)(
-          ["Insight", "Action", "Planning"].indexOf(task.type) >= 0,
-          `unsupported task type: ${task.type}`
-        );
-        const { executor, param } = task;
-        (0, import_node_assert5.default)(executor, `executor is required for task type: ${task.type}`);
-        let returnValue;
-        const executorContext = {
-          task,
-          element: previousFindOutput == null ? void 0 : previousFindOutput.element
-        };
-        if (task.type === "Insight") {
-          (0, import_node_assert5.default)(
-            task.subType === "Locate" || task.subType === "Query" || task.subType === "Assert",
-            `unsupported insight subType: ${task.subType}`
-          );
-          returnValue = await task.executor(param, executorContext);
-          if (task.subType === "Locate") {
-            previousFindOutput = returnValue == null ? void 0 : returnValue.output;
-          }
-        } else if (task.type === "Action" || task.type === "Planning") {
-          returnValue = await task.executor(param, executorContext);
-        } else {
-          console.warn(
-            `unsupported task type: ${task.type}, will try to execute it directly`
-          );
-          returnValue = await task.executor(param, executorContext);
-        }
-        Object.assign(task, returnValue);
-        task.status = "finished";
-        task.timing.end = Date.now();
-        task.timing.cost = task.timing.end - task.timing.start;
-        taskIndex++;
-      } catch (e) {
-        successfullyCompleted = false;
-        task.error = (e == null ? void 0 : e.message) || "error-without-message";
-        task.errorStack = e.stack;
-        task.status = "failed";
-        task.timing.end = Date.now();
-        task.timing.cost = task.timing.end - task.timing.start;
+  return openai;
+}
+async function call(messages, responseFormat) {
+  const openai = await createOpenAI();
+  const shouldPrintTiming = typeof getAIConfig(MIDSCENE_DEBUG_AI_PROFILE) === "string";
+  if (getAIConfig(MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG)) {
+    console.log(allAIConfig());
+  }
+  const startTime = Date.now();
+  const model = getModelName();
+  const completion = await openai.chat.completions.create({
+    model,
+    messages,
+    response_format: responseFormat,
+    temperature: 0.1,
+    stream: false
+    // betas: ['computer-use-2024-10-22'],
+  });
+  shouldPrintTiming && console.log(
+    "Midscene - AI call",
+    model,
+    completion.usage,
+    `${Date.now() - startTime}ms`
+  );
+  const { content } = completion.choices[0].message;
+  (0, import_node_assert5.default)(content, "empty content");
+  return content;
+}
+async function callToGetJSONObject(messages, AIActionTypeValue) {
+  let responseFormat = {
+    type: "json_object" /* JSON */
+  };
+  const model = getModelName();
+  if (model === "gpt-4o-2024-08-06") {
+    switch (AIActionTypeValue) {
+      case 0 /* ASSERT */:
+        responseFormat = assertSchema;
+        break;
+      case 1 /* INSPECT_ELEMENT */:
+        responseFormat = findElementSchema;
+        break;
+      case 2 /* EXTRACT_DATA */:
+        break;
+      case 3 /* PLAN */:
+        responseFormat = planSchema;
         break;
-      }
-    }
-    for (let i = taskIndex + 1; i < this.tasks.length; i++) {
-      this.tasks[i].status = "cancelled";
-    }
-    if (successfullyCompleted) {
-      this.status = "completed";
-    } else {
-      this.status = "error";
-    }
-    if (this.tasks.length) {
-      const outputIndex = Math.min(taskIndex, this.tasks.length - 1);
-      return this.tasks[outputIndex].output;
     }
   }
-  isInErrorState() {
-    return this.status === "error";
+  if (model.startsWith("gemini")) {
+    responseFormat = { type: "text" /* TEXT */ };
   }
-  latestErrorTask() {
-    if (this.status !== "error") {
-      return null;
-    }
-    const errorTaskIndex = this.tasks.findIndex(
-      (task) => task.status === "failed"
-    );
-    if (errorTaskIndex >= 0) {
-      return this.tasks[errorTaskIndex];
-    }
-    return null;
+  const response = await call(messages, responseFormat);
+  (0, import_node_assert5.default)(response, "empty response");
+  const jsonContent = extractJSONFromCodeBlock(response);
+  try {
+    return JSON.parse(jsonContent);
+  } catch (e) {
+    throw Error(`parse json error: ${jsonContent}`);
   }
-  dump() {
-    const dumpData = {
-      sdkVersion: getVersion(),
-      model_name: getAIConfig(MIDSCENE_MODEL_NAME) || "",
-      logTime: Date.now(),
-      name: this.name,
-      description: this.description,
-      tasks: this.tasks
-    };
-    return dumpData;
+}
+function extractJSONFromCodeBlock(response) {
+  const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
+  if (jsonMatch) {
+    return jsonMatch[1];
   }
-};
-// src/insight/index.ts
-var import_node_assert9 = __toESM(require("assert"));
+  const codeBlockMatch = response.match(/```(?:json)?\s*(\{[\s\S]*?\})\s*```/);
+  if (codeBlockMatch) {
+    return codeBlockMatch[1];
+  }
+  const jsonLikeMatch = response.match(/\{[\s\S]*\}/);
+  if (jsonLikeMatch) {
+    return jsonLikeMatch[0];
+  }
+  return response;
+}
 // src/ai-model/inspect.ts
 var import_node_assert6 = __toESM(require("assert"));
@@ -6094,6 +6154,7 @@ var src_default = Insight;
   allAIConfig,
   getAIConfig,
   getElement,
+  getLogDirByType,
   getSection,
   getVersion,
   overrideAIConfig,