@midscene/core 0.25.4-beta-20250807062119.0 → 0.25.4-beta-20250811113343.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +7 -6
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +374 -598
  4. package/dist/es/chunk-5IZMFZPA.js.map +1 -0
  5. package/dist/es/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
  6. package/dist/es/index.d.ts +6 -6
  7. package/dist/es/index.js +4 -5
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  10. package/dist/es/{types-7435eba0.d.ts → types-16cd9f75.d.ts} +11 -8
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +7 -6
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +367 -591
  16. package/dist/lib/chunk-5IZMFZPA.js.map +1 -0
  17. package/dist/lib/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
  18. package/dist/lib/index.d.ts +6 -6
  19. package/dist/lib/index.js +14 -15
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  22. package/dist/{types/types-7435eba0.d.ts → lib/types-16cd9f75.d.ts} +11 -8
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +7 -6
  26. package/dist/types/index.d.ts +6 -6
  27. package/dist/types/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  28. package/dist/{lib/types-7435eba0.d.ts → types/types-16cd9f75.d.ts} +11 -8
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-G2JTYWI6.js.map +0 -1
  32. package/dist/lib/chunk-G2JTYWI6.js.map +0 -1
  33. /package/dist/es/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
  34. /package/dist/lib/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
@@ -5,16 +5,35 @@ import {
  getBearerTokenProvider
  } from "@azure/identity";
  import {
+ ANTHROPIC_API_KEY,
+ AZURE_OPENAI_API_VERSION,
+ AZURE_OPENAI_DEPLOYMENT,
+ AZURE_OPENAI_ENDPOINT,
+ AZURE_OPENAI_KEY,
  MIDSCENE_API_TYPE,
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_AZURE_OPENAI_SCOPE,
+ MIDSCENE_DEBUG_AI_PROFILE,
+ MIDSCENE_DEBUG_AI_RESPONSE,
  MIDSCENE_LANGSMITH_DEBUG,
+ MIDSCENE_MODEL_NAME,
+ MIDSCENE_OPENAI_HTTP_PROXY,
+ MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_OPENAI_SOCKS_PROXY,
+ MIDSCENE_USE_ANTHROPIC_SDK,
+ MIDSCENE_USE_AZURE_OPENAI,
+ OPENAI_API_KEY,
+ OPENAI_BASE_URL,
  OPENAI_MAX_TOKENS,
- getAIConfig as getAIConfig2,
- getAIConfigInBoolean as getAIConfigInBoolean2,
+ OPENAI_USE_AZURE,
+ getAIConfig,
+ getAIConfigInBoolean,
+ getAIConfigInJson,
  uiTarsModelVersion,
- vlLocateMode as vlLocateMode3
+ vlLocateMode as vlLocateMode2
  } from "@midscene/shared/env";
- import { getDebug as getDebug3 } from "@midscene/shared/logger";
- import { assert as assert4 } from "@midscene/shared/utils";
+ import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
+ import { assert as assert3 } from "@midscene/shared/utils";
  import { ifInBrowser } from "@midscene/shared/utils";
  import { HttpsProxyAgent } from "https-proxy-agent";
  import { jsonrepair } from "jsonrepair";
@@ -36,11 +55,10 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
  AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
  return AIActionType2;
  })(AIActionType || {});
- async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
+ async function callAiFn(msgs, AIActionTypeValue) {
  const { content, usage } = await callToGetJSONObject(
  msgs,
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  );
  return { content, usage };
  }
@@ -615,179 +633,80 @@ Here is the item user want to find:
  });

  // src/ai-model/prompt/llm-planning.ts
+ import assert2 from "assert";
  import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
-
- // src/image/index.ts
- import {
- imageInfo,
- imageInfoOfBase64,
- localImg2Base64,
- httpImg2Base64,
- resizeImg,
- saveBase64Image,
- zoomForGPT4o
- } from "@midscene/shared/img";
-
- // src/ai-model/prompt/util.ts
- import { NodeType as NodeType2 } from "@midscene/shared/constants";
- import { vlLocateMode as vlLocateMode2 } from "@midscene/shared/env";
- import {
- descriptionOfTree,
- generateElementByPosition,
- treeToList as treeToList2
- } from "@midscene/shared/extractor";
- import { assert as assert2 } from "@midscene/shared/utils";
- function describeSize(size) {
- return `${size.width} x ${size.height}`;
- }
- var distanceThreshold = 16;
- function elementByPositionWithElementInfo(treeRoot, position, options) {
- const requireStrictDistance = options?.requireStrictDistance ?? true;
- const filterPositionElements = options?.filterPositionElements ?? false;
- assert2(typeof position !== "undefined", "position is required for query");
- const matchingElements = [];
- function dfs(node) {
- if (node?.node) {
- const item = node.node;
- if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
- if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
- matchingElements.push(item);
- }
- }
- }
- for (const child of node.children) {
- dfs(child);
- }
- }
- dfs(treeRoot);
- if (matchingElements.length === 0) {
- return void 0;
- }
- const element = matchingElements.reduce((smallest, current) => {
- const smallestArea = smallest.rect.width * smallest.rect.height;
- const currentArea = current.rect.width * current.rect.height;
- return currentArea < smallestArea ? current : smallest;
- });
- const distanceToCenter = distance(
- { x: element.center[0], y: element.center[1] },
- position
- );
- if (requireStrictDistance) {
- return distanceToCenter <= distanceThreshold ? element : void 0;
- }
- return element;
- }
- function distance(point1, point2) {
- return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
- }
- var samplePageDescription = `
- And the page is described as follows:
- ====================
- The size of the page: 1280 x 720
- Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
-
- Description of all the elements in screenshot:
- <div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
- <h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
- The username is accepted
- </h4>
- ...many more
- </div>
- ====================
- `;
- async function describeUserPage(context, opt) {
- const { screenshotBase64 } = context;
- let width;
- let height;
- if (context.size) {
- ({ width, height } = context.size);
- } else {
- const imgSize = await imageInfoOfBase64(screenshotBase64);
- ({ width, height } = imgSize);
- }
- const treeRoot = context.tree;
- const idElementMap = {};
- const flatElements = treeToList2(treeRoot);
- if (opt?.domIncluded === true && flatElements.length >= 5e3) {
- console.warn(
- 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
- );
- }
- flatElements.forEach((element) => {
- idElementMap[element.id] = element;
- if (typeof element.indexId !== "undefined") {
- idElementMap[`${element.indexId}`] = element;
- }
- });
- let pageDescription = "";
- const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
- if (opt?.domIncluded || !vlLocateMode2()) {
- const contentTree = await descriptionOfTree(
- treeRoot,
- opt?.truncateTextLength,
- opt?.filterNonTextContent,
- visibleOnly
- );
- const sizeDescription = describeSize({ width, height });
- pageDescription = `The size of the page: ${sizeDescription}
- The page elements tree:
- ${contentTree}`;
- }
- return {
- description: pageDescription,
- elementById(idOrIndexId) {
- assert2(typeof idOrIndexId !== "undefined", "id is required for query");
- const item = idElementMap[`${idOrIndexId}`];
- return item;
- },
- elementByPosition(position, size) {
- return elementByPositionWithElementInfo(treeRoot, position);
- },
- insertElementByPosition(position) {
- const element = generateElementByPosition(position);
- treeRoot.children.push({
- node: element,
- children: []
- });
- flatElements.push(element);
- idElementMap[element.id] = element;
- return element;
- },
- size: { width, height }
- };
- }
-
- // src/ai-model/prompt/llm-planning.ts
  var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
- var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
- var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
+ var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
+ var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
  var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
  var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
+ var llmLocateParam = `locate: {"id": string, "prompt": string}`;
+ var descriptionForAction = (action, locatorScheme) => {
+ const tab = " ";
+ let locateParam = "";
+ if (action.location === "required") {
+ locateParam = locatorScheme;
+ } else if (action.location === "optional") {
+ locateParam = `${locatorScheme} | null`;
+ } else if (action.location === false) {
+ locateParam = "";
+ }
+ const locatorParam = locateParam ? `${tab}- ${locateParam}` : "";
+ let whatToLocate = "";
+ if (action.whatToLocate) {
+ if (!locateParam) {
+ console.warn(
+ `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
+ );
+ } else {
+ whatToLocate = `${tab}- whatToLocate: ${action.whatToLocate}`;
+ }
+ }
+ let paramSchema = "";
+ if (action.paramSchema) {
+ paramSchema = `${tab}- paramSchema: ${action.paramSchema}`;
+ }
+ let paramDescription = "";
+ if (action.paramDescription) {
+ assert2(
+ paramSchema,
+ `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
+ );
+ paramDescription = `${tab}- paramDescription: ${action.paramDescription}`;
+ }
+ const fields = [
+ paramSchema,
+ paramDescription,
+ locatorParam,
+ whatToLocate
+ ].filter(Boolean);
+ return `- ${action.name}
+ - type: "${action.name}"
+ - description: ${action.description}
+ ${fields.join("\n")}
+ `.trim();
+ };
  var systemTemplateOfVLPlanning = ({
- pageType,
+ actionSpace,
  vlMode
- }) => `
+ }) => {
+ const actionNameList = actionSpace.map((action) => action.name).join(", ");
+ const actionDescriptionList = actionSpace.map(
+ (action) => descriptionForAction(action, vlLocateParam)
+ );
+ const actionList = actionDescriptionList.join("\n");
+ return `
  Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.

  Restriction:
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
+ - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
  - Don't repeat actions in the previous logs.
  - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.

  Supporting actions:
- - Tap: { type: "Tap", ${vlLocateParam} }
- - RightClick: { type: "RightClick", ${vlLocateParam} }
- - Hover: { type: "Hover", ${vlLocateParam} }
- - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
- - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
- - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
- ${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
- - AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
- - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
- - AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
- - AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
+ ${actionList}

  Field description:
  * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@@ -822,8 +741,16 @@ this and output the JSON:
  }
  }
  `;
- var llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
- var systemTemplateOfLLM = ({ pageType }) => `
+ };
+ var systemTemplateOfLLM = ({
+ actionSpace
+ }) => {
+ const actionNameList = actionSpace.map((action) => action.name).join(" / ");
+ const actionDescriptionList = actionSpace.map(
+ (action) => descriptionForAction(action, llmLocateParam)
+ );
+ const actionList = actionDescriptionList.join("\n");
+ return `
  ## Role

  You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@@ -837,7 +764,7 @@ You are a versatile professional in software UI automation. Your outstanding con
  ## Workflow

  1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -855,65 +782,30 @@ You are a versatile professional in software UI automation. Your outstanding con

  The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:

- type LocateParam = {{
+ type LocateParam = {
  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
  "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
- }} | null // If it's not on the page, the LocateParam should be null
+ } | null // If it's not on the page, the LocateParam should be null

  ## Supported actions

  Each action has a \`type\` and corresponding \`param\`. To be detailed:
- - type: 'Tap'
- * {{ ${llmLocateParam} }}
- - type: 'RightClick'
- * {{ ${llmLocateParam} }}
- - type: 'Hover'
- * {{ ${llmLocateParam} }}
- - type: 'Input', replace the value in the input field
- * {{ ${llmLocateParam}, param: {{ value: string }} }}
- * \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
- - type: 'KeyboardPress', press a key
- * {{ param: {{ value: string }} }}
- - type: 'Scroll', scroll up or down.
- * {{
- ${llmLocateParam},
- param: {{
- direction: 'down'(default) | 'up' | 'right' | 'left',
- scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
- distance: null | number
- }}
- }}
- * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
- * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
- * {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
- - type: 'ExpectedFalsyCondition'
- * {{ param: {{ reason: string }} }}
- * use this action when the conditional statement talked about in the instruction is falsy.
- - type: 'Sleep'
- * {{ param: {{ timeMs: number }} }}
- ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
- * {{ param: {{ x: number, y: number, duration?: number }} }}
- - type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
- * {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
- `;
+ ${actionList}
+
+ `.trim();
+ };
  var outputTemplate = `
  ## Output JSON Format:

  The JSON format is as follows:

- {{
+ {
  "actions": [
  // ... some actions
  ],
  ${llmCurrentLog}
  ${commonOutputFields}
- }}
+ }

  ## Examples

@@ -929,68 +821,62 @@ By viewing the page screenshot and description, you should consider this and out
  * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
  * The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.

- {{
+ {
  "actions":[
- {{
+ {
  "type": "Tap",
  "thought": "Click the language switch button to open the language options.",
  "param": null,
- "locate": {{ id: "c81c4e9a33", prompt: "The language switch button" }},
- }},
- {{
+ "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
+ },
+ {
  "type": "Sleep",
  "thought": "Wait for 1 second to ensure the language options are displayed.",
- "param": {{ "timeMs": 1000 }},
- }}
+ "param": { "timeMs": 1000 },
+ }
  ],
  "error": null,
  "more_actions_needed_by_instruction": true,
  "log": "Click the language switch button to open the language options. Wait for 1 second",
- }}
+ }

  ### Example: What NOT to do
  Wrong output:
- {{
+ {
  "actions":[
- {{
+ {
  "type": "Tap",
  "thought": "Click the language switch button to open the language options.",
  "param": null,
- "locate": {{
- {{ "id": "c81c4e9a33" }}, // WRONG: prompt is missing
- }}
- }},
- {{
+ "locate": {
+ { "id": "c81c4e9a33" }, // WRONG: prompt is missing
+ }
+ },
+ {
  "type": "Tap",
  "thought": "Click the English option",
  "param": null,
  "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
- }}
+ }
  ],
  "more_actions_needed_by_instruction": false, // WRONG: should be true
  "log": "Click the language switch button to open the language options",
- }}
+ }

  Reason:
  * The \`prompt\` is missing in the first 'Locate' action
  * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
  `;
  async function systemPromptToTaskPlanning({
- pageType,
+ actionSpace,
  vlMode
  }) {
  if (vlMode) {
- return systemTemplateOfVLPlanning({ pageType, vlMode });
+ return systemTemplateOfVLPlanning({ actionSpace, vlMode });
  }
- const promptTemplate = new PromptTemplate2({
- template: `${systemTemplateOfLLM({ pageType })}
+ return `${systemTemplateOfLLM({ actionSpace })}

- ${outputTemplate}`,
- inputVariables: ["pageDescription"]
- });
- return await promptTemplate.format({
- pageDescription: samplePageDescription
- });
+ ${outputTemplate}`;
  }
  var planSchema = {
  type: "json_schema",
@@ -1145,57 +1031,24 @@ pageDescription:
  });
  };

- // src/ai-model/service-caller/utils.ts
- import {
- ANTHROPIC_API_KEY,
- AZURE_OPENAI_API_VERSION,
- AZURE_OPENAI_DEPLOYMENT,
- AZURE_OPENAI_ENDPOINT,
- AZURE_OPENAI_KEY,
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_AZURE_OPENAI_SCOPE,
- MIDSCENE_DEBUG_AI_PROFILE,
- MIDSCENE_DEBUG_AI_RESPONSE,
- MIDSCENE_MODEL_NAME,
- MIDSCENE_OPENAI_HTTP_PROXY,
- MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_OPENAI_SOCKS_PROXY,
- MIDSCENE_USE_ANTHROPIC_SDK,
- MIDSCENE_USE_AZURE_OPENAI,
- MIDSCENE_VQA_ANTHROPIC_API_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_AZURE_OPENAI_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- MIDSCENE_VQA_MODEL_NAME,
- MIDSCENE_VQA_OPENAI_API_KEY,
- MIDSCENE_VQA_OPENAI_BASE_URL,
- MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- MIDSCENE_VQA_OPENAI_USE_AZURE,
- MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- MIDSCENE_VQA_USE_AZURE_OPENAI,
- OPENAI_API_KEY,
- OPENAI_BASE_URL,
- OPENAI_USE_AZURE,
- getAIConfig,
- getAIConfigInBoolean,
- getAIConfigInJson
- } from "@midscene/shared/env";
- import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
- import { assert as assert3 } from "@midscene/shared/utils";
- function getModelName() {
- let modelName = "gpt-4o";
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
- if (nameInConfig) {
- modelName = nameInConfig;
- }
- return modelName;
+ // src/ai-model/service-caller/index.ts
+ function checkAIConfig() {
+ const openaiKey = getAIConfig(OPENAI_API_KEY);
+ const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
+ const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
+ const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ if (openaiKey)
+ return true;
+ if (azureConfig)
+ return true;
+ if (anthropicKey)
+ return true;
+ return Boolean(initConfigJson);
  }
+ var debugConfigInitialized = false;
  function initDebugConfig() {
+ if (debugConfigInitialized)
+ return;
  const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
  let debugConfig = "";
  if (shouldPrintTiming) {
@@ -1220,232 +1073,27 @@ function initDebugConfig() {
  if (debugConfig) {
  enableDebug(debugConfig);
  }
+ debugConfigInitialized = true;
  }
- var createAssert = (modelNameKey, modelName) => (value, key, modelVendorFlag) => {
- if (modelVendorFlag) {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName} and ${modelVendorFlag} has also been specified, but got: ${value}
- Please check your config.`
- );
- } else {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName}, but got: ${value}
- Please check your config.`
- );
- }
- };
- var getModelConfigFromEnv = (modelName, keys, valueAssert) => {
- const socksProxy = getAIConfig(keys.socksProxy);
- const httpProxy = getAIConfig(keys.httpProxy);
- if (getAIConfig(keys.openaiUseAzureDeprecated)) {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(
- openaiBaseURL,
- keys.openaiBaseURL,
- keys.openaiUseAzureDeprecated
- );
- valueAssert(openaiApiKey, keys.openaiApiKey, keys.openaiUseAzureDeprecated);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiUseAzureDeprecated: true,
- openaiApiKey,
- openaiBaseURL,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAzureOpenai)) {
- const azureOpenaiScope = getAIConfig(keys.azureOpenaiScope);
- const azureOpenaiApiKey = getAIConfig(keys.azureOpenaiApiKey);
- const azureOpenaiEndpoint = getAIConfig(keys.azureOpenaiEndpoint);
- const azureOpenaiDeployment = getAIConfig(keys.azureOpenaiDeployment);
- const azureOpenaiApiVersion = getAIConfig(keys.azureOpenaiApiVersion);
- const azureExtraConfig = getAIConfigInJson(keys.azureExtraConfig);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(azureOpenaiApiKey, keys.azureOpenaiApiKey, keys.useAzureOpenai);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAzureOpenai: true,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiDeployment,
- azureOpenaiApiVersion,
- azureExtraConfig,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAnthropicSdk)) {
- const anthropicApiKey = getAIConfig(keys.anthropicApiKey);
- valueAssert(anthropicApiKey, keys.anthropicApiKey, keys.useAnthropicSdk);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAnthropicSdk: true,
- anthropicApiKey
- };
- } else {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(openaiBaseURL, keys.openaiBaseURL);
- valueAssert(openaiApiKey, keys.openaiApiKey);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig
- };
- }
- };
- var maskKey = (key, maskChar = "*") => {
- if (typeof key !== "string" || key.length === 0) {
- return key;
- }
- const prefixLen = 3;
- const suffixLen = 3;
- const keepLength = prefixLen + suffixLen;
- if (key.length <= keepLength) {
- return key;
- }
- const prefix = key.substring(0, prefixLen);
- const suffix = key.substring(key.length - suffixLen);
- const maskLength = key.length - keepLength;
- const mask = maskChar.repeat(maskLength);
- return `${prefix}${mask}${suffix}`;
- };
- var maskConfig = (config) => {
- return Object.fromEntries(
- Object.entries(config).map(([key, value]) => [
- key,
- ["openaiApiKey", "azureOpenaiApiKey", "anthropicApiKey"].includes(key) ? maskKey(value) : value
- ])
- );
- };
- var decideModelConfig = (modelPreferences) => {
- initDebugConfig();
- const debugLog = getDebug2("ai:decideModelConfig");
- debugLog("modelPreferences", modelPreferences);
- const isVQAIntent = modelPreferences?.intent === "VQA";
- const vqaModelName = getAIConfig(MIDSCENE_VQA_MODEL_NAME);
- if (isVQAIntent && vqaModelName) {
- debugLog(
- `current action is a VQA action and detected ${MIDSCENE_VQA_MODEL_NAME} ${vqaModelName}, will only read VQA related model config from process.env`
- );
- const config = getModelConfigFromEnv(
- vqaModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: MIDSCENE_VQA_OPENAI_BASE_URL,
- openaiApiKey: MIDSCENE_VQA_OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: MIDSCENE_VQA_OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_VQA_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: MIDSCENE_VQA_AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- anthropicApiKey: MIDSCENE_VQA_ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_VQA_MODEL_NAME, vqaModelName)
- );
- debugLog("got model config for VQA usage:", maskConfig(config));
- return config;
- } else {
- debugLog("read model config from process.env as normal.");
- const commonModelName = getAIConfig(MIDSCENE_MODEL_NAME);
- assert3(
- commonModelName,
- `${MIDSCENE_MODEL_NAME} is empty, please check your config.`
- );
- const config = getModelConfigFromEnv(
- commonModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: OPENAI_BASE_URL,
- openaiApiKey: OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_USE_ANTHROPIC_SDK,
- anthropicApiKey: ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_MODEL_NAME, commonModelName)
- );
- debugLog("got model config for common usage:", maskConfig(config));
- return config;
+ var defaultModel = "gpt-4o";
+ function getModelName() {
+ let modelName = defaultModel;
+ const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+ if (nameInConfig) {
+ modelName = nameInConfig;
  }
- };
-
- // src/ai-model/service-caller/index.ts
+ return modelName;
+ }
  async function createChatClient({
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  }) {
- const {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig,
- openaiUseAzureDeprecated,
- useAzureOpenai,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiApiVersion,
- azureOpenaiDeployment,
- azureExtraConfig,
- useAnthropicSdk,
- anthropicApiKey
- } = decideModelConfig(modelPreferences);
+ initDebugConfig();
  let openai;
+ const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
+ const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
  let proxyAgent = void 0;
- const debugProxy = getDebug3("ai:call:proxy");
+ const debugProxy = getDebug2("ai:call:proxy");
  if (httpProxy) {
  debugProxy("using http proxy", httpProxy);
  proxyAgent = new HttpsProxyAgent(httpProxy);
@@ -1453,56 +1101,70 @@ async function createChatClient({
  debugProxy("using socks proxy", socksProxy);
  proxyAgent = new SocksProxyAgent(socksProxy);
  }
- if (openaiUseAzureDeprecated) {
+ if (getAIConfig(OPENAI_USE_AZURE)) {
  openai = new AzureOpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  dangerouslyAllowBrowser: true
  });
- } else if (useAzureOpenai) {
+ } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
+ const extraAzureConfig = getAIConfigInJson(
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
+ );
+ const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
  let tokenProvider = void 0;
- if (azureOpenaiScope) {
- assert4(
+ if (scope) {
+ assert3(
  !ifInBrowser,
  "Azure OpenAI is not supported in browser with Midscene."
  );
  const credential = new DefaultAzureCredential();
- tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);
+ assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
+ tokenProvider = getBearerTokenProvider(credential, scope);
  openai = new AzureOpenAI({
  azureADTokenProvider: tokenProvider,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
+ ...extraConfig,
+ ...extraAzureConfig
  });
  } else {
  openai = new AzureOpenAI({
- apiKey: azureOpenaiApiKey,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
+ apiKey: getAIConfig(AZURE_OPENAI_KEY),
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
  dangerouslyAllowBrowser: true,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ ...extraConfig,
+ ...extraAzureConfig
  });
  }
- } else if (!useAnthropicSdk) {
+ } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const baseURL = getAIConfig(OPENAI_BASE_URL);
+ if (typeof baseURL === "string") {
+ if (!/^https?:\/\//.test(baseURL)) {
+ throw new Error(
+ `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
+ Please check your config.`
+ );
+ }
+ }
  openai = new OpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  defaultHeaders: {
- ...openaiExtraConfig?.defaultHeaders || {},
+ ...extraConfig?.defaultHeaders || {},
  [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
  },
  dangerouslyAllowBrowser: true
  });
  }
- if (openai && getAIConfigInBoolean2(MIDSCENE_LANGSMITH_DEBUG)) {
+ if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
  if (ifInBrowser) {
  throw new Error("langsmith is not supported in browser");
  }
@@ -1513,13 +1175,14 @@ async function createChatClient({
  if (typeof openai !== "undefined") {
  return {
  completion: openai.chat.completions,
- style: "openai",
- modelName
+ style: "openai"
  };
  }
- if (useAnthropicSdk) {
+ if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const apiKey = getAIConfig(ANTHROPIC_API_KEY);
+ assert3(apiKey, "ANTHROPIC_API_KEY is required");
  openai = new Anthropic({
- apiKey: anthropicApiKey,
+ apiKey,
  httpAgent: proxyAgent,
  dangerouslyAllowBrowser: true
  });
@@ -1527,45 +1190,47 @@ async function createChatClient({
  if (typeof openai !== "undefined" && openai.messages) {
  return {
  completion: openai.messages,
- style: "anthropic",
- modelName
+ style: "anthropic"
  };
  }
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
  }
- async function call2(messages, AIActionTypeValue, options, modelPreferences) {
- const { completion, style, modelName } = await createChatClient({
- AIActionTypeValue,
- modelPreferences
+ async function call2(messages, AIActionTypeValue, responseFormat, options) {
+ assert3(
+ checkAIConfig(),
+ "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
+ );
+ const { completion, style } = await createChatClient({
+ AIActionTypeValue
  });
- const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
- const maxTokens = getAIConfig2(OPENAI_MAX_TOKENS);
- const debugCall = getDebug3("ai:call");
- const debugProfileStats = getDebug3("ai:profile:stats");
- const debugProfileDetail = getDebug3("ai:profile:detail");
+ const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
+ const debugCall = getDebug2("ai:call");
+ const debugProfileStats = getDebug2("ai:profile:stats");
+ const debugProfileDetail = getDebug2("ai:profile:detail");
  const startTime = Date.now();
+ const model = getModelName();
  const isStreaming = options?.stream && options?.onChunk;
  let content;
  let accumulated = "";
  let usage;
  let timeCost;
  const commonConfig = {
- temperature: vlLocateMode3() === "vlm-ui-tars" ? 0 : 0.1,
+ temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
  stream: !!isStreaming,
  max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
- ...vlLocateMode3() === "qwen-vl" ? {
+ ...vlLocateMode2() === "qwen-vl" ? {
  vl_high_resolution_images: true
  } : {}
  };
  try {
  if (style === "openai") {
  debugCall(
- `sending ${isStreaming ? "streaming " : ""}request to ${modelName}`
+ `sending ${isStreaming ? "streaming " : ""}request to ${model}`
  );
  if (isStreaming) {
  const stream = await completion.create(
  {
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
@@ -1622,23 +1287,23 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  }
  content = accumulated;
  debugProfileStats(
- `streaming model, ${modelName}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
+ `streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
  );
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
  });
  timeCost = Date.now() - startTime;
  debugProfileStats(
- `model, ${modelName}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
+ `model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
  );
  debugProfileDetail(
  `model usage detail: ${JSON.stringify(result.usage)}`
  );
- assert4(
+ assert3(
  result.choices,
  `invalid response from LLM service: ${JSON.stringify(result)}`
  );
@@ -1646,12 +1311,12 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  usage = result.usage;
  }
  debugCall(`response: ${content}`);
- assert4(content, "empty content");
+ assert3(content, "empty content");
  } else if (style === "anthropic") {
  const convertImageContent = (content2) => {
  if (content2.type === "image_url") {
  const imgBase64 = content2.image_url.url;
- assert4(imgBase64, "image_url is required");
+ assert3(imgBase64, "image_url is required");
  return {
  source: {
  type: "base64",
@@ -1665,7 +1330,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  };
  if (isStreaming) {
  const stream = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1709,7 +1374,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = accumulated;
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1722,7 +1387,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = result.content[0].text;
  usage = result.usage;
  }
- assert4(content, "empty content");
+ assert3(content, "empty content");
  }
  if (isStreaming && !usage) {
  const estimatedTokens = Math.max(
@@ -1756,9 +1421,10 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  throw newError;
  }
  }
- var getResponseFormat = (modelName, AIActionTypeValue) => {
+ async function callToGetJSONObject(messages, AIActionTypeValue) {
  let responseFormat;
- if (modelName.includes("gpt-4")) {
+ const model = getModelName();
+ if (model.includes("gpt-4")) {
  switch (AIActionTypeValue) {
  case 0 /* ASSERT */:
  responseFormat = assertSchema;
@@ -1775,19 +1441,11 @@ var getResponseFormat = (modelName, AIActionTypeValue) => {
  break;
  }
  }
- if (modelName === "gpt-4o-2024-05-13") {
+ if (model === "gpt-4o-2024-05-13") {
  responseFormat = { type: "json_object" /* JSON */ };
  }
- return responseFormat;
- };
- async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
- const response = await call2(
- messages,
- AIActionTypeValue,
- void 0,
- modelPreferences
- );
- assert4(response, "empty response");
+ const response = await call2(messages, AIActionTypeValue, responseFormat);
+ assert3(response, "empty response");
  const jsonContent = safeParseJson(response.content);
  return { content: jsonContent, usage: response.usage };
  }
@@ -1836,13 +1494,138 @@ function safeParseJson(input) {
  return JSON.parse(jsonrepair(cleanJsonString));
  } catch (e) {
  }
- if (vlLocateMode3() === "doubao-vision" || vlLocateMode3() === "vlm-ui-tars") {
+ if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
  const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
  return JSON.parse(jsonrepair(jsonString));
  }
  throw Error(`failed to parse json response: ${input}`);
  }

+ // src/image/index.ts
+ import {
+ imageInfo,
+ imageInfoOfBase64,
+ localImg2Base64,
+ httpImg2Base64,
+ resizeImg,
+ saveBase64Image,
+ zoomForGPT4o
+ } from "@midscene/shared/img";
+
+ // src/ai-model/prompt/util.ts
+ import { NodeType as NodeType2 } from "@midscene/shared/constants";
+ import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
+ import {
+ descriptionOfTree,
+ generateElementByPosition,
+ treeToList as treeToList2
+ } from "@midscene/shared/extractor";
+ import { assert as assert4 } from "@midscene/shared/utils";
+ function describeSize(size) {
+ return `${size.width} x ${size.height}`;
+ }
+ var distanceThreshold = 16;
+ function elementByPositionWithElementInfo(treeRoot, position, options) {
+ const requireStrictDistance = options?.requireStrictDistance ?? true;
+ const filterPositionElements = options?.filterPositionElements ?? false;
+ assert4(typeof position !== "undefined", "position is required for query");
+ const matchingElements = [];
+ function dfs(node) {
+ if (node?.node) {
+ const item = node.node;
+ if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
+ if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
+ matchingElements.push(item);
+ }
+ }
+ }
+ for (const child of node.children) {
+ dfs(child);
+ }
+ }
+ dfs(treeRoot);
+ if (matchingElements.length === 0) {
+ return void 0;
+ }
+ const element = matchingElements.reduce((smallest, current) => {
+ const smallestArea = smallest.rect.width * smallest.rect.height;
+ const currentArea = current.rect.width * current.rect.height;
+ return currentArea < smallestArea ? current : smallest;
+ });
+ const distanceToCenter = distance(
+ { x: element.center[0], y: element.center[1] },
+ position
+ );
+ if (requireStrictDistance) {
+ return distanceToCenter <= distanceThreshold ? element : void 0;
+ }
+ return element;
+ }
+ function distance(point1, point2) {
+ return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
+ }
+ async function describeUserPage(context, opt) {
+ const { screenshotBase64 } = context;
+ let width;
+ let height;
+ if (context.size) {
+ ({ width, height } = context.size);
+ } else {
+ const imgSize = await imageInfoOfBase64(screenshotBase64);
+ ({ width, height } = imgSize);
+ }
+ const treeRoot = context.tree;
+ const idElementMap = {};
+ const flatElements = treeToList2(treeRoot);
+ if (opt?.domIncluded === true && flatElements.length >= 5e3) {
+ console.warn(
+ 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
+ );
+ }
+ flatElements.forEach((element) => {
+ idElementMap[element.id] = element;
+ if (typeof element.indexId !== "undefined") {
+ idElementMap[`${element.indexId}`] = element;
+ }
+ });
+ let pageDescription = "";
+ const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
+ if (opt?.domIncluded || !vlLocateMode3()) {
+ const contentTree = await descriptionOfTree(
+ treeRoot,
+ opt?.truncateTextLength,
+ opt?.filterNonTextContent,
+ visibleOnly
+ );
+ const sizeDescription = describeSize({ width, height });
+ pageDescription = `The size of the page: ${sizeDescription}
+ The page elements tree:
+ ${contentTree}`;
+ }
+ return {
+ description: pageDescription,
+ elementById(idOrIndexId) {
+ assert4(typeof idOrIndexId !== "undefined", "id is required for query");
+ const item = idElementMap[`${idOrIndexId}`];
+ return item;
+ },
+ elementByPosition(position, size) {
+ return elementByPositionWithElementInfo(treeRoot, position);
+ },
+ insertElementByPosition(position) {
+ const element = generateElementByPosition(position);
+ treeRoot.children.push({
+ node: element,
+ children: []
+ });
+ flatElements.push(element);
+ idElementMap[element.id] = element;
+ return element;
+ },
+ size: { width, height }
+ };
+ }
+
  // src/ai-model/prompt/playwright-generator.ts
  import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";

@@ -2071,7 +1854,7 @@ Respond with YAML only, no explanations.`
  });
  }
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2194,7 +1977,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  }
  ];
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2215,7 +1998,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  import {
  MIDSCENE_USE_QWEN_VL,
  MIDSCENE_USE_VLM_UI_TARS,
- getAIConfigInBoolean as getAIConfigInBoolean3,
+ getAIConfigInBoolean as getAIConfigInBoolean2,
  vlLocateMode as vlLocateMode4
  } from "@midscene/shared/env";
  import {
@@ -2223,7 +2006,7 @@ import {
  paddingToMatchBlockByBase64,
  preProcessImageUrl
  } from "@midscene/shared/img";
- import { getDebug as getDebug4 } from "@midscene/shared/logger";
+ import { getDebug as getDebug3 } from "@midscene/shared/logger";
  import { assert as assert5 } from "@midscene/shared/utils";

  // src/ai-model/prompt/extraction.ts
@@ -2379,8 +2162,8 @@ var sectionLocatorInstruction = new PromptTemplate4({
  });

  // src/ai-model/inspect.ts
- var debugInspect = getDebug4("ai:inspect");
- var debugSection = getDebug4("ai:section");
+ var debugInspect = getDebug3("ai:inspect");
+ var debugSection = getDebug3("ai:section");
  var extraTextFromUserPrompt = (prompt) => {
  if (typeof prompt === "string") {
  return prompt;
@@ -2601,7 +2384,7 @@ async function AiLocateSection(options) {
  imageBase64 = await cropByRect(
  screenshotBase64,
  sectionRect,
- getAIConfigInBoolean3(MIDSCENE_USE_QWEN_VL)
+ getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
  );
  }
  return {
@@ -2613,13 +2396,7 @@ async function AiLocateSection(options) {
  };
  }
  async function AiExtractElementInfo(options) {
- const {
- dataQuery,
- context,
- extractOption,
- multimodalPrompt,
- modelPreferences
- } = options;
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context, {
@@ -2668,8 +2445,7 @@ async function AiExtractElementInfo(options) {
  }
  const result = await callAiFn(
  msgs,
- 2 /* EXTRACT_DATA */,
- modelPreferences
+ 2 /* EXTRACT_DATA */
  );
  return {
  parseResult: result.content,
@@ -2682,7 +2458,7 @@ async function AiAssert(options) {
  assert5(assertion, "assertion should not be empty");
  const { screenshotBase64 } = context;
  const systemPrompt = systemPromptToAssert({
- isUITars: getAIConfigInBoolean3(MIDSCENE_USE_VLM_UI_TARS)
+ isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
  });
  const assertionText = extraTextFromUserPrompt(assertion);
  const msgs = [
@@ -2735,7 +2511,7 @@ async function plan(userInstruction, opts) {
  const { screenshotBase64, size } = context;
  const { description: pageDescription, elementById } = await describeUserPage(context);
  const systemPrompt = await systemPromptToTaskPlanning({
- pageType: opts.pageType,
+ actionSpace: opts.actionSpace,
  vlMode: vlLocateMode5()
  });
  const taskBackgroundContextText = generateTaskBackgroundContext(
@@ -2835,7 +2611,7 @@ import {
  } from "@midscene/shared/env";
  import { resizeImgBase64 } from "@midscene/shared/img";
  import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
- import { getDebug as getDebug5 } from "@midscene/shared/logger";
+ import { getDebug as getDebug4 } from "@midscene/shared/logger";
  import { assert as assert7 } from "@midscene/shared/utils";
  import { actionParser } from "@ui-tars/action-parser";

@@ -2875,7 +2651,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
  var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();

  // src/ai-model/ui-tars-planning.ts
- var debug = getDebug5("ui-tars-planning");
+ var debug = getDebug4("ui-tars-planning");
  var bboxSize = 10;
  var pointToBbox = (point, width, height) => {
  return [
@@ -3117,8 +2893,6 @@ async function resizeImageForUiTars(imageBase64, size) {

  export {
  systemPromptToLocateElement,
- elementByPositionWithElementInfo,
- describeUserPage,
  call2 as call,
  callToGetJSONObject,
  callAiFnWithStringResponse,
@@ -3126,6 +2900,8 @@ export {
  callAiFn,
  adaptBboxToRect,
  expandSearchArea,
+ elementByPositionWithElementInfo,
+ describeUserPage,
  generateYamlTest,
  generateYamlTestStream,
  generatePlaywrightTest,
@@ -3139,4 +2915,4 @@ export {
  resizeImageForUiTars
  };

- //# sourceMappingURL=chunk-G2JTYWI6.js.map
+ //# sourceMappingURL=chunk-5IZMFZPA.js.map