@midscene/core 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250801111909.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-FMBJ3CM2.js → chunk-2RCMQS5O.js} +3 -3
  4. package/dist/es/{chunk-5HH6E7M4.js → chunk-KFA65L55.js} +93 -9
  5. package/dist/es/chunk-KFA65L55.js.map +1 -0
  6. package/dist/es/index.d.ts +7 -9
  7. package/dist/es/index.js +4 -8
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  10. package/dist/es/{types-d836fa73.d.ts → types-7b64b80b.d.ts} +33 -13
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-FMBJ3CM2.js → chunk-2RCMQS5O.js} +3 -3
  16. package/dist/lib/{chunk-5HH6E7M4.js → chunk-KFA65L55.js} +99 -15
  17. package/dist/lib/chunk-KFA65L55.js.map +1 -0
  18. package/dist/lib/index.d.ts +7 -9
  19. package/dist/lib/index.js +14 -18
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  22. package/dist/{types/types-d836fa73.d.ts → lib/types-7b64b80b.d.ts} +33 -13
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +7 -9
  27. package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  28. package/dist/{lib/types-d836fa73.d.ts → types/types-7b64b80b.d.ts} +33 -13
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-5HH6E7M4.js.map +0 -1
  32. package/dist/lib/chunk-5HH6E7M4.js.map +0 -1
  33. package/dist/es/{chunk-FMBJ3CM2.js.map → chunk-2RCMQS5O.js.map} +0 -0
  34. package/dist/lib/{chunk-FMBJ3CM2.js.map → chunk-2RCMQS5O.js.map} +0 -0
@@ -643,9 +643,9 @@ import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
 import {
   imageInfo,
   imageInfoOfBase64,
-  base64Encoded,
+  localImg2Base64,
+  httpImg2Base64,
   resizeImg,
-  transformImgPathToBase64,
   saveBase64Image,
   zoomForGPT4o
 } from "@midscene/shared/img";
@@ -2004,7 +2004,11 @@ import {
   getAIConfigInBoolean as getAIConfigInBoolean2,
   vlLocateMode as vlLocateMode4
 } from "@midscene/shared/env";
-import { cropByRect, paddingToMatchBlockByBase64 } from "@midscene/shared/img";
+import {
+  cropByRect,
+  paddingToMatchBlockByBase64,
+  preProcessImageUrl
+} from "@midscene/shared/img";
 import { getDebug as getDebug3 } from "@midscene/shared/logger";
 import { assert as assert4 } from "@midscene/shared/utils";
 
@@ -2018,6 +2022,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
+
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2164,55 @@ var sectionLocatorInstruction = new PromptTemplate4({
 // src/ai-model/inspect.ts
 var debugInspect = getDebug3("ai:inspect");
 var debugSection = getDebug3("ai:section");
+var extraTextFromUserPrompt = (prompt) => {
+  if (typeof prompt === "string") {
+    return prompt;
+  } else {
+    return prompt.prompt;
+  }
+};
+var promptsToChatParam = async (multimodalPrompt) => {
+  const msgs = [];
+  if (multimodalPrompt?.images?.length) {
+    msgs.push({
+      role: "user",
+      content: [
+        {
+          type: "text",
+          text: "Next, I will provide all the reference images."
+        }
+      ]
+    });
+    for (const item of multimodalPrompt.images) {
+      const base64 = await preProcessImageUrl(
+        item.url,
+        !!multimodalPrompt.convertHttpImage2Base64
+      );
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text: `reference image ${item.name}:`
+          }
+        ]
+      });
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "image_url",
+            image_url: {
+              url: base64,
+              detail: "high"
+            }
+          }
+        ]
+      });
+    }
+  }
+  return msgs;
+};
 async function AiLocateElement(options) {
   const { context, targetElementDescription, callAI } = options;
   const { screenshotBase64 } = context;
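Taken together, the new promptsToChatParam helper prepends one introductory text message and then expands each reference image into a labelled text/image message pair. A minimal sketch of the add-on messages it would build for a single image, assuming only the shapes visible in this hunk (the image name and the data URL below are illustrative, not taken from the package):

// Sketch of the output of promptsToChatParam({
//   images: [{ name: "logo", url: "https://example.com/logo.png" }],
//   convertHttpImage2Base64: true
// })
const addOns = [
  {
    role: "user",
    content: [
      { type: "text", text: "Next, I will provide all the reference images." }
    ]
  },
  {
    role: "user",
    content: [{ type: "text", text: "reference image logo:" }]
  },
  {
    role: "user",
    content: [
      {
        type: "image_url",
        // preProcessImageUrl(item.url, true) is assumed to yield a data URL here
        image_url: { url: "data:image/png;base64,...", detail: "high" }
      }
    ]
  }
];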
@@ -2168,7 +2223,7 @@ async function AiLocateElement(options) {
   );
   const userInstructionPrompt = await findElementPrompt.format({
     pageDescription: description,
-    targetElementDescription
+    targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
   });
   const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
   let imagePayload = screenshotBase64;
@@ -2210,6 +2265,13 @@ async function AiLocateElement(options) {
       ]
     }
   ];
+  if (typeof targetElementDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: targetElementDescription.images,
+      convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const callAIFn = callAI || callToGetJSONObject;
   const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
   const rawResponse = JSON.stringify(res.content);
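As the typeof targetElementDescription !== "string" branch shows, the description may now be either a plain string or an object that also carries reference images. A hedged sketch of that union, with an illustrative type name (the package's actual exported type name is not visible in this diff; the field names are taken from the hunks above):

// Illustrative union, inferred from extraTextFromUserPrompt and promptsToChatParam.
type UserPromptSketch =
  | string
  | {
      prompt: string;                                // text part, unwrapped by extraTextFromUserPrompt
      images?: Array<{ name: string; url: string }>; // expanded by promptsToChatParam
      convertHttpImage2Base64?: boolean;             // forwarded to preProcessImageUrl
    };

The same union appears to apply to sectionDescription in AiLocateSection and assertion in AiAssert below.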
@@ -2265,7 +2327,7 @@ async function AiLocateSection(options) {
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToLocateSection(vlLocateMode4());
   const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
-    sectionDescription
+    sectionDescription: extraTextFromUserPrompt(sectionDescription)
   });
   const msgs = [
     { role: "system", content: systemPrompt },
@@ -2286,6 +2348,13 @@ async function AiLocateSection(options) {
       ]
     }
   ];
+  if (typeof sectionDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: sectionDescription.images,
+      convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2327,7 +2396,7 @@ async function AiLocateSection(options) {
   };
 }
 async function AiExtractElementInfo(options) {
-  const { dataQuery, context, extractOption } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context, {
@@ -2361,6 +2430,13 @@ async function AiExtractElementInfo(options) {
       content: userContent
     }
   ];
+  if (multimodalPrompt) {
+    const addOns = await promptsToChatParam({
+      images: multimodalPrompt.images,
+      convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
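Unlike the locate functions, AiExtractElementInfo receives the reference images through a separate multimodalPrompt option rather than widening dataQuery itself. A hedged call sketch under that assumption (the context construction and the exact return shape fall outside this hunk; the query and URL are illustrative):

const extracted = await AiExtractElementInfo({
  context, // assumed to carry screenshotBase64, as destructured above
  dataQuery: "{ price: Number }",
  multimodalPrompt: {
    images: [{ name: "reference", url: "https://example.com/reference.png" }],
    convertHttpImage2Base64: true
  }
});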
@@ -2373,11 +2449,12 @@ async function AiExtractElementInfo(options) {
 }
 async function AiAssert(options) {
   const { assertion, context } = options;
-  assert4(assertion, "assertion should be a string");
+  assert4(assertion, "assertion should not be empty");
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToAssert({
     isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
   });
+  const assertionText = extraTextFromUserPrompt(assertion);
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -2395,13 +2472,20 @@ async function AiAssert(options) {
           text: `
 Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
-${assertion}
+${assertionText}
 =====================================
 `
         }
       ]
     }
   ];
+  if (typeof assertion !== "string") {
+    const addOns = await promptsToChatParam({
+      images: assertion.images,
+      convertHttpImage2Base64: assertion.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const { content: assertResult, usage } = await callAiFn(
     msgs,
     0 /* ASSERT */
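Putting the AiAssert changes together: the assertion text is unwrapped with extraTextFromUserPrompt for the prompt template, and any reference images are appended as extra chat messages. A hedged usage sketch, using only the option names visible in this diff (the return shape is not shown in these hunks):

const result = await AiAssert({
  context, // assumed to carry screenshotBase64
  assertion: {
    prompt: "the login banner matches the reference image",
    images: [{ name: "banner", url: "https://example.com/banner.png" }],
    convertHttpImage2Base64: true
  }
});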
@@ -2824,4 +2908,4 @@ export {
   resizeImageForUiTars
 };
 
-//# sourceMappingURL=chunk-5HH6E7M4.js.map
+//# sourceMappingURL=chunk-KFA65L55.js.map