npm - @midscene/core - Versions diffs - 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250805024613.0 - Mend

@midscene/core 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250805024613.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34)

package/dist/es/ai-model.d.ts +8 -4
package/dist/es/ai-model.js +3 -1
package/dist/es/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +114 -25
package/dist/es/chunk-JS4CT3XV.js.map +1 -0
package/dist/es/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
package/dist/es/index.d.ts +7 -9
package/dist/es/index.js +38 -18
package/dist/es/index.js.map +1 -1
package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
package/dist/es/{types-d836fa73.d.ts → types-512d3687.d.ts} +31 -12
package/dist/es/utils.d.ts +1 -1
package/dist/es/utils.js +1 -1
package/dist/lib/ai-model.d.ts +8 -4
package/dist/lib/ai-model.js +4 -2
package/dist/lib/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +120 -31
package/dist/lib/chunk-JS4CT3XV.js.map +1 -0
package/dist/lib/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
package/dist/lib/index.d.ts +7 -9
package/dist/lib/index.js +49 -29
package/dist/lib/index.js.map +1 -1
package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
package/dist/{types/types-d836fa73.d.ts → lib/types-512d3687.d.ts} +31 -12
package/dist/lib/utils.d.ts +1 -1
package/dist/lib/utils.js +2 -2
package/dist/types/ai-model.d.ts +8 -4
package/dist/types/index.d.ts +7 -9
package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
package/dist/{lib/types-d836fa73.d.ts → types/types-512d3687.d.ts} +31 -12
package/dist/types/utils.d.ts +1 -1
package/package.json +3 -3
package/dist/es/chunk-5HH6E7M4.js.map +0 -1
package/dist/lib/chunk-5HH6E7M4.js.map +0 -1
package/dist/es/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0
package/dist/lib/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0

package/dist/es/ai-model.d.ts CHANGED

@@ -1,9 +1,9 @@
-import { aA as StreamingCallback, m as AIUsageInfo, az as StreamingCodeGenerationOptions, aC as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-d836fa73.js';
+import { aC as StreamingCallback, o as AIUsageInfo, aB as StreamingCodeGenerationOptions, aE as StreamingAIResponse, X as PlanningAction, l as MidsceneYamlFlowItem } from './types-512d3687.js';
 import OpenAI from 'openai';
 import { ChatCompletionMessageParam } from 'openai/resources';
 export { ChatCompletionMessageParam } from 'openai/resources';
-import { b as AIActionType } from './llm-planning-d7096b0d.js';
-export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-d7096b0d.js';
+import { b as AIActionType, e as AIArgs } from './llm-planning-877248da.js';
+export { a as AiAssert, g as AiExtractElementInfo, A as AiLocateElement, h as AiLocateSection, i as adaptBboxToRect, c as callAiFn, d as describeUserPage, f as elementByPositionWithElementInfo, p as plan } from './llm-planning-877248da.js';
 import { vlLocateMode } from '@midscene/shared/env';
 import { actionParser } from '@ui-tars/action-parser';
 import { Size } from '@midscene/shared/types';
@@ -21,6 +21,10 @@ declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[],
     content: T;
     usage?: AIUsageInfo;
 }>;
+declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType): Promise<{
+    content: string;
+    usage?: AIUsageInfo;
+}>;
 declare function systemPromptToLocateElement(vlMode: ReturnType<typeof vlLocateMode>): string;
@@ -92,4 +96,4 @@ declare function vlmPlanning(options: {
 }>;
 declare function resizeImageForUiTars(imageBase64: string, size: Size): Promise<string>;
-export { AIActionType, call as callAi, callToGetJSONObject, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
+export { AIActionType, AIArgs, call as callAi, callAiFnWithStringResponse, callToGetJSONObject, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };

package/dist/es/ai-model.js CHANGED

@@ -7,6 +7,7 @@ import {
   adaptBboxToRect,
   call,
   callAiFn,
+  callAiFnWithStringResponse,
   callToGetJSONObject,
   describeUserPage,
   elementByPositionWithElementInfo,
@@ -18,7 +19,7 @@ import {
   resizeImageForUiTars,
   systemPromptToLocateElement,
   vlmPlanning
-} from "./chunk-5HH6E7M4.js";
+} from "./chunk-JS4CT3XV.js";
 export {
   AIActionType,
   AiAssert,
@@ -28,6 +29,7 @@ export {
   adaptBboxToRect,
   call as callAi,
   callAiFn,
+  callAiFnWithStringResponse,
   callToGetJSONObject,
   describeUserPage,
   elementByPositionWithElementInfo,

package/dist/es/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} RENAMED

@@ -56,10 +56,6 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
   return AIActionType2;
 })(AIActionType || {});
 async function callAiFn(msgs, AIActionTypeValue) {
-  assert(
-    checkAIConfig(),
-    "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
-  );
   const { content, usage } = await callToGetJSONObject(
     msgs,
     AIActionTypeValue
@@ -643,9 +639,9 @@ import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
 import {
   imageInfo,
   imageInfoOfBase64,
-  base64Encoded,
+  localImg2Base64,
+  httpImg2Base64,
   resizeImg,
-  transformImgPathToBase64,
   saveBase64Image,
   zoomForGPT4o
 } from "@midscene/shared/img";
@@ -1331,7 +1327,11 @@ Please check your config.`
   }
   throw new Error("Openai SDK or Anthropic SDK is not initialized");
 }
-async function call(messages, AIActionTypeValue, responseFormat, options) {
+async function call2(messages, AIActionTypeValue, responseFormat, options) {
+  assert3(
+    checkAIConfig(),
+    "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
+  );
   const { completion, style } = await createChatClient({
     AIActionTypeValue
   });
@@ -1576,11 +1576,15 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
   if (model === "gpt-4o-2024-05-13") {
     responseFormat = { type: "json_object" /* JSON */ };
   }
-  const response = await call(messages, AIActionTypeValue, responseFormat);
+  const response = await call2(messages, AIActionTypeValue, responseFormat);
   assert3(response, "empty response");
   const jsonContent = safeParseJson(response.content);
   return { content: jsonContent, usage: response.usage };
 }
+async function callAiFnWithStringResponse(msgs, AIActionTypeValue) {
+  const { content, usage } = await call2(msgs, AIActionTypeValue);
+  return { content, usage };
+}
 function extractJSONFromCodeBlock(response) {
   try {
     const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
@@ -1795,7 +1799,7 @@ Respond with YAML only, no explanations.`
         }))
       });
     }
-    const response = await call(prompt, 2 /* EXTRACT_DATA */);
+    const response = await call2(prompt, 2 /* EXTRACT_DATA */);
     if (response?.content && typeof response.content === "string") {
       return response.content;
     }
@@ -1857,12 +1861,12 @@ Respond with YAML only, no explanations.`
       });
     }
     if (options.stream && options.onChunk) {
-      return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
+      return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
         stream: true,
         onChunk: options.onChunk
       });
     } else {
-      const response = await call(prompt, 2 /* EXTRACT_DATA */);
+      const response = await call2(prompt, 2 /* EXTRACT_DATA */);
       if (response?.content && typeof response.content === "string") {
         return {
           content: response.content,
@@ -1925,7 +1929,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
       content: messageContent
     }
   ];
-  const response = await call(prompt, 2 /* EXTRACT_DATA */);
+  const response = await call2(prompt, 2 /* EXTRACT_DATA */);
   if (response?.content && typeof response.content === "string") {
     return response.content;
   }
@@ -1980,12 +1984,12 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
     }
   ];
   if (options.stream && options.onChunk) {
-    return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
+    return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
       stream: true,
       onChunk: options.onChunk
     });
   } else {
-    const response = await call(prompt, 2 /* EXTRACT_DATA */);
+    const response = await call2(prompt, 2 /* EXTRACT_DATA */);
     if (response?.content && typeof response.content === "string") {
       return {
         content: response.content,
@@ -2004,7 +2008,11 @@ import {
   getAIConfigInBoolean as getAIConfigInBoolean2,
   vlLocateMode as vlLocateMode4
 } from "@midscene/shared/env";
-import { cropByRect, paddingToMatchBlockByBase64 } from "@midscene/shared/img";
+import {
+  cropByRect,
+  paddingToMatchBlockByBase64,
+  preProcessImageUrl
+} from "@midscene/shared/img";
 import { getDebug as getDebug3 } from "@midscene/shared/logger";
 import { assert as assert4 } from "@midscene/shared/utils";
@@ -2018,6 +2026,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2168,55 @@ var sectionLocatorInstruction = new PromptTemplate4({
 // src/ai-model/inspect.ts
 var debugInspect = getDebug3("ai:inspect");
 var debugSection = getDebug3("ai:section");
+var extraTextFromUserPrompt = (prompt) => {
+  if (typeof prompt === "string") {
+    return prompt;
+  } else {
+    return prompt.prompt;
+  }
+};
+var promptsToChatParam = async (multimodalPrompt) => {
+  const msgs = [];
+  if (multimodalPrompt?.images?.length) {
+    msgs.push({
+      role: "user",
+      content: [
+        {
+          type: "text",
+          text: "Next, I will provide all the reference images."
+        }
+      ]
+    });
+    for (const item of multimodalPrompt.images) {
+      const base64 = await preProcessImageUrl(
+        item.url,
+        !!multimodalPrompt.convertHttpImage2Base64
+      );
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text: `reference image ${item.name}:`
+          }
+        ]
+      });
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "image_url",
+            image_url: {
+              url: base64,
+              detail: "high"
+            }
+          }
+        ]
+      });
+    }
+  }
+  return msgs;
+};
 async function AiLocateElement(options) {
   const { context, targetElementDescription, callAI } = options;
   const { screenshotBase64 } = context;
@@ -2168,7 +2227,7 @@ async function AiLocateElement(options) {
   );
   const userInstructionPrompt = await findElementPrompt.format({
     pageDescription: description,
-    targetElementDescription
+    targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
   });
   const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
   let imagePayload = screenshotBase64;
@@ -2210,6 +2269,13 @@ async function AiLocateElement(options) {
       ]
     }
   ];
+  if (typeof targetElementDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: targetElementDescription.images,
+      convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const callAIFn = callAI || callToGetJSONObject;
   const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
   const rawResponse = JSON.stringify(res.content);
@@ -2265,7 +2331,7 @@ async function AiLocateSection(options) {
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToLocateSection(vlLocateMode4());
   const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
-    sectionDescription
+    sectionDescription: extraTextFromUserPrompt(sectionDescription)
   });
   const msgs = [
     { role: "system", content: systemPrompt },
@@ -2286,6 +2352,13 @@ async function AiLocateSection(options) {
       ]
     }
   ];
+  if (typeof sectionDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: sectionDescription.images,
+      convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2327,7 +2400,7 @@ async function AiLocateSection(options) {
   };
 }
 async function AiExtractElementInfo(options) {
-  const { dataQuery, context, extractOption } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context, {
@@ -2361,6 +2434,13 @@ async function AiExtractElementInfo(options) {
       content: userContent
     }
   ];
+  if (multimodalPrompt) {
+    const addOns = await promptsToChatParam({
+      images: multimodalPrompt.images,
+      convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2373,11 +2453,12 @@ async function AiExtractElementInfo(options) {
 }
 async function AiAssert(options) {
   const { assertion, context } = options;
-  assert4(assertion, "assertion should be a string");
+  assert4(assertion, "assertion should not be empty");
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToAssert({
     isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
   });
+  const assertionText = extraTextFromUserPrompt(assertion);
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -2395,13 +2476,20 @@ async function AiAssert(options) {
           text: `
 Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
-${assertion}
+${assertionText}
 =====================================
   `
         }
       ]
     }
   ];
+  if (typeof assertion !== "string") {
+    const addOns = await promptsToChatParam({
+      images: assertion.images,
+      convertHttpImage2Base64: assertion.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const { content: assertResult, usage } = await callAiFn(
     msgs,
     0 /* ASSERT */
@@ -2465,8 +2553,8 @@ async function plan(userInstruction, opts) {
       ]
     }
   ];
-  const call2 = callAI || callAiFn;
-  const { content, usage } = await call2(msgs, 3 /* PLAN */);
+  const call3 = callAI || callAiFn;
+  const { content, usage } = await call3(msgs, 3 /* PLAN */);
   const rawResponse = JSON.stringify(content, void 0, 2);
   const planFromAI = content;
   const actions = (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];
@@ -2574,7 +2662,7 @@ var pointToBbox = (point, width, height) => {
 async function vlmPlanning(options) {
   const { conversationHistory, userInstruction, size } = options;
   const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
-  const res = await call(
+  const res = await call2(
     [
       {
         role: "user",
@@ -2805,8 +2893,9 @@ export {
   systemPromptToLocateElement,
   elementByPositionWithElementInfo,
   describeUserPage,
-  call,
+  call2 as call,
   callToGetJSONObject,
+  callAiFnWithStringResponse,
   AIActionType,
   callAiFn,
   adaptBboxToRect,
@@ -2824,4 +2913,4 @@ export {
   resizeImageForUiTars
 };
-//# sourceMappingURL=chunk-5HH6E7M4.js.map
+//# sourceMappingURL=chunk-JS4CT3XV.js.map