npm - @midscene/core - Versions diffs - 0.24.2-beta-20250731151311.0 → 0.24.2-beta-20250801111909.0 - Mend

@midscene/core 0.24.2-beta-20250731151311.0 → 0.24.2-beta-20250801111909.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (34) hide show

package/dist/es/ai-model.d.ts +3 -3
package/dist/es/ai-model.js +1 -1
package/dist/es/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
package/dist/es/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +94 -10
package/dist/es/chunk-KFA65L55.js.map +1 -0
package/dist/es/index.d.ts +7 -9
package/dist/es/index.js +4 -8
package/dist/es/index.js.map +1 -1
package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
package/dist/es/{types-d836fa73.d.ts → types-7b64b80b.d.ts} +33 -13
package/dist/es/utils.d.ts +1 -1
package/dist/es/utils.js +1 -1
package/dist/lib/ai-model.d.ts +3 -3
package/dist/lib/ai-model.js +2 -2
package/dist/lib/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
package/dist/lib/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +100 -16
package/dist/lib/chunk-KFA65L55.js.map +1 -0
package/dist/lib/index.d.ts +7 -9
package/dist/lib/index.js +14 -18
package/dist/lib/index.js.map +1 -1
package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
package/dist/{types/types-d836fa73.d.ts → lib/types-7b64b80b.d.ts} +33 -13
package/dist/lib/utils.d.ts +1 -1
package/dist/lib/utils.js +2 -2
package/dist/types/ai-model.d.ts +3 -3
package/dist/types/index.d.ts +7 -9
package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
package/dist/{lib/types-d836fa73.d.ts → types/types-7b64b80b.d.ts} +33 -13
package/dist/types/utils.d.ts +1 -1
package/package.json +3 -3
package/dist/es/chunk-FKQMUAXP.js.map +0 -1
package/dist/lib/chunk-FKQMUAXP.js.map +0 -1
/package/dist/es/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0
/package/dist/lib/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0

package/dist/lib/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} RENAMED Viewed

@@ -228,7 +228,7 @@ function mergeRects(rects) {
   };
 }
 function expandSearchArea(rect, screenSize) {
-  const minEdgeSize = 300;
+  const minEdgeSize = _env.vlLocateMode.call(void 0, ) === "doubao-vision" ? 500 : 300;
   const defaultPadding = 160;
   const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
   const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
@@ -2008,6 +2008,10 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
 // src/ai-model/prompt/extraction.ts
 function systemPromptToExtract() {
@@ -2018,6 +2022,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2164,55 @@ var sectionLocatorInstruction = new (0, _prompts.PromptTemplate)({
 // src/ai-model/inspect.ts
 var debugInspect = _logger.getDebug.call(void 0, "ai:inspect");
 var debugSection = _logger.getDebug.call(void 0, "ai:section");
+var extraTextFromUserPrompt = (prompt) => { // Returns the text of a user prompt given either as a plain string or as an object with a `prompt` field.
+  if (typeof prompt === "string") {
+    return prompt; // a string prompt is already the text
+  } else {
+    return prompt.prompt; // any non-string is read via its `prompt` property; NOTE(review): throws on null/undefined - confirm callers never pass that
+  }
+};
+var promptsToChatParam = async (multimodalPrompt) => { // Builds the extra "user" chat messages that carry a multimodal prompt's reference images; resolves to an array of messages.
+  const msgs = []; // returned unchanged (empty) when the guard below is falsy
+  if (_optionalChain([multimodalPrompt, 'optionalAccess', _55 => _55.images, 'optionalAccess', _56 => _56.length])) { // NOTE(review): looks like the transpiled form of multimodalPrompt?.images?.length - helper not visible here
+    msgs.push({ // one introductory text message, pushed once before any image
+      role: "user",
+      content: [
+        {
+          type: "text",
+          text: "Next, I will provide all the reference images."
+        }
+      ]
+    });
+    for (const item of multimodalPrompt.images) { // images are processed one at a time, in order (the await is inside the loop)
+      const base64 = await _img.preProcessImageUrl.call(void 0, // NOTE(review): presumably resolves to a URL/base64 string usable as image_url.url - helper defined elsewhere
+        item.url,
+        !!multimodalPrompt.convertHttpImage2Base64 // coerced to a strict boolean before being passed on
+      );
+      msgs.push({ // text label naming the image that follows
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text: `reference image ${item.name}:`
+          }
+        ]
+      });
+      msgs.push({ // the image itself, sent as its own message right after its label
+        role: "user",
+        content: [
+          {
+            type: "image_url",
+            image_url: {
+              url: base64,
+              detail: "high"
+            }
+          }
+        ]
+      });
+    }
+  }
+  return msgs; // 1 + 2 * images.length messages when images exist, otherwise []
+};
 async function AiLocateElement(options) {
   const { context, targetElementDescription, callAI } = options;
   const { screenshotBase64 } = context;
@@ -2168,7 +2223,7 @@ async function AiLocateElement(options) {
   );
   const userInstructionPrompt = await findElementPrompt.format({
     pageDescription: description,
-    targetElementDescription
+    targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
   });
   const systemPrompt = systemPromptToLocateElement(_env.vlLocateMode.call(void 0, ));
   let imagePayload = screenshotBase64;
@@ -2210,6 +2265,13 @@ async function AiLocateElement(options) {
       ]
     }
   ];
+  if (typeof targetElementDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: targetElementDescription.images,
+      convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const callAIFn = callAI || callToGetJSONObject;
   const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
   const rawResponse = JSON.stringify(res.content);
@@ -2220,10 +2282,10 @@ async function AiLocateElement(options) {
     if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
       resRect = adaptBboxToRect(
         res.content.bbox,
-        _optionalChain([options, 'access', _55 => _55.searchConfig, 'optionalAccess', _56 => _56.rect, 'optionalAccess', _57 => _57.width]) || context.size.width,
-        _optionalChain([options, 'access', _58 => _58.searchConfig, 'optionalAccess', _59 => _59.rect, 'optionalAccess', _60 => _60.height]) || context.size.height,
-        _optionalChain([options, 'access', _61 => _61.searchConfig, 'optionalAccess', _62 => _62.rect, 'optionalAccess', _63 => _63.left]),
-        _optionalChain([options, 'access', _64 => _64.searchConfig, 'optionalAccess', _65 => _65.rect, 'optionalAccess', _66 => _66.top])
+        _optionalChain([options, 'access', _57 => _57.searchConfig, 'optionalAccess', _58 => _58.rect, 'optionalAccess', _59 => _59.width]) || context.size.width,
+        _optionalChain([options, 'access', _60 => _60.searchConfig, 'optionalAccess', _61 => _61.rect, 'optionalAccess', _62 => _62.height]) || context.size.height,
+        _optionalChain([options, 'access', _63 => _63.searchConfig, 'optionalAccess', _64 => _64.rect, 'optionalAccess', _65 => _65.left]),
+        _optionalChain([options, 'access', _66 => _66.searchConfig, 'optionalAccess', _67 => _67.rect, 'optionalAccess', _68 => _68.top])
       );
       debugInspect("resRect", resRect);
       const rectCenter = {
@@ -2242,7 +2304,7 @@ async function AiLocateElement(options) {
     }
   } catch (e) {
     const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
-    if (!errors || _optionalChain([errors, 'optionalAccess', _67 => _67.length]) === 0) {
+    if (!errors || _optionalChain([errors, 'optionalAccess', _69 => _69.length]) === 0) {
       errors = [msg];
     } else {
       errors.push(`(${msg})`);
@@ -2265,7 +2327,7 @@ async function AiLocateSection(options) {
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToLocateSection(_env.vlLocateMode.call(void 0, ));
   const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
-    sectionDescription
+    sectionDescription: extraTextFromUserPrompt(sectionDescription)
   });
   const msgs = [
     { role: "system", content: systemPrompt },
@@ -2286,6 +2348,13 @@ async function AiLocateSection(options) {
       ]
     }
   ];
+  if (typeof sectionDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: sectionDescription.images,
+      convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2327,21 +2396,21 @@ async function AiLocateSection(options) {
   };
 }
 async function AiExtractElementInfo(options) {
-  const { dataQuery, context, extractOption } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context, {
     truncateTextLength: 200,
     filterNonTextContent: false,
     visibleOnly: false,
-    domIncluded: _optionalChain([extractOption, 'optionalAccess', _68 => _68.domIncluded])
+    domIncluded: _optionalChain([extractOption, 'optionalAccess', _70 => _70.domIncluded])
   });
   const extractDataPromptText = await extractDataQueryPrompt(
     description,
     dataQuery
   );
   const userContent = [];
-  if (_optionalChain([extractOption, 'optionalAccess', _69 => _69.screenshotIncluded]) !== false) {
+  if (_optionalChain([extractOption, 'optionalAccess', _71 => _71.screenshotIncluded]) !== false) {
     userContent.push({
       type: "image_url",
       image_url: {
@@ -2361,6 +2430,13 @@ async function AiExtractElementInfo(options) {
       content: userContent
     }
   ];
+  if (multimodalPrompt) {
+    const addOns = await promptsToChatParam({
+      images: multimodalPrompt.images,
+      convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2373,11 +2449,12 @@ async function AiExtractElementInfo(options) {
 }
 async function AiAssert(options) {
   const { assertion, context } = options;
-  _utils.assert.call(void 0, assertion, "assertion should be a string");
+  _utils.assert.call(void 0, assertion, "assertion should not be empty");
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToAssert({
     isUITars: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS)
   });
+  const assertionText = extraTextFromUserPrompt(assertion);
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -2395,13 +2472,20 @@ async function AiAssert(options) {
           text: `
 Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
-${assertion}
+${assertionText}
 =====================================
   `
         }
       ]
     }
   ];
+  if (typeof assertion !== "string") {
+    const addOns = await promptsToChatParam({
+      images: assertion.images,
+      convertHttpImage2Base64: assertion.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const { content: assertResult, usage } = await callAiFn(
     msgs,
     0 /* ASSERT */
@@ -2469,7 +2553,7 @@ async function plan(userInstruction, opts) {
   const { content, usage } = await call2(msgs, 3 /* PLAN */);
   const rawResponse = JSON.stringify(content, void 0, 2);
   const planFromAI = content;
-  const actions = (_optionalChain([planFromAI, 'access', _70 => _70.action, 'optionalAccess', _71 => _71.type]) ? [planFromAI.action] : planFromAI.actions) || [];
+  const actions = (_optionalChain([planFromAI, 'access', _72 => _72.action, 'optionalAccess', _73 => _73.type]) ? [planFromAI.action] : planFromAI.actions) || [];
   const returnValue = {
     ...planFromAI,
     actions,
@@ -2496,7 +2580,7 @@ async function plan(userInstruction, opts) {
     _utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
   } else {
     actions.forEach((action) => {
-      if (_optionalChain([action, 'access', _72 => _72.locate, 'optionalAccess', _73 => _73.id])) {
+      if (_optionalChain([action, 'access', _74 => _74.locate, 'optionalAccess', _75 => _75.id])) {
         const element = elementById(action.locate.id);
         if (element) {
           action.locate.id = element.id;
@@ -2824,4 +2908,4 @@ async function resizeImageForUiTars(imageBase64, size) {
 exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call; exports.callToGetJSONObject = callToGetJSONObject; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
-//# sourceMappingURL=chunk-FKQMUAXP.js.map
+//# sourceMappingURL=chunk-KFA65L55.js.map