@midscene/core 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250801111909.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-FMBJ3CM2.js → chunk-2RCMQS5O.js} +3 -3
  4. package/dist/es/{chunk-5HH6E7M4.js → chunk-KFA65L55.js} +93 -9
  5. package/dist/es/chunk-KFA65L55.js.map +1 -0
  6. package/dist/es/index.d.ts +7 -9
  7. package/dist/es/index.js +4 -8
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  10. package/dist/es/{types-d836fa73.d.ts → types-7b64b80b.d.ts} +33 -13
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-FMBJ3CM2.js → chunk-2RCMQS5O.js} +3 -3
  16. package/dist/lib/{chunk-5HH6E7M4.js → chunk-KFA65L55.js} +99 -15
  17. package/dist/lib/chunk-KFA65L55.js.map +1 -0
  18. package/dist/lib/index.d.ts +7 -9
  19. package/dist/lib/index.js +14 -18
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  22. package/dist/{types/types-d836fa73.d.ts → lib/types-7b64b80b.d.ts} +33 -13
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +7 -9
  27. package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  28. package/dist/{lib/types-d836fa73.d.ts → types/types-7b64b80b.d.ts} +33 -13
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-5HH6E7M4.js.map +0 -1
  32. package/dist/lib/chunk-5HH6E7M4.js.map +0 -1
  33. package/dist/es/{chunk-FMBJ3CM2.js.map → chunk-2RCMQS5O.js.map} +0 -0
  34. package/dist/lib/{chunk-FMBJ3CM2.js.map → chunk-2RCMQS5O.js.map} +0 -0
@@ -643,9 +643,9 @@ import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
 import {
   imageInfo,
   imageInfoOfBase64,
-  base64Encoded,
+  localImg2Base64,
+  httpImg2Base64,
   resizeImg,
-  transformImgPathToBase64,
   saveBase64Image,
   zoomForGPT4o
 } from "@midscene/shared/img";
@@ -2004,7 +2004,11 @@ import {
   getAIConfigInBoolean as getAIConfigInBoolean2,
   vlLocateMode as vlLocateMode4
 } from "@midscene/shared/env";
-import { cropByRect, paddingToMatchBlockByBase64 } from "@midscene/shared/img";
+import {
+  cropByRect,
+  paddingToMatchBlockByBase64,
+  preProcessImageUrl
+} from "@midscene/shared/img";
 import { getDebug as getDebug3 } from "@midscene/shared/logger";
 import { assert as assert4 } from "@midscene/shared/utils";
 
@@ -2018,6 +2022,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
 
 If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
 
+If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
+
 Return in the following JSON format:
 {
   data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2164,55 @@ var sectionLocatorInstruction = new PromptTemplate4({
 // src/ai-model/inspect.ts
 var debugInspect = getDebug3("ai:inspect");
 var debugSection = getDebug3("ai:section");
+var extraTextFromUserPrompt = (prompt) => {
+  if (typeof prompt === "string") {
+    return prompt;
+  } else {
+    return prompt.prompt;
+  }
+};
+var promptsToChatParam = async (multimodalPrompt) => {
+  const msgs = [];
+  if (multimodalPrompt?.images?.length) {
+    msgs.push({
+      role: "user",
+      content: [
+        {
+          type: "text",
+          text: "Next, I will provide all the reference images."
+        }
+      ]
+    });
+    for (const item of multimodalPrompt.images) {
+      const base64 = await preProcessImageUrl(
+        item.url,
+        !!multimodalPrompt.convertHttpImage2Base64
+      );
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "text",
+            text: `reference image ${item.name}:`
+          }
+        ]
+      });
+      msgs.push({
+        role: "user",
+        content: [
+          {
+            type: "image_url",
+            image_url: {
+              url: base64,
+              detail: "high"
+            }
+          }
+        ]
+      });
+    }
+  }
+  return msgs;
+};
 async function AiLocateElement(options) {
   const { context, targetElementDescription, callAI } = options;
   const { screenshotBase64 } = context;
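Taken together, the new promptsToChatParam helper prepends one introductory text message and then expands each reference image into a labelled text/image message pair. A minimal sketch of the add-on messages it would build for a single image, assuming only the shapes visible in this hunk (the image name and the data URL below are illustrative, not taken from the package):

// Sketch of the output of promptsToChatParam({
//   images: [{ name: "logo", url: "https://example.com/logo.png" }],
//   convertHttpImage2Base64: true
// })
const addOns = [
  {
    role: "user",
    content: [
      { type: "text", text: "Next, I will provide all the reference images." }
    ]
  },
  {
    role: "user",
    content: [{ type: "text", text: "reference image logo:" }]
  },
  {
    role: "user",
    content: [
      {
        type: "image_url",
        // preProcessImageUrl(item.url, true) is assumed to yield a data URL here
        image_url: { url: "data:image/png;base64,...", detail: "high" }
      }
    ]
  }
];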
@@ -2168,7 +2223,7 @@ async function AiLocateElement(options) {
   );
   const userInstructionPrompt = await findElementPrompt.format({
     pageDescription: description,
-    targetElementDescription
+    targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
   });
   const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
   let imagePayload = screenshotBase64;
@@ -2210,6 +2265,13 @@ async function AiLocateElement(options) {
       ]
     }
   ];
+  if (typeof targetElementDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: targetElementDescription.images,
+      convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const callAIFn = callAI || callToGetJSONObject;
   const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
   const rawResponse = JSON.stringify(res.content);
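As the typeof targetElementDescription !== "string" branch shows, the description may now be either a plain string or an object that also carries reference images. A hedged sketch of that union, with an illustrative type name (the package's actual exported type name is not visible in this diff; the field names are taken from the hunks above):

// Illustrative union, inferred from extraTextFromUserPrompt and promptsToChatParam.
type UserPromptSketch =
  | string
  | {
      prompt: string;                                // text part, unwrapped by extraTextFromUserPrompt
      images?: Array<{ name: string; url: string }>; // expanded by promptsToChatParam
      convertHttpImage2Base64?: boolean;             // forwarded to preProcessImageUrl
    };

The same union appears to apply to sectionDescription in AiLocateSection and assertion in AiAssert below.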
@@ -2265,7 +2327,7 @@ async function AiLocateSection(options) {
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToLocateSection(vlLocateMode4());
   const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
-    sectionDescription
+    sectionDescription: extraTextFromUserPrompt(sectionDescription)
   });
   const msgs = [
     { role: "system", content: systemPrompt },
@@ -2286,6 +2348,13 @@ async function AiLocateSection(options) {
       ]
     }
   ];
+  if (typeof sectionDescription !== "string") {
+    const addOns = await promptsToChatParam({
+      images: sectionDescription.images,
+      convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
@@ -2327,7 +2396,7 @@ async function AiLocateSection(options) {
   };
 }
 async function AiExtractElementInfo(options) {
-  const { dataQuery, context, extractOption } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context, {
@@ -2361,6 +2430,13 @@ async function AiExtractElementInfo(options) {
       content: userContent
     }
   ];
+  if (multimodalPrompt) {
+    const addOns = await promptsToChatParam({
+      images: multimodalPrompt.images,
+      convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const result = await callAiFn(
     msgs,
     2 /* EXTRACT_DATA */
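Unlike the locate functions, AiExtractElementInfo receives the reference images through a separate multimodalPrompt option rather than widening dataQuery itself. A hedged call sketch under that assumption (the context construction and the exact return shape fall outside this hunk; the query and URL are illustrative):

const extracted = await AiExtractElementInfo({
  context, // assumed to carry screenshotBase64, as destructured above
  dataQuery: "{ price: Number }",
  multimodalPrompt: {
    images: [{ name: "reference", url: "https://example.com/reference.png" }],
    convertHttpImage2Base64: true
  }
});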
@@ -2373,11 +2449,12 @@ async function AiExtractElementInfo(options) {
 }
 async function AiAssert(options) {
   const { assertion, context } = options;
-  assert4(assertion, "assertion should be a string");
+  assert4(assertion, "assertion should not be empty");
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToAssert({
     isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
   });
+  const assertionText = extraTextFromUserPrompt(assertion);
   const msgs = [
     { role: "system", content: systemPrompt },
     {
@@ -2395,13 +2472,20 @@ async function AiAssert(options) {
           text: `
 Here is the assertion. Please tell whether it is truthy according to the screenshot.
 =====================================
-${assertion}
+${assertionText}
 =====================================
 `
         }
       ]
     }
   ];
+  if (typeof assertion !== "string") {
+    const addOns = await promptsToChatParam({
+      images: assertion.images,
+      convertHttpImage2Base64: assertion.convertHttpImage2Base64
+    });
+    msgs.push(...addOns);
+  }
   const { content: assertResult, usage } = await callAiFn(
     msgs,
     0 /* ASSERT */
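Putting the AiAssert changes together: the assertion text is unwrapped with extraTextFromUserPrompt for the prompt template, and any reference images are appended as extra chat messages. A hedged usage sketch, using only the option names visible in this diff (the return shape is not shown in these hunks):

const result = await AiAssert({
  context, // assumed to carry screenshotBase64
  assertion: {
    prompt: "the login banner matches the reference image",
    images: [{ name: "banner", url: "https://example.com/banner.png" }],
    convertHttpImage2Base64: true
  }
});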
@@ -2824,4 +2908,4 @@ export {
   resizeImageForUiTars
 };
 
-//# sourceMappingURL=chunk-5HH6E7M4.js.map
+//# sourceMappingURL=chunk-KFA65L55.js.map