@midscene/core 0.24.2-beta-20250731151311.0 → 0.24.2-beta-20250801111909.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
  4. package/dist/es/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +94 -10
  5. package/dist/es/chunk-KFA65L55.js.map +1 -0
  6. package/dist/es/index.d.ts +7 -9
  7. package/dist/es/index.js +4 -8
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  10. package/dist/es/{types-d836fa73.d.ts → types-7b64b80b.d.ts} +33 -13
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-5HT7CBNE.js → chunk-2RCMQS5O.js} +3 -3
  16. package/dist/lib/{chunk-FKQMUAXP.js → chunk-KFA65L55.js} +100 -16
  17. package/dist/lib/chunk-KFA65L55.js.map +1 -0
  18. package/dist/lib/index.d.ts +7 -9
  19. package/dist/lib/index.js +14 -18
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  22. package/dist/{types/types-d836fa73.d.ts → lib/types-7b64b80b.d.ts} +33 -13
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +7 -9
  27. package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-4c782a8d.d.ts} +6 -5
  28. package/dist/{lib/types-d836fa73.d.ts → types/types-7b64b80b.d.ts} +33 -13
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-FKQMUAXP.js.map +0 -1
  32. package/dist/lib/chunk-FKQMUAXP.js.map +0 -1
  33. /package/dist/es/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0
  34. /package/dist/lib/{chunk-5HT7CBNE.js.map → chunk-2RCMQS5O.js.map} +0 -0
@@ -228,7 +228,7 @@ function mergeRects(rects) {
228
228
  };
229
229
  }
230
230
  function expandSearchArea(rect, screenSize) {
231
- const minEdgeSize = 300;
231
+ const minEdgeSize = _env.vlLocateMode.call(void 0, ) === "doubao-vision" ? 500 : 300;
232
232
  const defaultPadding = 160;
233
233
  const paddingSizeHorizontal = rect.width < minEdgeSize ? Math.ceil((minEdgeSize - rect.width) / 2) : defaultPadding;
234
234
  const paddingSizeVertical = rect.height < minEdgeSize ? Math.ceil((minEdgeSize - rect.height) / 2) : defaultPadding;
@@ -2008,6 +2008,10 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
2008
2008
 
2009
2009
 
2010
2010
 
2011
+
2012
+
2013
+
2014
+
2011
2015
  // src/ai-model/prompt/extraction.ts
2012
2016
 
2013
2017
  function systemPromptToExtract() {
@@ -2018,6 +2022,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
2018
2022
 
2019
2023
  If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
2020
2024
 
2025
+ If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
2026
+
2021
2027
  Return in the following JSON format:
2022
2028
  {
2023
2029
  data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2164,55 @@ var sectionLocatorInstruction = new (0, _prompts.PromptTemplate)({
2158
2164
  // src/ai-model/inspect.ts
2159
2165
  var debugInspect = _logger.getDebug.call(void 0, "ai:inspect");
2160
2166
  var debugSection = _logger.getDebug.call(void 0, "ai:section");
2167
+ var extraTextFromUserPrompt = (prompt) => {
2168
+ if (typeof prompt === "string") {
2169
+ return prompt;
2170
+ } else {
2171
+ return prompt.prompt;
2172
+ }
2173
+ };
2174
+ var promptsToChatParam = async (multimodalPrompt) => {
2175
+ const msgs = [];
2176
+ if (_optionalChain([multimodalPrompt, 'optionalAccess', _55 => _55.images, 'optionalAccess', _56 => _56.length])) {
2177
+ msgs.push({
2178
+ role: "user",
2179
+ content: [
2180
+ {
2181
+ type: "text",
2182
+ text: "Next, I will provide all the reference images."
2183
+ }
2184
+ ]
2185
+ });
2186
+ for (const item of multimodalPrompt.images) {
2187
+ const base64 = await _img.preProcessImageUrl.call(void 0,
2188
+ item.url,
2189
+ !!multimodalPrompt.convertHttpImage2Base64
2190
+ );
2191
+ msgs.push({
2192
+ role: "user",
2193
+ content: [
2194
+ {
2195
+ type: "text",
2196
+ text: `reference image ${item.name}:`
2197
+ }
2198
+ ]
2199
+ });
2200
+ msgs.push({
2201
+ role: "user",
2202
+ content: [
2203
+ {
2204
+ type: "image_url",
2205
+ image_url: {
2206
+ url: base64,
2207
+ detail: "high"
2208
+ }
2209
+ }
2210
+ ]
2211
+ });
2212
+ }
2213
+ }
2214
+ return msgs;
2215
+ };
2161
2216
  async function AiLocateElement(options) {
2162
2217
  const { context, targetElementDescription, callAI } = options;
2163
2218
  const { screenshotBase64 } = context;
@@ -2168,7 +2223,7 @@ async function AiLocateElement(options) {
2168
2223
  );
2169
2224
  const userInstructionPrompt = await findElementPrompt.format({
2170
2225
  pageDescription: description,
2171
- targetElementDescription
2226
+ targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
2172
2227
  });
2173
2228
  const systemPrompt = systemPromptToLocateElement(_env.vlLocateMode.call(void 0, ));
2174
2229
  let imagePayload = screenshotBase64;
@@ -2210,6 +2265,13 @@ async function AiLocateElement(options) {
2210
2265
  ]
2211
2266
  }
2212
2267
  ];
2268
+ if (typeof targetElementDescription !== "string") {
2269
+ const addOns = await promptsToChatParam({
2270
+ images: targetElementDescription.images,
2271
+ convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
2272
+ });
2273
+ msgs.push(...addOns);
2274
+ }
2213
2275
  const callAIFn = callAI || callToGetJSONObject;
2214
2276
  const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
2215
2277
  const rawResponse = JSON.stringify(res.content);
@@ -2220,10 +2282,10 @@ async function AiLocateElement(options) {
2220
2282
  if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
2221
2283
  resRect = adaptBboxToRect(
2222
2284
  res.content.bbox,
2223
- _optionalChain([options, 'access', _55 => _55.searchConfig, 'optionalAccess', _56 => _56.rect, 'optionalAccess', _57 => _57.width]) || context.size.width,
2224
- _optionalChain([options, 'access', _58 => _58.searchConfig, 'optionalAccess', _59 => _59.rect, 'optionalAccess', _60 => _60.height]) || context.size.height,
2225
- _optionalChain([options, 'access', _61 => _61.searchConfig, 'optionalAccess', _62 => _62.rect, 'optionalAccess', _63 => _63.left]),
2226
- _optionalChain([options, 'access', _64 => _64.searchConfig, 'optionalAccess', _65 => _65.rect, 'optionalAccess', _66 => _66.top])
2285
+ _optionalChain([options, 'access', _57 => _57.searchConfig, 'optionalAccess', _58 => _58.rect, 'optionalAccess', _59 => _59.width]) || context.size.width,
2286
+ _optionalChain([options, 'access', _60 => _60.searchConfig, 'optionalAccess', _61 => _61.rect, 'optionalAccess', _62 => _62.height]) || context.size.height,
2287
+ _optionalChain([options, 'access', _63 => _63.searchConfig, 'optionalAccess', _64 => _64.rect, 'optionalAccess', _65 => _65.left]),
2288
+ _optionalChain([options, 'access', _66 => _66.searchConfig, 'optionalAccess', _67 => _67.rect, 'optionalAccess', _68 => _68.top])
2227
2289
  );
2228
2290
  debugInspect("resRect", resRect);
2229
2291
  const rectCenter = {
@@ -2242,7 +2304,7 @@ async function AiLocateElement(options) {
2242
2304
  }
2243
2305
  } catch (e) {
2244
2306
  const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
2245
- if (!errors || _optionalChain([errors, 'optionalAccess', _67 => _67.length]) === 0) {
2307
+ if (!errors || _optionalChain([errors, 'optionalAccess', _69 => _69.length]) === 0) {
2246
2308
  errors = [msg];
2247
2309
  } else {
2248
2310
  errors.push(`(${msg})`);
@@ -2265,7 +2327,7 @@ async function AiLocateSection(options) {
2265
2327
  const { screenshotBase64 } = context;
2266
2328
  const systemPrompt = systemPromptToLocateSection(_env.vlLocateMode.call(void 0, ));
2267
2329
  const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
2268
- sectionDescription
2330
+ sectionDescription: extraTextFromUserPrompt(sectionDescription)
2269
2331
  });
2270
2332
  const msgs = [
2271
2333
  { role: "system", content: systemPrompt },
@@ -2286,6 +2348,13 @@ async function AiLocateSection(options) {
2286
2348
  ]
2287
2349
  }
2288
2350
  ];
2351
+ if (typeof sectionDescription !== "string") {
2352
+ const addOns = await promptsToChatParam({
2353
+ images: sectionDescription.images,
2354
+ convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
2355
+ });
2356
+ msgs.push(...addOns);
2357
+ }
2289
2358
  const result = await callAiFn(
2290
2359
  msgs,
2291
2360
  2 /* EXTRACT_DATA */
@@ -2327,21 +2396,21 @@ async function AiLocateSection(options) {
2327
2396
  };
2328
2397
  }
2329
2398
  async function AiExtractElementInfo(options) {
2330
- const { dataQuery, context, extractOption } = options;
2399
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
2331
2400
  const systemPrompt = systemPromptToExtract();
2332
2401
  const { screenshotBase64 } = context;
2333
2402
  const { description, elementById } = await describeUserPage(context, {
2334
2403
  truncateTextLength: 200,
2335
2404
  filterNonTextContent: false,
2336
2405
  visibleOnly: false,
2337
- domIncluded: _optionalChain([extractOption, 'optionalAccess', _68 => _68.domIncluded])
2406
+ domIncluded: _optionalChain([extractOption, 'optionalAccess', _70 => _70.domIncluded])
2338
2407
  });
2339
2408
  const extractDataPromptText = await extractDataQueryPrompt(
2340
2409
  description,
2341
2410
  dataQuery
2342
2411
  );
2343
2412
  const userContent = [];
2344
- if (_optionalChain([extractOption, 'optionalAccess', _69 => _69.screenshotIncluded]) !== false) {
2413
+ if (_optionalChain([extractOption, 'optionalAccess', _71 => _71.screenshotIncluded]) !== false) {
2345
2414
  userContent.push({
2346
2415
  type: "image_url",
2347
2416
  image_url: {
@@ -2361,6 +2430,13 @@ async function AiExtractElementInfo(options) {
2361
2430
  content: userContent
2362
2431
  }
2363
2432
  ];
2433
+ if (multimodalPrompt) {
2434
+ const addOns = await promptsToChatParam({
2435
+ images: multimodalPrompt.images,
2436
+ convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
2437
+ });
2438
+ msgs.push(...addOns);
2439
+ }
2364
2440
  const result = await callAiFn(
2365
2441
  msgs,
2366
2442
  2 /* EXTRACT_DATA */
@@ -2373,11 +2449,12 @@ async function AiExtractElementInfo(options) {
2373
2449
  }
2374
2450
  async function AiAssert(options) {
2375
2451
  const { assertion, context } = options;
2376
- _utils.assert.call(void 0, assertion, "assertion should be a string");
2452
+ _utils.assert.call(void 0, assertion, "assertion should not be empty");
2377
2453
  const { screenshotBase64 } = context;
2378
2454
  const systemPrompt = systemPromptToAssert({
2379
2455
  isUITars: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS)
2380
2456
  });
2457
+ const assertionText = extraTextFromUserPrompt(assertion);
2381
2458
  const msgs = [
2382
2459
  { role: "system", content: systemPrompt },
2383
2460
  {
@@ -2395,13 +2472,20 @@ async function AiAssert(options) {
2395
2472
  text: `
2396
2473
  Here is the assertion. Please tell whether it is truthy according to the screenshot.
2397
2474
  =====================================
2398
- ${assertion}
2475
+ ${assertionText}
2399
2476
  =====================================
2400
2477
  `
2401
2478
  }
2402
2479
  ]
2403
2480
  }
2404
2481
  ];
2482
+ if (typeof assertion !== "string") {
2483
+ const addOns = await promptsToChatParam({
2484
+ images: assertion.images,
2485
+ convertHttpImage2Base64: assertion.convertHttpImage2Base64
2486
+ });
2487
+ msgs.push(...addOns);
2488
+ }
2405
2489
  const { content: assertResult, usage } = await callAiFn(
2406
2490
  msgs,
2407
2491
  0 /* ASSERT */
@@ -2469,7 +2553,7 @@ async function plan(userInstruction, opts) {
2469
2553
  const { content, usage } = await call2(msgs, 3 /* PLAN */);
2470
2554
  const rawResponse = JSON.stringify(content, void 0, 2);
2471
2555
  const planFromAI = content;
2472
- const actions = (_optionalChain([planFromAI, 'access', _70 => _70.action, 'optionalAccess', _71 => _71.type]) ? [planFromAI.action] : planFromAI.actions) || [];
2556
+ const actions = (_optionalChain([planFromAI, 'access', _72 => _72.action, 'optionalAccess', _73 => _73.type]) ? [planFromAI.action] : planFromAI.actions) || [];
2473
2557
  const returnValue = {
2474
2558
  ...planFromAI,
2475
2559
  actions,
@@ -2496,7 +2580,7 @@ async function plan(userInstruction, opts) {
2496
2580
  _utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
2497
2581
  } else {
2498
2582
  actions.forEach((action) => {
2499
- if (_optionalChain([action, 'access', _72 => _72.locate, 'optionalAccess', _73 => _73.id])) {
2583
+ if (_optionalChain([action, 'access', _74 => _74.locate, 'optionalAccess', _75 => _75.id])) {
2500
2584
  const element = elementById(action.locate.id);
2501
2585
  if (element) {
2502
2586
  action.locate.id = element.id;
@@ -2824,4 +2908,4 @@ async function resizeImageForUiTars(imageBase64, size) {
2824
2908
 
2825
2909
  exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call; exports.callToGetJSONObject = callToGetJSONObject; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2826
2910
 
2827
- //# sourceMappingURL=chunk-FKQMUAXP.js.map
2911
+ //# sourceMappingURL=chunk-KFA65L55.js.map