@midscene/core 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250805024613.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +8 -4
  2. package/dist/es/ai-model.js +3 -1
  3. package/dist/es/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +114 -25
  4. package/dist/es/chunk-JS4CT3XV.js.map +1 -0
  5. package/dist/es/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
  6. package/dist/es/index.d.ts +7 -9
  7. package/dist/es/index.js +38 -18
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  10. package/dist/es/{types-d836fa73.d.ts → types-512d3687.d.ts} +31 -12
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +8 -4
  14. package/dist/lib/ai-model.js +4 -2
  15. package/dist/lib/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +120 -31
  16. package/dist/lib/chunk-JS4CT3XV.js.map +1 -0
  17. package/dist/lib/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
  18. package/dist/lib/index.d.ts +7 -9
  19. package/dist/lib/index.js +49 -29
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  22. package/dist/{types/types-d836fa73.d.ts → lib/types-512d3687.d.ts} +31 -12
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +8 -4
  26. package/dist/types/index.d.ts +7 -9
  27. package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  28. package/dist/{lib/types-d836fa73.d.ts → types/types-512d3687.d.ts} +31 -12
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-5HH6E7M4.js.map +0 -1
  32. package/dist/lib/chunk-5HH6E7M4.js.map +0 -1
  33. package/dist/es/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0
  34. package/dist/lib/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0
@@ -56,10 +56,6 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
56
56
  return AIActionType2;
57
57
  })(AIActionType || {});
58
58
  async function callAiFn(msgs, AIActionTypeValue) {
59
- _utils.assert.call(void 0,
60
- checkAIConfig(),
61
- "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
62
- );
63
59
  const { content, usage } = await callToGetJSONObject(
64
60
  msgs,
65
61
  AIActionTypeValue
@@ -1331,7 +1327,11 @@ Please check your config.`
1331
1327
  }
1332
1328
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
1333
1329
  }
1334
- async function call(messages, AIActionTypeValue, responseFormat, options) {
1330
+ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1331
+ _utils.assert.call(void 0,
1332
+ checkAIConfig(),
1333
+ "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
1334
+ );
1335
1335
  const { completion, style } = await createChatClient({
1336
1336
  AIActionTypeValue
1337
1337
  });
@@ -1576,11 +1576,15 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
1576
1576
  if (model === "gpt-4o-2024-05-13") {
1577
1577
  responseFormat = { type: "json_object" /* JSON */ };
1578
1578
  }
1579
- const response = await call(messages, AIActionTypeValue, responseFormat);
1579
+ const response = await call2(messages, AIActionTypeValue, responseFormat);
1580
1580
  _utils.assert.call(void 0, response, "empty response");
1581
1581
  const jsonContent = safeParseJson(response.content);
1582
1582
  return { content: jsonContent, usage: response.usage };
1583
1583
  }
1584
+ async function callAiFnWithStringResponse(msgs, AIActionTypeValue) {
1585
+ const { content, usage } = await call2(msgs, AIActionTypeValue);
1586
+ return { content, usage };
1587
+ }
1584
1588
  function extractJSONFromCodeBlock(response) {
1585
1589
  try {
1586
1590
  const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
@@ -1795,7 +1799,7 @@ Respond with YAML only, no explanations.`
1795
1799
  }))
1796
1800
  });
1797
1801
  }
1798
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1802
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1799
1803
  if (_optionalChain([response, 'optionalAccess', _51 => _51.content]) && typeof response.content === "string") {
1800
1804
  return response.content;
1801
1805
  }
@@ -1857,12 +1861,12 @@ Respond with YAML only, no explanations.`
1857
1861
  });
1858
1862
  }
1859
1863
  if (options.stream && options.onChunk) {
1860
- return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
1864
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1861
1865
  stream: true,
1862
1866
  onChunk: options.onChunk
1863
1867
  });
1864
1868
  } else {
1865
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1869
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1866
1870
  if (_optionalChain([response, 'optionalAccess', _52 => _52.content]) && typeof response.content === "string") {
1867
1871
  return {
1868
1872
  content: response.content,
@@ -1925,7 +1929,7 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
1925
1929
  content: messageContent
1926
1930
  }
1927
1931
  ];
1928
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1932
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1929
1933
  if (_optionalChain([response, 'optionalAccess', _53 => _53.content]) && typeof response.content === "string") {
1930
1934
  return response.content;
1931
1935
  }
@@ -1980,12 +1984,12 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
1980
1984
  }
1981
1985
  ];
1982
1986
  if (options.stream && options.onChunk) {
1983
- return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
1987
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1984
1988
  stream: true,
1985
1989
  onChunk: options.onChunk
1986
1990
  });
1987
1991
  } else {
1988
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1992
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1989
1993
  if (_optionalChain([response, 'optionalAccess', _54 => _54.content]) && typeof response.content === "string") {
1990
1994
  return {
1991
1995
  content: response.content,
@@ -2008,6 +2012,10 @@ ${_constants.PLAYWRIGHT_EXAMPLE_CODE}`;
2008
2012
 
2009
2013
 
2010
2014
 
2015
+
2016
+
2017
+
2018
+
2011
2019
  // src/ai-model/prompt/extraction.ts
2012
2020
 
2013
2021
  function systemPromptToExtract() {
@@ -2018,6 +2026,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
2018
2026
 
2019
2027
  If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
2020
2028
 
2029
+ If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
2030
+
2021
2031
  Return in the following JSON format:
2022
2032
  {
2023
2033
  data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2168,55 @@ var sectionLocatorInstruction = new (0, _prompts.PromptTemplate)({
2158
2168
  // src/ai-model/inspect.ts
2159
2169
  var debugInspect = _logger.getDebug.call(void 0, "ai:inspect");
2160
2170
  var debugSection = _logger.getDebug.call(void 0, "ai:section");
2171
+ var extraTextFromUserPrompt = (prompt) => {
2172
+ if (typeof prompt === "string") {
2173
+ return prompt;
2174
+ } else {
2175
+ return prompt.prompt;
2176
+ }
2177
+ };
2178
+ var promptsToChatParam = async (multimodalPrompt) => {
2179
+ const msgs = [];
2180
+ if (_optionalChain([multimodalPrompt, 'optionalAccess', _55 => _55.images, 'optionalAccess', _56 => _56.length])) {
2181
+ msgs.push({
2182
+ role: "user",
2183
+ content: [
2184
+ {
2185
+ type: "text",
2186
+ text: "Next, I will provide all the reference images."
2187
+ }
2188
+ ]
2189
+ });
2190
+ for (const item of multimodalPrompt.images) {
2191
+ const base64 = await _img.preProcessImageUrl.call(void 0,
2192
+ item.url,
2193
+ !!multimodalPrompt.convertHttpImage2Base64
2194
+ );
2195
+ msgs.push({
2196
+ role: "user",
2197
+ content: [
2198
+ {
2199
+ type: "text",
2200
+ text: `reference image ${item.name}:`
2201
+ }
2202
+ ]
2203
+ });
2204
+ msgs.push({
2205
+ role: "user",
2206
+ content: [
2207
+ {
2208
+ type: "image_url",
2209
+ image_url: {
2210
+ url: base64,
2211
+ detail: "high"
2212
+ }
2213
+ }
2214
+ ]
2215
+ });
2216
+ }
2217
+ }
2218
+ return msgs;
2219
+ };
2161
2220
  async function AiLocateElement(options) {
2162
2221
  const { context, targetElementDescription, callAI } = options;
2163
2222
  const { screenshotBase64 } = context;
@@ -2168,7 +2227,7 @@ async function AiLocateElement(options) {
2168
2227
  );
2169
2228
  const userInstructionPrompt = await findElementPrompt.format({
2170
2229
  pageDescription: description,
2171
- targetElementDescription
2230
+ targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
2172
2231
  });
2173
2232
  const systemPrompt = systemPromptToLocateElement(_env.vlLocateMode.call(void 0, ));
2174
2233
  let imagePayload = screenshotBase64;
@@ -2210,6 +2269,13 @@ async function AiLocateElement(options) {
2210
2269
  ]
2211
2270
  }
2212
2271
  ];
2272
+ if (typeof targetElementDescription !== "string") {
2273
+ const addOns = await promptsToChatParam({
2274
+ images: targetElementDescription.images,
2275
+ convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
2276
+ });
2277
+ msgs.push(...addOns);
2278
+ }
2213
2279
  const callAIFn = callAI || callToGetJSONObject;
2214
2280
  const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
2215
2281
  const rawResponse = JSON.stringify(res.content);
@@ -2220,10 +2286,10 @@ async function AiLocateElement(options) {
2220
2286
  if ("bbox" in res.content && Array.isArray(res.content.bbox)) {
2221
2287
  resRect = adaptBboxToRect(
2222
2288
  res.content.bbox,
2223
- _optionalChain([options, 'access', _55 => _55.searchConfig, 'optionalAccess', _56 => _56.rect, 'optionalAccess', _57 => _57.width]) || context.size.width,
2224
- _optionalChain([options, 'access', _58 => _58.searchConfig, 'optionalAccess', _59 => _59.rect, 'optionalAccess', _60 => _60.height]) || context.size.height,
2225
- _optionalChain([options, 'access', _61 => _61.searchConfig, 'optionalAccess', _62 => _62.rect, 'optionalAccess', _63 => _63.left]),
2226
- _optionalChain([options, 'access', _64 => _64.searchConfig, 'optionalAccess', _65 => _65.rect, 'optionalAccess', _66 => _66.top])
2289
+ _optionalChain([options, 'access', _57 => _57.searchConfig, 'optionalAccess', _58 => _58.rect, 'optionalAccess', _59 => _59.width]) || context.size.width,
2290
+ _optionalChain([options, 'access', _60 => _60.searchConfig, 'optionalAccess', _61 => _61.rect, 'optionalAccess', _62 => _62.height]) || context.size.height,
2291
+ _optionalChain([options, 'access', _63 => _63.searchConfig, 'optionalAccess', _64 => _64.rect, 'optionalAccess', _65 => _65.left]),
2292
+ _optionalChain([options, 'access', _66 => _66.searchConfig, 'optionalAccess', _67 => _67.rect, 'optionalAccess', _68 => _68.top])
2227
2293
  );
2228
2294
  debugInspect("resRect", resRect);
2229
2295
  const rectCenter = {
@@ -2242,7 +2308,7 @@ async function AiLocateElement(options) {
2242
2308
  }
2243
2309
  } catch (e) {
2244
2310
  const msg = e instanceof Error ? `Failed to parse bbox: ${e.message}` : "unknown error in locate";
2245
- if (!errors || _optionalChain([errors, 'optionalAccess', _67 => _67.length]) === 0) {
2311
+ if (!errors || _optionalChain([errors, 'optionalAccess', _69 => _69.length]) === 0) {
2246
2312
  errors = [msg];
2247
2313
  } else {
2248
2314
  errors.push(`(${msg})`);
@@ -2265,7 +2331,7 @@ async function AiLocateSection(options) {
2265
2331
  const { screenshotBase64 } = context;
2266
2332
  const systemPrompt = systemPromptToLocateSection(_env.vlLocateMode.call(void 0, ));
2267
2333
  const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
2268
- sectionDescription
2334
+ sectionDescription: extraTextFromUserPrompt(sectionDescription)
2269
2335
  });
2270
2336
  const msgs = [
2271
2337
  { role: "system", content: systemPrompt },
@@ -2286,6 +2352,13 @@ async function AiLocateSection(options) {
2286
2352
  ]
2287
2353
  }
2288
2354
  ];
2355
+ if (typeof sectionDescription !== "string") {
2356
+ const addOns = await promptsToChatParam({
2357
+ images: sectionDescription.images,
2358
+ convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
2359
+ });
2360
+ msgs.push(...addOns);
2361
+ }
2289
2362
  const result = await callAiFn(
2290
2363
  msgs,
2291
2364
  2 /* EXTRACT_DATA */
@@ -2327,21 +2400,21 @@ async function AiLocateSection(options) {
2327
2400
  };
2328
2401
  }
2329
2402
  async function AiExtractElementInfo(options) {
2330
- const { dataQuery, context, extractOption } = options;
2403
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
2331
2404
  const systemPrompt = systemPromptToExtract();
2332
2405
  const { screenshotBase64 } = context;
2333
2406
  const { description, elementById } = await describeUserPage(context, {
2334
2407
  truncateTextLength: 200,
2335
2408
  filterNonTextContent: false,
2336
2409
  visibleOnly: false,
2337
- domIncluded: _optionalChain([extractOption, 'optionalAccess', _68 => _68.domIncluded])
2410
+ domIncluded: _optionalChain([extractOption, 'optionalAccess', _70 => _70.domIncluded])
2338
2411
  });
2339
2412
  const extractDataPromptText = await extractDataQueryPrompt(
2340
2413
  description,
2341
2414
  dataQuery
2342
2415
  );
2343
2416
  const userContent = [];
2344
- if (_optionalChain([extractOption, 'optionalAccess', _69 => _69.screenshotIncluded]) !== false) {
2417
+ if (_optionalChain([extractOption, 'optionalAccess', _71 => _71.screenshotIncluded]) !== false) {
2345
2418
  userContent.push({
2346
2419
  type: "image_url",
2347
2420
  image_url: {
@@ -2361,6 +2434,13 @@ async function AiExtractElementInfo(options) {
2361
2434
  content: userContent
2362
2435
  }
2363
2436
  ];
2437
+ if (multimodalPrompt) {
2438
+ const addOns = await promptsToChatParam({
2439
+ images: multimodalPrompt.images,
2440
+ convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
2441
+ });
2442
+ msgs.push(...addOns);
2443
+ }
2364
2444
  const result = await callAiFn(
2365
2445
  msgs,
2366
2446
  2 /* EXTRACT_DATA */
@@ -2373,11 +2453,12 @@ async function AiExtractElementInfo(options) {
2373
2453
  }
2374
2454
  async function AiAssert(options) {
2375
2455
  const { assertion, context } = options;
2376
- _utils.assert.call(void 0, assertion, "assertion should be a string");
2456
+ _utils.assert.call(void 0, assertion, "assertion should not be empty");
2377
2457
  const { screenshotBase64 } = context;
2378
2458
  const systemPrompt = systemPromptToAssert({
2379
2459
  isUITars: _env.getAIConfigInBoolean.call(void 0, _env.MIDSCENE_USE_VLM_UI_TARS)
2380
2460
  });
2461
+ const assertionText = extraTextFromUserPrompt(assertion);
2381
2462
  const msgs = [
2382
2463
  { role: "system", content: systemPrompt },
2383
2464
  {
@@ -2395,13 +2476,20 @@ async function AiAssert(options) {
2395
2476
  text: `
2396
2477
  Here is the assertion. Please tell whether it is truthy according to the screenshot.
2397
2478
  =====================================
2398
- ${assertion}
2479
+ ${assertionText}
2399
2480
  =====================================
2400
2481
  `
2401
2482
  }
2402
2483
  ]
2403
2484
  }
2404
2485
  ];
2486
+ if (typeof assertion !== "string") {
2487
+ const addOns = await promptsToChatParam({
2488
+ images: assertion.images,
2489
+ convertHttpImage2Base64: assertion.convertHttpImage2Base64
2490
+ });
2491
+ msgs.push(...addOns);
2492
+ }
2405
2493
  const { content: assertResult, usage } = await callAiFn(
2406
2494
  msgs,
2407
2495
  0 /* ASSERT */
@@ -2465,11 +2553,11 @@ async function plan(userInstruction, opts) {
2465
2553
  ]
2466
2554
  }
2467
2555
  ];
2468
- const call2 = callAI || callAiFn;
2469
- const { content, usage } = await call2(msgs, 3 /* PLAN */);
2556
+ const call3 = callAI || callAiFn;
2557
+ const { content, usage } = await call3(msgs, 3 /* PLAN */);
2470
2558
  const rawResponse = JSON.stringify(content, void 0, 2);
2471
2559
  const planFromAI = content;
2472
- const actions = (_optionalChain([planFromAI, 'access', _70 => _70.action, 'optionalAccess', _71 => _71.type]) ? [planFromAI.action] : planFromAI.actions) || [];
2560
+ const actions = (_optionalChain([planFromAI, 'access', _72 => _72.action, 'optionalAccess', _73 => _73.type]) ? [planFromAI.action] : planFromAI.actions) || [];
2473
2561
  const returnValue = {
2474
2562
  ...planFromAI,
2475
2563
  actions,
@@ -2496,7 +2584,7 @@ async function plan(userInstruction, opts) {
2496
2584
  _utils.assert.call(void 0, !planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
2497
2585
  } else {
2498
2586
  actions.forEach((action) => {
2499
- if (_optionalChain([action, 'access', _72 => _72.locate, 'optionalAccess', _73 => _73.id])) {
2587
+ if (_optionalChain([action, 'access', _74 => _74.locate, 'optionalAccess', _75 => _75.id])) {
2500
2588
  const element = elementById(action.locate.id);
2501
2589
  if (element) {
2502
2590
  action.locate.id = element.id;
@@ -2574,7 +2662,7 @@ var pointToBbox = (point, width, height) => {
2574
2662
  async function vlmPlanning(options) {
2575
2663
  const { conversationHistory, userInstruction, size } = options;
2576
2664
  const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
2577
- const res = await call(
2665
+ const res = await call2(
2578
2666
  [
2579
2667
  {
2580
2668
  role: "user",
@@ -2822,6 +2910,7 @@ async function resizeImageForUiTars(imageBase64, size) {
2822
2910
 
2823
2911
 
2824
2912
 
2825
- exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call; exports.callToGetJSONObject = callToGetJSONObject; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2826
2913
 
2827
- //# sourceMappingURL=chunk-5HH6E7M4.js.map
2914
+ exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2915
+
2916
+ //# sourceMappingURL=chunk-JS4CT3XV.js.map