@midscene/core 0.24.2-beta-20250801024655.0 → 0.24.2-beta-20250805024613.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34) hide show
  1. package/dist/es/ai-model.d.ts +8 -4
  2. package/dist/es/ai-model.js +3 -1
  3. package/dist/es/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +114 -25
  4. package/dist/es/chunk-JS4CT3XV.js.map +1 -0
  5. package/dist/es/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
  6. package/dist/es/index.d.ts +7 -9
  7. package/dist/es/index.js +38 -18
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  10. package/dist/es/{types-d836fa73.d.ts → types-512d3687.d.ts} +31 -12
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +8 -4
  14. package/dist/lib/ai-model.js +4 -2
  15. package/dist/lib/{chunk-5HH6E7M4.js → chunk-JS4CT3XV.js} +120 -31
  16. package/dist/lib/chunk-JS4CT3XV.js.map +1 -0
  17. package/dist/lib/{chunk-FMBJ3CM2.js → chunk-NZFWY3M5.js} +3 -3
  18. package/dist/lib/index.d.ts +7 -9
  19. package/dist/lib/index.js +49 -29
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  22. package/dist/{types/types-d836fa73.d.ts → lib/types-512d3687.d.ts} +31 -12
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +8 -4
  26. package/dist/types/index.d.ts +7 -9
  27. package/dist/types/{llm-planning-d7096b0d.d.ts → llm-planning-877248da.d.ts} +7 -6
  28. package/dist/{lib/types-d836fa73.d.ts → types/types-512d3687.d.ts} +31 -12
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-5HH6E7M4.js.map +0 -1
  32. package/dist/lib/chunk-5HH6E7M4.js.map +0 -1
  33. /package/dist/es/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0
  34. /package/dist/lib/{chunk-FMBJ3CM2.js.map → chunk-NZFWY3M5.js.map} +0 -0
@@ -1,9 +1,9 @@
1
- import { aA as StreamingCallback, m as AIUsageInfo, az as StreamingCodeGenerationOptions, aC as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-d836fa73.js';
1
+ import { aC as StreamingCallback, o as AIUsageInfo, aB as StreamingCodeGenerationOptions, aE as StreamingAIResponse, X as PlanningAction, l as MidsceneYamlFlowItem } from './types-512d3687.js';
2
2
  import OpenAI from 'openai';
3
3
  import { ChatCompletionMessageParam } from 'openai/resources';
4
4
  export { ChatCompletionMessageParam } from 'openai/resources';
5
- import { b as AIActionType } from './llm-planning-d7096b0d.js';
6
- export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-d7096b0d.js';
5
+ import { b as AIActionType, e as AIArgs } from './llm-planning-877248da.js';
6
+ export { a as AiAssert, g as AiExtractElementInfo, A as AiLocateElement, h as AiLocateSection, i as adaptBboxToRect, c as callAiFn, d as describeUserPage, f as elementByPositionWithElementInfo, p as plan } from './llm-planning-877248da.js';
7
7
  import { vlLocateMode } from '@midscene/shared/env';
8
8
  import { actionParser } from '@ui-tars/action-parser';
9
9
  import { Size } from '@midscene/shared/types';
@@ -21,6 +21,10 @@ declare function callToGetJSONObject<T>(messages: ChatCompletionMessageParam[],
21
21
  content: T;
22
22
  usage?: AIUsageInfo;
23
23
  }>;
24
+ declare function callAiFnWithStringResponse<T>(msgs: AIArgs, AIActionTypeValue: AIActionType): Promise<{
25
+ content: string;
26
+ usage?: AIUsageInfo;
27
+ }>;
24
28
 
25
29
  declare function systemPromptToLocateElement(vlMode: ReturnType<typeof vlLocateMode>): string;
26
30
 
@@ -92,4 +96,4 @@ declare function vlmPlanning(options: {
92
96
  }>;
93
97
  declare function resizeImageForUiTars(imageBase64: string, size: Size): Promise<string>;
94
98
 
95
- export { AIActionType, call as callAi, callToGetJSONObject, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
99
+ export { AIActionType, AIArgs, call as callAi, callAiFnWithStringResponse, callToGetJSONObject, generatePlaywrightTest, generatePlaywrightTestStream, generateYamlTest, generateYamlTestStream, resizeImageForUiTars, systemPromptToLocateElement, vlmPlanning };
@@ -7,6 +7,7 @@ import {
7
7
  adaptBboxToRect,
8
8
  call,
9
9
  callAiFn,
10
+ callAiFnWithStringResponse,
10
11
  callToGetJSONObject,
11
12
  describeUserPage,
12
13
  elementByPositionWithElementInfo,
@@ -18,7 +19,7 @@ import {
18
19
  resizeImageForUiTars,
19
20
  systemPromptToLocateElement,
20
21
  vlmPlanning
21
- } from "./chunk-5HH6E7M4.js";
22
+ } from "./chunk-JS4CT3XV.js";
22
23
  export {
23
24
  AIActionType,
24
25
  AiAssert,
@@ -28,6 +29,7 @@ export {
28
29
  adaptBboxToRect,
29
30
  call as callAi,
30
31
  callAiFn,
32
+ callAiFnWithStringResponse,
31
33
  callToGetJSONObject,
32
34
  describeUserPage,
33
35
  elementByPositionWithElementInfo,
@@ -56,10 +56,6 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
56
56
  return AIActionType2;
57
57
  })(AIActionType || {});
58
58
  async function callAiFn(msgs, AIActionTypeValue) {
59
- assert(
60
- checkAIConfig(),
61
- "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
62
- );
63
59
  const { content, usage } = await callToGetJSONObject(
64
60
  msgs,
65
61
  AIActionTypeValue
@@ -643,9 +639,9 @@ import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
643
639
  import {
644
640
  imageInfo,
645
641
  imageInfoOfBase64,
646
- base64Encoded,
642
+ localImg2Base64,
643
+ httpImg2Base64,
647
644
  resizeImg,
648
- transformImgPathToBase64,
649
645
  saveBase64Image,
650
646
  zoomForGPT4o
651
647
  } from "@midscene/shared/img";
@@ -1331,7 +1327,11 @@ Please check your config.`
1331
1327
  }
1332
1328
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
1333
1329
  }
1334
- async function call(messages, AIActionTypeValue, responseFormat, options) {
1330
+ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1331
+ assert3(
1332
+ checkAIConfig(),
1333
+ "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
1334
+ );
1335
1335
  const { completion, style } = await createChatClient({
1336
1336
  AIActionTypeValue
1337
1337
  });
@@ -1576,11 +1576,15 @@ async function callToGetJSONObject(messages, AIActionTypeValue) {
1576
1576
  if (model === "gpt-4o-2024-05-13") {
1577
1577
  responseFormat = { type: "json_object" /* JSON */ };
1578
1578
  }
1579
- const response = await call(messages, AIActionTypeValue, responseFormat);
1579
+ const response = await call2(messages, AIActionTypeValue, responseFormat);
1580
1580
  assert3(response, "empty response");
1581
1581
  const jsonContent = safeParseJson(response.content);
1582
1582
  return { content: jsonContent, usage: response.usage };
1583
1583
  }
1584
+ async function callAiFnWithStringResponse(msgs, AIActionTypeValue) {
1585
+ const { content, usage } = await call2(msgs, AIActionTypeValue);
1586
+ return { content, usage };
1587
+ }
1584
1588
  function extractJSONFromCodeBlock(response) {
1585
1589
  try {
1586
1590
  const jsonMatch = response.match(/^\s*(\{[\s\S]*\})\s*$/);
@@ -1795,7 +1799,7 @@ Respond with YAML only, no explanations.`
1795
1799
  }))
1796
1800
  });
1797
1801
  }
1798
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1802
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1799
1803
  if (response?.content && typeof response.content === "string") {
1800
1804
  return response.content;
1801
1805
  }
@@ -1857,12 +1861,12 @@ Respond with YAML only, no explanations.`
1857
1861
  });
1858
1862
  }
1859
1863
  if (options.stream && options.onChunk) {
1860
- return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
1864
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1861
1865
  stream: true,
1862
1866
  onChunk: options.onChunk
1863
1867
  });
1864
1868
  } else {
1865
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1869
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1866
1870
  if (response?.content && typeof response.content === "string") {
1867
1871
  return {
1868
1872
  content: response.content,
@@ -1925,7 +1929,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
1925
1929
  content: messageContent
1926
1930
  }
1927
1931
  ];
1928
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1932
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1929
1933
  if (response?.content && typeof response.content === "string") {
1930
1934
  return response.content;
1931
1935
  }
@@ -1980,12 +1984,12 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
1980
1984
  }
1981
1985
  ];
1982
1986
  if (options.stream && options.onChunk) {
1983
- return await call(prompt, 2 /* EXTRACT_DATA */, void 0, {
1987
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
1984
1988
  stream: true,
1985
1989
  onChunk: options.onChunk
1986
1990
  });
1987
1991
  } else {
1988
- const response = await call(prompt, 2 /* EXTRACT_DATA */);
1992
+ const response = await call2(prompt, 2 /* EXTRACT_DATA */);
1989
1993
  if (response?.content && typeof response.content === "string") {
1990
1994
  return {
1991
1995
  content: response.content,
@@ -2004,7 +2008,11 @@ import {
2004
2008
  getAIConfigInBoolean as getAIConfigInBoolean2,
2005
2009
  vlLocateMode as vlLocateMode4
2006
2010
  } from "@midscene/shared/env";
2007
- import { cropByRect, paddingToMatchBlockByBase64 } from "@midscene/shared/img";
2011
+ import {
2012
+ cropByRect,
2013
+ paddingToMatchBlockByBase64,
2014
+ preProcessImageUrl
2015
+ } from "@midscene/shared/img";
2008
2016
  import { getDebug as getDebug3 } from "@midscene/shared/logger";
2009
2017
  import { assert as assert4 } from "@midscene/shared/utils";
2010
2018
 
@@ -2018,6 +2026,8 @@ The user will give you a screenshot, the contents of it (optional), and some dat
2018
2026
 
2019
2027
  If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
2020
2028
 
2029
+ If the user provides multiple reference images, please carefully review the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
2030
+
2021
2031
  Return in the following JSON format:
2022
2032
  {
2023
2033
  data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2158,6 +2168,55 @@ var sectionLocatorInstruction = new PromptTemplate4({
2158
2168
  // src/ai-model/inspect.ts
2159
2169
  var debugInspect = getDebug3("ai:inspect");
2160
2170
  var debugSection = getDebug3("ai:section");
2171
+ var extraTextFromUserPrompt = (prompt) => {
2172
+ if (typeof prompt === "string") {
2173
+ return prompt;
2174
+ } else {
2175
+ return prompt.prompt;
2176
+ }
2177
+ };
2178
+ var promptsToChatParam = async (multimodalPrompt) => {
2179
+ const msgs = [];
2180
+ if (multimodalPrompt?.images?.length) {
2181
+ msgs.push({
2182
+ role: "user",
2183
+ content: [
2184
+ {
2185
+ type: "text",
2186
+ text: "Next, I will provide all the reference images."
2187
+ }
2188
+ ]
2189
+ });
2190
+ for (const item of multimodalPrompt.images) {
2191
+ const base64 = await preProcessImageUrl(
2192
+ item.url,
2193
+ !!multimodalPrompt.convertHttpImage2Base64
2194
+ );
2195
+ msgs.push({
2196
+ role: "user",
2197
+ content: [
2198
+ {
2199
+ type: "text",
2200
+ text: `reference image ${item.name}:`
2201
+ }
2202
+ ]
2203
+ });
2204
+ msgs.push({
2205
+ role: "user",
2206
+ content: [
2207
+ {
2208
+ type: "image_url",
2209
+ image_url: {
2210
+ url: base64,
2211
+ detail: "high"
2212
+ }
2213
+ }
2214
+ ]
2215
+ });
2216
+ }
2217
+ }
2218
+ return msgs;
2219
+ };
2161
2220
  async function AiLocateElement(options) {
2162
2221
  const { context, targetElementDescription, callAI } = options;
2163
2222
  const { screenshotBase64 } = context;
@@ -2168,7 +2227,7 @@ async function AiLocateElement(options) {
2168
2227
  );
2169
2228
  const userInstructionPrompt = await findElementPrompt.format({
2170
2229
  pageDescription: description,
2171
- targetElementDescription
2230
+ targetElementDescription: extraTextFromUserPrompt(targetElementDescription)
2172
2231
  });
2173
2232
  const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
2174
2233
  let imagePayload = screenshotBase64;
@@ -2210,6 +2269,13 @@ async function AiLocateElement(options) {
2210
2269
  ]
2211
2270
  }
2212
2271
  ];
2272
+ if (typeof targetElementDescription !== "string") {
2273
+ const addOns = await promptsToChatParam({
2274
+ images: targetElementDescription.images,
2275
+ convertHttpImage2Base64: targetElementDescription.convertHttpImage2Base64
2276
+ });
2277
+ msgs.push(...addOns);
2278
+ }
2213
2279
  const callAIFn = callAI || callToGetJSONObject;
2214
2280
  const res = await callAIFn(msgs, 1 /* INSPECT_ELEMENT */);
2215
2281
  const rawResponse = JSON.stringify(res.content);
@@ -2265,7 +2331,7 @@ async function AiLocateSection(options) {
2265
2331
  const { screenshotBase64 } = context;
2266
2332
  const systemPrompt = systemPromptToLocateSection(vlLocateMode4());
2267
2333
  const sectionLocatorInstructionText = await sectionLocatorInstruction.format({
2268
- sectionDescription
2334
+ sectionDescription: extraTextFromUserPrompt(sectionDescription)
2269
2335
  });
2270
2336
  const msgs = [
2271
2337
  { role: "system", content: systemPrompt },
@@ -2286,6 +2352,13 @@ async function AiLocateSection(options) {
2286
2352
  ]
2287
2353
  }
2288
2354
  ];
2355
+ if (typeof sectionDescription !== "string") {
2356
+ const addOns = await promptsToChatParam({
2357
+ images: sectionDescription.images,
2358
+ convertHttpImage2Base64: sectionDescription.convertHttpImage2Base64
2359
+ });
2360
+ msgs.push(...addOns);
2361
+ }
2289
2362
  const result = await callAiFn(
2290
2363
  msgs,
2291
2364
  2 /* EXTRACT_DATA */
@@ -2327,7 +2400,7 @@ async function AiLocateSection(options) {
2327
2400
  };
2328
2401
  }
2329
2402
  async function AiExtractElementInfo(options) {
2330
- const { dataQuery, context, extractOption } = options;
2403
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
2331
2404
  const systemPrompt = systemPromptToExtract();
2332
2405
  const { screenshotBase64 } = context;
2333
2406
  const { description, elementById } = await describeUserPage(context, {
@@ -2361,6 +2434,13 @@ async function AiExtractElementInfo(options) {
2361
2434
  content: userContent
2362
2435
  }
2363
2436
  ];
2437
+ if (multimodalPrompt) {
2438
+ const addOns = await promptsToChatParam({
2439
+ images: multimodalPrompt.images,
2440
+ convertHttpImage2Base64: multimodalPrompt.convertHttpImage2Base64
2441
+ });
2442
+ msgs.push(...addOns);
2443
+ }
2364
2444
  const result = await callAiFn(
2365
2445
  msgs,
2366
2446
  2 /* EXTRACT_DATA */
@@ -2373,11 +2453,12 @@ async function AiExtractElementInfo(options) {
2373
2453
  }
2374
2454
  async function AiAssert(options) {
2375
2455
  const { assertion, context } = options;
2376
- assert4(assertion, "assertion should be a string");
2456
+ assert4(assertion, "assertion should not be empty");
2377
2457
  const { screenshotBase64 } = context;
2378
2458
  const systemPrompt = systemPromptToAssert({
2379
2459
  isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
2380
2460
  });
2461
+ const assertionText = extraTextFromUserPrompt(assertion);
2381
2462
  const msgs = [
2382
2463
  { role: "system", content: systemPrompt },
2383
2464
  {
@@ -2395,13 +2476,20 @@ async function AiAssert(options) {
2395
2476
  text: `
2396
2477
  Here is the assertion. Please tell whether it is truthy according to the screenshot.
2397
2478
  =====================================
2398
- ${assertion}
2479
+ ${assertionText}
2399
2480
  =====================================
2400
2481
  `
2401
2482
  }
2402
2483
  ]
2403
2484
  }
2404
2485
  ];
2486
+ if (typeof assertion !== "string") {
2487
+ const addOns = await promptsToChatParam({
2488
+ images: assertion.images,
2489
+ convertHttpImage2Base64: assertion.convertHttpImage2Base64
2490
+ });
2491
+ msgs.push(...addOns);
2492
+ }
2405
2493
  const { content: assertResult, usage } = await callAiFn(
2406
2494
  msgs,
2407
2495
  0 /* ASSERT */
@@ -2465,8 +2553,8 @@ async function plan(userInstruction, opts) {
2465
2553
  ]
2466
2554
  }
2467
2555
  ];
2468
- const call2 = callAI || callAiFn;
2469
- const { content, usage } = await call2(msgs, 3 /* PLAN */);
2556
+ const call3 = callAI || callAiFn;
2557
+ const { content, usage } = await call3(msgs, 3 /* PLAN */);
2470
2558
  const rawResponse = JSON.stringify(content, void 0, 2);
2471
2559
  const planFromAI = content;
2472
2560
  const actions = (planFromAI.action?.type ? [planFromAI.action] : planFromAI.actions) || [];
@@ -2574,7 +2662,7 @@ var pointToBbox = (point, width, height) => {
2574
2662
  async function vlmPlanning(options) {
2575
2663
  const { conversationHistory, userInstruction, size } = options;
2576
2664
  const systemPrompt = getUiTarsPlanningPrompt() + userInstruction;
2577
- const res = await call(
2665
+ const res = await call2(
2578
2666
  [
2579
2667
  {
2580
2668
  role: "user",
@@ -2805,8 +2893,9 @@ export {
2805
2893
  systemPromptToLocateElement,
2806
2894
  elementByPositionWithElementInfo,
2807
2895
  describeUserPage,
2808
- call,
2896
+ call2 as call,
2809
2897
  callToGetJSONObject,
2898
+ callAiFnWithStringResponse,
2810
2899
  AIActionType,
2811
2900
  callAiFn,
2812
2901
  adaptBboxToRect,
@@ -2824,4 +2913,4 @@ export {
2824
2913
  resizeImageForUiTars
2825
2914
  };
2826
2915
 
2827
- //# sourceMappingURL=chunk-5HH6E7M4.js.map
2916
+ //# sourceMappingURL=chunk-JS4CT3XV.js.map