@midscene/core 0.23.5-beta-20250728070606.0 → 0.24.1-beta-20250728094050.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-QSWMWTO4.js → chunk-AKL2GGPA.js} +3 -3
  4. package/dist/es/{chunk-QFXN2AP7.js → chunk-YCHAOUOW.js} +37 -73
  5. package/dist/es/chunk-YCHAOUOW.js.map +1 -0
  6. package/dist/es/index.d.ts +8 -6
  7. package/dist/es/index.js +13 -6
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-4ac580dc.d.ts → llm-planning-3f26578e.d.ts} +3 -3
  10. package/dist/es/{types-27dc17c9.d.ts → types-c519555a.d.ts} +4 -6
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-QSWMWTO4.js → chunk-AKL2GGPA.js} +3 -3
  16. package/dist/lib/{chunk-QFXN2AP7.js → chunk-YCHAOUOW.js} +36 -72
  17. package/dist/lib/chunk-YCHAOUOW.js.map +1 -0
  18. package/dist/lib/index.d.ts +8 -6
  19. package/dist/lib/index.js +22 -15
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-4ac580dc.d.ts → llm-planning-3f26578e.d.ts} +3 -3
  22. package/dist/{types/types-27dc17c9.d.ts → lib/types-c519555a.d.ts} +4 -6
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +8 -6
  27. package/dist/types/{llm-planning-4ac580dc.d.ts → llm-planning-3f26578e.d.ts} +3 -3
  28. package/dist/{lib/types-27dc17c9.d.ts → types/types-c519555a.d.ts} +4 -6
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +4 -4
  31. package/dist/es/chunk-QFXN2AP7.js.map +0 -1
  32. package/dist/lib/chunk-QFXN2AP7.js.map +0 -1
  33. package/dist/es/{chunk-QSWMWTO4.js.map → chunk-AKL2GGPA.js.map} +0 -0
  34. package/dist/lib/{chunk-QSWMWTO4.js.map → chunk-AKL2GGPA.js.map} +0 -0
@@ -386,31 +386,47 @@ You are an expert in software testing.
386
386
  ## Objective:
387
387
  - Identify elements in screenshots and text that match the user's description.
388
388
  - Give the coordinates of the element that matches the user's description best in the screenshot.
389
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
389
390
 
390
391
  ## Output Format:
391
392
  \`\`\`json
392
393
  {
393
394
  "bbox": [number, number, number, number], // ${bboxComment}
394
- "errors"?: string[]
395
+ "errors"?: string[],
396
+ "isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
395
397
  }
396
398
  \`\`\`
397
399
 
398
400
  Fields:
399
401
  * \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
402
+ * \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
400
403
  * \`errors\` is an optional array of error messages (if any)
401
404
 
402
- For example, when an element is found:
405
+ Order-sensitive means the description contains phrases like:
406
+ - "the third item in the list"
407
+ - "the last button"
408
+ - "the first input box"
409
+ - "the second row"
410
+
411
+ Not order-sensitive means the description is like:
412
+ - "confirm button"
413
+ - "search box"
414
+ - "password input"
415
+
416
+ For example, when an element is found and the description is order-sensitive:
403
417
  \`\`\`json
404
418
  {
405
419
  "bbox": [100, 100, 200, 200],
420
+ "isOrderSensitive": true,
406
421
  "errors": []
407
422
  }
408
423
  \`\`\`
409
424
 
410
- When no element is found:
425
+ When no element is found and the description is not order-sensitive:
411
426
  \`\`\`json
412
427
  {
413
428
  "bbox": [],
429
+ "isOrderSensitive": false,
414
430
  "errors": ["I can see ..., but {some element} is not found"]
415
431
  }
416
432
  \`\`\`
@@ -423,6 +439,7 @@ You are an expert in software page image (2D) and page element text analysis.
423
439
  ## Objective:
424
440
  - Identify elements in screenshots and text that match the user's description.
425
441
  - Return JSON data containing the selection reason and element ID.
442
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
426
443
 
427
444
  ## Skills:
428
445
  - Image analysis and recognition
@@ -434,6 +451,7 @@ You are an expert in software page image (2D) and page element text analysis.
434
451
  2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
435
452
  3. Found the required number of elements
436
453
  4. Return JSON data containing the selection reason and element ID.
454
+ 5. Judge whether the user's description is order-sensitive (see below for definition and examples).
437
455
 
438
456
  ## Constraints:
439
457
  - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
@@ -443,6 +461,10 @@ You are an expert in software page image (2D) and page element text analysis.
443
461
  - The returned data must conform to the specified JSON format.
444
462
  - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
445
463
 
464
+ ## Order-Sensitive Definition:
465
+ - If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
466
+ - If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
467
+
446
468
  ## Output Format:
447
469
 
448
470
  Please return the result in JSON format as follows:
@@ -458,6 +480,7 @@ Please return the result in JSON format as follows:
458
480
  }
459
481
  // More elements...
460
482
  ],
483
+ "isOrderSensitive": true, // or false, depending on the user's description
461
484
  "errors": [] // Array of strings containing any error messages
462
485
  }
463
486
  \`\`\`
@@ -546,6 +569,7 @@ Output Example:
546
569
  "id": "1231"
547
570
  }
548
571
  ],
572
+ "isOrderSensitive": true,
549
573
  "errors": []
550
574
  }
551
575
  \`\`\`
@@ -583,6 +607,10 @@ var locatorSchema = {
583
607
  },
584
608
  description: "List of found elements"
585
609
  },
610
+ isOrderSensitive: {
611
+ type: "boolean",
612
+ description: "Whether the targetElementDescription is order-sensitive (true/false)"
613
+ },
586
614
  errors: {
587
615
  type: "array",
588
616
  items: {
@@ -591,7 +619,7 @@ var locatorSchema = {
591
619
  description: "List of error messages, if any"
592
620
  }
593
621
  },
594
- required: ["elements", "errors"],
622
+ required: ["elements", "isOrderSensitive", "errors"],
595
623
  additionalProperties: false
596
624
  }
597
625
  }
@@ -1970,11 +1998,7 @@ import {
1970
1998
  getAIConfigInBoolean as getAIConfigInBoolean2,
1971
1999
  vlLocateMode as vlLocateMode4
1972
2000
  } from "@midscene/shared/env";
1973
- import {
1974
- cropByRect,
1975
- paddingToMatchBlockByBase64,
1976
- transformImgPathToBase64Str
1977
- } from "@midscene/shared/img";
2001
+ import { cropByRect, paddingToMatchBlockByBase64 } from "@midscene/shared/img";
1978
2002
  import { getDebug as getDebug3 } from "@midscene/shared/logger";
1979
2003
  import { assert as assert4 } from "@midscene/shared/utils";
1980
2004
 
@@ -1988,8 +2012,6 @@ The user will give you a screenshot, the contents of it (optional), and some dat
1988
2012
 
1989
2013
  If a key specifies a JSON data type (such as Number, String, Boolean, Object, Array), ensure the returned value strictly matches that data type.
1990
2014
 
1991
- If the user provides multiple reference images, please carefully compare the reference images with the screenshot and provide the correct answer for <DATA_DEMAND>.
1992
-
1993
2015
  Return in the following JSON format:
1994
2016
  {
1995
2017
  data: any, // the extracted data. Make sure both the value and scheme meet the DATA_DEMAND. If you want to write some description in this field, use the same language as the DATA_DEMAND.
@@ -2236,7 +2258,8 @@ async function AiLocateElement(options) {
2236
2258
  },
2237
2259
  rawResponse,
2238
2260
  elementById,
2239
- usage: res.usage
2261
+ usage: res.usage,
2262
+ isOrderSensitive: typeof res.content === "object" && res.content !== null && "isOrderSensitive" in res.content ? res.content.isOrderSensitive : void 0
2240
2263
  };
2241
2264
  }
2242
2265
  async function AiLocateSection(options) {
@@ -2305,28 +2328,8 @@ async function AiLocateSection(options) {
2305
2328
  usage: result.usage
2306
2329
  };
2307
2330
  }
2308
- var imageUrl2Base64 = async (url) => {
2309
- if (url.startsWith("data:")) {
2310
- return url;
2311
- } else if (url.startsWith("http://") || url.startsWith("https://")) {
2312
- const response = await fetch(url);
2313
- if (!response.ok) {
2314
- throw new Error(`Failed to fetch image: ${url}`);
2315
- }
2316
- const contentType = response.headers.get("content-type");
2317
- if (!contentType) {
2318
- throw new Error(`Failed to fetch image: ${url}`);
2319
- }
2320
- const ext = contentType.split("/")[1];
2321
- assert4(ext, "get mime-type extension from response headers failed");
2322
- const buffer = Buffer.from(await response.arrayBuffer());
2323
- return `data:image/${ext};base64,${buffer.toString("base64")}`;
2324
- } else {
2325
- return await transformImgPathToBase64Str(url);
2326
- }
2327
- };
2328
2331
  async function AiExtractElementInfo(options) {
2329
- const { dataQuery, context, extractOption, promptImages } = options;
2332
+ const { dataQuery, context, extractOption } = options;
2330
2333
  const systemPrompt = systemPromptToExtract();
2331
2334
  const { screenshotBase64 } = context;
2332
2335
  const { description, elementById } = await describeUserPage(context, {
@@ -2358,47 +2361,8 @@ async function AiExtractElementInfo(options) {
2358
2361
  {
2359
2362
  role: "user",
2360
2363
  content: userContent
2361
- },
2362
- {
2363
- role: "user",
2364
- content: [
2365
- {
2366
- type: "text",
2367
- text: ""
2368
- }
2369
- ]
2370
2364
  }
2371
2365
  ];
2372
- const multiMsg = false;
2373
- if (promptImages) {
2374
- for (const [key, url] of Object.entries(promptImages)) {
2375
- const base64 = await imageUrl2Base64(url);
2376
- const text = {
2377
- type: "text",
2378
- text: `reference image ${key}:`
2379
- };
2380
- const img = {
2381
- type: "image_url",
2382
- image_url: {
2383
- url: base64,
2384
- detail: "high"
2385
- }
2386
- };
2387
- if (multiMsg) {
2388
- msgs.push({
2389
- role: "user",
2390
- content: [text]
2391
- });
2392
- msgs.push({
2393
- role: "user",
2394
- content: [img]
2395
- });
2396
- } else {
2397
- userContent.push(text);
2398
- userContent.push(img);
2399
- }
2400
- }
2401
- }
2402
2366
  const result = await callAiFn(
2403
2367
  msgs,
2404
2368
  2 /* EXTRACT_DATA */
@@ -2829,4 +2793,4 @@ export {
2829
2793
  resizeImageForUiTars
2830
2794
  };
2831
2795
 
2832
- //# sourceMappingURL=chunk-QFXN2AP7.js.map
2796
+ //# sourceMappingURL=chunk-YCHAOUOW.js.map