@midscene/core 0.24.1-beta-20250729062523.0 → 0.24.1-beta-20250729081015.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-TV47VKQ7.js → chunk-GPQNKTL4.js} +61 -8
  4. package/dist/es/chunk-GPQNKTL4.js.map +1 -0
  5. package/dist/es/{chunk-THZ6N6CG.js → chunk-SI7ZG3EP.js} +3 -3
  6. package/dist/es/index.d.ts +4 -4
  7. package/dist/es/index.js +12 -4
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-3407bd42.d.ts → llm-planning-7d13d5ff.d.ts} +2 -1
  10. package/dist/es/{types-4d9c9a85.d.ts → types-8d202846.d.ts} +4 -0
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-TV47VKQ7.js → chunk-GPQNKTL4.js} +61 -8
  16. package/dist/lib/chunk-GPQNKTL4.js.map +1 -0
  17. package/dist/lib/{chunk-THZ6N6CG.js → chunk-SI7ZG3EP.js} +3 -3
  18. package/dist/lib/index.d.ts +4 -4
  19. package/dist/lib/index.js +21 -13
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-3407bd42.d.ts → llm-planning-7d13d5ff.d.ts} +2 -1
  22. package/dist/{types/types-4d9c9a85.d.ts → lib/types-8d202846.d.ts} +4 -0
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +4 -4
  27. package/dist/types/{llm-planning-3407bd42.d.ts → llm-planning-7d13d5ff.d.ts} +2 -1
  28. package/dist/{lib/types-4d9c9a85.d.ts → types/types-8d202846.d.ts} +4 -0
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-TV47VKQ7.js.map +0 -1
  32. package/dist/lib/chunk-TV47VKQ7.js.map +0 -1
  33. package/dist/es/{chunk-THZ6N6CG.js.map → chunk-SI7ZG3EP.js.map} +0 -0
  34. package/dist/lib/{chunk-THZ6N6CG.js.map → chunk-SI7ZG3EP.js.map} +0 -0
@@ -1,9 +1,9 @@
1
- import { ay as StreamingCallback, m as AIUsageInfo, ax as StreamingCodeGenerationOptions, aA as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-4d9c9a85.js';
1
+ import { ay as StreamingCallback, m as AIUsageInfo, ax as StreamingCodeGenerationOptions, aA as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-8d202846.js';
2
2
  import OpenAI from 'openai';
3
3
  import { ChatCompletionMessageParam } from 'openai/resources';
4
4
  export { ChatCompletionMessageParam } from 'openai/resources';
5
- import { b as AIActionType } from './llm-planning-3407bd42.js';
6
- export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-3407bd42.js';
5
+ import { b as AIActionType } from './llm-planning-7d13d5ff.js';
6
+ export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-7d13d5ff.js';
7
7
  import { vlLocateMode } from '@midscene/shared/env';
8
8
  import { actionParser } from '@ui-tars/action-parser';
9
9
  import { Size } from '@midscene/shared/types';
@@ -18,7 +18,7 @@ import {
18
18
  resizeImageForUiTars,
19
19
  systemPromptToLocateElement,
20
20
  vlmPlanning
21
- } from "./chunk-TV47VKQ7.js";
21
+ } from "./chunk-GPQNKTL4.js";
22
22
  export {
23
23
  AIActionType,
24
24
  AiAssert,
@@ -386,31 +386,47 @@ You are an expert in software testing.
386
386
  ## Objective:
387
387
  - Identify elements in screenshots and text that match the user's description.
388
388
  - Give the coordinates of the element that matches the user's description best in the screenshot.
389
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
389
390
 
390
391
  ## Output Format:
391
392
  \`\`\`json
392
393
  {
393
394
  "bbox": [number, number, number, number], // ${bboxComment}
394
- "errors"?: string[]
395
+ "errors"?: string[],
396
+ "isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
395
397
  }
396
398
  \`\`\`
397
399
 
398
400
  Fields:
399
401
  * \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
402
+ * \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
400
403
  * \`errors\` is an optional array of error messages (if any)
401
404
 
402
- For example, when an element is found:
405
+ Order-sensitive means the description contains phrases like:
406
+ - "the third item in the list"
407
+ - "the last button"
408
+ - "the first input box"
409
+ - "the second row"
410
+
411
+ Not order-sensitive means the description is like:
412
+ - "confirm button"
413
+ - "search box"
414
+ - "password input"
415
+
416
+ For example, when an element is found and the description is order-sensitive:
403
417
  \`\`\`json
404
418
  {
405
419
  "bbox": [100, 100, 200, 200],
420
+ "isOrderSensitive": true,
406
421
  "errors": []
407
422
  }
408
423
  \`\`\`
409
424
 
410
- When no element is found:
425
+ When no element is found and the description is not order-sensitive:
411
426
  \`\`\`json
412
427
  {
413
428
  "bbox": [],
429
+ "isOrderSensitive": false,
414
430
  "errors": ["I can see ..., but {some element} is not found"]
415
431
  }
416
432
  \`\`\`
@@ -423,6 +439,7 @@ You are an expert in software page image (2D) and page element text analysis.
423
439
  ## Objective:
424
440
  - Identify elements in screenshots and text that match the user's description.
425
441
  - Return JSON data containing the selection reason and element ID.
442
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
426
443
 
427
444
  ## Skills:
428
445
  - Image analysis and recognition
@@ -434,6 +451,7 @@ You are an expert in software page image (2D) and page element text analysis.
434
451
  2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
435
452
  3. Found the required number of elements
436
453
  4. Return JSON data containing the selection reason and element ID.
454
+ 5. Judge whether the user's description is order-sensitive (see below for definition and examples).
437
455
 
438
456
  ## Constraints:
439
457
  - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
@@ -443,6 +461,10 @@ You are an expert in software page image (2D) and page element text analysis.
443
461
  - The returned data must conform to the specified JSON format.
444
462
  - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
445
463
 
464
+ ## Order-Sensitive Definition:
465
+ - If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
466
+ - If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
467
+
446
468
  ## Output Format:
447
469
 
448
470
  Please return the result in JSON format as follows:
@@ -458,6 +480,7 @@ Please return the result in JSON format as follows:
458
480
  }
459
481
  // More elements...
460
482
  ],
483
+ "isOrderSensitive": true, // or false, depending on the user's description
461
484
  "errors": [] // Array of strings containing any error messages
462
485
  }
463
486
  \`\`\`
@@ -546,6 +569,7 @@ Output Example:
546
569
  "id": "1231"
547
570
  }
548
571
  ],
572
+ "isOrderSensitive": true,
549
573
  "errors": []
550
574
  }
551
575
  \`\`\`
@@ -583,6 +607,10 @@ var locatorSchema = {
583
607
  },
584
608
  description: "List of found elements"
585
609
  },
610
+ isOrderSensitive: {
611
+ type: "boolean",
612
+ description: "Whether the targetElementDescription is order-sensitive (true/false)"
613
+ },
586
614
  errors: {
587
615
  type: "array",
588
616
  items: {
@@ -591,7 +619,7 @@ var locatorSchema = {
591
619
  description: "List of error messages, if any"
592
620
  }
593
621
  },
594
- required: ["elements", "errors"],
622
+ required: ["elements", "isOrderSensitive", "errors"],
595
623
  additionalProperties: false
596
624
  }
597
625
  }
@@ -789,6 +817,7 @@ Return in JSON format:
789
817
  ${vlCoTLog}
790
818
  ${vlCurrentLog}
791
819
  ${commonOutputFields}
820
+ "isOrderSensitive": boolean, // Whether the user's instruction is order-sensitive (true/false)
792
821
  "action":
793
822
  {
794
823
  // one of the supporting actions
@@ -901,6 +930,7 @@ The JSON format is as follows:
901
930
  ],
902
931
  ${llmCurrentLog}
903
932
  ${commonOutputFields}
933
+ "isOrderSensitive": boolean, // Whether the user's instruction is order-sensitive (true/false)
904
934
  }}
905
935
 
906
936
  ## Examples
@@ -1060,6 +1090,10 @@ var planSchema = {
1060
1090
  },
1061
1091
  description: "List of actions to be performed"
1062
1092
  },
1093
+ isOrderSensitive: {
1094
+ type: "boolean",
1095
+ description: "Whether the user's instruction is order-sensitive (true/false)"
1096
+ },
1063
1097
  more_actions_needed_by_instruction: {
1064
1098
  type: "boolean",
1065
1099
  description: "If all the actions described in the instruction have been covered by this action and logs, set this field to false."
@@ -2083,6 +2117,7 @@ import { PromptTemplate as PromptTemplate4 } from "@langchain/core/prompts";
2083
2117
  function systemPromptToLocateSection(vlMode) {
2084
2118
  return `
2085
2119
  You goal is to find out one section containing the target element in the screenshot, put it in the \`bbox\` field. If the user describe the target element with some reference elements, you should also find the section containing the reference elements, put it in the \`references_bbox\` field.
2120
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
2086
2121
 
2087
2122
  Usually, it should be approximately an area not more than 300x300px. Changes of the size are allowed if there are many elements to cover.
2088
2123
 
@@ -2095,11 +2130,27 @@ return in this JSON format:
2095
2130
  [number, number, number, number],
2096
2131
  ...
2097
2132
  ],
2133
+ "isOrderSensitive": boolean, // Whether the targetElementDescription is order-sensitive (true/false)
2098
2134
  "error"?: string
2099
2135
  }
2100
2136
  \`\`\`
2101
2137
 
2102
- In which, all the numbers in the \`bbox\` and \`references_bbox\` represent ${bboxDescription(vlMode)}.
2138
+ In which, all the numbers in the \`bbox\` and \`references_bbox\` represent ${bboxDescription(
2139
+ vlMode
2140
+ )}.
2141
+
2142
+ * \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
2143
+
2144
+ Order-sensitive means the description contains phrases like:
2145
+ - "the third item in the list"
2146
+ - "the last button"
2147
+ - "the first input box"
2148
+ - "the second row"
2149
+
2150
+ Not order-sensitive means the description is like:
2151
+ - "confirm button"
2152
+ - "search box"
2153
+ - "password input"
2103
2154
 
2104
2155
  For example, if the user describe the target element as "the delete button on the second row with title 'Peter'", you should put the bounding box of the delete button in the \`bbox\` field, and the bounding box of the second row in the \`references_bbox\` field.
2105
2156
 
@@ -2107,7 +2158,8 @@ the return value should be like this:
2107
2158
  \`\`\`json
2108
2159
  {
2109
2160
  "bbox": [100, 100, 200, 200],
2110
- "references_bbox": [[100, 100, 200, 200]]
2161
+ "references_bbox": [[100, 100, 200, 200]],
2162
+ "isOrderSensitive": true
2111
2163
  }
2112
2164
  \`\`\`
2113
2165
  `;
@@ -2230,7 +2282,8 @@ async function AiLocateElement(options) {
2230
2282
  },
2231
2283
  rawResponse,
2232
2284
  elementById,
2233
- usage: res.usage
2285
+ usage: res.usage,
2286
+ isOrderSensitive: typeof res.content === "object" && res.content !== null && "isOrderSensitive" in res.content ? res.content.isOrderSensitive : void 0
2234
2287
  };
2235
2288
  }
2236
2289
  async function AiLocateSection(options) {
@@ -2764,4 +2817,4 @@ export {
2764
2817
  resizeImageForUiTars
2765
2818
  };
2766
2819
 
2767
- //# sourceMappingURL=chunk-TV47VKQ7.js.map
2820
+ //# sourceMappingURL=chunk-GPQNKTL4.js.map