@midscene/core 0.24.1 → 0.24.2-beta-20250730123854.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-TV47VKQ7.js → chunk-3LFZDNMQ.js} +80 -12
  4. package/dist/es/chunk-3LFZDNMQ.js.map +1 -0
  5. package/dist/es/{chunk-HUEQURIG.js → chunk-M4OYNEBY.js} +3 -3
  6. package/dist/es/index.d.ts +4 -4
  7. package/dist/es/index.js +12 -4
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
  10. package/dist/es/{types-4d9c9a85.d.ts → types-d187efec.d.ts} +19 -2
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +3 -3
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-TV47VKQ7.js → chunk-3LFZDNMQ.js} +80 -12
  16. package/dist/lib/chunk-3LFZDNMQ.js.map +1 -0
  17. package/dist/lib/{chunk-HUEQURIG.js → chunk-M4OYNEBY.js} +3 -3
  18. package/dist/lib/index.d.ts +4 -4
  19. package/dist/lib/index.js +21 -13
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
  22. package/dist/{types/types-4d9c9a85.d.ts → lib/types-d187efec.d.ts} +19 -2
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +3 -3
  26. package/dist/types/index.d.ts +4 -4
  27. package/dist/types/{llm-planning-3407bd42.d.ts → llm-planning-ae19dc49.d.ts} +2 -1
  28. package/dist/{lib/types-4d9c9a85.d.ts → types/types-d187efec.d.ts} +19 -2
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-TV47VKQ7.js.map +0 -1
  32. package/dist/lib/chunk-TV47VKQ7.js.map +0 -1
  33. package/dist/es/{chunk-HUEQURIG.js.map → chunk-M4OYNEBY.js.map} +0 -0
  34. package/dist/lib/{chunk-HUEQURIG.js.map → chunk-M4OYNEBY.js.map} +0 -0
@@ -1,9 +1,9 @@
1
- import { ay as StreamingCallback, m as AIUsageInfo, ax as StreamingCodeGenerationOptions, aA as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-4d9c9a85.js';
1
+ import { aA as StreamingCallback, m as AIUsageInfo, az as StreamingCodeGenerationOptions, aC as StreamingAIResponse, V as PlanningAction, j as MidsceneYamlFlowItem } from './types-d187efec.js';
2
2
  import OpenAI from 'openai';
3
3
  import { ChatCompletionMessageParam } from 'openai/resources';
4
4
  export { ChatCompletionMessageParam } from 'openai/resources';
5
- import { b as AIActionType } from './llm-planning-3407bd42.js';
6
- export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-3407bd42.js';
5
+ import { b as AIActionType } from './llm-planning-ae19dc49.js';
6
+ export { a as AiAssert, f as AiExtractElementInfo, A as AiLocateElement, g as AiLocateSection, h as adaptBboxToRect, c as callAiFn, d as describeUserPage, e as elementByPositionWithElementInfo, p as plan } from './llm-planning-ae19dc49.js';
7
7
  import { vlLocateMode } from '@midscene/shared/env';
8
8
  import { actionParser } from '@ui-tars/action-parser';
9
9
  import { Size } from '@midscene/shared/types';
@@ -18,7 +18,7 @@ import {
18
18
  resizeImageForUiTars,
19
19
  systemPromptToLocateElement,
20
20
  vlmPlanning
21
- } from "./chunk-TV47VKQ7.js";
21
+ } from "./chunk-3LFZDNMQ.js";
22
22
  export {
23
23
  AIActionType,
24
24
  AiAssert,
@@ -300,7 +300,7 @@ function buildYamlFlowFromPlans(plans, sleep) {
300
300
  flow.push({
301
301
  sleep: param.timeMs
302
302
  });
303
- } else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton") {
303
+ } else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton" || type === "AndroidLongPress" || type === "AndroidPull") {
304
304
  } else if (type === "Error" || type === "ExpectedFalsyCondition" || type === "Assert" || type === "AssertWithoutThrow" || type === "Finished") {
305
305
  } else {
306
306
  console.warn(
@@ -386,31 +386,47 @@ You are an expert in software testing.
386
386
  ## Objective:
387
387
  - Identify elements in screenshots and text that match the user's description.
388
388
  - Give the coordinates of the element that matches the user's description best in the screenshot.
389
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
389
390
 
390
391
  ## Output Format:
391
392
  \`\`\`json
392
393
  {
393
394
  "bbox": [number, number, number, number], // ${bboxComment}
394
- "errors"?: string[]
395
+ "errors"?: string[],
396
+ "isOrderSensitive": boolean // Whether the targetElementDescription is order-sensitive (true/false)
395
397
  }
396
398
  \`\`\`
397
399
 
398
400
  Fields:
399
401
  * \`bbox\` is the bounding box of the element that matches the user's description best in the screenshot
402
+ * \`isOrderSensitive\` is a boolean indicating whether the user's description is order-sensitive (true/false)
400
403
  * \`errors\` is an optional array of error messages (if any)
401
404
 
402
- For example, when an element is found:
405
+ Order-sensitive means the description contains phrases like:
406
+ - "the third item in the list"
407
+ - "the last button"
408
+ - "the first input box"
409
+ - "the second row"
410
+
411
+ Not order-sensitive means the description is like:
412
+ - "confirm button"
413
+ - "search box"
414
+ - "password input"
415
+
416
+ For example, when an element is found and the description is order-sensitive:
403
417
  \`\`\`json
404
418
  {
405
419
  "bbox": [100, 100, 200, 200],
420
+ "isOrderSensitive": true,
406
421
  "errors": []
407
422
  }
408
423
  \`\`\`
409
424
 
410
- When no element is found:
425
+ When no element is found and the description is not order-sensitive:
411
426
  \`\`\`json
412
427
  {
413
428
  "bbox": [],
429
+ "isOrderSensitive": false,
414
430
  "errors": ["I can see ..., but {some element} is not found"]
415
431
  }
416
432
  \`\`\`
@@ -423,6 +439,7 @@ You are an expert in software page image (2D) and page element text analysis.
423
439
  ## Objective:
424
440
  - Identify elements in screenshots and text that match the user's description.
425
441
  - Return JSON data containing the selection reason and element ID.
442
+ - Determine whether the user's description is order-sensitive (e.g., contains phrases like 'the third item in the list', 'the last button', etc.).
426
443
 
427
444
  ## Skills:
428
445
  - Image analysis and recognition
@@ -434,6 +451,7 @@ You are an expert in software page image (2D) and page element text analysis.
434
451
  2. Based on the user's description, locate the target element ID in the list of element descriptions and the screenshot.
435
452
  3. Found the required number of elements
436
453
  4. Return JSON data containing the selection reason and element ID.
454
+ 5. Judge whether the user's description is order-sensitive (see below for definition and examples).
437
455
 
438
456
  ## Constraints:
439
457
  - Strictly adhere to the specified location when describing the required element; do not select elements from other locations.
@@ -443,6 +461,10 @@ You are an expert in software page image (2D) and page element text analysis.
443
461
  - The returned data must conform to the specified JSON format.
444
462
  - The returned value id information must use the id from element info (important: **use id not indexId, id is hash content**)
445
463
 
464
+ ## Order-Sensitive Definition:
465
+ - If the description contains phrases like "the third item in the list", "the last button", "the first input box", "the second row", etc., it is order-sensitive (isOrderSensitive = true).
466
+ - If the description is like "confirm button", "search box", "password input", etc., it is not order-sensitive (isOrderSensitive = false).
467
+
446
468
  ## Output Format:
447
469
 
448
470
  Please return the result in JSON format as follows:
@@ -458,6 +480,7 @@ Please return the result in JSON format as follows:
458
480
  }
459
481
  // More elements...
460
482
  ],
483
+ "isOrderSensitive": true, // or false, depending on the user's description
461
484
  "errors": [] // Array of strings containing any error messages
462
485
  }
463
486
  \`\`\`
@@ -546,6 +569,7 @@ Output Example:
546
569
  "id": "1231"
547
570
  }
548
571
  ],
572
+ "isOrderSensitive": true,
549
573
  "errors": []
550
574
  }
551
575
  \`\`\`
@@ -583,6 +607,10 @@ var locatorSchema = {
583
607
  },
584
608
  description: "List of found elements"
585
609
  },
610
+ isOrderSensitive: {
611
+ type: "boolean",
612
+ description: "Whether the targetElementDescription is order-sensitive (true/false)"
613
+ },
586
614
  errors: {
587
615
  type: "array",
588
616
  items: {
@@ -591,7 +619,7 @@ var locatorSchema = {
591
619
  description: "List of error messages, if any"
592
620
  }
593
621
  },
594
- required: ["elements", "errors"],
622
+ required: ["elements", "isOrderSensitive", "errors"],
595
623
  additionalProperties: false
596
624
  }
597
625
  }
@@ -766,7 +794,7 @@ Target: User will give you a screenshot, an instruction and some previous logs i
766
794
 
767
795
  Restriction:
768
796
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
769
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton." : "."}
797
+ - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
770
798
  - Don't repeat actions in the previous logs.
771
799
  - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
772
800
 
@@ -779,7 +807,9 @@ Supporting actions:
779
807
  - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
780
808
  ${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
781
809
  - AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
782
- - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }` : ""}
810
+ - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
811
+ - AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
812
+ - AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
783
813
 
784
814
  Field description:
785
815
  * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@@ -829,7 +859,7 @@ You are a versatile professional in software UI automation. Your outstanding con
829
859
  ## Workflow
830
860
 
831
861
  1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
832
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton" : ""}). The "About the action" section below will give you more details.
862
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
833
863
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
834
864
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
835
865
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -888,7 +918,11 @@ ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "bac
888
918
  - type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
889
919
  * {{ param: {{}} }}
890
920
  - type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
891
- * {{ param: {{}} }}` : ""}
921
+ * {{ param: {{}} }}
922
+ - type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
923
+ * {{ param: {{ x: number, y: number, duration?: number }} }}
924
+ - type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
925
+ * {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
892
926
  `;
893
927
  var outputTemplate = `
894
928
  ## Output JSON Format:
@@ -1002,7 +1036,7 @@ var planSchema = {
1002
1036
  },
1003
1037
  type: {
1004
1038
  type: "string",
1005
- description: 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton"'
1039
+ description: 'Type of action, one of "Tap", "RightClick", "Hover" , "Input", "KeyboardPress", "Scroll", "ExpectedFalsyCondition", "Sleep", "AndroidBackButton", "AndroidHomeButton", "AndroidRecentAppsButton", "AndroidLongPress"'
1006
1040
  },
1007
1041
  param: {
1008
1042
  anyOf: [
@@ -2230,7 +2264,8 @@ async function AiLocateElement(options) {
2230
2264
  },
2231
2265
  rawResponse,
2232
2266
  elementById,
2233
- usage: res.usage
2267
+ usage: res.usage,
2268
+ isOrderSensitive: typeof res.content === "object" && res.content !== null && "isOrderSensitive" in res.content ? res.content.isOrderSensitive : void 0
2234
2269
  };
2235
2270
  }
2236
2271
  async function AiLocateSection(options) {
@@ -2681,6 +2716,39 @@ async function vlmPlanning(options) {
2681
2716
  type: "AndroidRecentAppsButton",
2682
2717
  param: {}
2683
2718
  });
2719
+ } else if (action.action_type === "androidLongPress") {
2720
+ assert6(
2721
+ action.action_inputs.start_coords,
2722
+ "start_coords is required for androidLongPress"
2723
+ );
2724
+ const point = action.action_inputs.start_coords;
2725
+ transformActions.push({
2726
+ type: "AndroidLongPress",
2727
+ param: {
2728
+ x: point[0],
2729
+ y: point[1],
2730
+ duration: 1e3
2731
+ },
2732
+ locate: null,
2733
+ thought: action.thought || ""
2734
+ });
2735
+ } else if (action.action_type === "androidPull") {
2736
+ const pullDirection = action.action_inputs.direction || "down";
2737
+ const startPoint = action.action_inputs.start_coords ? {
2738
+ x: action.action_inputs.start_coords[0],
2739
+ y: action.action_inputs.start_coords[1]
2740
+ } : void 0;
2741
+ transformActions.push({
2742
+ type: "AndroidPull",
2743
+ param: {
2744
+ direction: pullDirection,
2745
+ startPoint,
2746
+ distance: action.action_inputs.distance,
2747
+ duration: action.action_inputs.duration || 500
2748
+ },
2749
+ locate: null,
2750
+ thought: action.thought || ""
2751
+ });
2684
2752
  }
2685
2753
  });
2686
2754
  if (transformActions.length === 0) {
@@ -2764,4 +2832,4 @@ export {
2764
2832
  resizeImageForUiTars
2765
2833
  };
2766
2834
 
2767
- //# sourceMappingURL=chunk-TV47VKQ7.js.map
2835
+ //# sourceMappingURL=chunk-3LFZDNMQ.js.map