@midscene/core 0.25.4-beta-20250808064529.0 → 0.25.4-beta-20250811115904.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32) hide show
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-I5LBWOQA.js → chunk-NY6RQSGJ.js} +243 -254
  4. package/dist/es/chunk-NY6RQSGJ.js.map +1 -0
  5. package/dist/es/{chunk-UIEDQYHD.js → chunk-SR67R2OE.js} +3 -3
  6. package/dist/es/index.d.ts +4 -4
  7. package/dist/es/index.js +2 -2
  8. package/dist/es/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
  9. package/dist/es/{types-b4a208c6.d.ts → types-16cd9f75.d.ts} +10 -1
  10. package/dist/es/utils.d.ts +1 -1
  11. package/dist/es/utils.js +1 -1
  12. package/dist/lib/ai-model.d.ts +3 -3
  13. package/dist/lib/ai-model.js +2 -2
  14. package/dist/lib/{chunk-I5LBWOQA.js → chunk-NY6RQSGJ.js} +232 -243
  15. package/dist/lib/chunk-NY6RQSGJ.js.map +1 -0
  16. package/dist/lib/{chunk-UIEDQYHD.js → chunk-SR67R2OE.js} +3 -3
  17. package/dist/lib/index.d.ts +4 -4
  18. package/dist/lib/index.js +12 -12
  19. package/dist/lib/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
  20. package/dist/{types/types-b4a208c6.d.ts → lib/types-16cd9f75.d.ts} +10 -1
  21. package/dist/lib/utils.d.ts +1 -1
  22. package/dist/lib/utils.js +2 -2
  23. package/dist/types/ai-model.d.ts +3 -3
  24. package/dist/types/index.d.ts +4 -4
  25. package/dist/types/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
  26. package/dist/{lib/types-b4a208c6.d.ts → types/types-16cd9f75.d.ts} +10 -1
  27. package/dist/types/utils.d.ts +1 -1
  28. package/package.json +3 -3
  29. package/dist/es/chunk-I5LBWOQA.js.map +0 -1
  30. package/dist/lib/chunk-I5LBWOQA.js.map +0 -1
  31. /package/dist/es/{chunk-UIEDQYHD.js.map → chunk-SR67R2OE.js.map} +0 -0
  32. /package/dist/lib/{chunk-UIEDQYHD.js.map → chunk-SR67R2OE.js.map} +0 -0
@@ -633,179 +633,73 @@ Here is the item user want to find:
633
633
  });
634
634
 
635
635
  // src/ai-model/prompt/llm-planning.ts
636
+ var _assert = require('assert'); var _assert2 = _interopRequireDefault(_assert);
636
637
 
637
-
638
- // src/image/index.ts
639
-
640
-
641
-
642
-
643
-
644
-
645
-
646
-
647
-
648
-
649
- // src/ai-model/prompt/util.ts
650
-
651
-
652
-
653
-
654
-
655
-
656
-
657
-
658
- function describeSize(size) {
659
- return `${size.width} x ${size.height}`;
660
- }
661
- var distanceThreshold = 16;
662
- function elementByPositionWithElementInfo(treeRoot, position, options) {
663
- const requireStrictDistance = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _10 => _10.requireStrictDistance]), () => ( true));
664
- const filterPositionElements = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _11 => _11.filterPositionElements]), () => ( false));
665
- _utils.assert.call(void 0, typeof position !== "undefined", "position is required for query");
666
- const matchingElements = [];
667
- function dfs(node) {
668
- if (_optionalChain([node, 'optionalAccess', _12 => _12.node])) {
669
- const item = node.node;
670
- if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
671
- if (!(filterPositionElements && _optionalChain([item, 'access', _13 => _13.attributes, 'optionalAccess', _14 => _14.nodeType]) === _constants.NodeType.POSITION) && item.isVisible) {
672
- matchingElements.push(item);
673
- }
674
- }
675
- }
676
- for (const child of node.children) {
677
- dfs(child);
638
+ var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
639
+ var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
640
+ var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
641
+ var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
642
+ "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
643
+ var vlLocateParam = (required) => `locate${required ? "" : "?"}: {bbox: [number, number, number, number], prompt: string }`;
644
+ var llmLocateParam = (required) => `locate${required ? "" : "?"}: {"id": string, "prompt": string}`;
645
+ var descriptionForAction = (action, locatorScheme) => {
646
+ const tab = " ";
647
+ let locateParam = "";
648
+ if (action.location === "required") {
649
+ locateParam = locatorScheme;
650
+ } else if (action.location === "optional") {
651
+ locateParam = `${locatorScheme} | null`;
652
+ } else if (action.location === false) {
653
+ locateParam = "";
654
+ }
655
+ const locatorParam = locateParam ? `- ${locateParam}` : "";
656
+ if (action.whatToLocate) {
657
+ if (!locateParam) {
658
+ console.warn(
659
+ `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
660
+ );
661
+ } else {
662
+ locateParam += ` // ${action.whatToLocate}`;
678
663
  }
679
664
  }
680
- dfs(treeRoot);
681
- if (matchingElements.length === 0) {
682
- return void 0;
665
+ let paramSchema = "";
666
+ if (action.paramSchema) {
667
+ paramSchema = `- param: ${action.paramSchema}`;
683
668
  }
684
- const element = matchingElements.reduce((smallest, current) => {
685
- const smallestArea = smallest.rect.width * smallest.rect.height;
686
- const currentArea = current.rect.width * current.rect.height;
687
- return currentArea < smallestArea ? current : smallest;
688
- });
689
- const distanceToCenter = distance(
690
- { x: element.center[0], y: element.center[1] },
691
- position
692
- );
693
- if (requireStrictDistance) {
694
- return distanceToCenter <= distanceThreshold ? element : void 0;
695
- }
696
- return element;
697
- }
698
- function distance(point1, point2) {
699
- return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
700
- }
701
- var samplePageDescription = `
702
- And the page is described as follows:
703
- ====================
704
- The size of the page: 1280 x 720
705
- Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
706
-
707
- Description of all the elements in screenshot:
708
- <div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
709
- <h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
710
- The username is accepted
711
- </h4>
712
- ...many more
713
- </div>
714
- ====================
715
- `;
716
- async function describeUserPage(context, opt) {
717
- const { screenshotBase64 } = context;
718
- let width;
719
- let height;
720
- if (context.size) {
721
- ({ width, height } = context.size);
722
- } else {
723
- const imgSize = await _img.imageInfoOfBase64.call(void 0, screenshotBase64);
724
- ({ width, height } = imgSize);
725
- }
726
- const treeRoot = context.tree;
727
- const idElementMap = {};
728
- const flatElements = _extractor.treeToList.call(void 0, treeRoot);
729
- if (_optionalChain([opt, 'optionalAccess', _15 => _15.domIncluded]) === true && flatElements.length >= 5e3) {
730
- console.warn(
731
- 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
669
+ if (action.paramDescription) {
670
+ _assert2.default.call(void 0,
671
+ paramSchema,
672
+ `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
732
673
  );
733
- }
734
- flatElements.forEach((element) => {
735
- idElementMap[element.id] = element;
736
- if (typeof element.indexId !== "undefined") {
737
- idElementMap[`${element.indexId}`] = element;
738
- }
739
- });
740
- let pageDescription = "";
741
- const visibleOnly = _nullishCoalesce(_optionalChain([opt, 'optionalAccess', _16 => _16.visibleOnly]), () => ( _optionalChain([opt, 'optionalAccess', _17 => _17.domIncluded]) === "visible-only"));
742
- if (_optionalChain([opt, 'optionalAccess', _18 => _18.domIncluded]) || !_env.vlLocateMode.call(void 0, )) {
743
- const contentTree = await _extractor.descriptionOfTree.call(void 0,
744
- treeRoot,
745
- _optionalChain([opt, 'optionalAccess', _19 => _19.truncateTextLength]),
746
- _optionalChain([opt, 'optionalAccess', _20 => _20.filterNonTextContent]),
747
- visibleOnly
748
- );
749
- const sizeDescription = describeSize({ width, height });
750
- pageDescription = `The size of the page: ${sizeDescription}
751
- The page elements tree:
752
- ${contentTree}`;
753
- }
754
- return {
755
- description: pageDescription,
756
- elementById(idOrIndexId) {
757
- _utils.assert.call(void 0, typeof idOrIndexId !== "undefined", "id is required for query");
758
- const item = idElementMap[`${idOrIndexId}`];
759
- return item;
760
- },
761
- elementByPosition(position, size) {
762
- return elementByPositionWithElementInfo(treeRoot, position);
763
- },
764
- insertElementByPosition(position) {
765
- const element = _extractor.generateElementByPosition.call(void 0, position);
766
- treeRoot.children.push({
767
- node: element,
768
- children: []
769
- });
770
- flatElements.push(element);
771
- idElementMap[element.id] = element;
772
- return element;
773
- },
774
- size: { width, height }
775
- };
776
- }
777
-
778
- // src/ai-model/prompt/llm-planning.ts
779
- var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
780
- var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
781
- var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
782
- var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
783
- "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
784
- var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
674
+ paramSchema += ` // ${action.paramDescription}`;
675
+ }
676
+ const fields = [paramSchema, locatorParam].filter(Boolean);
677
+ return `- ${action.name}, ${action.description}
678
+ ${tab}- type: "${action.name}"
679
+ ${tab}${fields.join(`
680
+ ${tab}`)}
681
+ `.trim();
682
+ };
785
683
  var systemTemplateOfVLPlanning = ({
786
- pageType,
684
+ actionSpace,
787
685
  vlMode
788
- }) => `
686
+ }) => {
687
+ const actionNameList = actionSpace.map((action) => action.name).join(", ");
688
+ const actionDescriptionList = actionSpace.map(
689
+ (action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
690
+ );
691
+ const actionList = actionDescriptionList.join("\n");
692
+ return `
789
693
  Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
790
694
 
791
695
  Restriction:
792
696
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
793
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
697
+ - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
794
698
  - Don't repeat actions in the previous logs.
795
699
  - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
796
700
 
797
701
  Supporting actions:
798
- - Tap: { type: "Tap", ${vlLocateParam} }
799
- - RightClick: { type: "RightClick", ${vlLocateParam} }
800
- - Hover: { type: "Hover", ${vlLocateParam} }
801
- - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
802
- - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
803
- - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
804
- ${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
805
- - AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
806
- - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
807
- - AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
808
- - AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
702
+ ${actionList}
809
703
 
810
704
  Field description:
811
705
  * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@@ -840,8 +734,19 @@ this and output the JSON:
840
734
  }
841
735
  }
842
736
  `;
843
- var llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
844
- var systemTemplateOfLLM = ({ pageType }) => `
737
+ };
738
+ var systemTemplateOfLLM = ({
739
+ actionSpace
740
+ }) => {
741
+ const actionNameList = actionSpace.map((action) => action.name).join(" / ");
742
+ const actionDescriptionList = actionSpace.map(
743
+ (action) => descriptionForAction(
744
+ action,
745
+ llmLocateParam(action.location === "required")
746
+ )
747
+ );
748
+ const actionList = actionDescriptionList.join("\n");
749
+ return `
845
750
  ## Role
846
751
 
847
752
  You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@@ -855,7 +760,7 @@ You are a versatile professional in software UI automation. Your outstanding con
855
760
  ## Workflow
856
761
 
857
762
  1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
858
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
763
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
859
764
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
860
765
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
861
766
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -873,65 +778,30 @@ You are a versatile professional in software UI automation. Your outstanding con
873
778
 
874
779
  The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
875
780
 
876
- type LocateParam = {{
781
+ type LocateParam = {
877
782
  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
878
783
  "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
879
- }} | null // If it's not on the page, the LocateParam should be null
784
+ } | null // If it's not on the page, the LocateParam should be null
880
785
 
881
786
  ## Supported actions
882
787
 
883
788
  Each action has a \`type\` and corresponding \`param\`. To be detailed:
884
- - type: 'Tap'
885
- * {{ ${llmLocateParam} }}
886
- - type: 'RightClick'
887
- * {{ ${llmLocateParam} }}
888
- - type: 'Hover'
889
- * {{ ${llmLocateParam} }}
890
- - type: 'Input', replace the value in the input field
891
- * {{ ${llmLocateParam}, param: {{ value: string }} }}
892
- * \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
893
- - type: 'KeyboardPress', press a key
894
- * {{ param: {{ value: string }} }}
895
- - type: 'Scroll', scroll up or down.
896
- * {{
897
- ${llmLocateParam},
898
- param: {{
899
- direction: 'down'(default) | 'up' | 'right' | 'left',
900
- scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
901
- distance: null | number
902
- }}
903
- }}
904
- * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
905
- * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
906
- * {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
907
- - type: 'ExpectedFalsyCondition'
908
- * {{ param: {{ reason: string }} }}
909
- * use this action when the conditional statement talked about in the instruction is falsy.
910
- - type: 'Sleep'
911
- * {{ param: {{ timeMs: number }} }}
912
- ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
913
- * {{ param: {{}} }}
914
- - type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
915
- * {{ param: {{}} }}
916
- - type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
917
- * {{ param: {{}} }}
918
- - type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
919
- * {{ param: {{ x: number, y: number, duration?: number }} }}
920
- - type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
921
- * {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
922
- `;
789
+ ${actionList}
790
+
791
+ `.trim();
792
+ };
923
793
  var outputTemplate = `
924
794
  ## Output JSON Format:
925
795
 
926
796
  The JSON format is as follows:
927
797
 
928
- {{
798
+ {
929
799
  "actions": [
930
800
  // ... some actions
931
801
  ],
932
802
  ${llmCurrentLog}
933
803
  ${commonOutputFields}
934
- }}
804
+ }
935
805
 
936
806
  ## Examples
937
807
 
@@ -947,68 +817,62 @@ By viewing the page screenshot and description, you should consider this and out
947
817
  * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
948
818
  * The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.
949
819
 
950
- {{
820
+ {
951
821
  "actions":[
952
- {{
822
+ {
953
823
  "type": "Tap",
954
824
  "thought": "Click the language switch button to open the language options.",
955
825
  "param": null,
956
- "locate": {{ id: "c81c4e9a33", prompt: "The language switch button" }},
957
- }},
958
- {{
826
+ "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
827
+ },
828
+ {
959
829
  "type": "Sleep",
960
830
  "thought": "Wait for 1 second to ensure the language options are displayed.",
961
- "param": {{ "timeMs": 1000 }},
962
- }}
831
+ "param": { "timeMs": 1000 },
832
+ }
963
833
  ],
964
834
  "error": null,
965
835
  "more_actions_needed_by_instruction": true,
966
836
  "log": "Click the language switch button to open the language options. Wait for 1 second",
967
- }}
837
+ }
968
838
 
969
839
  ### Example: What NOT to do
970
840
  Wrong output:
971
- {{
841
+ {
972
842
  "actions":[
973
- {{
843
+ {
974
844
  "type": "Tap",
975
845
  "thought": "Click the language switch button to open the language options.",
976
846
  "param": null,
977
- "locate": {{
978
- {{ "id": "c81c4e9a33" }}, // WRONG: prompt is missing
979
- }}
980
- }},
981
- {{
847
+ "locate": {
848
+ { "id": "c81c4e9a33" }, // WRONG: prompt is missing
849
+ }
850
+ },
851
+ {
982
852
  "type": "Tap",
983
853
  "thought": "Click the English option",
984
854
  "param": null,
985
855
  "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
986
- }}
856
+ }
987
857
  ],
988
858
  "more_actions_needed_by_instruction": false, // WRONG: should be true
989
859
  "log": "Click the language switch button to open the language options",
990
- }}
860
+ }
991
861
 
992
862
  Reason:
993
863
  * The \`prompt\` is missing in the first 'Locate' action
994
864
  * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
995
865
  `;
996
866
  async function systemPromptToTaskPlanning({
997
- pageType,
867
+ actionSpace,
998
868
  vlMode
999
869
  }) {
1000
870
  if (vlMode) {
1001
- return systemTemplateOfVLPlanning({ pageType, vlMode });
871
+ return systemTemplateOfVLPlanning({ actionSpace, vlMode });
1002
872
  }
1003
- const promptTemplate = new (0, _prompts.PromptTemplate)({
1004
- template: `${systemTemplateOfLLM({ pageType })}
873
+ return `${systemTemplateOfLLM({ actionSpace })}
1005
874
 
1006
- ${outputTemplate}`,
1007
- inputVariables: ["pageDescription"]
1008
- });
1009
- return await promptTemplate.format({
1010
- pageDescription: samplePageDescription
1011
- });
875
+ ${outputTemplate}`;
1012
876
  }
1013
877
  var planSchema = {
1014
878
  type: "json_schema",
@@ -1290,7 +1154,7 @@ Please check your config.`
1290
1154
  httpAgent: proxyAgent,
1291
1155
  ...extraConfig,
1292
1156
  defaultHeaders: {
1293
- ..._optionalChain([extraConfig, 'optionalAccess', _21 => _21.defaultHeaders]) || {},
1157
+ ..._optionalChain([extraConfig, 'optionalAccess', _10 => _10.defaultHeaders]) || {},
1294
1158
  [_env.MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
1295
1159
  },
1296
1160
  dangerouslyAllowBrowser: true
@@ -1341,7 +1205,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1341
1205
  const debugProfileDetail = _logger.getDebug.call(void 0, "ai:profile:detail");
1342
1206
  const startTime = Date.now();
1343
1207
  const model = getModelName();
1344
- const isStreaming = _optionalChain([options, 'optionalAccess', _22 => _22.stream]) && _optionalChain([options, 'optionalAccess', _23 => _23.onChunk]);
1208
+ const isStreaming = _optionalChain([options, 'optionalAccess', _11 => _11.stream]) && _optionalChain([options, 'optionalAccess', _12 => _12.onChunk]);
1345
1209
  let content;
1346
1210
  let accumulated = "";
1347
1211
  let usage;
@@ -1372,8 +1236,8 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1372
1236
  }
1373
1237
  );
1374
1238
  for await (const chunk of stream) {
1375
- const content2 = _optionalChain([chunk, 'access', _24 => _24.choices, 'optionalAccess', _25 => _25[0], 'optionalAccess', _26 => _26.delta, 'optionalAccess', _27 => _27.content]) || "";
1376
- const reasoning_content = _optionalChain([chunk, 'access', _28 => _28.choices, 'optionalAccess', _29 => _29[0], 'optionalAccess', _30 => _30.delta, 'optionalAccess', _31 => _31.reasoning_content]) || "";
1239
+ const content2 = _optionalChain([chunk, 'access', _13 => _13.choices, 'optionalAccess', _14 => _14[0], 'optionalAccess', _15 => _15.delta, 'optionalAccess', _16 => _16.content]) || "";
1240
+ const reasoning_content = _optionalChain([chunk, 'access', _17 => _17.choices, 'optionalAccess', _18 => _18[0], 'optionalAccess', _19 => _19.delta, 'optionalAccess', _20 => _20.reasoning_content]) || "";
1377
1241
  if (chunk.usage) {
1378
1242
  usage = chunk.usage;
1379
1243
  }
@@ -1388,7 +1252,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1388
1252
  };
1389
1253
  options.onChunk(chunkData);
1390
1254
  }
1391
- if (_optionalChain([chunk, 'access', _32 => _32.choices, 'optionalAccess', _33 => _33[0], 'optionalAccess', _34 => _34.finish_reason])) {
1255
+ if (_optionalChain([chunk, 'access', _21 => _21.choices, 'optionalAccess', _22 => _22[0], 'optionalAccess', _23 => _23.finish_reason])) {
1392
1256
  timeCost = Date.now() - startTime;
1393
1257
  if (!usage) {
1394
1258
  const estimatedTokens = Math.max(
@@ -1430,7 +1294,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1430
1294
  });
1431
1295
  timeCost = Date.now() - startTime;
1432
1296
  debugProfileStats(
1433
- `model, ${model}, mode, ${_env.vlLocateMode.call(void 0, ) || "default"}, ui-tars-version, ${_env.uiTarsModelVersion.call(void 0, )}, prompt-tokens, ${_optionalChain([result, 'access', _35 => _35.usage, 'optionalAccess', _36 => _36.prompt_tokens]) || ""}, completion-tokens, ${_optionalChain([result, 'access', _37 => _37.usage, 'optionalAccess', _38 => _38.completion_tokens]) || ""}, total-tokens, ${_optionalChain([result, 'access', _39 => _39.usage, 'optionalAccess', _40 => _40.total_tokens]) || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
1297
+ `model, ${model}, mode, ${_env.vlLocateMode.call(void 0, ) || "default"}, ui-tars-version, ${_env.uiTarsModelVersion.call(void 0, )}, prompt-tokens, ${_optionalChain([result, 'access', _24 => _24.usage, 'optionalAccess', _25 => _25.prompt_tokens]) || ""}, completion-tokens, ${_optionalChain([result, 'access', _26 => _26.usage, 'optionalAccess', _27 => _27.completion_tokens]) || ""}, total-tokens, ${_optionalChain([result, 'access', _28 => _28.usage, 'optionalAccess', _29 => _29.total_tokens]) || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
1434
1298
  );
1435
1299
  debugProfileDetail(
1436
1300
  `model usage detail: ${JSON.stringify(result.usage)}`
@@ -1472,7 +1336,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1472
1336
  ...commonConfig
1473
1337
  });
1474
1338
  for await (const chunk of stream) {
1475
- const content2 = _optionalChain([chunk, 'access', _41 => _41.delta, 'optionalAccess', _42 => _42.text]) || "";
1339
+ const content2 = _optionalChain([chunk, 'access', _30 => _30.delta, 'optionalAccess', _31 => _31.text]) || "";
1476
1340
  if (content2) {
1477
1341
  accumulated += content2;
1478
1342
  const chunkData = {
@@ -1615,8 +1479,8 @@ function preprocessDoubaoBboxJson(input) {
1615
1479
  }
1616
1480
  function safeParseJson(input) {
1617
1481
  const cleanJsonString = extractJSONFromCodeBlock(input);
1618
- if (_optionalChain([cleanJsonString, 'optionalAccess', _43 => _43.match, 'call', _44 => _44(/\((\d+),(\d+)\)/)])) {
1619
- return _optionalChain([cleanJsonString, 'access', _45 => _45.match, 'call', _46 => _46(/\((\d+),(\d+)\)/), 'optionalAccess', _47 => _47.slice, 'call', _48 => _48(1), 'access', _49 => _49.map, 'call', _50 => _50(Number)]);
1482
+ if (_optionalChain([cleanJsonString, 'optionalAccess', _32 => _32.match, 'call', _33 => _33(/\((\d+),(\d+)\)/)])) {
1483
+ return _optionalChain([cleanJsonString, 'access', _34 => _34.match, 'call', _35 => _35(/\((\d+),(\d+)\)/), 'optionalAccess', _36 => _36.slice, 'call', _37 => _37(1), 'access', _38 => _38.map, 'call', _39 => _39(Number)]);
1620
1484
  }
1621
1485
  try {
1622
1486
  return JSON.parse(cleanJsonString);
@@ -1633,6 +1497,131 @@ function safeParseJson(input) {
1633
1497
  throw Error(`failed to parse json response: ${input}`);
1634
1498
  }
1635
1499
 
1500
+ // src/image/index.ts
1501
+
1502
+
1503
+
1504
+
1505
+
1506
+
1507
+
1508
+
1509
+
1510
+
1511
+ // src/ai-model/prompt/util.ts
1512
+
1513
+
1514
+
1515
+
1516
+
1517
+
1518
+
1519
+
1520
/**
 * Format a size object as a human-readable "<width> x <height>" string.
 *
 * @param {{ width: *, height: * }} size - Object exposing `width` and `height`.
 * @returns {string} The two dimensions joined by " x ".
 */
function describeSize(size) {
  const { width, height } = size;
  return `${width} x ${height}`;
}
1523
// Largest allowed distance between the queried position and the matched
// element's center when strict matching is requested.
// NOTE(review): units are presumably pixels — confirm against callers.
var distanceThreshold = 16;

/**
 * Find the smallest visible element whose rect contains `position`.
 *
 * The tree is walked depth-first. Every node whose `rect` contains the
 * position and whose `isVisible` flag is truthy is a candidate; the candidate
 * with the smallest `rect.width * rect.height` wins (the first one found wins
 * ties).
 *
 * @param {object} treeRoot - Root tree node of shape `{ node, children }`.
 *   Nullish nodes and nodes without `children` are skipped instead of throwing.
 * @param {{ x: number, y: number }} position - Point to look up (required).
 * @param {object} [options]
 * @param {boolean} [options.requireStrictDistance=true] - When true, the
 *   result is discarded unless its center lies within `distanceThreshold`
 *   of `position`.
 * @param {boolean} [options.filterPositionElements=false] - When true,
 *   elements whose `attributes.nodeType` equals `NodeType.POSITION` are ignored.
 * @returns {object|undefined} The matching element, or `undefined` if none.
 * @throws When `position` is undefined (via `_utils.assert`).
 */
function elementByPositionWithElementInfo(treeRoot, position, options) {
  const settings = options == null ? {} : options;
  const strictSetting = settings.requireStrictDistance;
  const requireStrictDistance = strictSetting == null ? true : strictSetting;
  const filterSetting = settings.filterPositionElements;
  const filterPositionElements = filterSetting == null ? false : filterSetting;
  _utils.assert.call(void 0, typeof position !== "undefined", "position is required for query");

  const matchingElements = [];

  // True when the point lies inside (or on the border of) the element's rect.
  function containsPosition(item) {
    const { left, top, width, height } = item.rect;
    return left <= position.x && position.x <= left + width && top <= position.y && position.y <= top + height;
  }

  // True when the element is a synthetic position node that must be ignored.
  function isFilteredPositionNode(item) {
    if (!filterPositionElements) {
      return false;
    }
    const nodeType = item.attributes == null ? void 0 : item.attributes.nodeType;
    return nodeType === _constants.NodeType.POSITION;
  }

  function visit(treeNode) {
    // Tolerate holes in the tree: the original guard only covered `.node`,
    // so a nullish node or a leaf without `children` crashed the walk.
    if (treeNode == null) {
      return;
    }
    const item = treeNode.node;
    if (item && containsPosition(item) && !isFilteredPositionNode(item) && item.isVisible) {
      matchingElements.push(item);
    }
    const children = treeNode.children;
    if (children != null) {
      for (const child of children) {
        visit(child);
      }
    }
  }

  visit(treeRoot);
  if (matchingElements.length === 0) {
    return void 0;
  }

  // Safe without an initial value: the list is known to be non-empty here.
  const element = matchingElements.reduce((smallest, current) => {
    const smallestArea = smallest.rect.width * smallest.rect.height;
    const currentArea = current.rect.width * current.rect.height;
    return currentArea < smallestArea ? current : smallest;
  });

  const distanceToCenter = distance(
    { x: element.center[0], y: element.center[1] },
    position
  );
  if (requireStrictDistance) {
    return distanceToCenter <= distanceThreshold ? element : void 0;
  }
  return element;
}
1560
/**
 * Euclidean distance between two 2D points.
 *
 * @param {{ x: number, y: number }} point1
 * @param {{ x: number, y: number }} point2
 * @returns {number} Straight-line distance between the two points.
 */
function distance(point1, point2) {
  const deltaX = point1.x - point2.x;
  const deltaY = point1.y - point2.y;
  return Math.sqrt(deltaX ** 2 + deltaY ** 2);
}
1563
/**
 * Build a textual description of the page plus element lookup helpers.
 *
 * @param {object} context - Page context. Reads `screenshotBase64`, `tree`
 *   (root node of shape `{ node, children }`) and, when present, `size`.
 *   Without `size`, the dimensions come from `imageInfoOfBase64` applied to
 *   the screenshot.
 * @param {object} [opt]
 * @param {boolean|string} [opt.domIncluded] - Any truthy value forces the
 *   element tree into the description; `"visible-only"` also sets the default
 *   for `visibleOnly`.
 * @param {boolean} [opt.visibleOnly] - Forwarded to `descriptionOfTree`.
 * @param {*} [opt.truncateTextLength] - Forwarded to `descriptionOfTree`.
 * @param {*} [opt.filterNonTextContent] - Forwarded to `descriptionOfTree`.
 * @returns {Promise<object>} `{ description, elementById, elementByPosition,
 *   insertElementByPosition, size }`. `description` is an empty string when
 *   the element tree is not included.
 */
async function describeUserPage(context, opt) {
  const { screenshotBase64 } = context;
  let width;
  let height;
  if (context.size) {
    ({ width, height } = context.size);
  } else {
    const imgSize = await _img.imageInfoOfBase64.call(void 0, screenshotBase64);
    ({ width, height } = imgSize);
  }

  const treeRoot = context.tree;
  // Prototype-less dictionary: with a plain `{}`, looking up ids such as
  // "constructor", "toString" or "__proto__" returned members of
  // Object.prototype instead of `undefined`.
  const idElementMap = Object.create(null);
  const flatElements = _extractor.treeToList.call(void 0, treeRoot);

  const domIncluded = opt == null ? void 0 : opt.domIncluded;
  if (domIncluded === true && flatElements.length >= 5e3) {
    console.warn(
      'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
    );
  }

  // Index every element by `id` and, when it has one, by its stringified `indexId`.
  for (const element of flatElements) {
    idElementMap[element.id] = element;
    if (typeof element.indexId !== "undefined") {
      idElementMap[`${element.indexId}`] = element;
    }
  }

  let pageDescription = "";
  const explicitVisibleOnly = opt == null ? void 0 : opt.visibleOnly;
  const visibleOnly = explicitVisibleOnly == null ? domIncluded === "visible-only" : explicitVisibleOnly;

  // The element tree is described when explicitly requested, or whenever
  // `vlLocateMode()` returns a falsy value.
  if (domIncluded || !_env.vlLocateMode.call(void 0, )) {
    const contentTree = await _extractor.descriptionOfTree.call(void 0,
      treeRoot,
      opt == null ? void 0 : opt.truncateTextLength,
      opt == null ? void 0 : opt.filterNonTextContent,
      visibleOnly
    );
    const sizeDescription = describeSize({ width, height });
    pageDescription = `The size of the page: ${sizeDescription}
The page elements tree:
${contentTree}`;
  }

  return {
    description: pageDescription,
    // Look an element up by `id` or `indexId`; returns `undefined` when unknown.
    elementById(idOrIndexId) {
      _utils.assert.call(void 0, typeof idOrIndexId !== "undefined", "id is required for query");
      return idElementMap[`${idOrIndexId}`];
    },
    // `size` is accepted but not used by the lookup.
    elementByPosition(position, size) {
      return elementByPositionWithElementInfo(treeRoot, position);
    },
    // Create an element for `position`, append it under the tree root and
    // register it in the flat list and the id index.
    insertElementByPosition(position) {
      const element = _extractor.generateElementByPosition.call(void 0, position);
      treeRoot.children.push({
        node: element,
        children: []
      });
      flatElements.push(element);
      idElementMap[element.id] = element;
      return element;
    },
    size: { width, height }
  };
}
1624
+
1636
1625
  // src/ai-model/prompt/playwright-generator.ts
1637
1626
 
1638
1627
 
@@ -2518,7 +2507,7 @@ async function plan(userInstruction, opts) {
2518
2507
  const { screenshotBase64, size } = context;
2519
2508
  const { description: pageDescription, elementById } = await describeUserPage(context);
2520
2509
  const systemPrompt = await systemPromptToTaskPlanning({
2521
- pageType: opts.pageType,
2510
+ actionSpace: opts.actionSpace,
2522
2511
  vlMode: _env.vlLocateMode.call(void 0, )
2523
2512
  });
2524
2513
  const taskBackgroundContextText = generateTaskBackgroundContext(
@@ -2920,6 +2909,6 @@ async function resizeImageForUiTars(imageBase64, size) {
2920
2909
 
2921
2910
 
2922
2911
 
2923
- exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2912
+ exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2924
2913
 
2925
- //# sourceMappingURL=chunk-I5LBWOQA.js.map
2914
+ //# sourceMappingURL=chunk-NY6RQSGJ.js.map