@midscene/core 0.25.4-beta-20250811115904.0 → 0.26.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (32)
  1. package/dist/es/ai-model.d.ts +3 -3
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-SR67R2OE.js → chunk-4CBFCRNS.js} +3 -3
  4. package/dist/es/{chunk-NY6RQSGJ.js → chunk-I5LBWOQA.js} +254 -243
  5. package/dist/es/chunk-I5LBWOQA.js.map +1 -0
  6. package/dist/es/index.d.ts +4 -4
  7. package/dist/es/index.js +2 -2
  8. package/dist/es/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
  9. package/dist/es/{types-16cd9f75.d.ts → types-b4a208c6.d.ts} +1 -10
  10. package/dist/es/utils.d.ts +1 -1
  11. package/dist/es/utils.js +1 -1
  12. package/dist/lib/ai-model.d.ts +3 -3
  13. package/dist/lib/ai-model.js +2 -2
  14. package/dist/lib/{chunk-SR67R2OE.js → chunk-4CBFCRNS.js} +3 -3
  15. package/dist/lib/{chunk-NY6RQSGJ.js → chunk-I5LBWOQA.js} +243 -232
  16. package/dist/lib/chunk-I5LBWOQA.js.map +1 -0
  17. package/dist/lib/index.d.ts +4 -4
  18. package/dist/lib/index.js +12 -12
  19. package/dist/lib/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
  20. package/dist/{types/types-16cd9f75.d.ts → lib/types-b4a208c6.d.ts} +1 -10
  21. package/dist/lib/utils.d.ts +1 -1
  22. package/dist/lib/utils.js +2 -2
  23. package/dist/types/ai-model.d.ts +3 -3
  24. package/dist/types/index.d.ts +4 -4
  25. package/dist/types/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
  26. package/dist/{lib/types-16cd9f75.d.ts → types/types-b4a208c6.d.ts} +1 -10
  27. package/dist/types/utils.d.ts +1 -1
  28. package/package.json +3 -3
  29. package/dist/es/chunk-NY6RQSGJ.js.map +0 -1
  30. package/dist/lib/chunk-NY6RQSGJ.js.map +0 -1
  31. /package/dist/es/{chunk-SR67R2OE.js.map → chunk-4CBFCRNS.js.map} +0 -0
  32. /package/dist/lib/{chunk-SR67R2OE.js.map → chunk-4CBFCRNS.js.map} +0 -0
@@ -633,73 +633,179 @@ Here is the item user want to find:
633
633
  });
634
634
 
635
635
  // src/ai-model/prompt/llm-planning.ts
636
- var _assert = require('assert'); var _assert2 = _interopRequireDefault(_assert);
637
636
 
638
- var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
639
- var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
640
- var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
641
- var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
642
- "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
643
- var vlLocateParam = (required) => `locate${required ? "" : "?"}: {bbox: [number, number, number, number], prompt: string }`;
644
- var llmLocateParam = (required) => `locate${required ? "" : "?"}: {"id": string, "prompt": string}`;
645
- var descriptionForAction = (action, locatorScheme) => {
646
- const tab = " ";
647
- let locateParam = "";
648
- if (action.location === "required") {
649
- locateParam = locatorScheme;
650
- } else if (action.location === "optional") {
651
- locateParam = `${locatorScheme} | null`;
652
- } else if (action.location === false) {
653
- locateParam = "";
654
- }
655
- const locatorParam = locateParam ? `- ${locateParam}` : "";
656
- if (action.whatToLocate) {
657
- if (!locateParam) {
658
- console.warn(
659
- `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
660
- );
661
- } else {
662
- locateParam += ` // ${action.whatToLocate}`;
637
+
638
+ // src/image/index.ts
639
+
640
+
641
+
642
+
643
+
644
+
645
+
646
+
647
+
648
+
649
+ // src/ai-model/prompt/util.ts
650
+
651
+
652
+
653
+
654
+
655
+
656
+
657
+
658
+ function describeSize(size) {
659
+ return `${size.width} x ${size.height}`;
660
+ }
661
+ var distanceThreshold = 16;
662
+ function elementByPositionWithElementInfo(treeRoot, position, options) {
663
+ const requireStrictDistance = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _10 => _10.requireStrictDistance]), () => ( true));
664
+ const filterPositionElements = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _11 => _11.filterPositionElements]), () => ( false));
665
+ _utils.assert.call(void 0, typeof position !== "undefined", "position is required for query");
666
+ const matchingElements = [];
667
+ function dfs(node) {
668
+ if (_optionalChain([node, 'optionalAccess', _12 => _12.node])) {
669
+ const item = node.node;
670
+ if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
671
+ if (!(filterPositionElements && _optionalChain([item, 'access', _13 => _13.attributes, 'optionalAccess', _14 => _14.nodeType]) === _constants.NodeType.POSITION) && item.isVisible) {
672
+ matchingElements.push(item);
673
+ }
674
+ }
675
+ }
676
+ for (const child of node.children) {
677
+ dfs(child);
663
678
  }
664
679
  }
665
- let paramSchema = "";
666
- if (action.paramSchema) {
667
- paramSchema = `- param: ${action.paramSchema}`;
680
+ dfs(treeRoot);
681
+ if (matchingElements.length === 0) {
682
+ return void 0;
668
683
  }
669
- if (action.paramDescription) {
670
- _assert2.default.call(void 0,
671
- paramSchema,
672
- `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
684
+ const element = matchingElements.reduce((smallest, current) => {
685
+ const smallestArea = smallest.rect.width * smallest.rect.height;
686
+ const currentArea = current.rect.width * current.rect.height;
687
+ return currentArea < smallestArea ? current : smallest;
688
+ });
689
+ const distanceToCenter = distance(
690
+ { x: element.center[0], y: element.center[1] },
691
+ position
692
+ );
693
+ if (requireStrictDistance) {
694
+ return distanceToCenter <= distanceThreshold ? element : void 0;
695
+ }
696
+ return element;
697
+ }
698
+ function distance(point1, point2) {
699
+ return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
700
+ }
701
+ var samplePageDescription = `
702
+ And the page is described as follows:
703
+ ====================
704
+ The size of the page: 1280 x 720
705
+ Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
706
+
707
+ Description of all the elements in screenshot:
708
+ <div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
709
+ <h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
710
+ The username is accepted
711
+ </h4>
712
+ ...many more
713
+ </div>
714
+ ====================
715
+ `;
716
+ async function describeUserPage(context, opt) {
717
+ const { screenshotBase64 } = context;
718
+ let width;
719
+ let height;
720
+ if (context.size) {
721
+ ({ width, height } = context.size);
722
+ } else {
723
+ const imgSize = await _img.imageInfoOfBase64.call(void 0, screenshotBase64);
724
+ ({ width, height } = imgSize);
725
+ }
726
+ const treeRoot = context.tree;
727
+ const idElementMap = {};
728
+ const flatElements = _extractor.treeToList.call(void 0, treeRoot);
729
+ if (_optionalChain([opt, 'optionalAccess', _15 => _15.domIncluded]) === true && flatElements.length >= 5e3) {
730
+ console.warn(
731
+ 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
673
732
  );
674
- paramSchema += ` // ${action.paramDescription}`;
675
- }
676
- const fields = [paramSchema, locatorParam].filter(Boolean);
677
- return `- ${action.name}, ${action.description}
678
- ${tab}- type: "${action.name}"
679
- ${tab}${fields.join(`
680
- ${tab}`)}
681
- `.trim();
682
- };
733
+ }
734
+ flatElements.forEach((element) => {
735
+ idElementMap[element.id] = element;
736
+ if (typeof element.indexId !== "undefined") {
737
+ idElementMap[`${element.indexId}`] = element;
738
+ }
739
+ });
740
+ let pageDescription = "";
741
+ const visibleOnly = _nullishCoalesce(_optionalChain([opt, 'optionalAccess', _16 => _16.visibleOnly]), () => ( _optionalChain([opt, 'optionalAccess', _17 => _17.domIncluded]) === "visible-only"));
742
+ if (_optionalChain([opt, 'optionalAccess', _18 => _18.domIncluded]) || !_env.vlLocateMode.call(void 0, )) {
743
+ const contentTree = await _extractor.descriptionOfTree.call(void 0,
744
+ treeRoot,
745
+ _optionalChain([opt, 'optionalAccess', _19 => _19.truncateTextLength]),
746
+ _optionalChain([opt, 'optionalAccess', _20 => _20.filterNonTextContent]),
747
+ visibleOnly
748
+ );
749
+ const sizeDescription = describeSize({ width, height });
750
+ pageDescription = `The size of the page: ${sizeDescription}
751
+ The page elements tree:
752
+ ${contentTree}`;
753
+ }
754
+ return {
755
+ description: pageDescription,
756
+ elementById(idOrIndexId) {
757
+ _utils.assert.call(void 0, typeof idOrIndexId !== "undefined", "id is required for query");
758
+ const item = idElementMap[`${idOrIndexId}`];
759
+ return item;
760
+ },
761
+ elementByPosition(position, size) {
762
+ return elementByPositionWithElementInfo(treeRoot, position);
763
+ },
764
+ insertElementByPosition(position) {
765
+ const element = _extractor.generateElementByPosition.call(void 0, position);
766
+ treeRoot.children.push({
767
+ node: element,
768
+ children: []
769
+ });
770
+ flatElements.push(element);
771
+ idElementMap[element.id] = element;
772
+ return element;
773
+ },
774
+ size: { width, height }
775
+ };
776
+ }
777
+
778
+ // src/ai-model/prompt/llm-planning.ts
779
+ var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
780
+ var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
781
+ var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
782
+ var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
783
+ "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
784
+ var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
683
785
  var systemTemplateOfVLPlanning = ({
684
- actionSpace,
786
+ pageType,
685
787
  vlMode
686
- }) => {
687
- const actionNameList = actionSpace.map((action) => action.name).join(", ");
688
- const actionDescriptionList = actionSpace.map(
689
- (action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
690
- );
691
- const actionList = actionDescriptionList.join("\n");
692
- return `
788
+ }) => `
693
789
  Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
694
790
 
695
791
  Restriction:
696
792
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
697
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
793
+ - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
698
794
  - Don't repeat actions in the previous logs.
699
795
  - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
700
796
 
701
797
  Supporting actions:
702
- ${actionList}
798
+ - Tap: { type: "Tap", ${vlLocateParam} }
799
+ - RightClick: { type: "RightClick", ${vlLocateParam} }
800
+ - Hover: { type: "Hover", ${vlLocateParam} }
801
+ - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
802
+ - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
803
+ - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
804
+ ${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
805
+ - AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
806
+ - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
807
+ - AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
808
+ - AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
703
809
 
704
810
  Field description:
705
811
  * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@@ -734,19 +840,8 @@ this and output the JSON:
734
840
  }
735
841
  }
736
842
  `;
737
- };
738
- var systemTemplateOfLLM = ({
739
- actionSpace
740
- }) => {
741
- const actionNameList = actionSpace.map((action) => action.name).join(" / ");
742
- const actionDescriptionList = actionSpace.map(
743
- (action) => descriptionForAction(
744
- action,
745
- llmLocateParam(action.location === "required")
746
- )
747
- );
748
- const actionList = actionDescriptionList.join("\n");
749
- return `
843
+ var llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
844
+ var systemTemplateOfLLM = ({ pageType }) => `
750
845
  ## Role
751
846
 
752
847
  You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@@ -760,7 +855,7 @@ You are a versatile professional in software UI automation. Your outstanding con
760
855
  ## Workflow
761
856
 
762
857
  1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
763
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
858
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
764
859
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
765
860
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
766
861
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -778,30 +873,65 @@ You are a versatile professional in software UI automation. Your outstanding con
778
873
 
779
874
  The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
780
875
 
781
- type LocateParam = {
876
+ type LocateParam = {{
782
877
  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
783
878
  "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
784
- } | null // If it's not on the page, the LocateParam should be null
879
+ }} | null // If it's not on the page, the LocateParam should be null
785
880
 
786
881
  ## Supported actions
787
882
 
788
883
  Each action has a \`type\` and corresponding \`param\`. To be detailed:
789
- ${actionList}
790
-
791
- `.trim();
792
- };
884
+ - type: 'Tap'
885
+ * {{ ${llmLocateParam} }}
886
+ - type: 'RightClick'
887
+ * {{ ${llmLocateParam} }}
888
+ - type: 'Hover'
889
+ * {{ ${llmLocateParam} }}
890
+ - type: 'Input', replace the value in the input field
891
+ * {{ ${llmLocateParam}, param: {{ value: string }} }}
892
+ * \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
893
+ - type: 'KeyboardPress', press a key
894
+ * {{ param: {{ value: string }} }}
895
+ - type: 'Scroll', scroll up or down.
896
+ * {{
897
+ ${llmLocateParam},
898
+ param: {{
899
+ direction: 'down'(default) | 'up' | 'right' | 'left',
900
+ scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
901
+ distance: null | number
902
+ }}
903
+ }}
904
+ * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
905
+ * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
906
+ * {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
907
+ - type: 'ExpectedFalsyCondition'
908
+ * {{ param: {{ reason: string }} }}
909
+ * use this action when the conditional statement talked about in the instruction is falsy.
910
+ - type: 'Sleep'
911
+ * {{ param: {{ timeMs: number }} }}
912
+ ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
913
+ * {{ param: {{}} }}
914
+ - type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
915
+ * {{ param: {{}} }}
916
+ - type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
917
+ * {{ param: {{}} }}
918
+ - type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
919
+ * {{ param: {{ x: number, y: number, duration?: number }} }}
920
+ - type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
921
+ * {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
922
+ `;
793
923
  var outputTemplate = `
794
924
  ## Output JSON Format:
795
925
 
796
926
  The JSON format is as follows:
797
927
 
798
- {
928
+ {{
799
929
  "actions": [
800
930
  // ... some actions
801
931
  ],
802
932
  ${llmCurrentLog}
803
933
  ${commonOutputFields}
804
- }
934
+ }}
805
935
 
806
936
  ## Examples
807
937
 
@@ -817,62 +947,68 @@ By viewing the page screenshot and description, you should consider this and out
817
947
  * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
818
948
  * The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.
819
949
 
820
- {
950
+ {{
821
951
  "actions":[
822
- {
952
+ {{
823
953
  "type": "Tap",
824
954
  "thought": "Click the language switch button to open the language options.",
825
955
  "param": null,
826
- "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
827
- },
828
- {
956
+ "locate": {{ id: "c81c4e9a33", prompt: "The language switch button" }},
957
+ }},
958
+ {{
829
959
  "type": "Sleep",
830
960
  "thought": "Wait for 1 second to ensure the language options are displayed.",
831
- "param": { "timeMs": 1000 },
832
- }
961
+ "param": {{ "timeMs": 1000 }},
962
+ }}
833
963
  ],
834
964
  "error": null,
835
965
  "more_actions_needed_by_instruction": true,
836
966
  "log": "Click the language switch button to open the language options. Wait for 1 second",
837
- }
967
+ }}
838
968
 
839
969
  ### Example: What NOT to do
840
970
  Wrong output:
841
- {
971
+ {{
842
972
  "actions":[
843
- {
973
+ {{
844
974
  "type": "Tap",
845
975
  "thought": "Click the language switch button to open the language options.",
846
976
  "param": null,
847
- "locate": {
848
- { "id": "c81c4e9a33" }, // WRONG: prompt is missing
849
- }
850
- },
851
- {
977
+ "locate": {{
978
+ {{ "id": "c81c4e9a33" }}, // WRONG: prompt is missing
979
+ }}
980
+ }},
981
+ {{
852
982
  "type": "Tap",
853
983
  "thought": "Click the English option",
854
984
  "param": null,
855
985
  "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
856
- }
986
+ }}
857
987
  ],
858
988
  "more_actions_needed_by_instruction": false, // WRONG: should be true
859
989
  "log": "Click the language switch button to open the language options",
860
- }
990
+ }}
861
991
 
862
992
  Reason:
863
993
  * The \`prompt\` is missing in the first 'Locate' action
864
994
  * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
865
995
  `;
866
996
  async function systemPromptToTaskPlanning({
867
- actionSpace,
997
+ pageType,
868
998
  vlMode
869
999
  }) {
870
1000
  if (vlMode) {
871
- return systemTemplateOfVLPlanning({ actionSpace, vlMode });
1001
+ return systemTemplateOfVLPlanning({ pageType, vlMode });
872
1002
  }
873
- return `${systemTemplateOfLLM({ actionSpace })}
1003
+ const promptTemplate = new (0, _prompts.PromptTemplate)({
1004
+ template: `${systemTemplateOfLLM({ pageType })}
874
1005
 
875
- ${outputTemplate}`;
1006
+ ${outputTemplate}`,
1007
+ inputVariables: ["pageDescription"]
1008
+ });
1009
+ return await promptTemplate.format({
1010
+ pageDescription: samplePageDescription
1011
+ });
876
1012
  }
877
1013
  var planSchema = {
878
1014
  type: "json_schema",
@@ -1154,7 +1290,7 @@ Please check your config.`
1154
1290
  httpAgent: proxyAgent,
1155
1291
  ...extraConfig,
1156
1292
  defaultHeaders: {
1157
- ..._optionalChain([extraConfig, 'optionalAccess', _10 => _10.defaultHeaders]) || {},
1293
+ ..._optionalChain([extraConfig, 'optionalAccess', _21 => _21.defaultHeaders]) || {},
1158
1294
  [_env.MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
1159
1295
  },
1160
1296
  dangerouslyAllowBrowser: true
@@ -1205,7 +1341,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1205
1341
  const debugProfileDetail = _logger.getDebug.call(void 0, "ai:profile:detail");
1206
1342
  const startTime = Date.now();
1207
1343
  const model = getModelName();
1208
- const isStreaming = _optionalChain([options, 'optionalAccess', _11 => _11.stream]) && _optionalChain([options, 'optionalAccess', _12 => _12.onChunk]);
1344
+ const isStreaming = _optionalChain([options, 'optionalAccess', _22 => _22.stream]) && _optionalChain([options, 'optionalAccess', _23 => _23.onChunk]);
1209
1345
  let content;
1210
1346
  let accumulated = "";
1211
1347
  let usage;
@@ -1236,8 +1372,8 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1236
1372
  }
1237
1373
  );
1238
1374
  for await (const chunk of stream) {
1239
- const content2 = _optionalChain([chunk, 'access', _13 => _13.choices, 'optionalAccess', _14 => _14[0], 'optionalAccess', _15 => _15.delta, 'optionalAccess', _16 => _16.content]) || "";
1240
- const reasoning_content = _optionalChain([chunk, 'access', _17 => _17.choices, 'optionalAccess', _18 => _18[0], 'optionalAccess', _19 => _19.delta, 'optionalAccess', _20 => _20.reasoning_content]) || "";
1375
+ const content2 = _optionalChain([chunk, 'access', _24 => _24.choices, 'optionalAccess', _25 => _25[0], 'optionalAccess', _26 => _26.delta, 'optionalAccess', _27 => _27.content]) || "";
1376
+ const reasoning_content = _optionalChain([chunk, 'access', _28 => _28.choices, 'optionalAccess', _29 => _29[0], 'optionalAccess', _30 => _30.delta, 'optionalAccess', _31 => _31.reasoning_content]) || "";
1241
1377
  if (chunk.usage) {
1242
1378
  usage = chunk.usage;
1243
1379
  }
@@ -1252,7 +1388,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1252
1388
  };
1253
1389
  options.onChunk(chunkData);
1254
1390
  }
1255
- if (_optionalChain([chunk, 'access', _21 => _21.choices, 'optionalAccess', _22 => _22[0], 'optionalAccess', _23 => _23.finish_reason])) {
1391
+ if (_optionalChain([chunk, 'access', _32 => _32.choices, 'optionalAccess', _33 => _33[0], 'optionalAccess', _34 => _34.finish_reason])) {
1256
1392
  timeCost = Date.now() - startTime;
1257
1393
  if (!usage) {
1258
1394
  const estimatedTokens = Math.max(
@@ -1294,7 +1430,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1294
1430
  });
1295
1431
  timeCost = Date.now() - startTime;
1296
1432
  debugProfileStats(
1297
- `model, ${model}, mode, ${_env.vlLocateMode.call(void 0, ) || "default"}, ui-tars-version, ${_env.uiTarsModelVersion.call(void 0, )}, prompt-tokens, ${_optionalChain([result, 'access', _24 => _24.usage, 'optionalAccess', _25 => _25.prompt_tokens]) || ""}, completion-tokens, ${_optionalChain([result, 'access', _26 => _26.usage, 'optionalAccess', _27 => _27.completion_tokens]) || ""}, total-tokens, ${_optionalChain([result, 'access', _28 => _28.usage, 'optionalAccess', _29 => _29.total_tokens]) || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
1433
+ `model, ${model}, mode, ${_env.vlLocateMode.call(void 0, ) || "default"}, ui-tars-version, ${_env.uiTarsModelVersion.call(void 0, )}, prompt-tokens, ${_optionalChain([result, 'access', _35 => _35.usage, 'optionalAccess', _36 => _36.prompt_tokens]) || ""}, completion-tokens, ${_optionalChain([result, 'access', _37 => _37.usage, 'optionalAccess', _38 => _38.completion_tokens]) || ""}, total-tokens, ${_optionalChain([result, 'access', _39 => _39.usage, 'optionalAccess', _40 => _40.total_tokens]) || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
1298
1434
  );
1299
1435
  debugProfileDetail(
1300
1436
  `model usage detail: ${JSON.stringify(result.usage)}`
@@ -1336,7 +1472,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
1336
1472
  ...commonConfig
1337
1473
  });
1338
1474
  for await (const chunk of stream) {
1339
- const content2 = _optionalChain([chunk, 'access', _30 => _30.delta, 'optionalAccess', _31 => _31.text]) || "";
1475
+ const content2 = _optionalChain([chunk, 'access', _41 => _41.delta, 'optionalAccess', _42 => _42.text]) || "";
1340
1476
  if (content2) {
1341
1477
  accumulated += content2;
1342
1478
  const chunkData = {
@@ -1479,8 +1615,8 @@ function preprocessDoubaoBboxJson(input) {
1479
1615
  }
1480
1616
  function safeParseJson(input) {
1481
1617
  const cleanJsonString = extractJSONFromCodeBlock(input);
1482
- if (_optionalChain([cleanJsonString, 'optionalAccess', _32 => _32.match, 'call', _33 => _33(/\((\d+),(\d+)\)/)])) {
1483
- return _optionalChain([cleanJsonString, 'access', _34 => _34.match, 'call', _35 => _35(/\((\d+),(\d+)\)/), 'optionalAccess', _36 => _36.slice, 'call', _37 => _37(1), 'access', _38 => _38.map, 'call', _39 => _39(Number)]);
1618
+ if (_optionalChain([cleanJsonString, 'optionalAccess', _43 => _43.match, 'call', _44 => _44(/\((\d+),(\d+)\)/)])) {
1619
+ return _optionalChain([cleanJsonString, 'access', _45 => _45.match, 'call', _46 => _46(/\((\d+),(\d+)\)/), 'optionalAccess', _47 => _47.slice, 'call', _48 => _48(1), 'access', _49 => _49.map, 'call', _50 => _50(Number)]);
1484
1620
  }
1485
1621
  try {
1486
1622
  return JSON.parse(cleanJsonString);
@@ -1497,131 +1633,6 @@ function safeParseJson(input) {
1497
1633
  throw Error(`failed to parse json response: ${input}`);
1498
1634
  }
1499
1635
 
1500
- // src/image/index.ts
1501
-
1502
-
1503
-
1504
-
1505
-
1506
-
1507
-
1508
-
1509
-
1510
-
1511
- // src/ai-model/prompt/util.ts
1512
-
1513
-
1514
-
1515
-
1516
-
1517
-
1518
-
1519
-
1520
- function describeSize(size) {
1521
- return `${size.width} x ${size.height}`;
1522
- }
1523
- var distanceThreshold = 16;
1524
- function elementByPositionWithElementInfo(treeRoot, position, options) {
1525
- const requireStrictDistance = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _40 => _40.requireStrictDistance]), () => ( true));
1526
- const filterPositionElements = _nullishCoalesce(_optionalChain([options, 'optionalAccess', _41 => _41.filterPositionElements]), () => ( false));
1527
- _utils.assert.call(void 0, typeof position !== "undefined", "position is required for query");
1528
- const matchingElements = [];
1529
- function dfs(node) {
1530
- if (_optionalChain([node, 'optionalAccess', _42 => _42.node])) {
1531
- const item = node.node;
1532
- if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
1533
- if (!(filterPositionElements && _optionalChain([item, 'access', _43 => _43.attributes, 'optionalAccess', _44 => _44.nodeType]) === _constants.NodeType.POSITION) && item.isVisible) {
1534
- matchingElements.push(item);
1535
- }
1536
- }
1537
- }
1538
- for (const child of node.children) {
1539
- dfs(child);
1540
- }
1541
- }
1542
- dfs(treeRoot);
1543
- if (matchingElements.length === 0) {
1544
- return void 0;
1545
- }
1546
- const element = matchingElements.reduce((smallest, current) => {
1547
- const smallestArea = smallest.rect.width * smallest.rect.height;
1548
- const currentArea = current.rect.width * current.rect.height;
1549
- return currentArea < smallestArea ? current : smallest;
1550
- });
1551
- const distanceToCenter = distance(
1552
- { x: element.center[0], y: element.center[1] },
1553
- position
1554
- );
1555
- if (requireStrictDistance) {
1556
- return distanceToCenter <= distanceThreshold ? element : void 0;
1557
- }
1558
- return element;
1559
- }
1560
- function distance(point1, point2) {
1561
- return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
1562
- }
1563
- async function describeUserPage(context, opt) {
1564
- const { screenshotBase64 } = context;
1565
- let width;
1566
- let height;
1567
- if (context.size) {
1568
- ({ width, height } = context.size);
1569
- } else {
1570
- const imgSize = await _img.imageInfoOfBase64.call(void 0, screenshotBase64);
1571
- ({ width, height } = imgSize);
1572
- }
1573
- const treeRoot = context.tree;
1574
- const idElementMap = {};
1575
- const flatElements = _extractor.treeToList.call(void 0, treeRoot);
1576
- if (_optionalChain([opt, 'optionalAccess', _45 => _45.domIncluded]) === true && flatElements.length >= 5e3) {
1577
- console.warn(
1578
- 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
1579
- );
1580
- }
1581
- flatElements.forEach((element) => {
1582
- idElementMap[element.id] = element;
1583
- if (typeof element.indexId !== "undefined") {
1584
- idElementMap[`${element.indexId}`] = element;
1585
- }
1586
- });
1587
- let pageDescription = "";
1588
- const visibleOnly = _nullishCoalesce(_optionalChain([opt, 'optionalAccess', _46 => _46.visibleOnly]), () => ( _optionalChain([opt, 'optionalAccess', _47 => _47.domIncluded]) === "visible-only"));
1589
- if (_optionalChain([opt, 'optionalAccess', _48 => _48.domIncluded]) || !_env.vlLocateMode.call(void 0, )) {
1590
- const contentTree = await _extractor.descriptionOfTree.call(void 0,
1591
- treeRoot,
1592
- _optionalChain([opt, 'optionalAccess', _49 => _49.truncateTextLength]),
1593
- _optionalChain([opt, 'optionalAccess', _50 => _50.filterNonTextContent]),
1594
- visibleOnly
1595
- );
1596
- const sizeDescription = describeSize({ width, height });
1597
- pageDescription = `The size of the page: ${sizeDescription}
1598
- The page elements tree:
1599
- ${contentTree}`;
1600
- }
1601
- return {
1602
- description: pageDescription,
1603
- elementById(idOrIndexId) {
1604
- _utils.assert.call(void 0, typeof idOrIndexId !== "undefined", "id is required for query");
1605
- const item = idElementMap[`${idOrIndexId}`];
1606
- return item;
1607
- },
1608
- elementByPosition(position, size) {
1609
- return elementByPositionWithElementInfo(treeRoot, position);
1610
- },
1611
- insertElementByPosition(position) {
1612
- const element = _extractor.generateElementByPosition.call(void 0, position);
1613
- treeRoot.children.push({
1614
- node: element,
1615
- children: []
1616
- });
1617
- flatElements.push(element);
1618
- idElementMap[element.id] = element;
1619
- return element;
1620
- },
1621
- size: { width, height }
1622
- };
1623
- }
1624
-
1625
1636
  // src/ai-model/prompt/playwright-generator.ts
1626
1637
 
1627
1638
 
@@ -2507,7 +2518,7 @@ async function plan(userInstruction, opts) {
2507
2518
  const { screenshotBase64, size } = context;
2508
2519
  const { description: pageDescription, elementById } = await describeUserPage(context);
2509
2520
  const systemPrompt = await systemPromptToTaskPlanning({
2510
- actionSpace: opts.actionSpace,
2521
+ pageType: opts.pageType,
2511
2522
  vlMode: _env.vlLocateMode.call(void 0, )
2512
2523
  });
2513
2524
  const taskBackgroundContextText = generateTaskBackgroundContext(
@@ -2909,6 +2920,6 @@ async function resizeImageForUiTars(imageBase64, size) {
2909
2920
 
2910
2921
 
2911
2922
 
2912
- exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2923
+ exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
2913
2924
 
2914
- //# sourceMappingURL=chunk-NY6RQSGJ.js.map
2925
+ //# sourceMappingURL=chunk-I5LBWOQA.js.map