@midscene/core 0.25.4-beta-20250807062119.0 → 0.25.4-beta-20250811113343.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/dist/es/ai-model.d.ts +7 -6
  2. package/dist/es/ai-model.js +1 -1
  3. package/dist/es/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +374 -598
  4. package/dist/es/chunk-5IZMFZPA.js.map +1 -0
  5. package/dist/es/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
  6. package/dist/es/index.d.ts +6 -6
  7. package/dist/es/index.js +4 -5
  8. package/dist/es/index.js.map +1 -1
  9. package/dist/es/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  10. package/dist/es/{types-7435eba0.d.ts → types-16cd9f75.d.ts} +11 -8
  11. package/dist/es/utils.d.ts +1 -1
  12. package/dist/es/utils.js +1 -1
  13. package/dist/lib/ai-model.d.ts +7 -6
  14. package/dist/lib/ai-model.js +2 -2
  15. package/dist/lib/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +367 -591
  16. package/dist/lib/chunk-5IZMFZPA.js.map +1 -0
  17. package/dist/lib/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
  18. package/dist/lib/index.d.ts +6 -6
  19. package/dist/lib/index.js +14 -15
  20. package/dist/lib/index.js.map +1 -1
  21. package/dist/lib/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  22. package/dist/{types/types-7435eba0.d.ts → lib/types-16cd9f75.d.ts} +11 -8
  23. package/dist/lib/utils.d.ts +1 -1
  24. package/dist/lib/utils.js +2 -2
  25. package/dist/types/ai-model.d.ts +7 -6
  26. package/dist/types/index.d.ts +6 -6
  27. package/dist/types/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
  28. package/dist/{lib/types-7435eba0.d.ts → types/types-16cd9f75.d.ts} +11 -8
  29. package/dist/types/utils.d.ts +1 -1
  30. package/package.json +3 -3
  31. package/dist/es/chunk-G2JTYWI6.js.map +0 -1
  32. package/dist/lib/chunk-G2JTYWI6.js.map +0 -1
  33. /package/dist/es/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
  34. /package/dist/lib/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
@@ -5,16 +5,35 @@ import {
  getBearerTokenProvider
  } from "@azure/identity";
  import {
+ ANTHROPIC_API_KEY,
+ AZURE_OPENAI_API_VERSION,
+ AZURE_OPENAI_DEPLOYMENT,
+ AZURE_OPENAI_ENDPOINT,
+ AZURE_OPENAI_KEY,
  MIDSCENE_API_TYPE,
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_AZURE_OPENAI_SCOPE,
+ MIDSCENE_DEBUG_AI_PROFILE,
+ MIDSCENE_DEBUG_AI_RESPONSE,
  MIDSCENE_LANGSMITH_DEBUG,
+ MIDSCENE_MODEL_NAME,
+ MIDSCENE_OPENAI_HTTP_PROXY,
+ MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+ MIDSCENE_OPENAI_SOCKS_PROXY,
+ MIDSCENE_USE_ANTHROPIC_SDK,
+ MIDSCENE_USE_AZURE_OPENAI,
+ OPENAI_API_KEY,
+ OPENAI_BASE_URL,
  OPENAI_MAX_TOKENS,
- getAIConfig as getAIConfig2,
- getAIConfigInBoolean as getAIConfigInBoolean2,
+ OPENAI_USE_AZURE,
+ getAIConfig,
+ getAIConfigInBoolean,
+ getAIConfigInJson,
  uiTarsModelVersion,
- vlLocateMode as vlLocateMode3
+ vlLocateMode as vlLocateMode2
  } from "@midscene/shared/env";
- import { getDebug as getDebug3 } from "@midscene/shared/logger";
- import { assert as assert4 } from "@midscene/shared/utils";
+ import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
+ import { assert as assert3 } from "@midscene/shared/utils";
  import { ifInBrowser } from "@midscene/shared/utils";
  import { HttpsProxyAgent } from "https-proxy-agent";
  import { jsonrepair } from "jsonrepair";
@@ -36,11 +55,10 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
  AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
  return AIActionType2;
  })(AIActionType || {});
- async function callAiFn(msgs, AIActionTypeValue, modelPreferences) {
+ async function callAiFn(msgs, AIActionTypeValue) {
  const { content, usage } = await callToGetJSONObject(
  msgs,
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  );
  return { content, usage };
  }
@@ -615,179 +633,80 @@ Here is the item user want to find:
  });

  // src/ai-model/prompt/llm-planning.ts
+ import assert2 from "assert";
  import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
-
- // src/image/index.ts
- import {
- imageInfo,
- imageInfoOfBase64,
- localImg2Base64,
- httpImg2Base64,
- resizeImg,
- saveBase64Image,
- zoomForGPT4o
- } from "@midscene/shared/img";
-
- // src/ai-model/prompt/util.ts
- import { NodeType as NodeType2 } from "@midscene/shared/constants";
- import { vlLocateMode as vlLocateMode2 } from "@midscene/shared/env";
- import {
- descriptionOfTree,
- generateElementByPosition,
- treeToList as treeToList2
- } from "@midscene/shared/extractor";
- import { assert as assert2 } from "@midscene/shared/utils";
- function describeSize(size) {
- return `${size.width} x ${size.height}`;
- }
- var distanceThreshold = 16;
- function elementByPositionWithElementInfo(treeRoot, position, options) {
- const requireStrictDistance = options?.requireStrictDistance ?? true;
- const filterPositionElements = options?.filterPositionElements ?? false;
- assert2(typeof position !== "undefined", "position is required for query");
- const matchingElements = [];
- function dfs(node) {
- if (node?.node) {
- const item = node.node;
- if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
- if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
- matchingElements.push(item);
- }
- }
- }
- for (const child of node.children) {
- dfs(child);
- }
- }
- dfs(treeRoot);
- if (matchingElements.length === 0) {
- return void 0;
- }
- const element = matchingElements.reduce((smallest, current) => {
- const smallestArea = smallest.rect.width * smallest.rect.height;
- const currentArea = current.rect.width * current.rect.height;
- return currentArea < smallestArea ? current : smallest;
- });
- const distanceToCenter = distance(
- { x: element.center[0], y: element.center[1] },
- position
- );
- if (requireStrictDistance) {
- return distanceToCenter <= distanceThreshold ? element : void 0;
- }
- return element;
- }
- function distance(point1, point2) {
- return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
- }
- var samplePageDescription = `
- And the page is described as follows:
- ====================
- The size of the page: 1280 x 720
- Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
-
- Description of all the elements in screenshot:
- <div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
- <h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
- The username is accepted
- </h4>
- ...many more
- </div>
- ====================
- `;
- async function describeUserPage(context, opt) {
- const { screenshotBase64 } = context;
- let width;
- let height;
- if (context.size) {
- ({ width, height } = context.size);
- } else {
- const imgSize = await imageInfoOfBase64(screenshotBase64);
- ({ width, height } = imgSize);
- }
- const treeRoot = context.tree;
- const idElementMap = {};
- const flatElements = treeToList2(treeRoot);
- if (opt?.domIncluded === true && flatElements.length >= 5e3) {
- console.warn(
- 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
- );
- }
- flatElements.forEach((element) => {
- idElementMap[element.id] = element;
- if (typeof element.indexId !== "undefined") {
- idElementMap[`${element.indexId}`] = element;
- }
- });
- let pageDescription = "";
- const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
- if (opt?.domIncluded || !vlLocateMode2()) {
- const contentTree = await descriptionOfTree(
- treeRoot,
- opt?.truncateTextLength,
- opt?.filterNonTextContent,
- visibleOnly
- );
- const sizeDescription = describeSize({ width, height });
- pageDescription = `The size of the page: ${sizeDescription}
- The page elements tree:
- ${contentTree}`;
- }
- return {
- description: pageDescription,
- elementById(idOrIndexId) {
- assert2(typeof idOrIndexId !== "undefined", "id is required for query");
- const item = idElementMap[`${idOrIndexId}`];
- return item;
- },
- elementByPosition(position, size) {
- return elementByPositionWithElementInfo(treeRoot, position);
- },
- insertElementByPosition(position) {
- const element = generateElementByPosition(position);
- treeRoot.children.push({
- node: element,
- children: []
- });
- flatElements.push(element);
- idElementMap[element.id] = element;
- return element;
- },
- size: { width, height }
- };
- }
-
- // src/ai-model/prompt/llm-planning.ts
  var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
- var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
- var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
+ var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
+ var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
  var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
  "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
  var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
+ var llmLocateParam = `locate: {"id": string, "prompt": string}`;
+ var descriptionForAction = (action, locatorScheme) => {
+ const tab = " ";
+ let locateParam = "";
+ if (action.location === "required") {
+ locateParam = locatorScheme;
+ } else if (action.location === "optional") {
+ locateParam = `${locatorScheme} | null`;
+ } else if (action.location === false) {
+ locateParam = "";
+ }
+ const locatorParam = locateParam ? `${tab}- ${locateParam}` : "";
+ let whatToLocate = "";
+ if (action.whatToLocate) {
+ if (!locateParam) {
+ console.warn(
+ `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
+ );
+ } else {
+ whatToLocate = `${tab}- whatToLocate: ${action.whatToLocate}`;
+ }
+ }
+ let paramSchema = "";
+ if (action.paramSchema) {
+ paramSchema = `${tab}- paramSchema: ${action.paramSchema}`;
+ }
+ let paramDescription = "";
+ if (action.paramDescription) {
+ assert2(
+ paramSchema,
+ `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
+ );
+ paramDescription = `${tab}- paramDescription: ${action.paramDescription}`;
+ }
+ const fields = [
+ paramSchema,
+ paramDescription,
+ locatorParam,
+ whatToLocate
+ ].filter(Boolean);
+ return `- ${action.name}
+ - type: "${action.name}"
+ - description: ${action.description}
+ ${fields.join("\n")}
+ `.trim();
+ };
  var systemTemplateOfVLPlanning = ({
- pageType,
+ actionSpace,
  vlMode
- }) => `
+ }) => {
+ const actionNameList = actionSpace.map((action) => action.name).join(", ");
+ const actionDescriptionList = actionSpace.map(
+ (action) => descriptionForAction(action, vlLocateParam)
+ );
+ const actionList = actionDescriptionList.join("\n");
+ return `
  Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.

  Restriction:
  - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
- - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
+ - Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
  - Don't repeat actions in the previous logs.
  - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.

  Supporting actions:
- - Tap: { type: "Tap", ${vlLocateParam} }
- - RightClick: { type: "RightClick", ${vlLocateParam} }
- - Hover: { type: "Hover", ${vlLocateParam} }
- - Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
- - KeyboardPress: { type: "KeyboardPress", param: { value: string } }
- - Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
- ${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
- - AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
- - AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
- - AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
- - AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
+ ${actionList}

  Field description:
  * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
@@ -822,8 +741,16 @@ this and output the JSON:
  }
  }
  `;
- var llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
- var systemTemplateOfLLM = ({ pageType }) => `
+ };
+ var systemTemplateOfLLM = ({
+ actionSpace
+ }) => {
+ const actionNameList = actionSpace.map((action) => action.name).join(" / ");
+ const actionDescriptionList = actionSpace.map(
+ (action) => descriptionForAction(action, llmLocateParam)
+ );
+ const actionList = actionDescriptionList.join("\n");
+ return `
  ## Role

  You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@@ -837,7 +764,7 @@ You are a versatile professional in software UI automation. Your outstanding con
  ## Workflow

  1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
- 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
+ 2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
  3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
  4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
  5. Consider whether the user's instruction will be accomplished after all the actions
@@ -855,65 +782,30 @@ You are a versatile professional in software UI automation. Your outstanding con

  The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:

- type LocateParam = {{
+ type LocateParam = {
  "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
  "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
- }} | null // If it's not on the page, the LocateParam should be null
+ } | null // If it's not on the page, the LocateParam should be null

  ## Supported actions

  Each action has a \`type\` and corresponding \`param\`. To be detailed:
- - type: 'Tap'
- * {{ ${llmLocateParam} }}
- - type: 'RightClick'
- * {{ ${llmLocateParam} }}
- - type: 'Hover'
- * {{ ${llmLocateParam} }}
- - type: 'Input', replace the value in the input field
- * {{ ${llmLocateParam}, param: {{ value: string }} }}
- * \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
- - type: 'KeyboardPress', press a key
- * {{ param: {{ value: string }} }}
- - type: 'Scroll', scroll up or down.
- * {{
- ${llmLocateParam},
- param: {{
- direction: 'down'(default) | 'up' | 'right' | 'left',
- scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
- distance: null | number
- }}
- }}
- * To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
- * \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
- * {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
- - type: 'ExpectedFalsyCondition'
- * {{ param: {{ reason: string }} }}
- * use this action when the conditional statement talked about in the instruction is falsy.
- - type: 'Sleep'
- * {{ param: {{ timeMs: number }} }}
- ${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
- * {{ param: {{}} }}
- - type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
- * {{ param: {{ x: number, y: number, duration?: number }} }}
- - type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
- * {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
- `;
+ ${actionList}
+
+ `.trim();
+ };
  var outputTemplate = `
  ## Output JSON Format:

  The JSON format is as follows:

- {{
+ {
  "actions": [
  // ... some actions
  ],
  ${llmCurrentLog}
  ${commonOutputFields}
- }}
+ }

  ## Examples

@@ -929,68 +821,62 @@ By viewing the page screenshot and description, you should consider this and out
  * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
  * The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.

- {{
+ {
  "actions":[
- {{
+ {
  "type": "Tap",
  "thought": "Click the language switch button to open the language options.",
  "param": null,
- "locate": {{ id: "c81c4e9a33", prompt: "The language switch button" }},
- }},
- {{
+ "locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
+ },
+ {
  "type": "Sleep",
  "thought": "Wait for 1 second to ensure the language options are displayed.",
- "param": {{ "timeMs": 1000 }},
- }}
+ "param": { "timeMs": 1000 },
+ }
  ],
  "error": null,
  "more_actions_needed_by_instruction": true,
  "log": "Click the language switch button to open the language options. Wait for 1 second",
- }}
+ }

  ### Example: What NOT to do
  Wrong output:
- {{
+ {
  "actions":[
- {{
+ {
  "type": "Tap",
  "thought": "Click the language switch button to open the language options.",
  "param": null,
- "locate": {{
- {{ "id": "c81c4e9a33" }}, // WRONG: prompt is missing
- }}
- }},
- {{
+ "locate": {
+ { "id": "c81c4e9a33" }, // WRONG: prompt is missing
+ }
+ },
+ {
  "type": "Tap",
  "thought": "Click the English option",
  "param": null,
  "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
- }}
+ }
  ],
  "more_actions_needed_by_instruction": false, // WRONG: should be true
  "log": "Click the language switch button to open the language options",
- }}
+ }

  Reason:
  * The \`prompt\` is missing in the first 'Locate' action
  * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
  `;
  async function systemPromptToTaskPlanning({
- pageType,
+ actionSpace,
  vlMode
  }) {
  if (vlMode) {
- return systemTemplateOfVLPlanning({ pageType, vlMode });
+ return systemTemplateOfVLPlanning({ actionSpace, vlMode });
  }
- const promptTemplate = new PromptTemplate2({
- template: `${systemTemplateOfLLM({ pageType })}
+ return `${systemTemplateOfLLM({ actionSpace })}

- ${outputTemplate}`,
- inputVariables: ["pageDescription"]
- });
- return await promptTemplate.format({
- pageDescription: samplePageDescription
- });
+ ${outputTemplate}`;
  }
  var planSchema = {
  type: "json_schema",
@@ -1145,57 +1031,24 @@ pageDescription:
  });
  };

- // src/ai-model/service-caller/utils.ts
- import {
- ANTHROPIC_API_KEY,
- AZURE_OPENAI_API_VERSION,
- AZURE_OPENAI_DEPLOYMENT,
- AZURE_OPENAI_ENDPOINT,
- AZURE_OPENAI_KEY,
- MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_AZURE_OPENAI_SCOPE,
- MIDSCENE_DEBUG_AI_PROFILE,
- MIDSCENE_DEBUG_AI_RESPONSE,
- MIDSCENE_MODEL_NAME,
- MIDSCENE_OPENAI_HTTP_PROXY,
- MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_OPENAI_SOCKS_PROXY,
- MIDSCENE_USE_ANTHROPIC_SDK,
- MIDSCENE_USE_AZURE_OPENAI,
- MIDSCENE_VQA_ANTHROPIC_API_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_AZURE_OPENAI_KEY,
- MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- MIDSCENE_VQA_MODEL_NAME,
- MIDSCENE_VQA_OPENAI_API_KEY,
- MIDSCENE_VQA_OPENAI_BASE_URL,
- MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- MIDSCENE_VQA_OPENAI_USE_AZURE,
- MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- MIDSCENE_VQA_USE_AZURE_OPENAI,
- OPENAI_API_KEY,
- OPENAI_BASE_URL,
- OPENAI_USE_AZURE,
- getAIConfig,
- getAIConfigInBoolean,
- getAIConfigInJson
- } from "@midscene/shared/env";
- import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
- import { assert as assert3 } from "@midscene/shared/utils";
- function getModelName() {
- let modelName = "gpt-4o";
- const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
- if (nameInConfig) {
- modelName = nameInConfig;
- }
- return modelName;
+ // src/ai-model/service-caller/index.ts
+ function checkAIConfig() {
+ const openaiKey = getAIConfig(OPENAI_API_KEY);
+ const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
+ const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
+ const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ if (openaiKey)
+ return true;
+ if (azureConfig)
+ return true;
+ if (anthropicKey)
+ return true;
+ return Boolean(initConfigJson);
  }
+ var debugConfigInitialized = false;
  function initDebugConfig() {
+ if (debugConfigInitialized)
+ return;
  const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
  let debugConfig = "";
  if (shouldPrintTiming) {
@@ -1220,232 +1073,27 @@ function initDebugConfig() {
  if (debugConfig) {
  enableDebug(debugConfig);
  }
+ debugConfigInitialized = true;
  }
- var createAssert = (modelNameKey, modelName) => (value, key, modelVendorFlag) => {
- if (modelVendorFlag) {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName} and ${modelVendorFlag} has also been specified, but got: ${value}
- Please check your config.`
- );
- } else {
- assert3(
- value,
- `The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName}, but got: ${value}
- Please check your config.`
- );
- }
- };
- var getModelConfigFromEnv = (modelName, keys, valueAssert) => {
- const socksProxy = getAIConfig(keys.socksProxy);
- const httpProxy = getAIConfig(keys.httpProxy);
- if (getAIConfig(keys.openaiUseAzureDeprecated)) {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(
- openaiBaseURL,
- keys.openaiBaseURL,
- keys.openaiUseAzureDeprecated
- );
- valueAssert(openaiApiKey, keys.openaiApiKey, keys.openaiUseAzureDeprecated);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiUseAzureDeprecated: true,
- openaiApiKey,
- openaiBaseURL,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAzureOpenai)) {
- const azureOpenaiScope = getAIConfig(keys.azureOpenaiScope);
- const azureOpenaiApiKey = getAIConfig(keys.azureOpenaiApiKey);
- const azureOpenaiEndpoint = getAIConfig(keys.azureOpenaiEndpoint);
- const azureOpenaiDeployment = getAIConfig(keys.azureOpenaiDeployment);
- const azureOpenaiApiVersion = getAIConfig(keys.azureOpenaiApiVersion);
- const azureExtraConfig = getAIConfigInJson(keys.azureExtraConfig);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(azureOpenaiApiKey, keys.azureOpenaiApiKey, keys.useAzureOpenai);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAzureOpenai: true,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiDeployment,
- azureOpenaiApiVersion,
- azureExtraConfig,
- openaiExtraConfig
- };
- } else if (getAIConfig(keys.useAnthropicSdk)) {
- const anthropicApiKey = getAIConfig(keys.anthropicApiKey);
- valueAssert(anthropicApiKey, keys.anthropicApiKey, keys.useAnthropicSdk);
- return {
- socksProxy,
- httpProxy,
- modelName,
- useAnthropicSdk: true,
- anthropicApiKey
- };
- } else {
- const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
- const openaiApiKey = getAIConfig(keys.openaiApiKey);
- const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
- valueAssert(openaiBaseURL, keys.openaiBaseURL);
- valueAssert(openaiApiKey, keys.openaiApiKey);
- return {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig
- };
- }
- };
- var maskKey = (key, maskChar = "*") => {
- if (typeof key !== "string" || key.length === 0) {
- return key;
- }
- const prefixLen = 3;
- const suffixLen = 3;
- const keepLength = prefixLen + suffixLen;
- if (key.length <= keepLength) {
- return key;
- }
- const prefix = key.substring(0, prefixLen);
- const suffix = key.substring(key.length - suffixLen);
- const maskLength = key.length - keepLength;
- const mask = maskChar.repeat(maskLength);
- return `${prefix}${mask}${suffix}`;
- };
- var maskConfig = (config) => {
- return Object.fromEntries(
- Object.entries(config).map(([key, value]) => [
- key,
- ["openaiApiKey", "azureOpenaiApiKey", "anthropicApiKey"].includes(key) ? maskKey(value) : value
- ])
- );
- };
- var decideModelConfig = (modelPreferences) => {
- initDebugConfig();
- const debugLog = getDebug2("ai:decideModelConfig");
- debugLog("modelPreferences", modelPreferences);
- const isVQAIntent = modelPreferences?.intent === "VQA";
- const vqaModelName = getAIConfig(MIDSCENE_VQA_MODEL_NAME);
- if (isVQAIntent && vqaModelName) {
- debugLog(
- `current action is a VQA action and detected ${MIDSCENE_VQA_MODEL_NAME} ${vqaModelName}, will only read VQA related model config from process.env`
- );
- const config = getModelConfigFromEnv(
- vqaModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_VQA_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: MIDSCENE_VQA_OPENAI_BASE_URL,
- openaiApiKey: MIDSCENE_VQA_OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: MIDSCENE_VQA_OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_VQA_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: MIDSCENE_VQA_AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_VQA_USE_ANTHROPIC_SDK,
- anthropicApiKey: MIDSCENE_VQA_ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_VQA_MODEL_NAME, vqaModelName)
- );
- debugLog("got model config for VQA usage:", maskConfig(config));
- return config;
- } else {
- debugLog("read model config from process.env as normal.");
- const commonModelName = getAIConfig(MIDSCENE_MODEL_NAME);
- assert3(
- commonModelName,
- `${MIDSCENE_MODEL_NAME} is empty, please check your config.`
- );
- const config = getModelConfigFromEnv(
- commonModelName,
- {
- /**
- * proxy
- */
- socksProxy: MIDSCENE_OPENAI_SOCKS_PROXY,
- httpProxy: MIDSCENE_OPENAI_HTTP_PROXY,
- /**
- * OpenAI
- */
- openaiBaseURL: OPENAI_BASE_URL,
- openaiApiKey: OPENAI_API_KEY,
- openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Azure
- */
- openaiUseAzureDeprecated: OPENAI_USE_AZURE,
- useAzureOpenai: MIDSCENE_USE_AZURE_OPENAI,
- azureOpenaiScope: MIDSCENE_AZURE_OPENAI_SCOPE,
- azureOpenaiApiKey: AZURE_OPENAI_KEY,
- azureOpenaiEndpoint: AZURE_OPENAI_ENDPOINT,
- azureOpenaiApiVersion: AZURE_OPENAI_API_VERSION,
- azureOpenaiDeployment: AZURE_OPENAI_DEPLOYMENT,
- azureExtraConfig: MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
- /**
- * Anthropic
- */
- useAnthropicSdk: MIDSCENE_USE_ANTHROPIC_SDK,
- anthropicApiKey: ANTHROPIC_API_KEY
- },
- createAssert(MIDSCENE_MODEL_NAME, commonModelName)
- );
- debugLog("got model config for common usage:", maskConfig(config));
- return config;
+ var defaultModel = "gpt-4o";
+ function getModelName() {
+ let modelName = defaultModel;
+ const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+ if (nameInConfig) {
+ modelName = nameInConfig;
  }
- };
-
- // src/ai-model/service-caller/index.ts
+ return modelName;
+ }
  async function createChatClient({
- AIActionTypeValue,
- modelPreferences
+ AIActionTypeValue
  }) {
- const {
- socksProxy,
- httpProxy,
- modelName,
- openaiBaseURL,
- openaiApiKey,
- openaiExtraConfig,
- openaiUseAzureDeprecated,
- useAzureOpenai,
- azureOpenaiScope,
- azureOpenaiApiKey,
- azureOpenaiEndpoint,
- azureOpenaiApiVersion,
- azureOpenaiDeployment,
- azureExtraConfig,
- useAnthropicSdk,
- anthropicApiKey
- } = decideModelConfig(modelPreferences);
+ initDebugConfig();
  let openai;
+ const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+ const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
+ const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
  let proxyAgent = void 0;
- const debugProxy = getDebug3("ai:call:proxy");
+ const debugProxy = getDebug2("ai:call:proxy");
  if (httpProxy) {
  debugProxy("using http proxy", httpProxy);
  proxyAgent = new HttpsProxyAgent(httpProxy);
@@ -1453,56 +1101,70 @@ async function createChatClient({
  debugProxy("using socks proxy", socksProxy);
  proxyAgent = new SocksProxyAgent(socksProxy);
  }
- if (openaiUseAzureDeprecated) {
+ if (getAIConfig(OPENAI_USE_AZURE)) {
  openai = new AzureOpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  dangerouslyAllowBrowser: true
  });
- } else if (useAzureOpenai) {
+ } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
+ const extraAzureConfig = getAIConfigInJson(
+ MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
+ );
+ const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
  let tokenProvider = void 0;
- if (azureOpenaiScope) {
- assert4(
+ if (scope) {
+ assert3(
  !ifInBrowser,
  "Azure OpenAI is not supported in browser with Midscene."
  );
  const credential = new DefaultAzureCredential();
- tokenProvider = getBearerTokenProvider(credential, azureOpenaiScope);
+ assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
+ tokenProvider = getBearerTokenProvider(credential, scope);
  openai = new AzureOpenAI({
  azureADTokenProvider: tokenProvider,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
+ ...extraConfig,
+ ...extraAzureConfig
  });
  } else {
  openai = new AzureOpenAI({
- apiKey: azureOpenaiApiKey,
- endpoint: azureOpenaiEndpoint,
- apiVersion: azureOpenaiApiVersion,
- deployment: azureOpenaiDeployment,
+ apiKey: getAIConfig(AZURE_OPENAI_KEY),
+ endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+ apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+ deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
  dangerouslyAllowBrowser: true,
- ...openaiExtraConfig,
- ...azureExtraConfig
+ ...extraConfig,
+ ...extraAzureConfig
  });
  }
- } else if (!useAnthropicSdk) {
+ } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const baseURL = getAIConfig(OPENAI_BASE_URL);
+ if (typeof baseURL === "string") {
+ if (!/^https?:\/\//.test(baseURL)) {
+ throw new Error(
+ `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
+ Please check your config.`
+ );
+ }
+ }
  openai = new OpenAI({
- baseURL: openaiBaseURL,
- apiKey: openaiApiKey,
+ baseURL: getAIConfig(OPENAI_BASE_URL),
+ apiKey: getAIConfig(OPENAI_API_KEY),
  httpAgent: proxyAgent,
- ...openaiExtraConfig,
+ ...extraConfig,
  defaultHeaders: {
- ...openaiExtraConfig?.defaultHeaders || {},
+ ...extraConfig?.defaultHeaders || {},
  [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
  },
  dangerouslyAllowBrowser: true
  });
  }
- if (openai && getAIConfigInBoolean2(MIDSCENE_LANGSMITH_DEBUG)) {
+ if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
  if (ifInBrowser) {
  throw new Error("langsmith is not supported in browser");
  }
@@ -1513,13 +1175,14 @@ async function createChatClient({
  if (typeof openai !== "undefined") {
  return {
  completion: openai.chat.completions,
- style: "openai",
- modelName
+ style: "openai"
  };
  }
- if (useAnthropicSdk) {
+ if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+ const apiKey = getAIConfig(ANTHROPIC_API_KEY);
+ assert3(apiKey, "ANTHROPIC_API_KEY is required");
  openai = new Anthropic({
- apiKey: anthropicApiKey,
+ apiKey,
  httpAgent: proxyAgent,
  dangerouslyAllowBrowser: true
  });
@@ -1527,45 +1190,47 @@ async function createChatClient({
  if (typeof openai !== "undefined" && openai.messages) {
  return {
  completion: openai.messages,
- style: "anthropic",
- modelName
+ style: "anthropic"
  };
  }
  throw new Error("Openai SDK or Anthropic SDK is not initialized");
  }
- async function call2(messages, AIActionTypeValue, options, modelPreferences) {
- const { completion, style, modelName } = await createChatClient({
- AIActionTypeValue,
- modelPreferences
+ async function call2(messages, AIActionTypeValue, responseFormat, options) {
+ assert3(
+ checkAIConfig(),
+ "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
+ );
+ const { completion, style } = await createChatClient({
+ AIActionTypeValue
  });
- const responseFormat = getResponseFormat(modelName, AIActionTypeValue);
- const maxTokens = getAIConfig2(OPENAI_MAX_TOKENS);
- const debugCall = getDebug3("ai:call");
- const debugProfileStats = getDebug3("ai:profile:stats");
- const debugProfileDetail = getDebug3("ai:profile:detail");
+ const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
+ const debugCall = getDebug2("ai:call");
+ const debugProfileStats = getDebug2("ai:profile:stats");
+ const debugProfileDetail = getDebug2("ai:profile:detail");
  const startTime = Date.now();
+ const model = getModelName();
  const isStreaming = options?.stream && options?.onChunk;
  let content;
  let accumulated = "";
  let usage;
  let timeCost;
  const commonConfig = {
- temperature: vlLocateMode3() === "vlm-ui-tars" ? 0 : 0.1,
+ temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
  stream: !!isStreaming,
  max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
- ...vlLocateMode3() === "qwen-vl" ? {
+ ...vlLocateMode2() === "qwen-vl" ? {
  vl_high_resolution_images: true
  } : {}
  };
  try {
  if (style === "openai") {
  debugCall(
- `sending ${isStreaming ? "streaming " : ""}request to ${modelName}`
+ `sending ${isStreaming ? "streaming " : ""}request to ${model}`
  );
  if (isStreaming) {
  const stream = await completion.create(
  {
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
@@ -1622,23 +1287,23 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  }
  content = accumulated;
  debugProfileStats(
- `streaming model, ${modelName}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
+ `streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
  );
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  messages,
  response_format: responseFormat,
  ...commonConfig
  });
  timeCost = Date.now() - startTime;
  debugProfileStats(
- `model, ${modelName}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
+ `model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
  );
  debugProfileDetail(
  `model usage detail: ${JSON.stringify(result.usage)}`
  );
- assert4(
+ assert3(
  result.choices,
  `invalid response from LLM service: ${JSON.stringify(result)}`
  );
@@ -1646,12 +1311,12 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  usage = result.usage;
  }
  debugCall(`response: ${content}`);
- assert4(content, "empty content");
+ assert3(content, "empty content");
  } else if (style === "anthropic") {
  const convertImageContent = (content2) => {
  if (content2.type === "image_url") {
  const imgBase64 = content2.image_url.url;
- assert4(imgBase64, "image_url is required");
+ assert3(imgBase64, "image_url is required");
  return {
  source: {
  type: "base64",
@@ -1665,7 +1330,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  };
  if (isStreaming) {
  const stream = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1709,7 +1374,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = accumulated;
  } else {
  const result = await completion.create({
- model: modelName,
+ model,
  system: "You are a versatile professional in software UI automation",
  messages: messages.map((m) => ({
  role: "user",
@@ -1722,7 +1387,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  content = result.content[0].text;
  usage = result.usage;
  }
- assert4(content, "empty content");
+ assert3(content, "empty content");
  }
  if (isStreaming && !usage) {
  const estimatedTokens = Math.max(
@@ -1756,9 +1421,10 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
  throw newError;
  }
  }
- var getResponseFormat = (modelName, AIActionTypeValue) => {
+ async function callToGetJSONObject(messages, AIActionTypeValue) {
  let responseFormat;
- if (modelName.includes("gpt-4")) {
+ const model = getModelName();
+ if (model.includes("gpt-4")) {
  switch (AIActionTypeValue) {
  case 0 /* ASSERT */:
  responseFormat = assertSchema;
@@ -1775,19 +1441,11 @@ var getResponseFormat = (modelName, AIActionTypeValue) => {
  break;
  }
  }
- if (modelName === "gpt-4o-2024-05-13") {
+ if (model === "gpt-4o-2024-05-13") {
  responseFormat = { type: "json_object" /* JSON */ };
  }
- return responseFormat;
- };
- async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
- const response = await call2(
- messages,
- AIActionTypeValue,
- void 0,
- modelPreferences
- );
- assert4(response, "empty response");
+ const response = await call2(messages, AIActionTypeValue, responseFormat);
+ assert3(response, "empty response");
  const jsonContent = safeParseJson(response.content);
  return { content: jsonContent, usage: response.usage };
  }
@@ -1836,13 +1494,138 @@ function safeParseJson(input) {
  return JSON.parse(jsonrepair(cleanJsonString));
  } catch (e) {
  }
- if (vlLocateMode3() === "doubao-vision" || vlLocateMode3() === "vlm-ui-tars") {
+ if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
  const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
  return JSON.parse(jsonrepair(jsonString));
  }
  throw Error(`failed to parse json response: ${input}`);
  }

+ // src/image/index.ts
+ import {
+ imageInfo,
+ imageInfoOfBase64,
+ localImg2Base64,
+ httpImg2Base64,
+ resizeImg,
+ saveBase64Image,
+ zoomForGPT4o
+ } from "@midscene/shared/img";
+
+ // src/ai-model/prompt/util.ts
+ import { NodeType as NodeType2 } from "@midscene/shared/constants";
+ import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
+ import {
+ descriptionOfTree,
+ generateElementByPosition,
+ treeToList as treeToList2
+ } from "@midscene/shared/extractor";
+ import { assert as assert4 } from "@midscene/shared/utils";
+ function describeSize(size) {
+ return `${size.width} x ${size.height}`;
+ }
+ var distanceThreshold = 16;
+ function elementByPositionWithElementInfo(treeRoot, position, options) {
+ const requireStrictDistance = options?.requireStrictDistance ?? true;
+ const filterPositionElements = options?.filterPositionElements ?? false;
+ assert4(typeof position !== "undefined", "position is required for query");
+ const matchingElements = [];
+ function dfs(node) {
+ if (node?.node) {
+ const item = node.node;
+ if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
+ if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
+ matchingElements.push(item);
+ }
+ }
+ }
+ for (const child of node.children) {
+ dfs(child);
+ }
+ }
+ dfs(treeRoot);
+ if (matchingElements.length === 0) {
+ return void 0;
+ }
+ const element = matchingElements.reduce((smallest, current) => {
+ const smallestArea = smallest.rect.width * smallest.rect.height;
+ const currentArea = current.rect.width * current.rect.height;
+ return currentArea < smallestArea ? current : smallest;
+ });
+ const distanceToCenter = distance(
+ { x: element.center[0], y: element.center[1] },
+ position
+ );
+ if (requireStrictDistance) {
+ return distanceToCenter <= distanceThreshold ? element : void 0;
+ }
+ return element;
+ }
+ function distance(point1, point2) {
+ return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
+ }
+ async function describeUserPage(context, opt) {
+ const { screenshotBase64 } = context;
+ let width;
+ let height;
+ if (context.size) {
+ ({ width, height } = context.size);
+ } else {
+ const imgSize = await imageInfoOfBase64(screenshotBase64);
+ ({ width, height } = imgSize);
+ }
+ const treeRoot = context.tree;
+ const idElementMap = {};
+ const flatElements = treeToList2(treeRoot);
+ if (opt?.domIncluded === true && flatElements.length >= 5e3) {
+ console.warn(
+ 'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
+ );
+ }
+ flatElements.forEach((element) => {
+ idElementMap[element.id] = element;
+ if (typeof element.indexId !== "undefined") {
+ idElementMap[`${element.indexId}`] = element;
+ }
+ });
+ let pageDescription = "";
+ const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
+ if (opt?.domIncluded || !vlLocateMode3()) {
+ const contentTree = await descriptionOfTree(
+ treeRoot,
+ opt?.truncateTextLength,
+ opt?.filterNonTextContent,
+ visibleOnly
+ );
+ const sizeDescription = describeSize({ width, height });
+ pageDescription = `The size of the page: ${sizeDescription}
+ The page elements tree:
+ ${contentTree}`;
+ }
+ return {
+ description: pageDescription,
+ elementById(idOrIndexId) {
+ assert4(typeof idOrIndexId !== "undefined", "id is required for query");
+ const item = idElementMap[`${idOrIndexId}`];
+ return item;
+ },
+ elementByPosition(position, size) {
+ return elementByPositionWithElementInfo(treeRoot, position);
+ },
+ insertElementByPosition(position) {
+ const element = generateElementByPosition(position);
+ treeRoot.children.push({
+ node: element,
+ children: []
+ });
+ flatElements.push(element);
+ idElementMap[element.id] = element;
+ return element;
+ },
+ size: { width, height }
+ };
+ }
+
  // src/ai-model/prompt/playwright-generator.ts
  import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";

@@ -2071,7 +1854,7 @@ Respond with YAML only, no explanations.`
  });
  }
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2194,7 +1977,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  }
  ];
  if (options.stream && options.onChunk) {
- return await call2(prompt, 2 /* EXTRACT_DATA */, {
+ return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
  stream: true,
  onChunk: options.onChunk
  });
@@ -2215,7 +1998,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
  import {
  MIDSCENE_USE_QWEN_VL,
  MIDSCENE_USE_VLM_UI_TARS,
- getAIConfigInBoolean as getAIConfigInBoolean3,
+ getAIConfigInBoolean as getAIConfigInBoolean2,
  vlLocateMode as vlLocateMode4
  } from "@midscene/shared/env";
  import {
@@ -2223,7 +2006,7 @@ import {
  paddingToMatchBlockByBase64,
  preProcessImageUrl
  } from "@midscene/shared/img";
- import { getDebug as getDebug4 } from "@midscene/shared/logger";
+ import { getDebug as getDebug3 } from "@midscene/shared/logger";
  import { assert as assert5 } from "@midscene/shared/utils";

  // src/ai-model/prompt/extraction.ts
@@ -2379,8 +2162,8 @@ var sectionLocatorInstruction = new PromptTemplate4({
  });

  // src/ai-model/inspect.ts
- var debugInspect = getDebug4("ai:inspect");
- var debugSection = getDebug4("ai:section");
+ var debugInspect = getDebug3("ai:inspect");
+ var debugSection = getDebug3("ai:section");
  var extraTextFromUserPrompt = (prompt) => {
  if (typeof prompt === "string") {
  return prompt;
@@ -2601,7 +2384,7 @@ async function AiLocateSection(options) {
  imageBase64 = await cropByRect(
  screenshotBase64,
  sectionRect,
- getAIConfigInBoolean3(MIDSCENE_USE_QWEN_VL)
+ getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
  );
  }
  return {
@@ -2613,13 +2396,7 @@ async function AiLocateSection(options) {
  };
  }
  async function AiExtractElementInfo(options) {
- const {
- dataQuery,
- context,
- extractOption,
- multimodalPrompt,
- modelPreferences
- } = options;
+ const { dataQuery, context, extractOption, multimodalPrompt } = options;
  const systemPrompt = systemPromptToExtract();
  const { screenshotBase64 } = context;
  const { description, elementById } = await describeUserPage(context, {
@@ -2668,8 +2445,7 @@ async function AiExtractElementInfo(options) {
  }
  const result = await callAiFn(
  msgs,
- 2 /* EXTRACT_DATA */,
- modelPreferences
+ 2 /* EXTRACT_DATA */
  );
  return {
  parseResult: result.content,
@@ -2682,7 +2458,7 @@ async function AiAssert(options) {
  assert5(assertion, "assertion should not be empty");
  const { screenshotBase64 } = context;
  const systemPrompt = systemPromptToAssert({
- isUITars: getAIConfigInBoolean3(MIDSCENE_USE_VLM_UI_TARS)
+ isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
  });
  const assertionText = extraTextFromUserPrompt(assertion);
  const msgs = [
@@ -2735,7 +2511,7 @@ async function plan(userInstruction, opts) {
  const { screenshotBase64, size } = context;
  const { description: pageDescription, elementById } = await describeUserPage(context);
  const systemPrompt = await systemPromptToTaskPlanning({
- pageType: opts.pageType,
+ actionSpace: opts.actionSpace,
  vlMode: vlLocateMode5()
  });
  const taskBackgroundContextText = generateTaskBackgroundContext(
@@ -2835,7 +2611,7 @@ import {
  } from "@midscene/shared/env";
  import { resizeImgBase64 } from "@midscene/shared/img";
  import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
- import { getDebug as getDebug5 } from "@midscene/shared/logger";
+ import { getDebug as getDebug4 } from "@midscene/shared/logger";
  import { assert as assert7 } from "@midscene/shared/utils";
  import { actionParser } from "@ui-tars/action-parser";

@@ -2875,7 +2651,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
  var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();

  // src/ai-model/ui-tars-planning.ts
- var debug = getDebug5("ui-tars-planning");
+ var debug = getDebug4("ui-tars-planning");
  var bboxSize = 10;
  var pointToBbox = (point, width, height) => {
  return [
@@ -3117,8 +2893,6 @@ async function resizeImageForUiTars(imageBase64, size) {

  export {
  systemPromptToLocateElement,
- elementByPositionWithElementInfo,
- describeUserPage,
  call2 as call,
  callToGetJSONObject,
  callAiFnWithStringResponse,
@@ -3126,6 +2900,8 @@ export {
  callAiFn,
  adaptBboxToRect,
  expandSearchArea,
+ elementByPositionWithElementInfo,
+ describeUserPage,
  generateYamlTest,
  generateYamlTestStream,
  generatePlaywrightTest,
@@ -3139,4 +2915,4 @@ export {
  resizeImageForUiTars
  };

- //# sourceMappingURL=chunk-G2JTYWI6.js.map
+ //# sourceMappingURL=chunk-5IZMFZPA.js.map