@midscene/core 0.25.4-beta-20250811115904.0 → 0.26.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +3 -3
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-SR67R2OE.js → chunk-4CBFCRNS.js} +3 -3
- package/dist/es/{chunk-NY6RQSGJ.js → chunk-I5LBWOQA.js} +254 -243
- package/dist/es/chunk-I5LBWOQA.js.map +1 -0
- package/dist/es/index.d.ts +4 -4
- package/dist/es/index.js +2 -2
- package/dist/es/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
- package/dist/es/{types-16cd9f75.d.ts → types-b4a208c6.d.ts} +1 -10
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +3 -3
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-SR67R2OE.js → chunk-4CBFCRNS.js} +3 -3
- package/dist/lib/{chunk-NY6RQSGJ.js → chunk-I5LBWOQA.js} +243 -232
- package/dist/lib/chunk-I5LBWOQA.js.map +1 -0
- package/dist/lib/index.d.ts +4 -4
- package/dist/lib/index.js +12 -12
- package/dist/lib/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
- package/dist/{types/types-16cd9f75.d.ts → lib/types-b4a208c6.d.ts} +1 -10
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/index.d.ts +4 -4
- package/dist/types/{llm-planning-374b74b8.d.ts → llm-planning-92cec090.d.ts} +1 -2
- package/dist/{lib/types-16cd9f75.d.ts → types/types-b4a208c6.d.ts} +1 -10
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-NY6RQSGJ.js.map +0 -1
- package/dist/lib/chunk-NY6RQSGJ.js.map +0 -1
- /package/dist/es/{chunk-SR67R2OE.js.map → chunk-4CBFCRNS.js.map} +0 -0
- /package/dist/lib/{chunk-SR67R2OE.js.map → chunk-4CBFCRNS.js.map} +0 -0
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
getAIConfigInBoolean,
|
|
31
31
|
getAIConfigInJson,
|
|
32
32
|
uiTarsModelVersion,
|
|
33
|
-
vlLocateMode as
|
|
33
|
+
vlLocateMode as vlLocateMode3
|
|
34
34
|
} from "@midscene/shared/env";
|
|
35
35
|
import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
|
|
36
36
|
import { assert as assert3 } from "@midscene/shared/utils";
|
|
@@ -633,73 +633,179 @@ Here is the item user want to find:
|
|
|
633
633
|
});
|
|
634
634
|
|
|
635
635
|
// src/ai-model/prompt/llm-planning.ts
|
|
636
|
-
import assert2 from "assert";
|
|
637
636
|
import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
637
|
+
|
|
638
|
+
// src/image/index.ts
|
|
639
|
+
import {
|
|
640
|
+
imageInfo,
|
|
641
|
+
imageInfoOfBase64,
|
|
642
|
+
localImg2Base64,
|
|
643
|
+
httpImg2Base64,
|
|
644
|
+
resizeImg,
|
|
645
|
+
saveBase64Image,
|
|
646
|
+
zoomForGPT4o
|
|
647
|
+
} from "@midscene/shared/img";
|
|
648
|
+
|
|
649
|
+
// src/ai-model/prompt/util.ts
|
|
650
|
+
import { NodeType as NodeType2 } from "@midscene/shared/constants";
|
|
651
|
+
import { vlLocateMode as vlLocateMode2 } from "@midscene/shared/env";
|
|
652
|
+
import {
|
|
653
|
+
descriptionOfTree,
|
|
654
|
+
generateElementByPosition,
|
|
655
|
+
treeToList as treeToList2
|
|
656
|
+
} from "@midscene/shared/extractor";
|
|
657
|
+
import { assert as assert2 } from "@midscene/shared/utils";
|
|
658
|
+
function describeSize(size) {
|
|
659
|
+
return `${size.width} x ${size.height}`;
|
|
660
|
+
}
|
|
661
|
+
var distanceThreshold = 16;
|
|
662
|
+
function elementByPositionWithElementInfo(treeRoot, position, options) {
|
|
663
|
+
const requireStrictDistance = options?.requireStrictDistance ?? true;
|
|
664
|
+
const filterPositionElements = options?.filterPositionElements ?? false;
|
|
665
|
+
assert2(typeof position !== "undefined", "position is required for query");
|
|
666
|
+
const matchingElements = [];
|
|
667
|
+
function dfs(node) {
|
|
668
|
+
if (node?.node) {
|
|
669
|
+
const item = node.node;
|
|
670
|
+
if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
|
|
671
|
+
if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
|
|
672
|
+
matchingElements.push(item);
|
|
673
|
+
}
|
|
674
|
+
}
|
|
663
675
|
}
|
|
676
|
+
for (const child of node.children) {
|
|
677
|
+
dfs(child);
|
|
678
|
+
}
|
|
679
|
+
}
|
|
680
|
+
dfs(treeRoot);
|
|
681
|
+
if (matchingElements.length === 0) {
|
|
682
|
+
return void 0;
|
|
683
|
+
}
|
|
684
|
+
const element = matchingElements.reduce((smallest, current) => {
|
|
685
|
+
const smallestArea = smallest.rect.width * smallest.rect.height;
|
|
686
|
+
const currentArea = current.rect.width * current.rect.height;
|
|
687
|
+
return currentArea < smallestArea ? current : smallest;
|
|
688
|
+
});
|
|
689
|
+
const distanceToCenter = distance(
|
|
690
|
+
{ x: element.center[0], y: element.center[1] },
|
|
691
|
+
position
|
|
692
|
+
);
|
|
693
|
+
if (requireStrictDistance) {
|
|
694
|
+
return distanceToCenter <= distanceThreshold ? element : void 0;
|
|
695
|
+
}
|
|
696
|
+
return element;
|
|
697
|
+
}
|
|
698
|
+
function distance(point1, point2) {
|
|
699
|
+
return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
|
|
700
|
+
}
|
|
701
|
+
var samplePageDescription = `
|
|
702
|
+
And the page is described as follows:
|
|
703
|
+
====================
|
|
704
|
+
The size of the page: 1280 x 720
|
|
705
|
+
Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
|
|
706
|
+
|
|
707
|
+
Description of all the elements in screenshot:
|
|
708
|
+
<div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
|
|
709
|
+
<h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
|
|
710
|
+
The username is accepted
|
|
711
|
+
</h4>
|
|
712
|
+
...many more
|
|
713
|
+
</div>
|
|
714
|
+
====================
|
|
715
|
+
`;
|
|
716
|
+
async function describeUserPage(context, opt) {
|
|
717
|
+
const { screenshotBase64 } = context;
|
|
718
|
+
let width;
|
|
719
|
+
let height;
|
|
720
|
+
if (context.size) {
|
|
721
|
+
({ width, height } = context.size);
|
|
722
|
+
} else {
|
|
723
|
+
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
724
|
+
({ width, height } = imgSize);
|
|
664
725
|
}
|
|
665
|
-
|
|
666
|
-
|
|
667
|
-
|
|
726
|
+
const treeRoot = context.tree;
|
|
727
|
+
const idElementMap = {};
|
|
728
|
+
const flatElements = treeToList2(treeRoot);
|
|
729
|
+
if (opt?.domIncluded === true && flatElements.length >= 5e3) {
|
|
730
|
+
console.warn(
|
|
731
|
+
'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
|
|
732
|
+
);
|
|
668
733
|
}
|
|
669
|
-
|
|
670
|
-
|
|
671
|
-
|
|
672
|
-
|
|
734
|
+
flatElements.forEach((element) => {
|
|
735
|
+
idElementMap[element.id] = element;
|
|
736
|
+
if (typeof element.indexId !== "undefined") {
|
|
737
|
+
idElementMap[`${element.indexId}`] = element;
|
|
738
|
+
}
|
|
739
|
+
});
|
|
740
|
+
let pageDescription = "";
|
|
741
|
+
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
|
|
742
|
+
if (opt?.domIncluded || !vlLocateMode2()) {
|
|
743
|
+
const contentTree = await descriptionOfTree(
|
|
744
|
+
treeRoot,
|
|
745
|
+
opt?.truncateTextLength,
|
|
746
|
+
opt?.filterNonTextContent,
|
|
747
|
+
visibleOnly
|
|
673
748
|
);
|
|
674
|
-
|
|
675
|
-
|
|
676
|
-
|
|
677
|
-
|
|
678
|
-
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
749
|
+
const sizeDescription = describeSize({ width, height });
|
|
750
|
+
pageDescription = `The size of the page: ${sizeDescription}
|
|
751
|
+
The page elements tree:
|
|
752
|
+
${contentTree}`;
|
|
753
|
+
}
|
|
754
|
+
return {
|
|
755
|
+
description: pageDescription,
|
|
756
|
+
elementById(idOrIndexId) {
|
|
757
|
+
assert2(typeof idOrIndexId !== "undefined", "id is required for query");
|
|
758
|
+
const item = idElementMap[`${idOrIndexId}`];
|
|
759
|
+
return item;
|
|
760
|
+
},
|
|
761
|
+
elementByPosition(position, size) {
|
|
762
|
+
return elementByPositionWithElementInfo(treeRoot, position);
|
|
763
|
+
},
|
|
764
|
+
insertElementByPosition(position) {
|
|
765
|
+
const element = generateElementByPosition(position);
|
|
766
|
+
treeRoot.children.push({
|
|
767
|
+
node: element,
|
|
768
|
+
children: []
|
|
769
|
+
});
|
|
770
|
+
flatElements.push(element);
|
|
771
|
+
idElementMap[element.id] = element;
|
|
772
|
+
return element;
|
|
773
|
+
},
|
|
774
|
+
size: { width, height }
|
|
775
|
+
};
|
|
776
|
+
}
|
|
777
|
+
|
|
778
|
+
// src/ai-model/prompt/llm-planning.ts
|
|
779
|
+
var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
|
|
780
|
+
var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
781
|
+
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
782
|
+
var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
|
|
783
|
+
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
784
|
+
var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
|
|
683
785
|
var systemTemplateOfVLPlanning = ({
|
|
684
|
-
|
|
786
|
+
pageType,
|
|
685
787
|
vlMode
|
|
686
|
-
}) =>
|
|
687
|
-
const actionNameList = actionSpace.map((action) => action.name).join(", ");
|
|
688
|
-
const actionDescriptionList = actionSpace.map(
|
|
689
|
-
(action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
|
|
690
|
-
);
|
|
691
|
-
const actionList = actionDescriptionList.join("\n");
|
|
692
|
-
return `
|
|
788
|
+
}) => `
|
|
693
789
|
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
694
790
|
|
|
695
791
|
Restriction:
|
|
696
792
|
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
|
|
697
|
-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${
|
|
793
|
+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are Tap, Hover, Input, KeyboardPress, Scroll${pageType === "android" ? ", AndroidBackButton, AndroidHomeButton, AndroidRecentAppsButton, AndroidLongPress, AndroidPull." : "."}
|
|
698
794
|
- Don't repeat actions in the previous logs.
|
|
699
795
|
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
|
|
700
796
|
|
|
701
797
|
Supporting actions:
|
|
702
|
-
${
|
|
798
|
+
- Tap: { type: "Tap", ${vlLocateParam} }
|
|
799
|
+
- RightClick: { type: "RightClick", ${vlLocateParam} }
|
|
800
|
+
- Hover: { type: "Hover", ${vlLocateParam} }
|
|
801
|
+
- Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
|
|
802
|
+
- KeyboardPress: { type: "KeyboardPress", param: { value: string } }
|
|
803
|
+
- Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
804
|
+
${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
|
|
805
|
+
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
|
|
806
|
+
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
|
|
807
|
+
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
|
|
808
|
+
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
|
|
703
809
|
|
|
704
810
|
Field description:
|
|
705
811
|
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
|
|
@@ -734,19 +840,8 @@ this and output the JSON:
|
|
|
734
840
|
}
|
|
735
841
|
}
|
|
736
842
|
`;
|
|
737
|
-
}
|
|
738
|
-
var systemTemplateOfLLM = ({
|
|
739
|
-
actionSpace
|
|
740
|
-
}) => {
|
|
741
|
-
const actionNameList = actionSpace.map((action) => action.name).join(" / ");
|
|
742
|
-
const actionDescriptionList = actionSpace.map(
|
|
743
|
-
(action) => descriptionForAction(
|
|
744
|
-
action,
|
|
745
|
-
llmLocateParam(action.location === "required")
|
|
746
|
-
)
|
|
747
|
-
);
|
|
748
|
-
const actionList = actionDescriptionList.join("\n");
|
|
749
|
-
return `
|
|
843
|
+
var llmLocateParam = `locate: {{"id": string, "prompt": string}} | null`;
|
|
844
|
+
var systemTemplateOfLLM = ({ pageType }) => `
|
|
750
845
|
## Role
|
|
751
846
|
|
|
752
847
|
You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
|
|
@@ -760,7 +855,7 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
760
855
|
## Workflow
|
|
761
856
|
|
|
762
857
|
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
763
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${
|
|
858
|
+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (Tap / Hover / Input / KeyboardPress / Scroll / FalsyConditionStatement / Sleep ${pageType === "android" ? "/ AndroidBackButton / AndroidHomeButton / AndroidRecentAppsButton / AndroidLongPress / AndroidPull" : ""}). The "About the action" section below will give you more details.
|
|
764
859
|
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
|
|
765
860
|
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
|
|
766
861
|
5. Consider whether the user's instruction will be accomplished after all the actions
|
|
@@ -778,30 +873,65 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
778
873
|
|
|
779
874
|
The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
|
|
780
875
|
|
|
781
|
-
type LocateParam = {
|
|
876
|
+
type LocateParam = {{
|
|
782
877
|
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
|
|
783
878
|
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
|
|
784
|
-
} | null // If it's not on the page, the LocateParam should be null
|
|
879
|
+
}} | null // If it's not on the page, the LocateParam should be null
|
|
785
880
|
|
|
786
881
|
## Supported actions
|
|
787
882
|
|
|
788
883
|
Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
789
|
-
|
|
790
|
-
|
|
791
|
-
|
|
792
|
-
}
|
|
884
|
+
- type: 'Tap'
|
|
885
|
+
* {{ ${llmLocateParam} }}
|
|
886
|
+
- type: 'RightClick'
|
|
887
|
+
* {{ ${llmLocateParam} }}
|
|
888
|
+
- type: 'Hover'
|
|
889
|
+
* {{ ${llmLocateParam} }}
|
|
890
|
+
- type: 'Input', replace the value in the input field
|
|
891
|
+
* {{ ${llmLocateParam}, param: {{ value: string }} }}
|
|
892
|
+
* \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
|
|
893
|
+
- type: 'KeyboardPress', press a key
|
|
894
|
+
* {{ param: {{ value: string }} }}
|
|
895
|
+
- type: 'Scroll', scroll up or down.
|
|
896
|
+
* {{
|
|
897
|
+
${llmLocateParam},
|
|
898
|
+
param: {{
|
|
899
|
+
direction: 'down'(default) | 'up' | 'right' | 'left',
|
|
900
|
+
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
|
|
901
|
+
distance: null | number
|
|
902
|
+
}}
|
|
903
|
+
}}
|
|
904
|
+
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
905
|
+
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
|
|
906
|
+
* {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
|
|
907
|
+
- type: 'ExpectedFalsyCondition'
|
|
908
|
+
* {{ param: {{ reason: string }} }}
|
|
909
|
+
* use this action when the conditional statement talked about in the instruction is falsy.
|
|
910
|
+
- type: 'Sleep'
|
|
911
|
+
* {{ param: {{ timeMs: number }} }}
|
|
912
|
+
${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
|
|
913
|
+
* {{ param: {{}} }}
|
|
914
|
+
- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
|
|
915
|
+
* {{ param: {{}} }}
|
|
916
|
+
- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
|
|
917
|
+
* {{ param: {{}} }}
|
|
918
|
+
- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
|
|
919
|
+
* {{ param: {{ x: number, y: number, duration?: number }} }}
|
|
920
|
+
- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
|
|
921
|
+
* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
|
|
922
|
+
`;
|
|
793
923
|
var outputTemplate = `
|
|
794
924
|
## Output JSON Format:
|
|
795
925
|
|
|
796
926
|
The JSON format is as follows:
|
|
797
927
|
|
|
798
|
-
{
|
|
928
|
+
{{
|
|
799
929
|
"actions": [
|
|
800
930
|
// ... some actions
|
|
801
931
|
],
|
|
802
932
|
${llmCurrentLog}
|
|
803
933
|
${commonOutputFields}
|
|
804
|
-
}
|
|
934
|
+
}}
|
|
805
935
|
|
|
806
936
|
## Examples
|
|
807
937
|
|
|
@@ -817,62 +947,68 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
817
947
|
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
|
|
818
948
|
* The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.
|
|
819
949
|
|
|
820
|
-
{
|
|
950
|
+
{{
|
|
821
951
|
"actions":[
|
|
822
|
-
{
|
|
952
|
+
{{
|
|
823
953
|
"type": "Tap",
|
|
824
954
|
"thought": "Click the language switch button to open the language options.",
|
|
825
955
|
"param": null,
|
|
826
|
-
"locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
|
|
827
|
-
},
|
|
828
|
-
{
|
|
956
|
+
"locate": {{ id: "c81c4e9a33", prompt: "The language switch button" }},
|
|
957
|
+
}},
|
|
958
|
+
{{
|
|
829
959
|
"type": "Sleep",
|
|
830
960
|
"thought": "Wait for 1 second to ensure the language options are displayed.",
|
|
831
|
-
"param": { "timeMs": 1000 },
|
|
832
|
-
}
|
|
961
|
+
"param": {{ "timeMs": 1000 }},
|
|
962
|
+
}}
|
|
833
963
|
],
|
|
834
964
|
"error": null,
|
|
835
965
|
"more_actions_needed_by_instruction": true,
|
|
836
966
|
"log": "Click the language switch button to open the language options. Wait for 1 second",
|
|
837
|
-
}
|
|
967
|
+
}}
|
|
838
968
|
|
|
839
969
|
### Example: What NOT to do
|
|
840
970
|
Wrong output:
|
|
841
|
-
{
|
|
971
|
+
{{
|
|
842
972
|
"actions":[
|
|
843
|
-
{
|
|
973
|
+
{{
|
|
844
974
|
"type": "Tap",
|
|
845
975
|
"thought": "Click the language switch button to open the language options.",
|
|
846
976
|
"param": null,
|
|
847
|
-
"locate": {
|
|
848
|
-
{ "id": "c81c4e9a33" }, // WRONG: prompt is missing
|
|
849
|
-
}
|
|
850
|
-
},
|
|
851
|
-
{
|
|
977
|
+
"locate": {{
|
|
978
|
+
{{ "id": "c81c4e9a33" }}, // WRONG: prompt is missing
|
|
979
|
+
}}
|
|
980
|
+
}},
|
|
981
|
+
{{
|
|
852
982
|
"type": "Tap",
|
|
853
983
|
"thought": "Click the English option",
|
|
854
984
|
"param": null,
|
|
855
985
|
"locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
|
|
856
|
-
}
|
|
986
|
+
}}
|
|
857
987
|
],
|
|
858
988
|
"more_actions_needed_by_instruction": false, // WRONG: should be true
|
|
859
989
|
"log": "Click the language switch button to open the language options",
|
|
860
|
-
}
|
|
990
|
+
}}
|
|
861
991
|
|
|
862
992
|
Reason:
|
|
863
993
|
* The \`prompt\` is missing in the first 'Locate' action
|
|
864
994
|
* Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
|
|
865
995
|
`;
|
|
866
996
|
async function systemPromptToTaskPlanning({
|
|
867
|
-
|
|
997
|
+
pageType,
|
|
868
998
|
vlMode
|
|
869
999
|
}) {
|
|
870
1000
|
if (vlMode) {
|
|
871
|
-
return systemTemplateOfVLPlanning({
|
|
1001
|
+
return systemTemplateOfVLPlanning({ pageType, vlMode });
|
|
872
1002
|
}
|
|
873
|
-
|
|
1003
|
+
const promptTemplate = new PromptTemplate2({
|
|
1004
|
+
template: `${systemTemplateOfLLM({ pageType })}
|
|
874
1005
|
|
|
875
|
-
${outputTemplate}
|
|
1006
|
+
${outputTemplate}`,
|
|
1007
|
+
inputVariables: ["pageDescription"]
|
|
1008
|
+
});
|
|
1009
|
+
return await promptTemplate.format({
|
|
1010
|
+
pageDescription: samplePageDescription
|
|
1011
|
+
});
|
|
876
1012
|
}
|
|
877
1013
|
var planSchema = {
|
|
878
1014
|
type: "json_schema",
|
|
@@ -1211,10 +1347,10 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1211
1347
|
let usage;
|
|
1212
1348
|
let timeCost;
|
|
1213
1349
|
const commonConfig = {
|
|
1214
|
-
temperature:
|
|
1350
|
+
temperature: vlLocateMode3() === "vlm-ui-tars" ? 0 : 0.1,
|
|
1215
1351
|
stream: !!isStreaming,
|
|
1216
1352
|
max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
|
|
1217
|
-
...
|
|
1353
|
+
...vlLocateMode3() === "qwen-vl" ? {
|
|
1218
1354
|
vl_high_resolution_images: true
|
|
1219
1355
|
} : {}
|
|
1220
1356
|
};
|
|
@@ -1283,7 +1419,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1283
1419
|
}
|
|
1284
1420
|
content = accumulated;
|
|
1285
1421
|
debugProfileStats(
|
|
1286
|
-
`streaming model, ${model}, mode, ${
|
|
1422
|
+
`streaming model, ${model}, mode, ${vlLocateMode3() || "default"}, cost-ms, ${timeCost}`
|
|
1287
1423
|
);
|
|
1288
1424
|
} else {
|
|
1289
1425
|
const result = await completion.create({
|
|
@@ -1294,7 +1430,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1294
1430
|
});
|
|
1295
1431
|
timeCost = Date.now() - startTime;
|
|
1296
1432
|
debugProfileStats(
|
|
1297
|
-
`model, ${model}, mode, ${
|
|
1433
|
+
`model, ${model}, mode, ${vlLocateMode3() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
|
|
1298
1434
|
);
|
|
1299
1435
|
debugProfileDetail(
|
|
1300
1436
|
`model usage detail: ${JSON.stringify(result.usage)}`
|
|
@@ -1490,138 +1626,13 @@ function safeParseJson(input) {
|
|
|
1490
1626
|
return JSON.parse(jsonrepair(cleanJsonString));
|
|
1491
1627
|
} catch (e) {
|
|
1492
1628
|
}
|
|
1493
|
-
if (
|
|
1629
|
+
if (vlLocateMode3() === "doubao-vision" || vlLocateMode3() === "vlm-ui-tars") {
|
|
1494
1630
|
const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
|
|
1495
1631
|
return JSON.parse(jsonrepair(jsonString));
|
|
1496
1632
|
}
|
|
1497
1633
|
throw Error(`failed to parse json response: ${input}`);
|
|
1498
1634
|
}
|
|
1499
1635
|
|
|
1500
|
-
// src/image/index.ts
|
|
1501
|
-
import {
|
|
1502
|
-
imageInfo,
|
|
1503
|
-
imageInfoOfBase64,
|
|
1504
|
-
localImg2Base64,
|
|
1505
|
-
httpImg2Base64,
|
|
1506
|
-
resizeImg,
|
|
1507
|
-
saveBase64Image,
|
|
1508
|
-
zoomForGPT4o
|
|
1509
|
-
} from "@midscene/shared/img";
|
|
1510
|
-
|
|
1511
|
-
// src/ai-model/prompt/util.ts
|
|
1512
|
-
import { NodeType as NodeType2 } from "@midscene/shared/constants";
|
|
1513
|
-
import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
|
|
1514
|
-
import {
|
|
1515
|
-
descriptionOfTree,
|
|
1516
|
-
generateElementByPosition,
|
|
1517
|
-
treeToList as treeToList2
|
|
1518
|
-
} from "@midscene/shared/extractor";
|
|
1519
|
-
import { assert as assert4 } from "@midscene/shared/utils";
|
|
1520
|
-
function describeSize(size) {
|
|
1521
|
-
return `${size.width} x ${size.height}`;
|
|
1522
|
-
}
|
|
1523
|
-
var distanceThreshold = 16;
|
|
1524
|
-
function elementByPositionWithElementInfo(treeRoot, position, options) {
|
|
1525
|
-
const requireStrictDistance = options?.requireStrictDistance ?? true;
|
|
1526
|
-
const filterPositionElements = options?.filterPositionElements ?? false;
|
|
1527
|
-
assert4(typeof position !== "undefined", "position is required for query");
|
|
1528
|
-
const matchingElements = [];
|
|
1529
|
-
function dfs(node) {
|
|
1530
|
-
if (node?.node) {
|
|
1531
|
-
const item = node.node;
|
|
1532
|
-
if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
|
|
1533
|
-
if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
|
|
1534
|
-
matchingElements.push(item);
|
|
1535
|
-
}
|
|
1536
|
-
}
|
|
1537
|
-
}
|
|
1538
|
-
for (const child of node.children) {
|
|
1539
|
-
dfs(child);
|
|
1540
|
-
}
|
|
1541
|
-
}
|
|
1542
|
-
dfs(treeRoot);
|
|
1543
|
-
if (matchingElements.length === 0) {
|
|
1544
|
-
return void 0;
|
|
1545
|
-
}
|
|
1546
|
-
const element = matchingElements.reduce((smallest, current) => {
|
|
1547
|
-
const smallestArea = smallest.rect.width * smallest.rect.height;
|
|
1548
|
-
const currentArea = current.rect.width * current.rect.height;
|
|
1549
|
-
return currentArea < smallestArea ? current : smallest;
|
|
1550
|
-
});
|
|
1551
|
-
const distanceToCenter = distance(
|
|
1552
|
-
{ x: element.center[0], y: element.center[1] },
|
|
1553
|
-
position
|
|
1554
|
-
);
|
|
1555
|
-
if (requireStrictDistance) {
|
|
1556
|
-
return distanceToCenter <= distanceThreshold ? element : void 0;
|
|
1557
|
-
}
|
|
1558
|
-
return element;
|
|
1559
|
-
}
|
|
1560
|
-
function distance(point1, point2) {
|
|
1561
|
-
return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
|
|
1562
|
-
}
|
|
1563
|
-
async function describeUserPage(context, opt) {
|
|
1564
|
-
const { screenshotBase64 } = context;
|
|
1565
|
-
let width;
|
|
1566
|
-
let height;
|
|
1567
|
-
if (context.size) {
|
|
1568
|
-
({ width, height } = context.size);
|
|
1569
|
-
} else {
|
|
1570
|
-
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
1571
|
-
({ width, height } = imgSize);
|
|
1572
|
-
}
|
|
1573
|
-
const treeRoot = context.tree;
|
|
1574
|
-
const idElementMap = {};
|
|
1575
|
-
const flatElements = treeToList2(treeRoot);
|
|
1576
|
-
if (opt?.domIncluded === true && flatElements.length >= 5e3) {
|
|
1577
|
-
console.warn(
|
|
1578
|
-
'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
|
|
1579
|
-
);
|
|
1580
|
-
}
|
|
1581
|
-
flatElements.forEach((element) => {
|
|
1582
|
-
idElementMap[element.id] = element;
|
|
1583
|
-
if (typeof element.indexId !== "undefined") {
|
|
1584
|
-
idElementMap[`${element.indexId}`] = element;
|
|
1585
|
-
}
|
|
1586
|
-
});
|
|
1587
|
-
let pageDescription = "";
|
|
1588
|
-
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
|
|
1589
|
-
if (opt?.domIncluded || !vlLocateMode3()) {
|
|
1590
|
-
const contentTree = await descriptionOfTree(
|
|
1591
|
-
treeRoot,
|
|
1592
|
-
opt?.truncateTextLength,
|
|
1593
|
-
opt?.filterNonTextContent,
|
|
1594
|
-
visibleOnly
|
|
1595
|
-
);
|
|
1596
|
-
const sizeDescription = describeSize({ width, height });
|
|
1597
|
-
pageDescription = `The size of the page: ${sizeDescription}
|
|
1598
|
-
The page elements tree:
|
|
1599
|
-
${contentTree}`;
|
|
1600
|
-
}
|
|
1601
|
-
return {
|
|
1602
|
-
description: pageDescription,
|
|
1603
|
-
elementById(idOrIndexId) {
|
|
1604
|
-
assert4(typeof idOrIndexId !== "undefined", "id is required for query");
|
|
1605
|
-
const item = idElementMap[`${idOrIndexId}`];
|
|
1606
|
-
return item;
|
|
1607
|
-
},
|
|
1608
|
-
elementByPosition(position, size) {
|
|
1609
|
-
return elementByPositionWithElementInfo(treeRoot, position);
|
|
1610
|
-
},
|
|
1611
|
-
insertElementByPosition(position) {
|
|
1612
|
-
const element = generateElementByPosition(position);
|
|
1613
|
-
treeRoot.children.push({
|
|
1614
|
-
node: element,
|
|
1615
|
-
children: []
|
|
1616
|
-
});
|
|
1617
|
-
flatElements.push(element);
|
|
1618
|
-
idElementMap[element.id] = element;
|
|
1619
|
-
return element;
|
|
1620
|
-
},
|
|
1621
|
-
size: { width, height }
|
|
1622
|
-
};
|
|
1623
|
-
}
|
|
1624
|
-
|
|
1625
1636
|
// src/ai-model/prompt/playwright-generator.ts
|
|
1626
1637
|
import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";
|
|
1627
1638
|
|
|
@@ -2003,7 +2014,7 @@ import {
|
|
|
2003
2014
|
preProcessImageUrl
|
|
2004
2015
|
} from "@midscene/shared/img";
|
|
2005
2016
|
import { getDebug as getDebug3 } from "@midscene/shared/logger";
|
|
2006
|
-
import { assert as
|
|
2017
|
+
import { assert as assert4 } from "@midscene/shared/utils";
|
|
2007
2018
|
|
|
2008
2019
|
// src/ai-model/prompt/extraction.ts
|
|
2009
2020
|
import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
|
|
@@ -2213,7 +2224,7 @@ async function AiLocateElement(options) {
|
|
|
2213
2224
|
const { context, targetElementDescription, callAI } = options;
|
|
2214
2225
|
const { screenshotBase64 } = context;
|
|
2215
2226
|
const { description, elementById, insertElementByPosition } = await describeUserPage(context);
|
|
2216
|
-
|
|
2227
|
+
assert4(
|
|
2217
2228
|
targetElementDescription,
|
|
2218
2229
|
"cannot find the target element description"
|
|
2219
2230
|
);
|
|
@@ -2224,11 +2235,11 @@ async function AiLocateElement(options) {
|
|
|
2224
2235
|
const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
|
|
2225
2236
|
let imagePayload = screenshotBase64;
|
|
2226
2237
|
if (options.searchConfig) {
|
|
2227
|
-
|
|
2238
|
+
assert4(
|
|
2228
2239
|
options.searchConfig.rect,
|
|
2229
2240
|
"searchArea is provided but its rect cannot be found. Failed to locate element"
|
|
2230
2241
|
);
|
|
2231
|
-
|
|
2242
|
+
assert4(
|
|
2232
2243
|
options.searchConfig.imageBase64,
|
|
2233
2244
|
"searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
|
|
2234
2245
|
);
|
|
@@ -2451,7 +2462,7 @@ async function AiExtractElementInfo(options) {
|
|
|
2451
2462
|
}
|
|
2452
2463
|
async function AiAssert(options) {
|
|
2453
2464
|
const { assertion, context } = options;
|
|
2454
|
-
|
|
2465
|
+
assert4(assertion, "assertion should not be empty");
|
|
2455
2466
|
const { screenshotBase64 } = context;
|
|
2456
2467
|
const systemPrompt = systemPromptToAssert({
|
|
2457
2468
|
isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
|
|
@@ -2501,13 +2512,13 @@ ${assertionText}
|
|
|
2501
2512
|
// src/ai-model/llm-planning.ts
|
|
2502
2513
|
import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
|
|
2503
2514
|
import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
|
|
2504
|
-
import { assert as assert6 } from "@midscene/shared/utils";
|
|
2515
|
+
import { assert as assert5 } from "@midscene/shared/utils";
|
|
2505
2516
|
async function plan(userInstruction, opts) {
|
|
2506
2517
|
const { callAI, context } = opts || {};
|
|
2507
2518
|
const { screenshotBase64, size } = context;
|
|
2508
2519
|
const { description: pageDescription, elementById } = await describeUserPage(context);
|
|
2509
2520
|
const systemPrompt = await systemPromptToTaskPlanning({
|
|
2510
|
-
|
|
2521
|
+
pageType: opts.pageType,
|
|
2511
2522
|
vlMode: vlLocateMode5()
|
|
2512
2523
|
});
|
|
2513
2524
|
const taskBackgroundContextText = generateTaskBackgroundContext(
|
|
@@ -2563,7 +2574,7 @@ async function plan(userInstruction, opts) {
|
|
|
2563
2574
|
usage,
|
|
2564
2575
|
yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
|
|
2565
2576
|
};
|
|
2566
|
-
|
|
2577
|
+
assert5(planFromAI, "can't get plans from AI");
|
|
2567
2578
|
if (vlLocateMode5()) {
|
|
2568
2579
|
actions.forEach((action) => {
|
|
2569
2580
|
if (action.locate) {
|
|
@@ -2579,7 +2590,7 @@ async function plan(userInstruction, opts) {
|
|
|
2579
2590
|
}
|
|
2580
2591
|
}
|
|
2581
2592
|
});
|
|
2582
|
-
|
|
2593
|
+
assert5(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
2583
2594
|
} else {
|
|
2584
2595
|
actions.forEach((action) => {
|
|
2585
2596
|
if (action.locate?.id) {
|
|
@@ -2608,7 +2619,7 @@ import {
|
|
|
2608
2619
|
import { resizeImgBase64 } from "@midscene/shared/img";
|
|
2609
2620
|
import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
|
|
2610
2621
|
import { getDebug as getDebug4 } from "@midscene/shared/logger";
|
|
2611
|
-
import { assert as assert7 } from "@midscene/shared/utils";
|
|
2622
|
+
import { assert as assert6 } from "@midscene/shared/utils";
|
|
2612
2623
|
import { actionParser } from "@ui-tars/action-parser";
|
|
2613
2624
|
|
|
2614
2625
|
// src/ai-model/prompt/ui-tars-planning.ts
|
|
@@ -2685,7 +2696,7 @@ async function vlmPlanning(options) {
|
|
|
2685
2696
|
const transformActions = [];
|
|
2686
2697
|
parsed.forEach((action) => {
|
|
2687
2698
|
if (action.action_type === "click") {
|
|
2688
|
-
|
|
2699
|
+
assert6(action.action_inputs.start_box, "start_box is required");
|
|
2689
2700
|
const point = getPoint(action.action_inputs.start_box, size);
|
|
2690
2701
|
transformActions.push({
|
|
2691
2702
|
type: "Locate",
|
|
@@ -2712,8 +2723,8 @@ async function vlmPlanning(options) {
|
|
|
2712
2723
|
param: action.thought || ""
|
|
2713
2724
|
});
|
|
2714
2725
|
} else if (action.action_type === "drag") {
|
|
2715
|
-
|
|
2716
|
-
|
|
2726
|
+
assert6(action.action_inputs.start_box, "start_box is required");
|
|
2727
|
+
assert6(action.action_inputs.end_box, "end_box is required");
|
|
2717
2728
|
const startPoint = getPoint(action.action_inputs.start_box, size);
|
|
2718
2729
|
const endPoint = getPoint(action.action_inputs.end_box, size);
|
|
2719
2730
|
transformActions.push({
|
|
@@ -2795,7 +2806,7 @@ async function vlmPlanning(options) {
|
|
|
2795
2806
|
param: {}
|
|
2796
2807
|
});
|
|
2797
2808
|
} else if (action.action_type === "androidLongPress") {
|
|
2798
|
-
|
|
2809
|
+
assert6(
|
|
2799
2810
|
action.action_inputs.start_coords,
|
|
2800
2811
|
"start_coords is required for androidLongPress"
|
|
2801
2812
|
);
|
|
@@ -2889,6 +2900,8 @@ async function resizeImageForUiTars(imageBase64, size) {
|
|
|
2889
2900
|
|
|
2890
2901
|
export {
|
|
2891
2902
|
systemPromptToLocateElement,
|
|
2903
|
+
elementByPositionWithElementInfo,
|
|
2904
|
+
describeUserPage,
|
|
2892
2905
|
call2 as call,
|
|
2893
2906
|
callToGetJSONObject,
|
|
2894
2907
|
callAiFnWithStringResponse,
|
|
@@ -2896,8 +2909,6 @@ export {
|
|
|
2896
2909
|
callAiFn,
|
|
2897
2910
|
adaptBboxToRect,
|
|
2898
2911
|
expandSearchArea,
|
|
2899
|
-
elementByPositionWithElementInfo,
|
|
2900
|
-
describeUserPage,
|
|
2901
2912
|
generateYamlTest,
|
|
2902
2913
|
generateYamlTestStream,
|
|
2903
2914
|
generatePlaywrightTest,
|
|
@@ -2911,4 +2922,4 @@ export {
|
|
|
2911
2922
|
resizeImageForUiTars
|
|
2912
2923
|
};
|
|
2913
2924
|
|
|
2914
|
-
//# sourceMappingURL=chunk-NY6RQSGJ.js.map
|
|
2925
|
+
//# sourceMappingURL=chunk-I5LBWOQA.js.map
|