@midscene/core 0.25.4-beta-20250808064529.0 → 0.25.4-beta-20250811113343.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +3 -3
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-I5LBWOQA.js → chunk-5IZMFZPA.js} +247 -254
- package/dist/es/chunk-5IZMFZPA.js.map +1 -0
- package/dist/es/{chunk-UIEDQYHD.js → chunk-H5PRBRMX.js} +3 -3
- package/dist/es/index.d.ts +4 -4
- package/dist/es/index.js +2 -2
- package/dist/es/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
- package/dist/es/{types-b4a208c6.d.ts → types-16cd9f75.d.ts} +10 -1
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +3 -3
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-I5LBWOQA.js → chunk-5IZMFZPA.js} +236 -243
- package/dist/lib/chunk-5IZMFZPA.js.map +1 -0
- package/dist/lib/{chunk-UIEDQYHD.js → chunk-H5PRBRMX.js} +3 -3
- package/dist/lib/index.d.ts +4 -4
- package/dist/lib/index.js +12 -12
- package/dist/lib/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
- package/dist/{types/types-b4a208c6.d.ts → lib/types-16cd9f75.d.ts} +10 -1
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/index.d.ts +4 -4
- package/dist/types/{llm-planning-92cec090.d.ts → llm-planning-374b74b8.d.ts} +2 -1
- package/dist/{lib/types-b4a208c6.d.ts → types/types-16cd9f75.d.ts} +10 -1
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-I5LBWOQA.js.map +0 -1
- package/dist/lib/chunk-I5LBWOQA.js.map +0 -1
- /package/dist/es/{chunk-UIEDQYHD.js.map → chunk-H5PRBRMX.js.map} +0 -0
- /package/dist/lib/{chunk-UIEDQYHD.js.map → chunk-H5PRBRMX.js.map} +0 -0
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
getAIConfigInBoolean,
|
|
31
31
|
getAIConfigInJson,
|
|
32
32
|
uiTarsModelVersion,
|
|
33
|
-
vlLocateMode as
|
|
33
|
+
vlLocateMode as vlLocateMode2
|
|
34
34
|
} from "@midscene/shared/env";
|
|
35
35
|
import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
|
|
36
36
|
import { assert as assert3 } from "@midscene/shared/utils";
|
|
@@ -633,179 +633,80 @@ Here is the item user want to find:
|
|
|
633
633
|
});
|
|
634
634
|
|
|
635
635
|
// src/ai-model/prompt/llm-planning.ts
|
|
636
|
+
import assert2 from "assert";
|
|
636
637
|
import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
|
|
637
|
-
|
|
638
|
-
// src/image/index.ts
|
|
639
|
-
import {
|
|
640
|
-
imageInfo,
|
|
641
|
-
imageInfoOfBase64,
|
|
642
|
-
localImg2Base64,
|
|
643
|
-
httpImg2Base64,
|
|
644
|
-
resizeImg,
|
|
645
|
-
saveBase64Image,
|
|
646
|
-
zoomForGPT4o
|
|
647
|
-
} from "@midscene/shared/img";
|
|
648
|
-
|
|
649
|
-
// src/ai-model/prompt/util.ts
|
|
650
|
-
import { NodeType as NodeType2 } from "@midscene/shared/constants";
|
|
651
|
-
import { vlLocateMode as vlLocateMode2 } from "@midscene/shared/env";
|
|
652
|
-
import {
|
|
653
|
-
descriptionOfTree,
|
|
654
|
-
generateElementByPosition,
|
|
655
|
-
treeToList as treeToList2
|
|
656
|
-
} from "@midscene/shared/extractor";
|
|
657
|
-
import { assert as assert2 } from "@midscene/shared/utils";
|
|
658
|
-
function describeSize(size) {
|
|
659
|
-
return `${size.width} x ${size.height}`;
|
|
660
|
-
}
|
|
661
|
-
var distanceThreshold = 16;
|
|
662
|
-
function elementByPositionWithElementInfo(treeRoot, position, options) {
|
|
663
|
-
const requireStrictDistance = options?.requireStrictDistance ?? true;
|
|
664
|
-
const filterPositionElements = options?.filterPositionElements ?? false;
|
|
665
|
-
assert2(typeof position !== "undefined", "position is required for query");
|
|
666
|
-
const matchingElements = [];
|
|
667
|
-
function dfs(node) {
|
|
668
|
-
if (node?.node) {
|
|
669
|
-
const item = node.node;
|
|
670
|
-
if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
|
|
671
|
-
if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
|
|
672
|
-
matchingElements.push(item);
|
|
673
|
-
}
|
|
674
|
-
}
|
|
675
|
-
}
|
|
676
|
-
for (const child of node.children) {
|
|
677
|
-
dfs(child);
|
|
678
|
-
}
|
|
679
|
-
}
|
|
680
|
-
dfs(treeRoot);
|
|
681
|
-
if (matchingElements.length === 0) {
|
|
682
|
-
return void 0;
|
|
683
|
-
}
|
|
684
|
-
const element = matchingElements.reduce((smallest, current) => {
|
|
685
|
-
const smallestArea = smallest.rect.width * smallest.rect.height;
|
|
686
|
-
const currentArea = current.rect.width * current.rect.height;
|
|
687
|
-
return currentArea < smallestArea ? current : smallest;
|
|
688
|
-
});
|
|
689
|
-
const distanceToCenter = distance(
|
|
690
|
-
{ x: element.center[0], y: element.center[1] },
|
|
691
|
-
position
|
|
692
|
-
);
|
|
693
|
-
if (requireStrictDistance) {
|
|
694
|
-
return distanceToCenter <= distanceThreshold ? element : void 0;
|
|
695
|
-
}
|
|
696
|
-
return element;
|
|
697
|
-
}
|
|
698
|
-
function distance(point1, point2) {
|
|
699
|
-
return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
|
|
700
|
-
}
|
|
701
|
-
var samplePageDescription = `
|
|
702
|
-
And the page is described as follows:
|
|
703
|
-
====================
|
|
704
|
-
The size of the page: 1280 x 720
|
|
705
|
-
Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
|
|
706
|
-
|
|
707
|
-
Description of all the elements in screenshot:
|
|
708
|
-
<div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
|
|
709
|
-
<h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
|
|
710
|
-
The username is accepted
|
|
711
|
-
</h4>
|
|
712
|
-
...many more
|
|
713
|
-
</div>
|
|
714
|
-
====================
|
|
715
|
-
`;
|
|
716
|
-
async function describeUserPage(context, opt) {
|
|
717
|
-
const { screenshotBase64 } = context;
|
|
718
|
-
let width;
|
|
719
|
-
let height;
|
|
720
|
-
if (context.size) {
|
|
721
|
-
({ width, height } = context.size);
|
|
722
|
-
} else {
|
|
723
|
-
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
724
|
-
({ width, height } = imgSize);
|
|
725
|
-
}
|
|
726
|
-
const treeRoot = context.tree;
|
|
727
|
-
const idElementMap = {};
|
|
728
|
-
const flatElements = treeToList2(treeRoot);
|
|
729
|
-
if (opt?.domIncluded === true && flatElements.length >= 5e3) {
|
|
730
|
-
console.warn(
|
|
731
|
-
'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
|
|
732
|
-
);
|
|
733
|
-
}
|
|
734
|
-
flatElements.forEach((element) => {
|
|
735
|
-
idElementMap[element.id] = element;
|
|
736
|
-
if (typeof element.indexId !== "undefined") {
|
|
737
|
-
idElementMap[`${element.indexId}`] = element;
|
|
738
|
-
}
|
|
739
|
-
});
|
|
740
|
-
let pageDescription = "";
|
|
741
|
-
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
|
|
742
|
-
if (opt?.domIncluded || !vlLocateMode2()) {
|
|
743
|
-
const contentTree = await descriptionOfTree(
|
|
744
|
-
treeRoot,
|
|
745
|
-
opt?.truncateTextLength,
|
|
746
|
-
opt?.filterNonTextContent,
|
|
747
|
-
visibleOnly
|
|
748
|
-
);
|
|
749
|
-
const sizeDescription = describeSize({ width, height });
|
|
750
|
-
pageDescription = `The size of the page: ${sizeDescription}
|
|
751
|
-
The page elements tree:
|
|
752
|
-
${contentTree}`;
|
|
753
|
-
}
|
|
754
|
-
return {
|
|
755
|
-
description: pageDescription,
|
|
756
|
-
elementById(idOrIndexId) {
|
|
757
|
-
assert2(typeof idOrIndexId !== "undefined", "id is required for query");
|
|
758
|
-
const item = idElementMap[`${idOrIndexId}`];
|
|
759
|
-
return item;
|
|
760
|
-
},
|
|
761
|
-
elementByPosition(position, size) {
|
|
762
|
-
return elementByPositionWithElementInfo(treeRoot, position);
|
|
763
|
-
},
|
|
764
|
-
insertElementByPosition(position) {
|
|
765
|
-
const element = generateElementByPosition(position);
|
|
766
|
-
treeRoot.children.push({
|
|
767
|
-
node: element,
|
|
768
|
-
children: []
|
|
769
|
-
});
|
|
770
|
-
flatElements.push(element);
|
|
771
|
-
idElementMap[element.id] = element;
|
|
772
|
-
return element;
|
|
773
|
-
},
|
|
774
|
-
size: { width, height }
|
|
775
|
-
};
|
|
776
|
-
}
|
|
777
|
-
|
|
778
|
-
// src/ai-model/prompt/llm-planning.ts
|
|
779
638
|
var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
|
|
780
|
-
var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{
|
|
781
|
-
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{
|
|
639
|
+
var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
640
|
+
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
782
641
|
var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
|
|
783
642
|
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
784
643
|
var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
|
|
644
|
+
var llmLocateParam = `locate: {"id": string, "prompt": string}`;
|
|
645
|
+
var descriptionForAction = (action, locatorScheme) => {
|
|
646
|
+
const tab = " ";
|
|
647
|
+
let locateParam = "";
|
|
648
|
+
if (action.location === "required") {
|
|
649
|
+
locateParam = locatorScheme;
|
|
650
|
+
} else if (action.location === "optional") {
|
|
651
|
+
locateParam = `${locatorScheme} | null`;
|
|
652
|
+
} else if (action.location === false) {
|
|
653
|
+
locateParam = "";
|
|
654
|
+
}
|
|
655
|
+
const locatorParam = locateParam ? `${tab}- ${locateParam}` : "";
|
|
656
|
+
let whatToLocate = "";
|
|
657
|
+
if (action.whatToLocate) {
|
|
658
|
+
if (!locateParam) {
|
|
659
|
+
console.warn(
|
|
660
|
+
`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
|
|
661
|
+
);
|
|
662
|
+
} else {
|
|
663
|
+
whatToLocate = `${tab}- whatToLocate: ${action.whatToLocate}`;
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
let paramSchema = "";
|
|
667
|
+
if (action.paramSchema) {
|
|
668
|
+
paramSchema = `${tab}- paramSchema: ${action.paramSchema}`;
|
|
669
|
+
}
|
|
670
|
+
let paramDescription = "";
|
|
671
|
+
if (action.paramDescription) {
|
|
672
|
+
assert2(
|
|
673
|
+
paramSchema,
|
|
674
|
+
`paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
|
|
675
|
+
);
|
|
676
|
+
paramDescription = `${tab}- paramDescription: ${action.paramDescription}`;
|
|
677
|
+
}
|
|
678
|
+
const fields = [
|
|
679
|
+
paramSchema,
|
|
680
|
+
paramDescription,
|
|
681
|
+
locatorParam,
|
|
682
|
+
whatToLocate
|
|
683
|
+
].filter(Boolean);
|
|
684
|
+
return `- ${action.name}
|
|
685
|
+
- type: "${action.name}"
|
|
686
|
+
- description: ${action.description}
|
|
687
|
+
${fields.join("\n")}
|
|
688
|
+
`.trim();
|
|
689
|
+
};
|
|
785
690
|
var systemTemplateOfVLPlanning = ({
|
|
786
|
-
|
|
691
|
+
actionSpace,
|
|
787
692
|
vlMode
|
|
788
|
-
}) =>
|
|
693
|
+
}) => {
|
|
694
|
+
const actionNameList = actionSpace.map((action) => action.name).join(", ");
|
|
695
|
+
const actionDescriptionList = actionSpace.map(
|
|
696
|
+
(action) => descriptionForAction(action, vlLocateParam)
|
|
697
|
+
);
|
|
698
|
+
const actionList = actionDescriptionList.join("\n");
|
|
699
|
+
return `
|
|
789
700
|
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
790
701
|
|
|
791
702
|
Restriction:
|
|
792
703
|
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
|
|
793
|
-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are
|
|
704
|
+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
|
|
794
705
|
- Don't repeat actions in the previous logs.
|
|
795
706
|
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
|
|
796
707
|
|
|
797
708
|
Supporting actions:
|
|
798
|
-
|
|
799
|
-
- RightClick: { type: "RightClick", ${vlLocateParam} }
|
|
800
|
-
- Hover: { type: "Hover", ${vlLocateParam} }
|
|
801
|
-
- Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
|
|
802
|
-
- KeyboardPress: { type: "KeyboardPress", param: { value: string } }
|
|
803
|
-
- Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
804
|
-
${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
|
|
805
|
-
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
|
|
806
|
-
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
|
|
807
|
-
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
|
|
808
|
-
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
|
|
709
|
+
${actionList}
|
|
809
710
|
|
|
810
711
|
Field description:
|
|
811
712
|
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
|
|
@@ -840,8 +741,16 @@ this and output the JSON:
|
|
|
840
741
|
}
|
|
841
742
|
}
|
|
842
743
|
`;
|
|
843
|
-
|
|
844
|
-
var systemTemplateOfLLM = ({
|
|
744
|
+
};
|
|
745
|
+
var systemTemplateOfLLM = ({
|
|
746
|
+
actionSpace
|
|
747
|
+
}) => {
|
|
748
|
+
const actionNameList = actionSpace.map((action) => action.name).join(" / ");
|
|
749
|
+
const actionDescriptionList = actionSpace.map(
|
|
750
|
+
(action) => descriptionForAction(action, llmLocateParam)
|
|
751
|
+
);
|
|
752
|
+
const actionList = actionDescriptionList.join("\n");
|
|
753
|
+
return `
|
|
845
754
|
## Role
|
|
846
755
|
|
|
847
756
|
You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
|
|
@@ -855,7 +764,7 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
855
764
|
## Workflow
|
|
856
765
|
|
|
857
766
|
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
858
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (
|
|
767
|
+
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
|
|
859
768
|
3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
|
|
860
769
|
4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
|
|
861
770
|
5. Consider whether the user's instruction will be accomplished after all the actions
|
|
@@ -873,65 +782,30 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
873
782
|
|
|
874
783
|
The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
|
|
875
784
|
|
|
876
|
-
type LocateParam = {
|
|
785
|
+
type LocateParam = {
|
|
877
786
|
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
|
|
878
787
|
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
|
|
879
|
-
}
|
|
788
|
+
} | null // If it's not on the page, the LocateParam should be null
|
|
880
789
|
|
|
881
790
|
## Supported actions
|
|
882
791
|
|
|
883
792
|
Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
- type: 'Hover'
|
|
889
|
-
* {{ ${llmLocateParam} }}
|
|
890
|
-
- type: 'Input', replace the value in the input field
|
|
891
|
-
* {{ ${llmLocateParam}, param: {{ value: string }} }}
|
|
892
|
-
* \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
|
|
893
|
-
- type: 'KeyboardPress', press a key
|
|
894
|
-
* {{ param: {{ value: string }} }}
|
|
895
|
-
- type: 'Scroll', scroll up or down.
|
|
896
|
-
* {{
|
|
897
|
-
${llmLocateParam},
|
|
898
|
-
param: {{
|
|
899
|
-
direction: 'down'(default) | 'up' | 'right' | 'left',
|
|
900
|
-
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
|
|
901
|
-
distance: null | number
|
|
902
|
-
}}
|
|
903
|
-
}}
|
|
904
|
-
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
905
|
-
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
|
|
906
|
-
* {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
|
|
907
|
-
- type: 'ExpectedFalsyCondition'
|
|
908
|
-
* {{ param: {{ reason: string }} }}
|
|
909
|
-
* use this action when the conditional statement talked about in the instruction is falsy.
|
|
910
|
-
- type: 'Sleep'
|
|
911
|
-
* {{ param: {{ timeMs: number }} }}
|
|
912
|
-
${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
|
|
913
|
-
* {{ param: {{}} }}
|
|
914
|
-
- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
|
|
915
|
-
* {{ param: {{}} }}
|
|
916
|
-
- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
|
|
917
|
-
* {{ param: {{}} }}
|
|
918
|
-
- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
|
|
919
|
-
* {{ param: {{ x: number, y: number, duration?: number }} }}
|
|
920
|
-
- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
|
|
921
|
-
* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
|
|
922
|
-
`;
|
|
793
|
+
${actionList}
|
|
794
|
+
|
|
795
|
+
`.trim();
|
|
796
|
+
};
|
|
923
797
|
var outputTemplate = `
|
|
924
798
|
## Output JSON Format:
|
|
925
799
|
|
|
926
800
|
The JSON format is as follows:
|
|
927
801
|
|
|
928
|
-
{
|
|
802
|
+
{
|
|
929
803
|
"actions": [
|
|
930
804
|
// ... some actions
|
|
931
805
|
],
|
|
932
806
|
${llmCurrentLog}
|
|
933
807
|
${commonOutputFields}
|
|
934
|
-
}
|
|
808
|
+
}
|
|
935
809
|
|
|
936
810
|
## Examples
|
|
937
811
|
|
|
@@ -947,68 +821,62 @@ By viewing the page screenshot and description, you should consider this and out
|
|
|
947
821
|
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
|
|
948
822
|
* The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.
|
|
949
823
|
|
|
950
|
-
{
|
|
824
|
+
{
|
|
951
825
|
"actions":[
|
|
952
|
-
{
|
|
826
|
+
{
|
|
953
827
|
"type": "Tap",
|
|
954
828
|
"thought": "Click the language switch button to open the language options.",
|
|
955
829
|
"param": null,
|
|
956
|
-
"locate": {
|
|
957
|
-
}
|
|
958
|
-
{
|
|
830
|
+
"locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
|
|
831
|
+
},
|
|
832
|
+
{
|
|
959
833
|
"type": "Sleep",
|
|
960
834
|
"thought": "Wait for 1 second to ensure the language options are displayed.",
|
|
961
|
-
"param": {
|
|
962
|
-
}
|
|
835
|
+
"param": { "timeMs": 1000 },
|
|
836
|
+
}
|
|
963
837
|
],
|
|
964
838
|
"error": null,
|
|
965
839
|
"more_actions_needed_by_instruction": true,
|
|
966
840
|
"log": "Click the language switch button to open the language options. Wait for 1 second",
|
|
967
|
-
}
|
|
841
|
+
}
|
|
968
842
|
|
|
969
843
|
### Example: What NOT to do
|
|
970
844
|
Wrong output:
|
|
971
|
-
{
|
|
845
|
+
{
|
|
972
846
|
"actions":[
|
|
973
|
-
{
|
|
847
|
+
{
|
|
974
848
|
"type": "Tap",
|
|
975
849
|
"thought": "Click the language switch button to open the language options.",
|
|
976
850
|
"param": null,
|
|
977
|
-
"locate": {
|
|
978
|
-
{
|
|
979
|
-
}
|
|
980
|
-
}
|
|
981
|
-
{
|
|
851
|
+
"locate": {
|
|
852
|
+
{ "id": "c81c4e9a33" }, // WRONG: prompt is missing
|
|
853
|
+
}
|
|
854
|
+
},
|
|
855
|
+
{
|
|
982
856
|
"type": "Tap",
|
|
983
857
|
"thought": "Click the English option",
|
|
984
858
|
"param": null,
|
|
985
859
|
"locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
|
|
986
|
-
}
|
|
860
|
+
}
|
|
987
861
|
],
|
|
988
862
|
"more_actions_needed_by_instruction": false, // WRONG: should be true
|
|
989
863
|
"log": "Click the language switch button to open the language options",
|
|
990
|
-
}
|
|
864
|
+
}
|
|
991
865
|
|
|
992
866
|
Reason:
|
|
993
867
|
* The \`prompt\` is missing in the first 'Locate' action
|
|
994
868
|
* Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
|
|
995
869
|
`;
|
|
996
870
|
async function systemPromptToTaskPlanning({
|
|
997
|
-
|
|
871
|
+
actionSpace,
|
|
998
872
|
vlMode
|
|
999
873
|
}) {
|
|
1000
874
|
if (vlMode) {
|
|
1001
|
-
return systemTemplateOfVLPlanning({
|
|
875
|
+
return systemTemplateOfVLPlanning({ actionSpace, vlMode });
|
|
1002
876
|
}
|
|
1003
|
-
|
|
1004
|
-
template: `${systemTemplateOfLLM({ pageType })}
|
|
877
|
+
return `${systemTemplateOfLLM({ actionSpace })}
|
|
1005
878
|
|
|
1006
|
-
${outputTemplate}
|
|
1007
|
-
inputVariables: ["pageDescription"]
|
|
1008
|
-
});
|
|
1009
|
-
return await promptTemplate.format({
|
|
1010
|
-
pageDescription: samplePageDescription
|
|
1011
|
-
});
|
|
879
|
+
${outputTemplate}`;
|
|
1012
880
|
}
|
|
1013
881
|
var planSchema = {
|
|
1014
882
|
type: "json_schema",
|
|
@@ -1347,10 +1215,10 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1347
1215
|
let usage;
|
|
1348
1216
|
let timeCost;
|
|
1349
1217
|
const commonConfig = {
|
|
1350
|
-
temperature:
|
|
1218
|
+
temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
|
|
1351
1219
|
stream: !!isStreaming,
|
|
1352
1220
|
max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
|
|
1353
|
-
...
|
|
1221
|
+
...vlLocateMode2() === "qwen-vl" ? {
|
|
1354
1222
|
vl_high_resolution_images: true
|
|
1355
1223
|
} : {}
|
|
1356
1224
|
};
|
|
@@ -1419,7 +1287,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1419
1287
|
}
|
|
1420
1288
|
content = accumulated;
|
|
1421
1289
|
debugProfileStats(
|
|
1422
|
-
`streaming model, ${model}, mode, ${
|
|
1290
|
+
`streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
|
|
1423
1291
|
);
|
|
1424
1292
|
} else {
|
|
1425
1293
|
const result = await completion.create({
|
|
@@ -1430,7 +1298,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1430
1298
|
});
|
|
1431
1299
|
timeCost = Date.now() - startTime;
|
|
1432
1300
|
debugProfileStats(
|
|
1433
|
-
`model, ${model}, mode, ${
|
|
1301
|
+
`model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
|
|
1434
1302
|
);
|
|
1435
1303
|
debugProfileDetail(
|
|
1436
1304
|
`model usage detail: ${JSON.stringify(result.usage)}`
|
|
@@ -1626,13 +1494,138 @@ function safeParseJson(input) {
|
|
|
1626
1494
|
return JSON.parse(jsonrepair(cleanJsonString));
|
|
1627
1495
|
} catch (e) {
|
|
1628
1496
|
}
|
|
1629
|
-
if (
|
|
1497
|
+
if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
|
|
1630
1498
|
const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
|
|
1631
1499
|
return JSON.parse(jsonrepair(jsonString));
|
|
1632
1500
|
}
|
|
1633
1501
|
throw Error(`failed to parse json response: ${input}`);
|
|
1634
1502
|
}
|
|
1635
1503
|
|
|
1504
|
+
// src/image/index.ts
|
|
1505
|
+
import {
|
|
1506
|
+
imageInfo,
|
|
1507
|
+
imageInfoOfBase64,
|
|
1508
|
+
localImg2Base64,
|
|
1509
|
+
httpImg2Base64,
|
|
1510
|
+
resizeImg,
|
|
1511
|
+
saveBase64Image,
|
|
1512
|
+
zoomForGPT4o
|
|
1513
|
+
} from "@midscene/shared/img";
|
|
1514
|
+
|
|
1515
|
+
// src/ai-model/prompt/util.ts
|
|
1516
|
+
import { NodeType as NodeType2 } from "@midscene/shared/constants";
|
|
1517
|
+
import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
|
|
1518
|
+
import {
|
|
1519
|
+
descriptionOfTree,
|
|
1520
|
+
generateElementByPosition,
|
|
1521
|
+
treeToList as treeToList2
|
|
1522
|
+
} from "@midscene/shared/extractor";
|
|
1523
|
+
import { assert as assert4 } from "@midscene/shared/utils";
|
|
1524
|
+
function describeSize(size) {
|
|
1525
|
+
return `${size.width} x ${size.height}`;
|
|
1526
|
+
}
|
|
1527
|
+
var distanceThreshold = 16;
|
|
1528
|
+
function elementByPositionWithElementInfo(treeRoot, position, options) {
|
|
1529
|
+
const requireStrictDistance = options?.requireStrictDistance ?? true;
|
|
1530
|
+
const filterPositionElements = options?.filterPositionElements ?? false;
|
|
1531
|
+
assert4(typeof position !== "undefined", "position is required for query");
|
|
1532
|
+
const matchingElements = [];
|
|
1533
|
+
function dfs(node) {
|
|
1534
|
+
if (node?.node) {
|
|
1535
|
+
const item = node.node;
|
|
1536
|
+
if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
|
|
1537
|
+
if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
|
|
1538
|
+
matchingElements.push(item);
|
|
1539
|
+
}
|
|
1540
|
+
}
|
|
1541
|
+
}
|
|
1542
|
+
for (const child of node.children) {
|
|
1543
|
+
dfs(child);
|
|
1544
|
+
}
|
|
1545
|
+
}
|
|
1546
|
+
dfs(treeRoot);
|
|
1547
|
+
if (matchingElements.length === 0) {
|
|
1548
|
+
return void 0;
|
|
1549
|
+
}
|
|
1550
|
+
const element = matchingElements.reduce((smallest, current) => {
|
|
1551
|
+
const smallestArea = smallest.rect.width * smallest.rect.height;
|
|
1552
|
+
const currentArea = current.rect.width * current.rect.height;
|
|
1553
|
+
return currentArea < smallestArea ? current : smallest;
|
|
1554
|
+
});
|
|
1555
|
+
const distanceToCenter = distance(
|
|
1556
|
+
{ x: element.center[0], y: element.center[1] },
|
|
1557
|
+
position
|
|
1558
|
+
);
|
|
1559
|
+
if (requireStrictDistance) {
|
|
1560
|
+
return distanceToCenter <= distanceThreshold ? element : void 0;
|
|
1561
|
+
}
|
|
1562
|
+
return element;
|
|
1563
|
+
}
|
|
1564
|
+
function distance(point1, point2) {
|
|
1565
|
+
return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
|
|
1566
|
+
}
|
|
1567
|
+
async function describeUserPage(context, opt) {
|
|
1568
|
+
const { screenshotBase64 } = context;
|
|
1569
|
+
let width;
|
|
1570
|
+
let height;
|
|
1571
|
+
if (context.size) {
|
|
1572
|
+
({ width, height } = context.size);
|
|
1573
|
+
} else {
|
|
1574
|
+
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
1575
|
+
({ width, height } = imgSize);
|
|
1576
|
+
}
|
|
1577
|
+
const treeRoot = context.tree;
|
|
1578
|
+
const idElementMap = {};
|
|
1579
|
+
const flatElements = treeToList2(treeRoot);
|
|
1580
|
+
if (opt?.domIncluded === true && flatElements.length >= 5e3) {
|
|
1581
|
+
console.warn(
|
|
1582
|
+
'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
|
|
1583
|
+
);
|
|
1584
|
+
}
|
|
1585
|
+
flatElements.forEach((element) => {
|
|
1586
|
+
idElementMap[element.id] = element;
|
|
1587
|
+
if (typeof element.indexId !== "undefined") {
|
|
1588
|
+
idElementMap[`${element.indexId}`] = element;
|
|
1589
|
+
}
|
|
1590
|
+
});
|
|
1591
|
+
let pageDescription = "";
|
|
1592
|
+
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
|
|
1593
|
+
if (opt?.domIncluded || !vlLocateMode3()) {
|
|
1594
|
+
const contentTree = await descriptionOfTree(
|
|
1595
|
+
treeRoot,
|
|
1596
|
+
opt?.truncateTextLength,
|
|
1597
|
+
opt?.filterNonTextContent,
|
|
1598
|
+
visibleOnly
|
|
1599
|
+
);
|
|
1600
|
+
const sizeDescription = describeSize({ width, height });
|
|
1601
|
+
pageDescription = `The size of the page: ${sizeDescription}
|
|
1602
|
+
The page elements tree:
|
|
1603
|
+
${contentTree}`;
|
|
1604
|
+
}
|
|
1605
|
+
return {
|
|
1606
|
+
description: pageDescription,
|
|
1607
|
+
elementById(idOrIndexId) {
|
|
1608
|
+
assert4(typeof idOrIndexId !== "undefined", "id is required for query");
|
|
1609
|
+
const item = idElementMap[`${idOrIndexId}`];
|
|
1610
|
+
return item;
|
|
1611
|
+
},
|
|
1612
|
+
elementByPosition(position, size) {
|
|
1613
|
+
return elementByPositionWithElementInfo(treeRoot, position);
|
|
1614
|
+
},
|
|
1615
|
+
insertElementByPosition(position) {
|
|
1616
|
+
const element = generateElementByPosition(position);
|
|
1617
|
+
treeRoot.children.push({
|
|
1618
|
+
node: element,
|
|
1619
|
+
children: []
|
|
1620
|
+
});
|
|
1621
|
+
flatElements.push(element);
|
|
1622
|
+
idElementMap[element.id] = element;
|
|
1623
|
+
return element;
|
|
1624
|
+
},
|
|
1625
|
+
size: { width, height }
|
|
1626
|
+
};
|
|
1627
|
+
}
|
|
1628
|
+
|
|
1636
1629
|
// src/ai-model/prompt/playwright-generator.ts
|
|
1637
1630
|
import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";
|
|
1638
1631
|
|
|
@@ -2014,7 +2007,7 @@ import {
|
|
|
2014
2007
|
preProcessImageUrl
|
|
2015
2008
|
} from "@midscene/shared/img";
|
|
2016
2009
|
import { getDebug as getDebug3 } from "@midscene/shared/logger";
|
|
2017
|
-
import { assert as
|
|
2010
|
+
import { assert as assert5 } from "@midscene/shared/utils";
|
|
2018
2011
|
|
|
2019
2012
|
// src/ai-model/prompt/extraction.ts
|
|
2020
2013
|
import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
|
|
@@ -2224,7 +2217,7 @@ async function AiLocateElement(options) {
|
|
|
2224
2217
|
const { context, targetElementDescription, callAI } = options;
|
|
2225
2218
|
const { screenshotBase64 } = context;
|
|
2226
2219
|
const { description, elementById, insertElementByPosition } = await describeUserPage(context);
|
|
2227
|
-
|
|
2220
|
+
assert5(
|
|
2228
2221
|
targetElementDescription,
|
|
2229
2222
|
"cannot find the target element description"
|
|
2230
2223
|
);
|
|
@@ -2235,11 +2228,11 @@ async function AiLocateElement(options) {
|
|
|
2235
2228
|
const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
|
|
2236
2229
|
let imagePayload = screenshotBase64;
|
|
2237
2230
|
if (options.searchConfig) {
|
|
2238
|
-
|
|
2231
|
+
assert5(
|
|
2239
2232
|
options.searchConfig.rect,
|
|
2240
2233
|
"searchArea is provided but its rect cannot be found. Failed to locate element"
|
|
2241
2234
|
);
|
|
2242
|
-
|
|
2235
|
+
assert5(
|
|
2243
2236
|
options.searchConfig.imageBase64,
|
|
2244
2237
|
"searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
|
|
2245
2238
|
);
|
|
@@ -2462,7 +2455,7 @@ async function AiExtractElementInfo(options) {
|
|
|
2462
2455
|
}
|
|
2463
2456
|
async function AiAssert(options) {
|
|
2464
2457
|
const { assertion, context } = options;
|
|
2465
|
-
|
|
2458
|
+
assert5(assertion, "assertion should not be empty");
|
|
2466
2459
|
const { screenshotBase64 } = context;
|
|
2467
2460
|
const systemPrompt = systemPromptToAssert({
|
|
2468
2461
|
isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
|
|
@@ -2512,13 +2505,13 @@ ${assertionText}
|
|
|
2512
2505
|
// src/ai-model/llm-planning.ts
|
|
2513
2506
|
import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
|
|
2514
2507
|
import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
|
|
2515
|
-
import { assert as
|
|
2508
|
+
import { assert as assert6 } from "@midscene/shared/utils";
|
|
2516
2509
|
async function plan(userInstruction, opts) {
|
|
2517
2510
|
const { callAI, context } = opts || {};
|
|
2518
2511
|
const { screenshotBase64, size } = context;
|
|
2519
2512
|
const { description: pageDescription, elementById } = await describeUserPage(context);
|
|
2520
2513
|
const systemPrompt = await systemPromptToTaskPlanning({
|
|
2521
|
-
|
|
2514
|
+
actionSpace: opts.actionSpace,
|
|
2522
2515
|
vlMode: vlLocateMode5()
|
|
2523
2516
|
});
|
|
2524
2517
|
const taskBackgroundContextText = generateTaskBackgroundContext(
|
|
@@ -2574,7 +2567,7 @@ async function plan(userInstruction, opts) {
|
|
|
2574
2567
|
usage,
|
|
2575
2568
|
yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
|
|
2576
2569
|
};
|
|
2577
|
-
|
|
2570
|
+
assert6(planFromAI, "can't get plans from AI");
|
|
2578
2571
|
if (vlLocateMode5()) {
|
|
2579
2572
|
actions.forEach((action) => {
|
|
2580
2573
|
if (action.locate) {
|
|
@@ -2590,7 +2583,7 @@ async function plan(userInstruction, opts) {
|
|
|
2590
2583
|
}
|
|
2591
2584
|
}
|
|
2592
2585
|
});
|
|
2593
|
-
|
|
2586
|
+
assert6(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
2594
2587
|
} else {
|
|
2595
2588
|
actions.forEach((action) => {
|
|
2596
2589
|
if (action.locate?.id) {
|
|
@@ -2619,7 +2612,7 @@ import {
|
|
|
2619
2612
|
import { resizeImgBase64 } from "@midscene/shared/img";
|
|
2620
2613
|
import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
|
|
2621
2614
|
import { getDebug as getDebug4 } from "@midscene/shared/logger";
|
|
2622
|
-
import { assert as
|
|
2615
|
+
import { assert as assert7 } from "@midscene/shared/utils";
|
|
2623
2616
|
import { actionParser } from "@ui-tars/action-parser";
|
|
2624
2617
|
|
|
2625
2618
|
// src/ai-model/prompt/ui-tars-planning.ts
|
|
@@ -2696,7 +2689,7 @@ async function vlmPlanning(options) {
|
|
|
2696
2689
|
const transformActions = [];
|
|
2697
2690
|
parsed.forEach((action) => {
|
|
2698
2691
|
if (action.action_type === "click") {
|
|
2699
|
-
|
|
2692
|
+
assert7(action.action_inputs.start_box, "start_box is required");
|
|
2700
2693
|
const point = getPoint(action.action_inputs.start_box, size);
|
|
2701
2694
|
transformActions.push({
|
|
2702
2695
|
type: "Locate",
|
|
@@ -2723,8 +2716,8 @@ async function vlmPlanning(options) {
|
|
|
2723
2716
|
param: action.thought || ""
|
|
2724
2717
|
});
|
|
2725
2718
|
} else if (action.action_type === "drag") {
|
|
2726
|
-
|
|
2727
|
-
|
|
2719
|
+
assert7(action.action_inputs.start_box, "start_box is required");
|
|
2720
|
+
assert7(action.action_inputs.end_box, "end_box is required");
|
|
2728
2721
|
const startPoint = getPoint(action.action_inputs.start_box, size);
|
|
2729
2722
|
const endPoint = getPoint(action.action_inputs.end_box, size);
|
|
2730
2723
|
transformActions.push({
|
|
@@ -2806,7 +2799,7 @@ async function vlmPlanning(options) {
|
|
|
2806
2799
|
param: {}
|
|
2807
2800
|
});
|
|
2808
2801
|
} else if (action.action_type === "androidLongPress") {
|
|
2809
|
-
|
|
2802
|
+
assert7(
|
|
2810
2803
|
action.action_inputs.start_coords,
|
|
2811
2804
|
"start_coords is required for androidLongPress"
|
|
2812
2805
|
);
|
|
@@ -2900,8 +2893,6 @@ async function resizeImageForUiTars(imageBase64, size) {
|
|
|
2900
2893
|
|
|
2901
2894
|
export {
|
|
2902
2895
|
systemPromptToLocateElement,
|
|
2903
|
-
elementByPositionWithElementInfo,
|
|
2904
|
-
describeUserPage,
|
|
2905
2896
|
call2 as call,
|
|
2906
2897
|
callToGetJSONObject,
|
|
2907
2898
|
callAiFnWithStringResponse,
|
|
@@ -2909,6 +2900,8 @@ export {
|
|
|
2909
2900
|
callAiFn,
|
|
2910
2901
|
adaptBboxToRect,
|
|
2911
2902
|
expandSearchArea,
|
|
2903
|
+
elementByPositionWithElementInfo,
|
|
2904
|
+
describeUserPage,
|
|
2912
2905
|
generateYamlTest,
|
|
2913
2906
|
generateYamlTestStream,
|
|
2914
2907
|
generatePlaywrightTest,
|
|
@@ -2922,4 +2915,4 @@ export {
|
|
|
2922
2915
|
resizeImageForUiTars
|
|
2923
2916
|
};
|
|
2924
2917
|
|
|
2925
|
-
//# sourceMappingURL=chunk-
|
|
2918
|
+
//# sourceMappingURL=chunk-5IZMFZPA.js.map
|