@midscene/core 0.26.2-beta-20250812035614.0 → 0.26.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +3 -3
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-I5LBWOQA.js → chunk-DDYIQHOA.js} +268 -310
- package/dist/es/chunk-DDYIQHOA.js.map +1 -0
- package/dist/es/{chunk-KAYSYGXR.js → chunk-O3KUKF2A.js} +3 -3
- package/dist/es/index.d.ts +4 -4
- package/dist/es/index.js +2 -2
- package/dist/es/{llm-planning-92cec090.d.ts → llm-planning-4e0c16fe.d.ts} +2 -1
- package/dist/es/{types-b4a208c6.d.ts → types-8a6be57c.d.ts} +11 -2
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +3 -3
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-I5LBWOQA.js → chunk-DDYIQHOA.js} +257 -299
- package/dist/lib/chunk-DDYIQHOA.js.map +1 -0
- package/dist/lib/{chunk-KAYSYGXR.js → chunk-O3KUKF2A.js} +3 -3
- package/dist/lib/index.d.ts +4 -4
- package/dist/lib/index.js +12 -12
- package/dist/lib/{llm-planning-92cec090.d.ts → llm-planning-4e0c16fe.d.ts} +2 -1
- package/dist/{types/types-b4a208c6.d.ts → lib/types-8a6be57c.d.ts} +11 -2
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +3 -3
- package/dist/types/index.d.ts +4 -4
- package/dist/types/{llm-planning-92cec090.d.ts → llm-planning-4e0c16fe.d.ts} +2 -1
- package/dist/{lib/types-b4a208c6.d.ts → types/types-8a6be57c.d.ts} +11 -2
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-I5LBWOQA.js.map +0 -1
- package/dist/lib/chunk-I5LBWOQA.js.map +0 -1
- /package/dist/es/{chunk-KAYSYGXR.js.map → chunk-O3KUKF2A.js.map} +0 -0
- /package/dist/lib/{chunk-KAYSYGXR.js.map → chunk-O3KUKF2A.js.map} +0 -0
|
@@ -30,7 +30,7 @@ import {
|
|
|
30
30
|
getAIConfigInBoolean,
|
|
31
31
|
getAIConfigInJson,
|
|
32
32
|
uiTarsModelVersion,
|
|
33
|
-
vlLocateMode as
|
|
33
|
+
vlLocateMode as vlLocateMode2
|
|
34
34
|
} from "@midscene/shared/env";
|
|
35
35
|
import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
|
|
36
36
|
import { assert as assert3 } from "@midscene/shared/utils";
|
|
@@ -297,7 +297,7 @@ function buildYamlFlowFromPlans(plans, sleep) {
|
|
|
297
297
|
sleep: param.timeMs
|
|
298
298
|
});
|
|
299
299
|
} else if (type === "AndroidBackButton" || type === "AndroidHomeButton" || type === "AndroidRecentAppsButton" || type === "AndroidLongPress" || type === "AndroidPull") {
|
|
300
|
-
} else if (type === "Error" || type === "
|
|
300
|
+
} else if (type === "Error" || type === "Assert" || type === "AssertWithoutThrow" || type === "Finished") {
|
|
301
301
|
} else {
|
|
302
302
|
console.warn(
|
|
303
303
|
`Cannot convert action ${type} to yaml flow. This should be a bug of Midscene.`
|
|
@@ -633,179 +633,73 @@ Here is the item user want to find:
|
|
|
633
633
|
});
|
|
634
634
|
|
|
635
635
|
// src/ai-model/prompt/llm-planning.ts
|
|
636
|
+
import assert2 from "assert";
|
|
636
637
|
import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
|
|
637
|
-
|
|
638
|
-
//
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
|
|
660
|
-
}
|
|
661
|
-
|
|
662
|
-
function elementByPositionWithElementInfo(treeRoot, position, options) {
|
|
663
|
-
const requireStrictDistance = options?.requireStrictDistance ?? true;
|
|
664
|
-
const filterPositionElements = options?.filterPositionElements ?? false;
|
|
665
|
-
assert2(typeof position !== "undefined", "position is required for query");
|
|
666
|
-
const matchingElements = [];
|
|
667
|
-
function dfs(node) {
|
|
668
|
-
if (node?.node) {
|
|
669
|
-
const item = node.node;
|
|
670
|
-
if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
|
|
671
|
-
if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
|
|
672
|
-
matchingElements.push(item);
|
|
673
|
-
}
|
|
674
|
-
}
|
|
675
|
-
}
|
|
676
|
-
for (const child of node.children) {
|
|
677
|
-
dfs(child);
|
|
638
|
+
var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
|
|
639
|
+
var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
640
|
+
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
641
|
+
var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
|
|
642
|
+
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
643
|
+
var vlLocateParam = (required) => `locate${required ? "" : "?"}: {bbox: [number, number, number, number], prompt: string }`;
|
|
644
|
+
var llmLocateParam = (required) => `locate${required ? "" : "?"}: {"id": string, "prompt": string}`;
|
|
645
|
+
var descriptionForAction = (action, locatorScheme) => {
|
|
646
|
+
const tab = " ";
|
|
647
|
+
let locateParam = "";
|
|
648
|
+
if (action.location === "required") {
|
|
649
|
+
locateParam = locatorScheme;
|
|
650
|
+
} else if (action.location === "optional") {
|
|
651
|
+
locateParam = `${locatorScheme} | null`;
|
|
652
|
+
} else if (action.location === false) {
|
|
653
|
+
locateParam = "";
|
|
654
|
+
}
|
|
655
|
+
const locatorParam = locateParam ? `- ${locateParam}` : "";
|
|
656
|
+
if (action.whatToLocate) {
|
|
657
|
+
if (!locateParam) {
|
|
658
|
+
console.warn(
|
|
659
|
+
`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
|
|
660
|
+
);
|
|
661
|
+
} else {
|
|
662
|
+
locateParam += ` // ${action.whatToLocate}`;
|
|
678
663
|
}
|
|
679
664
|
}
|
|
680
|
-
|
|
681
|
-
if (
|
|
682
|
-
|
|
683
|
-
}
|
|
684
|
-
const element = matchingElements.reduce((smallest, current) => {
|
|
685
|
-
const smallestArea = smallest.rect.width * smallest.rect.height;
|
|
686
|
-
const currentArea = current.rect.width * current.rect.height;
|
|
687
|
-
return currentArea < smallestArea ? current : smallest;
|
|
688
|
-
});
|
|
689
|
-
const distanceToCenter = distance(
|
|
690
|
-
{ x: element.center[0], y: element.center[1] },
|
|
691
|
-
position
|
|
692
|
-
);
|
|
693
|
-
if (requireStrictDistance) {
|
|
694
|
-
return distanceToCenter <= distanceThreshold ? element : void 0;
|
|
695
|
-
}
|
|
696
|
-
return element;
|
|
697
|
-
}
|
|
698
|
-
function distance(point1, point2) {
|
|
699
|
-
return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
|
|
700
|
-
}
|
|
701
|
-
var samplePageDescription = `
|
|
702
|
-
And the page is described as follows:
|
|
703
|
-
====================
|
|
704
|
-
The size of the page: 1280 x 720
|
|
705
|
-
Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
|
|
706
|
-
|
|
707
|
-
Description of all the elements in screenshot:
|
|
708
|
-
<div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
|
|
709
|
-
<h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
|
|
710
|
-
The username is accepted
|
|
711
|
-
</h4>
|
|
712
|
-
...many more
|
|
713
|
-
</div>
|
|
714
|
-
====================
|
|
715
|
-
`;
|
|
716
|
-
async function describeUserPage(context, opt) {
|
|
717
|
-
const { screenshotBase64 } = context;
|
|
718
|
-
let width;
|
|
719
|
-
let height;
|
|
720
|
-
if (context.size) {
|
|
721
|
-
({ width, height } = context.size);
|
|
722
|
-
} else {
|
|
723
|
-
const imgSize = await imageInfoOfBase64(screenshotBase64);
|
|
724
|
-
({ width, height } = imgSize);
|
|
665
|
+
let paramSchema = "";
|
|
666
|
+
if (action.paramSchema) {
|
|
667
|
+
paramSchema = `- param: ${action.paramSchema}`;
|
|
725
668
|
}
|
|
726
|
-
|
|
727
|
-
|
|
728
|
-
|
|
729
|
-
|
|
730
|
-
console.warn(
|
|
731
|
-
'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
|
|
732
|
-
);
|
|
733
|
-
}
|
|
734
|
-
flatElements.forEach((element) => {
|
|
735
|
-
idElementMap[element.id] = element;
|
|
736
|
-
if (typeof element.indexId !== "undefined") {
|
|
737
|
-
idElementMap[`${element.indexId}`] = element;
|
|
738
|
-
}
|
|
739
|
-
});
|
|
740
|
-
let pageDescription = "";
|
|
741
|
-
const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
|
|
742
|
-
if (opt?.domIncluded || !vlLocateMode2()) {
|
|
743
|
-
const contentTree = await descriptionOfTree(
|
|
744
|
-
treeRoot,
|
|
745
|
-
opt?.truncateTextLength,
|
|
746
|
-
opt?.filterNonTextContent,
|
|
747
|
-
visibleOnly
|
|
669
|
+
if (action.paramDescription) {
|
|
670
|
+
assert2(
|
|
671
|
+
paramSchema,
|
|
672
|
+
`paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
|
|
748
673
|
);
|
|
749
|
-
|
|
750
|
-
|
|
751
|
-
|
|
752
|
-
${
|
|
753
|
-
|
|
754
|
-
|
|
755
|
-
|
|
756
|
-
|
|
757
|
-
|
|
758
|
-
const item = idElementMap[`${idOrIndexId}`];
|
|
759
|
-
return item;
|
|
760
|
-
},
|
|
761
|
-
elementByPosition(position, size) {
|
|
762
|
-
return elementByPositionWithElementInfo(treeRoot, position);
|
|
763
|
-
},
|
|
764
|
-
insertElementByPosition(position) {
|
|
765
|
-
const element = generateElementByPosition(position);
|
|
766
|
-
treeRoot.children.push({
|
|
767
|
-
node: element,
|
|
768
|
-
children: []
|
|
769
|
-
});
|
|
770
|
-
flatElements.push(element);
|
|
771
|
-
idElementMap[element.id] = element;
|
|
772
|
-
return element;
|
|
773
|
-
},
|
|
774
|
-
size: { width, height }
|
|
775
|
-
};
|
|
776
|
-
}
|
|
777
|
-
|
|
778
|
-
// src/ai-model/prompt/llm-planning.ts
|
|
779
|
-
var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
|
|
780
|
-
var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
781
|
-
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{{ action-type }}' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
782
|
-
var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
|
|
783
|
-
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
784
|
-
var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
|
|
674
|
+
paramSchema += ` // ${action.paramDescription}`;
|
|
675
|
+
}
|
|
676
|
+
const fields = [paramSchema, locatorParam].filter(Boolean);
|
|
677
|
+
return `- ${action.name}, ${action.description}
|
|
678
|
+
${tab}- type: "${action.name}"
|
|
679
|
+
${tab}${fields.join(`
|
|
680
|
+
${tab}`)}
|
|
681
|
+
`.trim();
|
|
682
|
+
};
|
|
785
683
|
var systemTemplateOfVLPlanning = ({
|
|
786
|
-
|
|
684
|
+
actionSpace,
|
|
787
685
|
vlMode
|
|
788
|
-
}) =>
|
|
686
|
+
}) => {
|
|
687
|
+
const actionNameList = actionSpace.map((action) => action.name).join(", ");
|
|
688
|
+
const actionDescriptionList = actionSpace.map(
|
|
689
|
+
(action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
|
|
690
|
+
);
|
|
691
|
+
const actionList = actionDescriptionList.join("\n");
|
|
692
|
+
return `
|
|
789
693
|
Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
790
694
|
|
|
791
695
|
Restriction:
|
|
792
696
|
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
|
|
793
|
-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are
|
|
697
|
+
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
|
|
794
698
|
- Don't repeat actions in the previous logs.
|
|
795
699
|
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
|
|
796
700
|
|
|
797
701
|
Supporting actions:
|
|
798
|
-
|
|
799
|
-
- RightClick: { type: "RightClick", ${vlLocateParam} }
|
|
800
|
-
- Hover: { type: "Hover", ${vlLocateParam} }
|
|
801
|
-
- Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
|
|
802
|
-
- KeyboardPress: { type: "KeyboardPress", param: { value: string } }
|
|
803
|
-
- Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
804
|
-
${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
|
|
805
|
-
- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
|
|
806
|
-
- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
|
|
807
|
-
- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
|
|
808
|
-
- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
|
|
702
|
+
${actionList}
|
|
809
703
|
|
|
810
704
|
Field description:
|
|
811
705
|
* The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
|
|
@@ -840,8 +734,19 @@ this and output the JSON:
|
|
|
840
734
|
}
|
|
841
735
|
}
|
|
842
736
|
`;
|
|
843
|
-
|
|
844
|
-
var systemTemplateOfLLM = ({
|
|
737
|
+
};
|
|
738
|
+
var systemTemplateOfLLM = ({
|
|
739
|
+
actionSpace
|
|
740
|
+
}) => {
|
|
741
|
+
const actionNameList = actionSpace.map((action) => action.name).join(" / ");
|
|
742
|
+
const actionDescriptionList = actionSpace.map(
|
|
743
|
+
(action) => descriptionForAction(
|
|
744
|
+
action,
|
|
745
|
+
llmLocateParam(action.location === "required")
|
|
746
|
+
)
|
|
747
|
+
);
|
|
748
|
+
const actionList = actionDescriptionList.join("\n");
|
|
749
|
+
return `
|
|
845
750
|
## Role
|
|
846
751
|
|
|
847
752
|
You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
|
|
@@ -855,16 +760,15 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
855
760
|
## Workflow
|
|
856
761
|
|
|
857
762
|
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
858
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (
|
|
859
|
-
3.
|
|
860
|
-
|
|
861
|
-
|
|
862
|
-
|
|
863
|
-
- If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.
|
|
763
|
+
2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
|
|
764
|
+
3. Consider whether the user's instruction will be accomplished after the actions you composed.
|
|
765
|
+
- If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
|
|
766
|
+
- If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
|
|
767
|
+
4. If the task is not feasible on this page, set \`error\` field to the reason.
|
|
864
768
|
|
|
865
769
|
## Constraints
|
|
866
770
|
|
|
867
|
-
- All the actions you composed MUST be
|
|
771
|
+
- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
|
|
868
772
|
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
|
|
869
773
|
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
|
|
870
774
|
- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
|
|
@@ -873,158 +777,117 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
873
777
|
|
|
874
778
|
The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
|
|
875
779
|
|
|
876
|
-
type LocateParam = {
|
|
780
|
+
type LocateParam = {
|
|
877
781
|
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
|
|
878
782
|
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
|
|
879
|
-
}
|
|
783
|
+
} | null // If it's not on the page, the LocateParam should be null
|
|
880
784
|
|
|
881
785
|
## Supported actions
|
|
882
786
|
|
|
883
787
|
Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
884
|
-
|
|
885
|
-
|
|
886
|
-
|
|
887
|
-
|
|
888
|
-
- type: 'Hover'
|
|
889
|
-
* {{ ${llmLocateParam} }}
|
|
890
|
-
- type: 'Input', replace the value in the input field
|
|
891
|
-
* {{ ${llmLocateParam}, param: {{ value: string }} }}
|
|
892
|
-
* \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
|
|
893
|
-
- type: 'KeyboardPress', press a key
|
|
894
|
-
* {{ param: {{ value: string }} }}
|
|
895
|
-
- type: 'Scroll', scroll up or down.
|
|
896
|
-
* {{
|
|
897
|
-
${llmLocateParam},
|
|
898
|
-
param: {{
|
|
899
|
-
direction: 'down'(default) | 'up' | 'right' | 'left',
|
|
900
|
-
scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
|
|
901
|
-
distance: null | number
|
|
902
|
-
}}
|
|
903
|
-
}}
|
|
904
|
-
* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
|
|
905
|
-
* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
|
|
906
|
-
* {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
|
|
907
|
-
- type: 'ExpectedFalsyCondition'
|
|
908
|
-
* {{ param: {{ reason: string }} }}
|
|
909
|
-
* use this action when the conditional statement talked about in the instruction is falsy.
|
|
910
|
-
- type: 'Sleep'
|
|
911
|
-
* {{ param: {{ timeMs: number }} }}
|
|
912
|
-
${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
|
|
913
|
-
* {{ param: {{}} }}
|
|
914
|
-
- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
|
|
915
|
-
* {{ param: {{}} }}
|
|
916
|
-
- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
|
|
917
|
-
* {{ param: {{}} }}
|
|
918
|
-
- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
|
|
919
|
-
* {{ param: {{ x: number, y: number, duration?: number }} }}
|
|
920
|
-
- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
|
|
921
|
-
* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
|
|
922
|
-
`;
|
|
788
|
+
${actionList}
|
|
789
|
+
|
|
790
|
+
`.trim();
|
|
791
|
+
};
|
|
923
792
|
var outputTemplate = `
|
|
924
793
|
## Output JSON Format:
|
|
925
794
|
|
|
926
795
|
The JSON format is as follows:
|
|
927
796
|
|
|
928
|
-
{
|
|
797
|
+
{
|
|
929
798
|
"actions": [
|
|
930
799
|
// ... some actions
|
|
931
800
|
],
|
|
932
801
|
${llmCurrentLog}
|
|
933
802
|
${commonOutputFields}
|
|
934
|
-
}
|
|
803
|
+
}
|
|
935
804
|
|
|
936
805
|
## Examples
|
|
937
806
|
|
|
938
807
|
### Example: Decompose a task
|
|
939
808
|
|
|
940
|
-
When
|
|
809
|
+
When you received the following information:
|
|
810
|
+
|
|
811
|
+
* Instruction: 'Click the language switch button, wait 1s, click "English"'
|
|
812
|
+
* Logs: null
|
|
813
|
+
* Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
|
|
941
814
|
|
|
942
815
|
By viewing the page screenshot and description, you should consider this and output the JSON:
|
|
943
816
|
|
|
944
|
-
* The
|
|
945
|
-
* The language switch button is shown in the screenshot,
|
|
817
|
+
* The user intent is: tap the switch button, sleep, and tap the 'English' option
|
|
818
|
+
* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
|
|
819
|
+
* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
|
|
946
820
|
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
|
|
947
821
|
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
|
|
948
|
-
* The task cannot be accomplished (because
|
|
822
|
+
* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
|
|
949
823
|
|
|
950
|
-
{
|
|
824
|
+
{
|
|
951
825
|
"actions":[
|
|
952
|
-
{
|
|
953
|
-
"type": "Tap",
|
|
826
|
+
{
|
|
954
827
|
"thought": "Click the language switch button to open the language options.",
|
|
828
|
+
"type": "Tap",
|
|
955
829
|
"param": null,
|
|
956
|
-
"locate": {
|
|
957
|
-
}
|
|
958
|
-
{
|
|
959
|
-
"type": "Sleep",
|
|
830
|
+
"locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
|
|
831
|
+
},
|
|
832
|
+
{
|
|
960
833
|
"thought": "Wait for 1 second to ensure the language options are displayed.",
|
|
961
|
-
"
|
|
962
|
-
|
|
834
|
+
"type": "Sleep",
|
|
835
|
+
"param": { "timeMs": 1000 },
|
|
836
|
+
}
|
|
963
837
|
],
|
|
964
838
|
"error": null,
|
|
965
839
|
"more_actions_needed_by_instruction": true,
|
|
966
840
|
"log": "Click the language switch button to open the language options. Wait for 1 second",
|
|
967
|
-
}
|
|
841
|
+
}
|
|
968
842
|
|
|
969
843
|
### Example: What NOT to do
|
|
970
844
|
Wrong output:
|
|
971
|
-
{
|
|
845
|
+
{
|
|
972
846
|
"actions":[
|
|
973
|
-
{
|
|
974
|
-
"type": "Tap",
|
|
847
|
+
{
|
|
975
848
|
"thought": "Click the language switch button to open the language options.",
|
|
849
|
+
"type": "Tap",
|
|
976
850
|
"param": null,
|
|
977
|
-
"locate": {
|
|
978
|
-
{
|
|
979
|
-
}
|
|
980
|
-
}
|
|
981
|
-
{
|
|
982
|
-
"type": "Tap",
|
|
851
|
+
"locate": {
|
|
852
|
+
{ "id": "c81c4e9a33" }, // WRONG: prompt is missing, this is not a valid LocateParam
|
|
853
|
+
}
|
|
854
|
+
},
|
|
855
|
+
{
|
|
983
856
|
"thought": "Click the English option",
|
|
857
|
+
"type": "Tap",
|
|
984
858
|
"param": null,
|
|
985
859
|
"locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
|
|
986
|
-
}
|
|
860
|
+
}
|
|
987
861
|
],
|
|
988
862
|
"more_actions_needed_by_instruction": false, // WRONG: should be true
|
|
989
863
|
"log": "Click the language switch button to open the language options",
|
|
990
|
-
}
|
|
991
|
-
|
|
992
|
-
Reason:
|
|
993
|
-
* The \`prompt\` is missing in the first 'Locate' action
|
|
994
|
-
* Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
|
|
864
|
+
}
|
|
995
865
|
`;
|
|
996
866
|
async function systemPromptToTaskPlanning({
|
|
997
|
-
|
|
867
|
+
actionSpace,
|
|
998
868
|
vlMode
|
|
999
869
|
}) {
|
|
1000
870
|
if (vlMode) {
|
|
1001
|
-
return systemTemplateOfVLPlanning({
|
|
871
|
+
return systemTemplateOfVLPlanning({ actionSpace, vlMode });
|
|
1002
872
|
}
|
|
1003
|
-
|
|
1004
|
-
template: `${systemTemplateOfLLM({ pageType })}
|
|
873
|
+
return `${systemTemplateOfLLM({ actionSpace })}
|
|
1005
874
|
|
|
1006
|
-
${outputTemplate}
|
|
1007
|
-
inputVariables: ["pageDescription"]
|
|
1008
|
-
});
|
|
1009
|
-
return await promptTemplate.format({
|
|
1010
|
-
pageDescription: samplePageDescription
|
|
1011
|
-
});
|
|
875
|
+
${outputTemplate}`;
|
|
1012
876
|
}
|
|
1013
877
|
var planSchema = {
|
|
1014
878
|
type: "json_schema",
|
|
1015
879
|
json_schema: {
|
|
1016
880
|
name: "action_items",
|
|
1017
|
-
strict:
|
|
881
|
+
strict: false,
|
|
1018
882
|
schema: {
|
|
1019
883
|
type: "object",
|
|
1020
|
-
strict:
|
|
884
|
+
strict: false,
|
|
1021
885
|
properties: {
|
|
1022
886
|
actions: {
|
|
1023
|
-
// TODO
|
|
1024
887
|
type: "array",
|
|
1025
888
|
items: {
|
|
1026
889
|
type: "object",
|
|
1027
|
-
strict:
|
|
890
|
+
strict: false,
|
|
1028
891
|
properties: {
|
|
1029
892
|
thought: {
|
|
1030
893
|
type: "string",
|
|
@@ -1032,47 +895,17 @@ var planSchema = {
|
|
|
1032
895
|
},
|
|
1033
896
|
type: {
|
|
1034
897
|
type: "string",
|
|
1035
|
-
description:
|
|
898
|
+
description: "Type of action"
|
|
1036
899
|
},
|
|
1037
900
|
param: {
|
|
1038
901
|
anyOf: [
|
|
1039
902
|
{ type: "null" },
|
|
1040
903
|
{
|
|
1041
904
|
type: "object",
|
|
1042
|
-
|
|
1043
|
-
required: ["value"],
|
|
1044
|
-
additionalProperties: false
|
|
1045
|
-
},
|
|
1046
|
-
{
|
|
1047
|
-
type: "object",
|
|
1048
|
-
properties: { timeMs: { type: ["number", "string"] } },
|
|
1049
|
-
required: ["timeMs"],
|
|
1050
|
-
additionalProperties: false
|
|
1051
|
-
},
|
|
1052
|
-
{
|
|
1053
|
-
type: "object",
|
|
1054
|
-
properties: {
|
|
1055
|
-
direction: { type: "string" },
|
|
1056
|
-
scrollType: { type: "string" },
|
|
1057
|
-
distance: { type: ["number", "string", "null"] }
|
|
1058
|
-
},
|
|
1059
|
-
required: ["direction", "scrollType", "distance"],
|
|
1060
|
-
additionalProperties: false
|
|
1061
|
-
},
|
|
1062
|
-
{
|
|
1063
|
-
type: "object",
|
|
1064
|
-
properties: { reason: { type: "string" } },
|
|
1065
|
-
required: ["reason"],
|
|
1066
|
-
additionalProperties: false
|
|
1067
|
-
},
|
|
1068
|
-
{
|
|
1069
|
-
type: "object",
|
|
1070
|
-
properties: { button: { type: "string" } },
|
|
1071
|
-
required: ["button"],
|
|
1072
|
-
additionalProperties: false
|
|
905
|
+
additionalProperties: true
|
|
1073
906
|
}
|
|
1074
907
|
],
|
|
1075
|
-
description: "Parameter of the action
|
|
908
|
+
description: "Parameter of the action"
|
|
1076
909
|
},
|
|
1077
910
|
locate: {
|
|
1078
911
|
type: ["object", "null"],
|
|
@@ -1347,10 +1180,10 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1347
1180
|
let usage;
|
|
1348
1181
|
let timeCost;
|
|
1349
1182
|
const commonConfig = {
|
|
1350
|
-
temperature:
|
|
1183
|
+
temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
|
|
1351
1184
|
stream: !!isStreaming,
|
|
1352
1185
|
max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
|
|
1353
|
-
...
|
|
1186
|
+
...vlLocateMode2() === "qwen-vl" ? {
|
|
1354
1187
|
vl_high_resolution_images: true
|
|
1355
1188
|
} : {}
|
|
1356
1189
|
};
|
|
@@ -1419,7 +1252,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1419
1252
|
}
|
|
1420
1253
|
content = accumulated;
|
|
1421
1254
|
debugProfileStats(
|
|
1422
|
-
`streaming model, ${model}, mode, ${
|
|
1255
|
+
`streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
|
|
1423
1256
|
);
|
|
1424
1257
|
} else {
|
|
1425
1258
|
const result = await completion.create({
|
|
@@ -1430,7 +1263,7 @@ async function call2(messages, AIActionTypeValue, responseFormat, options) {
|
|
|
1430
1263
|
});
|
|
1431
1264
|
timeCost = Date.now() - startTime;
|
|
1432
1265
|
debugProfileStats(
|
|
1433
|
-
`model, ${model}, mode, ${
|
|
1266
|
+
`model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
|
|
1434
1267
|
);
|
|
1435
1268
|
debugProfileDetail(
|
|
1436
1269
|
`model usage detail: ${JSON.stringify(result.usage)}`
|
|
@@ -1626,13 +1459,138 @@ function safeParseJson(input) {
|
|
|
1626
1459
|
return JSON.parse(jsonrepair(cleanJsonString));
|
|
1627
1460
|
} catch (e) {
|
|
1628
1461
|
}
|
|
1629
|
-
if (
|
|
1462
|
+
if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
|
|
1630
1463
|
const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
|
|
1631
1464
|
return JSON.parse(jsonrepair(jsonString));
|
|
1632
1465
|
}
|
|
1633
1466
|
throw Error(`failed to parse json response: ${input}`);
|
|
1634
1467
|
}
|
|
1635
1468
|
|
|
1469
|
+
// src/image/index.ts
|
|
1470
|
+
import {
|
|
1471
|
+
imageInfo,
|
|
1472
|
+
imageInfoOfBase64,
|
|
1473
|
+
localImg2Base64,
|
|
1474
|
+
httpImg2Base64,
|
|
1475
|
+
resizeImg,
|
|
1476
|
+
saveBase64Image,
|
|
1477
|
+
zoomForGPT4o
|
|
1478
|
+
} from "@midscene/shared/img";
|
|
1479
|
+
|
|
1480
|
+
// src/ai-model/prompt/util.ts
|
|
1481
|
+
import { NodeType as NodeType2 } from "@midscene/shared/constants";
|
|
1482
|
+
import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
|
|
1483
|
+
import {
|
|
1484
|
+
descriptionOfTree,
|
|
1485
|
+
generateElementByPosition,
|
|
1486
|
+
treeToList as treeToList2
|
|
1487
|
+
} from "@midscene/shared/extractor";
|
|
1488
|
+
import { assert as assert4 } from "@midscene/shared/utils";
|
|
1489
|
+
/**
 * Render a size object as the text "<width> x <height>".
 *
 * @param {{ width: *, height: * }} size - Object whose `width` and `height`
 *   are interpolated verbatim into the result.
 * @returns {string} The formatted size description.
 */
function describeSize(size) {
  const { width, height } = size;
  return `${width} x ${height}`;
}
|
|
1492
|
+
// Upper bound on the distance between a queried position and the center of the
// matched element; elementByPositionWithElementInfo rejects matches farther
// away than this when strict distance checking is enabled.
// NOTE(review): unit is presumably pixels in page coordinates — confirm.
var distanceThreshold = 16;
|
|
1493
|
+
/**
 * Find the smallest visible element whose rect contains `position`.
 *
 * The tree is walked depth-first; every node carrying an element (`node.node`)
 * whose rect encloses the position and whose `isVisible` flag is truthy is a
 * candidate. Among candidates the one with the smallest rect area wins (the
 * first one encountered wins ties).
 *
 * @param {object} treeRoot - Root tree node, shaped `{ node, children }`.
 *   Nullish nodes and nodes without `children` are skipped instead of throwing.
 * @param {{ x: number, y: number }} position - Point to look up (required).
 * @param {object} [options]
 * @param {boolean} [options.requireStrictDistance=true] - When true, the match
 *   is only returned if its center lies within `distanceThreshold` of `position`.
 * @param {boolean} [options.filterPositionElements=false] - When true, elements
 *   whose `attributes.nodeType` equals `NodeType2.POSITION` are ignored.
 * @returns {object|undefined} The matched element, or `undefined` if nothing
 *   qualifies.
 */
function elementByPositionWithElementInfo(treeRoot, position, options) {
  const requireStrictDistance = options?.requireStrictDistance ?? true;
  const filterPositionElements = options?.filterPositionElements ?? false;
  assert4(typeof position !== "undefined", "position is required for query");

  const matchingElements = [];

  const containsPosition = (rect) =>
    rect.left <= position.x &&
    position.x <= rect.left + rect.width &&
    rect.top <= position.y &&
    position.y <= rect.top + rect.height;

  function dfs(node) {
    // The original guarded `node?.node` but then dereferenced `node.children`
    // unconditionally; bail out early so a nullish node is handled uniformly.
    if (!node) {
      return;
    }
    const item = node.node;
    if (item && containsPosition(item.rect)) {
      const isFilteredPositionNode =
        filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION;
      if (!isFilteredPositionNode && item.isVisible) {
        matchingElements.push(item);
      }
    }
    // Leaf nodes may omit `children`; treat that as "no children".
    for (const child of node.children ?? []) {
      dfs(child);
    }
  }

  dfs(treeRoot);
  if (matchingElements.length === 0) {
    return void 0;
  }

  const areaOf = (candidate) => candidate.rect.width * candidate.rect.height;
  const element = matchingElements.reduce(
    (smallest, current) => (areaOf(current) < areaOf(smallest) ? current : smallest),
    matchingElements[0]
  );

  const distanceToCenter = distance(
    { x: element.center[0], y: element.center[1] },
    position
  );
  if (requireStrictDistance) {
    return distanceToCenter <= distanceThreshold ? element : void 0;
  }
  return element;
}
|
|
1529
|
+
/**
 * Straight-line (Euclidean) distance between two 2D points.
 *
 * @param {{ x: number, y: number }} point1 - First point.
 * @param {{ x: number, y: number }} point2 - Second point.
 * @returns {number} Square root of the summed squared coordinate differences.
 */
function distance(point1, point2) {
  const deltaX = point1.x - point2.x;
  const deltaY = point1.y - point2.y;
  const squaredLength = deltaX ** 2 + deltaY ** 2;
  return Math.sqrt(squaredLength);
}
|
|
1532
|
+
/**
 * Build a textual description of the page plus element lookup helpers.
 *
 * The page size comes from `context.size` when present, otherwise it is read
 * from the screenshot via `imageInfoOfBase64`. The element tree is flattened
 * and indexed by `id` and, when defined, by `indexId`. The description text is
 * only produced when `opt.domIncluded` is truthy or `vlLocateMode3()` is falsy;
 * otherwise it is the empty string.
 *
 * @param {object} context - Must provide `tree`; may provide `size`
 *   (`{ width, height }`) and `screenshotBase64` (used only if `size` is absent).
 * @param {object} [opt]
 * @param {boolean|string} [opt.domIncluded] - Truthy forces the DOM description;
 *   the value "visible-only" also turns on `visibleOnly` unless set explicitly.
 * @param {boolean} [opt.visibleOnly] - Forwarded to `descriptionOfTree`.
 * @param {*} [opt.truncateTextLength] - Forwarded to `descriptionOfTree`.
 * @param {*} [opt.filterNonTextContent] - Forwarded to `descriptionOfTree`.
 * @returns {Promise<{
 *   description: string,
 *   elementById: Function,
 *   elementByPosition: Function,
 *   insertElementByPosition: Function,
 *   size: { width: *, height: * }
 * }>}
 */
async function describeUserPage(context, opt) {
  const { screenshotBase64 } = context;
  const { width, height } = context.size
    ? context.size
    : await imageInfoOfBase64(screenshotBase64);

  const treeRoot = context.tree;
  const flatElements = treeToList2(treeRoot);
  if (opt?.domIncluded === true && flatElements.length >= 5e3) {
    console.warn(
      'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
    );
  }

  // Null-prototype dictionary: ids handed to elementById (which may originate
  // from model output) such as "constructor" or "toString" must resolve to
  // undefined rather than to an inherited Object.prototype member.
  const idElementMap = Object.create(null);
  for (const element of flatElements) {
    idElementMap[element.id] = element;
    if (typeof element.indexId !== "undefined") {
      idElementMap[`${element.indexId}`] = element;
    }
  }

  let pageDescription = "";
  const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
  if (opt?.domIncluded || !vlLocateMode3()) {
    const contentTree = await descriptionOfTree(
      treeRoot,
      opt?.truncateTextLength,
      opt?.filterNonTextContent,
      visibleOnly
    );
    const sizeDescription = describeSize({ width, height });
    pageDescription = `The size of the page: ${sizeDescription}\nThe page elements tree:\n${contentTree}`;
  }

  return {
    description: pageDescription,
    /** Look up an element by its `id` or by its `indexId`. */
    elementById(idOrIndexId) {
      assert4(typeof idOrIndexId !== "undefined", "id is required for query");
      return idElementMap[`${idOrIndexId}`];
    },
    /** Look up an element by position; `size` is accepted but currently unused. */
    elementByPosition(position, size) {
      return elementByPositionWithElementInfo(treeRoot, position);
    },
    /** Create an element for `position`, attach it under the tree root and index it by id. */
    insertElementByPosition(position) {
      const element = generateElementByPosition(position);
      treeRoot.children.push({
        node: element,
        children: []
      });
      flatElements.push(element);
      idElementMap[element.id] = element;
      return element;
    },
    size: { width, height }
  };
}
|
|
1593
|
+
|
|
1636
1594
|
// src/ai-model/prompt/playwright-generator.ts
|
|
1637
1595
|
import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";
|
|
1638
1596
|
|
|
@@ -2014,7 +1972,7 @@ import {
|
|
|
2014
1972
|
preProcessImageUrl
|
|
2015
1973
|
} from "@midscene/shared/img";
|
|
2016
1974
|
import { getDebug as getDebug3 } from "@midscene/shared/logger";
|
|
2017
|
-
import { assert as
|
|
1975
|
+
import { assert as assert5 } from "@midscene/shared/utils";
|
|
2018
1976
|
|
|
2019
1977
|
// src/ai-model/prompt/extraction.ts
|
|
2020
1978
|
import { PromptTemplate as PromptTemplate3 } from "@langchain/core/prompts";
|
|
@@ -2224,7 +2182,7 @@ async function AiLocateElement(options) {
|
|
|
2224
2182
|
const { context, targetElementDescription, callAI } = options;
|
|
2225
2183
|
const { screenshotBase64 } = context;
|
|
2226
2184
|
const { description, elementById, insertElementByPosition } = await describeUserPage(context);
|
|
2227
|
-
|
|
2185
|
+
assert5(
|
|
2228
2186
|
targetElementDescription,
|
|
2229
2187
|
"cannot find the target element description"
|
|
2230
2188
|
);
|
|
@@ -2235,11 +2193,11 @@ async function AiLocateElement(options) {
|
|
|
2235
2193
|
const systemPrompt = systemPromptToLocateElement(vlLocateMode4());
|
|
2236
2194
|
let imagePayload = screenshotBase64;
|
|
2237
2195
|
if (options.searchConfig) {
|
|
2238
|
-
|
|
2196
|
+
assert5(
|
|
2239
2197
|
options.searchConfig.rect,
|
|
2240
2198
|
"searchArea is provided but its rect cannot be found. Failed to locate element"
|
|
2241
2199
|
);
|
|
2242
|
-
|
|
2200
|
+
assert5(
|
|
2243
2201
|
options.searchConfig.imageBase64,
|
|
2244
2202
|
"searchArea is provided but its imageBase64 cannot be found. Failed to locate element"
|
|
2245
2203
|
);
|
|
@@ -2462,7 +2420,7 @@ async function AiExtractElementInfo(options) {
|
|
|
2462
2420
|
}
|
|
2463
2421
|
async function AiAssert(options) {
|
|
2464
2422
|
const { assertion, context } = options;
|
|
2465
|
-
|
|
2423
|
+
assert5(assertion, "assertion should not be empty");
|
|
2466
2424
|
const { screenshotBase64 } = context;
|
|
2467
2425
|
const systemPrompt = systemPromptToAssert({
|
|
2468
2426
|
isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
|
|
@@ -2512,13 +2470,13 @@ ${assertionText}
|
|
|
2512
2470
|
// src/ai-model/llm-planning.ts
|
|
2513
2471
|
import { vlLocateMode as vlLocateMode5 } from "@midscene/shared/env";
|
|
2514
2472
|
import { paddingToMatchBlockByBase64 as paddingToMatchBlockByBase642 } from "@midscene/shared/img";
|
|
2515
|
-
import { assert as
|
|
2473
|
+
import { assert as assert6 } from "@midscene/shared/utils";
|
|
2516
2474
|
async function plan(userInstruction, opts) {
|
|
2517
2475
|
const { callAI, context } = opts || {};
|
|
2518
2476
|
const { screenshotBase64, size } = context;
|
|
2519
2477
|
const { description: pageDescription, elementById } = await describeUserPage(context);
|
|
2520
2478
|
const systemPrompt = await systemPromptToTaskPlanning({
|
|
2521
|
-
|
|
2479
|
+
actionSpace: opts.actionSpace,
|
|
2522
2480
|
vlMode: vlLocateMode5()
|
|
2523
2481
|
});
|
|
2524
2482
|
const taskBackgroundContextText = generateTaskBackgroundContext(
|
|
@@ -2574,7 +2532,7 @@ async function plan(userInstruction, opts) {
|
|
|
2574
2532
|
usage,
|
|
2575
2533
|
yamlFlow: buildYamlFlowFromPlans(actions, planFromAI.sleep)
|
|
2576
2534
|
};
|
|
2577
|
-
|
|
2535
|
+
assert6(planFromAI, "can't get plans from AI");
|
|
2578
2536
|
if (vlLocateMode5()) {
|
|
2579
2537
|
actions.forEach((action) => {
|
|
2580
2538
|
if (action.locate) {
|
|
@@ -2590,7 +2548,7 @@ async function plan(userInstruction, opts) {
|
|
|
2590
2548
|
}
|
|
2591
2549
|
}
|
|
2592
2550
|
});
|
|
2593
|
-
|
|
2551
|
+
assert6(!planFromAI.error, `Failed to plan actions: ${planFromAI.error}`);
|
|
2594
2552
|
} else {
|
|
2595
2553
|
actions.forEach((action) => {
|
|
2596
2554
|
if (action.locate?.id) {
|
|
@@ -2619,7 +2577,7 @@ import {
|
|
|
2619
2577
|
import { resizeImgBase64 } from "@midscene/shared/img";
|
|
2620
2578
|
import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
|
|
2621
2579
|
import { getDebug as getDebug4 } from "@midscene/shared/logger";
|
|
2622
|
-
import { assert as
|
|
2580
|
+
import { assert as assert7 } from "@midscene/shared/utils";
|
|
2623
2581
|
import { actionParser } from "@ui-tars/action-parser";
|
|
2624
2582
|
|
|
2625
2583
|
// src/ai-model/prompt/ui-tars-planning.ts
|
|
@@ -2696,7 +2654,7 @@ async function vlmPlanning(options) {
|
|
|
2696
2654
|
const transformActions = [];
|
|
2697
2655
|
parsed.forEach((action) => {
|
|
2698
2656
|
if (action.action_type === "click") {
|
|
2699
|
-
|
|
2657
|
+
assert7(action.action_inputs.start_box, "start_box is required");
|
|
2700
2658
|
const point = getPoint(action.action_inputs.start_box, size);
|
|
2701
2659
|
transformActions.push({
|
|
2702
2660
|
type: "Locate",
|
|
@@ -2723,8 +2681,8 @@ async function vlmPlanning(options) {
|
|
|
2723
2681
|
param: action.thought || ""
|
|
2724
2682
|
});
|
|
2725
2683
|
} else if (action.action_type === "drag") {
|
|
2726
|
-
|
|
2727
|
-
|
|
2684
|
+
assert7(action.action_inputs.start_box, "start_box is required");
|
|
2685
|
+
assert7(action.action_inputs.end_box, "end_box is required");
|
|
2728
2686
|
const startPoint = getPoint(action.action_inputs.start_box, size);
|
|
2729
2687
|
const endPoint = getPoint(action.action_inputs.end_box, size);
|
|
2730
2688
|
transformActions.push({
|
|
@@ -2806,7 +2764,7 @@ async function vlmPlanning(options) {
|
|
|
2806
2764
|
param: {}
|
|
2807
2765
|
});
|
|
2808
2766
|
} else if (action.action_type === "androidLongPress") {
|
|
2809
|
-
|
|
2767
|
+
assert7(
|
|
2810
2768
|
action.action_inputs.start_coords,
|
|
2811
2769
|
"start_coords is required for androidLongPress"
|
|
2812
2770
|
);
|
|
@@ -2900,8 +2858,6 @@ async function resizeImageForUiTars(imageBase64, size) {
|
|
|
2900
2858
|
|
|
2901
2859
|
export {
|
|
2902
2860
|
systemPromptToLocateElement,
|
|
2903
|
-
elementByPositionWithElementInfo,
|
|
2904
|
-
describeUserPage,
|
|
2905
2861
|
call2 as call,
|
|
2906
2862
|
callToGetJSONObject,
|
|
2907
2863
|
callAiFnWithStringResponse,
|
|
@@ -2909,6 +2865,8 @@ export {
|
|
|
2909
2865
|
callAiFn,
|
|
2910
2866
|
adaptBboxToRect,
|
|
2911
2867
|
expandSearchArea,
|
|
2868
|
+
elementByPositionWithElementInfo,
|
|
2869
|
+
describeUserPage,
|
|
2912
2870
|
generateYamlTest,
|
|
2913
2871
|
generateYamlTestStream,
|
|
2914
2872
|
generatePlaywrightTest,
|
|
@@ -2922,4 +2880,4 @@ export {
|
|
|
2922
2880
|
resizeImageForUiTars
|
|
2923
2881
|
};
|
|
2924
2882
|
|
|
2925
|
-
//# sourceMappingURL=chunk-
|
|
2883
|
+
//# sourceMappingURL=chunk-DDYIQHOA.js.map
|