@midscene/core 0.25.4-beta-20250807062119.0 → 0.25.4-beta-20250811113343.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.d.ts +7 -6
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +374 -598
- package/dist/es/chunk-5IZMFZPA.js.map +1 -0
- package/dist/es/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
- package/dist/es/index.d.ts +6 -6
- package/dist/es/index.js +4 -5
- package/dist/es/index.js.map +1 -1
- package/dist/es/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
- package/dist/es/{types-7435eba0.d.ts → types-16cd9f75.d.ts} +11 -8
- package/dist/es/utils.d.ts +1 -1
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.d.ts +7 -6
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-G2JTYWI6.js → chunk-5IZMFZPA.js} +367 -591
- package/dist/lib/chunk-5IZMFZPA.js.map +1 -0
- package/dist/lib/{chunk-JH54OF4E.js → chunk-H5PRBRMX.js} +3 -3
- package/dist/lib/index.d.ts +6 -6
- package/dist/lib/index.js +14 -15
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
- package/dist/{types/types-7435eba0.d.ts → lib/types-16cd9f75.d.ts} +11 -8
- package/dist/lib/utils.d.ts +1 -1
- package/dist/lib/utils.js +2 -2
- package/dist/types/ai-model.d.ts +7 -6
- package/dist/types/index.d.ts +6 -6
- package/dist/types/{llm-planning-f449f3b8.d.ts → llm-planning-374b74b8.d.ts} +3 -3
- package/dist/{lib/types-7435eba0.d.ts → types/types-16cd9f75.d.ts} +11 -8
- package/dist/types/utils.d.ts +1 -1
- package/package.json +3 -3
- package/dist/es/chunk-G2JTYWI6.js.map +0 -1
- package/dist/lib/chunk-G2JTYWI6.js.map +0 -1
- package/dist/es/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
- package/dist/lib/{chunk-JH54OF4E.js.map → chunk-H5PRBRMX.js.map} +0 -0
@@ -5,16 +5,35 @@ import {
   getBearerTokenProvider
 } from "@azure/identity";
 import {
+  ANTHROPIC_API_KEY,
+  AZURE_OPENAI_API_VERSION,
+  AZURE_OPENAI_DEPLOYMENT,
+  AZURE_OPENAI_ENDPOINT,
+  AZURE_OPENAI_KEY,
   MIDSCENE_API_TYPE,
+  MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
+  MIDSCENE_AZURE_OPENAI_SCOPE,
+  MIDSCENE_DEBUG_AI_PROFILE,
+  MIDSCENE_DEBUG_AI_RESPONSE,
   MIDSCENE_LANGSMITH_DEBUG,
+  MIDSCENE_MODEL_NAME,
+  MIDSCENE_OPENAI_HTTP_PROXY,
+  MIDSCENE_OPENAI_INIT_CONFIG_JSON,
+  MIDSCENE_OPENAI_SOCKS_PROXY,
+  MIDSCENE_USE_ANTHROPIC_SDK,
+  MIDSCENE_USE_AZURE_OPENAI,
+  OPENAI_API_KEY,
+  OPENAI_BASE_URL,
   OPENAI_MAX_TOKENS,
-
-
+  OPENAI_USE_AZURE,
+  getAIConfig,
+  getAIConfigInBoolean,
+  getAIConfigInJson,
   uiTarsModelVersion,
-  vlLocateMode as
+  vlLocateMode as vlLocateMode2
 } from "@midscene/shared/env";
-import { getDebug as
-import { assert as
+import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
+import { assert as assert3 } from "@midscene/shared/utils";
 import { ifInBrowser } from "@midscene/shared/utils";
 import { HttpsProxyAgent } from "https-proxy-agent";
 import { jsonrepair } from "jsonrepair";
@@ -36,11 +55,10 @@ var AIActionType = /* @__PURE__ */ ((AIActionType2) => {
   AIActionType2[AIActionType2["DESCRIBE_ELEMENT"] = 4] = "DESCRIBE_ELEMENT";
   return AIActionType2;
 })(AIActionType || {});
-async function callAiFn(msgs, AIActionTypeValue
+async function callAiFn(msgs, AIActionTypeValue) {
   const { content, usage } = await callToGetJSONObject(
     msgs,
-    AIActionTypeValue
-    modelPreferences
+    AIActionTypeValue
   );
   return { content, usage };
 }
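Note on this hunk: `callAiFn` drops the `modelPreferences` pass-through, so `callToGetJSONObject` is now reached with only the messages and the action type. A minimal usage sketch (the message payload and the literal `2` are illustrative, not taken from this diff):

```js
// Hypothetical call site for the new two-argument callAiFn; the messages
// follow the OpenAI chat format this chunk already uses elsewhere.
const msgs = [
  { role: "system", content: "You are a versatile professional in software UI automation" },
  { role: "user", content: [{ type: "text", text: "Plan the next action" }] },
];
const { content, usage } = await callAiFn(msgs, 2 /* AIActionType.EXTRACT_DATA */);
```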
@@ -615,179 +633,80 @@ Here is the item user want to find:
 });
 
 // src/ai-model/prompt/llm-planning.ts
+import assert2 from "assert";
 import { PromptTemplate as PromptTemplate2 } from "@langchain/core/prompts";
-
-// src/image/index.ts
-import {
-  imageInfo,
-  imageInfoOfBase64,
-  localImg2Base64,
-  httpImg2Base64,
-  resizeImg,
-  saveBase64Image,
-  zoomForGPT4o
-} from "@midscene/shared/img";
-
-// src/ai-model/prompt/util.ts
-import { NodeType as NodeType2 } from "@midscene/shared/constants";
-import { vlLocateMode as vlLocateMode2 } from "@midscene/shared/env";
-import {
-  descriptionOfTree,
-  generateElementByPosition,
-  treeToList as treeToList2
-} from "@midscene/shared/extractor";
-import { assert as assert2 } from "@midscene/shared/utils";
-function describeSize(size) {
-  return `${size.width} x ${size.height}`;
-}
-var distanceThreshold = 16;
-function elementByPositionWithElementInfo(treeRoot, position, options) {
-  const requireStrictDistance = options?.requireStrictDistance ?? true;
-  const filterPositionElements = options?.filterPositionElements ?? false;
-  assert2(typeof position !== "undefined", "position is required for query");
-  const matchingElements = [];
-  function dfs(node) {
-    if (node?.node) {
-      const item = node.node;
-      if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
-        if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
-          matchingElements.push(item);
-        }
-      }
-    }
-    for (const child of node.children) {
-      dfs(child);
-    }
-  }
-  dfs(treeRoot);
-  if (matchingElements.length === 0) {
-    return void 0;
-  }
-  const element = matchingElements.reduce((smallest, current) => {
-    const smallestArea = smallest.rect.width * smallest.rect.height;
-    const currentArea = current.rect.width * current.rect.height;
-    return currentArea < smallestArea ? current : smallest;
-  });
-  const distanceToCenter = distance(
-    { x: element.center[0], y: element.center[1] },
-    position
-  );
-  if (requireStrictDistance) {
-    return distanceToCenter <= distanceThreshold ? element : void 0;
-  }
-  return element;
-}
-function distance(point1, point2) {
-  return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
-}
-var samplePageDescription = `
-And the page is described as follows:
-====================
-The size of the page: 1280 x 720
-Some of the elements are marked with a rectangle in the screenshot corresponding to the markerId, some are not.
-
-Description of all the elements in screenshot:
-<div id="969f1637" markerId="1" left="100" top="100" width="100" height="100"> // The markerId indicated by the rectangle label in the screenshot
-<h4 id="b211ecb2" markerId="5" left="150" top="150" width="90" height="60">
-The username is accepted
-</h4>
-...many more
-</div>
-====================
-`;
-async function describeUserPage(context, opt) {
-  const { screenshotBase64 } = context;
-  let width;
-  let height;
-  if (context.size) {
-    ({ width, height } = context.size);
-  } else {
-    const imgSize = await imageInfoOfBase64(screenshotBase64);
-    ({ width, height } = imgSize);
-  }
-  const treeRoot = context.tree;
-  const idElementMap = {};
-  const flatElements = treeToList2(treeRoot);
-  if (opt?.domIncluded === true && flatElements.length >= 5e3) {
-    console.warn(
-      'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
-    );
-  }
-  flatElements.forEach((element) => {
-    idElementMap[element.id] = element;
-    if (typeof element.indexId !== "undefined") {
-      idElementMap[`${element.indexId}`] = element;
-    }
-  });
-  let pageDescription = "";
-  const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
-  if (opt?.domIncluded || !vlLocateMode2()) {
-    const contentTree = await descriptionOfTree(
-      treeRoot,
-      opt?.truncateTextLength,
-      opt?.filterNonTextContent,
-      visibleOnly
-    );
-    const sizeDescription = describeSize({ width, height });
-    pageDescription = `The size of the page: ${sizeDescription}
-The page elements tree:
-${contentTree}`;
-  }
-  return {
-    description: pageDescription,
-    elementById(idOrIndexId) {
-      assert2(typeof idOrIndexId !== "undefined", "id is required for query");
-      const item = idElementMap[`${idOrIndexId}`];
-      return item;
-    },
-    elementByPosition(position, size) {
-      return elementByPositionWithElementInfo(treeRoot, position);
-    },
-    insertElementByPosition(position) {
-      const element = generateElementByPosition(position);
-      treeRoot.children.push({
-        node: element,
-        children: []
-      });
-      flatElements.push(element);
-      idElementMap[element.id] = element;
-      return element;
-    },
-    size: { width, height }
-  };
-}
-
-// src/ai-model/prompt/llm-planning.ts
 var vlCoTLog = `"what_the_user_wants_to_do_next_by_instruction": string, // What the user wants to do according to the instruction and previous logs. `;
-var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{
-var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{
+var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do .. first". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
+var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
 var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
 "more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
 var vlLocateParam = "locate: {bbox: [number, number, number, number], prompt: string }";
+var llmLocateParam = `locate: {"id": string, "prompt": string}`;
+var descriptionForAction = (action, locatorScheme) => {
+  const tab = " ";
+  let locateParam = "";
+  if (action.location === "required") {
+    locateParam = locatorScheme;
+  } else if (action.location === "optional") {
+    locateParam = `${locatorScheme} | null`;
+  } else if (action.location === false) {
+    locateParam = "";
+  }
+  const locatorParam = locateParam ? `${tab}- ${locateParam}` : "";
+  let whatToLocate = "";
+  if (action.whatToLocate) {
+    if (!locateParam) {
+      console.warn(
+        `whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
+      );
+    } else {
+      whatToLocate = `${tab}- whatToLocate: ${action.whatToLocate}`;
+    }
+  }
+  let paramSchema = "";
+  if (action.paramSchema) {
+    paramSchema = `${tab}- paramSchema: ${action.paramSchema}`;
+  }
+  let paramDescription = "";
+  if (action.paramDescription) {
+    assert2(
+      paramSchema,
+      `paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
+    );
+    paramDescription = `${tab}- paramDescription: ${action.paramDescription}`;
+  }
+  const fields = [
+    paramSchema,
+    paramDescription,
+    locatorParam,
+    whatToLocate
+  ].filter(Boolean);
+  return `- ${action.name}
+- type: "${action.name}"
+- description: ${action.description}
+${fields.join("\n")}
+`.trim();
+};
 var systemTemplateOfVLPlanning = ({
-
+  actionSpace,
   vlMode
-}) =>
+}) => {
+  const actionNameList = actionSpace.map((action) => action.name).join(", ");
+  const actionDescriptionList = actionSpace.map(
+    (action) => descriptionForAction(action, vlLocateParam)
+  );
+  const actionList = actionDescriptionList.join("\n");
+  return `
 Target: User will give you a screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
 
 Restriction:
 - Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
-- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are
+- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
 - Don't repeat actions in the previous logs.
 - Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
 
 Supporting actions:
-
-- RightClick: { type: "RightClick", ${vlLocateParam} }
-- Hover: { type: "Hover", ${vlLocateParam} }
-- Input: { type: "Input", ${vlLocateParam}, param: { value: string } } // Replace the input field with a new value. \`value\` is the final that should be filled in the input box. No matter what modifications are required, just provide the final value to replace the existing input value. Giving a blank string means clear the input field.
-- KeyboardPress: { type: "KeyboardPress", param: { value: string } }
-- Scroll: { type: "Scroll", ${vlLocateParam} | null, param: { direction: 'down'(default) | 'up' | 'right' | 'left', scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft', distance: null | number }} // locate is the element to scroll. If it's a page scroll, put \`null\` in the \`locate\` field.
-${pageType === "android" ? `- AndroidBackButton: { type: "AndroidBackButton", param: {} }
-- AndroidHomeButton: { type: "AndroidHomeButton", param: {} }
-- AndroidRecentAppsButton: { type: "AndroidRecentAppsButton", param: {} }
-- AndroidLongPress: { type: "AndroidLongPress", param: { x: number, y: number, duration?: number } }
-- AndroidPull: { type: "AndroidPull", param: { direction: 'up' | 'down', startPoint?: { x: number, y: number }, distance?: number, duration?: number } } // Pull down to refresh (direction: 'down') or pull up to load more (direction: 'up')` : ""}
+${actionList}
 
 Field description:
 * The \`prompt\` field inside the \`locate\` field is a short description that could be used to locate the element.
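The new `descriptionForAction` helper renders one prompt bullet per action-space entry, reading `name`, `description`, `location`, `whatToLocate`, `paramSchema`, and `paramDescription`. A sketch of a hand-written entry and how it might be rendered (field values are invented for illustration; only the field names are implied by the hunk above):

```js
// Hypothetical action-space entry; the values are made up.
const tapAction = {
  name: "Tap",
  description: "Tap the located element",
  location: "required", // "required" | "optional" | false
  whatToLocate: "the element to tap",
};
const llmLocateParam = `locate: {"id": string, "prompt": string}`;
console.log(descriptionForAction(tapAction, llmLocateParam));
// -> "- Tap" followed by type/description lines and the locator scheme
```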
@@ -822,8 +741,16 @@ this and output the JSON:
 }
 }
 `;
-
-var systemTemplateOfLLM = ({
+};
+var systemTemplateOfLLM = ({
+  actionSpace
+}) => {
+  const actionNameList = actionSpace.map((action) => action.name).join(" / ");
+  const actionDescriptionList = actionSpace.map(
+    (action) => descriptionForAction(action, llmLocateParam)
+  );
+  const actionList = actionDescriptionList.join("\n");
+  return `
 ## Role
 
 You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
@@ -837,7 +764,7 @@ You are a versatile professional in software UI automation. Your outstanding con
 ## Workflow
 
 1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
-2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (
+2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
 3. Precisely locate the target element if it's already shown in the screenshot, put the location info in the \`locate\` field of the action.
 4. If some target elements is not shown in the screenshot, consider the user's instruction is not feasible on this page. Follow the next steps.
 5. Consider whether the user's instruction will be accomplished after all the actions
@@ -855,65 +782,30 @@ You are a versatile professional in software UI automation. Your outstanding con
 
 The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
 
-type LocateParam = {
+type LocateParam = {
 "id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
 "prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
-}
+} | null // If it's not on the page, the LocateParam should be null
 
 ## Supported actions
 
 Each action has a \`type\` and corresponding \`param\`. To be detailed:
-
-
-
-
-- type: 'Hover'
-* {{ ${llmLocateParam} }}
-- type: 'Input', replace the value in the input field
-* {{ ${llmLocateParam}, param: {{ value: string }} }}
-* \`value\` is the final value that should be filled in the input field. No matter what modifications are required, just provide the final value user should see after the action is done.
-- type: 'KeyboardPress', press a key
-* {{ param: {{ value: string }} }}
-- type: 'Scroll', scroll up or down.
-* {{
-${llmLocateParam},
-param: {{
-direction: 'down'(default) | 'up' | 'right' | 'left',
-scrollType: 'once' (default) | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
-distance: null | number
-}}
-}}
-* To scroll some specific element, put the element at the center of the region in the \`locate\` field. If it's a page scroll, put \`null\` in the \`locate\` field.
-* \`param\` is required in this action. If some fields are not specified, use direction \`down\`, \`once\` scroll type, and \`null\` distance.
-* {{ param: {{ button: 'Back' | 'Home' | 'RecentApp' }} }}
-- type: 'ExpectedFalsyCondition'
-* {{ param: {{ reason: string }} }}
-* use this action when the conditional statement talked about in the instruction is falsy.
-- type: 'Sleep'
-* {{ param: {{ timeMs: number }} }}
-${pageType === "android" ? `- type: 'AndroidBackButton', trigger the system "back" operation on Android devices
-* {{ param: {{}} }}
-- type: 'AndroidHomeButton', trigger the system "home" operation on Android devices
-* {{ param: {{}} }}
-- type: 'AndroidRecentAppsButton', trigger the system "recent apps" operation on Android devices
-* {{ param: {{}} }}
-- type: 'AndroidLongPress', trigger a long press on the screen at specified coordinates on Android devices
-* {{ param: {{ x: number, y: number, duration?: number }} }}
-- type: 'AndroidPull', trigger pull down to refresh or pull up actions on Android devices
-* {{ param: {{ direction: 'up' | 'down', startPoint?: {{ x: number, y: number }}, distance?: number, duration?: number }} }}` : ""}
-`;
+${actionList}
+
+`.trim();
+};
 var outputTemplate = `
 ## Output JSON Format:
 
 The JSON format is as follows:
 
-{
+{
 "actions": [
 // ... some actions
 ],
 ${llmCurrentLog}
 ${commonOutputFields}
-}
+}
 
 ## Examples
 
@@ -929,68 +821,62 @@ By viewing the page screenshot and description, you should consider this and output the JSON:
 * Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
 * The task cannot be accomplished (because we cannot see the "English" option now), so the \`more_actions_needed_by_instruction\` field is true.
 
-{
+{
 "actions":[
-{
+{
 "type": "Tap",
 "thought": "Click the language switch button to open the language options.",
 "param": null,
-"locate": {
-}
-{
+"locate": { id: "c81c4e9a33", prompt: "The language switch button" }},
+},
+{
 "type": "Sleep",
 "thought": "Wait for 1 second to ensure the language options are displayed.",
-"param": {
-}
+"param": { "timeMs": 1000 },
+}
 ],
 "error": null,
 "more_actions_needed_by_instruction": true,
 "log": "Click the language switch button to open the language options. Wait for 1 second",
-}
+}
 
 ### Example: What NOT to do
 Wrong output:
-{
+{
 "actions":[
-{
+{
 "type": "Tap",
 "thought": "Click the language switch button to open the language options.",
 "param": null,
-"locate": {
-{
-}
-}
-{
+"locate": {
+{ "id": "c81c4e9a33" }, // WRONG: prompt is missing
+}
+},
+{
 "type": "Tap",
 "thought": "Click the English option",
 "param": null,
 "locate": null, // This means the 'English' option is not shown in the screenshot, the task cannot be accomplished
-}
+}
 ],
 "more_actions_needed_by_instruction": false, // WRONG: should be true
 "log": "Click the language switch button to open the language options",
-}
+}
 
 Reason:
 * The \`prompt\` is missing in the first 'Locate' action
 * Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
 `;
 async function systemPromptToTaskPlanning({
-
+  actionSpace,
   vlMode
 }) {
   if (vlMode) {
-    return systemTemplateOfVLPlanning({
+    return systemTemplateOfVLPlanning({ actionSpace, vlMode });
   }
-
-  template: `${systemTemplateOfLLM({ pageType })}
+  return `${systemTemplateOfLLM({ actionSpace })}
 
-${outputTemplate}
-  inputVariables: ["pageDescription"]
-});
-return await promptTemplate.format({
-  pageDescription: samplePageDescription
-});
+${outputTemplate}`;
 }
 var planSchema = {
   type: "json_schema",
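`systemPromptToTaskPlanning` now takes the action space from the caller instead of a page-type switch, and returns a plain string rather than formatting a LangChain `PromptTemplate`. A hedged invocation sketch (the two entries are hypothetical; the `{ actionSpace, vlMode }` shape comes from the hunk above):

```js
// Illustrative driver for the refactored planner prompt.
const prompt = await systemPromptToTaskPlanning({
  actionSpace: [
    { name: "Tap", description: "Tap the located element", location: "required" },
    { name: "Sleep", description: "Wait for a given time", location: false,
      paramSchema: "{ timeMs: number }" },
  ],
  vlMode: void 0, // falsy: falls through to the LLM template plus outputTemplate
});
```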
@@ -1145,57 +1031,24 @@ pageDescription:
 });
 };
 
-// src/ai-model/service-caller/
-
-
-
-
-
-
-
-
-
-
-
-
-  MIDSCENE_OPENAI_INIT_CONFIG_JSON,
-  MIDSCENE_OPENAI_SOCKS_PROXY,
-  MIDSCENE_USE_ANTHROPIC_SDK,
-  MIDSCENE_USE_AZURE_OPENAI,
-  MIDSCENE_VQA_ANTHROPIC_API_KEY,
-  MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
-  MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
-  MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
-  MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
-  MIDSCENE_VQA_AZURE_OPENAI_KEY,
-  MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
-  MIDSCENE_VQA_MODEL_NAME,
-  MIDSCENE_VQA_OPENAI_API_KEY,
-  MIDSCENE_VQA_OPENAI_BASE_URL,
-  MIDSCENE_VQA_OPENAI_HTTP_PROXY,
-  MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
-  MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
-  MIDSCENE_VQA_OPENAI_USE_AZURE,
-  MIDSCENE_VQA_USE_ANTHROPIC_SDK,
-  MIDSCENE_VQA_USE_AZURE_OPENAI,
-  OPENAI_API_KEY,
-  OPENAI_BASE_URL,
-  OPENAI_USE_AZURE,
-  getAIConfig,
-  getAIConfigInBoolean,
-  getAIConfigInJson
-} from "@midscene/shared/env";
-import { enableDebug, getDebug as getDebug2 } from "@midscene/shared/logger";
-import { assert as assert3 } from "@midscene/shared/utils";
-function getModelName() {
-  let modelName = "gpt-4o";
-  const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
-  if (nameInConfig) {
-    modelName = nameInConfig;
-  }
-  return modelName;
+// src/ai-model/service-caller/index.ts
+function checkAIConfig() {
+  const openaiKey = getAIConfig(OPENAI_API_KEY);
+  const azureConfig = getAIConfig(MIDSCENE_USE_AZURE_OPENAI);
+  const anthropicKey = getAIConfig(ANTHROPIC_API_KEY);
+  const initConfigJson = getAIConfig(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+  if (openaiKey)
+    return true;
+  if (azureConfig)
+    return true;
+  if (anthropicKey)
+    return true;
+  return Boolean(initConfigJson);
 }
+var debugConfigInitialized = false;
 function initDebugConfig() {
+  if (debugConfigInitialized)
+    return;
   const shouldPrintTiming = getAIConfigInBoolean(MIDSCENE_DEBUG_AI_PROFILE);
   let debugConfig = "";
   if (shouldPrintTiming) {
@@ -1220,232 +1073,27 @@ function initDebugConfig() {
   if (debugConfig) {
     enableDebug(debugConfig);
   }
+  debugConfigInitialized = true;
 }
-var
-
-
-
-
-
-);
-} else {
-assert3(
-value,
-`The ${key} must be a non-empty string because of the ${modelNameKey} is declared as ${modelName}, but got: ${value}
-Please check your config.`
-);
-}
-};
-var getModelConfigFromEnv = (modelName, keys, valueAssert) => {
-  const socksProxy = getAIConfig(keys.socksProxy);
-  const httpProxy = getAIConfig(keys.httpProxy);
-  if (getAIConfig(keys.openaiUseAzureDeprecated)) {
-    const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
-    const openaiApiKey = getAIConfig(keys.openaiApiKey);
-    const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
-    valueAssert(
-      openaiBaseURL,
-      keys.openaiBaseURL,
-      keys.openaiUseAzureDeprecated
-    );
-    valueAssert(openaiApiKey, keys.openaiApiKey, keys.openaiUseAzureDeprecated);
-    return {
-      socksProxy,
-      httpProxy,
-      modelName,
-      openaiUseAzureDeprecated: true,
-      openaiApiKey,
-      openaiBaseURL,
-      openaiExtraConfig
-    };
-  } else if (getAIConfig(keys.useAzureOpenai)) {
-    const azureOpenaiScope = getAIConfig(keys.azureOpenaiScope);
-    const azureOpenaiApiKey = getAIConfig(keys.azureOpenaiApiKey);
-    const azureOpenaiEndpoint = getAIConfig(keys.azureOpenaiEndpoint);
-    const azureOpenaiDeployment = getAIConfig(keys.azureOpenaiDeployment);
-    const azureOpenaiApiVersion = getAIConfig(keys.azureOpenaiApiVersion);
-    const azureExtraConfig = getAIConfigInJson(keys.azureExtraConfig);
-    const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
-    valueAssert(azureOpenaiApiKey, keys.azureOpenaiApiKey, keys.useAzureOpenai);
-    return {
-      socksProxy,
-      httpProxy,
-      modelName,
-      useAzureOpenai: true,
-      azureOpenaiScope,
-      azureOpenaiApiKey,
-      azureOpenaiEndpoint,
-      azureOpenaiDeployment,
-      azureOpenaiApiVersion,
-      azureExtraConfig,
-      openaiExtraConfig
-    };
-  } else if (getAIConfig(keys.useAnthropicSdk)) {
-    const anthropicApiKey = getAIConfig(keys.anthropicApiKey);
-    valueAssert(anthropicApiKey, keys.anthropicApiKey, keys.useAnthropicSdk);
-    return {
-      socksProxy,
-      httpProxy,
-      modelName,
-      useAnthropicSdk: true,
-      anthropicApiKey
-    };
-  } else {
-    const openaiBaseURL = getAIConfig(keys.openaiBaseURL);
-    const openaiApiKey = getAIConfig(keys.openaiApiKey);
-    const openaiExtraConfig = getAIConfigInJson(keys.openaiExtraConfig);
-    valueAssert(openaiBaseURL, keys.openaiBaseURL);
-    valueAssert(openaiApiKey, keys.openaiApiKey);
-    return {
-      socksProxy,
-      httpProxy,
-      modelName,
-      openaiBaseURL,
-      openaiApiKey,
-      openaiExtraConfig
-    };
-  }
-};
-var maskKey = (key, maskChar = "*") => {
-  if (typeof key !== "string" || key.length === 0) {
-    return key;
-  }
-  const prefixLen = 3;
-  const suffixLen = 3;
-  const keepLength = prefixLen + suffixLen;
-  if (key.length <= keepLength) {
-    return key;
-  }
-  const prefix = key.substring(0, prefixLen);
-  const suffix = key.substring(key.length - suffixLen);
-  const maskLength = key.length - keepLength;
-  const mask = maskChar.repeat(maskLength);
-  return `${prefix}${mask}${suffix}`;
-};
-var maskConfig = (config) => {
-  return Object.fromEntries(
-    Object.entries(config).map(([key, value]) => [
-      key,
-      ["openaiApiKey", "azureOpenaiApiKey", "anthropicApiKey"].includes(key) ? maskKey(value) : value
-    ])
-  );
-};
-var decideModelConfig = (modelPreferences) => {
-  initDebugConfig();
-  const debugLog = getDebug2("ai:decideModelConfig");
-  debugLog("modelPreferences", modelPreferences);
-  const isVQAIntent = modelPreferences?.intent === "VQA";
-  const vqaModelName = getAIConfig(MIDSCENE_VQA_MODEL_NAME);
-  if (isVQAIntent && vqaModelName) {
-    debugLog(
-      `current action is a VQA action and detected ${MIDSCENE_VQA_MODEL_NAME} ${vqaModelName}, will only read VQA related model config from process.env`
-    );
-    const config = getModelConfigFromEnv(
-      vqaModelName,
-      {
-        /**
-         * proxy
-         */
-        socksProxy: MIDSCENE_VQA_OPENAI_SOCKS_PROXY,
-        httpProxy: MIDSCENE_VQA_OPENAI_HTTP_PROXY,
-        /**
-         * OpenAI
-         */
-        openaiBaseURL: MIDSCENE_VQA_OPENAI_BASE_URL,
-        openaiApiKey: MIDSCENE_VQA_OPENAI_API_KEY,
-        openaiExtraConfig: MIDSCENE_VQA_OPENAI_INIT_CONFIG_JSON,
-        /**
-         * Azure
-         */
-        openaiUseAzureDeprecated: MIDSCENE_VQA_OPENAI_USE_AZURE,
-        useAzureOpenai: MIDSCENE_VQA_USE_AZURE_OPENAI,
-        azureOpenaiScope: MIDSCENE_VQA_AZURE_OPENAI_SCOPE,
-        azureOpenaiApiKey: MIDSCENE_VQA_AZURE_OPENAI_KEY,
-        azureOpenaiEndpoint: MIDSCENE_VQA_AZURE_OPENAI_ENDPOINT,
-        azureOpenaiApiVersion: MIDSCENE_VQA_AZURE_OPENAI_API_VERSION,
-        azureOpenaiDeployment: MIDSCENE_VQA_AZURE_OPENAI_DEPLOYMENT,
-        azureExtraConfig: MIDSCENE_VQA_AZURE_OPENAI_INIT_CONFIG_JSON,
-        /**
-         * Anthropic
-         */
-        useAnthropicSdk: MIDSCENE_VQA_USE_ANTHROPIC_SDK,
-        anthropicApiKey: MIDSCENE_VQA_ANTHROPIC_API_KEY
-      },
-      createAssert(MIDSCENE_VQA_MODEL_NAME, vqaModelName)
-    );
-    debugLog("got model config for VQA usage:", maskConfig(config));
-    return config;
-  } else {
-    debugLog("read model config from process.env as normal.");
-    const commonModelName = getAIConfig(MIDSCENE_MODEL_NAME);
-    assert3(
-      commonModelName,
-      `${MIDSCENE_MODEL_NAME} is empty, please check your config.`
-    );
-    const config = getModelConfigFromEnv(
-      commonModelName,
-      {
-        /**
-         * proxy
-         */
-        socksProxy: MIDSCENE_OPENAI_SOCKS_PROXY,
-        httpProxy: MIDSCENE_OPENAI_HTTP_PROXY,
-        /**
-         * OpenAI
-         */
-        openaiBaseURL: OPENAI_BASE_URL,
-        openaiApiKey: OPENAI_API_KEY,
-        openaiExtraConfig: MIDSCENE_OPENAI_INIT_CONFIG_JSON,
-        /**
-         * Azure
-         */
-        openaiUseAzureDeprecated: OPENAI_USE_AZURE,
-        useAzureOpenai: MIDSCENE_USE_AZURE_OPENAI,
-        azureOpenaiScope: MIDSCENE_AZURE_OPENAI_SCOPE,
-        azureOpenaiApiKey: AZURE_OPENAI_KEY,
-        azureOpenaiEndpoint: AZURE_OPENAI_ENDPOINT,
-        azureOpenaiApiVersion: AZURE_OPENAI_API_VERSION,
-        azureOpenaiDeployment: AZURE_OPENAI_DEPLOYMENT,
-        azureExtraConfig: MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON,
-        /**
-         * Anthropic
-         */
-        useAnthropicSdk: MIDSCENE_USE_ANTHROPIC_SDK,
-        anthropicApiKey: ANTHROPIC_API_KEY
-      },
-      createAssert(MIDSCENE_MODEL_NAME, commonModelName)
-    );
-    debugLog("got model config for common usage:", maskConfig(config));
-    return config;
+var defaultModel = "gpt-4o";
+function getModelName() {
+  let modelName = defaultModel;
+  const nameInConfig = getAIConfig(MIDSCENE_MODEL_NAME);
+  if (nameInConfig) {
+    modelName = nameInConfig;
   }
-
-
-// src/ai-model/service-caller/index.ts
+  return modelName;
+}
 async function createChatClient({
-  AIActionTypeValue
-  modelPreferences
+  AIActionTypeValue
 }) {
-
-  socksProxy,
-  httpProxy,
-  modelName,
-  openaiBaseURL,
-  openaiApiKey,
-  openaiExtraConfig,
-  openaiUseAzureDeprecated,
-  useAzureOpenai,
-  azureOpenaiScope,
-  azureOpenaiApiKey,
-  azureOpenaiEndpoint,
-  azureOpenaiApiVersion,
-  azureOpenaiDeployment,
-  azureExtraConfig,
-  useAnthropicSdk,
-  anthropicApiKey
-} = decideModelConfig(modelPreferences);
+  initDebugConfig();
   let openai;
+  const extraConfig = getAIConfigInJson(MIDSCENE_OPENAI_INIT_CONFIG_JSON);
+  const socksProxy = getAIConfig(MIDSCENE_OPENAI_SOCKS_PROXY);
+  const httpProxy = getAIConfig(MIDSCENE_OPENAI_HTTP_PROXY);
   let proxyAgent = void 0;
-  const debugProxy =
+  const debugProxy = getDebug2("ai:call:proxy");
   if (httpProxy) {
     debugProxy("using http proxy", httpProxy);
     proxyAgent = new HttpsProxyAgent(httpProxy);
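With `decideModelConfig` and the whole `MIDSCENE_VQA_*` key family removed, `createChatClient` goes back to reading one flat set of env-backed keys. A sketch of the minimal configuration for the plain-OpenAI path (placeholder values; it is assumed here that `getAIConfig` reads these names from the environment, which this diff does not show):

```js
// Placeholder values; the key names are the ones imported at the top of this chunk.
process.env.OPENAI_API_KEY = "sk-placeholder"; // any non-empty value for self-hosted gateways
process.env.OPENAI_BASE_URL = "https://api.openai.com/v1"; // must start with http(s)://
process.env.MIDSCENE_MODEL_NAME = "gpt-4o"; // optional; getModelName() defaults to "gpt-4o"
```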
@@ -1453,56 +1101,70 @@ async function createChatClient({
     debugProxy("using socks proxy", socksProxy);
     proxyAgent = new SocksProxyAgent(socksProxy);
   }
-  if (
+  if (getAIConfig(OPENAI_USE_AZURE)) {
     openai = new AzureOpenAI({
-      baseURL:
-      apiKey:
+      baseURL: getAIConfig(OPENAI_BASE_URL),
+      apiKey: getAIConfig(OPENAI_API_KEY),
       httpAgent: proxyAgent,
-      ...
+      ...extraConfig,
       dangerouslyAllowBrowser: true
     });
-  } else if (
+  } else if (getAIConfig(MIDSCENE_USE_AZURE_OPENAI)) {
+    const extraAzureConfig = getAIConfigInJson(
+      MIDSCENE_AZURE_OPENAI_INIT_CONFIG_JSON
+    );
+    const scope = getAIConfig(MIDSCENE_AZURE_OPENAI_SCOPE);
     let tokenProvider = void 0;
-    if (
-
+    if (scope) {
+      assert3(
         !ifInBrowser,
         "Azure OpenAI is not supported in browser with Midscene."
       );
       const credential = new DefaultAzureCredential();
-
+      assert3(scope, "MIDSCENE_AZURE_OPENAI_SCOPE is required");
+      tokenProvider = getBearerTokenProvider(credential, scope);
       openai = new AzureOpenAI({
         azureADTokenProvider: tokenProvider,
-        endpoint:
-        apiVersion:
-        deployment:
-        ...
-        ...
+        endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+        apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+        deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
+        ...extraConfig,
+        ...extraAzureConfig
       });
     } else {
       openai = new AzureOpenAI({
-        apiKey:
-        endpoint:
-        apiVersion:
-        deployment:
+        apiKey: getAIConfig(AZURE_OPENAI_KEY),
+        endpoint: getAIConfig(AZURE_OPENAI_ENDPOINT),
+        apiVersion: getAIConfig(AZURE_OPENAI_API_VERSION),
+        deployment: getAIConfig(AZURE_OPENAI_DEPLOYMENT),
         dangerouslyAllowBrowser: true,
-        ...
-        ...
+        ...extraConfig,
+        ...extraAzureConfig
       });
     }
-  } else if (!
+  } else if (!getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+    const baseURL = getAIConfig(OPENAI_BASE_URL);
+    if (typeof baseURL === "string") {
+      if (!/^https?:\/\//.test(baseURL)) {
+        throw new Error(
+          `OPENAI_BASE_URL must be a valid URL starting with http:// or https://, but got: ${baseURL}
+Please check your config.`
+        );
+      }
+    }
     openai = new OpenAI({
-      baseURL:
-      apiKey:
+      baseURL: getAIConfig(OPENAI_BASE_URL),
+      apiKey: getAIConfig(OPENAI_API_KEY),
       httpAgent: proxyAgent,
-      ...
+      ...extraConfig,
       defaultHeaders: {
-        ...
+        ...extraConfig?.defaultHeaders || {},
         [MIDSCENE_API_TYPE]: AIActionTypeValue.toString()
       },
       dangerouslyAllowBrowser: true
     });
   }
-  if (openai &&
+  if (openai && getAIConfigInBoolean(MIDSCENE_LANGSMITH_DEBUG)) {
     if (ifInBrowser) {
       throw new Error("langsmith is not supported in browser");
     }
@@ -1513,13 +1175,14 @@ async function createChatClient({
   if (typeof openai !== "undefined") {
     return {
       completion: openai.chat.completions,
-      style: "openai"
-      modelName
+      style: "openai"
     };
   }
-  if (
+  if (getAIConfig(MIDSCENE_USE_ANTHROPIC_SDK)) {
+    const apiKey = getAIConfig(ANTHROPIC_API_KEY);
+    assert3(apiKey, "ANTHROPIC_API_KEY is required");
     openai = new Anthropic({
-      apiKey
+      apiKey,
       httpAgent: proxyAgent,
       dangerouslyAllowBrowser: true
     });
@@ -1527,45 +1190,47 @@ async function createChatClient({
   if (typeof openai !== "undefined" && openai.messages) {
     return {
       completion: openai.messages,
-      style: "anthropic"
-      modelName
+      style: "anthropic"
     };
   }
   throw new Error("Openai SDK or Anthropic SDK is not initialized");
 }
-async function call2(messages, AIActionTypeValue,
-
-
-
+async function call2(messages, AIActionTypeValue, responseFormat, options) {
+  assert3(
+    checkAIConfig(),
+    "Cannot find config for AI model service. If you are using a self-hosted model without validating the API key, please set `OPENAI_API_KEY` to any non-null value. https://midscenejs.com/model-provider.html"
+  );
+  const { completion, style } = await createChatClient({
+    AIActionTypeValue
   });
-  const
-  const
-  const
-  const
-  const debugProfileDetail = getDebug3("ai:profile:detail");
+  const maxTokens = getAIConfig(OPENAI_MAX_TOKENS);
+  const debugCall = getDebug2("ai:call");
+  const debugProfileStats = getDebug2("ai:profile:stats");
+  const debugProfileDetail = getDebug2("ai:profile:detail");
   const startTime = Date.now();
+  const model = getModelName();
   const isStreaming = options?.stream && options?.onChunk;
   let content;
   let accumulated = "";
   let usage;
   let timeCost;
   const commonConfig = {
-    temperature:
+    temperature: vlLocateMode2() === "vlm-ui-tars" ? 0 : 0.1,
     stream: !!isStreaming,
     max_tokens: typeof maxTokens === "number" ? maxTokens : Number.parseInt(maxTokens || "2048", 10),
-    ...
+    ...vlLocateMode2() === "qwen-vl" ? {
      vl_high_resolution_images: true
    } : {}
  };
  try {
    if (style === "openai") {
      debugCall(
-        `sending ${isStreaming ? "streaming " : ""}request to ${
+        `sending ${isStreaming ? "streaming " : ""}request to ${model}`
      );
      if (isStreaming) {
        const stream = await completion.create(
          {
-            model
+            model,
            messages,
            response_format: responseFormat,
            ...commonConfig
@@ -1622,23 +1287,23 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
       }
       content = accumulated;
       debugProfileStats(
-        `streaming model, ${
+        `streaming model, ${model}, mode, ${vlLocateMode2() || "default"}, cost-ms, ${timeCost}`
       );
     } else {
       const result = await completion.create({
-        model
+        model,
         messages,
         response_format: responseFormat,
         ...commonConfig
       });
       timeCost = Date.now() - startTime;
       debugProfileStats(
-        `model, ${
+        `model, ${model}, mode, ${vlLocateMode2() || "default"}, ui-tars-version, ${uiTarsModelVersion()}, prompt-tokens, ${result.usage?.prompt_tokens || ""}, completion-tokens, ${result.usage?.completion_tokens || ""}, total-tokens, ${result.usage?.total_tokens || ""}, cost-ms, ${timeCost}, requestId, ${result._request_id || ""}`
       );
       debugProfileDetail(
         `model usage detail: ${JSON.stringify(result.usage)}`
       );
-
+      assert3(
         result.choices,
         `invalid response from LLM service: ${JSON.stringify(result)}`
       );
@@ -1646,12 +1311,12 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
       usage = result.usage;
       }
       debugCall(`response: ${content}`);
-
+      assert3(content, "empty content");
     } else if (style === "anthropic") {
       const convertImageContent = (content2) => {
         if (content2.type === "image_url") {
           const imgBase64 = content2.image_url.url;
-
+          assert3(imgBase64, "image_url is required");
           return {
             source: {
               type: "base64",
@@ -1665,7 +1330,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
       };
       if (isStreaming) {
         const stream = await completion.create({
-          model
+          model,
           system: "You are a versatile professional in software UI automation",
           messages: messages.map((m) => ({
             role: "user",
@@ -1709,7 +1374,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
         content = accumulated;
       } else {
         const result = await completion.create({
-          model
+          model,
           system: "You are a versatile professional in software UI automation",
           messages: messages.map((m) => ({
             role: "user",
@@ -1722,7 +1387,7 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
         content = result.content[0].text;
         usage = result.usage;
       }
-
+      assert3(content, "empty content");
     }
     if (isStreaming && !usage) {
       const estimatedTokens = Math.max(
@@ -1756,9 +1421,10 @@ async function call2(messages, AIActionTypeValue, options, modelPreferences) {
     throw newError;
   }
 }
-
+async function callToGetJSONObject(messages, AIActionTypeValue) {
   let responseFormat;
-
+  const model = getModelName();
+  if (model.includes("gpt-4")) {
     switch (AIActionTypeValue) {
       case 0 /* ASSERT */:
         responseFormat = assertSchema;
@@ -1775,19 +1441,11 @@ var getResponseFormat = (modelName, AIActionTypeValue) => {
         break;
     }
   }
-  if (
+  if (model === "gpt-4o-2024-05-13") {
     responseFormat = { type: "json_object" /* JSON */ };
   }
-
-
-async function callToGetJSONObject(messages, AIActionTypeValue, modelPreferences) {
-  const response = await call2(
-    messages,
-    AIActionTypeValue,
-    void 0,
-    modelPreferences
-  );
-  assert4(response, "empty response");
+  const response = await call2(messages, AIActionTypeValue, responseFormat);
+  assert3(response, "empty response");
   const jsonContent = safeParseJson(response.content);
   return { content: jsonContent, usage: response.usage };
 }
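`callToGetJSONObject` likewise loses its `modelPreferences` tail argument; the response format is now picked from `getModelName()` inline and forwarded to `call2`. A sketch of the new call shape (the messages are illustrative):

```js
// Hypothetical call; AIActionType.EXTRACT_DATA is 2 in the enum reconstructed earlier.
const { content, usage } = await callToGetJSONObject(
  [{ role: "user", content: "Return the page title as JSON" }],
  2 /* EXTRACT_DATA */
);
console.log(content, usage?.total_tokens);
```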
@@ -1836,13 +1494,138 @@ function safeParseJson(input) {
     return JSON.parse(jsonrepair(cleanJsonString));
   } catch (e) {
   }
-  if (
+  if (vlLocateMode2() === "doubao-vision" || vlLocateMode2() === "vlm-ui-tars") {
     const jsonString = preprocessDoubaoBboxJson(cleanJsonString);
     return JSON.parse(jsonrepair(jsonString));
   }
   throw Error(`failed to parse json response: ${input}`);
 }
 
+// src/image/index.ts
+import {
+  imageInfo,
+  imageInfoOfBase64,
+  localImg2Base64,
+  httpImg2Base64,
+  resizeImg,
+  saveBase64Image,
+  zoomForGPT4o
+} from "@midscene/shared/img";
+
+// src/ai-model/prompt/util.ts
+import { NodeType as NodeType2 } from "@midscene/shared/constants";
+import { vlLocateMode as vlLocateMode3 } from "@midscene/shared/env";
+import {
+  descriptionOfTree,
+  generateElementByPosition,
+  treeToList as treeToList2
+} from "@midscene/shared/extractor";
+import { assert as assert4 } from "@midscene/shared/utils";
+function describeSize(size) {
+  return `${size.width} x ${size.height}`;
+}
+var distanceThreshold = 16;
+function elementByPositionWithElementInfo(treeRoot, position, options) {
+  const requireStrictDistance = options?.requireStrictDistance ?? true;
+  const filterPositionElements = options?.filterPositionElements ?? false;
+  assert4(typeof position !== "undefined", "position is required for query");
+  const matchingElements = [];
+  function dfs(node) {
+    if (node?.node) {
+      const item = node.node;
+      if (item.rect.left <= position.x && position.x <= item.rect.left + item.rect.width && item.rect.top <= position.y && position.y <= item.rect.top + item.rect.height) {
+        if (!(filterPositionElements && item.attributes?.nodeType === NodeType2.POSITION) && item.isVisible) {
+          matchingElements.push(item);
+        }
+      }
+    }
+    for (const child of node.children) {
+      dfs(child);
+    }
+  }
+  dfs(treeRoot);
+  if (matchingElements.length === 0) {
+    return void 0;
+  }
+  const element = matchingElements.reduce((smallest, current) => {
+    const smallestArea = smallest.rect.width * smallest.rect.height;
+    const currentArea = current.rect.width * current.rect.height;
+    return currentArea < smallestArea ? current : smallest;
+  });
+  const distanceToCenter = distance(
+    { x: element.center[0], y: element.center[1] },
+    position
+  );
+  if (requireStrictDistance) {
+    return distanceToCenter <= distanceThreshold ? element : void 0;
+  }
+  return element;
+}
+function distance(point1, point2) {
+  return Math.sqrt((point1.x - point2.x) ** 2 + (point1.y - point2.y) ** 2);
+}
+async function describeUserPage(context, opt) {
+  const { screenshotBase64 } = context;
+  let width;
+  let height;
+  if (context.size) {
+    ({ width, height } = context.size);
+  } else {
+    const imgSize = await imageInfoOfBase64(screenshotBase64);
+    ({ width, height } = imgSize);
+  }
+  const treeRoot = context.tree;
+  const idElementMap = {};
+  const flatElements = treeToList2(treeRoot);
+  if (opt?.domIncluded === true && flatElements.length >= 5e3) {
+    console.warn(
+      'The number of elements is too large, it may cause the prompt to be too long, please use domIncluded: "visible-only" to reduce the number of elements'
+    );
+  }
+  flatElements.forEach((element) => {
+    idElementMap[element.id] = element;
+    if (typeof element.indexId !== "undefined") {
+      idElementMap[`${element.indexId}`] = element;
+    }
+  });
+  let pageDescription = "";
+  const visibleOnly = opt?.visibleOnly ?? opt?.domIncluded === "visible-only";
+  if (opt?.domIncluded || !vlLocateMode3()) {
+    const contentTree = await descriptionOfTree(
+      treeRoot,
+      opt?.truncateTextLength,
+      opt?.filterNonTextContent,
+      visibleOnly
+    );
+    const sizeDescription = describeSize({ width, height });
+    pageDescription = `The size of the page: ${sizeDescription}
+The page elements tree:
+${contentTree}`;
+  }
+  return {
+    description: pageDescription,
+    elementById(idOrIndexId) {
+      assert4(typeof idOrIndexId !== "undefined", "id is required for query");
+      const item = idElementMap[`${idOrIndexId}`];
+      return item;
+    },
+    elementByPosition(position, size) {
+      return elementByPositionWithElementInfo(treeRoot, position);
+    },
+    insertElementByPosition(position) {
+      const element = generateElementByPosition(position);
+      treeRoot.children.push({
+        node: element,
+        children: []
+      });
+      flatElements.push(element);
+      idElementMap[element.id] = element;
+      return element;
+    },
+    size: { width, height }
+  };
+}
+
 // src/ai-model/prompt/playwright-generator.ts
 import { PLAYWRIGHT_EXAMPLE_CODE } from "@midscene/shared/constants";
 
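`safeParseJson` keeps leaning on the `jsonrepair` package to salvage near-JSON model output before the doubao/UI-TARS bbox fallback. A standalone illustration of what that first `JSON.parse(jsonrepair(...))` pass tolerates (the input string is made up):

```js
import { jsonrepair } from "jsonrepair";

// jsonrepair rewrites "almost JSON" (bare keys, single quotes, trailing commas)
// into strict JSON text that JSON.parse accepts.
const raw = "{ actions: [{ type: 'Tap' }], more_actions_needed_by_instruction: false, }";
const parsed = JSON.parse(jsonrepair(raw));
console.log(parsed.actions[0].type); // "Tap"
```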
@@ -2071,7 +1854,7 @@ Respond with YAML only, no explanations.`
     });
   }
   if (options.stream && options.onChunk) {
-    return await call2(prompt, 2 /* EXTRACT_DATA */, {
+    return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
       stream: true,
       onChunk: options.onChunk
     });
@@ -2194,7 +1977,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
     }
   ];
   if (options.stream && options.onChunk) {
-    return await call2(prompt, 2 /* EXTRACT_DATA */, {
+    return await call2(prompt, 2 /* EXTRACT_DATA */, void 0, {
       stream: true,
       onChunk: options.onChunk
     });
@@ -2215,7 +1998,7 @@ ${PLAYWRIGHT_EXAMPLE_CODE}`;
 import {
   MIDSCENE_USE_QWEN_VL,
   MIDSCENE_USE_VLM_UI_TARS,
-  getAIConfigInBoolean as
+  getAIConfigInBoolean as getAIConfigInBoolean2,
   vlLocateMode as vlLocateMode4
 } from "@midscene/shared/env";
 import {
@@ -2223,7 +2006,7 @@ import {
   paddingToMatchBlockByBase64,
   preProcessImageUrl
 } from "@midscene/shared/img";
-import { getDebug as
+import { getDebug as getDebug3 } from "@midscene/shared/logger";
 import { assert as assert5 } from "@midscene/shared/utils";
 
 // src/ai-model/prompt/extraction.ts
@@ -2379,8 +2162,8 @@ var sectionLocatorInstruction = new PromptTemplate4({
 });
 
 // src/ai-model/inspect.ts
-var debugInspect =
-var debugSection =
+var debugInspect = getDebug3("ai:inspect");
+var debugSection = getDebug3("ai:section");
 var extraTextFromUserPrompt = (prompt) => {
   if (typeof prompt === "string") {
     return prompt;
@@ -2601,7 +2384,7 @@ async function AiLocateSection(options) {
     imageBase64 = await cropByRect(
       screenshotBase64,
       sectionRect,
-
+      getAIConfigInBoolean2(MIDSCENE_USE_QWEN_VL)
     );
   }
   return {
@@ -2613,13 +2396,7 @@ async function AiLocateSection(options) {
   };
 }
 async function AiExtractElementInfo(options) {
-  const {
-    dataQuery,
-    context,
-    extractOption,
-    multimodalPrompt,
-    modelPreferences
-  } = options;
+  const { dataQuery, context, extractOption, multimodalPrompt } = options;
   const systemPrompt = systemPromptToExtract();
   const { screenshotBase64 } = context;
   const { description, elementById } = await describeUserPage(context, {
@@ -2668,8 +2445,7 @@ async function AiExtractElementInfo(options) {
   }
   const result = await callAiFn(
     msgs,
-    2 /* EXTRACT_DATA
-    modelPreferences
+    2 /* EXTRACT_DATA */
   );
   return {
     parseResult: result.content,
@@ -2682,7 +2458,7 @@ async function AiAssert(options) {
   assert5(assertion, "assertion should not be empty");
   const { screenshotBase64 } = context;
   const systemPrompt = systemPromptToAssert({
-    isUITars:
+    isUITars: getAIConfigInBoolean2(MIDSCENE_USE_VLM_UI_TARS)
   });
   const assertionText = extraTextFromUserPrompt(assertion);
   const msgs = [
@@ -2735,7 +2511,7 @@ async function plan(userInstruction, opts) {
   const { screenshotBase64, size } = context;
   const { description: pageDescription, elementById } = await describeUserPage(context);
   const systemPrompt = await systemPromptToTaskPlanning({
-
+    actionSpace: opts.actionSpace,
     vlMode: vlLocateMode5()
   });
   const taskBackgroundContextText = generateTaskBackgroundContext(
@@ -2835,7 +2611,7 @@ import {
 } from "@midscene/shared/env";
 import { resizeImgBase64 } from "@midscene/shared/img";
 import { transformHotkeyInput } from "@midscene/shared/keyboard-layout";
-import { getDebug as
+import { getDebug as getDebug4 } from "@midscene/shared/logger";
 import { assert as assert7 } from "@midscene/shared/utils";
 import { actionParser } from "@ui-tars/action-parser";
 
@@ -2875,7 +2651,7 @@ finished(content='xxx') # Use escape characters \\', \\", and \\n in content par
 var getSummary = (prediction) => prediction.replace(/Reflection:[\s\S]*?(?=Action_Summary:|Action:|$)/g, "").trim();
 
 // src/ai-model/ui-tars-planning.ts
-var debug =
+var debug = getDebug4("ui-tars-planning");
 var bboxSize = 10;
 var pointToBbox = (point, width, height) => {
   return [
@@ -3117,8 +2893,6 @@ async function resizeImageForUiTars(imageBase64, size) {
 
 export {
   systemPromptToLocateElement,
-  elementByPositionWithElementInfo,
-  describeUserPage,
   call2 as call,
   callToGetJSONObject,
   callAiFnWithStringResponse,
@@ -3126,6 +2900,8 @@ export {
   callAiFn,
   adaptBboxToRect,
   expandSearchArea,
+  elementByPositionWithElementInfo,
+  describeUserPage,
   generateYamlTest,
   generateYamlTestStream,
   generatePlaywrightTest,
@@ -3139,4 +2915,4 @@ export {
   resizeImageForUiTars
 };
 
-//# sourceMappingURL=chunk-
+//# sourceMappingURL=chunk-5IZMFZPA.js.map