@midscene/core 0.25.4-beta-20250811113343.0 → 0.25.4-beta-20250812025613.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/ai-model.js +1 -1
- package/dist/es/{chunk-5IZMFZPA.js → chunk-UC5NNLPY.js} +34 -38
- package/dist/es/chunk-UC5NNLPY.js.map +1 -0
- package/dist/es/{chunk-H5PRBRMX.js → chunk-YNPMUA35.js} +3 -3
- package/dist/es/index.js +2 -2
- package/dist/es/utils.js +1 -1
- package/dist/lib/ai-model.js +2 -2
- package/dist/lib/{chunk-5IZMFZPA.js → chunk-UC5NNLPY.js} +34 -38
- package/dist/lib/chunk-UC5NNLPY.js.map +1 -0
- package/dist/lib/{chunk-H5PRBRMX.js → chunk-YNPMUA35.js} +3 -3
- package/dist/lib/index.js +12 -12
- package/dist/lib/utils.js +2 -2
- package/package.json +3 -3
- package/dist/es/chunk-5IZMFZPA.js.map +0 -1
- package/dist/lib/chunk-5IZMFZPA.js.map +0 -1
- /package/dist/es/{chunk-H5PRBRMX.js.map → chunk-YNPMUA35.js.map} +0 -0
- /package/dist/lib/{chunk-H5PRBRMX.js.map → chunk-YNPMUA35.js.map} +0 -0
package/dist/es/index.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import {
|
|
2
2
|
getVersion
|
|
3
|
-
} from "./chunk-H5PRBRMX.js";
|
|
3
|
+
} from "./chunk-YNPMUA35.js";
|
|
4
4
|
import {
|
|
5
5
|
AiAssert,
|
|
6
6
|
AiExtractElementInfo,
|
|
@@ -11,7 +11,7 @@ import {
|
|
|
11
11
|
describeUserPage,
|
|
12
12
|
expandSearchArea,
|
|
13
13
|
plan
|
|
14
|
-
} from "./chunk-5IZMFZPA.js";
|
|
14
|
+
} from "./chunk-UC5NNLPY.js";
|
|
15
15
|
|
|
16
16
|
// src/ai-model/action-executor.ts
|
|
17
17
|
import {
|
package/dist/es/utils.js
CHANGED
package/dist/lib/ai-model.js
CHANGED
|
@@ -19,7 +19,7 @@
|
|
|
19
19
|
|
|
20
20
|
|
|
21
21
|
|
|
22
|
-
var _chunk5IZMFZPAjs = require('./chunk-5IZMFZPA.js');
|
|
22
|
+
var _chunkUC5NNLPYjs = require('./chunk-UC5NNLPY.js');
|
|
23
23
|
|
|
24
24
|
|
|
25
25
|
|
|
@@ -41,4 +41,4 @@ var _chunk5IZMFZPAjs = require('./chunk-5IZMFZPA.js');
|
|
|
41
41
|
|
|
42
42
|
|
|
43
43
|
|
|
44
|
-
exports.AIActionType =
|
|
44
|
+
exports.AIActionType = _chunkUC5NNLPYjs.AIActionType; exports.AiAssert = _chunkUC5NNLPYjs.AiAssert; exports.AiExtractElementInfo = _chunkUC5NNLPYjs.AiExtractElementInfo; exports.AiLocateElement = _chunkUC5NNLPYjs.AiLocateElement; exports.AiLocateSection = _chunkUC5NNLPYjs.AiLocateSection; exports.adaptBboxToRect = _chunkUC5NNLPYjs.adaptBboxToRect; exports.callAi = _chunkUC5NNLPYjs.call; exports.callAiFn = _chunkUC5NNLPYjs.callAiFn; exports.callAiFnWithStringResponse = _chunkUC5NNLPYjs.callAiFnWithStringResponse; exports.callToGetJSONObject = _chunkUC5NNLPYjs.callToGetJSONObject; exports.describeUserPage = _chunkUC5NNLPYjs.describeUserPage; exports.elementByPositionWithElementInfo = _chunkUC5NNLPYjs.elementByPositionWithElementInfo; exports.generatePlaywrightTest = _chunkUC5NNLPYjs.generatePlaywrightTest; exports.generatePlaywrightTestStream = _chunkUC5NNLPYjs.generatePlaywrightTestStream; exports.generateYamlTest = _chunkUC5NNLPYjs.generateYamlTest; exports.generateYamlTestStream = _chunkUC5NNLPYjs.generateYamlTestStream; exports.plan = _chunkUC5NNLPYjs.plan; exports.resizeImageForUiTars = _chunkUC5NNLPYjs.resizeImageForUiTars; exports.systemPromptToLocateElement = _chunkUC5NNLPYjs.systemPromptToLocateElement; exports.vlmPlanning = _chunkUC5NNLPYjs.vlmPlanning;
|
|
@@ -640,8 +640,8 @@ var vlCurrentLog = `"log": string, // Log what the next one action (ONLY ONE!) y
|
|
|
640
640
|
var llmCurrentLog = `"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action '{ action-type }' to do ..". If no action should be done, log the reason. ". Use the same language as the user's instruction.`;
|
|
641
641
|
var commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not expected according to the instruction. Use the same language as the user's instruction.
|
|
642
642
|
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
643
|
-
var vlLocateParam = "
|
|
644
|
-
var llmLocateParam = `locate: {"id": string, "prompt": string}`;
|
|
643
|
+
var vlLocateParam = (required) => `locate${required ? "" : "?"}: {bbox: [number, number, number, number], prompt: string }`;
|
|
644
|
+
var llmLocateParam = (required) => `locate${required ? "" : "?"}: {"id": string, "prompt": string}`;
|
|
645
645
|
var descriptionForAction = (action, locatorScheme) => {
|
|
646
646
|
const tab = " ";
|
|
647
647
|
let locateParam = "";
|
|
@@ -652,39 +652,32 @@ var descriptionForAction = (action, locatorScheme) => {
|
|
|
652
652
|
} else if (action.location === false) {
|
|
653
653
|
locateParam = "";
|
|
654
654
|
}
|
|
655
|
-
const locatorParam = locateParam ?
|
|
656
|
-
let whatToLocate = "";
|
|
655
|
+
const locatorParam = locateParam ? `- ${locateParam}` : "";
|
|
657
656
|
if (action.whatToLocate) {
|
|
658
657
|
if (!locateParam) {
|
|
659
658
|
console.warn(
|
|
660
659
|
`whatToLocate is provided for action ${action.name}, but location is not required or optional. The whatToLocate will be ignored.`
|
|
661
660
|
);
|
|
662
661
|
} else {
|
|
663
|
-
|
|
662
|
+
locateParam += ` // ${action.whatToLocate}`;
|
|
664
663
|
}
|
|
665
664
|
}
|
|
666
665
|
let paramSchema = "";
|
|
667
666
|
if (action.paramSchema) {
|
|
668
|
-
paramSchema =
|
|
667
|
+
paramSchema = `- param: ${action.paramSchema}`;
|
|
669
668
|
}
|
|
670
|
-
let paramDescription = "";
|
|
671
669
|
if (action.paramDescription) {
|
|
672
670
|
_assert2.default.call(void 0,
|
|
673
671
|
paramSchema,
|
|
674
672
|
`paramSchema is required when paramDescription is provided for action ${action.name}, but got ${action.paramSchema}`
|
|
675
673
|
);
|
|
676
|
-
|
|
677
|
-
}
|
|
678
|
-
const fields = [
|
|
679
|
-
|
|
680
|
-
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
].filter(Boolean);
|
|
684
|
-
return `- ${action.name}
|
|
685
|
-
- type: "${action.name}"
|
|
686
|
-
- description: ${action.description}
|
|
687
|
-
${fields.join("\n")}
|
|
674
|
+
paramSchema += ` // ${action.paramDescription}`;
|
|
675
|
+
}
|
|
676
|
+
const fields = [paramSchema, locatorParam].filter(Boolean);
|
|
677
|
+
return `- ${action.name}, ${action.description}
|
|
678
|
+
${tab}- type: "${action.name}"
|
|
679
|
+
${tab}${fields.join(`
|
|
680
|
+
${tab}`)}
|
|
688
681
|
`.trim();
|
|
689
682
|
};
|
|
690
683
|
var systemTemplateOfVLPlanning = ({
|
|
@@ -693,7 +686,7 @@ var systemTemplateOfVLPlanning = ({
|
|
|
693
686
|
}) => {
|
|
694
687
|
const actionNameList = actionSpace.map((action) => action.name).join(", ");
|
|
695
688
|
const actionDescriptionList = actionSpace.map(
|
|
696
|
-
(action) => descriptionForAction(action, vlLocateParam)
|
|
689
|
+
(action) => descriptionForAction(action, vlLocateParam(action.location === "required"))
|
|
697
690
|
);
|
|
698
691
|
const actionList = actionDescriptionList.join("\n");
|
|
699
692
|
return `
|
|
@@ -747,7 +740,10 @@ var systemTemplateOfLLM = ({
|
|
|
747
740
|
}) => {
|
|
748
741
|
const actionNameList = actionSpace.map((action) => action.name).join(" / ");
|
|
749
742
|
const actionDescriptionList = actionSpace.map(
|
|
750
|
-
(action) => descriptionForAction(
|
|
743
|
+
(action) => descriptionForAction(
|
|
744
|
+
action,
|
|
745
|
+
llmLocateParam(action.location === "required")
|
|
746
|
+
)
|
|
751
747
|
);
|
|
752
748
|
const actionList = actionDescriptionList.join("\n");
|
|
753
749
|
return `
|
|
@@ -764,16 +760,15 @@ You are a versatile professional in software UI automation. Your outstanding con
|
|
|
764
760
|
## Workflow
|
|
765
761
|
|
|
766
762
|
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
767
|
-
2. Decompose the user's task into a sequence of actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
|
|
768
|
-
3.
|
|
769
|
-
|
|
770
|
-
|
|
771
|
-
|
|
772
|
-
- If no, don't plan more actions by closing the array. Get ready to reevaluate the task. Some talent people like you will handle this. Give him a clear description of what have been done and what to do next. Put your new plan in the \`furtherPlan\` field. The "How to compose the \`taskWillBeAccomplished\` and \`furtherPlan\` fields" section will give you more details.
|
|
763
|
+
2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
|
|
764
|
+
3. Consider whether the user's instruction will be accomplished after the actions you composed.
|
|
765
|
+
- If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
|
|
766
|
+
- If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
|
|
767
|
+
4. If the task is not feasible on this page, set \`error\` field to the reason.
|
|
773
768
|
|
|
774
769
|
## Constraints
|
|
775
770
|
|
|
776
|
-
- All the actions you composed MUST be
|
|
771
|
+
- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
|
|
777
772
|
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
|
|
778
773
|
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
|
|
779
774
|
- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
|
|
@@ -811,15 +806,20 @@ The JSON format is as follows:
|
|
|
811
806
|
|
|
812
807
|
### Example: Decompose a task
|
|
813
808
|
|
|
814
|
-
When
|
|
809
|
+
When you received the following information:
|
|
810
|
+
|
|
811
|
+
* Instruction: 'Click the language switch button, wait 1s, click "English"'
|
|
812
|
+
* Logs: null
|
|
813
|
+
* Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
|
|
815
814
|
|
|
816
815
|
By viewing the page screenshot and description, you should consider this and output the JSON:
|
|
817
816
|
|
|
818
|
-
* The
|
|
819
|
-
* The language switch button is shown in the screenshot,
|
|
817
|
+
* The user intent is: tap the switch button, sleep, and tap the 'English' option
|
|
818
|
+
* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
|
|
819
|
+
* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
|
|
820
820
|
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
|
|
821
821
|
* Log what these action do: Click the language switch button to open the language options. Wait for 1 second.
|
|
822
|
-
* The task cannot be accomplished (because
|
|
822
|
+
* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
|
|
823
823
|
|
|
824
824
|
{
|
|
825
825
|
"actions":[
|
|
@@ -849,7 +849,7 @@ Wrong output:
|
|
|
849
849
|
"thought": "Click the language switch button to open the language options.",
|
|
850
850
|
"param": null,
|
|
851
851
|
"locate": {
|
|
852
|
-
{ "id": "c81c4e9a33" }, // WRONG: prompt is missing
|
|
852
|
+
{ "id": "c81c4e9a33" }, // WRONG: prompt is missing, this is not a valid LocateParam
|
|
853
853
|
}
|
|
854
854
|
},
|
|
855
855
|
{
|
|
@@ -862,10 +862,6 @@ Wrong output:
|
|
|
862
862
|
"more_actions_needed_by_instruction": false, // WRONG: should be true
|
|
863
863
|
"log": "Click the language switch button to open the language options",
|
|
864
864
|
}
|
|
865
|
-
|
|
866
|
-
Reason:
|
|
867
|
-
* The \`prompt\` is missing in the first 'Locate' action
|
|
868
|
-
* Since the option button is not shown in the screenshot, there are still more actions to be done, so the \`more_actions_needed_by_instruction\` field should be true
|
|
869
865
|
`;
|
|
870
866
|
async function systemPromptToTaskPlanning({
|
|
871
867
|
actionSpace,
|
|
@@ -2915,4 +2911,4 @@ async function resizeImageForUiTars(imageBase64, size) {
|
|
|
2915
2911
|
|
|
2916
2912
|
exports.systemPromptToLocateElement = systemPromptToLocateElement; exports.call = call2; exports.callToGetJSONObject = callToGetJSONObject; exports.callAiFnWithStringResponse = callAiFnWithStringResponse; exports.AIActionType = AIActionType; exports.callAiFn = callAiFn; exports.adaptBboxToRect = adaptBboxToRect; exports.expandSearchArea = expandSearchArea; exports.elementByPositionWithElementInfo = elementByPositionWithElementInfo; exports.describeUserPage = describeUserPage; exports.generateYamlTest = generateYamlTest; exports.generateYamlTestStream = generateYamlTestStream; exports.generatePlaywrightTest = generatePlaywrightTest; exports.generatePlaywrightTestStream = generatePlaywrightTestStream; exports.AiLocateElement = AiLocateElement; exports.AiLocateSection = AiLocateSection; exports.AiExtractElementInfo = AiExtractElementInfo; exports.AiAssert = AiAssert; exports.plan = plan; exports.vlmPlanning = vlmPlanning; exports.resizeImageForUiTars = resizeImageForUiTars;
|
|
2917
2913
|
|
|
2918
|
-
//# sourceMappingURL=chunk-5IZMFZPA.js.map
|
|
2914
|
+
//# sourceMappingURL=chunk-UC5NNLPY.js.map
|