@midscene/core 0.30.10 → 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/agent/agent.mjs +233 -144
- package/dist/es/agent/agent.mjs.map +1 -1
- package/dist/es/agent/execution-session.mjs +41 -0
- package/dist/es/agent/execution-session.mjs.map +1 -0
- package/dist/es/agent/index.mjs +3 -3
- package/dist/es/agent/task-builder.mjs +319 -0
- package/dist/es/agent/task-builder.mjs.map +1 -0
- package/dist/es/agent/task-cache.mjs +4 -4
- package/dist/es/agent/task-cache.mjs.map +1 -1
- package/dist/es/agent/tasks.mjs +197 -504
- package/dist/es/agent/tasks.mjs.map +1 -1
- package/dist/es/agent/ui-utils.mjs +54 -35
- package/dist/es/agent/ui-utils.mjs.map +1 -1
- package/dist/es/agent/utils.mjs +16 -58
- package/dist/es/agent/utils.mjs.map +1 -1
- package/dist/es/ai-model/conversation-history.mjs +25 -13
- package/dist/es/ai-model/conversation-history.mjs.map +1 -1
- package/dist/es/ai-model/index.mjs +4 -4
- package/dist/es/ai-model/inspect.mjs +45 -54
- package/dist/es/ai-model/inspect.mjs.map +1 -1
- package/dist/es/ai-model/llm-planning.mjs +47 -65
- package/dist/es/ai-model/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/assertion.mjs.map +1 -1
- package/dist/es/ai-model/prompt/common.mjs.map +1 -1
- package/dist/es/ai-model/prompt/describe.mjs.map +1 -1
- package/dist/es/ai-model/prompt/extraction.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-locator.mjs +11 -235
- package/dist/es/ai-model/prompt/llm-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-planning.mjs +76 -322
- package/dist/es/ai-model/prompt/llm-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/llm-section-locator.mjs +15 -14
- package/dist/es/ai-model/prompt/llm-section-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs +35 -0
- package/dist/es/ai-model/prompt/order-sensitive-judge.mjs.map +1 -0
- package/dist/es/ai-model/prompt/playwright-generator.mjs +2 -2
- package/dist/es/ai-model/prompt/playwright-generator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/ui-tars-locator.mjs.map +1 -1
- package/dist/es/ai-model/prompt/ui-tars-planning.mjs.map +1 -1
- package/dist/es/ai-model/prompt/util.mjs +3 -88
- package/dist/es/ai-model/prompt/util.mjs.map +1 -1
- package/dist/es/ai-model/prompt/yaml-generator.mjs +10 -10
- package/dist/es/ai-model/prompt/yaml-generator.mjs.map +1 -1
- package/dist/es/ai-model/service-caller/index.mjs +182 -274
- package/dist/es/ai-model/service-caller/index.mjs.map +1 -1
- package/dist/es/ai-model/ui-tars-planning.mjs +69 -8
- package/dist/es/ai-model/ui-tars-planning.mjs.map +1 -1
- package/dist/es/{ai-model/common.mjs → common.mjs} +18 -30
- package/dist/es/common.mjs.map +1 -0
- package/dist/es/device/device-options.mjs +0 -0
- package/dist/es/device/index.mjs +29 -12
- package/dist/es/device/index.mjs.map +1 -1
- package/dist/es/index.mjs +5 -4
- package/dist/es/index.mjs.map +1 -1
- package/dist/es/report.mjs.map +1 -1
- package/dist/es/{insight → service}/index.mjs +38 -51
- package/dist/es/service/index.mjs.map +1 -0
- package/dist/es/{insight → service}/utils.mjs +3 -3
- package/dist/es/service/utils.mjs.map +1 -0
- package/dist/es/task-runner.mjs +264 -0
- package/dist/es/task-runner.mjs.map +1 -0
- package/dist/es/tree.mjs +13 -2
- package/dist/es/tree.mjs.map +1 -0
- package/dist/es/types.mjs +18 -1
- package/dist/es/types.mjs.map +1 -1
- package/dist/es/utils.mjs +6 -7
- package/dist/es/utils.mjs.map +1 -1
- package/dist/es/yaml/builder.mjs.map +1 -1
- package/dist/es/yaml/player.mjs +121 -98
- package/dist/es/yaml/player.mjs.map +1 -1
- package/dist/es/yaml/utils.mjs +1 -1
- package/dist/es/yaml/utils.mjs.map +1 -1
- package/dist/lib/agent/agent.js +231 -142
- package/dist/lib/agent/agent.js.map +1 -1
- package/dist/lib/agent/common.js +1 -1
- package/dist/lib/agent/execution-session.js +75 -0
- package/dist/lib/agent/execution-session.js.map +1 -0
- package/dist/lib/agent/index.js +14 -14
- package/dist/lib/agent/index.js.map +1 -1
- package/dist/lib/agent/task-builder.js +356 -0
- package/dist/lib/agent/task-builder.js.map +1 -0
- package/dist/lib/agent/task-cache.js +8 -8
- package/dist/lib/agent/task-cache.js.map +1 -1
- package/dist/lib/agent/tasks.js +202 -506
- package/dist/lib/agent/tasks.js.map +1 -1
- package/dist/lib/agent/ui-utils.js +58 -36
- package/dist/lib/agent/ui-utils.js.map +1 -1
- package/dist/lib/agent/utils.js +26 -68
- package/dist/lib/agent/utils.js.map +1 -1
- package/dist/lib/ai-model/conversation-history.js +27 -15
- package/dist/lib/ai-model/conversation-history.js.map +1 -1
- package/dist/lib/ai-model/index.js +27 -27
- package/dist/lib/ai-model/index.js.map +1 -1
- package/dist/lib/ai-model/inspect.js +51 -57
- package/dist/lib/ai-model/inspect.js.map +1 -1
- package/dist/lib/ai-model/llm-planning.js +49 -67
- package/dist/lib/ai-model/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/assertion.js +2 -2
- package/dist/lib/ai-model/prompt/assertion.js.map +1 -1
- package/dist/lib/ai-model/prompt/common.js +2 -2
- package/dist/lib/ai-model/prompt/common.js.map +1 -1
- package/dist/lib/ai-model/prompt/describe.js +2 -2
- package/dist/lib/ai-model/prompt/describe.js.map +1 -1
- package/dist/lib/ai-model/prompt/extraction.js +2 -2
- package/dist/lib/ai-model/prompt/extraction.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-locator.js +14 -241
- package/dist/lib/ai-model/prompt/llm-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-planning.js +79 -328
- package/dist/lib/ai-model/prompt/llm-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/llm-section-locator.js +17 -16
- package/dist/lib/ai-model/prompt/llm-section-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js +72 -0
- package/dist/lib/ai-model/prompt/order-sensitive-judge.js.map +1 -0
- package/dist/lib/ai-model/prompt/playwright-generator.js +11 -11
- package/dist/lib/ai-model/prompt/playwright-generator.js.map +1 -1
- package/dist/lib/ai-model/prompt/ui-tars-locator.js +2 -2
- package/dist/lib/ai-model/prompt/ui-tars-locator.js.map +1 -1
- package/dist/lib/ai-model/prompt/ui-tars-planning.js +2 -2
- package/dist/lib/ai-model/prompt/ui-tars-planning.js.map +1 -1
- package/dist/lib/ai-model/prompt/util.js +7 -95
- package/dist/lib/ai-model/prompt/util.js.map +1 -1
- package/dist/lib/ai-model/prompt/yaml-generator.js +18 -18
- package/dist/lib/ai-model/prompt/yaml-generator.js.map +1 -1
- package/dist/lib/ai-model/service-caller/index.js +288 -401
- package/dist/lib/ai-model/service-caller/index.js.map +1 -1
- package/dist/lib/ai-model/ui-tars-planning.js +71 -10
- package/dist/lib/ai-model/ui-tars-planning.js.map +1 -1
- package/dist/lib/{ai-model/common.js → common.js} +40 -55
- package/dist/lib/common.js.map +1 -0
- package/dist/lib/device/device-options.js +20 -0
- package/dist/lib/device/device-options.js.map +1 -0
- package/dist/lib/device/index.js +63 -40
- package/dist/lib/device/index.js.map +1 -1
- package/dist/lib/image/index.js +5 -5
- package/dist/lib/image/index.js.map +1 -1
- package/dist/lib/index.js +24 -20
- package/dist/lib/index.js.map +1 -1
- package/dist/lib/report.js +2 -2
- package/dist/lib/report.js.map +1 -1
- package/dist/lib/{insight → service}/index.js +41 -54
- package/dist/lib/service/index.js.map +1 -0
- package/dist/lib/{insight → service}/utils.js +7 -7
- package/dist/lib/service/utils.js.map +1 -0
- package/dist/lib/task-runner.js +301 -0
- package/dist/lib/task-runner.js.map +1 -0
- package/dist/lib/tree.js +13 -4
- package/dist/lib/tree.js.map +1 -1
- package/dist/lib/types.js +31 -12
- package/dist/lib/types.js.map +1 -1
- package/dist/lib/utils.js +16 -17
- package/dist/lib/utils.js.map +1 -1
- package/dist/lib/yaml/builder.js +2 -2
- package/dist/lib/yaml/builder.js.map +1 -1
- package/dist/lib/yaml/index.js +16 -22
- package/dist/lib/yaml/index.js.map +1 -1
- package/dist/lib/yaml/player.js +123 -100
- package/dist/lib/yaml/player.js.map +1 -1
- package/dist/lib/yaml/utils.js +6 -6
- package/dist/lib/yaml/utils.js.map +1 -1
- package/dist/lib/yaml.js +1 -1
- package/dist/lib/yaml.js.map +1 -1
- package/dist/types/agent/agent.d.ts +62 -17
- package/dist/types/agent/execution-session.d.ts +36 -0
- package/dist/types/agent/index.d.ts +3 -2
- package/dist/types/agent/task-builder.d.ts +35 -0
- package/dist/types/agent/tasks.d.ts +32 -23
- package/dist/types/agent/ui-utils.d.ts +9 -2
- package/dist/types/agent/utils.d.ts +9 -35
- package/dist/types/ai-model/conversation-history.d.ts +8 -4
- package/dist/types/ai-model/index.d.ts +5 -5
- package/dist/types/ai-model/inspect.d.ts +20 -12
- package/dist/types/ai-model/llm-planning.d.ts +3 -1
- package/dist/types/ai-model/prompt/llm-locator.d.ts +1 -6
- package/dist/types/ai-model/prompt/llm-planning.d.ts +2 -3
- package/dist/types/ai-model/prompt/llm-section-locator.d.ts +1 -3
- package/dist/types/ai-model/prompt/order-sensitive-judge.d.ts +2 -0
- package/dist/types/ai-model/prompt/util.d.ts +2 -34
- package/dist/types/ai-model/service-caller/index.d.ts +2 -3
- package/dist/types/ai-model/ui-tars-planning.d.ts +15 -2
- package/dist/types/{ai-model/common.d.ts → common.d.ts} +6 -6
- package/dist/types/device/device-options.d.ts +57 -0
- package/dist/types/device/index.d.ts +55 -39
- package/dist/types/index.d.ts +7 -6
- package/dist/types/service/index.d.ts +26 -0
- package/dist/types/service/utils.d.ts +2 -0
- package/dist/types/task-runner.d.ts +49 -0
- package/dist/types/tree.d.ts +4 -1
- package/dist/types/types.d.ts +103 -66
- package/dist/types/yaml/utils.d.ts +1 -1
- package/dist/types/yaml.d.ts +68 -43
- package/package.json +9 -12
- package/dist/es/ai-model/action-executor.mjs +0 -129
- package/dist/es/ai-model/action-executor.mjs.map +0 -1
- package/dist/es/ai-model/common.mjs.map +0 -1
- package/dist/es/insight/index.mjs.map +0 -1
- package/dist/es/insight/utils.mjs.map +0 -1
- package/dist/lib/ai-model/action-executor.js +0 -163
- package/dist/lib/ai-model/action-executor.js.map +0 -1
- package/dist/lib/ai-model/common.js.map +0 -1
- package/dist/lib/insight/index.js.map +0 -1
- package/dist/lib/insight/utils.js.map +0 -1
- package/dist/types/ai-model/action-executor.d.ts +0 -19
- package/dist/types/insight/index.d.ts +0 -31
- package/dist/types/insight/utils.d.ts +0 -2
|
@@ -1,367 +1,121 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { getZodDescription, getZodTypeName } from "@midscene/shared/zod-schema-utils";
|
|
2
2
|
import { bboxDescription } from "./common.mjs";
|
|
3
|
-
const vlCurrentLog = '"log": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: "The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action \'{ action-type }\' to do ....". If no action should be done, log the reason. Use the same language as the user\'s instruction.';
|
|
4
|
-
const llmCurrentLog = '"log": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like "Now i want to use action \'{ action-type }\' to do ..". If no action should be done, log the reason. ". Use the same language as the user\'s instruction.';
|
|
5
3
|
const commonOutputFields = `"error"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.
|
|
6
4
|
"more_actions_needed_by_instruction": boolean, // Consider if there is still more action(s) to do after the action in "Log" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;
|
|
7
|
-
const vlLocateParam = ()=>
|
|
8
|
-
|
|
5
|
+
const vlLocateParam = (vlMode)=>{
|
|
6
|
+
if (vlMode) return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;
|
|
7
|
+
return "{ prompt: string /* description of the target element */ }";
|
|
8
|
+
};
|
|
9
9
|
const descriptionForAction = (action, locatorSchemaTypeDescription)=>{
|
|
10
10
|
const tab = ' ';
|
|
11
11
|
const fields = [];
|
|
12
12
|
fields.push(`- type: "${action.name}"`);
|
|
13
13
|
if (action.paramSchema) {
|
|
14
|
-
const shape = action.paramSchema.shape;
|
|
15
|
-
if (!shape) console.warn(`action.paramSchema is not a ZodObject, may lead to unexpected behavior, action name: ${action.name}`);
|
|
16
14
|
const paramLines = [];
|
|
17
|
-
const
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
const fieldTypeName = null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName;
|
|
30
|
-
if ('ZodString' === fieldTypeName) return 'string';
|
|
31
|
-
if ('ZodNumber' === fieldTypeName) return 'number';
|
|
32
|
-
if ('ZodBoolean' === fieldTypeName) return 'boolean';
|
|
33
|
-
if ('ZodArray' === fieldTypeName) return 'array';
|
|
34
|
-
if ('ZodObject' === fieldTypeName) {
|
|
35
|
-
if (ifMidsceneLocatorField(actualField)) return locatorSchemaTypeDescription;
|
|
36
|
-
return 'object';
|
|
37
|
-
}
|
|
38
|
-
if ('ZodEnum' === fieldTypeName) {
|
|
39
|
-
var _actualField__def_values, _actualField__def1;
|
|
40
|
-
const values = (null == (_actualField__def1 = actualField._def) ? void 0 : null == (_actualField__def_values = _actualField__def1.values) ? void 0 : _actualField__def_values.map((option)=>String(`'${option}'`)).join(', ')) ?? 'enum';
|
|
41
|
-
return `enum(${values})`;
|
|
15
|
+
const schema = action.paramSchema;
|
|
16
|
+
const isZodObject = schema._def?.typeName === 'ZodObject';
|
|
17
|
+
if (isZodObject && schema.shape) {
|
|
18
|
+
const shape = schema.shape;
|
|
19
|
+
for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
|
|
20
|
+
const isOptional = 'function' == typeof field.isOptional && field.isOptional();
|
|
21
|
+
const keyWithOptional = isOptional ? `${key}?` : key;
|
|
22
|
+
const typeName = getZodTypeName(field, locatorSchemaTypeDescription);
|
|
23
|
+
const description = getZodDescription(field);
|
|
24
|
+
let paramLine = `${keyWithOptional}: ${typeName}`;
|
|
25
|
+
if (description) paramLine += ` // ${description}`;
|
|
26
|
+
paramLines.push(paramLine);
|
|
42
27
|
}
|
|
43
|
-
if (
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
return types.join(' | ');
|
|
49
|
-
}
|
|
50
|
-
return 'union';
|
|
28
|
+
if (paramLines.length > 0) {
|
|
29
|
+
fields.push('- param:');
|
|
30
|
+
paramLines.forEach((line)=>{
|
|
31
|
+
fields.push(` - ${line}`);
|
|
32
|
+
});
|
|
51
33
|
}
|
|
52
|
-
|
|
53
|
-
|
|
54
|
-
|
|
55
|
-
|
|
56
|
-
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
const typeName = f._def.typeName;
|
|
60
|
-
if ('ZodOptional' === typeName || 'ZodNullable' === typeName || 'ZodDefault' === typeName) return unwrapField(f._def.innerType);
|
|
61
|
-
if ('ZodEffects' === typeName) {
|
|
62
|
-
if (f._def.schema) return unwrapField(f._def.schema);
|
|
63
|
-
}
|
|
64
|
-
return f;
|
|
65
|
-
};
|
|
66
|
-
if ("description" in field) return field.description || null;
|
|
67
|
-
const actualField = unwrapField(field);
|
|
68
|
-
if ("description" in actualField) return actualField.description || null;
|
|
69
|
-
if ((null == (_actualField__def = actualField._def) ? void 0 : _actualField__def.typeName) === 'ZodObject') {
|
|
70
|
-
if ('midscene_location_field_flag' in actualField._def.shape()) return 'Location information for the target element';
|
|
71
|
-
}
|
|
72
|
-
return null;
|
|
73
|
-
};
|
|
74
|
-
for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
|
|
75
|
-
const isOptional = 'function' == typeof field.isOptional && field.isOptional();
|
|
76
|
-
const keyWithOptional = isOptional ? `${key}?` : key;
|
|
77
|
-
const typeName = getTypeName(field);
|
|
78
|
-
const description = getDescription(field);
|
|
79
|
-
let paramLine = `${keyWithOptional}: ${typeName}`;
|
|
80
|
-
if (description) paramLine += ` // ${description}`;
|
|
81
|
-
paramLines.push(paramLine);
|
|
82
|
-
}
|
|
83
|
-
if (paramLines.length > 0) {
|
|
84
|
-
fields.push('- param:');
|
|
85
|
-
for (const paramLine of paramLines)fields.push(` - ${paramLine}`);
|
|
34
|
+
} else {
|
|
35
|
+
const typeName = getZodTypeName(schema);
|
|
36
|
+
const description = getZodDescription(schema);
|
|
37
|
+
let paramDescription = `- param: ${typeName}`;
|
|
38
|
+
if (description) paramDescription += ` // ${description}`;
|
|
39
|
+
paramDescription += ' (pass the value directly, not as an object)';
|
|
40
|
+
fields.push(paramDescription);
|
|
86
41
|
}
|
|
87
42
|
}
|
|
88
43
|
return `- ${action.name}, ${action.description || "No description provided"}
|
|
89
44
|
${tab}${fields.join(`\n${tab}`)}
|
|
90
45
|
`.trim();
|
|
91
46
|
};
|
|
92
|
-
|
|
93
|
-
|
|
94
|
-
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam()));
|
|
47
|
+
async function systemPromptToTaskPlanning({ actionSpace, vlMode, includeBbox }) {
|
|
48
|
+
if (includeBbox && !vlMode) throw new Error('vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.');
|
|
49
|
+
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, vlLocateParam(includeBbox ? vlMode : void 0)));
|
|
95
50
|
const actionList = actionDescriptionList.join('\n');
|
|
51
|
+
const logFieldInstruction = `
|
|
52
|
+
## About the \`log\` field (preamble message)
|
|
53
|
+
|
|
54
|
+
The \`log\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:
|
|
55
|
+
|
|
56
|
+
- **Use the same language as the user's instruction**
|
|
57
|
+
- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).
|
|
58
|
+
- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.
|
|
59
|
+
- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.
|
|
60
|
+
|
|
61
|
+
**Examples:**
|
|
62
|
+
- "Click the login button"
|
|
63
|
+
- "Scroll to find the 'Yes' button in popup"
|
|
64
|
+
- "Previous actions failed to find the 'Yes' button, i will try again"
|
|
65
|
+
- "Go back to find the login button"
|
|
66
|
+
`;
|
|
96
67
|
return `
|
|
97
|
-
Target: User will give you
|
|
68
|
+
Target: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.
|
|
98
69
|
|
|
99
|
-
|
|
100
|
-
- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.
|
|
101
|
-
- Always give ONLY ONE action in \`log\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.
|
|
102
|
-
- Don't repeat actions in the previous logs.
|
|
103
|
-
- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.
|
|
70
|
+
Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires.
|
|
104
71
|
|
|
105
|
-
|
|
72
|
+
## Rules
|
|
73
|
+
|
|
74
|
+
- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.
|
|
75
|
+
- Give just the next ONE action you should do
|
|
76
|
+
- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.
|
|
77
|
+
- Make sure the previous actions are completed successfully before performing the next step
|
|
78
|
+
- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the "error" field to the error message.
|
|
79
|
+
- If there is nothing to do but waiting, set the "sleep" field to the positive waiting time in milliseconds and null for the "action" field.
|
|
80
|
+
- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the "Print_Assert_Result" action.
|
|
81
|
+
|
|
82
|
+
## Supporting actions
|
|
106
83
|
${actionList}
|
|
107
84
|
|
|
108
|
-
|
|
109
|
-
|
|
85
|
+
${logFieldInstruction}
|
|
86
|
+
|
|
87
|
+
## Return format
|
|
110
88
|
|
|
111
89
|
Return in JSON format:
|
|
112
90
|
{
|
|
113
|
-
|
|
91
|
+
"log": string, // a brief preamble to the user explaining what you’re about to do
|
|
114
92
|
${commonOutputFields}
|
|
115
93
|
"action":
|
|
116
94
|
{
|
|
117
|
-
//
|
|
95
|
+
"type": string, // the type of the action
|
|
96
|
+
"param"?: { // The parameter of the action, if any
|
|
97
|
+
// k-v style parameter fields
|
|
98
|
+
},
|
|
118
99
|
} | null,
|
|
119
100
|
,
|
|
120
101
|
"sleep"?: number, // The sleep time after the action, in milliseconds.
|
|
121
102
|
}
|
|
122
103
|
|
|
123
|
-
For example,
|
|
124
|
-
|
|
125
|
-
this and output the JSON:
|
|
104
|
+
For example, if the instruction is to login and the form has already been filled, this is a valid return value:
|
|
126
105
|
|
|
127
106
|
{
|
|
128
|
-
"log": "
|
|
107
|
+
"log": "Click the login button",
|
|
108
|
+
"more_actions_needed_by_instruction": false,
|
|
129
109
|
"action": {
|
|
130
110
|
"type": "Tap",
|
|
131
111
|
"param": {
|
|
132
|
-
"locate": {
|
|
133
|
-
"bbox": [100,
|
|
134
|
-
"prompt": "The 'Yes' button in popup"
|
|
135
|
-
}
|
|
136
|
-
}
|
|
137
|
-
},
|
|
138
|
-
"more_actions_needed_by_instruction": false,
|
|
139
|
-
}
|
|
140
|
-
`;
|
|
141
|
-
};
|
|
142
|
-
const systemTemplateOfLLM = ({ actionSpace })=>{
|
|
143
|
-
const actionNameList = actionSpace.map((action)=>action.name).join(' / ');
|
|
144
|
-
const actionDescriptionList = actionSpace.map((action)=>descriptionForAction(action, llmLocateParam()));
|
|
145
|
-
const actionList = actionDescriptionList.join('\n');
|
|
146
|
-
return `
|
|
147
|
-
## Role
|
|
148
|
-
|
|
149
|
-
You are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.
|
|
150
|
-
|
|
151
|
-
## Objective
|
|
152
|
-
|
|
153
|
-
- Decompose the instruction user asked into a series of actions
|
|
154
|
-
- Locate the target element if possible
|
|
155
|
-
- If the instruction cannot be accomplished, give a further plan.
|
|
156
|
-
|
|
157
|
-
## Workflow
|
|
158
|
-
|
|
159
|
-
1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.
|
|
160
|
-
2. Decompose the user's task into a sequence of feasible actions, and place it in the \`actions\` field. There are different types of actions (${actionNameList}). The "About the action" section below will give you more details.
|
|
161
|
-
3. Consider whether the user's instruction will be accomplished after the actions you composed.
|
|
162
|
-
- If the instruction is accomplished, set \`more_actions_needed_by_instruction\` to false.
|
|
163
|
-
- If more actions are needed, set \`more_actions_needed_by_instruction\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \`log\` field, he or she will continue the task according to your logs.
|
|
164
|
-
4. If the task is not feasible on this page, set \`error\` field to the reason.
|
|
165
|
-
|
|
166
|
-
## Constraints
|
|
167
|
-
|
|
168
|
-
- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.
|
|
169
|
-
- Trust the "What have been done" field about the task (if any), don't repeat actions in it.
|
|
170
|
-
- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \`\`\`json\`\`\`.
|
|
171
|
-
- If the screenshot and the instruction are totally irrelevant, set reason in the \`error\` field.
|
|
172
|
-
|
|
173
|
-
## About the \`actions\` field
|
|
174
|
-
|
|
175
|
-
The \`locate\` param is commonly used in the \`param\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:
|
|
176
|
-
|
|
177
|
-
type LocateParam = {
|
|
178
|
-
"id": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.
|
|
179
|
-
"prompt"?: string // the description of the element to find. It can only be omitted when locate is null.
|
|
180
|
-
} | null // If it's not on the page, the LocateParam should be null
|
|
181
|
-
|
|
182
|
-
## Supported actions
|
|
183
|
-
|
|
184
|
-
Each action has a \`type\` and corresponding \`param\`. To be detailed:
|
|
185
|
-
${actionList}
|
|
186
|
-
|
|
187
|
-
`.trim();
|
|
188
|
-
};
|
|
189
|
-
const outputTemplate = `
|
|
190
|
-
## Output JSON Format:
|
|
191
|
-
|
|
192
|
-
The JSON format is as follows:
|
|
193
|
-
|
|
194
|
-
{
|
|
195
|
-
"actions": [
|
|
196
|
-
// ... some actions
|
|
197
|
-
],
|
|
198
|
-
${llmCurrentLog}
|
|
199
|
-
${commonOutputFields}
|
|
200
|
-
}
|
|
201
|
-
|
|
202
|
-
## Examples
|
|
203
|
-
|
|
204
|
-
### Example: Decompose a task
|
|
205
|
-
|
|
206
|
-
When you received the following information:
|
|
207
|
-
|
|
208
|
-
* Instruction: 'Click the language switch button, wait 1s, click "English"'
|
|
209
|
-
* Logs: null
|
|
210
|
-
* Page Context (screenshot and description) shows: There is a language switch button, and the "English" option is not shown in the screenshot now.
|
|
211
|
-
|
|
212
|
-
By viewing the page screenshot and description, you should consider this and output the JSON:
|
|
213
|
-
|
|
214
|
-
* The user intent is: tap the switch button, sleep, and tap the 'English' option
|
|
215
|
-
* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.
|
|
216
|
-
* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.
|
|
217
|
-
* The "English" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.
|
|
218
|
-
* Compose the log: The user wants to do click the language switch button, wait 1s, click "English". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.
|
|
219
|
-
* The task cannot be accomplished (because the last tapping action is not finished yet), so the \`more_actions_needed_by_instruction\` field is true. The \`error\` field is null.
|
|
220
|
-
|
|
221
|
-
{
|
|
222
|
-
"actions":[
|
|
223
|
-
{
|
|
224
|
-
"thought": "Click the language switch button to open the language options.",
|
|
225
|
-
"type": "Tap",
|
|
226
|
-
"param": {
|
|
227
|
-
"locate": { id: "c81c4e9a33", prompt: "The language switch button" }
|
|
228
|
-
}
|
|
229
|
-
},
|
|
230
|
-
{
|
|
231
|
-
"thought": "Wait for 1 second to ensure the language options are displayed.",
|
|
232
|
-
"type": "Sleep",
|
|
233
|
-
"param": { "timeMs": 1000 },
|
|
234
|
-
}
|
|
235
|
-
],
|
|
236
|
-
"error": null,
|
|
237
|
-
"more_actions_needed_by_instruction": true,
|
|
238
|
-
"log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
|
|
239
|
-
}
|
|
240
|
-
|
|
241
|
-
### Example: What NOT to do
|
|
242
|
-
Wrong output:
|
|
243
|
-
{
|
|
244
|
-
"actions":[
|
|
245
|
-
{
|
|
246
|
-
"thought": "Click the language switch button to open the language options.",
|
|
247
|
-
"type": "Tap",
|
|
248
|
-
"param": {
|
|
249
|
-
"locate": { "id": "c81c4e9a33" } // WRONG: prompt is missing, this is not a valid LocateParam
|
|
250
|
-
}
|
|
251
|
-
},
|
|
252
|
-
{
|
|
253
|
-
"thought": "Click the English option",
|
|
254
|
-
"type": "Tap",
|
|
255
|
-
"param": {
|
|
256
|
-
"locate": null // WRONG: if the element is not on the page, you should not plan this action
|
|
112
|
+
"locate": {
|
|
113
|
+
"prompt": "The login button"${vlMode ? ', "bbox": [100, 200, 300, 400]' : ''}
|
|
257
114
|
}
|
|
258
115
|
}
|
|
259
|
-
|
|
260
|
-
"more_actions_needed_by_instruction": false, // WRONG: should be true
|
|
261
|
-
"log": "The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.",
|
|
262
|
-
}
|
|
116
|
+
}
|
|
263
117
|
`;
|
|
264
|
-
async function systemPromptToTaskPlanning({ actionSpace, vlMode }) {
|
|
265
|
-
if (vlMode) return systemTemplateOfVLPlanning({
|
|
266
|
-
actionSpace,
|
|
267
|
-
vlMode
|
|
268
|
-
});
|
|
269
|
-
return `${systemTemplateOfLLM({
|
|
270
|
-
actionSpace
|
|
271
|
-
})}\n\n${outputTemplate}`;
|
|
272
118
|
}
|
|
273
|
-
|
|
274
|
-
type: 'json_schema',
|
|
275
|
-
json_schema: {
|
|
276
|
-
name: 'action_items',
|
|
277
|
-
strict: false,
|
|
278
|
-
schema: {
|
|
279
|
-
type: 'object',
|
|
280
|
-
strict: false,
|
|
281
|
-
properties: {
|
|
282
|
-
actions: {
|
|
283
|
-
type: 'array',
|
|
284
|
-
items: {
|
|
285
|
-
type: 'object',
|
|
286
|
-
strict: false,
|
|
287
|
-
properties: {
|
|
288
|
-
thought: {
|
|
289
|
-
type: 'string',
|
|
290
|
-
description: 'Reasons for generating this task, and why this task is feasible on this page'
|
|
291
|
-
},
|
|
292
|
-
type: {
|
|
293
|
-
type: 'string',
|
|
294
|
-
description: 'Type of action'
|
|
295
|
-
},
|
|
296
|
-
param: {
|
|
297
|
-
anyOf: [
|
|
298
|
-
{
|
|
299
|
-
type: 'null'
|
|
300
|
-
},
|
|
301
|
-
{
|
|
302
|
-
type: 'object',
|
|
303
|
-
additionalProperties: true
|
|
304
|
-
}
|
|
305
|
-
],
|
|
306
|
-
description: 'Parameter of the action'
|
|
307
|
-
},
|
|
308
|
-
locate: {
|
|
309
|
-
type: [
|
|
310
|
-
'object',
|
|
311
|
-
'null'
|
|
312
|
-
],
|
|
313
|
-
properties: {
|
|
314
|
-
id: {
|
|
315
|
-
type: 'string'
|
|
316
|
-
},
|
|
317
|
-
prompt: {
|
|
318
|
-
type: 'string'
|
|
319
|
-
}
|
|
320
|
-
},
|
|
321
|
-
required: [
|
|
322
|
-
'id',
|
|
323
|
-
'prompt'
|
|
324
|
-
],
|
|
325
|
-
additionalProperties: false,
|
|
326
|
-
description: 'Location information for the target element'
|
|
327
|
-
}
|
|
328
|
-
},
|
|
329
|
-
required: [
|
|
330
|
-
'thought',
|
|
331
|
-
'type',
|
|
332
|
-
'param',
|
|
333
|
-
'locate'
|
|
334
|
-
],
|
|
335
|
-
additionalProperties: false
|
|
336
|
-
},
|
|
337
|
-
description: 'List of actions to be performed'
|
|
338
|
-
},
|
|
339
|
-
more_actions_needed_by_instruction: {
|
|
340
|
-
type: 'boolean',
|
|
341
|
-
description: 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.'
|
|
342
|
-
},
|
|
343
|
-
log: {
|
|
344
|
-
type: 'string',
|
|
345
|
-
description: 'Log what these planned actions do. Do not include further actions that have not been planned.'
|
|
346
|
-
},
|
|
347
|
-
error: {
|
|
348
|
-
type: [
|
|
349
|
-
'string',
|
|
350
|
-
'null'
|
|
351
|
-
],
|
|
352
|
-
description: 'Error messages about unexpected situations'
|
|
353
|
-
}
|
|
354
|
-
},
|
|
355
|
-
required: [
|
|
356
|
-
'actions',
|
|
357
|
-
'more_actions_needed_by_instruction',
|
|
358
|
-
'log',
|
|
359
|
-
'error'
|
|
360
|
-
],
|
|
361
|
-
additionalProperties: false
|
|
362
|
-
}
|
|
363
|
-
}
|
|
364
|
-
};
|
|
365
|
-
export { descriptionForAction, planSchema, systemPromptToTaskPlanning };
|
|
119
|
+
export { descriptionForAction, systemPromptToTaskPlanning };
|
|
366
120
|
|
|
367
121
|
//# sourceMappingURL=llm-planning.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["webpack://@midscene/core/./src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { ZodObject, z } from 'zod';\nimport { ifMidsceneLocatorField } from '../common';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst vlCurrentLog = `\"log\": string, // Log your thoughts and what the next one action (ONLY ONE!) you can do according to the screenshot and the instruction. The log should contain the following information: \"The user wants to do ... . According to the instruction and the previous logs, next step is to .... Now i am going to compose an action '{ action-type }' to do ....\". If no action should be done, log the reason. Use the same language as the user's instruction.`;\nconst llmCurrentLog = `\"log\": string, // Log what the next actions you can do according to the screenshot and the instruction. The typical log looks like \"Now i want to use action '{ action-type }' to do ..\". If no action should be done, log the reason. \". Use the same language as the user's instruction.`;\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\nconst vlLocateParam = () =>\n '{bbox: [number, number, number, number], prompt: string }';\nconst llmLocateParam = () => '{\"id\": string, \"prompt\": string}';\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n // Try to extract parameter information from the zod schema\n // For zod object schemas, extract type information and descriptions\n const shape = (action.paramSchema as ZodObject<any>).shape;\n\n if (!shape) {\n console.warn(\n `action.paramSchema is not a ZodObject, may lead to unexpected behavior, action name: ${action.name}`,\n );\n }\n\n const paramLines: string[] = [];\n\n // Helper function to get type name from zod schema\n const getTypeName = (field: any): string => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n const actualField = unwrapField(field);\n const fieldTypeName = actualField._def?.typeName;\n\n if (fieldTypeName === 'ZodString') return 'string';\n if (fieldTypeName === 'ZodNumber') return 'number';\n if (fieldTypeName === 'ZodBoolean') return 'boolean';\n if (fieldTypeName === 'ZodArray') return 'array';\n if (fieldTypeName === 'ZodObject') {\n // Check if this is a passthrough object (like MidsceneLocation)\n if (ifMidsceneLocatorField(actualField)) {\n return locatorSchemaTypeDescription;\n }\n return 'object';\n }\n if (fieldTypeName === 'ZodEnum') {\n const values =\n (actualField._def?.values as unknown[] | undefined)\n ?.map((option: unknown) => String(`'${option}'`))\n .join(', ') ?? 'enum';\n\n return `enum(${values})`;\n }\n // Handle ZodUnion by taking the first option (for display purposes)\n if (fieldTypeName === 'ZodUnion') {\n const options = actualField._def?.options as any[] | undefined;\n if (options && options.length > 0) {\n // For unions, list all types\n const types = options.map((opt: any) => getTypeName(opt));\n return types.join(' | ');\n }\n return 'union';\n }\n\n console.warn(\n 'failed to parse Zod type. This may lead to wrong params from the LLM.\\n',\n actualField._def,\n );\n return actualField.toString();\n };\n\n // Helper function to get description from zod schema\n const getDescription = (field: z.ZodTypeAny): string | null => {\n // Recursively unwrap optional, nullable, and other wrapper types to get the actual inner type\n const unwrapField = (f: any): any => {\n if (!f._def) return f;\n\n const typeName = f._def.typeName;\n\n // Handle wrapper types that have innerType\n if (\n typeName === 'ZodOptional' ||\n typeName === 'ZodNullable' ||\n typeName === 'ZodDefault'\n ) {\n return unwrapField(f._def.innerType);\n }\n\n // Handle ZodEffects (transformations, refinements, preprocessors)\n if (typeName === 'ZodEffects') {\n // For ZodEffects, unwrap the schema field which contains the underlying type\n if (f._def.schema) {\n return unwrapField(f._def.schema);\n }\n }\n\n return f;\n };\n\n // Check for direct description on the original field (wrapper may have description)\n if ('description' in field) {\n return field.description || null;\n }\n\n const actualField = unwrapField(field);\n\n // Check for description on the unwrapped field\n if ('description' in actualField) {\n return actualField.description || null;\n }\n\n // Check for MidsceneLocation fields and add description\n if (actualField._def?.typeName === 'ZodObject') {\n if ('midscene_location_field_flag' in actualField._def.shape()) {\n return 'Location information for the target element';\n }\n }\n\n return null;\n };\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as any).isOptional === 'function' &&\n (field as any).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name\n const typeName = getTypeName(field);\n\n // Get description\n const description = getDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n if (paramLines.length > 0) {\n fields.push('- param:');\n for (const paramLine of paramLines) {\n fields.push(` - ${paramLine}`);\n }\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nconst systemTemplateOfVLPlanning = ({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n}) => {\n const actionNameList = actionSpace.map((action) => action.name).join(', ');\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(action, vlLocateParam());\n });\n const actionList = actionDescriptionList.join('\\n');\n\n return `\nTarget: User will give you a latest screenshot, an instruction and some previous logs indicating what have been done. Please tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\nRestriction:\n- Don't give extra actions or plans beyond the instruction. ONLY plan for what the instruction requires. For example, don't try to submit the form if the instruction is only to fill something.\n- Always give ONLY ONE action in \\`log\\` field (or null if no action should be done), instead of multiple actions. Supported actions are ${actionNameList}.\n- Don't repeat actions in the previous logs.\n- Bbox is the bounding box of the element to be located. It's an array of 4 numbers, representing ${bboxDescription(vlMode)}.\n\nSupporting actions:\n${actionList}\n\nField description:\n* The \\`prompt\\` field inside the \\`locate\\` field is a short description that could be used to locate the element.\n\nReturn in JSON format:\n{\n ${vlCurrentLog}\n ${commonOutputFields}\n \"action\": \n {\n // one of the supporting actions\n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, when the instruction is \"click 'Confirm' button, and click 'Yes' in popup\" and the previous log shows \"The 'Confirm' button has been clicked\", by viewing the screenshot and previous logs, you should consider: We have already clicked the 'Confirm' button, so next we should find and click 'Yes' in popup.\n\nthis and output the JSON:\n\n{\n \"log\": \"The user wants to do click 'Confirm' button, and click 'Yes' in popup. According to the instruction and the previous logs, next step is to tap the 'Yes' button in the popup. Now i am going to compose an action 'Tap' to click 'Yes' in popup.\",\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": {\n \"bbox\": [100, 100, 200, 200],\n \"prompt\": \"The 'Yes' button in popup\"\n }\n }\n },\n \"more_actions_needed_by_instruction\": false,\n}\n`;\n};\n\nconst systemTemplateOfLLM = ({\n actionSpace,\n}: { actionSpace: DeviceAction<any>[] }) => {\n const actionNameList = actionSpace.map((action) => action.name).join(' / ');\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(action, llmLocateParam());\n });\n const actionList = actionDescriptionList.join('\\n');\n\n return `\n## Role\n\nYou are a versatile professional in software UI automation. Your outstanding contributions will impact the user experience of billions of users.\n\n## Objective\n\n- Decompose the instruction user asked into a series of actions\n- Locate the target element if possible\n- If the instruction cannot be accomplished, give a further plan.\n\n## Workflow\n\n1. Receive the screenshot, element description of screenshot(if any), user's instruction and previous logs.\n2. Decompose the user's task into a sequence of feasible actions, and place it in the \\`actions\\` field. There are different types of actions (${actionNameList}). The \"About the action\" section below will give you more details.\n3. Consider whether the user's instruction will be accomplished after the actions you composed.\n- If the instruction is accomplished, set \\`more_actions_needed_by_instruction\\` to false.\n- If more actions are needed, set \\`more_actions_needed_by_instruction\\` to true. Get ready to hand over to the next talent people like you. Carefully log what have been done in the \\`log\\` field, he or she will continue the task according to your logs.\n4. If the task is not feasible on this page, set \\`error\\` field to the reason.\n\n## Constraints\n\n- All the actions you composed MUST be feasible, which means all the action fields can be filled with the page context information you get. If not, don't plan this action.\n- Trust the \"What have been done\" field about the task (if any), don't repeat actions in it.\n- Respond only with valid JSON. Do not write an introduction or summary or markdown prefix like \\`\\`\\`json\\`\\`\\`.\n- If the screenshot and the instruction are totally irrelevant, set reason in the \\`error\\` field.\n\n## About the \\`actions\\` field\n\nThe \\`locate\\` param is commonly used in the \\`param\\` field of the action, means to locate the target element to perform the action, it conforms to the following scheme:\n\ntype LocateParam = {\n \"id\": string, // the id of the element found. It should either be the id marked with a rectangle in the screenshot or the id described in the description.\n \"prompt\"?: string // the description of the element to find. It can only be omitted when locate is null.\n} | null // If it's not on the page, the LocateParam should be null\n\n## Supported actions\n\nEach action has a \\`type\\` and corresponding \\`param\\`. To be detailed:\n${actionList}\n\n`.trim();\n};\n\nconst outputTemplate = `\n## Output JSON Format:\n\nThe JSON format is as follows:\n\n{\n \"actions\": [\n // ... some actions\n ],\n ${llmCurrentLog}\n ${commonOutputFields}\n}\n\n## Examples\n\n### Example: Decompose a task\n\nWhen you received the following information:\n\n* Instruction: 'Click the language switch button, wait 1s, click \"English\"'\n* Logs: null\n* Page Context (screenshot and description) shows: There is a language switch button, and the \"English\" option is not shown in the screenshot now.\n\nBy viewing the page screenshot and description, you should consider this and output the JSON:\n\n* The user intent is: tap the switch button, sleep, and tap the 'English' option\n* The language switch button is shown in the screenshot, and can be located by the page description or the id marked with a rectangle. So we can plan a Tap action to do this.\n* Plan a Sleep action to wait for 1 second to ensure the language options are displayed.\n* The \"English\" option button is not shown in the screenshot now, it means it may only show after the previous actions are finished. So don't plan any action to do this.\n* Compose the log: The user wants to do click the language switch button, wait 1s, click \"English\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.\n* The task cannot be accomplished (because the last tapping action is not finished yet), so the \\`more_actions_needed_by_instruction\\` field is true. The \\`error\\` field is null.\n\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\", \n \"param\": {\n \"locate\": { id: \"c81c4e9a33\", prompt: \"The language switch button\" }\n }\n },\n {\n \"thought\": \"Wait for 1 second to ensure the language options are displayed.\",\n \"type\": \"Sleep\",\n \"param\": { \"timeMs\": 1000 },\n }\n ],\n \"error\": null,\n \"more_actions_needed_by_instruction\": true,\n \"log\": \"The user wants to do click the language switch button, wait 1s, click \\\"English\\\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.\",\n}\n\n### Example: What NOT to do\nWrong output:\n{\n \"actions\":[\n {\n \"thought\": \"Click the language switch button to open the language options.\",\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \"id\": \"c81c4e9a33\" } // WRONG: prompt is missing, this is not a valid LocateParam\n }\n },\n {\n \"thought\": \"Click the English option\",\n \"type\": \"Tap\", \n \"param\": {\n \"locate\": null // WRONG: if the element is not on the page, you should not plan this action\n }\n }\n ],\n \"more_actions_needed_by_instruction\": false, // WRONG: should be true\n \"log\": \"The user wants to do click the language switch button, wait 1s, click \\\"English\\\". According to the instruction and the previous logs, next step is to tap the language switch button to open the language options. Now i am going to compose an action 'Tap' to click the language switch button.\",\n}\n`;\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n}) {\n if (vlMode) {\n return systemTemplateOfVLPlanning({ actionSpace, vlMode });\n }\n\n return `${systemTemplateOfLLM({ actionSpace })}\\n\\n${outputTemplate}`;\n}\n\nexport const planSchema: ResponseFormatJSONSchema = {\n type: 'json_schema',\n json_schema: {\n name: 'action_items',\n strict: false,\n schema: {\n type: 'object',\n strict: false,\n properties: {\n actions: {\n type: 'array',\n items: {\n type: 'object',\n strict: false,\n properties: {\n thought: {\n type: 'string',\n description:\n 'Reasons for generating this task, and why this task is feasible on this page',\n },\n type: {\n type: 'string',\n description: 'Type of action',\n },\n param: {\n anyOf: [\n { type: 'null' },\n {\n type: 'object',\n additionalProperties: true,\n },\n ],\n description: 'Parameter of the action',\n },\n locate: {\n type: ['object', 'null'],\n properties: {\n id: { type: 'string' },\n prompt: { type: 'string' },\n },\n required: ['id', 'prompt'],\n additionalProperties: false,\n description: 'Location information for the target element',\n },\n },\n required: ['thought', 'type', 'param', 'locate'],\n additionalProperties: false,\n },\n description: 'List of actions to be performed',\n },\n more_actions_needed_by_instruction: {\n type: 'boolean',\n description:\n 'If all the actions described in the instruction have been covered by this action and logs, set this field to false.',\n },\n log: {\n type: 'string',\n description:\n 'Log what these planned actions do. Do not include further actions that have not been planned.',\n },\n error: {\n type: ['string', 'null'],\n description: 'Error messages about unexpected situations',\n },\n },\n required: [\n 'actions',\n 'more_actions_needed_by_instruction',\n 'log',\n 'error',\n ],\n additionalProperties: false,\n },\n },\n};\n"],"names":["vlCurrentLog","llmCurrentLog","commonOutputFields","vlLocateParam","llmLocateParam","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","shape","console","paramLines","getTypeName","field","_actualField__def","unwrapField","f","typeName","actualField","fieldTypeName","ifMidsceneLocatorField","_actualField__def_values","values","option","String","_actualField__def2","options","types","opt","getDescription","key","Object","isOptional","keyWithOptional","description","paramLine","systemTemplateOfVLPlanning","actionSpace","vlMode","actionNameList","actionDescriptionList","actionList","bboxDescription","systemTemplateOfLLM","outputTemplate","systemPromptToTaskPlanning","planSchema"],"mappings":";;AASA,MAAMA,eAAe;AACrB,MAAMC,gBAAgB;AAEtB,MAAMC,qBAAqB,CAAC;+NACmM,CAAC;AAChO,MAAMC,gBAAgB,IACpB;AACF,MAAMC,iBAAiB,IAAM;AAEtB,MAAMC,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QAGtB,MAAMI,QAASJ,OAAO,WAAW,CAAoB,KAAK;QAE1D,IAAI,CAACI,OACHC,QAAQ,IAAI,CACV,CAAC,qFAAqF,EAAEL,OAAO,IAAI,EAAE;QAIzG,MAAMM,aAAuB,EAAE;QAG/B,MAAMC,cAAc,CAACC;gBA4BGC;YA1BtB,MAAMC,cAAc,CAACC;gBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;gBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;gBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;gBAIrC,IAAIC,AAAa,iBAAbA,UAEF;oBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;gBAClC;gBAGF,OAAOA;YACT;YAEA,MAAME,cAAcH,YAAYF;YAChC,MAAMM,gBAAgB,QAAAL,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ;YAEhD,IAAIK,AAAkB,gBAAlBA,eAA+B,OAAO;YAC1C,IAAIA,AAAkB,gBAAlBA,eAA+B,OAAO;YAC1C,IAAIA,AAAkB,iBAAlBA,eAAgC,OAAO;YAC3C,IAAIA,AAAkB,eAAlBA,eAA8B,OAAO;YACzC,IAAIA,AAAkB,gBAAlBA,eAA+B;gBAEjC,IAAIC,uBAAuBF,cACzB,OAAOZ;gBAET,OAAO;YACT;YACA,IAAIa,AAAkB,cAAlBA,eAA6B;oBAE5BE,0BAAAA;gBADH,MAAMC,SACJ,AAAC,SAAAD,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,QAAAA,CAAAA,2BAAAA,mBAAkB,MAAM,AAAD,IAAvBA,KAAAA,IAAAA,yBACG,GAAG,CAAC,CAACE,SAAoBC,OAAO,CAAC,CAAC,EAAED,OAAO,CAAC,CAAC,GAC9C,IAAI,CAAC,KAAI,KAAK;gBAEnB,OAAO,CAAC,KAAK,EAAED,OAAO,CAAC,CAAC;YAC1B;YAEA,IAAIH,AAAkB,eAAlBA,eAA8B;oBAChBM;gBAAhB,MAAMC,UAAU,QAAAD,CAAAA,qBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,mBAAkB,OAAO;gBACzC,IAAIC,WAAWA,QAAQ,MAAM,GAAG,GAAG;oBAEjC,MAAMC,QAAQD,QAAQ,GAAG,CAAC,CAACE,MAAahB,YAAYgB;oBACpD,OAAOD,MAAM,IAAI,CAAC;gBACpB;gBACA,OAAO;YACT;YAEAjB,QAAQ,IAAI,CACV,2EACAQ,YAAY,IAAI;YAElB,OAAOA,YAAY,QAAQ;QAC7B;QAGA,MAAMW,iBAAiB,CAAChB;gBAwClBC;YAtCJ,MAAMC,cAAc,CAACC;gBACnB,IAAI,CAACA,EAAE,IAAI,EAAE,OAAOA;gBAEpB,MAAMC,WAAWD,EAAE,IAAI,CAAC,QAAQ;gBAGhC,IACEC,AAAa,kBAAbA,YACAA,AAAa,kBAAbA,YACAA,AAAa,iBAAbA,UAEA,OAAOF,YAAYC,EAAE,IAAI,CAAC,SAAS;gBAIrC,IAAIC,AAAa,iBAAbA,UAEF;oBAAA,IAAID,EAAE,IAAI,CAAC,MAAM,EACf,OAAOD,YAAYC,EAAE,IAAI,CAAC,MAAM;gBAClC;gBAGF,OAAOA;YACT;YAGA,IAAI,iBAAiBH,OACnB,OAAOA,MAAM,WAAW,IAAI;YAG9B,MAAMK,cAAcH,YAAYF;YAGhC,IAAI,iBAAiBK,aACnB,OAAOA,YAAY,WAAW,IAAI;YAIpC,IAAIJ,AAAAA,SAAAA,CAAAA,oBAAAA,YAAY,IAAI,AAAD,IAAfA,KAAAA,IAAAA,kBAAkB,QAAQ,AAAD,MAAM,aACjC;gBAAA,IAAI,kCAAkCI,YAAY,IAAI,CAAC,KAAK,IAC1D,OAAO;YACT;YAGF,OAAO;QACT;QAEA,KAAK,MAAM,CAACY,KAAKjB,MAAM,IAAIkB,OAAO,OAAO,CAACtB,OACxC,IAAII,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;YAEtC,MAAMmB,aACJ,AAAqC,cAArC,OAAQnB,MAAc,UAAU,IAC/BA,MAAc,UAAU;YAC3B,MAAMoB,kBAAkBD,aAAa,GAAGF,IAAI,CAAC,CAAC,GAAGA;YAGjD,MAAMb,WAAWL,YAAYC;YAG7B,MAAMqB,cAAcL,eAAehB;YAGnC,IAAIsB,YAAY,GAAGF,gBAAgB,EAAE,EAAEhB,UAAU;YACjD,IAAIiB,aACFC,aAAa,CAAC,IAAI,EAAED,aAAa;YAGnCvB,WAAW,IAAI,CAACwB;QAClB;QAGF,IAAIxB,WAAW,MAAM,GAAG,GAAG;YACzBH,OAAO,IAAI,CAAC;YACZ,KAAK,MAAM2B,aAAaxB,WACtBH,OAAO,IAAI,CAAC,CAAC,IAAI,EAAE2B,WAAW;QAElC;IACF;IAEA,OAAO,CAAC,EAAE,EAAE9B,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEA,MAAM6B,6BAA6B,CAAC,EAClCC,WAAW,EACXC,MAAM,EAIP;IACC,MAAMC,iBAAiBF,YAAY,GAAG,CAAC,CAAChC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMmC,wBAAwBH,YAAY,GAAG,CAAC,CAAChC,SACtCD,qBAAqBC,QAAQH;IAEtC,MAAMuC,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;yIAK+H,EAAED,eAAe;;kGAExD,EAAEG,gBAAgBJ,QAAQ;;;AAG5H,EAAEG,WAAW;;;;;;;EAOX,EAAE1C,aAAa;EACf,EAAEE,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;AA0BvB,CAAC;AACD;AAEA,MAAM0C,sBAAsB,CAAC,EAC3BN,WAAW,EAC0B;IACrC,MAAME,iBAAiBF,YAAY,GAAG,CAAC,CAAChC,SAAWA,OAAO,IAAI,EAAE,IAAI,CAAC;IACrE,MAAMmC,wBAAwBH,YAAY,GAAG,CAAC,CAAChC,SACtCD,qBAAqBC,QAAQF;IAEtC,MAAMsC,aAAaD,sBAAsB,IAAI,CAAC;IAE9C,OAAO,CAAC;;;;;;;;;;;;;;+IAcqI,EAAED,eAAe;;;;;;;;;;;;;;;;;;;;;;;;;AAyBhK,EAAEE,WAAW;;AAEb,CAAC,CAAC,IAAI;AACN;AAEA,MAAMG,iBAAiB,CAAC;;;;;;;;;EAStB,EAAE5C,cAAc;EAChB,EAAEC,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;AAgEvB,CAAC;AAEM,eAAe4C,2BAA2B,EAC/CR,WAAW,EACXC,MAAM,EAIP;IACC,IAAIA,QACF,OAAOF,2BAA2B;QAAEC;QAAaC;IAAO;IAG1D,OAAO,GAAGK,oBAAoB;QAAEN;IAAY,GAAG,IAAI,EAAEO,gBAAgB;AACvE;AAEO,MAAME,aAAuC;IAClD,MAAM;IACN,aAAa;QACX,MAAM;QACN,QAAQ;QACR,QAAQ;YACN,MAAM;YACN,QAAQ;YACR,YAAY;gBACV,SAAS;oBACP,MAAM;oBACN,OAAO;wBACL,MAAM;wBACN,QAAQ;wBACR,YAAY;4BACV,SAAS;gCACP,MAAM;gCACN,aACE;4BACJ;4BACA,MAAM;gCACJ,MAAM;gCACN,aAAa;4BACf;4BACA,OAAO;gCACL,OAAO;oCACL;wCAAE,MAAM;oCAAO;oCACf;wCACE,MAAM;wCACN,sBAAsB;oCACxB;iCACD;gCACD,aAAa;4BACf;4BACA,QAAQ;gCACN,MAAM;oCAAC;oCAAU;iCAAO;gCACxB,YAAY;oCACV,IAAI;wCAAE,MAAM;oCAAS;oCACrB,QAAQ;wCAAE,MAAM;oCAAS;gCAC3B;gCACA,UAAU;oCAAC;oCAAM;iCAAS;gCAC1B,sBAAsB;gCACtB,aAAa;4BACf;wBACF;wBACA,UAAU;4BAAC;4BAAW;4BAAQ;4BAAS;yBAAS;wBAChD,sBAAsB;oBACxB;oBACA,aAAa;gBACf;gBACA,oCAAoC;oBAClC,MAAM;oBACN,aACE;gBACJ;gBACA,KAAK;oBACH,MAAM;oBACN,aACE;gBACJ;gBACA,OAAO;oBACL,MAAM;wBAAC;wBAAU;qBAAO;oBACxB,aAAa;gBACf;YACF;YACA,UAAU;gBACR;gBACA;gBACA;gBACA;aACD;YACD,sBAAsB;QACxB;IACF;AACF"}
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-planning.mjs","sources":["../../../../src/ai-model/prompt/llm-planning.ts"],"sourcesContent":["import type { DeviceAction } from '@/types';\nimport type { TVlModeTypes } from '@midscene/shared/env';\nimport {\n getZodDescription,\n getZodTypeName,\n} from '@midscene/shared/zod-schema-utils';\nimport type { ResponseFormatJSONSchema } from 'openai/resources/index';\nimport type { z } from 'zod';\nimport { bboxDescription } from './common';\n\n// Note: put the log field first to trigger the CoT\n\nconst commonOutputFields = `\"error\"?: string, // Error messages about unexpected situations, if any. Only think it is an error when the situation is not foreseeable according to the instruction. Use the same language as the user's instruction.\n \"more_actions_needed_by_instruction\": boolean, // Consider if there is still more action(s) to do after the action in \"Log\" is done, according to the instruction. If so, set this field to true. Otherwise, set it to false.`;\n\nconst vlLocateParam = (vlMode: TVlModeTypes | undefined) => {\n if (vlMode) {\n return `{bbox: [number, number, number, number], prompt: string } // ${bboxDescription(vlMode)}`;\n }\n return '{ prompt: string /* description of the target element */ }';\n};\n\nexport const descriptionForAction = (\n action: DeviceAction<any>,\n locatorSchemaTypeDescription: string,\n) => {\n const tab = ' ';\n const fields: string[] = [];\n\n // Add the action type field\n fields.push(`- type: \"${action.name}\"`);\n\n // Handle paramSchema if it exists\n if (action.paramSchema) {\n const paramLines: string[] = [];\n\n // Check if paramSchema is a ZodObject with shape\n const schema = action.paramSchema as {\n _def?: { typeName?: string };\n shape?: Record<string, unknown>;\n };\n const isZodObject = schema._def?.typeName === 'ZodObject';\n\n if (isZodObject && schema.shape) {\n // Original logic for ZodObject schemas\n const shape = schema.shape;\n\n for (const [key, field] of Object.entries(shape)) {\n if (field && typeof field === 'object') {\n // Check if field is optional\n const isOptional =\n typeof (field as { isOptional?: () => boolean }).isOptional ===\n 'function' &&\n (field as { isOptional: () => boolean }).isOptional();\n const keyWithOptional = isOptional ? `${key}?` : key;\n\n // Get the type name using extracted helper\n const typeName = getZodTypeName(field, locatorSchemaTypeDescription);\n\n // Get description using extracted helper\n const description = getZodDescription(field as z.ZodTypeAny);\n\n // Build param line for this field\n let paramLine = `${keyWithOptional}: ${typeName}`;\n if (description) {\n paramLine += ` // ${description}`;\n }\n\n paramLines.push(paramLine);\n }\n }\n\n // Add the param section to fields if there are paramLines\n if (paramLines.length > 0) {\n fields.push('- param:');\n paramLines.forEach((line) => {\n fields.push(` - ${line}`);\n });\n }\n } else {\n // Handle non-object schemas (string, number, etc.)\n const typeName = getZodTypeName(schema);\n const description = getZodDescription(schema as z.ZodTypeAny);\n\n // For simple types, indicate that param should be the direct value, not an object\n let paramDescription = `- param: ${typeName}`;\n if (description) {\n paramDescription += ` // ${description}`;\n }\n paramDescription += ' (pass the value directly, not as an object)';\n\n fields.push(paramDescription);\n }\n }\n\n return `- ${action.name}, ${action.description || 'No description provided'}\n${tab}${fields.join(`\\n${tab}`)}\n`.trim();\n};\n\nexport async function systemPromptToTaskPlanning({\n actionSpace,\n vlMode,\n includeBbox,\n}: {\n actionSpace: DeviceAction<any>[];\n vlMode: TVlModeTypes | undefined;\n includeBbox: boolean;\n}) {\n // Validate parameters: if includeBbox is true, vlMode must be defined\n if (includeBbox && !vlMode) {\n throw new Error(\n 'vlMode cannot be undefined when includeBbox is true. A valid vlMode is required for bbox-based location.',\n );\n }\n\n const actionDescriptionList = actionSpace.map((action) => {\n return descriptionForAction(\n action,\n vlLocateParam(includeBbox ? vlMode : undefined),\n );\n });\n const actionList = actionDescriptionList.join('\\n');\n\n const logFieldInstruction = `\n## About the \\`log\\` field (preamble message)\n\nThe \\`log\\` field is a brief preamble message to the user explaining what you’re about to do. It should follow these principles and examples:\n\n- **Use the same language as the user's instruction**\n- **Keep it concise**: be no more than 1-2 sentences, focused on immediate, tangible next steps. (8–12 words or Chinese characters for quick updates).\n- **Build on prior context**: if this is not the first action to be done, use the preamble message to connect the dots with what’s been done so far and create a sense of momentum and clarity for the user to understand your next actions.\n- **Keep your tone light, friendly and curious**: add small touches of personality in preambles feel collaborative and engaging.\n\n**Examples:**\n- \"Click the login button\"\n- \"Scroll to find the 'Yes' button in popup\"\n- \"Previous actions failed to find the 'Yes' button, i will try again\"\n- \"Go back to find the login button\"\n`;\n\n return `\nTarget: User will give you an instruction, some screenshots and previous logs indicating what have been done. Your task is to plan the next one action according to current situation to accomplish the instruction.\n\nPlease tell what the next one action is (or null if no action should be done) to do the tasks the instruction requires. \n\n## Rules\n\n- Don't give extra actions or plans beyond the instruction. For example, don't try to submit the form if the instruction is only to fill something.\n- Give just the next ONE action you should do\n- Consider the current screenshot and give the action that is most likely to accomplish the instruction. For example, if the next step is to click a button but it's not visible in the screenshot, you should try to find it first instead of give a click action.\n- Make sure the previous actions are completed successfully before performing the next step\n- If there are some error messages reported by the previous actions, don't give up, try parse a new action to recover. If the error persists for more than 5 times, you should think this is an error and set the \"error\" field to the error message.\n- If there is nothing to do but waiting, set the \"sleep\" field to the positive waiting time in milliseconds and null for the \"action\" field.\n- Assertions are also important steps. When getting the assertion instruction, a solid conclusion is required. You should explicitly state your conclusion by calling the \"Print_Assert_Result\" action.\n\n## Supporting actions\n${actionList}\n\n${logFieldInstruction}\n\n## Return format\n\nReturn in JSON format:\n{\n \"log\": string, // a brief preamble to the user explaining what you’re about to do\n ${commonOutputFields}\n \"action\": \n {\n \"type\": string, // the type of the action\n \"param\"?: { // The parameter of the action, if any\n // k-v style parameter fields\n }, \n } | null,\n ,\n \"sleep\"?: number, // The sleep time after the action, in milliseconds.\n}\n\nFor example, if the instruction is to login and the form has already been filled, this is a valid return value:\n\n{\n \"log\": \"Click the login button\",\n \"more_actions_needed_by_instruction\": false,\n \"action\": {\n \"type\": \"Tap\",\n \"param\": {\n \"locate\": { \n \"prompt\": \"The login button\"${vlMode ? `, \"bbox\": [100, 200, 300, 400]` : ''}\n }\n }\n }\n`;\n}\n"],"names":["commonOutputFields","vlLocateParam","vlMode","bboxDescription","descriptionForAction","action","locatorSchemaTypeDescription","tab","fields","paramLines","schema","isZodObject","shape","key","field","Object","isOptional","keyWithOptional","typeName","getZodTypeName","description","getZodDescription","paramLine","line","paramDescription","systemPromptToTaskPlanning","actionSpace","includeBbox","Error","actionDescriptionList","undefined","actionList","logFieldInstruction"],"mappings":";;AAYA,MAAMA,qBAAqB,CAAC;+NACmM,CAAC;AAEhO,MAAMC,gBAAgB,CAACC;IACrB,IAAIA,QACF,OAAO,CAAC,6DAA6D,EAAEC,gBAAgBD,SAAS;IAElG,OAAO;AACT;AAEO,MAAME,uBAAuB,CAClCC,QACAC;IAEA,MAAMC,MAAM;IACZ,MAAMC,SAAmB,EAAE;IAG3BA,OAAO,IAAI,CAAC,CAAC,SAAS,EAAEH,OAAO,IAAI,CAAC,CAAC,CAAC;IAGtC,IAAIA,OAAO,WAAW,EAAE;QACtB,MAAMI,aAAuB,EAAE;QAG/B,MAAMC,SAASL,OAAO,WAAW;QAIjC,MAAMM,cAAcD,OAAO,IAAI,EAAE,aAAa;QAE9C,IAAIC,eAAeD,OAAO,KAAK,EAAE;YAE/B,MAAME,QAAQF,OAAO,KAAK;YAE1B,KAAK,MAAM,CAACG,KAAKC,MAAM,IAAIC,OAAO,OAAO,CAACH,OACxC,IAAIE,SAAS,AAAiB,YAAjB,OAAOA,OAAoB;gBAEtC,MAAME,aACJ,AACE,cADF,OAAQF,MAAyC,UAAU,IAE1DA,MAAwC,UAAU;gBACrD,MAAMG,kBAAkBD,aAAa,GAAGH,IAAI,CAAC,CAAC,GAAGA;gBAGjD,MAAMK,WAAWC,eAAeL,OAAOR;gBAGvC,MAAMc,cAAcC,kBAAkBP;gBAGtC,IAAIQ,YAAY,GAAGL,gBAAgB,EAAE,EAAEC,UAAU;gBACjD,IAAIE,aACFE,aAAa,CAAC,IAAI,EAAEF,aAAa;gBAGnCX,WAAW,IAAI,CAACa;YAClB;YAIF,IAAIb,WAAW,MAAM,GAAG,GAAG;gBACzBD,OAAO,IAAI,CAAC;gBACZC,WAAW,OAAO,CAAC,CAACc;oBAClBf,OAAO,IAAI,CAAC,CAAC,IAAI,EAAEe,MAAM;gBAC3B;YACF;QACF,OAAO;YAEL,MAAML,WAAWC,eAAeT;YAChC,MAAMU,cAAcC,kBAAkBX;YAGtC,IAAIc,mBAAmB,CAAC,SAAS,EAAEN,UAAU;YAC7C,IAAIE,aACFI,oBAAoB,CAAC,IAAI,EAAEJ,aAAa;YAE1CI,oBAAoB;YAEpBhB,OAAO,IAAI,CAACgB;QACd;IACF;IAEA,OAAO,CAAC,EAAE,EAAEnB,OAAO,IAAI,CAAC,EAAE,EAAEA,OAAO,WAAW,IAAI,0BAA0B;AAC9E,EAAEE,MAAMC,OAAO,IAAI,CAAC,CAAC,EAAE,EAAED,KAAK,EAAE;AAChC,CAAC,CAAC,IAAI;AACN;AAEO,eAAekB,2BAA2B,EAC/CC,WAAW,EACXxB,MAAM,EACNyB,WAAW,EAKZ;IAEC,IAAIA,eAAe,CAACzB,QAClB,MAAM,IAAI0B,MACR;IAIJ,MAAMC,wBAAwBH,YAAY,GAAG,CAAC,CAACrB,SACtCD,qBACLC,QACAJ,cAAc0B,cAAczB,SAAS4B;IAGzC,MAAMC,aAAaF,sBAAsB,IAAI,CAAC;IAE9C,MAAMG,sBAAsB,CAAC;;;;;;;;;;;;;;;AAe/B,CAAC;IAEC,OAAO,CAAC;;;;;;;;;;;;;;;;AAgBV,EAAED,WAAW;;AAEb,EAAEC,oBAAoB;;;;;;;EAOpB,EAAEhC,mBAAmB;;;;;;;;;;;;;;;;;;;;;oCAqBa,EAAEE,SAAS,mCAAmC,GAAG;;;;AAIrF,CAAC;AACD"}
|
|
@@ -1,16 +1,19 @@
|
|
|
1
1
|
import { bboxDescription } from "./common.mjs";
|
|
2
2
|
function systemPromptToLocateSection(vlMode) {
|
|
3
|
+
const bboxFormat = bboxDescription(vlMode);
|
|
3
4
|
return `
|
|
4
|
-
|
|
5
|
+
## Role:
|
|
6
|
+
You are an AI assistant that helps identify UI elements.
|
|
5
7
|
|
|
6
|
-
|
|
8
|
+
## Objective:
|
|
9
|
+
- Find a section containing the target element
|
|
10
|
+
- If the description mentions reference elements, also locate sections containing those references
|
|
7
11
|
|
|
8
|
-
|
|
12
|
+
## Output Format:
|
|
9
13
|
\`\`\`json
|
|
10
14
|
{
|
|
11
|
-
"bbox": [number, number, number, number],
|
|
15
|
+
"bbox": [number, number, number, number], // ${bboxFormat}
|
|
12
16
|
"references_bbox"?: [
|
|
13
|
-
[number, number, number, number],
|
|
14
17
|
[number, number, number, number],
|
|
15
18
|
...
|
|
16
19
|
],
|
|
@@ -18,11 +21,13 @@ return in this JSON format:
|
|
|
18
21
|
}
|
|
19
22
|
\`\`\`
|
|
20
23
|
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
+
Fields:
|
|
25
|
+
* \`bbox\` - Bounding box of the section containing the target element
|
|
26
|
+
* \`references_bbox\` - Optional array of bounding boxes for reference elements
|
|
27
|
+
* \`error\` - Optional error message if the section cannot be found
|
|
24
28
|
|
|
25
|
-
|
|
29
|
+
Example:
|
|
30
|
+
If the description is "delete button on the second row with title 'Peter'", return:
|
|
26
31
|
\`\`\`json
|
|
27
32
|
{
|
|
28
33
|
"bbox": [100, 100, 200, 200],
|
|
@@ -31,11 +36,7 @@ the return value should be like this:
|
|
|
31
36
|
\`\`\`
|
|
32
37
|
`;
|
|
33
38
|
}
|
|
34
|
-
const sectionLocatorInstruction = (
|
|
35
|
-
<targetDescription>
|
|
36
|
-
${sectionDescription}
|
|
37
|
-
</targetDescription>
|
|
38
|
-
`;
|
|
39
|
+
const sectionLocatorInstruction = (sectionDescription)=>`Find section containing: ${sectionDescription}`;
|
|
39
40
|
export { sectionLocatorInstruction, systemPromptToLocateSection };
|
|
40
41
|
|
|
41
42
|
//# sourceMappingURL=llm-section-locator.mjs.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"ai-model/prompt/llm-section-locator.mjs","sources":["
|
|
1
|
+
{"version":3,"file":"ai-model/prompt/llm-section-locator.mjs","sources":["../../../../src/ai-model/prompt/llm-section-locator.ts"],"sourcesContent":["import type { TVlModeTypes } from '@midscene/shared/env';\nimport { bboxDescription } from './common';\n\nexport function systemPromptToLocateSection(vlMode: TVlModeTypes | undefined) {\n const bboxFormat = bboxDescription(vlMode);\n return `\n## Role:\nYou are an AI assistant that helps identify UI elements.\n\n## Objective:\n- Find a section containing the target element\n- If the description mentions reference elements, also locate sections containing those references\n\n## Output Format:\n\\`\\`\\`json\n{\n \"bbox\": [number, number, number, number], // ${bboxFormat}\n \"references_bbox\"?: [\n [number, number, number, number],\n ...\n ],\n \"error\"?: string\n}\n\\`\\`\\`\n\nFields:\n* \\`bbox\\` - Bounding box of the section containing the target element\n* \\`references_bbox\\` - Optional array of bounding boxes for reference elements\n* \\`error\\` - Optional error message if the section cannot be found\n\nExample:\nIf the description is \"delete button on the second row with title 'Peter'\", return:\n\\`\\`\\`json\n{\n \"bbox\": [100, 100, 200, 200],\n \"references_bbox\": [[100, 100, 200, 200]]\n}\n\\`\\`\\`\n`;\n}\n\nexport const sectionLocatorInstruction = (sectionDescription: string) =>\n `Find section containing: ${sectionDescription}`;\n"],"names":["systemPromptToLocateSection","vlMode","bboxFormat","bboxDescription","sectionLocatorInstruction","sectionDescription"],"mappings":";AAGO,SAASA,4BAA4BC,MAAgC;IAC1E,MAAMC,aAAaC,gBAAgBF;IACnC,OAAO,CAAC;;;;;;;;;;;gDAWsC,EAAEC,WAAW;;;;;;;;;;;;;;;;;;;;;;AAsB7D,CAAC;AACD;AAEO,MAAME,4BAA4B,CAACC,qBACxC,CAAC,yBAAyB,EAAEA,oBAAoB"}
|