@midscene/shared 1.6.1-beta-20260331083547.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/constants/example-code.mjs +25 -21
- package/dist/es/extractor/dom-util.mjs +9 -5
- package/dist/es/mcp/tool-generator.mjs +130 -56
- package/dist/es/node/fs.mjs +1 -1
- package/dist/lib/constants/example-code.js +25 -21
- package/dist/lib/extractor/dom-util.js +9 -5
- package/dist/lib/mcp/tool-generator.js +129 -55
- package/dist/lib/node/fs.js +1 -1
- package/dist/types/constants/example-code.d.ts +1 -1
- package/dist/types/extractor/dom-util.d.ts +5 -4
- package/dist/types/mcp/types.d.ts +1 -0
- package/package.json +1 -1
- package/src/constants/example-code.ts +25 -21
- package/src/extractor/dom-util.ts +10 -5
- package/src/mcp/tool-generator.ts +217 -66
- package/src/mcp/types.ts +4 -0
|
@@ -80,6 +80,23 @@ test('ai shop', async ({
|
|
|
80
80
|
});
|
|
81
81
|
`;
|
|
82
82
|
const YAML_EXAMPLE_CODE = `
|
|
83
|
+
CRITICAL - YAML Indentation Rules:
|
|
84
|
+
For actions with additional parameters (aiScroll, aiInput, aiKeyboardPress), the parameters must be SIBLING keys at the SAME indentation level as the action key, NOT nested children indented further.
|
|
85
|
+
CORRECT (parameters align with the action key):
|
|
86
|
+
- aiScroll:
|
|
87
|
+
direction: 'down'
|
|
88
|
+
scrollType: 'singleAction'
|
|
89
|
+
distance: 500
|
|
90
|
+
locate: "main content area"
|
|
91
|
+
- aiInput: 'text value'
|
|
92
|
+
locate: 'input field description'
|
|
93
|
+
WRONG (parameters are indented further than the action key, DO NOT do this):
|
|
94
|
+
- aiScroll:
|
|
95
|
+
direction: 'down'
|
|
96
|
+
scrollType: 'singleAction'
|
|
97
|
+
- aiInput: 'text value'
|
|
98
|
+
locate: 'input field description'
|
|
99
|
+
|
|
83
100
|
1. Format:
|
|
84
101
|
|
|
85
102
|
web:
|
|
@@ -91,13 +108,18 @@ tasks:
|
|
|
91
108
|
- name: "descriptive task name"
|
|
92
109
|
flow:
|
|
93
110
|
- aiTap: "element description"
|
|
111
|
+
xpath: '/html/body/div[1]/button[1]'
|
|
94
112
|
- aiInput: 'text value'
|
|
95
113
|
locate: 'input field description'
|
|
114
|
+
xpath: '/html/body/div[1]/input[1]'
|
|
96
115
|
- aiScroll:
|
|
97
|
-
direction: down
|
|
98
|
-
scrollType:
|
|
116
|
+
direction: 'down'
|
|
117
|
+
scrollType: 'singleAction'
|
|
118
|
+
distance: 500
|
|
119
|
+
locate: "scrollable area description"
|
|
120
|
+
xpath: '/html/body/div[1]/main[1]'
|
|
99
121
|
- aiAssert: "expected state"
|
|
100
|
-
- sleep:
|
|
122
|
+
- sleep: 1000
|
|
101
123
|
|
|
102
124
|
2. Action Types:
|
|
103
125
|
- aiTap: for clicks (natural language targeting)
|
|
@@ -106,24 +128,6 @@ tasks:
|
|
|
106
128
|
- aiAssert: for validations
|
|
107
129
|
- sleep: for delays (milliseconds)
|
|
108
130
|
|
|
109
|
-
3. Best Practices:
|
|
110
|
-
- Group related actions into logical tasks
|
|
111
|
-
- Use natural language descriptions
|
|
112
|
-
- Add deepLocate: true for complex interactions
|
|
113
|
-
- Keep task names concise but descriptive
|
|
114
|
-
|
|
115
|
-
4. CRITICAL - YAML Indentation Rules:
|
|
116
|
-
- For actions with additional parameters (aiScroll, aiInput, aiKeyboardPress), the parameters must be SIBLING keys, NOT nested children
|
|
117
|
-
- Parameters like direction, scrollType, locate must align with the action key, not indented further
|
|
118
|
-
- CORRECT indentation example:
|
|
119
|
-
- aiScroll:
|
|
120
|
-
direction: down
|
|
121
|
-
scrollType: singleAction
|
|
122
|
-
- WRONG indentation (DO NOT do this):
|
|
123
|
-
- aiScroll:
|
|
124
|
-
direction: down
|
|
125
|
-
scrollType: singleAction
|
|
126
|
-
|
|
127
131
|
|
|
128
132
|
|
|
129
133
|
YAML type
|
|
@@ -81,12 +81,16 @@ function generateElementByPoint(center, description, edgeSize = 8) {
|
|
|
81
81
|
description: description || ''
|
|
82
82
|
};
|
|
83
83
|
}
|
|
84
|
-
function generateElementByRect(sourceRect, description,
|
|
84
|
+
function generateElementByRect(sourceRect, description, _edgeSize = 8) {
|
|
85
85
|
const centerX = sourceRect.left + Math.floor((sourceRect.width - 1) / 2);
|
|
86
86
|
const centerY = sourceRect.top + Math.floor((sourceRect.height - 1) / 2);
|
|
87
|
-
return
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
87
|
+
return {
|
|
88
|
+
rect: sourceRect,
|
|
89
|
+
center: [
|
|
90
|
+
centerX,
|
|
91
|
+
centerY
|
|
92
|
+
],
|
|
93
|
+
description: description || ''
|
|
94
|
+
};
|
|
91
95
|
}
|
|
92
96
|
export { generateElementByPoint, generateElementByRect, isAElement, isButtonElement, isContainerElement, isFormElement, isImgElement, isNotContainerElement, isSvgElement, isTextElement };
|
|
@@ -1,22 +1,21 @@
|
|
|
1
1
|
import { parseBase64 } from "@midscene/shared/img";
|
|
2
2
|
import { z } from "zod";
|
|
3
|
-
import { getZodDescription, getZodTypeName } from "../zod-schema-utils.mjs";
|
|
3
|
+
import { getZodDescription, getZodTypeName, isMidsceneLocatorField, unwrapZodField } from "../zod-schema-utils.mjs";
|
|
4
4
|
function getErrorMessage(error) {
|
|
5
5
|
return error instanceof Error ? error.message : String(error);
|
|
6
6
|
}
|
|
7
7
|
function describeActionForMCP(action) {
|
|
8
8
|
const actionDesc = action.description || `Execute ${action.name} action`;
|
|
9
9
|
if (!action.paramSchema) return `${action.name} action, ${actionDesc}`;
|
|
10
|
-
const
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
const
|
|
14
|
-
const description = getZodDescription(schema);
|
|
10
|
+
const shape = getZodObjectShape(action.paramSchema);
|
|
11
|
+
if (!shape) {
|
|
12
|
+
const typeName = getZodTypeName(action.paramSchema);
|
|
13
|
+
const description = getZodDescription(action.paramSchema);
|
|
15
14
|
const paramDesc = description ? `${typeName} - ${description}` : typeName;
|
|
16
15
|
return `${action.name} action, ${actionDesc}. Parameter: ${paramDesc}`;
|
|
17
16
|
}
|
|
18
17
|
const paramDescriptions = [];
|
|
19
|
-
for (const [key, field] of Object.entries(
|
|
18
|
+
for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
|
|
20
19
|
const isFieldOptional = 'function' == typeof field.isOptional && field.isOptional();
|
|
21
20
|
const typeName = getZodTypeName(field);
|
|
22
21
|
const description = getZodDescription(field);
|
|
@@ -30,9 +29,6 @@ function describeActionForMCP(action) {
|
|
|
30
29
|
function isZodOptional(value) {
|
|
31
30
|
return '_def' in value && value._def?.typeName === 'ZodOptional';
|
|
32
31
|
}
|
|
33
|
-
function isZodObject(value) {
|
|
34
|
-
return '_def' in value && value._def?.typeName === 'ZodObject' && 'shape' in value;
|
|
35
|
-
}
|
|
36
32
|
function unwrapOptional(value) {
|
|
37
33
|
if (isZodOptional(value)) return {
|
|
38
34
|
innerValue: value._def.innerType,
|
|
@@ -43,24 +39,31 @@ function unwrapOptional(value) {
|
|
|
43
39
|
isOptional: false
|
|
44
40
|
};
|
|
45
41
|
}
|
|
46
|
-
function
|
|
47
|
-
if (!
|
|
48
|
-
|
|
42
|
+
function getZodObjectShape(value) {
|
|
43
|
+
if (!value) return;
|
|
44
|
+
const actualValue = unwrapZodField(value);
|
|
45
|
+
if (actualValue._def?.typeName !== 'ZodObject') return;
|
|
46
|
+
if ('function' == typeof actualValue._def.shape) return actualValue._def.shape();
|
|
47
|
+
return actualValue.shape;
|
|
48
|
+
}
|
|
49
|
+
function isRecord(value) {
|
|
50
|
+
return 'object' == typeof value && null !== value && !Array.isArray(value);
|
|
49
51
|
}
|
|
50
|
-
function makePromptOptional(
|
|
52
|
+
function makePromptOptional(shape, wrapInOptional) {
|
|
51
53
|
const newShape = {
|
|
52
|
-
...
|
|
54
|
+
...shape
|
|
53
55
|
};
|
|
54
|
-
newShape.prompt =
|
|
56
|
+
newShape.prompt = shape.prompt.optional();
|
|
55
57
|
let newSchema = z.object(newShape).passthrough();
|
|
56
58
|
if (wrapInOptional) newSchema = newSchema.optional();
|
|
57
59
|
return newSchema;
|
|
58
60
|
}
|
|
59
61
|
function transformSchemaField(key, value) {
|
|
60
62
|
const { innerValue, isOptional } = unwrapOptional(value);
|
|
61
|
-
|
|
63
|
+
const shape = getZodObjectShape(innerValue);
|
|
64
|
+
if (shape && isMidsceneLocatorField(innerValue)) return [
|
|
62
65
|
key,
|
|
63
|
-
makePromptOptional(
|
|
66
|
+
makePromptOptional(shape, isOptional)
|
|
64
67
|
];
|
|
65
68
|
return [
|
|
66
69
|
key,
|
|
@@ -69,9 +72,66 @@ function transformSchemaField(key, value) {
|
|
|
69
72
|
}
|
|
70
73
|
function extractActionSchema(paramSchema) {
|
|
71
74
|
if (!paramSchema) return {};
|
|
72
|
-
const
|
|
73
|
-
if (!
|
|
74
|
-
return Object.fromEntries(Object.entries(
|
|
75
|
+
const shape = getZodObjectShape(paramSchema);
|
|
76
|
+
if (!shape) return paramSchema;
|
|
77
|
+
return Object.fromEntries(Object.entries(shape).map(([key, value])=>transformSchemaField(key, value)));
|
|
78
|
+
}
|
|
79
|
+
function getPromptText(prompt) {
|
|
80
|
+
if ('string' == typeof prompt) return prompt;
|
|
81
|
+
if (isRecord(prompt) && 'string' == typeof prompt.prompt) return prompt.prompt;
|
|
82
|
+
}
|
|
83
|
+
function moveLocateExtrasIntoPrompt(value, locateFieldKeys) {
|
|
84
|
+
const promptText = getPromptText(value.prompt);
|
|
85
|
+
if (!promptText) return value;
|
|
86
|
+
const normalizedPrompt = isRecord(value.prompt) ? {
|
|
87
|
+
...value.prompt
|
|
88
|
+
} : {
|
|
89
|
+
prompt: promptText
|
|
90
|
+
};
|
|
91
|
+
const normalizedLocate = {};
|
|
92
|
+
let movedExtraField = false;
|
|
93
|
+
for (const [key, fieldValue] of Object.entries(value))if ('prompt' !== key) {
|
|
94
|
+
if (locateFieldKeys.has(key)) {
|
|
95
|
+
normalizedLocate[key] = fieldValue;
|
|
96
|
+
continue;
|
|
97
|
+
}
|
|
98
|
+
movedExtraField = true;
|
|
99
|
+
if (!(key in normalizedPrompt)) normalizedPrompt[key] = fieldValue;
|
|
100
|
+
}
|
|
101
|
+
if (!movedExtraField) return value;
|
|
102
|
+
return {
|
|
103
|
+
...normalizedLocate,
|
|
104
|
+
prompt: normalizedPrompt
|
|
105
|
+
};
|
|
106
|
+
}
|
|
107
|
+
function normalizeLocateLikeArg(value, fieldSchema) {
|
|
108
|
+
if ('string' == typeof value) return {
|
|
109
|
+
prompt: value
|
|
110
|
+
};
|
|
111
|
+
if (!isRecord(value)) return value;
|
|
112
|
+
const shape = getZodObjectShape(fieldSchema);
|
|
113
|
+
if (!shape) return value;
|
|
114
|
+
return moveLocateExtrasIntoPrompt(value, new Set(Object.keys(shape)));
|
|
115
|
+
}
|
|
116
|
+
function normalizeActionArgs(args, paramSchema) {
|
|
117
|
+
if (!paramSchema) return args;
|
|
118
|
+
const shape = getZodObjectShape(paramSchema);
|
|
119
|
+
if (!shape) return args;
|
|
120
|
+
return Object.fromEntries(Object.entries(args).map(([key, value])=>{
|
|
121
|
+
const fieldSchema = shape[key];
|
|
122
|
+
if (!fieldSchema) return [
|
|
123
|
+
key,
|
|
124
|
+
value
|
|
125
|
+
];
|
|
126
|
+
if (isMidsceneLocatorField(fieldSchema)) return [
|
|
127
|
+
key,
|
|
128
|
+
normalizeLocateLikeArg(value, fieldSchema)
|
|
129
|
+
];
|
|
130
|
+
return [
|
|
131
|
+
key,
|
|
132
|
+
value
|
|
133
|
+
];
|
|
134
|
+
}));
|
|
75
135
|
}
|
|
76
136
|
function serializeArgsToDescription(args) {
|
|
77
137
|
try {
|
|
@@ -90,7 +150,7 @@ function serializeArgsToDescription(args) {
|
|
|
90
150
|
}
|
|
91
151
|
}
|
|
92
152
|
function buildActionInstruction(actionName, args) {
|
|
93
|
-
const locatePrompt = args.locate
|
|
153
|
+
const locatePrompt = isRecord(args.locate) ? getPromptText(args.locate.prompt) : void 0;
|
|
94
154
|
switch(actionName){
|
|
95
155
|
case 'Tap':
|
|
96
156
|
return locatePrompt ? `Tap on "${locatePrompt}"` : 'Tap';
|
|
@@ -118,44 +178,59 @@ function buildActionInstruction(actionName, args) {
|
|
|
118
178
|
}
|
|
119
179
|
}
|
|
120
180
|
}
|
|
121
|
-
async function
|
|
181
|
+
async function executeAction(agent, actionName, args) {
|
|
182
|
+
if (agent.callActionInActionSpace) return agent.callActionInActionSpace(actionName, args);
|
|
183
|
+
if (agent.aiAction) {
|
|
184
|
+
const instruction = buildActionInstruction(actionName, args);
|
|
185
|
+
return agent.aiAction(instruction);
|
|
186
|
+
}
|
|
187
|
+
throw new Error(`Action "${actionName}" is not supported by this agent`);
|
|
188
|
+
}
|
|
189
|
+
async function captureScreenshotResult(agent, actionName, actionResult) {
|
|
190
|
+
const content = [
|
|
191
|
+
{
|
|
192
|
+
type: 'text',
|
|
193
|
+
text: `Action "${actionName}" completed.`
|
|
194
|
+
}
|
|
195
|
+
];
|
|
196
|
+
if (void 0 !== actionResult) content.push({
|
|
197
|
+
type: 'text',
|
|
198
|
+
text: `Result: ${serializeActionResult(actionResult)}`
|
|
199
|
+
});
|
|
122
200
|
try {
|
|
123
201
|
const screenshot = await agent.page?.screenshotBase64();
|
|
124
202
|
if (!screenshot) return {
|
|
125
|
-
content
|
|
126
|
-
{
|
|
127
|
-
type: 'text',
|
|
128
|
-
text: `Action "${actionName}" completed.`
|
|
129
|
-
}
|
|
130
|
-
]
|
|
203
|
+
content
|
|
131
204
|
};
|
|
132
205
|
const { mimeType, body } = parseBase64(screenshot);
|
|
206
|
+
content.push({
|
|
207
|
+
type: 'image',
|
|
208
|
+
data: body,
|
|
209
|
+
mimeType
|
|
210
|
+
});
|
|
133
211
|
return {
|
|
134
|
-
content
|
|
135
|
-
{
|
|
136
|
-
type: 'text',
|
|
137
|
-
text: `Action "${actionName}" completed.`
|
|
138
|
-
},
|
|
139
|
-
{
|
|
140
|
-
type: 'image',
|
|
141
|
-
data: body,
|
|
142
|
-
mimeType
|
|
143
|
-
}
|
|
144
|
-
]
|
|
212
|
+
content
|
|
145
213
|
};
|
|
146
214
|
} catch (error) {
|
|
147
215
|
const errorMessage = getErrorMessage(error);
|
|
148
216
|
console.error('Error capturing screenshot:', errorMessage);
|
|
217
|
+
content[0] = {
|
|
218
|
+
type: 'text',
|
|
219
|
+
text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`
|
|
220
|
+
};
|
|
149
221
|
return {
|
|
150
|
-
content
|
|
151
|
-
{
|
|
152
|
-
type: 'text',
|
|
153
|
-
text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`
|
|
154
|
-
}
|
|
155
|
-
]
|
|
222
|
+
content
|
|
156
223
|
};
|
|
157
224
|
}
|
|
158
225
|
}
|
|
226
|
+
function serializeActionResult(actionResult) {
|
|
227
|
+
if ('string' == typeof actionResult) return actionResult;
|
|
228
|
+
try {
|
|
229
|
+
return JSON.stringify(actionResult);
|
|
230
|
+
} catch {
|
|
231
|
+
return String(actionResult);
|
|
232
|
+
}
|
|
233
|
+
}
|
|
159
234
|
function createErrorResult(message) {
|
|
160
235
|
return {
|
|
161
236
|
content: [
|
|
@@ -214,17 +289,16 @@ function generateToolsFromActionSpace(actionSpace, getAgent) {
|
|
|
214
289
|
handler: async (args)=>{
|
|
215
290
|
try {
|
|
216
291
|
const agent = await getAgent();
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
}
|
|
292
|
+
const normalizedArgs = normalizeActionArgs(args, action.paramSchema);
|
|
293
|
+
let actionResult;
|
|
294
|
+
try {
|
|
295
|
+
actionResult = await executeAction(agent, action.name, normalizedArgs);
|
|
296
|
+
} catch (error) {
|
|
297
|
+
const errorMessage = getErrorMessage(error);
|
|
298
|
+
console.error(`Error executing action "${action.name}":`, errorMessage);
|
|
299
|
+
return await captureFailureResult(agent, action.name, errorMessage);
|
|
226
300
|
}
|
|
227
|
-
return await captureScreenshotResult(agent, action.name);
|
|
301
|
+
return await captureScreenshotResult(agent, action.name, actionResult);
|
|
228
302
|
} catch (error) {
|
|
229
303
|
const errorMessage = getErrorMessage(error);
|
|
230
304
|
console.error(`Error in handler for "${action.name}":`, errorMessage);
|