@midscene/shared 1.6.1-beta-20260331083547.0 → 1.6.1

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -80,6 +80,23 @@ test('ai shop', async ({
80
80
  });
81
81
  `;
82
82
  const YAML_EXAMPLE_CODE = `
83
+ CRITICAL - YAML Indentation Rules:
84
+ For actions with additional parameters (aiScroll, aiInput, aiKeyboardPress), the parameters must be SIBLING keys at the SAME indentation level as the action key, NOT nested children indented further.
85
+ CORRECT (parameters align with the action key):
86
+ - aiScroll:
87
+ direction: 'down'
88
+ scrollType: 'singleAction'
89
+ distance: 500
90
+ locate: "main content area"
91
+ - aiInput: 'text value'
92
+ locate: 'input field description'
93
+ WRONG (parameters are indented further than the action key, DO NOT do this):
94
+ - aiScroll:
95
+ direction: 'down'
96
+ scrollType: 'singleAction'
97
+ - aiInput: 'text value'
98
+ locate: 'input field description'
99
+
83
100
  1. Format:
84
101
 
85
102
  web:
@@ -91,13 +108,18 @@ tasks:
91
108
  - name: "descriptive task name"
92
109
  flow:
93
110
  - aiTap: "element description"
111
+ xpath: '/html/body/div[1]/button[1]'
94
112
  - aiInput: 'text value'
95
113
  locate: 'input field description'
114
+ xpath: '/html/body/div[1]/input[1]'
96
115
  - aiScroll:
97
- direction: down/up
98
- scrollType: scrollToBottom/scrollToTop/singleAction
116
+ direction: 'down'
117
+ scrollType: 'singleAction'
118
+ distance: 500
119
+ locate: "scrollable area description"
120
+ xpath: '/html/body/div[1]/main[1]'
99
121
  - aiAssert: "expected state"
100
- - sleep: milliseconds
122
+ - sleep: 1000
101
123
 
102
124
  2. Action Types:
103
125
  - aiTap: for clicks (natural language targeting)
@@ -106,24 +128,6 @@ tasks:
106
128
  - aiAssert: for validations
107
129
  - sleep: for delays (milliseconds)
108
130
 
109
- 3. Best Practices:
110
- - Group related actions into logical tasks
111
- - Use natural language descriptions
112
- - Add deepLocate: true for complex interactions
113
- - Keep task names concise but descriptive
114
-
115
- 4. CRITICAL - YAML Indentation Rules:
116
- - For actions with additional parameters (aiScroll, aiInput, aiKeyboardPress), the parameters must be SIBLING keys, NOT nested children
117
- - Parameters like direction, scrollType, locate must align with the action key, not indented further
118
- - CORRECT indentation example:
119
- - aiScroll:
120
- direction: down
121
- scrollType: singleAction
122
- - WRONG indentation (DO NOT do this):
123
- - aiScroll:
124
- direction: down
125
- scrollType: singleAction
126
-
127
131
 
128
132
 
129
133
  YAML type
@@ -81,12 +81,16 @@ function generateElementByPoint(center, description, edgeSize = 8) {
81
81
  description: description || ''
82
82
  };
83
83
  }
84
- function generateElementByRect(sourceRect, description, edgeSize = 8) {
84
+ function generateElementByRect(sourceRect, description, _edgeSize = 8) {
85
85
  const centerX = sourceRect.left + Math.floor((sourceRect.width - 1) / 2);
86
86
  const centerY = sourceRect.top + Math.floor((sourceRect.height - 1) / 2);
87
- return generateElementByPoint([
88
- centerX,
89
- centerY
90
- ], description, edgeSize);
87
+ return {
88
+ rect: sourceRect,
89
+ center: [
90
+ centerX,
91
+ centerY
92
+ ],
93
+ description: description || ''
94
+ };
91
95
  }
92
96
  export { generateElementByPoint, generateElementByRect, isAElement, isButtonElement, isContainerElement, isFormElement, isImgElement, isNotContainerElement, isSvgElement, isTextElement };
@@ -1,22 +1,21 @@
1
1
  import { parseBase64 } from "@midscene/shared/img";
2
2
  import { z } from "zod";
3
- import { getZodDescription, getZodTypeName } from "../zod-schema-utils.mjs";
3
+ import { getZodDescription, getZodTypeName, isMidsceneLocatorField, unwrapZodField } from "../zod-schema-utils.mjs";
4
4
  function getErrorMessage(error) {
5
5
  return error instanceof Error ? error.message : String(error);
6
6
  }
7
7
  function describeActionForMCP(action) {
8
8
  const actionDesc = action.description || `Execute ${action.name} action`;
9
9
  if (!action.paramSchema) return `${action.name} action, ${actionDesc}`;
10
- const schema = action.paramSchema;
11
- const isZodObjectType = schema._def?.typeName === 'ZodObject';
12
- if (!isZodObjectType || !schema.shape) {
13
- const typeName = getZodTypeName(schema);
14
- const description = getZodDescription(schema);
10
+ const shape = getZodObjectShape(action.paramSchema);
11
+ if (!shape) {
12
+ const typeName = getZodTypeName(action.paramSchema);
13
+ const description = getZodDescription(action.paramSchema);
15
14
  const paramDesc = description ? `${typeName} - ${description}` : typeName;
16
15
  return `${action.name} action, ${actionDesc}. Parameter: ${paramDesc}`;
17
16
  }
18
17
  const paramDescriptions = [];
19
- for (const [key, field] of Object.entries(schema.shape))if (field && 'object' == typeof field) {
18
+ for (const [key, field] of Object.entries(shape))if (field && 'object' == typeof field) {
20
19
  const isFieldOptional = 'function' == typeof field.isOptional && field.isOptional();
21
20
  const typeName = getZodTypeName(field);
22
21
  const description = getZodDescription(field);
@@ -30,9 +29,6 @@ function describeActionForMCP(action) {
30
29
  function isZodOptional(value) {
31
30
  return '_def' in value && value._def?.typeName === 'ZodOptional';
32
31
  }
33
- function isZodObject(value) {
34
- return '_def' in value && value._def?.typeName === 'ZodObject' && 'shape' in value;
35
- }
36
32
  function unwrapOptional(value) {
37
33
  if (isZodOptional(value)) return {
38
34
  innerValue: value._def.innerType,
@@ -43,24 +39,31 @@ function unwrapOptional(value) {
43
39
  isOptional: false
44
40
  };
45
41
  }
46
- function isLocateField(value) {
47
- if (!isZodObject(value)) return false;
48
- return 'prompt' in value.shape;
42
+ function getZodObjectShape(value) {
43
+ if (!value) return;
44
+ const actualValue = unwrapZodField(value);
45
+ if (actualValue._def?.typeName !== 'ZodObject') return;
46
+ if ('function' == typeof actualValue._def.shape) return actualValue._def.shape();
47
+ return actualValue.shape;
48
+ }
49
+ function isRecord(value) {
50
+ return 'object' == typeof value && null !== value && !Array.isArray(value);
49
51
  }
50
- function makePromptOptional(value, wrapInOptional) {
52
+ function makePromptOptional(shape, wrapInOptional) {
51
53
  const newShape = {
52
- ...value.shape
54
+ ...shape
53
55
  };
54
- newShape.prompt = value.shape.prompt.optional();
56
+ newShape.prompt = shape.prompt.optional();
55
57
  let newSchema = z.object(newShape).passthrough();
56
58
  if (wrapInOptional) newSchema = newSchema.optional();
57
59
  return newSchema;
58
60
  }
59
61
  function transformSchemaField(key, value) {
60
62
  const { innerValue, isOptional } = unwrapOptional(value);
61
- if (isZodObject(innerValue) && isLocateField(innerValue)) return [
63
+ const shape = getZodObjectShape(innerValue);
64
+ if (shape && isMidsceneLocatorField(innerValue)) return [
62
65
  key,
63
- makePromptOptional(innerValue, isOptional)
66
+ makePromptOptional(shape, isOptional)
64
67
  ];
65
68
  return [
66
69
  key,
@@ -69,9 +72,66 @@ function transformSchemaField(key, value) {
69
72
  }
70
73
  function extractActionSchema(paramSchema) {
71
74
  if (!paramSchema) return {};
72
- const schema = paramSchema;
73
- if (!isZodObject(schema)) return schema;
74
- return Object.fromEntries(Object.entries(schema.shape).map(([key, value])=>transformSchemaField(key, value)));
75
+ const shape = getZodObjectShape(paramSchema);
76
+ if (!shape) return paramSchema;
77
+ return Object.fromEntries(Object.entries(shape).map(([key, value])=>transformSchemaField(key, value)));
78
+ }
79
+ function getPromptText(prompt) {
80
+ if ('string' == typeof prompt) return prompt;
81
+ if (isRecord(prompt) && 'string' == typeof prompt.prompt) return prompt.prompt;
82
+ }
83
+ function moveLocateExtrasIntoPrompt(value, locateFieldKeys) {
84
+ const promptText = getPromptText(value.prompt);
85
+ if (!promptText) return value;
86
+ const normalizedPrompt = isRecord(value.prompt) ? {
87
+ ...value.prompt
88
+ } : {
89
+ prompt: promptText
90
+ };
91
+ const normalizedLocate = {};
92
+ let movedExtraField = false;
93
+ for (const [key, fieldValue] of Object.entries(value))if ('prompt' !== key) {
94
+ if (locateFieldKeys.has(key)) {
95
+ normalizedLocate[key] = fieldValue;
96
+ continue;
97
+ }
98
+ movedExtraField = true;
99
+ if (!(key in normalizedPrompt)) normalizedPrompt[key] = fieldValue;
100
+ }
101
+ if (!movedExtraField) return value;
102
+ return {
103
+ ...normalizedLocate,
104
+ prompt: normalizedPrompt
105
+ };
106
+ }
107
+ function normalizeLocateLikeArg(value, fieldSchema) {
108
+ if ('string' == typeof value) return {
109
+ prompt: value
110
+ };
111
+ if (!isRecord(value)) return value;
112
+ const shape = getZodObjectShape(fieldSchema);
113
+ if (!shape) return value;
114
+ return moveLocateExtrasIntoPrompt(value, new Set(Object.keys(shape)));
115
+ }
116
+ function normalizeActionArgs(args, paramSchema) {
117
+ if (!paramSchema) return args;
118
+ const shape = getZodObjectShape(paramSchema);
119
+ if (!shape) return args;
120
+ return Object.fromEntries(Object.entries(args).map(([key, value])=>{
121
+ const fieldSchema = shape[key];
122
+ if (!fieldSchema) return [
123
+ key,
124
+ value
125
+ ];
126
+ if (isMidsceneLocatorField(fieldSchema)) return [
127
+ key,
128
+ normalizeLocateLikeArg(value, fieldSchema)
129
+ ];
130
+ return [
131
+ key,
132
+ value
133
+ ];
134
+ }));
75
135
  }
76
136
  function serializeArgsToDescription(args) {
77
137
  try {
@@ -90,7 +150,7 @@ function serializeArgsToDescription(args) {
90
150
  }
91
151
  }
92
152
  function buildActionInstruction(actionName, args) {
93
- const locatePrompt = args.locate && 'object' == typeof args.locate ? args.locate.prompt : void 0;
153
+ const locatePrompt = isRecord(args.locate) ? getPromptText(args.locate.prompt) : void 0;
94
154
  switch(actionName){
95
155
  case 'Tap':
96
156
  return locatePrompt ? `Tap on "${locatePrompt}"` : 'Tap';
@@ -118,44 +178,59 @@ function buildActionInstruction(actionName, args) {
118
178
  }
119
179
  }
120
180
  }
121
- async function captureScreenshotResult(agent, actionName) {
181
+ async function executeAction(agent, actionName, args) {
182
+ if (agent.callActionInActionSpace) return agent.callActionInActionSpace(actionName, args);
183
+ if (agent.aiAction) {
184
+ const instruction = buildActionInstruction(actionName, args);
185
+ return agent.aiAction(instruction);
186
+ }
187
+ throw new Error(`Action "${actionName}" is not supported by this agent`);
188
+ }
189
+ async function captureScreenshotResult(agent, actionName, actionResult) {
190
+ const content = [
191
+ {
192
+ type: 'text',
193
+ text: `Action "${actionName}" completed.`
194
+ }
195
+ ];
196
+ if (void 0 !== actionResult) content.push({
197
+ type: 'text',
198
+ text: `Result: ${serializeActionResult(actionResult)}`
199
+ });
122
200
  try {
123
201
  const screenshot = await agent.page?.screenshotBase64();
124
202
  if (!screenshot) return {
125
- content: [
126
- {
127
- type: 'text',
128
- text: `Action "${actionName}" completed.`
129
- }
130
- ]
203
+ content
131
204
  };
132
205
  const { mimeType, body } = parseBase64(screenshot);
206
+ content.push({
207
+ type: 'image',
208
+ data: body,
209
+ mimeType
210
+ });
133
211
  return {
134
- content: [
135
- {
136
- type: 'text',
137
- text: `Action "${actionName}" completed.`
138
- },
139
- {
140
- type: 'image',
141
- data: body,
142
- mimeType
143
- }
144
- ]
212
+ content
145
213
  };
146
214
  } catch (error) {
147
215
  const errorMessage = getErrorMessage(error);
148
216
  console.error('Error capturing screenshot:', errorMessage);
217
+ content[0] = {
218
+ type: 'text',
219
+ text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`
220
+ };
149
221
  return {
150
- content: [
151
- {
152
- type: 'text',
153
- text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`
154
- }
155
- ]
222
+ content
156
223
  };
157
224
  }
158
225
  }
226
+ function serializeActionResult(actionResult) {
227
+ if ('string' == typeof actionResult) return actionResult;
228
+ try {
229
+ return JSON.stringify(actionResult);
230
+ } catch {
231
+ return String(actionResult);
232
+ }
233
+ }
159
234
  function createErrorResult(message) {
160
235
  return {
161
236
  content: [
@@ -214,17 +289,16 @@ function generateToolsFromActionSpace(actionSpace, getAgent) {
214
289
  handler: async (args)=>{
215
290
  try {
216
291
  const agent = await getAgent();
217
- if (agent.aiAction) {
218
- const instruction = buildActionInstruction(action.name, args);
219
- try {
220
- await agent.aiAction(instruction);
221
- } catch (error) {
222
- const errorMessage = getErrorMessage(error);
223
- console.error(`Error executing action "${action.name}":`, errorMessage);
224
- return await captureFailureResult(agent, action.name, errorMessage);
225
- }
292
+ const normalizedArgs = normalizeActionArgs(args, action.paramSchema);
293
+ let actionResult;
294
+ try {
295
+ actionResult = await executeAction(agent, action.name, normalizedArgs);
296
+ } catch (error) {
297
+ const errorMessage = getErrorMessage(error);
298
+ console.error(`Error executing action "${action.name}":`, errorMessage);
299
+ return await captureFailureResult(agent, action.name, errorMessage);
226
300
  }
227
- return await captureScreenshotResult(agent, action.name);
301
+ return await captureScreenshotResult(agent, action.name, actionResult);
228
302
  } catch (error) {
229
303
  const errorMessage = getErrorMessage(error);
230
304
  console.error(`Error in handler for "${action.name}":`, errorMessage);