@midscene/shared 1.6.1-beta-20260331083547.0 → 1.6.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/constants/example-code.mjs +25 -21
- package/dist/es/extractor/dom-util.mjs +9 -5
- package/dist/es/mcp/tool-generator.mjs +130 -56
- package/dist/es/node/fs.mjs +1 -1
- package/dist/lib/constants/example-code.js +25 -21
- package/dist/lib/extractor/dom-util.js +9 -5
- package/dist/lib/mcp/tool-generator.js +129 -55
- package/dist/lib/node/fs.js +1 -1
- package/dist/types/constants/example-code.d.ts +1 -1
- package/dist/types/extractor/dom-util.d.ts +5 -4
- package/dist/types/mcp/types.d.ts +1 -0
- package/package.json +1 -1
- package/src/constants/example-code.ts +25 -21
- package/src/extractor/dom-util.ts +10 -5
- package/src/mcp/tool-generator.ts +217 -66
- package/src/mcp/types.ts +4 -0
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
import { parseBase64 } from '@midscene/shared/img';
|
|
2
2
|
import { z } from 'zod';
|
|
3
|
-
import {
|
|
3
|
+
import {
|
|
4
|
+
getZodDescription,
|
|
5
|
+
getZodTypeName,
|
|
6
|
+
isMidsceneLocatorField,
|
|
7
|
+
unwrapZodField,
|
|
8
|
+
} from '../zod-schema-utils';
|
|
4
9
|
import type {
|
|
5
10
|
ActionSpaceItem,
|
|
6
11
|
BaseAgent,
|
|
@@ -26,23 +31,18 @@ function describeActionForMCP(action: ActionSpaceItem): string {
|
|
|
26
31
|
return `${action.name} action, ${actionDesc}`;
|
|
27
32
|
}
|
|
28
33
|
|
|
29
|
-
const
|
|
30
|
-
|
|
31
|
-
shape?: Record<string, unknown>;
|
|
32
|
-
};
|
|
33
|
-
const isZodObjectType = schema._def?.typeName === 'ZodObject';
|
|
34
|
-
|
|
35
|
-
if (!isZodObjectType || !schema.shape) {
|
|
34
|
+
const shape = getZodObjectShape(action.paramSchema);
|
|
35
|
+
if (!shape) {
|
|
36
36
|
// Simple type schema
|
|
37
|
-
const typeName = getZodTypeName(
|
|
38
|
-
const description = getZodDescription(
|
|
37
|
+
const typeName = getZodTypeName(action.paramSchema);
|
|
38
|
+
const description = getZodDescription(action.paramSchema as z.ZodTypeAny);
|
|
39
39
|
const paramDesc = description ? `${typeName} - ${description}` : typeName;
|
|
40
40
|
return `${action.name} action, ${actionDesc}. Parameter: ${paramDesc}`;
|
|
41
41
|
}
|
|
42
42
|
|
|
43
43
|
// Object schema with multiple fields
|
|
44
44
|
const paramDescriptions: string[] = [];
|
|
45
|
-
for (const [key, field] of Object.entries(
|
|
45
|
+
for (const [key, field] of Object.entries(shape)) {
|
|
46
46
|
if (field && typeof field === 'object') {
|
|
47
47
|
const isFieldOptional =
|
|
48
48
|
typeof (field as { isOptional?: () => boolean }).isOptional ===
|
|
@@ -96,25 +96,42 @@ function unwrapOptional(value: z.ZodTypeAny): {
|
|
|
96
96
|
return { innerValue: value, isOptional: false };
|
|
97
97
|
}
|
|
98
98
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
return false;
|
|
99
|
+
function getZodObjectShape(
|
|
100
|
+
value: z.ZodTypeAny | undefined,
|
|
101
|
+
): Record<string, z.ZodTypeAny> | undefined {
|
|
102
|
+
if (!value) {
|
|
103
|
+
return undefined;
|
|
105
104
|
}
|
|
106
|
-
|
|
105
|
+
|
|
106
|
+
const actualValue = unwrapZodField(value) as {
|
|
107
|
+
_def?: { typeName?: string; shape?: () => Record<string, z.ZodTypeAny> };
|
|
108
|
+
shape?: Record<string, z.ZodTypeAny>;
|
|
109
|
+
};
|
|
110
|
+
|
|
111
|
+
if (actualValue._def?.typeName !== 'ZodObject') {
|
|
112
|
+
return undefined;
|
|
113
|
+
}
|
|
114
|
+
|
|
115
|
+
if (typeof actualValue._def.shape === 'function') {
|
|
116
|
+
return actualValue._def.shape();
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
return actualValue.shape;
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
function isRecord(value: unknown): value is Record<string, unknown> {
|
|
123
|
+
return typeof value === 'object' && value !== null && !Array.isArray(value);
|
|
107
124
|
}
|
|
108
125
|
|
|
109
126
|
/**
|
|
110
127
|
* Transform a locate field schema to make its 'prompt' field optional
|
|
111
128
|
*/
|
|
112
129
|
function makePromptOptional(
|
|
113
|
-
|
|
130
|
+
shape: Record<string, z.ZodTypeAny>,
|
|
114
131
|
wrapInOptional: boolean,
|
|
115
132
|
): z.ZodTypeAny {
|
|
116
|
-
const newShape = { ...
|
|
117
|
-
newShape.prompt =
|
|
133
|
+
const newShape = { ...shape };
|
|
134
|
+
newShape.prompt = shape.prompt.optional();
|
|
118
135
|
|
|
119
136
|
let newSchema: z.ZodTypeAny = z.object(newShape).passthrough();
|
|
120
137
|
if (wrapInOptional) {
|
|
@@ -131,9 +148,10 @@ function transformSchemaField(
|
|
|
131
148
|
value: z.ZodTypeAny,
|
|
132
149
|
): [string, z.ZodTypeAny] {
|
|
133
150
|
const { innerValue, isOptional } = unwrapOptional(value);
|
|
151
|
+
const shape = getZodObjectShape(innerValue);
|
|
134
152
|
|
|
135
|
-
if (
|
|
136
|
-
return [key, makePromptOptional(
|
|
153
|
+
if (shape && isMidsceneLocatorField(innerValue)) {
|
|
154
|
+
return [key, makePromptOptional(shape, isOptional)];
|
|
137
155
|
}
|
|
138
156
|
return [key, value];
|
|
139
157
|
}
|
|
@@ -148,18 +166,117 @@ function extractActionSchema(
|
|
|
148
166
|
return {};
|
|
149
167
|
}
|
|
150
168
|
|
|
151
|
-
const
|
|
152
|
-
if (!
|
|
153
|
-
return
|
|
169
|
+
const shape = getZodObjectShape(paramSchema);
|
|
170
|
+
if (!shape) {
|
|
171
|
+
return paramSchema as unknown as Record<string, z.ZodTypeAny>;
|
|
154
172
|
}
|
|
155
173
|
|
|
156
174
|
return Object.fromEntries(
|
|
157
|
-
Object.entries(
|
|
175
|
+
Object.entries(shape).map(([key, value]) =>
|
|
158
176
|
transformSchemaField(key, value as z.ZodTypeAny),
|
|
159
177
|
),
|
|
160
178
|
);
|
|
161
179
|
}
|
|
162
180
|
|
|
181
|
+
function getPromptText(prompt: unknown): string | undefined {
|
|
182
|
+
if (typeof prompt === 'string') {
|
|
183
|
+
return prompt;
|
|
184
|
+
}
|
|
185
|
+
|
|
186
|
+
if (isRecord(prompt) && typeof prompt.prompt === 'string') {
|
|
187
|
+
return prompt.prompt;
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
return undefined;
|
|
191
|
+
}
|
|
192
|
+
|
|
193
|
+
function moveLocateExtrasIntoPrompt(
|
|
194
|
+
value: Record<string, unknown>,
|
|
195
|
+
locateFieldKeys: Set<string>,
|
|
196
|
+
): Record<string, unknown> {
|
|
197
|
+
const promptText = getPromptText(value.prompt);
|
|
198
|
+
if (!promptText) {
|
|
199
|
+
return value;
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
const normalizedPrompt: Record<string, unknown> = isRecord(value.prompt)
|
|
203
|
+
? { ...value.prompt }
|
|
204
|
+
: { prompt: promptText };
|
|
205
|
+
const normalizedLocate: Record<string, unknown> = {};
|
|
206
|
+
let movedExtraField = false;
|
|
207
|
+
|
|
208
|
+
for (const [key, fieldValue] of Object.entries(value)) {
|
|
209
|
+
if (key === 'prompt') {
|
|
210
|
+
continue;
|
|
211
|
+
}
|
|
212
|
+
|
|
213
|
+
if (locateFieldKeys.has(key)) {
|
|
214
|
+
normalizedLocate[key] = fieldValue;
|
|
215
|
+
continue;
|
|
216
|
+
}
|
|
217
|
+
|
|
218
|
+
movedExtraField = true;
|
|
219
|
+
if (!(key in normalizedPrompt)) {
|
|
220
|
+
normalizedPrompt[key] = fieldValue;
|
|
221
|
+
}
|
|
222
|
+
}
|
|
223
|
+
|
|
224
|
+
if (!movedExtraField) {
|
|
225
|
+
return value;
|
|
226
|
+
}
|
|
227
|
+
|
|
228
|
+
return { ...normalizedLocate, prompt: normalizedPrompt };
|
|
229
|
+
}
|
|
230
|
+
|
|
231
|
+
function normalizeLocateLikeArg(
|
|
232
|
+
value: unknown,
|
|
233
|
+
fieldSchema: z.ZodTypeAny,
|
|
234
|
+
): unknown {
|
|
235
|
+
if (typeof value === 'string') {
|
|
236
|
+
return { prompt: value };
|
|
237
|
+
}
|
|
238
|
+
|
|
239
|
+
if (!isRecord(value)) {
|
|
240
|
+
return value;
|
|
241
|
+
}
|
|
242
|
+
|
|
243
|
+
const shape = getZodObjectShape(fieldSchema);
|
|
244
|
+
if (!shape) {
|
|
245
|
+
return value;
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
return moveLocateExtrasIntoPrompt(value, new Set(Object.keys(shape)));
|
|
249
|
+
}
|
|
250
|
+
|
|
251
|
+
function normalizeActionArgs(
|
|
252
|
+
args: Record<string, unknown>,
|
|
253
|
+
paramSchema?: z.ZodTypeAny,
|
|
254
|
+
): Record<string, unknown> {
|
|
255
|
+
if (!paramSchema) {
|
|
256
|
+
return args;
|
|
257
|
+
}
|
|
258
|
+
|
|
259
|
+
const shape = getZodObjectShape(paramSchema);
|
|
260
|
+
if (!shape) {
|
|
261
|
+
return args;
|
|
262
|
+
}
|
|
263
|
+
|
|
264
|
+
return Object.fromEntries(
|
|
265
|
+
Object.entries(args).map(([key, value]) => {
|
|
266
|
+
const fieldSchema = shape[key] as z.ZodTypeAny | undefined;
|
|
267
|
+
if (!fieldSchema) {
|
|
268
|
+
return [key, value];
|
|
269
|
+
}
|
|
270
|
+
|
|
271
|
+
if (isMidsceneLocatorField(fieldSchema)) {
|
|
272
|
+
return [key, normalizeLocateLikeArg(value, fieldSchema)];
|
|
273
|
+
}
|
|
274
|
+
|
|
275
|
+
return [key, value];
|
|
276
|
+
}),
|
|
277
|
+
);
|
|
278
|
+
}
|
|
279
|
+
|
|
163
280
|
/**
|
|
164
281
|
* Serialize args to human-readable description for AI action
|
|
165
282
|
*/
|
|
@@ -194,10 +311,9 @@ function buildActionInstruction(
|
|
|
194
311
|
actionName: string,
|
|
195
312
|
args: Record<string, unknown>,
|
|
196
313
|
): string {
|
|
197
|
-
const locatePrompt =
|
|
198
|
-
args.locate
|
|
199
|
-
|
|
200
|
-
: undefined;
|
|
314
|
+
const locatePrompt = isRecord(args.locate)
|
|
315
|
+
? getPromptText(args.locate.prompt)
|
|
316
|
+
: undefined;
|
|
201
317
|
|
|
202
318
|
switch (actionName) {
|
|
203
319
|
case 'Tap':
|
|
@@ -227,39 +343,71 @@ function buildActionInstruction(
|
|
|
227
343
|
}
|
|
228
344
|
}
|
|
229
345
|
|
|
346
|
+
async function executeAction(
|
|
347
|
+
agent: BaseAgent,
|
|
348
|
+
actionName: string,
|
|
349
|
+
args: Record<string, unknown>,
|
|
350
|
+
): Promise<unknown> {
|
|
351
|
+
if (agent.callActionInActionSpace) {
|
|
352
|
+
return agent.callActionInActionSpace(actionName, args);
|
|
353
|
+
}
|
|
354
|
+
|
|
355
|
+
if (agent.aiAction) {
|
|
356
|
+
const instruction = buildActionInstruction(actionName, args);
|
|
357
|
+
return agent.aiAction(instruction);
|
|
358
|
+
}
|
|
359
|
+
|
|
360
|
+
throw new Error(`Action "${actionName}" is not supported by this agent`);
|
|
361
|
+
}
|
|
362
|
+
|
|
230
363
|
/**
|
|
231
364
|
* Capture screenshot and return as tool result
|
|
232
365
|
*/
|
|
233
366
|
async function captureScreenshotResult(
|
|
234
367
|
agent: BaseAgent,
|
|
235
368
|
actionName: string,
|
|
369
|
+
actionResult?: unknown,
|
|
236
370
|
): Promise<ToolResult> {
|
|
371
|
+
const content: ToolResult['content'] = [
|
|
372
|
+
{ type: 'text', text: `Action "${actionName}" completed.` },
|
|
373
|
+
];
|
|
374
|
+
|
|
375
|
+
if (actionResult !== undefined) {
|
|
376
|
+
content.push({
|
|
377
|
+
type: 'text',
|
|
378
|
+
text: `Result: ${serializeActionResult(actionResult)}`,
|
|
379
|
+
});
|
|
380
|
+
}
|
|
381
|
+
|
|
237
382
|
try {
|
|
238
383
|
const screenshot = await agent.page?.screenshotBase64();
|
|
239
384
|
if (!screenshot) {
|
|
240
|
-
return {
|
|
241
|
-
content: [{ type: 'text', text: `Action "${actionName}" completed.` }],
|
|
242
|
-
};
|
|
385
|
+
return { content };
|
|
243
386
|
}
|
|
244
387
|
|
|
245
388
|
const { mimeType, body } = parseBase64(screenshot);
|
|
246
|
-
|
|
247
|
-
|
|
248
|
-
{ type: 'text', text: `Action "${actionName}" completed.` },
|
|
249
|
-
{ type: 'image', data: body, mimeType },
|
|
250
|
-
],
|
|
251
|
-
};
|
|
389
|
+
content.push({ type: 'image', data: body, mimeType });
|
|
390
|
+
return { content };
|
|
252
391
|
} catch (error: unknown) {
|
|
253
392
|
const errorMessage = getErrorMessage(error);
|
|
254
393
|
console.error('Error capturing screenshot:', errorMessage);
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
|
|
258
|
-
type: 'text',
|
|
259
|
-
text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`,
|
|
260
|
-
},
|
|
261
|
-
],
|
|
394
|
+
content[0] = {
|
|
395
|
+
type: 'text',
|
|
396
|
+
text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`,
|
|
262
397
|
};
|
|
398
|
+
return { content };
|
|
399
|
+
}
|
|
400
|
+
}
|
|
401
|
+
|
|
402
|
+
function serializeActionResult(actionResult: unknown): string {
|
|
403
|
+
if (typeof actionResult === 'string') {
|
|
404
|
+
return actionResult;
|
|
405
|
+
}
|
|
406
|
+
|
|
407
|
+
try {
|
|
408
|
+
return JSON.stringify(actionResult);
|
|
409
|
+
} catch {
|
|
410
|
+
return String(actionResult);
|
|
263
411
|
}
|
|
264
412
|
}
|
|
265
413
|
|
|
@@ -323,28 +471,31 @@ export function generateToolsFromActionSpace(
|
|
|
323
471
|
handler: async (args: Record<string, unknown>) => {
|
|
324
472
|
try {
|
|
325
473
|
const agent = await getAgent();
|
|
474
|
+
const normalizedArgs = normalizeActionArgs(args, action.paramSchema);
|
|
475
|
+
let actionResult: unknown;
|
|
326
476
|
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
errorMessage,
|
|
343
|
-
);
|
|
344
|
-
}
|
|
477
|
+
try {
|
|
478
|
+
actionResult = await executeAction(
|
|
479
|
+
agent,
|
|
480
|
+
action.name,
|
|
481
|
+
normalizedArgs,
|
|
482
|
+
);
|
|
483
|
+
} catch (error: unknown) {
|
|
484
|
+
const errorMessage = getErrorMessage(error);
|
|
485
|
+
console.error(
|
|
486
|
+
`Error executing action "${action.name}":`,
|
|
487
|
+
errorMessage,
|
|
488
|
+
);
|
|
489
|
+
// Return screenshot + warning instead of hard error,
|
|
490
|
+
// so the AI agent can see current state and decide to retry or adjust strategy
|
|
491
|
+
return await captureFailureResult(agent, action.name, errorMessage);
|
|
345
492
|
}
|
|
346
493
|
|
|
347
|
-
return await captureScreenshotResult(
|
|
494
|
+
return await captureScreenshotResult(
|
|
495
|
+
agent,
|
|
496
|
+
action.name,
|
|
497
|
+
actionResult,
|
|
498
|
+
);
|
|
348
499
|
} catch (error: unknown) {
|
|
349
500
|
// Connection/agent errors are still hard errors
|
|
350
501
|
const errorMessage = getErrorMessage(error);
|
package/src/mcp/types.ts
CHANGED
|
@@ -84,6 +84,10 @@ export interface BaseAgent {
|
|
|
84
84
|
page?: {
|
|
85
85
|
screenshotBase64(): Promise<string>;
|
|
86
86
|
};
|
|
87
|
+
callActionInActionSpace?: (
|
|
88
|
+
actionName: string,
|
|
89
|
+
params?: unknown,
|
|
90
|
+
) => Promise<unknown>;
|
|
87
91
|
aiAction?: (
|
|
88
92
|
description: string,
|
|
89
93
|
params?: Record<string, unknown>,
|