@midscene/shared 1.6.1-beta-20260331083547.0 → 1.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,6 +1,11 @@
1
1
  import { parseBase64 } from '@midscene/shared/img';
2
2
  import { z } from 'zod';
3
- import { getZodDescription, getZodTypeName } from '../zod-schema-utils';
3
+ import {
4
+ getZodDescription,
5
+ getZodTypeName,
6
+ isMidsceneLocatorField,
7
+ unwrapZodField,
8
+ } from '../zod-schema-utils';
4
9
  import type {
5
10
  ActionSpaceItem,
6
11
  BaseAgent,
@@ -26,23 +31,18 @@ function describeActionForMCP(action: ActionSpaceItem): string {
26
31
  return `${action.name} action, ${actionDesc}`;
27
32
  }
28
33
 
29
- const schema = action.paramSchema as {
30
- _def?: { typeName?: string };
31
- shape?: Record<string, unknown>;
32
- };
33
- const isZodObjectType = schema._def?.typeName === 'ZodObject';
34
-
35
- if (!isZodObjectType || !schema.shape) {
34
+ const shape = getZodObjectShape(action.paramSchema);
35
+ if (!shape) {
36
36
  // Simple type schema
37
- const typeName = getZodTypeName(schema);
38
- const description = getZodDescription(schema as z.ZodTypeAny);
37
+ const typeName = getZodTypeName(action.paramSchema);
38
+ const description = getZodDescription(action.paramSchema as z.ZodTypeAny);
39
39
  const paramDesc = description ? `${typeName} - ${description}` : typeName;
40
40
  return `${action.name} action, ${actionDesc}. Parameter: ${paramDesc}`;
41
41
  }
42
42
 
43
43
  // Object schema with multiple fields
44
44
  const paramDescriptions: string[] = [];
45
- for (const [key, field] of Object.entries(schema.shape)) {
45
+ for (const [key, field] of Object.entries(shape)) {
46
46
  if (field && typeof field === 'object') {
47
47
  const isFieldOptional =
48
48
  typeof (field as { isOptional?: () => boolean }).isOptional ===
@@ -96,25 +96,42 @@ function unwrapOptional(value: z.ZodTypeAny): {
96
96
  return { innerValue: value, isOptional: false };
97
97
  }
98
98
 
99
- /**
100
- * Check if a Zod object schema contains a 'prompt' field (locate field pattern)
101
- */
102
- function isLocateField(value: z.ZodTypeAny): boolean {
103
- if (!isZodObject(value)) {
104
- return false;
99
+ function getZodObjectShape(
100
+ value: z.ZodTypeAny | undefined,
101
+ ): Record<string, z.ZodTypeAny> | undefined {
102
+ if (!value) {
103
+ return undefined;
105
104
  }
106
- return 'prompt' in value.shape;
105
+
106
+ const actualValue = unwrapZodField(value) as {
107
+ _def?: { typeName?: string; shape?: () => Record<string, z.ZodTypeAny> };
108
+ shape?: Record<string, z.ZodTypeAny>;
109
+ };
110
+
111
+ if (actualValue._def?.typeName !== 'ZodObject') {
112
+ return undefined;
113
+ }
114
+
115
+ if (typeof actualValue._def.shape === 'function') {
116
+ return actualValue._def.shape();
117
+ }
118
+
119
+ return actualValue.shape;
120
+ }
121
+
122
+ function isRecord(value: unknown): value is Record<string, unknown> {
123
+ return typeof value === 'object' && value !== null && !Array.isArray(value);
107
124
  }
108
125
 
109
126
  /**
110
127
  * Transform a locate field schema to make its 'prompt' field optional
111
128
  */
112
129
  function makePromptOptional(
113
- value: z.ZodObject<z.ZodRawShape>,
130
+ shape: Record<string, z.ZodTypeAny>,
114
131
  wrapInOptional: boolean,
115
132
  ): z.ZodTypeAny {
116
- const newShape = { ...value.shape };
117
- newShape.prompt = value.shape.prompt.optional();
133
+ const newShape = { ...shape };
134
+ newShape.prompt = shape.prompt.optional();
118
135
 
119
136
  let newSchema: z.ZodTypeAny = z.object(newShape).passthrough();
120
137
  if (wrapInOptional) {
@@ -131,9 +148,10 @@ function transformSchemaField(
131
148
  value: z.ZodTypeAny,
132
149
  ): [string, z.ZodTypeAny] {
133
150
  const { innerValue, isOptional } = unwrapOptional(value);
151
+ const shape = getZodObjectShape(innerValue);
134
152
 
135
- if (isZodObject(innerValue) && isLocateField(innerValue)) {
136
- return [key, makePromptOptional(innerValue, isOptional)];
153
+ if (shape && isMidsceneLocatorField(innerValue)) {
154
+ return [key, makePromptOptional(shape, isOptional)];
137
155
  }
138
156
  return [key, value];
139
157
  }
@@ -148,18 +166,117 @@ function extractActionSchema(
148
166
  return {};
149
167
  }
150
168
 
151
- const schema = paramSchema as z.ZodTypeAny;
152
- if (!isZodObject(schema)) {
153
- return schema as unknown as Record<string, z.ZodTypeAny>;
169
+ const shape = getZodObjectShape(paramSchema);
170
+ if (!shape) {
171
+ return paramSchema as unknown as Record<string, z.ZodTypeAny>;
154
172
  }
155
173
 
156
174
  return Object.fromEntries(
157
- Object.entries(schema.shape).map(([key, value]) =>
175
+ Object.entries(shape).map(([key, value]) =>
158
176
  transformSchemaField(key, value as z.ZodTypeAny),
159
177
  ),
160
178
  );
161
179
  }
162
180
 
181
+ function getPromptText(prompt: unknown): string | undefined {
182
+ if (typeof prompt === 'string') {
183
+ return prompt;
184
+ }
185
+
186
+ if (isRecord(prompt) && typeof prompt.prompt === 'string') {
187
+ return prompt.prompt;
188
+ }
189
+
190
+ return undefined;
191
+ }
192
+
193
+ function moveLocateExtrasIntoPrompt(
194
+ value: Record<string, unknown>,
195
+ locateFieldKeys: Set<string>,
196
+ ): Record<string, unknown> {
197
+ const promptText = getPromptText(value.prompt);
198
+ if (!promptText) {
199
+ return value;
200
+ }
201
+
202
+ const normalizedPrompt: Record<string, unknown> = isRecord(value.prompt)
203
+ ? { ...value.prompt }
204
+ : { prompt: promptText };
205
+ const normalizedLocate: Record<string, unknown> = {};
206
+ let movedExtraField = false;
207
+
208
+ for (const [key, fieldValue] of Object.entries(value)) {
209
+ if (key === 'prompt') {
210
+ continue;
211
+ }
212
+
213
+ if (locateFieldKeys.has(key)) {
214
+ normalizedLocate[key] = fieldValue;
215
+ continue;
216
+ }
217
+
218
+ movedExtraField = true;
219
+ if (!(key in normalizedPrompt)) {
220
+ normalizedPrompt[key] = fieldValue;
221
+ }
222
+ }
223
+
224
+ if (!movedExtraField) {
225
+ return value;
226
+ }
227
+
228
+ return { ...normalizedLocate, prompt: normalizedPrompt };
229
+ }
230
+
231
+ function normalizeLocateLikeArg(
232
+ value: unknown,
233
+ fieldSchema: z.ZodTypeAny,
234
+ ): unknown {
235
+ if (typeof value === 'string') {
236
+ return { prompt: value };
237
+ }
238
+
239
+ if (!isRecord(value)) {
240
+ return value;
241
+ }
242
+
243
+ const shape = getZodObjectShape(fieldSchema);
244
+ if (!shape) {
245
+ return value;
246
+ }
247
+
248
+ return moveLocateExtrasIntoPrompt(value, new Set(Object.keys(shape)));
249
+ }
250
+
251
+ function normalizeActionArgs(
252
+ args: Record<string, unknown>,
253
+ paramSchema?: z.ZodTypeAny,
254
+ ): Record<string, unknown> {
255
+ if (!paramSchema) {
256
+ return args;
257
+ }
258
+
259
+ const shape = getZodObjectShape(paramSchema);
260
+ if (!shape) {
261
+ return args;
262
+ }
263
+
264
+ return Object.fromEntries(
265
+ Object.entries(args).map(([key, value]) => {
266
+ const fieldSchema = shape[key] as z.ZodTypeAny | undefined;
267
+ if (!fieldSchema) {
268
+ return [key, value];
269
+ }
270
+
271
+ if (isMidsceneLocatorField(fieldSchema)) {
272
+ return [key, normalizeLocateLikeArg(value, fieldSchema)];
273
+ }
274
+
275
+ return [key, value];
276
+ }),
277
+ );
278
+ }
279
+
163
280
  /**
164
281
  * Serialize args to human-readable description for AI action
165
282
  */
@@ -194,10 +311,9 @@ function buildActionInstruction(
194
311
  actionName: string,
195
312
  args: Record<string, unknown>,
196
313
  ): string {
197
- const locatePrompt =
198
- args.locate && typeof args.locate === 'object'
199
- ? (args.locate as { prompt?: string }).prompt
200
- : undefined;
314
+ const locatePrompt = isRecord(args.locate)
315
+ ? getPromptText(args.locate.prompt)
316
+ : undefined;
201
317
 
202
318
  switch (actionName) {
203
319
  case 'Tap':
@@ -227,39 +343,71 @@ function buildActionInstruction(
227
343
  }
228
344
  }
229
345
 
346
+ async function executeAction(
347
+ agent: BaseAgent,
348
+ actionName: string,
349
+ args: Record<string, unknown>,
350
+ ): Promise<unknown> {
351
+ if (agent.callActionInActionSpace) {
352
+ return agent.callActionInActionSpace(actionName, args);
353
+ }
354
+
355
+ if (agent.aiAction) {
356
+ const instruction = buildActionInstruction(actionName, args);
357
+ return agent.aiAction(instruction);
358
+ }
359
+
360
+ throw new Error(`Action "${actionName}" is not supported by this agent`);
361
+ }
362
+
230
363
  /**
231
364
  * Capture screenshot and return as tool result
232
365
  */
233
366
  async function captureScreenshotResult(
234
367
  agent: BaseAgent,
235
368
  actionName: string,
369
+ actionResult?: unknown,
236
370
  ): Promise<ToolResult> {
371
+ const content: ToolResult['content'] = [
372
+ { type: 'text', text: `Action "${actionName}" completed.` },
373
+ ];
374
+
375
+ if (actionResult !== undefined) {
376
+ content.push({
377
+ type: 'text',
378
+ text: `Result: ${serializeActionResult(actionResult)}`,
379
+ });
380
+ }
381
+
237
382
  try {
238
383
  const screenshot = await agent.page?.screenshotBase64();
239
384
  if (!screenshot) {
240
- return {
241
- content: [{ type: 'text', text: `Action "${actionName}" completed.` }],
242
- };
385
+ return { content };
243
386
  }
244
387
 
245
388
  const { mimeType, body } = parseBase64(screenshot);
246
- return {
247
- content: [
248
- { type: 'text', text: `Action "${actionName}" completed.` },
249
- { type: 'image', data: body, mimeType },
250
- ],
251
- };
389
+ content.push({ type: 'image', data: body, mimeType });
390
+ return { content };
252
391
  } catch (error: unknown) {
253
392
  const errorMessage = getErrorMessage(error);
254
393
  console.error('Error capturing screenshot:', errorMessage);
255
- return {
256
- content: [
257
- {
258
- type: 'text',
259
- text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`,
260
- },
261
- ],
394
+ content[0] = {
395
+ type: 'text',
396
+ text: `Action "${actionName}" completed (screenshot unavailable: ${errorMessage})`,
262
397
  };
398
+ return { content };
399
+ }
400
+ }
401
+
402
+ function serializeActionResult(actionResult: unknown): string {
403
+ if (typeof actionResult === 'string') {
404
+ return actionResult;
405
+ }
406
+
407
+ try {
408
+ return JSON.stringify(actionResult);
409
+ } catch {
410
+ return String(actionResult);
263
411
  }
264
412
  }
265
413
 
@@ -323,28 +471,31 @@ export function generateToolsFromActionSpace(
323
471
  handler: async (args: Record<string, unknown>) => {
324
472
  try {
325
473
  const agent = await getAgent();
474
+ const normalizedArgs = normalizeActionArgs(args, action.paramSchema);
475
+ let actionResult: unknown;
326
476
 
327
- if (agent.aiAction) {
328
- const instruction = buildActionInstruction(action.name, args);
329
- try {
330
- await agent.aiAction(instruction);
331
- } catch (error: unknown) {
332
- const errorMessage = getErrorMessage(error);
333
- console.error(
334
- `Error executing action "${action.name}":`,
335
- errorMessage,
336
- );
337
- // Return screenshot + warning instead of hard error,
338
- // so the AI agent can see current state and decide to retry or adjust strategy
339
- return await captureFailureResult(
340
- agent,
341
- action.name,
342
- errorMessage,
343
- );
344
- }
477
+ try {
478
+ actionResult = await executeAction(
479
+ agent,
480
+ action.name,
481
+ normalizedArgs,
482
+ );
483
+ } catch (error: unknown) {
484
+ const errorMessage = getErrorMessage(error);
485
+ console.error(
486
+ `Error executing action "${action.name}":`,
487
+ errorMessage,
488
+ );
489
+ // Return screenshot + warning instead of hard error,
490
+ // so the AI agent can see current state and decide to retry or adjust strategy
491
+ return await captureFailureResult(agent, action.name, errorMessage);
345
492
  }
346
493
 
347
- return await captureScreenshotResult(agent, action.name);
494
+ return await captureScreenshotResult(
495
+ agent,
496
+ action.name,
497
+ actionResult,
498
+ );
348
499
  } catch (error: unknown) {
349
500
  // Connection/agent errors are still hard errors
350
501
  const errorMessage = getErrorMessage(error);
package/src/mcp/types.ts CHANGED
@@ -84,6 +84,10 @@ export interface BaseAgent {
84
84
  page?: {
85
85
  screenshotBase64(): Promise<string>;
86
86
  };
87
+ callActionInActionSpace?: (
88
+ actionName: string,
89
+ params?: unknown,
90
+ ) => Promise<unknown>;
87
91
  aiAction?: (
88
92
  description: string,
89
93
  params?: Record<string, unknown>,