@midscene/shared 1.7.5-beta-20260421030751.0 → 1.7.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/dist/es/cli/cli-args.mjs +95 -0
  2. package/dist/es/cli/cli-error.mjs +24 -0
  3. package/dist/es/cli/cli-runner.mjs +10 -40
  4. package/dist/es/cli/index.mjs +4 -2
  5. package/dist/es/constants/index.mjs +3 -2
  6. package/dist/es/key-alias-utils.mjs +19 -0
  7. package/dist/es/mcp/base-tools.mjs +45 -2
  8. package/dist/es/mcp/index.mjs +1 -0
  9. package/dist/es/mcp/init-arg-utils.mjs +38 -0
  10. package/dist/es/mcp/tool-generator.mjs +29 -11
  11. package/dist/lib/cli/cli-args.js +138 -0
  12. package/dist/lib/cli/cli-error.js +61 -0
  13. package/dist/lib/cli/cli-runner.js +19 -46
  14. package/dist/lib/cli/index.js +8 -3
  15. package/dist/lib/constants/index.js +5 -1
  16. package/dist/lib/key-alias-utils.js +62 -0
  17. package/dist/lib/mcp/base-tools.js +45 -2
  18. package/dist/lib/mcp/index.js +19 -12
  19. package/dist/lib/mcp/init-arg-utils.js +78 -0
  20. package/dist/lib/mcp/tool-generator.js +29 -11
  21. package/dist/types/cli/cli-args.d.ts +8 -0
  22. package/dist/types/cli/cli-error.d.ts +5 -0
  23. package/dist/types/cli/cli-runner.d.ts +4 -7
  24. package/dist/types/cli/index.d.ts +3 -1
  25. package/dist/types/constants/index.d.ts +1 -0
  26. package/dist/types/key-alias-utils.d.ts +9 -0
  27. package/dist/types/mcp/base-tools.d.ts +65 -5
  28. package/dist/types/mcp/index.d.ts +1 -0
  29. package/dist/types/mcp/init-arg-utils.d.ts +13 -0
  30. package/dist/types/mcp/tool-generator.d.ts +3 -3
  31. package/dist/types/mcp/types.d.ts +8 -0
  32. package/package.json +1 -1
  33. package/src/cli/cli-args.ts +173 -0
  34. package/src/cli/cli-error.ts +24 -0
  35. package/src/cli/cli-runner.ts +37 -56
  36. package/src/cli/index.ts +3 -7
  37. package/src/constants/index.ts +2 -0
  38. package/src/key-alias-utils.ts +23 -0
  39. package/src/mcp/base-tools.ts +164 -9
  40. package/src/mcp/index.ts +1 -0
  41. package/src/mcp/init-arg-utils.ts +105 -0
  42. package/src/mcp/tool-generator.ts +47 -10
  43. package/src/mcp/types.ts +10 -0
@@ -1,6 +1,13 @@
1
1
  import { parseBase64 } from '@midscene/shared/img';
2
2
  import { getDebug } from '@midscene/shared/logger';
3
3
  import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
4
+ import type { z } from 'zod';
5
+ import { camelToKebab, getKeyAliases } from '../key-alias-utils';
6
+ import {
7
+ createNamespacedInitArgSchema,
8
+ extractNamespacedArgs,
9
+ sanitizeNamespacedArgs,
10
+ } from './init-arg-utils';
4
11
  import {
5
12
  generateCommonTools,
6
13
  generateToolsFromActionSpace,
@@ -10,22 +17,68 @@ import type {
10
17
  BaseAgent,
11
18
  BaseDevice,
12
19
  IMidsceneTools,
20
+ ToolCliMetadata,
13
21
  ToolDefinition,
22
+ ToolSchema,
14
23
  } from './types';
15
24
 
16
25
  const debug = getDebug('mcp:base-tools');
17
26
 
18
27
  /**
19
- * Base class for platform-specific MCP tools
20
- * Generic type TAgent allows subclasses to use their specific agent types
28
+ * Declarative description of a platform's agent init args.
29
+ * Collapses the `extractAgentInitParam` / `sanitizeToolArgs` /
30
+ * `getAgentInitArgSchema` trio into a single data declaration.
31
+ */
32
export interface InitArgSpec<TInitParam> {
  /** Arg namespace, e.g. `android`, `ios`. Used as the prefix of dotted keys such as `android.deviceId`. */
  namespace: string;
  /** Zod shape describing the init args. Field names drive the MCP schema. */
  shape: Record<string, z.ZodTypeAny>;
  /**
   * Optional CLI presentation hints. These affect `--help` output for
   * single-platform CLIs but do not alter MCP/YAML protocol keys.
   */
  cli?: {
    /** Prefer bare `--device-id`-style options in platform CLI help output. */
    preferBareKeys?: boolean;
    /** Override the displayed option name for specific init arg fields (keyed by field name). */
    preferredNames?: Record<string, string>;
  };
  /**
   * Adapt extracted namespaced args into the concrete `TInitParam` passed to
   * `ensureAgent`. Defaults to returning the raw extracted record.
   *
   * `extracted` is `undefined` when none of the declared fields were present
   * in the incoming args, so implementations must handle that case.
   */
  adapt?: (
    extracted: Record<string, unknown> | undefined,
  ) => TInitParam | undefined;
}
55
+
56
+ /**
57
+ * Base class for platform-specific MCP tools.
58
+ * @typeParam TAgent - Platform-specific agent type.
59
+ * @typeParam TInitParam - Platform-specific init parameter consumed by
60
+ * `ensureAgent`. Defaults to `unknown`; platforms that take no init args can ignore it.
21
61
  */
22
- export abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent>
23
- implements IMidsceneTools
62
+ export abstract class BaseMidsceneTools<
63
+ TAgent extends BaseAgent = BaseAgent,
64
+ TInitParam = unknown,
65
+ > implements IMidsceneTools
24
66
  {
25
67
  protected mcpServer?: McpServer;
26
68
  protected agent?: TAgent;
27
69
  protected toolDefinitions: ToolDefinition[] = [];
28
70
 
71
+ /**
72
+ * Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
73
+ * set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
74
+ * `getAgentInitArgSchema` auto-implemented.
75
+ *
76
+ * Declared with `declare` so that TS doesn't emit an `Object.defineProperty`
77
+ * for this field on the base constructor, which would otherwise overwrite
78
+ * a subclass field initializer under `useDefineForClassFields`.
79
+ */
80
+ protected declare readonly initArgSpec?: InitArgSpec<TInitParam>;
81
+
29
82
  /**
30
83
  * Ensure agent is initialized and ready for use.
31
84
  * Must be implemented by subclasses to create platform-specific agent.
@@ -33,7 +86,102 @@ export abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent>
33
86
  * @returns Promise resolving to initialized agent instance
34
87
  * @throws Error if agent initialization fails
35
88
  */
36
- protected abstract ensureAgent(initParam?: string): Promise<TAgent>;
89
+ protected abstract ensureAgent(initParam?: TInitParam): Promise<TAgent>;
90
+
91
/**
 * List the field names declared in `initArgSpec.shape`.
 *
 * @returns The shape's keys, or an empty array when no init-arg spec is set.
 */
private getInitArgKeys(): readonly string[] {
  const spec = this.initArgSpec;
  if (!spec) {
    return [];
  }
  return Object.keys(spec.shape);
}
94
+
95
/**
 * Derive the init parameter handed to `ensureAgent` from raw CLI/MCP tool args.
 *
 * @param args - Raw tool arguments as received by a tool handler.
 * @returns `undefined` when no `initArgSpec` is declared. Otherwise the result
 * of `initArgSpec.adapt` if one is provided, or the extracted record itself.
 */
protected extractAgentInitParam(
  args: Record<string, unknown>,
): TInitParam | undefined {
  const spec = this.initArgSpec;
  if (!spec) {
    return undefined;
  }

  const rawInitArgs = extractNamespacedArgs(
    args,
    spec.namespace,
    this.getInitArgKeys(),
  );

  // Without an adapter the extracted record is used as the init param as-is.
  return spec.adapt
    ? spec.adapt(rawInitArgs)
    : (rawInitArgs as TInitParam | undefined);
}
114
+
115
/**
 * Strip platform init args from a tool payload so that only the action's own
 * parameters are forwarded to the action.
 *
 * @param args - Raw tool arguments.
 * @returns `args` untouched when no `initArgSpec` is declared, otherwise a
 * copy produced by `sanitizeNamespacedArgs`.
 */
protected sanitizeToolArgs(
  args: Record<string, unknown>,
): Record<string, unknown> {
  const spec = this.initArgSpec;
  return spec
    ? sanitizeNamespacedArgs(args, spec.namespace, this.getInitArgKeys())
    : args;
}
130
+
131
/**
 * Build the schema fragment that exposes platform init args on action and
 * common tool schemas.
 *
 * @returns An empty schema when no `initArgSpec` is declared, otherwise the
 * namespaced schema derived from `initArgSpec.shape`.
 */
protected getAgentInitArgSchema(): ToolSchema {
  const spec = this.initArgSpec;
  if (!spec) {
    return {};
  }
  return createNamespacedInitArgSchema(spec.namespace, spec.shape);
}
143
+
144
/**
 * Produce CLI-only metadata for the platform init args so single-platform
 * help can show ergonomic bare flags while the underlying schema stays
 * namespaced. When `preferBareKeys` is enabled, single-platform CLIs only
 * accept the bare spellings; namespaced dotted spellings remain available
 * through the MCP/YAML schema instead of the platform CLI surface.
 *
 * @returns `undefined` when the spec declares no `cli` hints; otherwise one
 * entry per init arg, keyed by its canonical `<namespace>.<field>` name.
 */
protected getAgentInitArgCliMetadata(): ToolCliMetadata | undefined {
  const spec = this.initArgSpec;
  const cliHints = spec?.cli;
  if (!spec || !cliHints) {
    return undefined;
  }

  const options: NonNullable<ToolCliMetadata['options']> = {};

  for (const field of this.getInitArgKeys()) {
    const canonicalKey = `${spec.namespace}.${field}`;

    // An explicit per-field override wins; otherwise pick bare or dotted form.
    let preferredName: string;
    const override = cliHints.preferredNames?.[field];
    if (override !== undefined && override !== null) {
      preferredName = override;
    } else if (cliHints.preferBareKeys) {
      preferredName = camelToKebab(field);
    } else {
      preferredName = canonicalKey;
    }

    const aliasSource = cliHints.preferBareKeys
      ? getKeyAliases(field)
      : getKeyAliases(canonicalKey);

    // De-duplicate while keeping order, and never list the preferred name
    // among its own aliases.
    const aliases = [...new Set<string>(aliasSource)].filter(
      (name) => name !== preferredName,
    );

    options[canonicalKey] = { preferredName, aliases };
  }

  return { options };
}
37
185
 
38
186
  /**
39
187
  * Optional: prepare platform-specific tools (e.g., device connection)
@@ -83,13 +231,20 @@ export abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent>
83
231
  }
84
232
 
85
233
  // 3. Generate tools from action space (core innovation)
86
- const actionTools = generateToolsFromActionSpace(actionSpace, () =>
87
- this.ensureAgent(),
234
+ const actionTools = generateToolsFromActionSpace(
235
+ actionSpace,
236
+ (args = {}) => this.ensureAgent(this.extractAgentInitParam(args)),
237
+ (args = {}) => this.sanitizeToolArgs(args),
238
+ this.getAgentInitArgSchema(),
239
+ this.getAgentInitArgCliMetadata(),
88
240
  );
89
241
 
90
242
  // 4. Add common tools (screenshot, waitFor)
91
- const commonTools = generateCommonTools(() => this.ensureAgent());
92
-
243
+ const commonTools = generateCommonTools(
244
+ (args = {}) => this.ensureAgent(this.extractAgentInitParam(args)),
245
+ this.getAgentInitArgSchema(),
246
+ this.getAgentInitArgCliMetadata(),
247
+ );
93
248
  this.toolDefinitions.push(...actionTools, ...commonTools);
94
249
 
95
250
  debug('Total tools prepared:', this.toolDefinitions.length);
package/src/mcp/index.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export * from './base-server';
2
2
  export * from './base-tools';
3
+ export * from './init-arg-utils';
3
4
  export * from './error-formatter';
4
5
  export * from './tool-generator';
5
6
  export * from './types';
@@ -0,0 +1,105 @@
1
+ import type { z } from 'zod';
2
+ import { getKeyAliases, isRecord } from '../key-alias-utils';
3
+ import type { ToolSchema } from './types';
4
+
5
/**
 * Look up `key` in `args` under any of its accepted alias spellings.
 *
 * Aliases come from `getKeyAliases(key)` and are tried in the order returned.
 * Only own properties are considered, so inherited members such as
 * `toString` or `constructor` are never mistaken for user input. An alias
 * that is present but explicitly `undefined` does not shadow a later alias
 * that carries a real value, matching how callers treat `undefined` as
 * "absent".
 *
 * @param args - Argument record to search.
 * @param key - Canonical key whose aliases should be tried.
 * @returns The first defined own value, or `undefined` when none is present.
 */
function readAliasedValue(
  args: Record<string, unknown>,
  key: string,
): unknown | undefined {
  for (const alias of getKeyAliases(key)) {
    // `in` would also match properties inherited from Object.prototype.
    if (!Object.prototype.hasOwnProperty.call(args, alias)) {
      continue;
    }

    const value = args[alias];
    if (value !== undefined) {
      return value;
    }
  }

  return undefined;
}
17
+
18
/**
 * Resolve a single init arg from `args`, trying the supported input shapes in
 * priority order and returning the first value that is not `undefined`.
 *
 * @param args - Raw tool arguments.
 * @param namespace - Platform namespace, e.g. `android`.
 * @param key - Field name inside the namespace.
 * @returns The resolved value, or `undefined` when no shape provides one.
 */
function readNamespacedArg(
  args: Record<string, unknown>,
  namespace: string,
  key: string,
): unknown | undefined {
  // Priority: nested namespace object, then flat dotted key, then bare key.
  // Namespace-aware spellings go first so that a top-level bare `deviceId`
  // cannot override a value that was explicitly scoped to this platform.
  const lookups: Array<() => unknown> = [
    () => {
      const scoped = readAliasedValue(args, namespace);
      return isRecord(scoped) ? readAliasedValue(scoped, key) : undefined;
    },
    () => readAliasedValue(args, `${namespace}.${key}`),
    () => readAliasedValue(args, key),
  ];

  for (const lookup of lookups) {
    const found = lookup();
    if (found !== undefined) {
      return found;
    }
  }

  return undefined;
}
47
+
48
/**
 * Collect the values of the given init-arg fields from `args`.
 *
 * @param args - Raw tool arguments.
 * @param namespace - Platform namespace the fields belong to.
 * @param keys - Field names to look up.
 * @returns A record holding every field that resolved to a value other than
 * `undefined`, or `undefined` when no field resolved at all.
 */
export function extractNamespacedArgs<
  TFieldName extends string,
  TArgs extends Record<string, unknown> = Record<string, unknown>,
>(
  args: Record<string, unknown>,
  namespace: string,
  keys: readonly TFieldName[],
): TArgs | undefined {
  const collected: Record<string, unknown> = {};

  keys.forEach((fieldName) => {
    const resolved = readNamespacedArg(args, namespace, fieldName);
    if (resolved === undefined) {
      return;
    }
    collected[fieldName] = resolved;
  });

  const hasAnyField = Object.keys(collected).length !== 0;
  return hasAnyField ? (collected as TArgs) : undefined;
}
67
+
68
/**
 * Return a copy of `args` without any entry that spells a platform init arg.
 *
 * Removed keys are every alias of the namespace itself, of each bare field
 * name, and of each dotted `<namespace>.<field>` name.
 *
 * @param args - Raw tool arguments.
 * @param namespace - Platform namespace, e.g. `android`.
 * @param keys - Init-arg field names declared for the namespace.
 * @returns A new record containing only the non-init-arg entries.
 */
export function sanitizeNamespacedArgs(
  args: Record<string, unknown>,
  namespace: string,
  keys: readonly string[],
): Record<string, unknown> {
  const blocked = new Set<string>([
    ...getKeyAliases(namespace),
    ...keys.flatMap((field) => [
      ...getKeyAliases(field),
      ...getKeyAliases(`${namespace}.${field}`),
    ]),
  ]);

  const keptEntries = Object.entries(args).filter(
    ([name]) => !blocked.has(name),
  );
  return Object.fromEntries(keptEntries);
}
89
+
90
/**
 * Build a flat MCP tool schema whose keys are dotted `"<namespace>.<field>"`.
 *
 * The schema is deliberately flat (not `{ namespace: z.object({...}) }`) so
 * that CLI flags (`--android.device-id`), MCP clients, and `--help` output
 * all share one spelling. `readNamespacedArg` still accepts all three input
 * shapes: nested namespace object, dotted flat key, and bare key fallback.
 *
 * @param namespace - Prefix applied to every field name.
 * @param shape - Zod shape keyed by bare field name.
 * @returns A schema with one `"<namespace>.<field>"` entry per shape field.
 */
export function createNamespacedInitArgSchema(
  namespace: string,
  shape: Record<string, z.ZodTypeAny>,
): ToolSchema {
  const schema: ToolSchema = {};
  for (const [field, fieldType] of Object.entries(shape)) {
    schema[`${namespace}.${field}`] = fieldType;
  }
  return schema;
}
@@ -10,8 +10,10 @@ import { getErrorMessage } from './error-formatter';
10
10
  import type {
11
11
  ActionSpaceItem,
12
12
  BaseAgent,
13
+ ToolCliMetadata,
13
14
  ToolDefinition,
14
15
  ToolResult,
16
+ ToolSchema,
15
17
  } from './types';
16
18
 
17
19
  /**
@@ -447,25 +449,49 @@ async function captureFailureResult(
447
449
  }
448
450
  }
449
451
 
452
/**
 * Combine the CLI option metadata of two sources into one.
 *
 * @param base - Metadata applied first.
 * @param extra - Metadata applied second; its entries win on key collisions.
 * @returns The merged metadata, or `undefined` when neither source
 * contributes any option.
 */
function mergeToolCliMetadata(
  base?: ToolCliMetadata,
  extra?: ToolCliMetadata,
): ToolCliMetadata | undefined {
  const baseOptions = base?.options;
  const extraOptions = extra?.options;
  const combined = { ...baseOptions, ...extraOptions };

  if (Object.keys(combined).length === 0) {
    return undefined;
  }
  return { options: combined };
}
463
+
450
464
  /**
451
465
  * Converts DeviceAction from actionSpace into MCP ToolDefinition
452
466
  * This is the core logic that removes need for hardcoded tool definitions
453
467
  */
454
468
  export function generateToolsFromActionSpace(
455
469
  actionSpace: ActionSpaceItem[],
456
- getAgent: () => Promise<BaseAgent>,
470
+ getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>,
471
+ sanitizeArgs: (args: Record<string, unknown>) => Record<string, unknown> = (
472
+ args,
473
+ ) => args,
474
+ initArgSchema: ToolSchema = {},
475
+ initArgCliMetadata?: ToolCliMetadata,
457
476
  ): ToolDefinition[] {
458
477
  return actionSpace.map((action) => {
459
- const schema = extractActionSchema(action.paramSchema as z.ZodTypeAny);
478
+ const schema = {
479
+ ...extractActionSchema(action.paramSchema as z.ZodTypeAny),
480
+ ...initArgSchema,
481
+ };
460
482
 
461
483
  return {
462
484
  name: action.name,
463
485
  description: describeActionForMCP(action),
464
486
  schema,
487
+ cli: initArgCliMetadata,
465
488
  handler: async (args: Record<string, unknown>) => {
466
489
  try {
467
- const agent = await getAgent();
468
- const normalizedArgs = normalizeActionArgs(args, action.paramSchema);
490
+ const agent = await getAgent(args);
491
+ const normalizedArgs = normalizeActionArgs(
492
+ sanitizeArgs(args),
493
+ action.paramSchema,
494
+ );
469
495
  let actionResult: unknown;
470
496
 
471
497
  try {
@@ -507,16 +533,23 @@ export function generateToolsFromActionSpace(
507
533
  * Generate common tools (screenshot, act)
508
534
  */
509
535
  export function generateCommonTools(
510
- getAgent: () => Promise<BaseAgent>,
536
+ getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>,
537
+ initArgSchema: ToolSchema = {},
538
+ initArgCliMetadata?: ToolCliMetadata,
511
539
  ): ToolDefinition[] {
512
540
  return [
513
541
  {
514
542
  name: 'take_screenshot',
515
543
  description: 'Capture screenshot of current page/screen',
516
- schema: {},
517
- handler: async (): Promise<ToolResult> => {
544
+ schema: {
545
+ ...initArgSchema,
546
+ },
547
+ cli: initArgCliMetadata,
548
+ handler: async (
549
+ args: Record<string, unknown> = {},
550
+ ): Promise<ToolResult> => {
518
551
  try {
519
- const agent = await getAgent();
552
+ const agent = await getAgent(args);
520
553
  const screenshot = await agent.page?.screenshotBase64();
521
554
  if (!screenshot) {
522
555
  return createErrorResult('Screenshot not available');
@@ -544,11 +577,15 @@ export function generateCommonTools(
544
577
  .describe(
545
578
  'Natural language description of the action to perform, e.g. "press Command+Space, type Safari, press Enter"',
546
579
  ),
580
+ ...initArgSchema,
547
581
  },
548
- handler: async (args: Record<string, unknown>): Promise<ToolResult> => {
582
+ cli: mergeToolCliMetadata(undefined, initArgCliMetadata),
583
+ handler: async (
584
+ args: Record<string, unknown> = {},
585
+ ): Promise<ToolResult> => {
549
586
  const prompt = args.prompt as string;
550
587
  try {
551
- const agent = await getAgent();
588
+ const agent = await getAgent(args);
552
589
  if (!agent.aiAction) {
553
590
  return createErrorResult('act is not supported by this agent');
554
591
  }
package/src/mcp/types.ts CHANGED
@@ -47,6 +47,15 @@ export type ToolHandler<T = Record<string, unknown>> = (
47
47
  */
48
48
  export type ToolSchema = Record<string, z.ZodTypeAny>;
49
49
 
50
/**
 * CLI presentation hints for a single tool schema option.
 */
export interface ToolCliOption {
  /** Option name to display in CLI help instead of the schema key. */
  preferredName?: string;
  /** Additional accepted spellings for the option, excluding `preferredName`. */
  aliases?: string[];
}
54
+
55
/**
 * CLI-only metadata attached to a tool definition. It does not change the
 * tool's schema keys.
 */
export interface ToolCliMetadata {
  /** Per-option hints, keyed by the option's schema key (e.g. `android.deviceId`). */
  options?: Record<string, ToolCliOption>;
}
58
+
50
59
  /**
51
60
  * Tool definition for MCP server
52
61
  */
@@ -55,6 +64,7 @@ export interface ToolDefinition<T = Record<string, unknown>> {
55
64
  description: string;
56
65
  schema: ToolSchema;
57
66
  handler: ToolHandler<T>;
67
+ cli?: ToolCliMetadata;
58
68
  }
59
69
 
60
70
  /**