@midscene/shared 1.9.7 → 1.9.8-beta-20260618014851.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (83) hide show
  1. package/dist/es/agent-tools/agent-behavior-init-args.mjs +44 -0
  2. package/dist/es/{mcp → agent-tools}/base-tools.mjs +1 -8
  3. package/dist/es/{mcp → agent-tools}/chrome-path.mjs +14 -3
  4. package/dist/es/{mcp → agent-tools}/index.mjs +1 -3
  5. package/dist/es/{mcp → agent-tools}/tool-generator.mjs +6 -5
  6. package/dist/es/cli/cli-runner.mjs +1 -1
  7. package/dist/es/env/parse-model-config.mjs +1 -1
  8. package/dist/es/env/types.mjs +3 -5
  9. package/dist/es/img/index.mjs +2 -2
  10. package/dist/es/img/transform.mjs +18 -1
  11. package/dist/es/utils.mjs +2 -6
  12. package/dist/lib/agent-tools/agent-behavior-init-args.js +87 -0
  13. package/dist/lib/{mcp → agent-tools}/base-tools.js +1 -8
  14. package/dist/lib/{mcp → agent-tools}/chrome-path.js +13 -2
  15. package/dist/lib/{mcp → agent-tools}/index.js +10 -24
  16. package/dist/lib/{mcp → agent-tools}/tool-generator.js +6 -5
  17. package/dist/lib/cli/cli-runner.js +1 -1
  18. package/dist/lib/env/parse-model-config.js +1 -1
  19. package/dist/lib/env/types.js +5 -10
  20. package/dist/lib/img/index.js +3 -0
  21. package/dist/lib/img/transform.js +20 -0
  22. package/dist/lib/utils.js +8 -15
  23. package/dist/types/agent-tools/agent-behavior-init-args.d.ts +17 -0
  24. package/dist/types/{mcp → agent-tools}/base-tools.d.ts +7 -13
  25. package/dist/types/{mcp → agent-tools}/index.d.ts +1 -3
  26. package/dist/types/{mcp → agent-tools}/init-arg-utils.d.ts +3 -3
  27. package/dist/types/{mcp → agent-tools}/tool-defaults.d.ts +5 -6
  28. package/dist/types/{mcp → agent-tools}/tool-generator.d.ts +1 -1
  29. package/dist/types/{mcp → agent-tools}/types.d.ts +20 -13
  30. package/dist/types/cli/cli-args.d.ts +1 -1
  31. package/dist/types/cli/cli-runner.d.ts +2 -2
  32. package/dist/types/env/types.d.ts +8 -6
  33. package/dist/types/img/index.d.ts +1 -1
  34. package/dist/types/img/transform.d.ts +4 -0
  35. package/dist/types/key-alias-utils.d.ts +2 -2
  36. package/dist/types/utils.d.ts +0 -1
  37. package/package.json +15 -8
  38. package/src/agent-tools/agent-behavior-init-args.ts +109 -0
  39. package/src/{mcp → agent-tools}/base-tools.ts +8 -33
  40. package/src/{mcp → agent-tools}/chrome-path.ts +20 -3
  41. package/src/{mcp → agent-tools}/index.ts +1 -3
  42. package/src/{mcp → agent-tools}/init-arg-utils.ts +3 -3
  43. package/src/{mcp → agent-tools}/tool-defaults.ts +5 -6
  44. package/src/{mcp → agent-tools}/tool-generator.ts +14 -7
  45. package/src/{mcp → agent-tools}/types.ts +22 -10
  46. package/src/cli/cli-args.ts +1 -1
  47. package/src/cli/cli-runner.ts +4 -4
  48. package/src/env/types.ts +5 -5
  49. package/src/img/index.ts +2 -0
  50. package/src/img/transform.ts +45 -0
  51. package/src/key-alias-utils.ts +2 -2
  52. package/src/utils.ts +1 -10
  53. package/dist/es/mcp/base-server.mjs +0 -295
  54. package/dist/es/mcp/inject-report-html-plugin.mjs +0 -53
  55. package/dist/es/mcp/launcher-helper.mjs +0 -52
  56. package/dist/lib/mcp/base-server.js +0 -345
  57. package/dist/lib/mcp/inject-report-html-plugin.js +0 -98
  58. package/dist/lib/mcp/launcher-helper.js +0 -86
  59. package/dist/types/mcp/base-server.d.ts +0 -106
  60. package/dist/types/mcp/inject-report-html-plugin.d.ts +0 -18
  61. package/dist/types/mcp/launcher-helper.d.ts +0 -94
  62. package/src/mcp/base-server.ts +0 -529
  63. package/src/mcp/inject-report-html-plugin.ts +0 -119
  64. package/src/mcp/launcher-helper.ts +0 -200
  65. /package/dist/es/{mcp → agent-tools}/cli-report-session.mjs +0 -0
  66. /package/dist/es/{mcp → agent-tools}/error-formatter.mjs +0 -0
  67. /package/dist/es/{mcp → agent-tools}/init-arg-utils.mjs +0 -0
  68. /package/dist/es/{mcp → agent-tools}/tool-defaults.mjs +0 -0
  69. /package/dist/es/{mcp → agent-tools}/types.mjs +0 -0
  70. /package/dist/es/{mcp → agent-tools}/user-prompt.mjs +0 -0
  71. /package/dist/lib/{mcp → agent-tools}/cli-report-session.js +0 -0
  72. /package/dist/lib/{mcp → agent-tools}/error-formatter.js +0 -0
  73. /package/dist/lib/{mcp → agent-tools}/init-arg-utils.js +0 -0
  74. /package/dist/lib/{mcp → agent-tools}/tool-defaults.js +0 -0
  75. /package/dist/lib/{mcp → agent-tools}/types.js +0 -0
  76. /package/dist/lib/{mcp → agent-tools}/user-prompt.js +0 -0
  77. /package/dist/types/{mcp → agent-tools}/chrome-path.d.ts +0 -0
  78. /package/dist/types/{mcp → agent-tools}/cli-report-session.d.ts +0 -0
  79. /package/dist/types/{mcp → agent-tools}/error-formatter.d.ts +0 -0
  80. /package/dist/types/{mcp → agent-tools}/user-prompt.d.ts +0 -0
  81. /package/src/{mcp → agent-tools}/cli-report-session.ts +0 -0
  82. /package/src/{mcp → agent-tools}/error-formatter.ts +0 -0
  83. /package/src/{mcp → agent-tools}/user-prompt.ts +0 -0
@@ -0,0 +1,109 @@
1
+ import { z } from 'zod';
2
+
3
+ export interface AgentBehaviorInitArgs {
4
+ aiActContext?: string;
5
+ aiActionContext?: string;
6
+ replanningCycleLimit?: number;
7
+ waitAfterAction?: number;
8
+ screenshotShrinkFactor?: number;
9
+ }
10
+
11
+ type ExposedAgentBehaviorInitArgKey = Exclude<
12
+ keyof AgentBehaviorInitArgs,
13
+ 'aiActionContext'
14
+ >;
15
+
16
+ export const agentBehaviorInitArgShape = {
17
+ aiActContext: z
18
+ .string()
19
+ .optional()
20
+ .describe(
21
+ 'Background knowledge passed to aiAct. Default: no extra context.',
22
+ ),
23
+ replanningCycleLimit: z
24
+ .number()
25
+ .int()
26
+ .nonnegative()
27
+ .optional()
28
+ .describe(
29
+ 'Maximum number of replanning cycles for aiAct. Default: model adapter default.',
30
+ ),
31
+ waitAfterAction: z
32
+ .number()
33
+ .nonnegative()
34
+ .optional()
35
+ .describe(
36
+ 'Wait time in milliseconds after each action execution. Default: 300ms.',
37
+ ),
38
+ screenshotShrinkFactor: z
39
+ .number()
40
+ .min(1)
41
+ .optional()
42
+ .describe(
43
+ 'Screenshot shrink factor before sending images to AI. Default: 1; high values may reduce recognition quality, especially on mobile.',
44
+ ),
45
+ } satisfies Record<ExposedAgentBehaviorInitArgKey, z.ZodTypeAny>;
46
+
47
/**
 * Copy the recognised agent-behavior options out of a loosely typed bag.
 *
 * A field is kept only when its runtime type matches the declared one
 * (`string` for the two context fields, `number` for the remaining three);
 * any other value, and any unrelated key, is dropped silently.
 *
 * @param extracted - Candidate options, or `undefined` when none were given.
 * @returns An object holding the fields that passed the type check, or
 * `undefined` when the input is missing or no field survived.
 */
export function extractAgentBehaviorInitArgs(
  extracted: Partial<AgentBehaviorInitArgs> | undefined,
): AgentBehaviorInitArgs | undefined {
  if (!extracted) {
    return undefined;
  }

  const {
    aiActContext,
    aiActionContext,
    replanningCycleLimit,
    waitAfterAction,
    screenshotShrinkFactor,
  } = extracted;
  const picked: AgentBehaviorInitArgs = {};

  // Assignments follow the interface's field order so the resulting key
  // order is the same regardless of how the input was built.
  if (typeof aiActContext === 'string') {
    picked.aiActContext = aiActContext;
  }
  if (typeof aiActionContext === 'string') {
    picked.aiActionContext = aiActionContext;
  }
  if (typeof replanningCycleLimit === 'number') {
    picked.replanningCycleLimit = replanningCycleLimit;
  }
  if (typeof waitAfterAction === 'number') {
    picked.waitAfterAction = waitAfterAction;
  }
  if (typeof screenshotShrinkFactor === 'number') {
    picked.screenshotShrinkFactor = screenshotShrinkFactor;
  }

  // An empty result is reported as "no options" rather than `{}`.
  return Object.keys(picked).length === 0 ? undefined : picked;
}
74
+
75
/**
 * Recursively rebuild a JSON-like value so that object keys appear in a
 * deterministic order, making `JSON.stringify` output independent of the
 * order in which the input's properties were inserted.
 *
 * - Arrays keep their element order; every element is normalised.
 * - Any other non-null object is rebuilt as a plain object from its own
 *   enumerable string-keyed entries, sorted by key. The prototype (and with
 *   it any `toJSON` method) is not carried over.
 * - Everything else (primitives, `null`, `undefined`, functions) is returned
 *   unchanged.
 *
 * Keys are ordered by UTF-16 code unit rather than with `localeCompare`, so
 * the ordering does not depend on the runtime's default locale or ICU data.
 *
 * @param value - Any value; only arrays and objects are traversed.
 * @returns The normalised copy (or the value itself when it is not traversed).
 */
function stableJsonValue(value: unknown): unknown {
  if (Array.isArray(value)) {
    return value.map((item) => stableJsonValue(item));
  }

  if (value !== null && typeof value === 'object') {
    const sortedEntries = Object.entries(value as Record<string, unknown>)
      .sort(([left], [right]) => {
        // Plain relational comparison on strings compares code units, which
        // is locale-independent.
        if (left < right) {
          return -1;
        }
        return left > right ? 1 : 0;
      })
      .map(([key, nestedValue]): [string, unknown] => [
        key,
        stableJsonValue(nestedValue),
      ]);
    return Object.fromEntries(sortedEntries);
  }

  return value;
}
90
+
91
/**
 * Build a canonical string signature for a set of agent init args.
 *
 * The args are normalised with `stableJsonValue` and then serialised with
 * `JSON.stringify`, so two objects with the same content but different
 * property order produce the same signature.
 *
 * @param initArgs - The init args to sign, or `undefined` when there are none.
 * @returns The signature, or `undefined` when there is nothing to sign: the
 * input is missing, has no own enumerable keys, or serialises to an empty
 * object because every property is dropped by `JSON.stringify`.
 */
export function getAgentInitArgsSignature(
  initArgs: object | undefined,
): string | undefined {
  if (!initArgs || Object.keys(initArgs).length === 0) {
    return undefined;
  }

  const signature = JSON.stringify(stableJsonValue(initArgs));

  // JSON.stringify omits properties whose value is `undefined` (or a
  // function), so an input such as `{ deviceId: undefined }` passes the key
  // count check above yet serialises to "{}". Report it as "no init args" so
  // it compares equal to a missing args object.
  return signature === '{}' ? undefined : signature;
}
100
+
101
/**
 * Decide whether an agent must be rebuilt because its init args changed.
 *
 * @param currentSignature - Signature the existing agent was built with, or
 * `undefined` when it was built without init args.
 * @param nextSignature - Signature of the incoming init args, or `undefined`
 * when none were supplied.
 * @returns `true` when the two signatures differ, `false` when they are the
 * same string or both `undefined`.
 */
export function shouldRebuildAgentForInitArgs(
  currentSignature: string | undefined,
  nextSignature: string | undefined,
): boolean {
  // Two values that differ can never both be `undefined`, so a strict
  // inequality already covers the "both absent" case.
  return currentSignature !== nextSignature;
}
@@ -1,6 +1,5 @@
1
1
  import { parseBase64 } from '@midscene/shared/img';
2
2
  import { getDebug } from '@midscene/shared/logger';
3
- import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
4
3
  import type { z } from 'zod';
5
4
  import { camelToKebab, getKeyAliases } from '../key-alias-utils';
6
5
  import {
@@ -29,7 +28,7 @@ import type {
29
28
  ToolSchema,
30
29
  } from './types';
31
30
 
32
- const debug = getDebug('mcp:base-tools');
31
+ const debug = getDebug('agent-tools:base-tools');
33
32
 
34
33
  /**
35
34
  * Declarative description of a platform's agent init args.
@@ -39,11 +38,11 @@ const debug = getDebug('mcp:base-tools');
39
38
  export interface InitArgSpec<TInitParam> {
40
39
  /** Arg namespace, e.g. `android`, `ios`. */
41
40
  namespace: string;
42
- /** Zod shape describing the init args. Field names drive the MCP schema. */
41
+ /** Zod shape describing the init args. Field names drive the tool schema. */
43
42
  shape: Record<string, z.ZodTypeAny>;
44
43
  /**
45
44
  * Optional CLI presentation hints. These affect `--help` output for
46
- * single-platform CLIs but do not alter MCP/YAML protocol keys.
45
+ * single-platform CLIs but do not alter YAML protocol keys.
47
46
  */
48
47
  cli?: {
49
48
  /** Prefer bare `--device-id`-style options in platform CLI help output. */
@@ -61,7 +60,7 @@ export interface InitArgSpec<TInitParam> {
61
60
  }
62
61
 
63
62
  /**
64
- * Base class for platform-specific MCP tools.
63
+ * Base class for platform-specific Midscene tools.
65
64
  * @typeParam TAgent - Platform-specific agent type.
66
65
  * @typeParam TInitParam - Platform-specific init parameter consumed by
67
66
  * `ensureAgent`. Defaults to `undefined` for platforms that take no args.
@@ -71,20 +70,19 @@ export abstract class BaseMidsceneTools<
71
70
  TInitParam = unknown,
72
71
  > implements IMidsceneTools
73
72
  {
74
- protected mcpServer?: McpServer;
75
73
  protected agent?: TAgent;
76
74
  protected toolDefinitions: ToolDefinition[] = [];
77
75
 
78
76
  /**
79
77
  * Default options injected into every generated tool call (e.g. forced deep
80
- * locate / deep think). Set from server/CLI behavior flags before
78
+ * locate / deep think). Set from startup/CLI behavior flags before
81
79
  * `initTools()` so they are baked into the generated tool handlers.
82
80
  * See https://github.com/web-infra-dev/midscene/issues/2446.
83
81
  */
84
82
  protected toolDefaults: ToolDefaults = {};
85
83
 
86
84
  /**
87
- * Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
85
+ * Declarative init-arg spec. Subclasses that accept CLI init args should
88
86
  * set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
89
87
  * `getAgentInitArgSchema` auto-implemented.
90
88
  *
@@ -108,7 +106,7 @@ export abstract class BaseMidsceneTools<
108
106
  }
109
107
 
110
108
  /**
111
- * Extract a platform-specific agent init parameter from CLI/MCP tool args.
109
+ * Extract a platform-specific agent init parameter from CLI tool args.
112
110
  */
113
111
  protected extractAgentInitParam(
114
112
  args: Record<string, unknown>,
@@ -161,7 +159,7 @@ export abstract class BaseMidsceneTools<
161
159
  * show ergonomic bare flags while the underlying schema stays namespaced.
162
160
  * When `preferBareKeys` is enabled, single-platform CLIs only accept the
163
161
  * bare spellings; namespaced dotted spellings remain available through the
164
- * MCP/YAML schema instead of the platform CLI surface.
162
+ * YAML schema instead of the platform CLI surface.
165
163
  */
166
164
  protected getAgentInitArgCliMetadata(): ToolCliMetadata | undefined {
167
165
  if (!this.initArgSpec?.cli) {
@@ -272,7 +270,6 @@ export abstract class BaseMidsceneTools<
272
270
  this.toolDefinitions.push(...platformTools);
273
271
 
274
272
  // 2. Get action space: use pre-set agent if available, otherwise temp device.
275
- // When called via mcpKitForAgent(), agent is set before initTools().
276
273
  // For CLI usage, agent is deferred to the first real command.
277
274
  let actionSpace: ActionSpaceItem[];
278
275
  if (this.agent) {
@@ -313,28 +310,6 @@ export abstract class BaseMidsceneTools<
313
310
  debug('Total tools prepared:', this.toolDefinitions.length);
314
311
  }
315
312
 
316
- /**
317
- * Attach to MCP server and register all tools
318
- */
319
- public attachToServer(server: McpServer): void {
320
- this.mcpServer = server;
321
-
322
- if (this.toolDefinitions.length === 0) {
323
- debug('Warning: No tools to register. Tools may be initialized lazily.');
324
- }
325
-
326
- for (const toolDef of this.toolDefinitions) {
327
- this.mcpServer.tool(
328
- toolDef.name,
329
- toolDef.description,
330
- toolDef.schema,
331
- toolDef.handler,
332
- );
333
- }
334
-
335
- debug('Registered', this.toolDefinitions.length, 'tools');
336
- }
337
-
338
313
  /**
339
314
  * Cleanup method - destroy agent and release resources
340
315
  */
@@ -1,5 +1,13 @@
1
1
  import { existsSync } from 'node:fs';
2
- import { MIDSCENE_MCP_CHROME_PATH, globalConfigManager } from '../env';
2
+ import {
3
+ MIDSCENE_CHROME_PATH,
4
+ MIDSCENE_MCP_CHROME_PATH,
5
+ globalConfigManager,
6
+ } from '../env';
7
+ import { getDebug } from '../logger';
8
+
9
+ const warnChromePath = getDebug('agent-tools:chrome-path', { console: true });
10
+ let hasWarnedLegacyChromePath = false;
3
11
 
4
12
  export function getSystemChromePath(): string | undefined {
5
13
  const platform = process.platform;
@@ -33,9 +41,18 @@ export function getSystemChromePath(): string | undefined {
33
41
  }
34
42
 
35
43
  export function resolveChromePath(): string {
36
- const envPath = globalConfigManager.getEnvConfigValue(
44
+ const primaryEnvPath =
45
+ globalConfigManager.getEnvConfigValue(MIDSCENE_CHROME_PATH);
46
+ const legacyEnvPath = globalConfigManager.getEnvConfigValue(
37
47
  MIDSCENE_MCP_CHROME_PATH,
38
48
  );
49
+ const envPath = primaryEnvPath || legacyEnvPath;
50
+ if (!primaryEnvPath && legacyEnvPath && !hasWarnedLegacyChromePath) {
51
+ warnChromePath(
52
+ 'MIDSCENE_MCP_CHROME_PATH is deprecated. Use MIDSCENE_CHROME_PATH instead.',
53
+ );
54
+ hasWarnedLegacyChromePath = true;
55
+ }
39
56
  if (envPath && envPath !== 'auto' && existsSync(envPath)) {
40
57
  return envPath;
41
58
  }
@@ -43,6 +60,6 @@ export function resolveChromePath(): string {
43
60
  if (systemPath) return systemPath;
44
61
 
45
62
  throw new Error(
46
- 'Chrome not found. Install Google Chrome or set MIDSCENE_MCP_CHROME_PATH environment variable.',
63
+ 'Chrome not found. Install Google Chrome or set MIDSCENE_CHROME_PATH environment variable.',
47
64
  );
48
65
  }
@@ -1,10 +1,8 @@
1
- export * from './base-server';
2
1
  export * from './base-tools';
3
2
  export * from './tool-defaults';
3
+ export * from './agent-behavior-init-args';
4
4
  export * from './init-arg-utils';
5
5
  export * from './error-formatter';
6
6
  export * from './tool-generator';
7
7
  export * from './types';
8
- export * from './inject-report-html-plugin';
9
- export * from './launcher-helper';
10
8
  export * from './chrome-path';
@@ -88,11 +88,11 @@ export function sanitizeNamespacedArgs(
88
88
  }
89
89
 
90
90
  /**
91
- * Build a flat MCP tool schema whose keys are dotted `"<namespace>.<field>"`.
91
+ * Build a flat tool schema whose keys are dotted `"<namespace>.<field>"`.
92
92
  *
93
93
  * We intentionally stay flat (rather than `{ namespace: z.object({...}) }`) so
94
- * that CLI (`--android.device-id`), MCP clients, and `--help` output all share
95
- * the same spelling. `readNamespacedArg` understands all three input shapes:
94
+ * that CLI (`--android.device-id`) and `--help` output share the same spelling.
95
+ * `readNamespacedArg` understands all three input shapes:
96
96
  * nested namespace object, dotted flat key, and bare key fallback.
97
97
  */
98
98
  export function createNamespacedInitArgSchema(
@@ -1,10 +1,10 @@
1
1
  /**
2
2
  * Unified, declarative mechanism for "force a default option on every tool
3
- * call" behaviors exposed by MCP servers and the device / Agent Skill CLIs.
3
+ * call" behaviors exposed by device and Agent Skill CLIs.
4
4
  *
5
5
  * Adding a new behavior flag (e.g. `--deep-search`) is a one-line change to
6
6
  * {@link TOOL_BEHAVIOR_FLAGS}: declare which default-option "bag" it fills.
7
- * The tool generator, servers, tools managers and CLI parsing are all generic
7
+ * The tool generator, tools managers and CLI parsing are all generic
8
8
  * over {@link ToolDefaults} and never need to learn about individual flags.
9
9
  *
10
10
  * See https://github.com/web-infra-dev/midscene/issues/2446.
@@ -91,13 +91,12 @@ export function resolveToolDefaults(
91
91
  *
92
92
  * Behavior flags (e.g. `--deep-locate`) are global: they may appear anywhere
93
93
  * in argv and are not tied to a specific sub-command. They are recognized by
94
- * exact kebab-case match the same surface the MCP `parseArgs` config exposes
95
- * — and removed so a strict per-command parser never sees them. Every other
94
+ * exact kebab-case match and removed so a strict per-command parser never sees them. Every other
96
95
  * token is returned untouched and in order for that per-command parser.
97
96
  *
98
97
  * This is the single place that knows how a behavior flag looks on the command
99
- * line; both the device / Agent Skill CLI and the MCP launch path resolve their
100
- * defaults from {@link TOOL_BEHAVIOR_FLAGS} through here / {@link resolveToolDefaults}.
98
+ * line; the device / Agent Skill CLI resolves defaults from
99
+ * {@link TOOL_BEHAVIOR_FLAGS} through here / {@link resolveToolDefaults}.
101
100
  */
102
101
  export function stripBehaviorFlags(argv: readonly string[]): {
103
102
  rawArgs: string[];
@@ -21,10 +21,10 @@ import { composeUserPrompt, promptInputExtraSchema } from './user-prompt';
21
21
  export { composeUserPrompt };
22
22
 
23
23
  /**
24
- * Generate MCP tool description from ActionSpaceItem
24
+ * Generate tool description from ActionSpaceItem.
25
25
  * Format: "actionName action, description. Parameters: param1 (type) - desc; param2 (type) - desc"
26
26
  */
27
- function describeActionForMCP(action: ActionSpaceItem): string {
27
+ function describeActionForTool(action: ActionSpaceItem): string {
28
28
  const actionDesc = action.description || `Execute ${action.name} action`;
29
29
 
30
30
  if (!action.paramSchema) {
@@ -129,6 +129,7 @@ function isRecord(value: unknown): value is Record<string, unknown> {
129
129
  function makePromptOptional(
130
130
  shape: Record<string, z.ZodTypeAny>,
131
131
  wrapInOptional: boolean,
132
+ description?: string | null,
132
133
  ): z.ZodTypeAny {
133
134
  const newShape = { ...shape };
134
135
  newShape.prompt = shape.prompt.optional();
@@ -137,6 +138,9 @@ function makePromptOptional(
137
138
  if (wrapInOptional) {
138
139
  newSchema = newSchema.optional();
139
140
  }
141
+ if (description) {
142
+ newSchema = newSchema.describe(description);
143
+ }
140
144
  return newSchema;
141
145
  }
142
146
 
@@ -151,7 +155,10 @@ function transformSchemaField(
151
155
  const shape = getZodObjectShape(innerValue);
152
156
 
153
157
  if (shape && isMidsceneLocatorField(innerValue)) {
154
- return [key, makePromptOptional(shape, isOptional)];
158
+ return [
159
+ key,
160
+ makePromptOptional(shape, isOptional, getZodDescription(value)),
161
+ ];
155
162
  }
156
163
  return [key, value];
157
164
  }
@@ -159,7 +166,7 @@ function transformSchemaField(
159
166
  /**
160
167
  * Extract and transform schema from action's paramSchema.
161
168
  *
162
- * CLI and MCP both expose parameters as named fields, so the only schema
169
+ * CLI tools expose parameters as named fields, so the only schema
163
170
  * shapes we can surface are ZodObject (any number of fields) or undefined
164
171
  * (the action takes no parameters). A primitive schema like `z.string()`
165
172
  * silently degraded to leaking the ZodString instance's prototype methods
@@ -181,7 +188,7 @@ function extractActionSchema(
181
188
  (paramSchema as unknown as { _def?: { typeName?: string } })?._def
182
189
  ?.typeName ?? 'unknown';
183
190
  throw new Error(
184
- `Action "${actionName}" declared a non-object paramSchema (${typeName}). CLI and MCP tool schemas must be a ZodObject (e.g. z.object({ uri: z.string() })) or undefined. Wrap primitive fields in an object schema.`,
191
+ `Action "${actionName}" declared a non-object paramSchema (${typeName}). CLI tool schemas must be a ZodObject (e.g. z.object({ uri: z.string() })) or undefined. Wrap primitive fields in an object schema.`,
185
192
  );
186
193
  }
187
194
 
@@ -536,7 +543,7 @@ function mergeToolCliMetadata(
536
543
  }
537
544
 
538
545
  /**
539
- * Converts DeviceAction from actionSpace into MCP ToolDefinition
546
+ * Converts DeviceAction from actionSpace into ToolDefinition.
540
547
  * This is the core logic that removes need for hardcoded tool definitions
541
548
  */
542
549
  export function generateToolsFromActionSpace(
@@ -557,7 +564,7 @@ export function generateToolsFromActionSpace(
557
564
 
558
565
  return {
559
566
  name: action.name,
560
- description: describeActionForMCP(action),
567
+ description: describeActionForTool(action),
561
568
  schema,
562
569
  cli: initArgCliMetadata,
563
570
  handler: async (args: Record<string, unknown>) => {
@@ -1,4 +1,3 @@
1
- import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
1
  import type { z } from 'zod';
3
2
  import type { ToolDefaults } from './tool-defaults';
4
3
 
@@ -12,7 +11,7 @@ export const defaultAppLoadingTimeoutMs = 10000;
12
11
  export const defaultAppLoadingCheckIntervalMs = 2000;
13
12
 
14
13
  /**
15
- * Content item types for tool results (MCP compatible)
14
+ * Content item types for tool results.
16
15
  */
17
16
  export type ToolResultContent =
18
17
  | { type: 'text'; text: string }
@@ -26,7 +25,7 @@ export type ToolResultContent =
26
25
  };
27
26
 
28
27
  /**
29
- * Result type for tool execution (MCP compatible)
28
+ * Result type for tool execution.
30
29
  */
31
30
  export interface ToolResult {
32
31
  [x: string]: unknown;
@@ -58,7 +57,7 @@ export interface ToolCliMetadata {
58
57
  }
59
58
 
60
59
  /**
61
- * Tool definition for MCP server
60
+ * Tool definition for Midscene CLI and Skill surfaces.
62
61
  */
63
62
  export interface ToolDefinition<T = Record<string, unknown>> {
64
63
  name: string;
@@ -68,9 +67,6 @@ export interface ToolDefinition<T = Record<string, unknown>> {
68
67
  cli?: ToolCliMetadata;
69
68
  }
70
69
 
71
- /**
72
- * Tool type for mcpKitForAgent return value
73
- */
74
70
  export type Tool = ToolDefinition;
75
71
 
76
72
  /**
@@ -100,6 +96,23 @@ export type UserPromptLike =
100
96
  convertHttpImage2Base64?: boolean;
101
97
  };
102
98
 
99
+ export interface RecordToReportScreenshot {
100
+ /**
101
+ * PNG/JPEG data URI, or raw PNG base64 body.
102
+ */
103
+ base64: string;
104
+ description?: string;
105
+ }
106
+
107
+ export interface RecordToReportOptions {
108
+ content?: string;
109
+ /**
110
+ * @deprecated Use `screenshots: [{ base64 }]` instead.
111
+ */
112
+ screenshotBase64?: string;
113
+ screenshots?: RecordToReportScreenshot[];
114
+ }
115
+
103
116
  /**
104
117
  * Base agent interface
105
118
  * Represents a platform-specific agent (Android, iOS, Web)
@@ -113,7 +126,7 @@ export interface BaseAgent {
113
126
  };
114
127
  recordToReport?: (
115
128
  title?: string,
116
- opt?: { content?: string; screenshotBase64?: string },
129
+ opt?: RecordToReportOptions,
117
130
  ) => Promise<void>;
118
131
  callActionInActionSpace?: (
119
132
  actionName: string,
@@ -143,10 +156,9 @@ export interface BaseDevice {
143
156
  }
144
157
 
145
158
  /**
146
- * Interface for platform-specific MCP tools manager
159
+ * Interface for platform-specific tools manager.
147
160
  */
148
161
  export interface IMidsceneTools {
149
- attachToServer(server: McpServer): void;
150
162
  initTools(): Promise<void>;
151
163
  destroy?(): Promise<void>;
152
164
  setToolDefaults?(toolDefaults: ToolDefaults): void;
@@ -1,6 +1,6 @@
1
1
  import { z } from 'zod';
2
+ import type { ToolCliOption, ToolDefinition } from '../agent-tools/types';
2
3
  import { getKeyAliases } from '../key-alias-utils';
3
- import type { ToolCliOption, ToolDefinition } from '../mcp/types';
4
4
  import { CLIError } from './cli-error';
5
5
 
6
6
  export function parseValue(raw: string): unknown {
@@ -2,14 +2,14 @@ import { existsSync, writeFileSync } from 'node:fs';
2
2
  import { tmpdir } from 'node:os';
3
3
  import { join } from 'node:path';
4
4
  import dotenv from 'dotenv';
5
- import { getDebug } from '../logger';
6
- import type { BaseMidsceneTools } from '../mcp/base-tools';
7
- import { stripBehaviorFlags } from '../mcp/tool-defaults';
5
+ import type { BaseMidsceneTools } from '../agent-tools/base-tools';
6
+ import { stripBehaviorFlags } from '../agent-tools/tool-defaults';
8
7
  import type {
9
8
  ToolDefinition,
10
9
  ToolResult,
11
10
  ToolResultContent,
12
- } from '../mcp/types';
11
+ } from '../agent-tools/types';
12
+ import { getDebug } from '../logger';
13
13
  import {
14
14
  canonicalizeCliArgKeys,
15
15
  formatCliValidationError,
package/src/env/types.ts CHANGED
@@ -8,10 +8,11 @@ export const MIDSCENE_DEBUG_MODEL_RESPONSE = 'MIDSCENE_DEBUG_MODEL_RESPONSE';
8
8
  export const MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG =
9
9
  'MIDSCENE_DANGEROUSLY_PRINT_ALL_CONFIG';
10
10
  export const MIDSCENE_DEBUG_MODE = 'MIDSCENE_DEBUG_MODE';
11
- export const MIDSCENE_MCP_USE_PUPPETEER_MODE =
12
- 'MIDSCENE_MCP_USE_PUPPETEER_MODE';
11
+ export const MIDSCENE_CHROME_PATH = 'MIDSCENE_CHROME_PATH';
12
+ /**
13
+ * @deprecated Use MIDSCENE_CHROME_PATH instead. This is kept for backward compatibility.
14
+ */
13
15
  export const MIDSCENE_MCP_CHROME_PATH = 'MIDSCENE_MCP_CHROME_PATH';
14
- export const MIDSCENE_MCP_ANDROID_MODE = 'MIDSCENE_MCP_ANDROID_MODE';
15
16
  export const DOCKER_CONTAINER = 'DOCKER_CONTAINER';
16
17
 
17
18
  // Observability
@@ -166,8 +167,6 @@ export const BASIC_ENV_KEYS = [
166
167
 
167
168
  export const BOOLEAN_ENV_KEYS = [
168
169
  MIDSCENE_CACHE,
169
- MIDSCENE_MCP_USE_PUPPETEER_MODE,
170
- MIDSCENE_MCP_ANDROID_MODE,
171
170
  MIDSCENE_LANGSMITH_DEBUG,
172
171
  MIDSCENE_LANGFUSE_DEBUG,
173
172
  MIDSCENE_REPORT_QUIET,
@@ -188,6 +187,7 @@ export const STRING_ENV_KEYS = [
188
187
  MIDSCENE_REPORT_TAG_NAME,
189
188
  MIDSCENE_PREFERRED_LANGUAGE,
190
189
  MATCH_BY_POSITION,
190
+ MIDSCENE_CHROME_PATH,
191
191
  MIDSCENE_MCP_CHROME_PATH,
192
192
  DOCKER_CONTAINER,
193
193
  ] as const;
package/src/img/index.ts CHANGED
@@ -21,6 +21,8 @@ export {
21
21
  createImgBase64ByFormat,
22
22
  inferBase64ImageFormat,
23
23
  normalizeBase64Image,
24
+ normalizeScreenshotBase64,
25
+ type NormalizeScreenshotBase64Options,
24
26
  } from './transform';
25
27
  export {
26
28
  processImageElementInfo,
@@ -157,6 +157,9 @@ export async function resizeAndConvertImgBuffer(
157
157
  export const normalizeBase64Body = (body: string) => body.replace(/\s/g, '');
158
158
 
159
159
  const base64ImageDataUrlPattern = /^data:image\/[a-zA-Z0-9.+-]+;base64,/i;
160
+ const supportedScreenshotDataUriPattern =
161
+ /^data:image\/(png|jpe?g);base64,([\s\S]*)$/i;
162
+ const rawBase64BodyPattern = /^[A-Za-z0-9+/=\s]+$/;
160
163
 
161
164
  export const inferBase64ImageFormat = (base64Body: string) => {
162
165
  if (base64Body.startsWith('iVBORw0KGgo')) {
@@ -207,6 +210,48 @@ export const createImgBase64ByFormat = (format: string, body: string) => {
207
210
  return `data:image/${format};base64,${normalizeBase64Body(body)}`;
208
211
  };
209
212
 
213
+ export interface NormalizeScreenshotBase64Options {
214
+ label?: string;
215
+ }
216
+
217
/**
 * Normalise a screenshot supplied as base64 text into an image data URI.
 *
 * Accepted inputs (after trimming surrounding whitespace):
 * - a `data:image/png`, `data:image/jpeg` or `data:image/jpg` base64 data
 *   URI; `jpg` is rewritten to `jpeg`;
 * - a raw base64 body made only of base64 alphabet characters, `=` and
 *   whitespace, which is labelled as PNG.
 *
 * @param base64 - The screenshot as a data URI or raw base64 body.
 * @param options - Optional settings; `label` replaces the default
 * `'screenshot base64'` prefix used in error messages.
 * @returns The value produced by `createImgBase64ByFormat` for the detected
 * format and body.
 * @throws Error when the input (or the data URI body) is empty, when it is a
 * `data:` URI of an unsupported type, or when a raw body contains characters
 * outside the accepted set.
 */
export const normalizeScreenshotBase64 = (
  base64: string,
  options?: NormalizeScreenshotBase64Options,
) => {
  const label = options?.label ?? 'screenshot base64';
  const input = base64.trim();
  if (input === '') {
    throw new Error(`${label} cannot be empty`);
  }

  const matched = supportedScreenshotDataUriPattern.exec(input);
  if (matched) {
    const [, declaredFormat, payload] = matched;
    // A data URI with a valid header but a blank body is still empty.
    if (normalizeBase64Body(payload) === '') {
      throw new Error(`${label} cannot be empty`);
    }
    const lowered = declaredFormat.toLowerCase();
    return createImgBase64ByFormat(
      lowered === 'jpg' ? 'jpeg' : lowered,
      payload,
    );
  }

  // Any other `data:` URI, and any raw body with characters outside the
  // accepted set, is rejected with the same message.
  if (input.startsWith('data:') || !rawBase64BodyPattern.test(input)) {
    throw new Error(
      `${label} must be a PNG/JPEG data URI or raw PNG base64 string`,
    );
  }

  // A bare body carries no format information; it is labelled as PNG.
  return createImgBase64ByFormat('png', input);
};
254
+
210
255
  export const normalizeBase64Image = (base64: string) => {
211
256
  const trimmedBase64 = base64.trim();
212
257
  if (base64ImageDataUrlPattern.test(trimmedBase64)) {
@@ -1,7 +1,7 @@
1
1
  /**
2
- * Internal-only helpers for CLI/MCP argument key aliasing.
2
+ * Internal-only helpers for CLI argument key aliasing.
3
3
  * Not re-exported from the package entry point — keep consumers within
4
- * `cli/` and `mcp/`.
4
+ * `cli/`.
5
5
  */
6
6
 
7
7
  export function kebabToCamel(str: string): string {
package/src/utils.ts CHANGED
@@ -64,17 +64,8 @@ export function assert(condition: any, message?: string): asserts condition {
64
64
  }
65
65
  }
66
66
 
67
- let isMcp = false;
68
-
69
- export function setIsMcp(value: boolean) {
70
- isMcp = value;
71
- }
72
-
73
- //mcp need use obj format to console msg: https://github.com/modelcontextprotocol/typescript-sdk/issues/244
74
67
  export function logMsg(...message: Parameters<typeof console.log>) {
75
- if (!isMcp) {
76
- console.log(...message);
77
- }
68
+ console.log(...message);
78
69
  }
79
70
 
80
71
  export async function repeat(