@midscene/shared 1.9.1 → 1.9.2-beta-20260605084246.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -165,6 +165,34 @@ function normalizeActionArgs(args, paramSchema) {
165
165
  ];
166
166
  }));
167
167
  }
168
/**
 * Fill the unset fields of a single locate argument from a bag of defaults.
 *
 * A default is applied only when the corresponding field on `locate` is
 * `undefined`, so an explicit per-call value always wins. The `deepLocate`
 * default is additionally skipped whenever the caller supplied `deepThink`
 * (NOTE(review): presumably `deepThink` acts as a per-call alias for the same
 * behavior on locate fields - confirm against the locate schema).
 *
 * @param locate   The locate argument as supplied by the caller. Never mutated.
 * @param defaults Default option values keyed by option name.
 * @returns `locate` itself when no default applies, otherwise a shallow copy
 *          of `locate` with the applicable defaults added.
 */
function mergeLocateDefaults(locate, defaults) {
    const applicable = Object.entries(defaults).filter(([name]) => {
        // An explicit per-call value always beats the default.
        if (locate[name] !== undefined) return false;
        // A caller-supplied `deepThink` suppresses the `deepLocate` default.
        return name !== 'deepLocate' || locate.deepThink === undefined;
    });

    // Nothing to add: hand back the original reference untouched.
    if (applicable.length === 0) return locate;

    const merged = { ...locate };
    for (const [name, fallback] of applicable) {
        merged[name] = fallback;
    }
    return merged;
}
180
/**
 * Apply locate defaults to every locator-typed argument of an action call.
 *
 * The action's param schema decides which arguments are locate fields: an
 * argument is merged with `locateDefaults` only when its schema entry passes
 * `isMidsceneLocatorField` and its value is a record. All other arguments are
 * passed through unchanged.
 *
 * @param args           Normalized action arguments keyed by parameter name.
 * @param paramSchema    The action's parameter schema; may be absent.
 * @param locateDefaults Default option values to merge into locate fields.
 * @returns `args` itself when there is no schema, no defaults, or no object
 *          shape could be read from the schema; otherwise a new object with
 *          the same keys.
 */
function applyLocateDefaults(args, paramSchema, locateDefaults) {
    if (!paramSchema) return args;
    if (Object.keys(locateDefaults).length === 0) return args;

    const shape = getZodObjectShape(paramSchema);
    if (!shape) return args;

    const withDefaults = ([name, raw]) => {
        const fieldSchema = shape[name];
        const isLocateField = Boolean(fieldSchema
            && (0, external_zod_schema_utils_js_namespaceObject.isMidsceneLocatorField)(fieldSchema)
            && isRecord(raw));
        return [
            name,
            isLocateField ? mergeLocateDefaults(raw, locateDefaults) : raw
        ];
    };

    return Object.fromEntries(Object.entries(args).map(withDefaults));
}
168
196
  function serializeArgsToDescription(args) {
169
197
  try {
170
198
  return Object.entries(args).map(([key, value])=>{
@@ -320,7 +348,7 @@ function mergeToolCliMetadata(base, extra) {
320
348
  options
321
349
  } : void 0;
322
350
  }
323
- function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (args)=>args, initArgSchema = {}, initArgCliMetadata) {
351
+ function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (args)=>args, initArgSchema = {}, initArgCliMetadata, toolDefaults = {}) {
324
352
  return actionSpace.map((action)=>{
325
353
  const schema = {
326
354
  ...extractActionSchema(action.paramSchema, action.name),
@@ -334,7 +362,8 @@ function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (arg
334
362
  handler: async (args)=>{
335
363
  try {
336
364
  const agent = await getAgent(args);
337
- const normalizedArgs = normalizeActionArgs(sanitizeArgs(args), action.paramSchema);
365
+ let normalizedArgs = normalizeActionArgs(sanitizeArgs(args), action.paramSchema);
366
+ if (toolDefaults.locate) normalizedArgs = applyLocateDefaults(normalizedArgs, action.paramSchema, toolDefaults.locate);
338
367
  let actionResult;
339
368
  try {
340
369
  actionResult = await executeAction(agent, action.name, normalizedArgs);
@@ -353,7 +382,7 @@ function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (arg
353
382
  };
354
383
  });
355
384
  }
356
- function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
385
+ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata, toolDefaults = {}) {
357
386
  return [
358
387
  {
359
388
  name: 'take_screenshot',
@@ -392,6 +421,8 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
392
421
  description: 'Execute a natural language action. The AI will plan and perform multi-step operations in a single invocation, useful for transient UI interactions (e.g., Spotlight, dropdown menus) that disappear between separate commands.',
393
422
  schema: {
394
423
  prompt: external_zod_namespaceObject.z.string().describe('Natural language description of the action to perform, e.g. "press Command+Space, type Safari, press Enter"'),
424
+ deepLocate: external_zod_namespaceObject.z.boolean().optional().describe('Use deep locate for every element this action targets. Improves precision for small or ambiguous targets at the cost of speed. Defaults to the server --deep-locate setting.'),
425
+ deepThink: external_zod_namespaceObject.z.boolean().optional().describe('Plan this action with deep thinking (richer context and sub-goal decomposition). Helps with complex multi-step instructions at the cost of speed. Defaults to the server --deep-think setting.'),
395
426
  ...initArgSchema
396
427
  },
397
428
  cli: mergeToolCliMetadata(void 0, initArgCliMetadata),
@@ -400,9 +431,13 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
400
431
  try {
401
432
  const agent = await getAgent(args);
402
433
  if (!agent.aiAction) return createErrorResult('act is not supported by this agent');
403
- const result = await agent.aiAction(prompt, {
404
- deepThink: false
405
- });
434
+ const actOptions = {
435
+ deepThink: false,
436
+ ...toolDefaults.act
437
+ };
438
+ if (void 0 !== args.deepLocate) actOptions.deepLocate = args.deepLocate;
439
+ if (void 0 !== args.deepThink) actOptions.deepThink = args.deepThink;
440
+ const result = await agent.aiAction(prompt, actOptions);
406
441
  return await captureScreenshotResult(agent, 'act', result);
407
442
  } catch (error) {
408
443
  const errorMessage = (0, external_error_formatter_js_namespaceObject.getErrorMessage)(error);
@@ -416,12 +451,14 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
416
451
  description: 'Assert a natural language statement against the current page/screen.',
417
452
  schema: {
418
453
  prompt: external_zod_namespaceObject.z.string().describe('Natural language assertion to verify, e.g. "there is a login button visible"'),
454
+ message: external_zod_namespaceObject.z.string().optional().describe('Custom error message to throw when the assertion fails, e.g. "the login button should be visible".'),
419
455
  ...external_user_prompt_js_namespaceObject.promptInputExtraSchema,
420
456
  ...initArgSchema
421
457
  },
422
458
  cli: mergeToolCliMetadata(void 0, initArgCliMetadata),
423
459
  handler: async (args = {})=>{
424
460
  const prompt = args.prompt;
461
+ const message = args.message;
425
462
  try {
426
463
  const agent = await getAgent(args);
427
464
  if (!agent.aiAssert) return createErrorResult('assert is not supported by this agent');
@@ -431,7 +468,7 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
431
468
  imageName: args.imageName,
432
469
  convertHttpImage2Base64: args.convertHttpImage2Base64
433
470
  });
434
- await agent.aiAssert(userPrompt);
471
+ await agent.aiAssert(userPrompt, message);
435
472
  return {
436
473
  content: [
437
474
  {
@@ -1,5 +1,6 @@
1
1
  import type { ParseArgsConfig } from 'node:util';
2
2
  import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
3
+ import { type ToolDefaults } from './tool-defaults';
3
4
  import type { IMidsceneTools } from './types';
4
5
  export interface BaseMCPServerConfig {
5
6
  name: string;
@@ -25,13 +26,17 @@ export interface LaunchMCPServerResult {
25
26
  close: () => Promise<void>;
26
27
  }
27
28
  /**
28
- * CLI argument configuration for MCP servers
29
+ * CLI argument configuration for MCP servers. Behavior flags (e.g.
30
+ * `--deep-locate`) are generated from {@link TOOL_BEHAVIOR_FLAGS}, so adding a
31
+ * new flag there exposes it here automatically.
29
32
  */
30
33
  export declare const CLI_ARGS_CONFIG: ParseArgsConfig['options'];
31
34
  export interface CLIArgs {
32
35
  mode?: string;
33
36
  port?: string;
34
37
  host?: string;
38
+ /** Behavior flags such as `deep-locate` / `deep-think` (see TOOL_BEHAVIOR_FLAGS). */
39
+ [flag: string]: string | boolean | undefined;
35
40
  }
36
41
  /**
37
42
  * Launch an MCP server based on CLI arguments
@@ -47,7 +52,15 @@ export declare abstract class BaseMCPServer {
47
52
  protected toolsManager?: IMidsceneTools;
48
53
  protected config: BaseMCPServerConfig;
49
54
  protected providedToolsManager?: IMidsceneTools;
55
+ protected toolDefaults: ToolDefaults;
50
56
  constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools);
57
+ /**
58
+ * Set the default options injected into generated tool calls (e.g. forced
59
+ * deep locate / deep think). Must be called before `launch()` /
60
+ * `launchHttp()` so they are applied to the tools manager before its tools
61
+ * are generated. Merges with any previously set defaults.
62
+ */
63
+ setToolDefaults(toolDefaults: ToolDefaults): void;
51
64
  /**
52
65
  * Platform-specific: create tools manager instance
53
66
  * This is only called if no tools manager was provided in constructor
@@ -1,6 +1,7 @@
1
1
  import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
2
  import type { z } from 'zod';
3
3
  import { type CliReportSession } from './cli-report-session';
4
+ import { type ToolDefaults } from './tool-defaults';
4
5
  import type { BaseAgent, BaseDevice, IMidsceneTools, ToolCliMetadata, ToolDefinition, ToolSchema } from './types';
5
6
  /**
6
7
  * Declarative description of a platform's agent init args.
@@ -38,6 +39,13 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
38
39
  protected mcpServer?: McpServer;
39
40
  protected agent?: TAgent;
40
41
  protected toolDefinitions: ToolDefinition[];
42
+ /**
43
+ * Default options injected into every generated tool call (e.g. forced deep
44
+ * locate / deep think). Set from server/CLI behavior flags before
45
+ * `initTools()` so they are baked into the generated tool handlers.
46
+ * See https://github.com/web-infra-dev/midscene/issues/2446.
47
+ */
48
+ protected toolDefaults: ToolDefaults;
41
49
  /**
42
50
  * Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
43
51
  * set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
@@ -117,6 +125,12 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
117
125
  * Set agent for the tools manager
118
126
  */
119
127
  setAgent(agent: TAgent): void;
128
+ /**
129
+ * Set the default options injected into generated tool calls. Must be called
130
+ * before `initTools()` because the values are captured into the generated
131
+ * tool handlers. Merges with any previously set defaults.
132
+ */
133
+ setToolDefaults(toolDefaults: ToolDefaults): void;
120
134
  /**
121
135
  * Helper: Convert base64 screenshot to image content array
122
136
  */
@@ -1,5 +1,6 @@
1
1
  export * from './base-server';
2
2
  export * from './base-tools';
3
+ export * from './tool-defaults';
3
4
  export * from './init-arg-utils';
4
5
  export * from './error-formatter';
5
6
  export * from './tool-generator';
@@ -0,0 +1,64 @@
1
+ /**
2
+ * Unified, declarative mechanism for "force a default option on every tool
3
+ * call" behaviors exposed by MCP servers and the device / Agent Skill CLIs.
4
+ *
5
+ * Adding a new behavior flag (e.g. `--deep-search`) is a one-line change to
6
+ * {@link TOOL_BEHAVIOR_FLAGS}: declare which default-option "bag" it fills.
7
+ * The tool generator, servers, tools managers and CLI parsing are all generic
8
+ * over {@link ToolDefaults} and never need to learn about individual flags.
9
+ *
10
+ * See https://github.com/web-infra-dev/midscene/issues/2446.
11
+ */
12
+ /**
13
+ * Default options injected into generated tool calls. Each field is an
14
+ * injection point; an explicit per-call value always wins over these defaults.
15
+ */
16
+ export interface ToolDefaults {
17
+ /**
18
+ * Merged into every locate field of action tools (`Tap`, `Input`, ...).
19
+ * e.g. `{ deepLocate: true }`.
20
+ */
21
+ locate?: Record<string, unknown>;
22
+ /**
23
+ * Merged into the `act` tool's `aiAction` options.
24
+ * e.g. `{ deepLocate: true, deepThink: true }`.
25
+ */
26
+ act?: Record<string, unknown>;
27
+ }
28
+ export interface ToolBehaviorFlag {
29
+ /** Kebab-case CLI flag name, e.g. `deep-locate` (exposed as `--deep-locate`). */
30
+ cli: string;
31
+ /** One-line description for help output. */
32
+ description: string;
33
+ /** Default-option bags this flag turns on when present. */
34
+ defaults: ToolDefaults;
35
+ }
36
+ /**
37
+ * The single source of truth for behavior flags. Add a row to support a new
38
+ * `--flag`; nothing else in the pipeline needs to change.
39
+ */
40
+ export declare const TOOL_BEHAVIOR_FLAGS: readonly ToolBehaviorFlag[];
41
+ /** Merge two {@link ToolDefaults}, with `b` taking precedence over `a`. */
42
+ export declare function mergeToolDefaults(a: ToolDefaults, b: ToolDefaults): ToolDefaults;
43
+ /**
44
+ * Resolve the active {@link ToolDefaults} from a predicate that says whether a
45
+ * given flag (by its `cli` name) is enabled.
46
+ */
47
+ export declare function resolveToolDefaults(isEnabled: (cli: string) => boolean): ToolDefaults;
48
+ /**
49
+ * Split argv into the resolved {@link ToolDefaults} and the remaining args.
50
+ *
51
+ * Behavior flags (e.g. `--deep-locate`) are global: they may appear anywhere
52
+ * in argv and are not tied to a specific sub-command. They are recognized by
53
+ * exact kebab-case match — the same surface the MCP `parseArgs` config exposes
54
+ * — and removed so a strict per-command parser never sees them. Every other
55
+ * token is returned untouched and in order for that per-command parser.
56
+ *
57
+ * This is the single place that knows how a behavior flag looks on the command
58
+ * line; both the device / Agent Skill CLI and the MCP launch path resolve their
59
+ * defaults from {@link TOOL_BEHAVIOR_FLAGS} through here / {@link resolveToolDefaults}.
60
+ */
61
+ export declare function stripBehaviorFlags(argv: readonly string[]): {
62
+ rawArgs: string[];
63
+ toolDefaults: ToolDefaults;
64
+ };
@@ -1,3 +1,4 @@
1
+ import type { ToolDefaults } from './tool-defaults';
1
2
  import type { ActionSpaceItem, BaseAgent, ToolCliMetadata, ToolDefinition, ToolSchema } from './types';
2
3
  import { composeUserPrompt } from './user-prompt';
3
4
  export { composeUserPrompt };
@@ -5,8 +6,8 @@ export { composeUserPrompt };
5
6
  * Converts DeviceAction from actionSpace into MCP ToolDefinition
6
7
  * This is the core logic that removes need for hardcoded tool definitions
7
8
  */
8
- export declare function generateToolsFromActionSpace(actionSpace: ActionSpaceItem[], getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, sanitizeArgs?: (args: Record<string, unknown>) => Record<string, unknown>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata): ToolDefinition[];
9
+ export declare function generateToolsFromActionSpace(actionSpace: ActionSpaceItem[], getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, sanitizeArgs?: (args: Record<string, unknown>) => Record<string, unknown>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata, toolDefaults?: ToolDefaults): ToolDefinition[];
9
10
  /**
10
11
  * Generate common tools (screenshot, act)
11
12
  */
12
- export declare function generateCommonTools(getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata): ToolDefinition[];
13
+ export declare function generateCommonTools(getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata, toolDefaults?: ToolDefaults): ToolDefinition[];
@@ -1,5 +1,6 @@
1
1
  import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
2
2
  import type { z } from 'zod';
3
+ import type { ToolDefaults } from './tool-defaults';
3
4
  /**
4
5
  * Default timeout constants for app loading verification
5
6
  */
@@ -130,4 +131,5 @@ export interface IMidsceneTools {
130
131
  attachToServer(server: McpServer): void;
131
132
  initTools(): Promise<void>;
132
133
  destroy?(): Promise<void>;
134
+ setToolDefaults?(toolDefaults: ToolDefaults): void;
133
135
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@midscene/shared",
3
- "version": "1.9.1",
3
+ "version": "1.9.2-beta-20260605084246.0",
4
4
  "repository": "https://github.com/web-infra-dev/midscene",
5
5
  "homepage": "https://midscenejs.com/",
6
6
  "types": "./dist/types/index.d.ts",
@@ -4,6 +4,7 @@ import { join } from 'node:path';
4
4
  import dotenv from 'dotenv';
5
5
  import { getDebug } from '../logger';
6
6
  import type { BaseMidsceneTools } from '../mcp/base-tools';
7
+ import { stripBehaviorFlags } from '../mcp/tool-defaults';
7
8
  import type {
8
9
  ToolDefinition,
9
10
  ToolResult,
@@ -135,8 +136,18 @@ export async function runToolsCLI(
135
136
  scriptName: string,
136
137
  options?: CLIRunnerOptions,
137
138
  ): Promise<void> {
138
- const rawArgs = options?.argv ?? process.argv.slice(2);
139
- debug('CLI invoked: %s %s', scriptName, rawArgs.join(' '));
139
+ const inputArgs = options?.argv ?? process.argv.slice(2);
140
+ debug('CLI invoked: %s %s', scriptName, inputArgs.join(' '));
141
+
142
+ // Global behavior flags (e.g. `--deep-locate` / `--deep-think`) apply
143
+ // regardless of which command runs. `stripBehaviorFlags` is the single place
144
+ // that knows how they look on the command line: it resolves their defaults
145
+ // and returns the remaining args so the per-command parser never sees them.
146
+ // See https://github.com/web-infra-dev/midscene/issues/2446.
147
+ const { rawArgs, toolDefaults } = stripBehaviorFlags(inputArgs);
148
+ if (Object.keys(toolDefaults).length > 0) {
149
+ tools.setToolDefaults?.(toolDefaults);
150
+ }
140
151
 
141
152
  // Load .env from cwd before any tool initialization
142
153
  const envFile = join(process.cwd(), '.env');
@@ -10,6 +10,12 @@ import express, {
10
10
  type Response,
11
11
  } from 'express';
12
12
  import { getErrorMessage } from './error-formatter';
13
+ import {
14
+ TOOL_BEHAVIOR_FLAGS,
15
+ type ToolDefaults,
16
+ mergeToolDefaults,
17
+ resolveToolDefaults,
18
+ } from './tool-defaults';
13
19
  import type { IMidsceneTools } from './types';
14
20
 
15
21
  export interface BaseMCPServerConfig {
@@ -47,18 +53,25 @@ interface SessionData {
47
53
  }
48
54
 
49
55
  /**
50
- * CLI argument configuration for MCP servers
56
+ * CLI argument configuration for MCP servers. Behavior flags (e.g.
57
+ * `--deep-locate`) are generated from {@link TOOL_BEHAVIOR_FLAGS}, so adding a
58
+ * new flag there exposes it here automatically.
51
59
  */
52
60
  export const CLI_ARGS_CONFIG: ParseArgsConfig['options'] = {
53
61
  mode: { type: 'string', default: 'stdio' },
54
62
  port: { type: 'string', default: '3000' },
55
63
  host: { type: 'string', default: 'localhost' },
64
+ ...Object.fromEntries(
65
+ TOOL_BEHAVIOR_FLAGS.map((flag) => [flag.cli, { type: 'boolean' as const }]),
66
+ ),
56
67
  };
57
68
 
58
69
  export interface CLIArgs {
59
70
  mode?: string;
60
71
  port?: string;
61
72
  host?: string;
73
+ /** Behavior flags such as `deep-locate` / `deep-think` (see TOOL_BEHAVIOR_FLAGS). */
74
+ [flag: string]: string | boolean | undefined;
62
75
  }
63
76
 
64
77
  /**
@@ -69,6 +82,7 @@ export function launchMCPServer(
69
82
  server: BaseMCPServer,
70
83
  args: CLIArgs,
71
84
  ): Promise<LaunchMCPServerResult> {
85
+ server.setToolDefaults(resolveToolDefaults((cli) => args[cli] === true));
72
86
  if (args.mode === 'http') {
73
87
  return server.launchHttp({
74
88
  port: Number.parseInt(args.port || '3000', 10),
@@ -91,6 +105,7 @@ export abstract class BaseMCPServer {
91
105
  protected toolsManager?: IMidsceneTools;
92
106
  protected config: BaseMCPServerConfig;
93
107
  protected providedToolsManager?: IMidsceneTools;
108
+ protected toolDefaults: ToolDefaults = {};
94
109
 
95
110
  constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools) {
96
111
  this.config = config;
@@ -102,6 +117,16 @@ export abstract class BaseMCPServer {
102
117
  this.providedToolsManager = toolsManager;
103
118
  }
104
119
 
120
+ /**
121
+ * Set the default options injected into generated tool calls (e.g. forced
122
+ * deep locate / deep think). Must be called before `launch()` /
123
+ * `launchHttp()` so they are applied to the tools manager before its tools
124
+ * are generated. Merges with any previously set defaults.
125
+ */
126
+ public setToolDefaults(toolDefaults: ToolDefaults): void {
127
+ this.toolDefaults = mergeToolDefaults(this.toolDefaults, toolDefaults);
128
+ }
129
+
105
130
  /**
106
131
  * Platform-specific: create tools manager instance
107
132
  * This is only called if no tools manager was provided in constructor
@@ -117,6 +142,10 @@ export abstract class BaseMCPServer {
117
142
  // Use provided tools manager if available, otherwise create new one
118
143
  this.toolsManager = this.providedToolsManager || this.createToolsManager();
119
144
 
145
+ // Apply the tool defaults before tools are generated so they are baked
146
+ // into the generated tool handlers.
147
+ this.toolsManager.setToolDefaults?.(this.toolDefaults);
148
+
120
149
  try {
121
150
  await this.toolsManager.initTools();
122
151
  } catch (error: unknown) {
@@ -14,6 +14,7 @@ import {
14
14
  extractNamespacedArgs,
15
15
  sanitizeNamespacedArgs,
16
16
  } from './init-arg-utils';
17
+ import { type ToolDefaults, mergeToolDefaults } from './tool-defaults';
17
18
  import {
18
19
  generateCommonTools,
19
20
  generateToolsFromActionSpace,
@@ -74,6 +75,14 @@ export abstract class BaseMidsceneTools<
74
75
  protected agent?: TAgent;
75
76
  protected toolDefinitions: ToolDefinition[] = [];
76
77
 
78
+ /**
79
+ * Default options injected into every generated tool call (e.g. forced deep
80
+ * locate / deep think). Set from server/CLI behavior flags before
81
+ * `initTools()` so they are baked into the generated tool handlers.
82
+ * See https://github.com/web-infra-dev/midscene/issues/2446.
83
+ */
84
+ protected toolDefaults: ToolDefaults = {};
85
+
77
86
  /**
78
87
  * Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
79
88
  * set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
@@ -289,6 +298,7 @@ export abstract class BaseMidsceneTools<
289
298
  (args = {}) => this.sanitizeToolArgs(args),
290
299
  this.getAgentInitArgSchema(),
291
300
  this.getAgentInitArgCliMetadata(),
301
+ this.toolDefaults,
292
302
  );
293
303
 
294
304
  // 4. Add common tools (screenshot, waitFor)
@@ -296,6 +306,7 @@ export abstract class BaseMidsceneTools<
296
306
  (args = {}) => this.ensureAgent(this.extractAgentInitParam(args)),
297
307
  this.getAgentInitArgSchema(),
298
308
  this.getAgentInitArgCliMetadata(),
309
+ this.toolDefaults,
299
310
  );
300
311
  this.toolDefinitions.push(...actionTools, ...commonTools);
301
312
 
@@ -345,6 +356,15 @@ export abstract class BaseMidsceneTools<
345
356
  this.agent = agent;
346
357
  }
347
358
 
359
+ /**
360
+ * Set the default options injected into generated tool calls. Must be called
361
+ * before `initTools()` because the values are captured into the generated
362
+ * tool handlers. Merges with any previously set defaults.
363
+ */
364
+ public setToolDefaults(toolDefaults: ToolDefaults): void {
365
+ this.toolDefaults = mergeToolDefaults(this.toolDefaults, toolDefaults);
366
+ }
367
+
348
368
  /**
349
369
  * Helper: Convert base64 screenshot to image content array
350
370
  */
package/src/mcp/index.ts CHANGED
@@ -1,5 +1,6 @@
1
1
  export * from './base-server';
2
2
  export * from './base-tools';
3
+ export * from './tool-defaults';
3
4
  export * from './init-arg-utils';
4
5
  export * from './error-formatter';
5
6
  export * from './tool-generator';
@@ -0,0 +1,120 @@
1
+ /**
2
+ * Unified, declarative mechanism for "force a default option on every tool
3
+ * call" behaviors exposed by MCP servers and the device / Agent Skill CLIs.
4
+ *
5
+ * Adding a new behavior flag (e.g. `--deep-search`) is a one-line change to
6
+ * {@link TOOL_BEHAVIOR_FLAGS}: declare which default-option "bag" it fills.
7
+ * The tool generator, servers, tools managers and CLI parsing are all generic
8
+ * over {@link ToolDefaults} and never need to learn about individual flags.
9
+ *
10
+ * See https://github.com/web-infra-dev/midscene/issues/2446.
11
+ */
12
+
13
+ /**
14
+ * Default options injected into generated tool calls. Each field is an
15
+ * injection point; an explicit per-call value always wins over these defaults.
16
+ */
17
+ export interface ToolDefaults {
18
+ /**
19
+ * Merged into every locate field of action tools (`Tap`, `Input`, ...).
20
+ * e.g. `{ deepLocate: true }`.
21
+ */
22
+ locate?: Record<string, unknown>;
23
+ /**
24
+ * Merged into the `act` tool's `aiAction` options.
25
+ * e.g. `{ deepLocate: true, deepThink: true }`.
26
+ */
27
+ act?: Record<string, unknown>;
28
+ }
29
+
30
+ export interface ToolBehaviorFlag {
31
+ /** Kebab-case CLI flag name, e.g. `deep-locate` (exposed as `--deep-locate`). */
32
+ cli: string;
33
+ /** One-line description for help output. */
34
+ description: string;
35
+ /** Default-option bags this flag turns on when present. */
36
+ defaults: ToolDefaults;
37
+ }
38
+
39
+ /**
40
+ * The single source of truth for behavior flags. Add a row to support a new
41
+ * `--flag`; nothing else in the pipeline needs to change.
42
+ */
43
+ export const TOOL_BEHAVIOR_FLAGS: readonly ToolBehaviorFlag[] = [
44
+ {
45
+ cli: 'deep-locate',
46
+ description:
47
+ 'Force deep locate for every locating operation (better precision for small/ambiguous targets, a bit slower).',
48
+ defaults: { locate: { deepLocate: true }, act: { deepLocate: true } },
49
+ },
50
+ {
51
+ cli: 'deep-think',
52
+ description:
53
+ 'Plan the act tool with deep thinking (richer context and sub-goal decomposition, a bit slower).',
54
+ defaults: { act: { deepThink: true } },
55
+ },
56
+ ];
57
+
58
/**
 * Merge two {@link ToolDefaults}, with `b` taking precedence over `a`.
 *
 * Each option bag (`locate`, `act`) is shallow-merged independently. A bag
 * that ends up with no keys is omitted from the result, and the returned bags
 * are fresh objects, so neither input is mutated or shared.
 *
 * @param a Base defaults.
 * @param b Overriding defaults; its values win on key conflicts.
 * @returns A new {@link ToolDefaults} containing only the non-empty bags.
 */
export function mergeToolDefaults(
  a: ToolDefaults,
  b: ToolDefaults,
): ToolDefaults {
  // Shallow-merge one bag; report an empty merge as `undefined`.
  const mergeBag = (
    base?: Record<string, unknown>,
    override?: Record<string, unknown>,
  ): Record<string, unknown> | undefined => {
    const bag = { ...base, ...override };
    return Object.keys(bag).length === 0 ? undefined : bag;
  };

  const locate = mergeBag(a.locate, b.locate);
  const act = mergeBag(a.act, b.act);

  // Spreading `undefined` adds nothing, so empty bags leave no key behind.
  return {
    ...(locate && { locate }),
    ...(act && { act }),
  };
}
74
+
75
/**
 * Resolve the active {@link ToolDefaults} from a predicate that says whether a
 * given flag (by its `cli` name) is enabled.
 *
 * Flags are visited in {@link TOOL_BEHAVIOR_FLAGS} order and the defaults of
 * each enabled flag are folded in with {@link mergeToolDefaults}, so a later
 * flag wins when two flags set the same option.
 *
 * @param isEnabled Called once per declared flag with its kebab-case name.
 * @returns The merged defaults; an empty object when no flag is enabled.
 */
export function resolveToolDefaults(
  isEnabled: (cli: string) => boolean,
): ToolDefaults {
  let resolved: ToolDefaults = {};
  for (const { cli, defaults } of TOOL_BEHAVIOR_FLAGS) {
    if (isEnabled(cli)) {
      resolved = mergeToolDefaults(resolved, defaults);
    }
  }
  return resolved;
}
88
+
89
/**
 * Split argv into the resolved {@link ToolDefaults} and the remaining args.
 *
 * Behavior flags are global: a token is treated as a behavior flag when it is
 * exactly `--<cli>` for some entry of {@link TOOL_BEHAVIOR_FLAGS}, wherever it
 * appears in argv. Such tokens are removed so that a strict per-command parser
 * never sees them; every other token (including `--flag=value` forms, which
 * are not an exact match) is returned untouched and in its original order.
 *
 * @param argv Raw command-line tokens. Not mutated.
 * @returns `rawArgs` - argv without the behavior-flag tokens; `toolDefaults` -
 *          the defaults resolved from the flags that were present.
 */
export function stripBehaviorFlags(argv: readonly string[]): {
  rawArgs: string[];
  toolDefaults: ToolDefaults;
} {
  // Index the command-line spelling of every flag once, up front.
  const cliByToken = new Map<string, string>();
  for (const { cli } of TOOL_BEHAVIOR_FLAGS) {
    cliByToken.set(`--${cli}`, cli);
  }

  const seen = new Set<string>();
  const rawArgs: string[] = [];
  for (const token of argv) {
    const cli = cliByToken.get(token);
    if (cli === undefined) {
      rawArgs.push(token);
      continue;
    }
    seen.add(cli);
  }

  const toolDefaults = resolveToolDefaults((cli) => seen.has(cli));
  return { rawArgs, toolDefaults };
}