@midscene/shared 1.9.1 → 1.9.2-beta-20260605084246.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/es/cli/cli-runner.mjs +5 -2
- package/dist/es/env/parse-model-config.mjs +1 -1
- package/dist/es/mcp/base-server.mjs +14 -1
- package/dist/es/mcp/base-tools.mjs +7 -2
- package/dist/es/mcp/index.mjs +1 -0
- package/dist/es/mcp/tool-defaults.mjs +54 -0
- package/dist/es/mcp/tool-generator.mjs +44 -7
- package/dist/lib/cli/cli-runner.js +5 -2
- package/dist/lib/env/parse-model-config.js +1 -1
- package/dist/lib/mcp/base-server.js +14 -1
- package/dist/lib/mcp/base-tools.js +7 -2
- package/dist/lib/mcp/index.js +21 -14
- package/dist/lib/mcp/tool-defaults.js +97 -0
- package/dist/lib/mcp/tool-generator.js +44 -7
- package/dist/types/mcp/base-server.d.ts +14 -1
- package/dist/types/mcp/base-tools.d.ts +14 -0
- package/dist/types/mcp/index.d.ts +1 -0
- package/dist/types/mcp/tool-defaults.d.ts +64 -0
- package/dist/types/mcp/tool-generator.d.ts +3 -2
- package/dist/types/mcp/types.d.ts +2 -0
- package/package.json +1 -1
- package/src/cli/cli-runner.ts +13 -2
- package/src/mcp/base-server.ts +30 -1
- package/src/mcp/base-tools.ts +20 -0
- package/src/mcp/index.ts +1 -0
- package/src/mcp/tool-defaults.ts +120 -0
- package/src/mcp/tool-generator.ts +100 -3
- package/src/mcp/types.ts +2 -0
|
@@ -165,6 +165,34 @@ function normalizeActionArgs(args, paramSchema) {
|
|
|
165
165
|
];
|
|
166
166
|
}));
|
|
167
167
|
}
|
|
168
|
+
/**
 * Fill the gaps of a single locate argument with server-wide defaults.
 *
 * A default is applied only when the caller left that key `undefined` on
 * `locate`. The `deepLocate` default is additionally withheld whenever the
 * caller set `deepThink` (presumably an alternative spelling of the same
 * option on locate fields — confirm against the locate schema).
 *
 * `locate` is never mutated: the very same object is handed back when nothing
 * needs filling, otherwise a shallow copy carrying the extra keys is returned.
 *
 * @param locate   Locate argument supplied by the caller.
 * @param defaults Default key/value pairs to fall back on.
 * @returns `locate` itself, or a shallow copy with the missing defaults added.
 */
function mergeLocateDefaults(locate, defaults) {
    const missing = Object.entries(defaults).filter(([name]) => {
        // An explicit per-call value always beats the default.
        if (locate[name] !== undefined) return false;
        // Do not force deepLocate onto a call that already chose deepThink.
        return !(name === 'deepLocate' && locate.deepThink !== undefined);
    });
    if (missing.length === 0) return locate;
    const filled = {
        ...locate
    };
    for (const [name, fallback] of missing) filled[name] = fallback;
    return filled;
}
|
|
180
|
+
/**
 * Apply the server-wide locate defaults to every locate-typed argument of an
 * action call.
 *
 * The arguments are returned untouched when there is no parameter schema, when
 * no defaults are configured, or when the schema exposes no object shape.
 * Otherwise a new argument object is built in which each value that is a plain
 * record AND whose schema field is a Midscene locator field has been passed
 * through `mergeLocateDefaults`; all other entries are copied as-is.
 *
 * @param args           Normalized action arguments.
 * @param paramSchema    The action's parameter schema (may be absent).
 * @param locateDefaults Defaults to merge into each locate argument.
 * @returns `args` itself on the early-exit paths, otherwise a new object.
 */
function applyLocateDefaults(args, paramSchema, locateDefaults) {
    if (!paramSchema) return args;
    if (Object.keys(locateDefaults).length === 0) return args;
    const shape = getZodObjectShape(paramSchema);
    if (!shape) return args;
    // True only for record values sitting in a locator-typed schema field.
    const isLocateArg = (name, candidate)=>{
        const fieldSchema = shape[name];
        return fieldSchema && (0, external_zod_schema_utils_js_namespaceObject.isMidsceneLocatorField)(fieldSchema) && isRecord(candidate);
    };
    const entries = Object.entries(args).map(([name, candidate])=>[
            name,
            isLocateArg(name, candidate) ? mergeLocateDefaults(candidate, locateDefaults) : candidate
        ]);
    return Object.fromEntries(entries);
}
|
|
168
196
|
function serializeArgsToDescription(args) {
|
|
169
197
|
try {
|
|
170
198
|
return Object.entries(args).map(([key, value])=>{
|
|
@@ -320,7 +348,7 @@ function mergeToolCliMetadata(base, extra) {
|
|
|
320
348
|
options
|
|
321
349
|
} : void 0;
|
|
322
350
|
}
|
|
323
|
-
function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (args)=>args, initArgSchema = {}, initArgCliMetadata) {
|
|
351
|
+
function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (args)=>args, initArgSchema = {}, initArgCliMetadata, toolDefaults = {}) {
|
|
324
352
|
return actionSpace.map((action)=>{
|
|
325
353
|
const schema = {
|
|
326
354
|
...extractActionSchema(action.paramSchema, action.name),
|
|
@@ -334,7 +362,8 @@ function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (arg
|
|
|
334
362
|
handler: async (args)=>{
|
|
335
363
|
try {
|
|
336
364
|
const agent = await getAgent(args);
|
|
337
|
-
|
|
365
|
+
let normalizedArgs = normalizeActionArgs(sanitizeArgs(args), action.paramSchema);
|
|
366
|
+
if (toolDefaults.locate) normalizedArgs = applyLocateDefaults(normalizedArgs, action.paramSchema, toolDefaults.locate);
|
|
338
367
|
let actionResult;
|
|
339
368
|
try {
|
|
340
369
|
actionResult = await executeAction(agent, action.name, normalizedArgs);
|
|
@@ -353,7 +382,7 @@ function generateToolsFromActionSpace(actionSpace, getAgent, sanitizeArgs = (arg
|
|
|
353
382
|
};
|
|
354
383
|
});
|
|
355
384
|
}
|
|
356
|
-
function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
|
|
385
|
+
function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata, toolDefaults = {}) {
|
|
357
386
|
return [
|
|
358
387
|
{
|
|
359
388
|
name: 'take_screenshot',
|
|
@@ -392,6 +421,8 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
|
|
|
392
421
|
description: 'Execute a natural language action. The AI will plan and perform multi-step operations in a single invocation, useful for transient UI interactions (e.g., Spotlight, dropdown menus) that disappear between separate commands.',
|
|
393
422
|
schema: {
|
|
394
423
|
prompt: external_zod_namespaceObject.z.string().describe('Natural language description of the action to perform, e.g. "press Command+Space, type Safari, press Enter"'),
|
|
424
|
+
deepLocate: external_zod_namespaceObject.z.boolean().optional().describe('Use deep locate for every element this action targets. Improves precision for small or ambiguous targets at the cost of speed. Defaults to the server --deep-locate setting.'),
|
|
425
|
+
deepThink: external_zod_namespaceObject.z.boolean().optional().describe('Plan this action with deep thinking (richer context and sub-goal decomposition). Helps with complex multi-step instructions at the cost of speed. Defaults to the server --deep-think setting.'),
|
|
395
426
|
...initArgSchema
|
|
396
427
|
},
|
|
397
428
|
cli: mergeToolCliMetadata(void 0, initArgCliMetadata),
|
|
@@ -400,9 +431,13 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
|
|
|
400
431
|
try {
|
|
401
432
|
const agent = await getAgent(args);
|
|
402
433
|
if (!agent.aiAction) return createErrorResult('act is not supported by this agent');
|
|
403
|
-
const
|
|
404
|
-
deepThink: false
|
|
405
|
-
|
|
434
|
+
const actOptions = {
|
|
435
|
+
deepThink: false,
|
|
436
|
+
...toolDefaults.act
|
|
437
|
+
};
|
|
438
|
+
if (void 0 !== args.deepLocate) actOptions.deepLocate = args.deepLocate;
|
|
439
|
+
if (void 0 !== args.deepThink) actOptions.deepThink = args.deepThink;
|
|
440
|
+
const result = await agent.aiAction(prompt, actOptions);
|
|
406
441
|
return await captureScreenshotResult(agent, 'act', result);
|
|
407
442
|
} catch (error) {
|
|
408
443
|
const errorMessage = (0, external_error_formatter_js_namespaceObject.getErrorMessage)(error);
|
|
@@ -416,12 +451,14 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
|
|
|
416
451
|
description: 'Assert a natural language statement against the current page/screen.',
|
|
417
452
|
schema: {
|
|
418
453
|
prompt: external_zod_namespaceObject.z.string().describe('Natural language assertion to verify, e.g. "there is a login button visible"'),
|
|
454
|
+
message: external_zod_namespaceObject.z.string().optional().describe('Custom error message to throw when the assertion fails, e.g. "the login button should be visible".'),
|
|
419
455
|
...external_user_prompt_js_namespaceObject.promptInputExtraSchema,
|
|
420
456
|
...initArgSchema
|
|
421
457
|
},
|
|
422
458
|
cli: mergeToolCliMetadata(void 0, initArgCliMetadata),
|
|
423
459
|
handler: async (args = {})=>{
|
|
424
460
|
const prompt = args.prompt;
|
|
461
|
+
const message = args.message;
|
|
425
462
|
try {
|
|
426
463
|
const agent = await getAgent(args);
|
|
427
464
|
if (!agent.aiAssert) return createErrorResult('assert is not supported by this agent');
|
|
@@ -431,7 +468,7 @@ function generateCommonTools(getAgent, initArgSchema = {}, initArgCliMetadata) {
|
|
|
431
468
|
imageName: args.imageName,
|
|
432
469
|
convertHttpImage2Base64: args.convertHttpImage2Base64
|
|
433
470
|
});
|
|
434
|
-
await agent.aiAssert(userPrompt);
|
|
471
|
+
await agent.aiAssert(userPrompt, message);
|
|
435
472
|
return {
|
|
436
473
|
content: [
|
|
437
474
|
{
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { ParseArgsConfig } from 'node:util';
|
|
2
2
|
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
3
|
+
import { type ToolDefaults } from './tool-defaults';
|
|
3
4
|
import type { IMidsceneTools } from './types';
|
|
4
5
|
export interface BaseMCPServerConfig {
|
|
5
6
|
name: string;
|
|
@@ -25,13 +26,17 @@ export interface LaunchMCPServerResult {
|
|
|
25
26
|
close: () => Promise<void>;
|
|
26
27
|
}
|
|
27
28
|
/**
|
|
28
|
-
* CLI argument configuration for MCP servers
|
|
29
|
+
* CLI argument configuration for MCP servers. Behavior flags (e.g.
|
|
30
|
+
* `--deep-locate`) are generated from {@link TOOL_BEHAVIOR_FLAGS}, so adding a
|
|
31
|
+
* new flag there exposes it here automatically.
|
|
29
32
|
*/
|
|
30
33
|
export declare const CLI_ARGS_CONFIG: ParseArgsConfig['options'];
|
|
31
34
|
export interface CLIArgs {
|
|
32
35
|
mode?: string;
|
|
33
36
|
port?: string;
|
|
34
37
|
host?: string;
|
|
38
|
+
/** Behavior flags such as `deep-locate` / `deep-think` (see TOOL_BEHAVIOR_FLAGS). */
|
|
39
|
+
[flag: string]: string | boolean | undefined;
|
|
35
40
|
}
|
|
36
41
|
/**
|
|
37
42
|
* Launch an MCP server based on CLI arguments
|
|
@@ -47,7 +52,15 @@ export declare abstract class BaseMCPServer {
|
|
|
47
52
|
protected toolsManager?: IMidsceneTools;
|
|
48
53
|
protected config: BaseMCPServerConfig;
|
|
49
54
|
protected providedToolsManager?: IMidsceneTools;
|
|
55
|
+
protected toolDefaults: ToolDefaults;
|
|
50
56
|
constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools);
|
|
57
|
+
/**
|
|
58
|
+
* Set the default options injected into generated tool calls (e.g. forced
|
|
59
|
+
* deep locate / deep think). Must be called before `launch()` /
|
|
60
|
+
* `launchHttp()` so they are applied to the tools manager before its tools
|
|
61
|
+
* are generated. Merges with any previously set defaults.
|
|
62
|
+
*/
|
|
63
|
+
setToolDefaults(toolDefaults: ToolDefaults): void;
|
|
51
64
|
/**
|
|
52
65
|
* Platform-specific: create tools manager instance
|
|
53
66
|
* This is only called if no tools manager was provided in constructor
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
2
|
import type { z } from 'zod';
|
|
3
3
|
import { type CliReportSession } from './cli-report-session';
|
|
4
|
+
import { type ToolDefaults } from './tool-defaults';
|
|
4
5
|
import type { BaseAgent, BaseDevice, IMidsceneTools, ToolCliMetadata, ToolDefinition, ToolSchema } from './types';
|
|
5
6
|
/**
|
|
6
7
|
* Declarative description of a platform's agent init args.
|
|
@@ -38,6 +39,13 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
|
|
|
38
39
|
protected mcpServer?: McpServer;
|
|
39
40
|
protected agent?: TAgent;
|
|
40
41
|
protected toolDefinitions: ToolDefinition[];
|
|
42
|
+
/**
|
|
43
|
+
* Default options injected into every generated tool call (e.g. forced deep
|
|
44
|
+
* locate / deep think). Set from server/CLI behavior flags before
|
|
45
|
+
* `initTools()` so they are baked into the generated tool handlers.
|
|
46
|
+
* See https://github.com/web-infra-dev/midscene/issues/2446.
|
|
47
|
+
*/
|
|
48
|
+
protected toolDefaults: ToolDefaults;
|
|
41
49
|
/**
|
|
42
50
|
* Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
|
|
43
51
|
* set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
|
|
@@ -117,6 +125,12 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
|
|
|
117
125
|
* Set agent for the tools manager
|
|
118
126
|
*/
|
|
119
127
|
setAgent(agent: TAgent): void;
|
|
128
|
+
/**
|
|
129
|
+
* Set the default options injected into generated tool calls. Must be called
|
|
130
|
+
* before `initTools()` because the values are captured into the generated
|
|
131
|
+
* tool handlers. Merges with any previously set defaults.
|
|
132
|
+
*/
|
|
133
|
+
setToolDefaults(toolDefaults: ToolDefaults): void;
|
|
120
134
|
/**
|
|
121
135
|
* Helper: Convert base64 screenshot to image content array
|
|
122
136
|
*/
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Unified, declarative mechanism for "force a default option on every tool
|
|
3
|
+
* call" behaviors exposed by MCP servers and the device / Agent Skill CLIs.
|
|
4
|
+
*
|
|
5
|
+
* Adding a new behavior flag (e.g. `--deep-search`) is a one-line change to
|
|
6
|
+
* {@link TOOL_BEHAVIOR_FLAGS}: declare which default-option "bag" it fills.
|
|
7
|
+
* The tool generator, servers, tools managers and CLI parsing are all generic
|
|
8
|
+
* over {@link ToolDefaults} and never need to learn about individual flags.
|
|
9
|
+
*
|
|
10
|
+
* See https://github.com/web-infra-dev/midscene/issues/2446.
|
|
11
|
+
*/
|
|
12
|
+
/**
|
|
13
|
+
* Default options injected into generated tool calls. Each field is an
|
|
14
|
+
* injection point; an explicit per-call value always wins over these defaults.
|
|
15
|
+
*/
|
|
16
|
+
export interface ToolDefaults {
|
|
17
|
+
/**
|
|
18
|
+
* Merged into every locate field of action tools (`Tap`, `Input`, ...).
|
|
19
|
+
* e.g. `{ deepLocate: true }`.
|
|
20
|
+
*/
|
|
21
|
+
locate?: Record<string, unknown>;
|
|
22
|
+
/**
|
|
23
|
+
* Merged into the `act` tool's `aiAction` options.
|
|
24
|
+
* e.g. `{ deepLocate: true, deepThink: true }`.
|
|
25
|
+
*/
|
|
26
|
+
act?: Record<string, unknown>;
|
|
27
|
+
}
|
|
28
|
+
export interface ToolBehaviorFlag {
|
|
29
|
+
/** Kebab-case CLI flag name, e.g. `deep-locate` (exposed as `--deep-locate`). */
|
|
30
|
+
cli: string;
|
|
31
|
+
/** One-line description for help output. */
|
|
32
|
+
description: string;
|
|
33
|
+
/** Default-option bags this flag turns on when present. */
|
|
34
|
+
defaults: ToolDefaults;
|
|
35
|
+
}
|
|
36
|
+
/**
|
|
37
|
+
* The single source of truth for behavior flags. Add a row to support a new
|
|
38
|
+
* `--flag`; nothing else in the pipeline needs to change.
|
|
39
|
+
*/
|
|
40
|
+
export declare const TOOL_BEHAVIOR_FLAGS: readonly ToolBehaviorFlag[];
|
|
41
|
+
/** Merge two {@link ToolDefaults}, with `b` taking precedence over `a`. */
|
|
42
|
+
export declare function mergeToolDefaults(a: ToolDefaults, b: ToolDefaults): ToolDefaults;
|
|
43
|
+
/**
|
|
44
|
+
* Resolve the active {@link ToolDefaults} from a predicate that says whether a
|
|
45
|
+
* given flag (by its `cli` name) is enabled.
|
|
46
|
+
*/
|
|
47
|
+
export declare function resolveToolDefaults(isEnabled: (cli: string) => boolean): ToolDefaults;
|
|
48
|
+
/**
|
|
49
|
+
* Split argv into the resolved {@link ToolDefaults} and the remaining args.
|
|
50
|
+
*
|
|
51
|
+
* Behavior flags (e.g. `--deep-locate`) are global: they may appear anywhere
|
|
52
|
+
* in argv and are not tied to a specific sub-command. They are recognized by
|
|
53
|
+
* exact kebab-case match — the same surface the MCP `parseArgs` config exposes
|
|
54
|
+
* — and removed so a strict per-command parser never sees them. Every other
|
|
55
|
+
* token is returned untouched and in order for that per-command parser.
|
|
56
|
+
*
|
|
57
|
+
* This is the single place that knows how a behavior flag looks on the command
|
|
58
|
+
* line; both the device / Agent Skill CLI and the MCP launch path resolve their
|
|
59
|
+
* defaults from {@link TOOL_BEHAVIOR_FLAGS} through here / {@link resolveToolDefaults}.
|
|
60
|
+
*/
|
|
61
|
+
export declare function stripBehaviorFlags(argv: readonly string[]): {
|
|
62
|
+
rawArgs: string[];
|
|
63
|
+
toolDefaults: ToolDefaults;
|
|
64
|
+
};
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import type { ToolDefaults } from './tool-defaults';
|
|
1
2
|
import type { ActionSpaceItem, BaseAgent, ToolCliMetadata, ToolDefinition, ToolSchema } from './types';
|
|
2
3
|
import { composeUserPrompt } from './user-prompt';
|
|
3
4
|
export { composeUserPrompt };
|
|
@@ -5,8 +6,8 @@ export { composeUserPrompt };
|
|
|
5
6
|
* Converts DeviceAction from actionSpace into MCP ToolDefinition
|
|
6
7
|
* This is the core logic that removes need for hardcoded tool definitions
|
|
7
8
|
*/
|
|
8
|
-
export declare function generateToolsFromActionSpace(actionSpace: ActionSpaceItem[], getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, sanitizeArgs?: (args: Record<string, unknown>) => Record<string, unknown>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata): ToolDefinition[];
|
|
9
|
+
export declare function generateToolsFromActionSpace(actionSpace: ActionSpaceItem[], getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, sanitizeArgs?: (args: Record<string, unknown>) => Record<string, unknown>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata, toolDefaults?: ToolDefaults): ToolDefinition[];
|
|
9
10
|
/**
|
|
10
11
|
* Generate common tools (screenshot, act)
|
|
11
12
|
*/
|
|
12
|
-
export declare function generateCommonTools(getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata): ToolDefinition[];
|
|
13
|
+
export declare function generateCommonTools(getAgent: (args?: Record<string, unknown>) => Promise<BaseAgent>, initArgSchema?: ToolSchema, initArgCliMetadata?: ToolCliMetadata, toolDefaults?: ToolDefaults): ToolDefinition[];
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
|
|
2
2
|
import type { z } from 'zod';
|
|
3
|
+
import type { ToolDefaults } from './tool-defaults';
|
|
3
4
|
/**
|
|
4
5
|
* Default timeout constants for app loading verification
|
|
5
6
|
*/
|
|
@@ -130,4 +131,5 @@ export interface IMidsceneTools {
|
|
|
130
131
|
attachToServer(server: McpServer): void;
|
|
131
132
|
initTools(): Promise<void>;
|
|
132
133
|
destroy?(): Promise<void>;
|
|
134
|
+
setToolDefaults?(toolDefaults: ToolDefaults): void;
|
|
133
135
|
}
|
package/package.json
CHANGED
package/src/cli/cli-runner.ts
CHANGED
|
@@ -4,6 +4,7 @@ import { join } from 'node:path';
|
|
|
4
4
|
import dotenv from 'dotenv';
|
|
5
5
|
import { getDebug } from '../logger';
|
|
6
6
|
import type { BaseMidsceneTools } from '../mcp/base-tools';
|
|
7
|
+
import { stripBehaviorFlags } from '../mcp/tool-defaults';
|
|
7
8
|
import type {
|
|
8
9
|
ToolDefinition,
|
|
9
10
|
ToolResult,
|
|
@@ -135,8 +136,18 @@ export async function runToolsCLI(
|
|
|
135
136
|
scriptName: string,
|
|
136
137
|
options?: CLIRunnerOptions,
|
|
137
138
|
): Promise<void> {
|
|
138
|
-
const
|
|
139
|
-
debug('CLI invoked: %s %s', scriptName,
|
|
139
|
+
const inputArgs = options?.argv ?? process.argv.slice(2);
|
|
140
|
+
debug('CLI invoked: %s %s', scriptName, inputArgs.join(' '));
|
|
141
|
+
|
|
142
|
+
// Global behavior flags (e.g. `--deep-locate` / `--deep-think`) apply
|
|
143
|
+
// regardless of which command runs. `stripBehaviorFlags` is the single place
|
|
144
|
+
// that knows how they look on the command line: it resolves their defaults
|
|
145
|
+
// and returns the remaining args so the per-command parser never sees them.
|
|
146
|
+
// See https://github.com/web-infra-dev/midscene/issues/2446.
|
|
147
|
+
const { rawArgs, toolDefaults } = stripBehaviorFlags(inputArgs);
|
|
148
|
+
if (Object.keys(toolDefaults).length > 0) {
|
|
149
|
+
tools.setToolDefaults?.(toolDefaults);
|
|
150
|
+
}
|
|
140
151
|
|
|
141
152
|
// Load .env from cwd before any tool initialization
|
|
142
153
|
const envFile = join(process.cwd(), '.env');
|
package/src/mcp/base-server.ts
CHANGED
|
@@ -10,6 +10,12 @@ import express, {
|
|
|
10
10
|
type Response,
|
|
11
11
|
} from 'express';
|
|
12
12
|
import { getErrorMessage } from './error-formatter';
|
|
13
|
+
import {
|
|
14
|
+
TOOL_BEHAVIOR_FLAGS,
|
|
15
|
+
type ToolDefaults,
|
|
16
|
+
mergeToolDefaults,
|
|
17
|
+
resolveToolDefaults,
|
|
18
|
+
} from './tool-defaults';
|
|
13
19
|
import type { IMidsceneTools } from './types';
|
|
14
20
|
|
|
15
21
|
export interface BaseMCPServerConfig {
|
|
@@ -47,18 +53,25 @@ interface SessionData {
|
|
|
47
53
|
}
|
|
48
54
|
|
|
49
55
|
/**
|
|
50
|
-
* CLI argument configuration for MCP servers
|
|
56
|
+
* CLI argument configuration for MCP servers. Behavior flags (e.g.
|
|
57
|
+
* `--deep-locate`) are generated from {@link TOOL_BEHAVIOR_FLAGS}, so adding a
|
|
58
|
+
* new flag there exposes it here automatically.
|
|
51
59
|
*/
|
|
52
60
|
export const CLI_ARGS_CONFIG: ParseArgsConfig['options'] = {
  // Transport selection and HTTP bind address.
  mode: { type: 'string', default: 'stdio' },
  port: { type: 'string', default: '3000' },
  host: { type: 'string', default: 'localhost' },
  // One value-less boolean switch per registered behavior flag, keyed by the
  // flag's kebab-case CLI name.
  ...Object.fromEntries(
    TOOL_BEHAVIOR_FLAGS.map(
      ({ cli }): [string, { type: 'boolean' }] => [cli, { type: 'boolean' }],
    ),
  ),
};
|
|
57
68
|
|
|
58
69
|
export interface CLIArgs {
|
|
59
70
|
mode?: string;
|
|
60
71
|
port?: string;
|
|
61
72
|
host?: string;
|
|
73
|
+
/** Behavior flags such as `deep-locate` / `deep-think` (see TOOL_BEHAVIOR_FLAGS). */
|
|
74
|
+
[flag: string]: string | boolean | undefined;
|
|
62
75
|
}
|
|
63
76
|
|
|
64
77
|
/**
|
|
@@ -69,6 +82,7 @@ export function launchMCPServer(
|
|
|
69
82
|
server: BaseMCPServer,
|
|
70
83
|
args: CLIArgs,
|
|
71
84
|
): Promise<LaunchMCPServerResult> {
|
|
85
|
+
server.setToolDefaults(resolveToolDefaults((cli) => args[cli] === true));
|
|
72
86
|
if (args.mode === 'http') {
|
|
73
87
|
return server.launchHttp({
|
|
74
88
|
port: Number.parseInt(args.port || '3000', 10),
|
|
@@ -91,6 +105,7 @@ export abstract class BaseMCPServer {
|
|
|
91
105
|
protected toolsManager?: IMidsceneTools;
|
|
92
106
|
protected config: BaseMCPServerConfig;
|
|
93
107
|
protected providedToolsManager?: IMidsceneTools;
|
|
108
|
+
protected toolDefaults: ToolDefaults = {};
|
|
94
109
|
|
|
95
110
|
constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools) {
|
|
96
111
|
this.config = config;
|
|
@@ -102,6 +117,16 @@ export abstract class BaseMCPServer {
|
|
|
102
117
|
this.providedToolsManager = toolsManager;
|
|
103
118
|
}
|
|
104
119
|
|
|
120
|
+
/**
 * Register default options to inject into generated tool calls (for example
 * forced deep locate / deep think).
 *
 * The given defaults are merged on top of whatever was registered earlier.
 * Call this before `launch()` / `launchHttp()`: the accumulated defaults are
 * handed to the tools manager ahead of tool generation.
 *
 * @param toolDefaults Defaults to merge into the server's current set.
 */
public setToolDefaults(toolDefaults: ToolDefaults): void {
  const combined = mergeToolDefaults(this.toolDefaults, toolDefaults);
  this.toolDefaults = combined;
}
|
|
129
|
+
|
|
105
130
|
/**
|
|
106
131
|
* Platform-specific: create tools manager instance
|
|
107
132
|
* This is only called if no tools manager was provided in constructor
|
|
@@ -117,6 +142,10 @@ export abstract class BaseMCPServer {
|
|
|
117
142
|
// Use provided tools manager if available, otherwise create new one
|
|
118
143
|
this.toolsManager = this.providedToolsManager || this.createToolsManager();
|
|
119
144
|
|
|
145
|
+
// Apply the tool defaults before tools are generated so they are baked
|
|
146
|
+
// into the generated tool handlers.
|
|
147
|
+
this.toolsManager.setToolDefaults?.(this.toolDefaults);
|
|
148
|
+
|
|
120
149
|
try {
|
|
121
150
|
await this.toolsManager.initTools();
|
|
122
151
|
} catch (error: unknown) {
|
package/src/mcp/base-tools.ts
CHANGED
|
@@ -14,6 +14,7 @@ import {
|
|
|
14
14
|
extractNamespacedArgs,
|
|
15
15
|
sanitizeNamespacedArgs,
|
|
16
16
|
} from './init-arg-utils';
|
|
17
|
+
import { type ToolDefaults, mergeToolDefaults } from './tool-defaults';
|
|
17
18
|
import {
|
|
18
19
|
generateCommonTools,
|
|
19
20
|
generateToolsFromActionSpace,
|
|
@@ -74,6 +75,14 @@ export abstract class BaseMidsceneTools<
|
|
|
74
75
|
protected agent?: TAgent;
|
|
75
76
|
protected toolDefinitions: ToolDefinition[] = [];
|
|
76
77
|
|
|
78
|
+
/**
|
|
79
|
+
* Default options injected into every generated tool call (e.g. forced deep
|
|
80
|
+
* locate / deep think). Set from server/CLI behavior flags before
|
|
81
|
+
* `initTools()` so they are baked into the generated tool handlers.
|
|
82
|
+
* See https://github.com/web-infra-dev/midscene/issues/2446.
|
|
83
|
+
*/
|
|
84
|
+
protected toolDefaults: ToolDefaults = {};
|
|
85
|
+
|
|
77
86
|
/**
|
|
78
87
|
* Declarative init-arg spec. Subclasses that accept CLI/MCP init args should
|
|
79
88
|
* set this once and get `extractAgentInitParam` / `sanitizeToolArgs` /
|
|
@@ -289,6 +298,7 @@ export abstract class BaseMidsceneTools<
|
|
|
289
298
|
(args = {}) => this.sanitizeToolArgs(args),
|
|
290
299
|
this.getAgentInitArgSchema(),
|
|
291
300
|
this.getAgentInitArgCliMetadata(),
|
|
301
|
+
this.toolDefaults,
|
|
292
302
|
);
|
|
293
303
|
|
|
294
304
|
// 4. Add common tools (screenshot, waitFor)
|
|
@@ -296,6 +306,7 @@ export abstract class BaseMidsceneTools<
|
|
|
296
306
|
(args = {}) => this.ensureAgent(this.extractAgentInitParam(args)),
|
|
297
307
|
this.getAgentInitArgSchema(),
|
|
298
308
|
this.getAgentInitArgCliMetadata(),
|
|
309
|
+
this.toolDefaults,
|
|
299
310
|
);
|
|
300
311
|
this.toolDefinitions.push(...actionTools, ...commonTools);
|
|
301
312
|
|
|
@@ -345,6 +356,15 @@ export abstract class BaseMidsceneTools<
|
|
|
345
356
|
this.agent = agent;
|
|
346
357
|
}
|
|
347
358
|
|
|
359
|
+
/**
 * Register default options to inject into generated tool calls.
 *
 * The given defaults are merged on top of whatever was registered earlier.
 * Call this before `initTools()`: the values are captured by the generated
 * tool handlers at generation time.
 *
 * @param toolDefaults Defaults to merge into the manager's current set.
 */
public setToolDefaults(toolDefaults: ToolDefaults): void {
  const combined = mergeToolDefaults(this.toolDefaults, toolDefaults);
  this.toolDefaults = combined;
}
|
|
367
|
+
|
|
348
368
|
/**
|
|
349
369
|
* Helper: Convert base64 screenshot to image content array
|
|
350
370
|
*/
|
package/src/mcp/index.ts
CHANGED
|
@@ -0,0 +1,120 @@
|
|
|
1
|
+
/**
 * Unified, declarative mechanism for "force a default option on every tool
 * call" behaviors exposed by MCP servers and the device / Agent Skill CLIs.
 *
 * Adding a new behavior flag (e.g. `--deep-search`) is a one-line change to
 * {@link TOOL_BEHAVIOR_FLAGS}: declare which default-option "bag" it fills.
 * Consumers are expected to work with {@link ToolDefaults} only, so they do
 * not need to know about individual flags.
 *
 * See https://github.com/web-infra-dev/midscene/issues/2446.
 */

/**
 * Default options injected into generated tool calls. Each field is an
 * injection point; an explicit per-call value is meant to win over these
 * defaults.
 */
export interface ToolDefaults {
  /**
   * Merged into every locate field of action tools (`Tap`, `Input`, ...).
   * e.g. `{ deepLocate: true }`.
   */
  locate?: Record<string, unknown>;
  /**
   * Merged into the `act` tool's `aiAction` options.
   * e.g. `{ deepLocate: true, deepThink: true }`.
   */
  act?: Record<string, unknown>;
}

export interface ToolBehaviorFlag {
  /** Kebab-case CLI flag name, e.g. `deep-locate` (exposed as `--deep-locate`). */
  cli: string;
  /** One-line description for help output. */
  description: string;
  /** Default-option bags this flag turns on when present. */
  defaults: ToolDefaults;
}

/**
 * The single source of truth for behavior flags. Add a row to support a new
 * `--flag`; later rows win over earlier ones when they set the same key.
 */
export const TOOL_BEHAVIOR_FLAGS: readonly ToolBehaviorFlag[] = [
  {
    cli: 'deep-locate',
    description:
      'Force deep locate for every locating operation (better precision for small/ambiguous targets, a bit slower).',
    defaults: { locate: { deepLocate: true }, act: { deepLocate: true } },
  },
  {
    cli: 'deep-think',
    description:
      'Plan the act tool with deep thinking (richer context and sub-goal decomposition, a bit slower).',
    defaults: { act: { deepThink: true } },
  },
];

/**
 * Merge two {@link ToolDefaults}, with `b` taking precedence over `a`.
 *
 * Each bag is shallow-merged into a fresh object; neither input is mutated.
 * A bag that ends up with no keys is omitted from the result, so merging two
 * empty defaults yields `{}`.
 *
 * @param a Base defaults.
 * @param b Overriding defaults.
 * @returns A new {@link ToolDefaults} containing only the non-empty bags.
 */
export function mergeToolDefaults(
  a: ToolDefaults,
  b: ToolDefaults,
): ToolDefaults {
  const locate = { ...a.locate, ...b.locate };
  const act = { ...a.act, ...b.act };
  const result: ToolDefaults = {};
  if (Object.keys(locate).length > 0) {
    result.locate = locate;
  }
  if (Object.keys(act).length > 0) {
    result.act = act;
  }
  return result;
}

/**
 * Resolve the active {@link ToolDefaults} from a predicate that says whether a
 * given flag (by its `cli` name) is enabled.
 *
 * Flags are folded in {@link TOOL_BEHAVIOR_FLAGS} order via
 * {@link mergeToolDefaults}.
 *
 * @param isEnabled Returns `true` for every flag name that is switched on.
 * @returns The merged defaults of all enabled flags (`{}` when none are).
 */
export function resolveToolDefaults(
  isEnabled: (cli: string) => boolean,
): ToolDefaults {
  return TOOL_BEHAVIOR_FLAGS.reduce<ToolDefaults>(
    (acc, flag) =>
      isEnabled(flag.cli) ? mergeToolDefaults(acc, flag.defaults) : acc,
    {},
  );
}

/**
 * Split argv into the resolved {@link ToolDefaults} and the remaining args.
 *
 * Behavior flags (e.g. `--deep-locate`) are global: they may appear anywhere
 * before a `--` end-of-options terminator and are not tied to a specific
 * sub-command. They are recognized by exact kebab-case match (`--<cli>`) and
 * removed so a strict per-command parser never sees them. Every other token is
 * returned untouched and in order.
 *
 * Once a bare `--` is encountered, it and all following tokens are passed
 * through verbatim: by the usual option-parsing convention (also followed by
 * Node's `util.parseArgs`) those tokens are operands, so one that merely looks
 * like a behavior flag must neither be dropped nor enable a default.
 *
 * @param argv Raw command-line tokens (not mutated).
 * @returns `rawArgs` — argv minus the recognized behavior flags;
 *          `toolDefaults` — defaults of the flags that were present.
 */
export function stripBehaviorFlags(argv: readonly string[]): {
  rawArgs: string[];
  toolDefaults: ToolDefaults;
} {
  const enabled = new Set<string>();
  const rawArgs: string[] = [];
  let optionsEnded = false;
  for (const arg of argv) {
    if (optionsEnded) {
      // Operand after `--`: never interpreted as a flag.
      rawArgs.push(arg);
      continue;
    }
    if (arg === '--') {
      // Keep the terminator so the per-command parser sees it too.
      optionsEnded = true;
      rawArgs.push(arg);
      continue;
    }
    const flag = TOOL_BEHAVIOR_FLAGS.find((f) => arg === `--${f.cli}`);
    if (flag) {
      enabled.add(flag.cli);
    } else {
      rawArgs.push(arg);
    }
  }
  return {
    rawArgs,
    toolDefaults: resolveToolDefaults((cli) => enabled.has(cli)),
  };
}
|