@midscene/shared 1.0.4 → 1.0.5-beta-20251230124359.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -5,15 +5,22 @@ const PLAYWRIGHT_EXAMPLE_CODE = `
5
5
  IMPORTANT: Follow these exact type signatures for AI functions:
6
6
 
7
7
  // Type signatures for AI functions:
8
- aiInput(value: string, locator: string): Promise<void>
9
- aiTap(locator: string): Promise<void>
10
- aiDoubleClick(locator: string): Promise<void>
11
- aiScroll(scrollParam: {
12
- direction: 'up' | 'down' | 'left' | 'right',
13
- scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
14
- distance: number - scroll distance, px is the unit
8
+ aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
9
+ aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
10
+ aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
11
+ aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
12
+ aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
13
+ aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
14
+ aiScroll(locate: string | undefined, options: {
15
+ direction?: 'up' | 'down' | 'left' | 'right',
16
+ scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
17
+ distance?: number | null,
18
+ deepThink?: boolean,
19
+ xpath?: string,
20
+ cacheable?: boolean
15
21
  }): Promise<void>
16
- aiAssert(assertion: string): Promise<void>
22
+ aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
23
+ aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
17
24
  aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
18
25
 
19
26
  // examples:
@@ -35,12 +42,14 @@ test.beforeEach(async ({ page }) => {
35
42
  });
36
43
 
37
44
  test('ai shop', async ({
45
+ aiAct,
38
46
  aiInput,
39
47
  aiAssert,
40
48
  aiQuery,
41
49
  aiKeyboardPress,
42
50
  aiHover,
43
51
  aiTap,
52
+ aiWaitFor,
44
53
  agentForPage,
45
54
  page,
46
55
  }) => {
@@ -86,7 +95,7 @@ tasks:
86
95
  locate: 'input field description'
87
96
  - aiScroll:
88
97
  direction: down/up
89
- scrollType: untilBottom/untilTop/page
98
+ scrollType: scrollToBottom/scrollToTop/singleAction
90
99
  - aiAssert: "expected state"
91
100
  - sleep: milliseconds
92
101
 
@@ -159,7 +168,7 @@ tasks:
159
168
  # Scroll globally or on an element described by a prompt.
160
169
  - aiScroll:
161
170
  direction: 'up' # or 'down' | 'left' | 'right'
162
- scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
171
+ scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
163
172
  distance: <number> # Optional, the scroll distance in pixels.
164
173
  locate: <prompt> # Optional, the element to scroll on.
165
174
  deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.
@@ -41,7 +41,7 @@ const MAX_SESSIONS = 100;
41
41
  class BaseMCPServer {
42
42
  async initializeToolsManager() {
43
43
  setIsMcp(true);
44
- this.toolsManager = this.createToolsManager();
44
+ this.toolsManager = this.providedToolsManager || this.createToolsManager();
45
45
  try {
46
46
  await this.toolsManager.initTools();
47
47
  } catch (error) {
@@ -91,6 +91,11 @@ class BaseMCPServer {
91
91
  };
92
92
  process.once('SIGINT', cleanup);
93
93
  process.once('SIGTERM', cleanup);
94
+ return {
95
+ close: async ()=>{
96
+ this.performCleanup();
97
+ }
98
+ };
94
99
  }
95
100
  async launchHttp(options) {
96
101
  if (!Number.isInteger(options.port) || options.port < 1 || options.port > 65535) throw new Error(`Invalid port number: ${options.port}. Port must be between 1 and 65535.`);
@@ -151,6 +156,27 @@ class BaseMCPServer {
151
156
  });
152
157
  const cleanupInterval = this.startSessionCleanup(sessions);
153
158
  this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
159
+ return {
160
+ port: options.port,
161
+ host,
162
+ close: async ()=>{
163
+ clearInterval(cleanupInterval);
164
+ for (const session of sessions.values())try {
165
+ await session.transport.close();
166
+ } catch (error) {
167
+ const message = error instanceof Error ? error.message : String(error);
168
+ console.error(`Failed to close session ${session.transport.sessionId}: ${message}`);
169
+ }
170
+ sessions.clear();
171
+ return new Promise((resolve)=>{
172
+ server.close((err)=>{
173
+ if (err) console.error('Error closing HTTP server:', err);
174
+ this.performCleanup();
175
+ resolve();
176
+ });
177
+ });
178
+ }
179
+ };
154
180
  }
155
181
  async createHttpSession(sessions) {
156
182
  const transport = new StreamableHTTPServerTransport({
@@ -235,16 +261,18 @@ class BaseMCPServer {
235
261
  getToolsManager() {
236
262
  return this.toolsManager;
237
263
  }
238
- constructor(config){
264
+ constructor(config, toolsManager){
239
265
  _define_property(this, "mcpServer", void 0);
240
266
  _define_property(this, "toolsManager", void 0);
241
267
  _define_property(this, "config", void 0);
268
+ _define_property(this, "providedToolsManager", void 0);
242
269
  this.config = config;
243
270
  this.mcpServer = new McpServer({
244
271
  name: config.name,
245
272
  version: config.version,
246
273
  description: config.description
247
274
  });
275
+ this.providedToolsManager = toolsManager;
248
276
  }
249
277
  }
250
278
  export { BaseMCPServer, CLI_ARGS_CONFIG, launchMCPServer };
@@ -65,6 +65,12 @@ class BaseMidsceneTools {
65
65
  async closeBrowser() {
66
66
  await this.agent?.destroy?.();
67
67
  }
68
+ getToolDefinitions() {
69
+ return this.toolDefinitions;
70
+ }
71
+ setAgent(agent) {
72
+ this.agent = agent;
73
+ }
68
74
  buildScreenshotContent(screenshot) {
69
75
  const { mimeType, body } = parseBase64(screenshot);
70
76
  return [
@@ -3,3 +3,4 @@ export * from "./base-tools.mjs";
3
3
  export * from "./tool-generator.mjs";
4
4
  export * from "./types.mjs";
5
5
  export * from "./inject-report-html-plugin.mjs";
6
+ export * from "./launcher-helper.mjs";
@@ -0,0 +1,52 @@
1
+ function createMCPServerLauncher(config) {
2
+ const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;
3
+ function validateAgent() {
4
+ const device = agent.interface;
5
+ if (!device) throw new Error(`Agent must have an 'interface' property that references the underlying device.
6
+ Please ensure your agent instance is properly initialized with a device interface.
7
+ Expected: agent.interface to be defined, but got: ${typeof device}
8
+ Solution: Check that your agent constructor properly sets the interface property.`);
9
+ }
10
+ function createToolsManager() {
11
+ const toolsManager = new ToolsManagerClass();
12
+ toolsManager.agent = agent;
13
+ return toolsManager;
14
+ }
15
+ function logStartupInfo(mode, additionalInfo) {
16
+ const device = agent.interface;
17
+ console.log(`Starting Midscene ${platformName} MCP Server (${mode})...`);
18
+ console.log(`Agent: ${agent.constructor.name}`);
19
+ console.log(`Device: ${device.constructor.name}`);
20
+ if (additionalInfo?.port !== void 0) console.log(`Port: ${additionalInfo.port}`);
21
+ if (additionalInfo?.host) console.log(`Host: ${additionalInfo.host}`);
22
+ }
23
+ return {
24
+ async launch (options = {}) {
25
+ const { verbose = true } = options;
26
+ validateAgent();
27
+ if (verbose) logStartupInfo('stdio');
28
+ const toolsManager = createToolsManager();
29
+ const server = new MCPServerClass(toolsManager);
30
+ const result = await server.launch();
31
+ if (verbose) console.log(`${platformName} MCP Server started (stdio mode)`);
32
+ return result;
33
+ },
34
+ async launchHttp (options) {
35
+ const { port, host = 'localhost', verbose = true } = options;
36
+ validateAgent();
37
+ if (verbose) logStartupInfo('HTTP', {
38
+ port,
39
+ host
40
+ });
41
+ const toolsManager = createToolsManager();
42
+ const server = new MCPServerClass(toolsManager);
43
+ const result = await server.launchHttp({
44
+ port,
45
+ host
46
+ });
47
+ if (verbose) console.log(`${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`);
48
+ return result;
49
+ }
50
+ };
51
+ }
52
+ export { createMCPServerLauncher };
@@ -34,15 +34,22 @@ const PLAYWRIGHT_EXAMPLE_CODE = `
34
34
  IMPORTANT: Follow these exact type signatures for AI functions:
35
35
 
36
36
  // Type signatures for AI functions:
37
- aiInput(value: string, locator: string): Promise<void>
38
- aiTap(locator: string): Promise<void>
39
- aiDoubleClick(locator: string): Promise<void>
40
- aiScroll(scrollParam: {
41
- direction: 'up' | 'down' | 'left' | 'right',
42
- scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
43
- distance: number - scroll distance, px is the unit
37
+ aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
38
+ aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
39
+ aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
40
+ aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
41
+ aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
42
+ aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
43
+ aiScroll(locate: string | undefined, options: {
44
+ direction?: 'up' | 'down' | 'left' | 'right',
45
+ scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
46
+ distance?: number | null,
47
+ deepThink?: boolean,
48
+ xpath?: string,
49
+ cacheable?: boolean
44
50
  }): Promise<void>
45
- aiAssert(assertion: string): Promise<void>
51
+ aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
52
+ aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
46
53
  aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
47
54
 
48
55
  // examples:
@@ -64,12 +71,14 @@ test.beforeEach(async ({ page }) => {
64
71
  });
65
72
 
66
73
  test('ai shop', async ({
74
+ aiAct,
67
75
  aiInput,
68
76
  aiAssert,
69
77
  aiQuery,
70
78
  aiKeyboardPress,
71
79
  aiHover,
72
80
  aiTap,
81
+ aiWaitFor,
73
82
  agentForPage,
74
83
  page,
75
84
  }) => {
@@ -115,7 +124,7 @@ tasks:
115
124
  locate: 'input field description'
116
125
  - aiScroll:
117
126
  direction: down/up
118
- scrollType: untilBottom/untilTop/page
127
+ scrollType: scrollToBottom/scrollToTop/singleAction
119
128
  - aiAssert: "expected state"
120
129
  - sleep: milliseconds
121
130
 
@@ -188,7 +197,7 @@ tasks:
188
197
  # Scroll globally or on an element described by a prompt.
189
198
  - aiScroll:
190
199
  direction: 'up' # or 'down' | 'left' | 'right'
191
- scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
200
+ scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
192
201
  distance: <number> # Optional, the scroll distance in pixels.
193
202
  locate: <prompt> # Optional, the element to scroll on.
194
203
  deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.
@@ -81,7 +81,7 @@ const MAX_SESSIONS = 100;
81
81
  class BaseMCPServer {
82
82
  async initializeToolsManager() {
83
83
  (0, utils_namespaceObject.setIsMcp)(true);
84
- this.toolsManager = this.createToolsManager();
84
+ this.toolsManager = this.providedToolsManager || this.createToolsManager();
85
85
  try {
86
86
  await this.toolsManager.initTools();
87
87
  } catch (error) {
@@ -131,6 +131,11 @@ class BaseMCPServer {
131
131
  };
132
132
  process.once('SIGINT', cleanup);
133
133
  process.once('SIGTERM', cleanup);
134
+ return {
135
+ close: async ()=>{
136
+ this.performCleanup();
137
+ }
138
+ };
134
139
  }
135
140
  async launchHttp(options) {
136
141
  if (!Number.isInteger(options.port) || options.port < 1 || options.port > 65535) throw new Error(`Invalid port number: ${options.port}. Port must be between 1 and 65535.`);
@@ -191,6 +196,27 @@ class BaseMCPServer {
191
196
  });
192
197
  const cleanupInterval = this.startSessionCleanup(sessions);
193
198
  this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
199
+ return {
200
+ port: options.port,
201
+ host,
202
+ close: async ()=>{
203
+ clearInterval(cleanupInterval);
204
+ for (const session of sessions.values())try {
205
+ await session.transport.close();
206
+ } catch (error) {
207
+ const message = error instanceof Error ? error.message : String(error);
208
+ console.error(`Failed to close session ${session.transport.sessionId}: ${message}`);
209
+ }
210
+ sessions.clear();
211
+ return new Promise((resolve)=>{
212
+ server.close((err)=>{
213
+ if (err) console.error('Error closing HTTP server:', err);
214
+ this.performCleanup();
215
+ resolve();
216
+ });
217
+ });
218
+ }
219
+ };
194
220
  }
195
221
  async createHttpSession(sessions) {
196
222
  const transport = new streamableHttp_js_namespaceObject.StreamableHTTPServerTransport({
@@ -275,16 +301,18 @@ class BaseMCPServer {
275
301
  getToolsManager() {
276
302
  return this.toolsManager;
277
303
  }
278
- constructor(config){
304
+ constructor(config, toolsManager){
279
305
  _define_property(this, "mcpServer", void 0);
280
306
  _define_property(this, "toolsManager", void 0);
281
307
  _define_property(this, "config", void 0);
308
+ _define_property(this, "providedToolsManager", void 0);
282
309
  this.config = config;
283
310
  this.mcpServer = new mcp_js_namespaceObject.McpServer({
284
311
  name: config.name,
285
312
  version: config.version,
286
313
  description: config.description
287
314
  });
315
+ this.providedToolsManager = toolsManager;
288
316
  }
289
317
  }
290
318
  exports.BaseMCPServer = __webpack_exports__.BaseMCPServer;
@@ -93,6 +93,12 @@ class BaseMidsceneTools {
93
93
  async closeBrowser() {
94
94
  await this.agent?.destroy?.();
95
95
  }
96
+ getToolDefinitions() {
97
+ return this.toolDefinitions;
98
+ }
99
+ setAgent(agent) {
100
+ this.agent = agent;
101
+ }
96
102
  buildScreenshotContent(screenshot) {
97
103
  const { mimeType, body } = (0, img_namespaceObject.parseBase64)(screenshot);
98
104
  return [
@@ -9,6 +9,9 @@ var __webpack_modules__ = {
9
9
  "./inject-report-html-plugin" (module) {
10
10
  module.exports = require("./inject-report-html-plugin.js");
11
11
  },
12
+ "./launcher-helper" (module) {
13
+ module.exports = require("./launcher-helper.js");
14
+ },
12
15
  "./tool-generator" (module) {
13
16
  module.exports = require("./tool-generator.js");
14
17
  },
@@ -79,6 +82,10 @@ var __webpack_exports__ = {};
79
82
  var __rspack_reexport = {};
80
83
  for(const __rspack_import_key in _inject_report_html_plugin__rspack_import_4)if ("default" !== __rspack_import_key) __rspack_reexport[__rspack_import_key] = ()=>_inject_report_html_plugin__rspack_import_4[__rspack_import_key];
81
84
  __webpack_require__.d(__webpack_exports__, __rspack_reexport);
85
+ var _launcher_helper__rspack_import_5 = __webpack_require__("./launcher-helper");
86
+ var __rspack_reexport = {};
87
+ for(const __rspack_import_key in _launcher_helper__rspack_import_5)if ("default" !== __rspack_import_key) __rspack_reexport[__rspack_import_key] = ()=>_launcher_helper__rspack_import_5[__rspack_import_key];
88
+ __webpack_require__.d(__webpack_exports__, __rspack_reexport);
82
89
  })();
83
90
  for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
84
91
  Object.defineProperty(exports, '__esModule', {
@@ -0,0 +1,86 @@
1
+ "use strict";
2
+ var __webpack_require__ = {};
3
+ (()=>{
4
+ __webpack_require__.d = (exports1, definition)=>{
5
+ for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
6
+ enumerable: true,
7
+ get: definition[key]
8
+ });
9
+ };
10
+ })();
11
+ (()=>{
12
+ __webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
13
+ })();
14
+ (()=>{
15
+ __webpack_require__.r = (exports1)=>{
16
+ if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
17
+ value: 'Module'
18
+ });
19
+ Object.defineProperty(exports1, '__esModule', {
20
+ value: true
21
+ });
22
+ };
23
+ })();
24
+ var __webpack_exports__ = {};
25
+ __webpack_require__.r(__webpack_exports__);
26
+ __webpack_require__.d(__webpack_exports__, {
27
+ createMCPServerLauncher: ()=>createMCPServerLauncher
28
+ });
29
+ function createMCPServerLauncher(config) {
30
+ const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;
31
+ function validateAgent() {
32
+ const device = agent.interface;
33
+ if (!device) throw new Error(`Agent must have an 'interface' property that references the underlying device.
34
+ Please ensure your agent instance is properly initialized with a device interface.
35
+ Expected: agent.interface to be defined, but got: ${typeof device}
36
+ Solution: Check that your agent constructor properly sets the interface property.`);
37
+ }
38
+ function createToolsManager() {
39
+ const toolsManager = new ToolsManagerClass();
40
+ toolsManager.agent = agent;
41
+ return toolsManager;
42
+ }
43
+ function logStartupInfo(mode, additionalInfo) {
44
+ const device = agent.interface;
45
+ console.log(`Starting Midscene ${platformName} MCP Server (${mode})...`);
46
+ console.log(`Agent: ${agent.constructor.name}`);
47
+ console.log(`Device: ${device.constructor.name}`);
48
+ if (additionalInfo?.port !== void 0) console.log(`Port: ${additionalInfo.port}`);
49
+ if (additionalInfo?.host) console.log(`Host: ${additionalInfo.host}`);
50
+ }
51
+ return {
52
+ async launch (options = {}) {
53
+ const { verbose = true } = options;
54
+ validateAgent();
55
+ if (verbose) logStartupInfo('stdio');
56
+ const toolsManager = createToolsManager();
57
+ const server = new MCPServerClass(toolsManager);
58
+ const result = await server.launch();
59
+ if (verbose) console.log(`${platformName} MCP Server started (stdio mode)`);
60
+ return result;
61
+ },
62
+ async launchHttp (options) {
63
+ const { port, host = 'localhost', verbose = true } = options;
64
+ validateAgent();
65
+ if (verbose) logStartupInfo('HTTP', {
66
+ port,
67
+ host
68
+ });
69
+ const toolsManager = createToolsManager();
70
+ const server = new MCPServerClass(toolsManager);
71
+ const result = await server.launchHttp({
72
+ port,
73
+ host
74
+ });
75
+ if (verbose) console.log(`${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`);
76
+ return result;
77
+ }
78
+ };
79
+ }
80
+ exports.createMCPServerLauncher = __webpack_exports__.createMCPServerLauncher;
81
+ for(var __rspack_i in __webpack_exports__)if (-1 === [
82
+ "createMCPServerLauncher"
83
+ ].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
84
+ Object.defineProperty(exports, '__esModule', {
85
+ value: true
86
+ });
@@ -1,2 +1,2 @@
1
- export declare const PLAYWRIGHT_EXAMPLE_CODE = "\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nIMPORTANT: Follow these exact type signatures for AI functions:\n\n// Type signatures for AI functions:\naiInput(value: string, locator: string): Promise<void>\naiTap(locator: string): Promise<void>\naiDoubleClick(locator: string): Promise<void>\naiScroll(scrollParam: {\n direction: 'up' | 'down' | 'left' | 'right',\n scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',\n distance: number - scroll distance, px is the unit\n}): Promise<void>\naiAssert(assertion: string): Promise<void>\naiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions\n\n// examples:\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nimport { test as base } from '@playwright/test';\nimport type { PlayWrightAiFixtureType } from '@midscene/web/playwright';\nimport { PlaywrightAiFixture } from '@midscene/web/playwright';\n\nconst test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture({\n waitForNetworkIdleTimeout: 2000, // optional, the timeout for waiting for network idle between each action, default is 2000ms\n}));\n\n\ntest.beforeEach(async ({ page }) => {\n await page.goto('https://www.xxx.com/');\n await page.setViewportSize({ width: 1920, height: 1080 });\n});\n\ntest('ai shop', async ({\n aiInput,\n aiAssert,\n aiQuery,\n aiKeyboardPress,\n aiHover,\n aiTap,\n agentForPage,\n page,\n}) => {\n // login\n await aiAssert('The page shows the login interface');\n await aiInput('user_name', 'in user name input');\n await aiInput('password', 'in password input');\n await 
aiKeyboardPress('Enter', 'Login Button');\n\n // check the login success\n await aiWaitFor('The page shows that the loading is complete');\n await aiAssert('The current page shows the product detail page');\n\n // check the product info\n const dataA = await aiQuery({\n userInfo: 'User information in the format {name: string}',\n theFirstProductInfo: 'The first product info in the format {name: string, price: number}',\n });\n expect(dataA.theFirstProductInfo.name).toBe('xxx');\n expect(dataA.theFirstProductInfo.price).toBe(100);\n\n\n // add to cart\n await aiTap('click add to cart button');\n \n await aiTap('click right top cart icon');\n await aiAssert('The cart icon shows the number 1');\n});\n";
2
- export declare const YAML_EXAMPLE_CODE = "\n1. Format:\n\nweb:\n url: \"starting_url\"\n viewportWidth: 1280\n viewportHeight: 960\n\ntasks:\n - name: \"descriptive task name\"\n flow:\n - aiTap: \"element description\"\n - aiInput: 'text value'\n locate: 'input field description'\n - aiScroll:\n direction: down/up\n scrollType: untilBottom/untilTop/page\n - aiAssert: \"expected state\"\n - sleep: milliseconds\n\n2. Action Types:\n- aiTap: for clicks (natural language targeting)\n- aiInput: for text input with 'locate' field\n- aiScroll: with direction and scrollType\n- aiAssert: for validations\n- sleep: for delays (milliseconds)\n\n3. Best Practices:\n- Group related actions into logical tasks\n- Use natural language descriptions\n- Add deepThink: true for complex interactions\n- Keep task names concise but descriptive\n\n\n\nYAML type\ntasks:\n - name: <name>\n continueOnError: <boolean> # Optional, whether to continue to the next task on error, defaults to false.\n flow:\n # Auto Planning (.ai)\n # ----------------\n\n # Perform an interaction. `ai` is a shorthand for `aiAct`.\n - ai: <prompt>\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # This usage is the same as `ai`.\n - aiAct: <prompt>\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Instant Action (.aiTap, .aiDoubleClick, .aiHover, .aiInput, .aiKeyboardPress, .aiScroll)\n # ----------------\n\n # Tap an element described by a prompt.\n - aiTap: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. 
Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Double click an element described by a prompt.\n - aiDoubleClick: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Hover over an element described by a prompt.\n - aiHover: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Input text into an element described by a prompt.\n - aiInput: <final text content of the input>\n locate: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. 
Defaults to True.\n\n # Press a key (e.g., Enter, Tab, Escape) on an element described by a prompt.\n - aiKeyboardPress: <key>\n locate: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Scroll globally or on an element described by a prompt.\n - aiScroll:\n direction: 'up' # or 'down' | 'left' | 'right'\n scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'\n distance: <number> # Optional, the scroll distance in pixels.\n locate: <prompt> # Optional, the element to scroll on.\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Record the current screenshot with a description in the report file.\n - recordToReport: <title> # Optional, the title of the screenshot. 
If not provided, the title will be 'untitled'.\n content: <content> # Optional, the description of the screenshot.\n\n # Data Extraction\n # ----------------\n\n # Perform a query that returns a JSON object.\n - aiQuery: <prompt> # Remember to describe the format of the result in the prompt.\n name: <name> # The key for the query result in the JSON output.\n\n # More APIs\n # ----------------\n\n # Wait for a condition to be met, with a timeout (in ms, optional, defaults to 30000).\n - aiWaitFor: <prompt>\n timeout: <ms>\n\n # Perform an assertion.\n - aiAssert: <prompt>\n errorMessage: <error-message> # Optional, the error message to print if the assertion fails.\n\n # Wait for a specified amount of time.\n - sleep: <ms>\n\n # Execute a piece of JavaScript code in the web page context.\n - javascript: <javascript>\n name: <name> # Optional, assign a name to the return value, which will be used as a key in the JSON output.\n\n - name: <name>\n flow:\n # ...\n";
1
+ export declare const PLAYWRIGHT_EXAMPLE_CODE = "\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nIMPORTANT: Follow these exact type signatures for AI functions:\n\n// Type signatures for AI functions:\naiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>\naiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiScroll(locate: string | undefined, options: {\n direction?: 'up' | 'down' | 'left' | 'right',\n scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',\n distance?: number | null,\n deepThink?: boolean,\n xpath?: string,\n cacheable?: boolean\n}): Promise<void>\naiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>\naiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>\naiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions\n\n// examples:\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nimport { test as base } from '@playwright/test';\nimport type { PlayWrightAiFixtureType } from '@midscene/web/playwright';\nimport { 
PlaywrightAiFixture } from '@midscene/web/playwright';\n\nconst test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture({\n waitForNetworkIdleTimeout: 2000, // optional, the timeout for waiting for network idle between each action, default is 2000ms\n}));\n\n\ntest.beforeEach(async ({ page }) => {\n await page.goto('https://www.xxx.com/');\n await page.setViewportSize({ width: 1920, height: 1080 });\n});\n\ntest('ai shop', async ({\n aiAct,\n aiInput,\n aiAssert,\n aiQuery,\n aiKeyboardPress,\n aiHover,\n aiTap,\n aiWaitFor,\n agentForPage,\n page,\n}) => {\n // login\n await aiAssert('The page shows the login interface');\n await aiInput('user_name', 'in user name input');\n await aiInput('password', 'in password input');\n await aiKeyboardPress('Enter', 'Login Button');\n\n // check the login success\n await aiWaitFor('The page shows that the loading is complete');\n await aiAssert('The current page shows the product detail page');\n\n // check the product info\n const dataA = await aiQuery({\n userInfo: 'User information in the format {name: string}',\n theFirstProductInfo: 'The first product info in the format {name: string, price: number}',\n });\n expect(dataA.theFirstProductInfo.name).toBe('xxx');\n expect(dataA.theFirstProductInfo.price).toBe(100);\n\n\n // add to cart\n await aiTap('click add to cart button');\n \n await aiTap('click right top cart icon');\n await aiAssert('The cart icon shows the number 1');\n});\n";
2
+ export declare const YAML_EXAMPLE_CODE = "\n1. Format:\n\nweb:\n url: \"starting_url\"\n viewportWidth: 1280\n viewportHeight: 960\n\ntasks:\n - name: \"descriptive task name\"\n flow:\n - aiTap: \"element description\"\n - aiInput: 'text value'\n locate: 'input field description'\n - aiScroll:\n direction: down/up\n scrollType: scrollToBottom/scrollToTop/singleAction\n - aiAssert: \"expected state\"\n - sleep: milliseconds\n\n2. Action Types:\n- aiTap: for clicks (natural language targeting)\n- aiInput: for text input with 'locate' field\n- aiScroll: with direction and scrollType\n- aiAssert: for validations\n- sleep: for delays (milliseconds)\n\n3. Best Practices:\n- Group related actions into logical tasks\n- Use natural language descriptions\n- Add deepThink: true for complex interactions\n- Keep task names concise but descriptive\n\n\n\nYAML type\ntasks:\n - name: <name>\n continueOnError: <boolean> # Optional, whether to continue to the next task on error, defaults to false.\n flow:\n # Auto Planning (.ai)\n # ----------------\n\n # Perform an interaction. `ai` is a shorthand for `aiAct`.\n - ai: <prompt>\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # This usage is the same as `ai`.\n - aiAct: <prompt>\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Instant Action (.aiTap, .aiDoubleClick, .aiHover, .aiInput, .aiKeyboardPress, .aiScroll)\n # ----------------\n\n # Tap an element described by a prompt.\n - aiTap: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. 
Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Double click an element described by a prompt.\n - aiDoubleClick: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Hover over an element described by a prompt.\n - aiHover: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Input text into an element described by a prompt.\n - aiInput: <final text content of the input>\n locate: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. 
Defaults to True.\n\n # Press a key (e.g., Enter, Tab, Escape) on an element described by a prompt.\n - aiKeyboardPress: <key>\n locate: <prompt>\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Scroll globally or on an element described by a prompt.\n - aiScroll:\n direction: 'up' # or 'down' | 'left' | 'right'\n scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'\n distance: <number> # Optional, the scroll distance in pixels.\n locate: <prompt> # Optional, the element to scroll on.\n deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n # Record the current screenshot with a description in the report file.\n - recordToReport: <title> # Optional, the title of the screenshot. 
If not provided, the title will be 'untitled'.\n content: <content> # Optional, the description of the screenshot.\n\n # Data Extraction\n # ----------------\n\n # Perform a query that returns a JSON object.\n - aiQuery: <prompt> # Remember to describe the format of the result in the prompt.\n name: <name> # The key for the query result in the JSON output.\n\n # More APIs\n # ----------------\n\n # Wait for a condition to be met, with a timeout (in ms, optional, defaults to 30000).\n - aiWaitFor: <prompt>\n timeout: <ms>\n\n # Perform an assertion.\n - aiAssert: <prompt>\n errorMessage: <error-message> # Optional, the error message to print if the assertion fails.\n\n # Wait for a specified amount of time.\n - sleep: <ms>\n\n # Execute a piece of JavaScript code in the web page context.\n - javascript: <javascript>\n name: <name> # Optional, assign a name to the return value, which will be used as a key in the JSON output.\n\n - name: <name>\n flow:\n # ...\n";
@@ -10,6 +10,20 @@ export interface HttpLaunchOptions {
10
10
  port: number;
11
11
  host?: string;
12
12
  }
13
+ export interface LaunchMCPServerResult {
14
+ /**
15
+ * The MCP server port (for HTTP mode)
16
+ */
17
+ port?: number;
18
+ /**
19
+ * The server host (for HTTP mode)
20
+ */
21
+ host?: string;
22
+ /**
23
+ * Function to gracefully shutdown the MCP server
24
+ */
25
+ close: () => Promise<void>;
26
+ }
13
27
  /**
14
28
  * CLI argument configuration for MCP servers
15
29
  */
@@ -23,7 +37,7 @@ export interface CLIArgs {
23
37
  * Launch an MCP server based on CLI arguments
24
38
  * Shared helper to reduce duplication across platform CLI entry points
25
39
  */
26
- export declare function launchMCPServer(server: BaseMCPServer, args: CLIArgs): Promise<void>;
40
+ export declare function launchMCPServer(server: BaseMCPServer, args: CLIArgs): Promise<LaunchMCPServerResult>;
27
41
  /**
28
42
  * Base MCP Server class with programmatic launch() API
29
43
  * Each platform extends this to provide their own tools manager
@@ -32,9 +46,11 @@ export declare abstract class BaseMCPServer {
32
46
  protected mcpServer: McpServer;
33
47
  protected toolsManager?: IMidsceneTools;
34
48
  protected config: BaseMCPServerConfig;
35
- constructor(config: BaseMCPServerConfig);
49
+ protected providedToolsManager?: IMidsceneTools;
50
+ constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools);
36
51
  /**
37
52
  * Platform-specific: create tools manager instance
53
+ * This is only called if no tools manager was provided in constructor
38
54
  */
39
55
  protected abstract createToolsManager(): IMidsceneTools;
40
56
  /**
@@ -48,12 +64,12 @@ export declare abstract class BaseMCPServer {
48
64
  /**
49
65
  * Initialize and launch the MCP server with stdio transport
50
66
  */
51
- launch(): Promise<void>;
67
+ launch(): Promise<LaunchMCPServerResult>;
52
68
  /**
53
69
  * Launch MCP server with HTTP transport
54
70
  * Supports stateful sessions for web applications and service integration
55
71
  */
56
- launchHttp(options: HttpLaunchOptions): Promise<void>;
72
+ launchHttp(options: HttpLaunchOptions): Promise<LaunchMCPServerResult>;
57
73
  /**
58
74
  * Create a new HTTP session with transport
59
75
  */
@@ -44,6 +44,14 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
44
44
  * Cleanup method - destroy agent and release resources
45
45
  */
46
46
  closeBrowser(): Promise<void>;
47
+ /**
48
+ * Get tool definitions
49
+ */
50
+ getToolDefinitions(): ToolDefinition[];
51
+ /**
52
+ * Set agent for the tools manager
53
+ */
54
+ setAgent(agent: TAgent): void;
47
55
  /**
48
56
  * Helper: Convert base64 screenshot to image content array
49
57
  */
@@ -3,3 +3,4 @@ export * from './base-tools';
3
3
  export * from './tool-generator';
4
4
  export * from './types';
5
5
  export * from './inject-report-html-plugin';
6
+ export * from './launcher-helper';
@@ -0,0 +1,94 @@
1
+ import type { BaseMCPServer } from './base-server';
2
+ import type { HttpLaunchOptions, LaunchMCPServerResult } from './base-server';
3
+ import type { IMidsceneTools } from './types';
4
+ export interface LaunchMCPServerOptions extends HttpLaunchOptions {
5
+ /**
6
+ * Whether to show server logs
7
+ * @default true
8
+ */
9
+ verbose?: boolean;
10
+ }
11
+ /**
12
+ * Generic agent type (avoid importing from @midscene/core to prevent circular deps)
13
+ */
14
+ export interface GenericAgent<TDevice = any> {
15
+ interface: TDevice;
16
+ constructor: {
17
+ name: string;
18
+ };
19
+ }
20
+ /**
21
+ * Additional information for logging server startup
22
+ */
23
+ export interface StartupInfo {
24
+ port?: number;
25
+ host?: string;
26
+ }
27
+ export interface MCPServerLauncherConfig<AgentType extends GenericAgent = GenericAgent, ToolsManagerType extends IMidsceneTools = IMidsceneTools> {
28
+ agent: AgentType;
29
+ platformName: string;
30
+ ToolsManagerClass: new (...args: any[]) => ToolsManagerType;
31
+ MCPServerClass: new (toolsManager?: ToolsManagerType) => BaseMCPServer;
32
+ }
33
+ /**
34
+ * Create a generic MCP server launcher for a given agent, tools manager, and MCP server.
35
+ *
36
+ * This helper centralizes the common wiring logic used by platform-specific launchers:
37
+ * it constructs a tools manager, attaches the provided `agent` to it, then instantiates
38
+ * the `MCPServerClass` and exposes convenience methods to start the server over stdio
39
+ * (`launch`) or HTTP (`launchHttp`).
40
+ *
41
+ * Use this helper when adding a new platform-specific launcher or when you want to
42
+ * avoid duplicating boilerplate code for starting an MCP server. Typically, callers
43
+ * provide:
44
+ * - an `agent` instance that contains the underlying device on its `interface` property
45
+ * - a `ToolsManagerClass` that knows how to expose tools for that agent
46
+ * - an `MCPServerClass` that implements the MCP protocol and supports `launch` and
47
+ * `launchHttp` methods.
48
+ *
49
+ * The returned object has two methods:
50
+ * - `launch(options?)` to start the server using stdio transport
51
+ * - `launchHttp(options)` to start the server using HTTP transport
52
+ * Both methods accept a `verbose` flag to control console logging.
53
+ *
54
+ * @param config Configuration describing the agent, platform name (for logging),
55
+ * tools manager implementation, and MCP server implementation.
56
+ *
57
+ * @returns An object with `launch` and `launchHttp` methods to start the MCP server.
58
+ *
59
+ * @example
60
+ * ```typescript
61
+ * import { createMCPServerLauncher } from '@midscene/shared/mcp';
62
+ * import { Agent } from '@midscene/core/agent';
63
+ * import { WebMidsceneTools } from './web-tools';
64
+ * import { WebMCPServer } from './server';
65
+ *
66
+ * const agent = new Agent();
67
+ * const launcher = createMCPServerLauncher({
68
+ * agent,
69
+ * platformName: 'Web',
70
+ * ToolsManagerClass: WebMidsceneTools,
71
+ * MCPServerClass: WebMCPServer,
72
+ * });
73
+ *
74
+ * // Start with stdio
75
+ * await launcher.launch({ verbose: true });
76
+ *
77
+ * // Or start with HTTP
78
+ * await launcher.launchHttp({ port: 3000, host: 'localhost' });
79
+ * ```
80
+ *
81
+ * @internal
82
+ */
83
+ export declare function createMCPServerLauncher<AgentType extends GenericAgent, ToolsManagerType extends IMidsceneTools>(config: MCPServerLauncherConfig<AgentType, ToolsManagerType>): {
84
+ /**
85
+ * Launch the MCP server with stdio transport
86
+ */
87
+ launch(options?: {
88
+ verbose?: boolean;
89
+ }): Promise<LaunchMCPServerResult>;
90
+ /**
91
+ * Launch the MCP server with HTTP transport
92
+ */
93
+ launchHttp(options: LaunchMCPServerOptions): Promise<LaunchMCPServerResult>;
94
+ };
@@ -59,6 +59,10 @@ export interface ToolDefinition<T = Record<string, unknown>> {
59
59
  handler: ToolHandler<T>;
60
60
  autoDestroy?: boolean;
61
61
  }
62
+ /**
63
+ * Tool type for mcpKitForAgent return value
64
+ */
65
+ export type Tool = ToolDefinition;
62
66
  /**
63
67
  * Action space item definition
64
68
  * Note: Intentionally no index signature to maintain compatibility with DeviceAction
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@midscene/shared",
3
- "version": "1.0.4",
3
+ "version": "1.0.5-beta-20251230124359.0",
4
4
  "repository": "https://github.com/web-infra-dev/midscene",
5
5
  "homepage": "https://midscenejs.com/",
6
6
  "types": "./dist/types/index.d.ts",
@@ -79,6 +79,7 @@
79
79
  "README.md"
80
80
  ],
81
81
  "dependencies": {
82
+ "@modelcontextprotocol/sdk": "1.10.2",
82
83
  "@silvia-odwyer/photon": "0.3.3",
83
84
  "@silvia-odwyer/photon-node": "0.3.3",
84
85
  "debug": "4.4.0",
@@ -90,7 +91,6 @@
90
91
  },
91
92
  "devDependencies": {
92
93
  "@rslib/core": "^0.18.3",
93
- "@modelcontextprotocol/sdk": "1.10.2",
94
94
  "@types/debug": "4.1.12",
95
95
  "@types/express": "^4.17.21",
96
96
  "@types/node": "^18.0.0",
@@ -5,15 +5,22 @@ export const PLAYWRIGHT_EXAMPLE_CODE = `
5
5
  IMPORTANT: Follow these exact type signatures for AI functions:
6
6
 
7
7
  // Type signatures for AI functions:
8
- aiInput(value: string, locator: string): Promise<void>
9
- aiTap(locator: string): Promise<void>
10
- aiDoubleClick(locator: string): Promise<void>
11
- aiScroll(scrollParam: {
12
- direction: 'up' | 'down' | 'left' | 'right',
13
- scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
14
- distance: number - scroll distance, px is the unit
8
+ aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
9
+ aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
10
+ aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
11
+ aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
12
+ aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
13
+ aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
14
+ aiScroll(locate: string | undefined, options: {
15
+ direction?: 'up' | 'down' | 'left' | 'right',
16
+ scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
17
+ distance?: number | null,
18
+ deepThink?: boolean,
19
+ xpath?: string,
20
+ cacheable?: boolean
15
21
  }): Promise<void>
16
- aiAssert(assertion: string): Promise<void>
22
+ aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
23
+ aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
17
24
  aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
18
25
 
19
26
  // examples:
@@ -35,12 +42,14 @@ test.beforeEach(async ({ page }) => {
35
42
  });
36
43
 
37
44
  test('ai shop', async ({
45
+ aiAct,
38
46
  aiInput,
39
47
  aiAssert,
40
48
  aiQuery,
41
49
  aiKeyboardPress,
42
50
  aiHover,
43
51
  aiTap,
52
+ aiWaitFor,
44
53
  agentForPage,
45
54
  page,
46
55
  }) => {
@@ -87,7 +96,7 @@ tasks:
87
96
  locate: 'input field description'
88
97
  - aiScroll:
89
98
  direction: down/up
90
- scrollType: untilBottom/untilTop/page
99
+ scrollType: scrollToBottom/scrollToTop/singleAction
91
100
  - aiAssert: "expected state"
92
101
  - sleep: milliseconds
93
102
 
@@ -160,7 +169,7 @@ tasks:
160
169
  # Scroll globally or on an element described by a prompt.
161
170
  - aiScroll:
162
171
  direction: 'up' # or 'down' | 'left' | 'right'
163
- scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
172
+ scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
164
173
  distance: <number> # Optional, the scroll distance in pixels.
165
174
  locate: <prompt> # Optional, the element to scroll on.
166
175
  deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.
@@ -22,6 +22,23 @@ export interface HttpLaunchOptions {
22
22
  host?: string;
23
23
  }
24
24
 
25
/**
 * Handle returned once an MCP server has been launched.
 *
 * The stdio `launch()` path returns only `close`; the HTTP `launchHttp()`
 * path additionally reports the `port` and `host` it was started with.
 */
export interface LaunchMCPServerResult {
  /** Gracefully shuts down the MCP server. */
  close: () => Promise<void>;

  /** Host the server is bound to (HTTP mode only). */
  host?: string;

  /** Port the MCP server listens on (HTTP mode only). */
  port?: number;
}
41
+
25
42
  interface SessionData {
26
43
  transport: StreamableHTTPServerTransport;
27
44
  createdAt: Date;
@@ -50,7 +67,7 @@ export interface CLIArgs {
50
67
  export function launchMCPServer(
51
68
  server: BaseMCPServer,
52
69
  args: CLIArgs,
53
- ): Promise<void> {
70
+ ): Promise<LaunchMCPServerResult> {
54
71
  if (args.mode === 'http') {
55
72
  return server.launchHttp({
56
73
  port: Number.parseInt(args.port || '3000', 10),
@@ -72,18 +89,21 @@ export abstract class BaseMCPServer {
72
89
  protected mcpServer: McpServer;
73
90
  protected toolsManager?: IMidsceneTools;
74
91
  protected config: BaseMCPServerConfig;
92
+ protected providedToolsManager?: IMidsceneTools;
75
93
 
76
- constructor(config: BaseMCPServerConfig) {
94
+ constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools) {
77
95
  this.config = config;
78
96
  this.mcpServer = new McpServer({
79
97
  name: config.name,
80
98
  version: config.version,
81
99
  description: config.description,
82
100
  });
101
+ this.providedToolsManager = toolsManager;
83
102
  }
84
103
 
85
104
  /**
86
105
  * Platform-specific: create tools manager instance
106
+ * This is only called if no tools manager was provided in constructor
87
107
  */
88
108
  protected abstract createToolsManager(): IMidsceneTools;
89
109
 
@@ -92,7 +112,9 @@ export abstract class BaseMCPServer {
92
112
  */
93
113
  private async initializeToolsManager(): Promise<void> {
94
114
  setIsMcp(true);
95
- this.toolsManager = this.createToolsManager();
115
+
116
+ // Use provided tools manager if available, otherwise create new one
117
+ this.toolsManager = this.providedToolsManager || this.createToolsManager();
96
118
 
97
119
  try {
98
120
  await this.toolsManager.initTools();
@@ -117,7 +139,7 @@ export abstract class BaseMCPServer {
117
139
  /**
118
140
  * Initialize and launch the MCP server with stdio transport
119
141
  */
120
- public async launch(): Promise<void> {
142
+ public async launch(): Promise<LaunchMCPServerResult> {
121
143
  // Hijack stdout-based console methods to stderr for stdio mode
122
144
  // This prevents them from breaking MCP JSON-RPC protocol on stdout
123
145
  // Note: console.warn and console.error already output to stderr
@@ -170,13 +192,21 @@ export abstract class BaseMCPServer {
170
192
 
171
193
  process.once('SIGINT', cleanup);
172
194
  process.once('SIGTERM', cleanup);
195
+
196
+ return {
197
+ close: async () => {
198
+ this.performCleanup();
199
+ },
200
+ };
173
201
  }
174
202
 
175
203
  /**
176
204
  * Launch MCP server with HTTP transport
177
205
  * Supports stateful sessions for web applications and service integration
178
206
  */
179
- public async launchHttp(options: HttpLaunchOptions): Promise<void> {
207
+ public async launchHttp(
208
+ options: HttpLaunchOptions,
209
+ ): Promise<LaunchMCPServerResult> {
180
210
  // Validate port number
181
211
  if (
182
212
  !Number.isInteger(options.port) ||
@@ -286,6 +316,36 @@ export abstract class BaseMCPServer {
286
316
 
287
317
  const cleanupInterval = this.startSessionCleanup(sessions);
288
318
  this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
319
+
320
+ return {
321
+ port: options.port,
322
+ host,
323
+ close: async () => {
324
+ clearInterval(cleanupInterval);
325
+ for (const session of sessions.values()) {
326
+ try {
327
+ await session.transport.close();
328
+ } catch (error: unknown) {
329
+ const message =
330
+ error instanceof Error ? error.message : String(error);
331
+ console.error(
332
+ `Failed to close session ${session.transport.sessionId}: ${message}`,
333
+ );
334
+ }
335
+ }
336
+ sessions.clear();
337
+
338
+ return new Promise<void>((resolve) => {
339
+ server.close((err) => {
340
+ if (err) {
341
+ console.error('Error closing HTTP server:', err);
342
+ }
343
+ this.performCleanup();
344
+ resolve();
345
+ });
346
+ });
347
+ },
348
+ };
289
349
  }
290
350
 
291
351
  /**
@@ -180,6 +180,20 @@ export abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent>
180
180
  await this.agent?.destroy?.();
181
181
  }
182
182
 
183
+ /**
184
+ * Get tool definitions
185
+ */
186
+ public getToolDefinitions(): ToolDefinition[] {
187
+ return this.toolDefinitions;
188
+ }
189
+
190
+ /**
191
+ * Set agent for the tools manager
192
+ */
193
+ public setAgent(agent: TAgent): void {
194
+ this.agent = agent;
195
+ }
196
+
183
197
  /**
184
198
  * Helper: Convert base64 screenshot to image content array
185
199
  */
package/src/mcp/index.ts CHANGED
@@ -3,3 +3,4 @@ export * from './base-tools';
3
3
  export * from './tool-generator';
4
4
  export * from './types';
5
5
  export * from './inject-report-html-plugin';
6
+ export * from './launcher-helper';
@@ -0,0 +1,200 @@
1
+ import type { BaseMCPServer } from './base-server';
2
+ import type { HttpLaunchOptions, LaunchMCPServerResult } from './base-server';
3
+ import type { IMidsceneTools } from './types';
4
+
5
/**
 * Options for launching over HTTP through the launcher helper:
 * everything in `HttpLaunchOptions` plus a console-logging switch.
 */
export interface LaunchMCPServerOptions extends HttpLaunchOptions {
  /**
   * Print server startup information to the console.
   * @default true
   */
  verbose?: boolean;
}
12
+
13
/**
 * Minimal structural view of an agent.
 *
 * Declared locally instead of importing the agent type from `@midscene/core`,
 * which would create a circular dependency. Only the members the launcher
 * reads are listed: the device held on `interface` and the class name
 * reachable through `constructor`.
 */
export interface GenericAgent<TDevice = any> {
  constructor: {
    name: string;
  };
  interface: TDevice;
}
20
+
21
/**
 * Optional network details included in the startup log output.
 */
export interface StartupInfo {
  /** Host name to report, if any. */
  host?: string;
  /** Port number to report, if any. */
  port?: number;
}
28
+
29
/**
 * Everything `createMCPServerLauncher` needs to wire an agent into an MCP server.
 */
export interface MCPServerLauncherConfig<
  AgentType extends GenericAgent = GenericAgent,
  ToolsManagerType extends IMidsceneTools = IMidsceneTools,
> {
  /** Agent instance that gets attached to the tools manager. */
  agent: AgentType;

  /** Human-readable platform label used in log messages. */
  platformName: string;

  /** Tools manager class; the launcher instantiates it without arguments. */
  ToolsManagerClass: new (...args: any[]) => ToolsManagerType;

  /** MCP server class; the launcher constructs it with the prepared tools manager. */
  MCPServerClass: new (toolsManager?: ToolsManagerType) => BaseMCPServer;
}
38
+
39
/**
 * Create a generic MCP server launcher for a given agent, tools manager, and MCP server.
 *
 * This helper centralizes the wiring shared by platform-specific launchers: on each
 * launch it validates the agent, constructs a tools manager, attaches the provided
 * `agent` to it, instantiates `MCPServerClass` with that tools manager, and starts
 * the server over stdio (`launch`) or HTTP (`launchHttp`).
 *
 * Callers provide:
 * - an `agent` instance that holds the underlying device on its `interface` property
 * - a `ToolsManagerClass` that knows how to expose tools for that agent
 * - an `MCPServerClass` that supports `launch` and `launchHttp`
 *
 * Logging: both methods accept a `verbose` flag (default `true`). In stdio mode all
 * launcher messages are written to stderr, because stdout carries the MCP JSON-RPC
 * stream and the server only redirects `console.log` once its own `launch()` runs.
 * In HTTP mode messages are written with `console.log`.
 *
 * @param config Configuration describing the agent, platform name (for logging),
 *   tools manager implementation, and MCP server implementation.
 *
 * @returns An object with `launch` and `launchHttp` methods to start the MCP server.
 *
 * @throws {Error} From `launch` / `launchHttp` (as a rejected promise) when
 *   `agent.interface` is missing.
 *
 * @example
 * ```typescript
 * import { createMCPServerLauncher } from '@midscene/shared/mcp';
 * import { Agent } from '@midscene/core/agent';
 * import { WebMidsceneTools } from './web-tools';
 * import { WebMCPServer } from './server';
 *
 * const agent = new Agent();
 * const launcher = createMCPServerLauncher({
 *   agent,
 *   platformName: 'Web',
 *   ToolsManagerClass: WebMidsceneTools,
 *   MCPServerClass: WebMCPServer,
 * });
 *
 * // Start with stdio
 * await launcher.launch({ verbose: true });
 *
 * // Or start with HTTP
 * await launcher.launchHttp({ port: 3000, host: 'localhost' });
 * ```
 *
 * @internal
 */
export function createMCPServerLauncher<
  AgentType extends GenericAgent,
  ToolsManagerType extends IMidsceneTools,
>(config: MCPServerLauncherConfig<AgentType, ToolsManagerType>) {
  const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;

  /**
   * Validate that the agent has the required interface property.
   * @throws {Error} If agent.interface is missing
   */
  function validateAgent(): void {
    const device = agent.interface;
    if (!device) {
      // Built from joined lines so source indentation never leaks into the message.
      throw new Error(
        [
          "Agent must have an 'interface' property that references the underlying device.",
          'Please ensure your agent instance is properly initialized with a device interface.',
          `Expected: agent.interface to be defined, but got: ${typeof device}`,
          'Solution: Check that your agent constructor properly sets the interface property.',
        ].join('\n'),
      );
    }
  }

  /**
   * Create a tools manager and attach the agent to it.
   * Uses the manager's `setAgent()` when it exposes one; otherwise falls back
   * to assigning the `agent` property directly.
   * @returns Configured tools manager instance
   */
  function createToolsManager(): ToolsManagerType {
    const toolsManager = new ToolsManagerClass();
    const injectable = toolsManager as ToolsManagerType & {
      agent?: AgentType;
      setAgent?: (agent: AgentType) => void;
    };
    if (typeof injectable.setAgent === 'function') {
      injectable.setAgent(agent);
    } else {
      injectable.agent = agent;
    }
    return toolsManager;
  }

  /**
   * Write one launcher log line to the stream that is safe for the transport.
   * stdio mode must keep stdout free of non-protocol output, so it uses stderr.
   */
  function writeLog(mode: 'stdio' | 'HTTP', message: string): void {
    if (mode === 'stdio') {
      console.error(message);
    } else {
      console.log(message);
    }
  }

  /**
   * Log server startup information.
   * @param mode - Transport mode ('stdio' or 'HTTP')
   * @param additionalInfo - Additional info to log (e.g., port, host)
   */
  function logStartupInfo(
    mode: 'stdio' | 'HTTP',
    additionalInfo?: StartupInfo,
  ): void {
    const device = agent.interface;
    // A device without a usable constructor (e.g. a prototype-less object)
    // must not make logging throw.
    const deviceName: string = device?.constructor?.name ?? 'unknown';

    writeLog(mode, `Starting Midscene ${platformName} MCP Server (${mode})...`);
    writeLog(mode, `Agent: ${agent.constructor.name}`);
    writeLog(mode, `Device: ${deviceName}`);

    if (additionalInfo?.port !== undefined) {
      writeLog(mode, `Port: ${additionalInfo.port}`);
    }
    if (additionalInfo?.host) {
      writeLog(mode, `Host: ${additionalInfo.host}`);
    }
  }

  return {
    /**
     * Launch the MCP server with stdio transport
     */
    async launch(
      options: { verbose?: boolean } = {},
    ): Promise<LaunchMCPServerResult> {
      const { verbose = true } = options;

      validateAgent();

      if (verbose) {
        logStartupInfo('stdio');
      }

      const toolsManager = createToolsManager();
      const server = new MCPServerClass(toolsManager);
      const result = await server.launch();

      if (verbose) {
        writeLog('stdio', `${platformName} MCP Server started (stdio mode)`);
      }

      return result;
    },

    /**
     * Launch the MCP server with HTTP transport
     */
    async launchHttp(
      options: LaunchMCPServerOptions,
    ): Promise<LaunchMCPServerResult> {
      const { port, host = 'localhost', verbose = true } = options;

      validateAgent();

      if (verbose) {
        logStartupInfo('HTTP', { port, host });
      }

      const toolsManager = createToolsManager();
      const server = new MCPServerClass(toolsManager);
      const result = await server.launchHttp({ port, host });

      if (verbose) {
        writeLog(
          'HTTP',
          `${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`,
        );
      }

      return result;
    },
  };
}
package/src/mcp/types.ts CHANGED
@@ -58,6 +58,11 @@ export interface ToolDefinition<T = Record<string, unknown>> {
58
58
  autoDestroy?: boolean;
59
59
  }
60
60
 
61
+ /**
62
+ * Tool type for mcpKitForAgent return value
63
+ */
64
+ export type Tool = ToolDefinition;
65
+
61
66
  /**
62
67
  * Action space item definition
63
68
  * Note: Intentionally no index signature to maintain compatibility with DeviceAction