npm - @midscene/shared - Versions diffs - 1.0.4 → 1.0.5-beta-20251230124359.0 - Mend

@midscene/shared 1.0.4 → 1.0.5-beta-20251230124359.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/dist/es/constants/example-code.mjs +19 -10
package/dist/es/mcp/base-server.mjs +30 -2
package/dist/es/mcp/base-tools.mjs +6 -0
package/dist/es/mcp/index.mjs +1 -0
package/dist/es/mcp/launcher-helper.mjs +52 -0
package/dist/lib/constants/example-code.js +19 -10
package/dist/lib/mcp/base-server.js +30 -2
package/dist/lib/mcp/base-tools.js +6 -0
package/dist/lib/mcp/index.js +7 -0
package/dist/lib/mcp/launcher-helper.js +86 -0
package/dist/types/constants/example-code.d.ts +2 -2
package/dist/types/mcp/base-server.d.ts +20 -4
package/dist/types/mcp/base-tools.d.ts +8 -0
package/dist/types/mcp/index.d.ts +1 -0
package/dist/types/mcp/launcher-helper.d.ts +94 -0
package/dist/types/mcp/types.d.ts +4 -0
package/package.json +2 -2
package/src/constants/example-code.ts +19 -10
package/src/mcp/base-server.ts +65 -5
package/src/mcp/base-tools.ts +14 -0
package/src/mcp/index.ts +1 -0
package/src/mcp/launcher-helper.ts +200 -0
package/src/mcp/types.ts +5 -0

package/dist/es/constants/example-code.mjs CHANGED Viewed

@@ -5,15 +5,22 @@ const PLAYWRIGHT_EXAMPLE_CODE = `
 IMPORTANT: Follow these exact type signatures for AI functions:
 // Type signatures for AI functions:
-aiInput(value: string, locator: string): Promise<void>
-aiTap(locator: string): Promise<void>
-aiDoubleClick(locator: string): Promise<void>
-aiScroll(scrollParam: {
-  direction: 'up' | 'down' | 'left' | 'right',
-  scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
-  distance: number - scroll distance, px is the unit
+aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
+aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiScroll(locate: string | undefined, options: {
+  direction?: 'up' | 'down' | 'left' | 'right',
+  scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
+  distance?: number | null,
+  deepThink?: boolean,
+  xpath?: string,
+  cacheable?: boolean
 }): Promise<void>
-aiAssert(assertion: string): Promise<void>
+aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
+aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
 aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
 // examples:
@@ -35,12 +42,14 @@ test.beforeEach(async ({ page }) => {
 });
 test('ai shop', async ({
+  aiAct,
   aiInput,
   aiAssert,
   aiQuery,
   aiKeyboardPress,
   aiHover,
   aiTap,
+  aiWaitFor,
   agentForPage,
   page,
 }) => {
@@ -86,7 +95,7 @@ tasks:
         locate: 'input field description'
       - aiScroll:
         direction: down/up
-        scrollType: untilBottom/untilTop/page
+        scrollType: scrollToBottom/scrollToTop/singleAction
       - aiAssert: "expected state"
       - sleep: milliseconds
@@ -159,7 +168,7 @@ tasks:
       # Scroll globally or on an element described by a prompt.
       - aiScroll:
         direction: 'up' # or 'down' | 'left' | 'right'
-        scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
+        scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
         distance: <number> # Optional, the scroll distance in pixels.
         locate: <prompt> # Optional, the element to scroll on.
         deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.

package/dist/es/mcp/base-server.mjs CHANGED Viewed

@@ -41,7 +41,7 @@ const MAX_SESSIONS = 100;
 class BaseMCPServer {
     async initializeToolsManager() {
         setIsMcp(true);
-        this.toolsManager = this.createToolsManager();
+        this.toolsManager = this.providedToolsManager || this.createToolsManager();
         try {
             await this.toolsManager.initTools();
         } catch (error) {
@@ -91,6 +91,11 @@ class BaseMCPServer {
         };
         process.once('SIGINT', cleanup);
         process.once('SIGTERM', cleanup);
+        return {
+            close: async ()=>{
+                this.performCleanup();
+            }
+        };
     }
     async launchHttp(options) {
         if (!Number.isInteger(options.port) || options.port < 1 || options.port > 65535) throw new Error(`Invalid port number: ${options.port}. Port must be between 1 and 65535.`);
@@ -151,6 +156,27 @@ class BaseMCPServer {
         });
         const cleanupInterval = this.startSessionCleanup(sessions);
         this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
+        return {
+            port: options.port,
+            host,
+            close: async ()=>{
+                clearInterval(cleanupInterval);
+                for (const session of sessions.values())try {
+                    await session.transport.close();
+                } catch (error) {
+                    const message = error instanceof Error ? error.message : String(error);
+                    console.error(`Failed to close session ${session.transport.sessionId}: ${message}`);
+                }
+                sessions.clear();
+                return new Promise((resolve)=>{
+                    server.close((err)=>{
+                        if (err) console.error('Error closing HTTP server:', err);
+                        this.performCleanup();
+                        resolve();
+                    });
+                });
+            }
+        };
     }
     async createHttpSession(sessions) {
         const transport = new StreamableHTTPServerTransport({
@@ -235,16 +261,18 @@ class BaseMCPServer {
     getToolsManager() {
         return this.toolsManager;
     }
-    constructor(config){
+    constructor(config, toolsManager){
         _define_property(this, "mcpServer", void 0);
         _define_property(this, "toolsManager", void 0);
         _define_property(this, "config", void 0);
+        _define_property(this, "providedToolsManager", void 0);
         this.config = config;
         this.mcpServer = new McpServer({
             name: config.name,
             version: config.version,
             description: config.description
         });
+        this.providedToolsManager = toolsManager;
     }
 }
 export { BaseMCPServer, CLI_ARGS_CONFIG, launchMCPServer };

package/dist/es/mcp/base-tools.mjs CHANGED Viewed

@@ -65,6 +65,12 @@ class BaseMidsceneTools {
     async closeBrowser() {
         await this.agent?.destroy?.();
     }
+    getToolDefinitions() {
+        return this.toolDefinitions;
+    }
+    setAgent(agent) {
+        this.agent = agent;
+    }
     buildScreenshotContent(screenshot) {
         const { mimeType, body } = parseBase64(screenshot);
         return [

package/dist/es/mcp/index.mjs CHANGED Viewed

@@ -3,3 +3,4 @@ export * from "./base-tools.mjs";
 export * from "./tool-generator.mjs";
 export * from "./types.mjs";
 export * from "./inject-report-html-plugin.mjs";
+export * from "./launcher-helper.mjs";

package/dist/es/mcp/launcher-helper.mjs ADDED Viewed

@@ -0,0 +1,52 @@
+function createMCPServerLauncher(config) {
+    const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;
+    function validateAgent() {
+        const device = agent.interface;
+        if (!device) throw new Error(`Agent must have an 'interface' property that references the underlying device.
+Please ensure your agent instance is properly initialized with a device interface.
+Expected: agent.interface to be defined, but got: ${typeof device}
+Solution: Check that your agent constructor properly sets the interface property.`);
+    }
+    function createToolsManager() {
+        const toolsManager = new ToolsManagerClass();
+        toolsManager.agent = agent;
+        return toolsManager;
+    }
+    function logStartupInfo(mode, additionalInfo) {
+        const device = agent.interface;
+        console.log(`Starting Midscene ${platformName} MCP Server (${mode})...`);
+        console.log(`Agent: ${agent.constructor.name}`);
+        console.log(`Device: ${device.constructor.name}`);
+        if (additionalInfo?.port !== void 0) console.log(`Port: ${additionalInfo.port}`);
+        if (additionalInfo?.host) console.log(`Host: ${additionalInfo.host}`);
+    }
+    return {
+        async launch (options = {}) {
+            const { verbose = true } = options;
+            validateAgent();
+            if (verbose) logStartupInfo('stdio');
+            const toolsManager = createToolsManager();
+            const server = new MCPServerClass(toolsManager);
+            const result = await server.launch();
+            if (verbose) console.log(`${platformName} MCP Server started (stdio mode)`);
+            return result;
+        },
+        async launchHttp (options) {
+            const { port, host = 'localhost', verbose = true } = options;
+            validateAgent();
+            if (verbose) logStartupInfo('HTTP', {
+                port,
+                host
+            });
+            const toolsManager = createToolsManager();
+            const server = new MCPServerClass(toolsManager);
+            const result = await server.launchHttp({
+                port,
+                host
+            });
+            if (verbose) console.log(`${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`);
+            return result;
+        }
+    };
+}
+export { createMCPServerLauncher };

package/dist/lib/constants/example-code.js CHANGED Viewed

@@ -34,15 +34,22 @@ const PLAYWRIGHT_EXAMPLE_CODE = `
 IMPORTANT: Follow these exact type signatures for AI functions:
 // Type signatures for AI functions:
-aiInput(value: string, locator: string): Promise<void>
-aiTap(locator: string): Promise<void>
-aiDoubleClick(locator: string): Promise<void>
-aiScroll(scrollParam: {
-  direction: 'up' | 'down' | 'left' | 'right',
-  scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
-  distance: number - scroll distance, px is the unit
+aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
+aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiScroll(locate: string | undefined, options: {
+  direction?: 'up' | 'down' | 'left' | 'right',
+  scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
+  distance?: number | null,
+  deepThink?: boolean,
+  xpath?: string,
+  cacheable?: boolean
 }): Promise<void>
-aiAssert(assertion: string): Promise<void>
+aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
+aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
 aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
 // examples:
@@ -64,12 +71,14 @@ test.beforeEach(async ({ page }) => {
 });
 test('ai shop', async ({
+  aiAct,
   aiInput,
   aiAssert,
   aiQuery,
   aiKeyboardPress,
   aiHover,
   aiTap,
+  aiWaitFor,
   agentForPage,
   page,
 }) => {
@@ -115,7 +124,7 @@ tasks:
         locate: 'input field description'
       - aiScroll:
         direction: down/up
-        scrollType: untilBottom/untilTop/page
+        scrollType: scrollToBottom/scrollToTop/singleAction
       - aiAssert: "expected state"
       - sleep: milliseconds
@@ -188,7 +197,7 @@ tasks:
       # Scroll globally or on an element described by a prompt.
       - aiScroll:
         direction: 'up' # or 'down' | 'left' | 'right'
-        scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
+        scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
         distance: <number> # Optional, the scroll distance in pixels.
         locate: <prompt> # Optional, the element to scroll on.
         deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.

package/dist/lib/mcp/base-server.js CHANGED Viewed

@@ -81,7 +81,7 @@ const MAX_SESSIONS = 100;
 class BaseMCPServer {
     async initializeToolsManager() {
         (0, utils_namespaceObject.setIsMcp)(true);
-        this.toolsManager = this.createToolsManager();
+        this.toolsManager = this.providedToolsManager || this.createToolsManager();
         try {
             await this.toolsManager.initTools();
         } catch (error) {
@@ -131,6 +131,11 @@ class BaseMCPServer {
         };
         process.once('SIGINT', cleanup);
         process.once('SIGTERM', cleanup);
+        return {
+            close: async ()=>{
+                this.performCleanup();
+            }
+        };
     }
     async launchHttp(options) {
         if (!Number.isInteger(options.port) || options.port < 1 || options.port > 65535) throw new Error(`Invalid port number: ${options.port}. Port must be between 1 and 65535.`);
@@ -191,6 +196,27 @@ class BaseMCPServer {
         });
         const cleanupInterval = this.startSessionCleanup(sessions);
         this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
+        return {
+            port: options.port,
+            host,
+            close: async ()=>{
+                clearInterval(cleanupInterval);
+                for (const session of sessions.values())try {
+                    await session.transport.close();
+                } catch (error) {
+                    const message = error instanceof Error ? error.message : String(error);
+                    console.error(`Failed to close session ${session.transport.sessionId}: ${message}`);
+                }
+                sessions.clear();
+                return new Promise((resolve)=>{
+                    server.close((err)=>{
+                        if (err) console.error('Error closing HTTP server:', err);
+                        this.performCleanup();
+                        resolve();
+                    });
+                });
+            }
+        };
     }
     async createHttpSession(sessions) {
         const transport = new streamableHttp_js_namespaceObject.StreamableHTTPServerTransport({
@@ -275,16 +301,18 @@ class BaseMCPServer {
     getToolsManager() {
         return this.toolsManager;
     }
-    constructor(config){
+    constructor(config, toolsManager){
         _define_property(this, "mcpServer", void 0);
         _define_property(this, "toolsManager", void 0);
         _define_property(this, "config", void 0);
+        _define_property(this, "providedToolsManager", void 0);
         this.config = config;
         this.mcpServer = new mcp_js_namespaceObject.McpServer({
             name: config.name,
             version: config.version,
             description: config.description
         });
+        this.providedToolsManager = toolsManager;
     }
 }
 exports.BaseMCPServer = __webpack_exports__.BaseMCPServer;

package/dist/lib/mcp/base-tools.js CHANGED Viewed

@@ -93,6 +93,12 @@ class BaseMidsceneTools {
     async closeBrowser() {
         await this.agent?.destroy?.();
     }
+    getToolDefinitions() {
+        return this.toolDefinitions;
+    }
+    setAgent(agent) {
+        this.agent = agent;
+    }
     buildScreenshotContent(screenshot) {
         const { mimeType, body } = (0, img_namespaceObject.parseBase64)(screenshot);
         return [

package/dist/lib/mcp/index.js CHANGED Viewed

@@ -9,6 +9,9 @@ var __webpack_modules__ = {
     "./inject-report-html-plugin" (module) {
         module.exports = require("./inject-report-html-plugin.js");
     },
+    "./launcher-helper" (module) {
+        module.exports = require("./launcher-helper.js");
+    },
     "./tool-generator" (module) {
         module.exports = require("./tool-generator.js");
     },
@@ -79,6 +82,10 @@ var __webpack_exports__ = {};
     var __rspack_reexport = {};
     for(const __rspack_import_key in _inject_report_html_plugin__rspack_import_4)if ("default" !== __rspack_import_key) __rspack_reexport[__rspack_import_key] = ()=>_inject_report_html_plugin__rspack_import_4[__rspack_import_key];
     __webpack_require__.d(__webpack_exports__, __rspack_reexport);
+    var _launcher_helper__rspack_import_5 = __webpack_require__("./launcher-helper");
+    var __rspack_reexport = {};
+    for(const __rspack_import_key in _launcher_helper__rspack_import_5)if ("default" !== __rspack_import_key) __rspack_reexport[__rspack_import_key] = ()=>_launcher_helper__rspack_import_5[__rspack_import_key];
+    __webpack_require__.d(__webpack_exports__, __rspack_reexport);
 })();
 for(var __rspack_i in __webpack_exports__)exports[__rspack_i] = __webpack_exports__[__rspack_i];
 Object.defineProperty(exports, '__esModule', {

package/dist/lib/mcp/launcher-helper.js ADDED Viewed

@@ -0,0 +1,86 @@
+"use strict";
+var __webpack_require__ = {};
+(()=>{
+    __webpack_require__.d = (exports1, definition)=>{
+        for(var key in definition)if (__webpack_require__.o(definition, key) && !__webpack_require__.o(exports1, key)) Object.defineProperty(exports1, key, {
+            enumerable: true,
+            get: definition[key]
+        });
+    };
+})();
+(()=>{
+    __webpack_require__.o = (obj, prop)=>Object.prototype.hasOwnProperty.call(obj, prop);
+})();
+(()=>{
+    __webpack_require__.r = (exports1)=>{
+        if ('undefined' != typeof Symbol && Symbol.toStringTag) Object.defineProperty(exports1, Symbol.toStringTag, {
+            value: 'Module'
+        });
+        Object.defineProperty(exports1, '__esModule', {
+            value: true
+        });
+    };
+})();
+var __webpack_exports__ = {};
+__webpack_require__.r(__webpack_exports__);
+__webpack_require__.d(__webpack_exports__, {
+    createMCPServerLauncher: ()=>createMCPServerLauncher
+});
+function createMCPServerLauncher(config) {
+    const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;
+    function validateAgent() {
+        const device = agent.interface;
+        if (!device) throw new Error(`Agent must have an 'interface' property that references the underlying device.
+Please ensure your agent instance is properly initialized with a device interface.
+Expected: agent.interface to be defined, but got: ${typeof device}
+Solution: Check that your agent constructor properly sets the interface property.`);
+    }
+    function createToolsManager() {
+        const toolsManager = new ToolsManagerClass();
+        toolsManager.agent = agent;
+        return toolsManager;
+    }
+    function logStartupInfo(mode, additionalInfo) {
+        const device = agent.interface;
+        console.log(`Starting Midscene ${platformName} MCP Server (${mode})...`);
+        console.log(`Agent: ${agent.constructor.name}`);
+        console.log(`Device: ${device.constructor.name}`);
+        if (additionalInfo?.port !== void 0) console.log(`Port: ${additionalInfo.port}`);
+        if (additionalInfo?.host) console.log(`Host: ${additionalInfo.host}`);
+    }
+    return {
+        async launch (options = {}) {
+            const { verbose = true } = options;
+            validateAgent();
+            if (verbose) logStartupInfo('stdio');
+            const toolsManager = createToolsManager();
+            const server = new MCPServerClass(toolsManager);
+            const result = await server.launch();
+            if (verbose) console.log(`${platformName} MCP Server started (stdio mode)`);
+            return result;
+        },
+        async launchHttp (options) {
+            const { port, host = 'localhost', verbose = true } = options;
+            validateAgent();
+            if (verbose) logStartupInfo('HTTP', {
+                port,
+                host
+            });
+            const toolsManager = createToolsManager();
+            const server = new MCPServerClass(toolsManager);
+            const result = await server.launchHttp({
+                port,
+                host
+            });
+            if (verbose) console.log(`${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`);
+            return result;
+        }
+    };
+}
+exports.createMCPServerLauncher = __webpack_exports__.createMCPServerLauncher;
+for(var __rspack_i in __webpack_exports__)if (-1 === [
+    "createMCPServerLauncher"
+].indexOf(__rspack_i)) exports[__rspack_i] = __webpack_exports__[__rspack_i];
+Object.defineProperty(exports, '__esModule', {
+    value: true
+});

package/dist/types/constants/example-code.d.ts CHANGED Viewed

@@ -1,2 +1,2 @@
-export declare const PLAYWRIGHT_EXAMPLE_CODE = "\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nIMPORTANT: Follow these exact type signatures for AI functions:\n\n// Type signatures for AI functions:\naiInput(value: string, locator: string): Promise<void>\naiTap(locator: string): Promise<void>\naiDoubleClick(locator: string): Promise<void>\naiScroll(scrollParam: {\n  direction: 'up' | 'down' | 'left' | 'right',\n  scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',\n  distance: number - scroll distance, px is the unit\n}): Promise<void>\naiAssert(assertion: string): Promise<void>\naiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions\n\n// examples:\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nimport { test as base } from '@playwright/test';\nimport type { PlayWrightAiFixtureType } from '@midscene/web/playwright';\nimport { PlaywrightAiFixture } from '@midscene/web/playwright';\n\nconst test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture({\n  waitForNetworkIdleTimeout: 2000, // optional, the timeout for waiting for network idle between each action, default is 2000ms\n}));\n\n\ntest.beforeEach(async ({ page }) => {\n  await page.goto('https://www.xxx.com/');\n  await page.setViewportSize({ width: 1920, height: 1080 });\n});\n\ntest('ai shop', async ({\n  aiInput,\n  aiAssert,\n  aiQuery,\n  aiKeyboardPress,\n  aiHover,\n  aiTap,\n  agentForPage,\n  page,\n}) => {\n  // login\n  await aiAssert('The page shows the login interface');\n  await aiInput('user_name', 'in user name input');\n  await aiInput('password', 'in password 
input');\n  await aiKeyboardPress('Enter', 'Login Button');\n\n  // check the login success\n  await aiWaitFor('The page shows that the loading is complete');\n  await aiAssert('The current page shows the product detail page');\n\n  // check the product info\n  const dataA = await aiQuery({\n    userInfo: 'User information in the format {name: string}',\n    theFirstProductInfo: 'The first product info in the format {name: string, price: number}',\n  });\n  expect(dataA.theFirstProductInfo.name).toBe('xxx');\n  expect(dataA.theFirstProductInfo.price).toBe(100);\n\n\n  // add to cart\n  await aiTap('click add to cart button');\n  \n  await aiTap('click right top cart icon');\n  await aiAssert('The cart icon shows the number 1');\n});\n";
-export declare const YAML_EXAMPLE_CODE = "\n1. Format:\n\nweb:\n  url: \"starting_url\"\n  viewportWidth: 1280\n  viewportHeight: 960\n\ntasks:\n  - name: \"descriptive task name\"\n    flow:\n      - aiTap: \"element description\"\n      - aiInput: 'text value'\n        locate: 'input field description'\n      - aiScroll:\n        direction: down/up\n        scrollType: untilBottom/untilTop/page\n      - aiAssert: \"expected state\"\n      - sleep: milliseconds\n\n2. Action Types:\n- aiTap: for clicks (natural language targeting)\n- aiInput: for text input with 'locate' field\n- aiScroll: with direction and scrollType\n- aiAssert: for validations\n- sleep: for delays (milliseconds)\n\n3. Best Practices:\n- Group related actions into logical tasks\n- Use natural language descriptions\n- Add deepThink: true for complex interactions\n- Keep task names concise but descriptive\n\n\n\nYAML type\ntasks:\n  - name: <name>\n    continueOnError: <boolean> # Optional, whether to continue to the next task on error, defaults to false.\n    flow:\n      # Auto Planning (.ai)\n      # ----------------\n\n      # Perform an interaction. `ai` is a shorthand for `aiAct`.\n      - ai: <prompt>\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # This usage is the same as `ai`.\n      - aiAct: <prompt>\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Instant Action (.aiTap, .aiDoubleClick, .aiHover, .aiInput, .aiKeyboardPress, .aiScroll)\n      # ----------------\n\n      # Tap an element described by a prompt.\n      - aiTap: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. 
If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Double click an element described by a prompt.\n      - aiDoubleClick: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Hover over an element described by a prompt.\n      - aiHover: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Input text into an element described by a prompt.\n      - aiInput: <final text content of the input>\n        locate: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. 
Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Press a key (e.g., Enter, Tab, Escape) on an element described by a prompt.\n      - aiKeyboardPress: <key>\n        locate: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Scroll globally or on an element described by a prompt.\n      - aiScroll:\n        direction: 'up' # or 'down' | 'left' | 'right'\n        scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'\n        distance: <number> # Optional, the scroll distance in pixels.\n        locate: <prompt> # Optional, the element to scroll on.\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Record the current screenshot with a description in the report file.\n      - recordToReport: <title> # Optional, the title of the screenshot. 
If not provided, the title will be 'untitled'.\n        content: <content> # Optional, the description of the screenshot.\n\n      # Data Extraction\n      # ----------------\n\n      # Perform a query that returns a JSON object.\n      - aiQuery: <prompt> # Remember to describe the format of the result in the prompt.\n        name: <name> # The key for the query result in the JSON output.\n\n      # More APIs\n      # ----------------\n\n      # Wait for a condition to be met, with a timeout (in ms, optional, defaults to 30000).\n      - aiWaitFor: <prompt>\n        timeout: <ms>\n\n      # Perform an assertion.\n      - aiAssert: <prompt>\n        errorMessage: <error-message> # Optional, the error message to print if the assertion fails.\n\n      # Wait for a specified amount of time.\n      - sleep: <ms>\n\n      # Execute a piece of JavaScript code in the web page context.\n      - javascript: <javascript>\n        name: <name> # Optional, assign a name to the return value, which will be used as a key in the JSON output.\n\n  - name: <name>\n    flow:\n      # ...\n";
+export declare const PLAYWRIGHT_EXAMPLE_CODE = "\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nIMPORTANT: Follow these exact type signatures for AI functions:\n\n// Type signatures for AI functions:\naiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>\naiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>\naiScroll(locate: string | undefined, options: {\n  direction?: 'up' | 'down' | 'left' | 'right',\n  scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',\n  distance?: number | null,\n  deepThink?: boolean,\n  xpath?: string,\n  cacheable?: boolean\n}): Promise<void>\naiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>\naiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>\naiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions\n\n// examples:\n// Reference the following code to generate Midscene test cases\n// The following is test code for Midscene AI, for reference\n// The following is Playwright syntax, you can use Playwright to assist in test generation\nimport { test as base } from '@playwright/test';\nimport type { PlayWrightAiFixtureType } from '@midscene/web/playwright';\nimport { 
PlaywrightAiFixture } from '@midscene/web/playwright';\n\nconst test = base.extend<PlayWrightAiFixtureType>(PlaywrightAiFixture({\n  waitForNetworkIdleTimeout: 2000, // optional, the timeout for waiting for network idle between each action, default is 2000ms\n}));\n\n\ntest.beforeEach(async ({ page }) => {\n  await page.goto('https://www.xxx.com/');\n  await page.setViewportSize({ width: 1920, height: 1080 });\n});\n\ntest('ai shop', async ({\n  aiAct,\n  aiInput,\n  aiAssert,\n  aiQuery,\n  aiKeyboardPress,\n  aiHover,\n  aiTap,\n  aiWaitFor,\n  agentForPage,\n  page,\n}) => {\n  // login\n  await aiAssert('The page shows the login interface');\n  await aiInput('user_name', 'in user name input');\n  await aiInput('password', 'in password input');\n  await aiKeyboardPress('Enter', 'Login Button');\n\n  // check the login success\n  await aiWaitFor('The page shows that the loading is complete');\n  await aiAssert('The current page shows the product detail page');\n\n  // check the product info\n  const dataA = await aiQuery({\n    userInfo: 'User information in the format {name: string}',\n    theFirstProductInfo: 'The first product info in the format {name: string, price: number}',\n  });\n  expect(dataA.theFirstProductInfo.name).toBe('xxx');\n  expect(dataA.theFirstProductInfo.price).toBe(100);\n\n\n  // add to cart\n  await aiTap('click add to cart button');\n  \n  await aiTap('click right top cart icon');\n  await aiAssert('The cart icon shows the number 1');\n});\n";
+export declare const YAML_EXAMPLE_CODE = "\n1. Format:\n\nweb:\n  url: \"starting_url\"\n  viewportWidth: 1280\n  viewportHeight: 960\n\ntasks:\n  - name: \"descriptive task name\"\n    flow:\n      - aiTap: \"element description\"\n      - aiInput: 'text value'\n        locate: 'input field description'\n      - aiScroll:\n        direction: down/up\n        scrollType: scrollToBottom/scrollToTop/singleAction\n      - aiAssert: \"expected state\"\n      - sleep: milliseconds\n\n2. Action Types:\n- aiTap: for clicks (natural language targeting)\n- aiInput: for text input with 'locate' field\n- aiScroll: with direction and scrollType\n- aiAssert: for validations\n- sleep: for delays (milliseconds)\n\n3. Best Practices:\n- Group related actions into logical tasks\n- Use natural language descriptions\n- Add deepThink: true for complex interactions\n- Keep task names concise but descriptive\n\n\n\nYAML type\ntasks:\n  - name: <name>\n    continueOnError: <boolean> # Optional, whether to continue to the next task on error, defaults to false.\n    flow:\n      # Auto Planning (.ai)\n      # ----------------\n\n      # Perform an interaction. `ai` is a shorthand for `aiAct`.\n      - ai: <prompt>\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # This usage is the same as `ai`.\n      - aiAct: <prompt>\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Instant Action (.aiTap, .aiDoubleClick, .aiHover, .aiInput, .aiKeyboardPress, .aiScroll)\n      # ----------------\n\n      # Tap an element described by a prompt.\n      - aiTap: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. 
Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Double click an element described by a prompt.\n      - aiDoubleClick: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Hover over an element described by a prompt.\n      - aiHover: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Input text into an element described by a prompt.\n      - aiInput: <final text content of the input>\n        locate: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. 
If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Press a key (e.g., Enter, Tab, Escape) on an element described by a prompt.\n      - aiKeyboardPress: <key>\n        locate: <prompt>\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Scroll globally or on an element described by a prompt.\n      - aiScroll:\n        direction: 'up' # or 'down' | 'left' | 'right'\n        scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'\n        distance: <number> # Optional, the scroll distance in pixels.\n        locate: <prompt> # Optional, the element to scroll on.\n        deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.\n        xpath: <xpath> # Optional, the xpath of the target element for the operation. If provided, Midscene will prioritize this xpath to find the element before using the cache and the AI model. Defaults to empty.\n        cacheable: <boolean> # Optional, whether to cache the result of this API call when the [caching feature](./caching.mdx) is enabled. Defaults to True.\n\n      # Record the current screenshot with a description in the report file.\n      - recordToReport: <title> # Optional, the title of the screenshot. 
If not provided, the title will be 'untitled'.\n        content: <content> # Optional, the description of the screenshot.\n\n      # Data Extraction\n      # ----------------\n\n      # Perform a query that returns a JSON object.\n      - aiQuery: <prompt> # Remember to describe the format of the result in the prompt.\n        name: <name> # The key for the query result in the JSON output.\n\n      # More APIs\n      # ----------------\n\n      # Wait for a condition to be met, with a timeout (in ms, optional, defaults to 30000).\n      - aiWaitFor: <prompt>\n        timeout: <ms>\n\n      # Perform an assertion.\n      - aiAssert: <prompt>\n        errorMessage: <error-message> # Optional, the error message to print if the assertion fails.\n\n      # Wait for a specified amount of time.\n      - sleep: <ms>\n\n      # Execute a piece of JavaScript code in the web page context.\n      - javascript: <javascript>\n        name: <name> # Optional, assign a name to the return value, which will be used as a key in the JSON output.\n\n  - name: <name>\n    flow:\n      # ...\n";

package/dist/types/mcp/base-server.d.ts CHANGED Viewed

@@ -10,6 +10,20 @@ export interface HttpLaunchOptions {
     port: number;
     host?: string;
 }
+/**
+ * Value resolved by `launchMCPServer`, `BaseMCPServer.launch` and
+ * `BaseMCPServer.launchHttp`.
+ */
+export interface LaunchMCPServerResult {
+    /**
+     * The MCP server port (for HTTP mode)
+     */
+    port?: number;
+    /**
+     * The server host (for HTTP mode)
+     */
+    host?: string;
+    /**
+     * Function to gracefully shutdown the MCP server
+     */
+    close: () => Promise<void>;
+}
 /**
  * CLI argument configuration for MCP servers
  */
@@ -23,7 +37,7 @@ export interface CLIArgs {
  * Launch an MCP server based on CLI arguments
  * Shared helper to reduce duplication across platform CLI entry points
  */
-export declare function launchMCPServer(server: BaseMCPServer, args: CLIArgs): Promise<void>;
+export declare function launchMCPServer(server: BaseMCPServer, args: CLIArgs): Promise<LaunchMCPServerResult>;
 /**
  * Base MCP Server class with programmatic launch() API
  * Each platform extends this to provide their own tools manager
@@ -32,9 +46,11 @@ export declare abstract class BaseMCPServer {
     protected mcpServer: McpServer;
     protected toolsManager?: IMidsceneTools;
     protected config: BaseMCPServerConfig;
-    constructor(config: BaseMCPServerConfig);
+    protected providedToolsManager?: IMidsceneTools;
+    constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools);
     /**
      * Platform-specific: create tools manager instance
+     * This is only called if no tools manager was provided in constructor
      */
     protected abstract createToolsManager(): IMidsceneTools;
     /**
@@ -48,12 +64,12 @@ export declare abstract class BaseMCPServer {
     /**
      * Initialize and launch the MCP server with stdio transport
      */
-    launch(): Promise<void>;
+    launch(): Promise<LaunchMCPServerResult>;
     /**
      * Launch MCP server with HTTP transport
      * Supports stateful sessions for web applications and service integration
      */
-    launchHttp(options: HttpLaunchOptions): Promise<void>;
+    launchHttp(options: HttpLaunchOptions): Promise<LaunchMCPServerResult>;
     /**
      * Create a new HTTP session with transport
      */

package/dist/types/mcp/base-tools.d.ts CHANGED Viewed

@@ -44,6 +44,14 @@ export declare abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseA
      * Cleanup method - destroy agent and release resources
      */
     closeBrowser(): Promise<void>;
+    /**
+     * Get tool definitions
+     */
+    getToolDefinitions(): ToolDefinition[];
+    /**
+     * Set agent for the tools manager
+     */
+    setAgent(agent: TAgent): void;
     /**
      * Helper: Convert base64 screenshot to image content array
      */

package/dist/types/mcp/index.d.ts CHANGED Viewed

@@ -3,3 +3,4 @@ export * from './base-tools';
 export * from './tool-generator';
 export * from './types';
 export * from './inject-report-html-plugin';
+export * from './launcher-helper';

package/dist/types/mcp/launcher-helper.d.ts ADDED Viewed

@@ -0,0 +1,94 @@
+import type { BaseMCPServer } from './base-server';
+import type { HttpLaunchOptions, LaunchMCPServerResult } from './base-server';
+import type { IMidsceneTools } from './types';
+/**
+ * Options for `launchHttp` on the launcher returned by `createMCPServerLauncher`:
+ * the inherited `HttpLaunchOptions` fields plus a logging switch.
+ */
+export interface LaunchMCPServerOptions extends HttpLaunchOptions {
+    /**
+     * Whether to show server logs
+     * @default true
+     */
+    verbose?: boolean;
+}
+/**
+ * Generic agent type (avoid importing from @midscene/core to prevent circular deps)
+ */
+export interface GenericAgent<TDevice = any> {
+    /** The underlying device held by the agent. */
+    interface: TDevice;
+    /** Constructor reference; only its `name` is required here. */
+    constructor: {
+        name: string;
+    };
+}
+/**
+ * Additional information for logging server startup
+ */
+export interface StartupInfo {
+    /** Port to include in the startup log, if any. */
+    port?: number;
+    /** Host to include in the startup log, if any. */
+    host?: string;
+}
+/**
+ * Configuration accepted by `createMCPServerLauncher`.
+ */
+export interface MCPServerLauncherConfig<AgentType extends GenericAgent = GenericAgent, ToolsManagerType extends IMidsceneTools = IMidsceneTools> {
+    /** Agent instance; its `interface` property holds the underlying device. */
+    agent: AgentType;
+    /** Platform name, used for logging. */
+    platformName: string;
+    /** Tools manager implementation that exposes tools for the agent. */
+    ToolsManagerClass: new (...args: any[]) => ToolsManagerType;
+    /** MCP server implementation; its constructor takes an optional tools manager. */
+    MCPServerClass: new (toolsManager?: ToolsManagerType) => BaseMCPServer;
+}
+/**
+ * Create a generic MCP server launcher for a given agent, tools manager, and MCP server.
+ *
+ * This helper centralizes the common wiring logic used by platform-specific launchers:
+ * it constructs a tools manager, attaches the provided `agent` to it, then instantiates
+ * the `MCPServerClass` and exposes convenience methods to start the server over stdio
+ * (`launch`) or HTTP (`launchHttp`).
+ *
+ * Use this helper when adding a new platform-specific launcher or when you want to
+ * avoid duplicating boilerplate code for starting an MCP server. Typically, callers
+ * provide:
+ * - an `agent` instance that contains the underlying device on its `interface` property
+ * - a `ToolsManagerClass` that knows how to expose tools for that agent
+ * - an `MCPServerClass` that implements the MCP protocol and supports `launch` and
+ *   `launchHttp` methods.
+ *
+ * The returned object has two methods:
+ * - `launch(options?)` to start the server using stdio transport
+ * - `launchHttp(options)` to start the server using HTTP transport
+ * Both methods accept a `verbose` flag to control console logging.
+ *
+ * @param config Configuration describing the agent, platform name (for logging),
+ *               tools manager implementation, and MCP server implementation.
+ *
+ * @returns An object with `launch` and `launchHttp` methods to start the MCP server.
+ *
+ * @example
+ * ```typescript
+ * import { createMCPServerLauncher } from '@midscene/shared/mcp';
+ * import { Agent } from '@midscene/core/agent';
+ * import { WebMidsceneTools } from './web-tools';
+ * import { WebMCPServer } from './server';
+ *
+ * const agent = new Agent();
+ * const launcher = createMCPServerLauncher({
+ *   agent,
+ *   platformName: 'Web',
+ *   ToolsManagerClass: WebMidsceneTools,
+ *   MCPServerClass: WebMCPServer,
+ * });
+ *
+ * // Start with stdio
+ * await launcher.launch({ verbose: true });
+ *
+ * // Or start with HTTP
+ * await launcher.launchHttp({ port: 3000, host: 'localhost' });
+ * ```
+ *
+ * @internal
+ */
+export declare function createMCPServerLauncher<AgentType extends GenericAgent, ToolsManagerType extends IMidsceneTools>(config: MCPServerLauncherConfig<AgentType, ToolsManagerType>): {
+    /**
+     * Launch the MCP server with stdio transport
+     *
+     * @param options Optional settings; `verbose` controls console logging.
+     */
+    launch(options?: {
+        verbose?: boolean;
+    }): Promise<LaunchMCPServerResult>;
+    /**
+     * Launch the MCP server with HTTP transport
+     *
+     * @param options HTTP launch options (`port`, optional `host`) plus the
+     *                `verbose` logging flag.
+     */
+    launchHttp(options: LaunchMCPServerOptions): Promise<LaunchMCPServerResult>;
+};

package/dist/types/mcp/types.d.ts CHANGED Viewed

@@ -59,6 +59,10 @@ export interface ToolDefinition<T = Record<string, unknown>> {
     handler: ToolHandler<T>;
     autoDestroy?: boolean;
 }
+/**
+ * Tool type for mcpKitForAgent return value
+ *
+ * Plain alias of `ToolDefinition` with its default type argument.
+ */
+export type Tool = ToolDefinition;
 /**
  * Action space item definition
  * Note: Intentionally no index signature to maintain compatibility with DeviceAction

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@midscene/shared",
-  "version": "1.0.4",
+  "version": "1.0.5-beta-20251230124359.0",
   "repository": "https://github.com/web-infra-dev/midscene",
   "homepage": "https://midscenejs.com/",
   "types": "./dist/types/index.d.ts",
@@ -79,6 +79,7 @@
     "README.md"
   ],
   "dependencies": {
+    "@modelcontextprotocol/sdk": "1.10.2",
     "@silvia-odwyer/photon": "0.3.3",
     "@silvia-odwyer/photon-node": "0.3.3",
     "debug": "4.4.0",
@@ -90,7 +91,6 @@
   },
   "devDependencies": {
     "@rslib/core": "^0.18.3",
-    "@modelcontextprotocol/sdk": "1.10.2",
     "@types/debug": "4.1.12",
     "@types/express": "^4.17.21",
     "@types/node": "^18.0.0",

package/src/constants/example-code.ts CHANGED Viewed

@@ -5,15 +5,22 @@ export const PLAYWRIGHT_EXAMPLE_CODE = `
 IMPORTANT: Follow these exact type signatures for AI functions:
 // Type signatures for AI functions:
-aiInput(value: string, locator: string): Promise<void>
-aiTap(locator: string): Promise<void>
-aiDoubleClick(locator: string): Promise<void>
-aiScroll(scrollParam: {
-  direction: 'up' | 'down' | 'left' | 'right',
-  scrollType: 'once' | 'untilBottom' | 'untilTop' | 'untilRight' | 'untilLeft',
-  distance: number - scroll distance, px is the unit
+aiAct(prompt: string, options?: { cacheable?: boolean }): Promise<void>
+aiInput(text: string, locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiTap(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiHover(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiDoubleClick(locate: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiKeyboardPress(key: string, locate?: string, options?: { deepThink?: boolean, xpath?: string, cacheable?: boolean }): Promise<void>
+aiScroll(locate: string | undefined, options: {
+  direction?: 'up' | 'down' | 'left' | 'right',
+  scrollType?: 'singleAction' | 'scrollToBottom' | 'scrollToTop' | 'scrollToRight' | 'scrollToLeft',
+  distance?: number | null,
+  deepThink?: boolean,
+  xpath?: string,
+  cacheable?: boolean
 }): Promise<void>
-aiAssert(assertion: string): Promise<void>
+aiAssert(assertion: string, options?: { errorMessage?: string }): Promise<void>
+aiWaitFor(prompt: string, options?: { timeout?: number }): Promise<void>
 aiQuery<T>(queryObject: Record<string, string>): Promise<T> // Extracts data from page based on descriptions
 // examples:
@@ -35,12 +42,14 @@ test.beforeEach(async ({ page }) => {
 });
 test('ai shop', async ({
+  aiAct,
   aiInput,
   aiAssert,
   aiQuery,
   aiKeyboardPress,
   aiHover,
   aiTap,
+  aiWaitFor,
   agentForPage,
   page,
 }) => {
@@ -87,7 +96,7 @@ tasks:
         locate: 'input field description'
       - aiScroll:
         direction: down/up
-        scrollType: untilBottom/untilTop/page
+        scrollType: scrollToBottom/scrollToTop/singleAction
       - aiAssert: "expected state"
       - sleep: milliseconds
@@ -160,7 +169,7 @@ tasks:
       # Scroll globally or on an element described by a prompt.
       - aiScroll:
         direction: 'up' # or 'down' | 'left' | 'right'
-        scrollType: 'once' # or 'untilTop' | 'untilBottom' | 'untilLeft' | 'untilRight'
+        scrollType: 'singleAction' # or 'scrollToTop' | 'scrollToBottom' | 'scrollToLeft' | 'scrollToRight'
         distance: <number> # Optional, the scroll distance in pixels.
         locate: <prompt> # Optional, the element to scroll on.
         deepThink: <boolean> # Optional, whether to use deepThink to precisely locate the element. Defaults to False.

package/src/mcp/base-server.ts CHANGED Viewed

@@ -22,6 +22,23 @@ export interface HttpLaunchOptions {
   host?: string;
 }
+/**
+ * Value resolved by `launchMCPServer`, `BaseMCPServer.launch` and
+ * `BaseMCPServer.launchHttp`.
+ */
+export interface LaunchMCPServerResult {
+  /**
+   * The MCP server port (for HTTP mode)
+   */
+  port?: number;
+  /**
+   * The server host (for HTTP mode)
+   */
+  host?: string;
+  /**
+   * Function to gracefully shutdown the MCP server
+   */
+  close: () => Promise<void>;
+}
 interface SessionData {
   transport: StreamableHTTPServerTransport;
   createdAt: Date;
@@ -50,7 +67,7 @@ export interface CLIArgs {
 export function launchMCPServer(
   server: BaseMCPServer,
   args: CLIArgs,
-): Promise<void> {
+): Promise<LaunchMCPServerResult> {
   if (args.mode === 'http') {
     return server.launchHttp({
       port: Number.parseInt(args.port || '3000', 10),
@@ -72,18 +89,21 @@ export abstract class BaseMCPServer {
   protected mcpServer: McpServer;
   protected toolsManager?: IMidsceneTools;
   protected config: BaseMCPServerConfig;
+  protected providedToolsManager?: IMidsceneTools;
-  constructor(config: BaseMCPServerConfig) {
+  constructor(config: BaseMCPServerConfig, toolsManager?: IMidsceneTools) {
     this.config = config;
     this.mcpServer = new McpServer({
       name: config.name,
       version: config.version,
       description: config.description,
     });
+    this.providedToolsManager = toolsManager;
   }
   /**
    * Platform-specific: create tools manager instance
+   * This is only called if no tools manager was provided in constructor
    */
   protected abstract createToolsManager(): IMidsceneTools;
@@ -92,7 +112,9 @@ export abstract class BaseMCPServer {
    */
   private async initializeToolsManager(): Promise<void> {
     setIsMcp(true);
-    this.toolsManager = this.createToolsManager();
+    // Use provided tools manager if available, otherwise create new one
+    this.toolsManager = this.providedToolsManager || this.createToolsManager();
     try {
       await this.toolsManager.initTools();
@@ -117,7 +139,7 @@ export abstract class BaseMCPServer {
   /**
    * Initialize and launch the MCP server with stdio transport
    */
-  public async launch(): Promise<void> {
+  public async launch(): Promise<LaunchMCPServerResult> {
     // Hijack stdout-based console methods to stderr for stdio mode
     // This prevents them from breaking MCP JSON-RPC protocol on stdout
     // Note: console.warn and console.error already output to stderr
@@ -170,13 +192,21 @@ export abstract class BaseMCPServer {
     process.once('SIGINT', cleanup);
     process.once('SIGTERM', cleanup);
+    return {
+      close: async () => {
+        this.performCleanup();
+      },
+    };
   }
   /**
    * Launch MCP server with HTTP transport
    * Supports stateful sessions for web applications and service integration
    */
-  public async launchHttp(options: HttpLaunchOptions): Promise<void> {
+  public async launchHttp(
+    options: HttpLaunchOptions,
+  ): Promise<LaunchMCPServerResult> {
     // Validate port number
     if (
       !Number.isInteger(options.port) ||
@@ -286,6 +316,36 @@ export abstract class BaseMCPServer {
     const cleanupInterval = this.startSessionCleanup(sessions);
     this.setupHttpShutdownHandlers(server, sessions, cleanupInterval);
+    return {
+      port: options.port,
+      host,
+      close: async () => {
+        clearInterval(cleanupInterval);
+        for (const session of sessions.values()) {
+          try {
+            await session.transport.close();
+          } catch (error: unknown) {
+            const message =
+              error instanceof Error ? error.message : String(error);
+            console.error(
+              `Failed to close session ${session.transport.sessionId}: ${message}`,
+            );
+          }
+        }
+        sessions.clear();
+        return new Promise<void>((resolve) => {
+          server.close((err) => {
+            if (err) {
+              console.error('Error closing HTTP server:', err);
+            }
+            this.performCleanup();
+            resolve();
+          });
+        });
+      },
+    };
   }
   /**

package/src/mcp/base-tools.ts CHANGED Viewed

@@ -180,6 +180,20 @@ export abstract class BaseMidsceneTools<TAgent extends BaseAgent = BaseAgent>
     await this.agent?.destroy?.();
   }
+  /**
+   * Get tool definitions
+   *
+   * @returns The `toolDefinitions` array held by this instance. The array itself
+   *          is returned (not a copy), so mutations made by the caller are
+   *          visible to this tools manager.
+   */
+  public getToolDefinitions(): ToolDefinition[] {
+    return this.toolDefinitions;
+  }
+  /**
+   * Set agent for the tools manager
+   *
+   * Only reassigns `this.agent`; an agent that was set earlier is not
+   * destroyed by this call.
+   *
+   * @param agent - The agent instance to store on this tools manager.
+   */
+  public setAgent(agent: TAgent): void {
+    this.agent = agent;
+  }
   /**
    * Helper: Convert base64 screenshot to image content array
    */

package/src/mcp/index.ts CHANGED Viewed

@@ -3,3 +3,4 @@ export * from './base-tools';
 export * from './tool-generator';
 export * from './types';
 export * from './inject-report-html-plugin';
+export * from './launcher-helper';

package/src/mcp/launcher-helper.ts ADDED Viewed

@@ -0,0 +1,200 @@
+import type { BaseMCPServer } from './base-server';
+import type { HttpLaunchOptions, LaunchMCPServerResult } from './base-server';
+import type { IMidsceneTools } from './types';
+/**
+ * Options accepted by the launcher's `launchHttp`: every field of
+ * `HttpLaunchOptions` plus a switch for the launcher's own console output.
+ */
+export interface LaunchMCPServerOptions extends HttpLaunchOptions {
+  /**
+   * Whether to show server logs
+   * (the startup banner and the "started" message printed by the launcher)
+   * @default true
+   */
+  verbose?: boolean;
+}
+/**
+ * Generic agent type (avoid importing from @midscene/core to prevent circular deps)
+ *
+ * Describes only the two members the launcher reads from an agent.
+ */
+export interface GenericAgent<TDevice = any> {
+  /**
+   * The underlying device. The launcher refuses to start when this is falsy
+   * and prints the device's constructor name in the startup banner.
+   */
+  interface: TDevice;
+  /**
+   * Constructor reference; its `name` is printed as the agent name in the startup banner.
+   */
+  constructor: { name: string };
+}
+/**
+ * Additional information for logging server startup
+ */
+export interface StartupInfo {
+  /**
+   * Port to print in the startup banner; printed whenever it is not `undefined`.
+   */
+  port?: number;
+  /**
+   * Host to print in the startup banner; skipped when missing or empty.
+   */
+  host?: string;
+}
+/**
+ * Configuration consumed by `createMCPServerLauncher`.
+ */
+export interface MCPServerLauncherConfig<
+  AgentType extends GenericAgent = GenericAgent,
+  ToolsManagerType extends IMidsceneTools = IMidsceneTools,
+> {
+  /**
+   * Agent that is attached to every tools manager the launcher creates.
+   */
+  agent: AgentType;
+  /**
+   * Human-readable platform label; used only in the launcher's log messages.
+   */
+  platformName: string;
+  /**
+   * Tools manager constructor. The launcher calls it with no arguments and
+   * then assigns the agent onto the new instance.
+   */
+  ToolsManagerClass: new (...args: any[]) => ToolsManagerType;
+  /**
+   * MCP server constructor. The launcher passes it the tools manager it just created.
+   */
+  MCPServerClass: new (toolsManager?: ToolsManagerType) => BaseMCPServer;
+}
+/**
+ * Create a generic MCP server launcher for a given agent, tools manager, and MCP server.
+ *
+ * This helper centralizes the common wiring logic used by platform-specific launchers:
+ * it constructs a tools manager, attaches the provided `agent` to it, then instantiates
+ * the `MCPServerClass` and exposes convenience methods to start the server over stdio
+ * (`launch`) or HTTP (`launchHttp`).
+ *
+ * Use this helper when adding a new platform-specific launcher or when you want to
+ * avoid duplicating boilerplate code for starting an MCP server. Typically, callers
+ * provide:
+ * - an `agent` instance that contains the underlying device on its `interface` property
+ * - a `ToolsManagerClass` that knows how to expose tools for that agent
+ * - an `MCPServerClass` that implements the MCP protocol and supports `launch` and
+ *   `launchHttp` methods.
+ *
+ * The returned object has two methods:
+ * - `launch(options?)` to start the server using stdio transport
+ * - `launchHttp(options)` to start the server using HTTP transport
+ * Both methods accept a `verbose` flag to control console logging.
+ *
+ * @param config Configuration describing the agent, platform name (for logging),
+ *               tools manager implementation, and MCP server implementation.
+ *
+ * @returns An object with `launch` and `launchHttp` methods to start the MCP server.
+ *
+ * @example
+ * ```typescript
+ * import { createMCPServerLauncher } from '@midscene/shared/mcp';
+ * import { Agent } from '@midscene/core/agent';
+ * import { WebMidsceneTools } from './web-tools';
+ * import { WebMCPServer } from './server';
+ *
+ * const agent = new Agent();
+ * const launcher = createMCPServerLauncher({
+ *   agent,
+ *   platformName: 'Web',
+ *   ToolsManagerClass: WebMidsceneTools,
+ *   MCPServerClass: WebMCPServer,
+ * });
+ *
+ * // Start with stdio
+ * await launcher.launch({ verbose: true });
+ *
+ * // Or start with HTTP
+ * await launcher.launchHttp({ port: 3000, host: 'localhost' });
+ * ```
+ *
+ * @internal
+ */
+export function createMCPServerLauncher<
+  AgentType extends GenericAgent,
+  ToolsManagerType extends IMidsceneTools,
+>(config: MCPServerLauncherConfig<AgentType, ToolsManagerType>) {
+  const { agent, platformName, ToolsManagerClass, MCPServerClass } = config;
+  /**
+   * Validate that the agent has the required interface property
+   * @throws {Error} If agent.interface is missing
+   */
+  function validateAgent(): void {
+    const device = agent.interface;
+    if (!device) {
+      // `typeof null` is 'object', which would make the message misleading.
+      const actual = device === null ? 'null' : typeof device;
+      throw new Error(
+        `Agent must have an 'interface' property that references the underlying device.
+Please ensure your agent instance is properly initialized with a device interface.
+Expected: agent.interface to be defined, but got: ${actual}
+Solution: Check that your agent constructor properly sets the interface property.`,
+      );
+    }
+  }
+  /**
+   * Create and configure a tools manager with the agent
+   * @returns Configured tools manager instance
+   */
+  function createToolsManager(): ToolsManagerType {
+    const toolsManager = new ToolsManagerClass();
+    // Type-safe agent injection: define explicit interface for tools manager with agent
+    interface ToolsManagerWithAgent extends IMidsceneTools {
+      agent: AgentType;
+    }
+    (toolsManager as unknown as ToolsManagerWithAgent).agent = agent;
+    return toolsManager;
+  }
+  /**
+   * Write one launcher message to the console.
+   *
+   * In stdio mode stdout carries the MCP JSON-RPC stream, so launcher messages
+   * are sent to stderr; writing them to stdout before the server has started
+   * would inject non-protocol text into that stream.
+   * @param mode - Transport mode ('stdio' or 'HTTP')
+   * @param message - Text to print
+   */
+  function writeLog(mode: 'stdio' | 'HTTP', message: string): void {
+    if (mode === 'stdio') {
+      console.error(message);
+    } else {
+      console.log(message);
+    }
+  }
+  /**
+   * Log server startup information
+   * @param mode - Transport mode ('stdio' or 'HTTP')
+   * @param additionalInfo - Additional info to log (e.g., port, host)
+   */
+  function logStartupInfo(
+    mode: 'stdio' | 'HTTP',
+    additionalInfo?: StartupInfo,
+  ): void {
+    const device = agent.interface;
+    // Fall back to the runtime type when the device exposes no constructor.
+    const deviceName = device.constructor?.name ?? typeof device;
+    writeLog(mode, `Starting Midscene ${platformName} MCP Server (${mode})...`);
+    writeLog(mode, `Agent: ${agent.constructor.name}`);
+    writeLog(mode, `Device: ${deviceName}`);
+    if (additionalInfo?.port !== undefined) {
+      writeLog(mode, `Port: ${additionalInfo.port}`);
+    }
+    if (additionalInfo?.host) {
+      writeLog(mode, `Host: ${additionalInfo.host}`);
+    }
+  }
+  return {
+    /**
+     * Launch the MCP server with stdio transport
+     *
+     * Launcher messages are written to stderr in this mode.
+     * @param options - `verbose` (default `true`) toggles the launcher messages
+     * @returns The result of the server's `launch()` call
+     * @throws {Error} If agent.interface is missing
+     */
+    async launch(
+      options: { verbose?: boolean } = {},
+    ): Promise<LaunchMCPServerResult> {
+      const { verbose = true } = options;
+      validateAgent();
+      if (verbose) {
+        logStartupInfo('stdio');
+      }
+      const toolsManager = createToolsManager();
+      const server = new MCPServerClass(toolsManager);
+      const result = await server.launch();
+      if (verbose) {
+        writeLog('stdio', `${platformName} MCP Server started (stdio mode)`);
+      }
+      return result;
+    },
+    /**
+     * Launch the MCP server with HTTP transport
+     *
+     * @param options - HTTP options; `host` defaults to 'localhost' and
+     *                  `verbose` defaults to `true`
+     * @returns The result of the server's `launchHttp()` call
+     * @throws {Error} If agent.interface is missing
+     */
+    async launchHttp(
+      options: LaunchMCPServerOptions,
+    ): Promise<LaunchMCPServerResult> {
+      const { port, host = 'localhost', verbose = true } = options;
+      validateAgent();
+      if (verbose) {
+        logStartupInfo('HTTP', { port, host });
+      }
+      const toolsManager = createToolsManager();
+      const server = new MCPServerClass(toolsManager);
+      const result = await server.launchHttp({ port, host });
+      if (verbose) {
+        writeLog(
+          'HTTP',
+          `${platformName} MCP Server started on http://${result.host}:${result.port}/mcp`,
+        );
+      }
+      return result;
+    },
+  };
+}

package/src/mcp/types.ts CHANGED Viewed

@@ -58,6 +58,11 @@ export interface ToolDefinition<T = Record<string, unknown>> {
   autoDestroy?: boolean;
 }
+/**
+ * Tool type for mcpKitForAgent return value
+ */
+export type Tool = ToolDefinition;
 /**
  * Action space item definition
  * Note: Intentionally no index signature to maintain compatibility with DeviceAction