npm - appium-mcp - Versions diffs - 1.29.0 → 1.31.0 - Mend

appium-mcp 1.29.0 → 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (57) hide show

package/CHANGELOG.md +12 -0
package/README.md +89 -1
package/dist/ai-finder/types.d.ts +41 -0
package/dist/ai-finder/types.d.ts.map +1 -0
package/dist/ai-finder/types.js +5 -0
package/dist/ai-finder/types.js.map +1 -0
package/dist/ai-finder/vision-finder.d.ts +101 -0
package/dist/ai-finder/vision-finder.d.ts.map +1 -0
package/dist/ai-finder/vision-finder.js +434 -0
package/dist/ai-finder/vision-finder.js.map +1 -0
package/dist/tests/__mocks__/@appium/support.d.ts +23 -0
package/dist/tests/__mocks__/@appium/support.d.ts.map +1 -1
package/dist/tests/__mocks__/@appium/support.js +23 -0
package/dist/tests/__mocks__/@appium/support.js.map +1 -1
package/dist/tests/tools/session/battery-info.test.d.ts +2 -0
package/dist/tests/tools/session/battery-info.test.d.ts.map +1 -0
package/dist/tests/tools/session/battery-info.test.js +69 -0
package/dist/tests/tools/session/battery-info.test.js.map +1 -0
package/dist/tests/vision-finder.test.d.ts +10 -0
package/dist/tests/vision-finder.test.d.ts.map +1 -0
package/dist/tests/vision-finder.test.js +398 -0
package/dist/tests/vision-finder.test.js.map +1 -0
package/dist/tools/index.d.ts.map +1 -1
package/dist/tools/index.js +2 -0
package/dist/tools/index.js.map +1 -1
package/dist/tools/interactions/click.d.ts.map +1 -1
package/dist/tools/interactions/click.js +49 -3
package/dist/tools/interactions/click.js.map +1 -1
package/dist/tools/interactions/find.d.ts +3 -1
package/dist/tools/interactions/find.d.ts.map +1 -1
package/dist/tools/interactions/find.js +87 -5
package/dist/tools/interactions/find.js.map +1 -1
package/dist/tools/interactions/screenshot.d.ts +2 -7
package/dist/tools/interactions/screenshot.d.ts.map +1 -1
package/dist/tools/interactions/screenshot.js +3 -18
package/dist/tools/interactions/screenshot.js.map +1 -1
package/dist/tools/session/battery-info.d.ts +3 -0
package/dist/tools/session/battery-info.d.ts.map +1 -0
package/dist/tools/session/battery-info.js +70 -0
package/dist/tools/session/battery-info.js.map +1 -0
package/dist/utils/paths.d.ts +8 -0
package/dist/utils/paths.d.ts.map +1 -0
package/dist/utils/paths.js +19 -0
package/dist/utils/paths.js.map +1 -0
package/package.json +3 -1
package/server.json +2 -2
package/src/ai-finder/types.ts +41 -0
package/src/ai-finder/vision-finder.ts +568 -0
package/src/tests/__mocks__/@appium/support.ts +43 -0
package/src/tests/tools/session/battery-info.test.ts +102 -0
package/src/tests/vision-finder.test.ts +728 -0
package/src/tools/index.ts +2 -0
package/src/tools/interactions/click.ts +61 -3
package/src/tools/interactions/find.ts +117 -6
package/src/tools/interactions/screenshot.ts +3 -21
package/src/tools/session/battery-info.ts +83 -0
package/src/utils/paths.ts +22 -0

package/CHANGELOG.md CHANGED Viewed

@@ -1,3 +1,15 @@
+## [1.31.0](https://github.com/appium/appium-mcp/compare/v1.30.0...v1.31.0) (2026-03-21)
+### Features
+* **tools:** implement battery info ([#219](https://github.com/appium/appium-mcp/issues/219)) ([f67ea39](https://github.com/appium/appium-mcp/commit/f67ea3989a34dc13ca36bccb915fb87ae8f7304a))
+## [1.30.0](https://github.com/appium/appium-mcp/compare/v1.29.0...v1.30.0) (2026-03-20)
+### Features
+* **ai-finder:** add natural language element finding using vision models ([#200](https://github.com/appium/appium-mcp/issues/200)) ([2b43267](https://github.com/appium/appium-mcp/commit/2b43267b63c936c2e982d36b8e8fed658bd9a51c))
 ## [1.29.0](https://github.com/appium/appium-mcp/compare/v1.28.0...v1.29.0) (2026-03-20)
 ### Features

package/README.md CHANGED Viewed

@@ -19,6 +19,7 @@ MCP Appium is an intelligent MCP (Model Context Protocol) server designed to emp
 ## 🚀 Features
 - **Cross-Platform Support**: Automate tests for both Android (UiAutomator2) and iOS (XCUITest).
+- **AI-Powered Element Finding**: Locate UI elements using natural language descriptions powered by vision models - no need for complex XPath or selectors.
 - **Intelligent Locator Generation**: AI-powered element identification using priority-based strategies.
 - **Interactive Session Management**: Easily create and manage sessions on local mobile devices.
 - **Smart Element Interactions**: Perform actions like clicks, text input, screenshots, and element finding.
@@ -176,6 +177,55 @@ Set the `CAPABILITIES_CONFIG` environment variable to point to your configuratio
 Set the `SCREENSHOTS_DIR` environment variable to specify where screenshots are saved. If not set, screenshots are saved to the current working directory. Supports both absolute and relative paths (relative paths are resolved from the current working directory). The directory is created automatically if it doesn't exist.
+### AI Vision Element Finding
+Configure AI-powered element finding using vision models. This feature allows you to locate UI elements using natural language descriptions instead of traditional XPath or ID selectors.
+**Required Environment Variables:**
+```json
+{
+  "appium-mcp": {
+    "env": {
+      "ANDROID_HOME": "/path/to/android/sdk",
+      "AI_VISION_API_BASE_URL": "https://dashscope.aliyuncs.com/compatible-mode/v1",
+      "AI_VISION_API_TOKEN": "your_api_key_here"
+    }
+  }
+}
+```
+**Optional Environment Variables:**
+- `AI_VISION_MODEL`: Model name (default: `Qwen3-VL-235B-A22B-Instruct`)
+- `AI_VISION_COORD_TYPE`: Coordinate type - `normalized` or `absolute` (default: `normalized`)
+- `AI_VISION_IMAGE_MAX_WIDTH`: Max image width for compression in pixels (default: `1080`)
+- `AI_VISION_IMAGE_QUALITY`: JPEG quality 1-100 (default: `80`)
+**Supported Vision Model Providers:**
+Based on benchmark testing, the following models are recommended:
+1. **Qwen3-VL-235B-A22B-Instruct**
+   - Provider: Alibaba Cloud DashScope
+   - Accuracy: 100%
+   - Speed: 12649ms
+   - API: `https://dashscope.aliyuncs.com/compatible-mode/v1`
+2. **gemini-3-flash-preview**
+   - Provider: Google AI
+   - Accuracy: 100%
+   - Speed: 17353ms
+   - API: `https://generativelanguage.googleapis.com/v1beta`
+Results for more benchmarked models can be found in the [benchmark test report](src/tests/benchmark_model/TEST_REPORT.md).
+**Performance Features:**
+- **Image Compression**: Automatically compresses screenshots to reduce API latency and token costs (50-80% size reduction)
+- **Result Caching**: Caches results for 5 minutes using a module-level LRU cache (max 50 entries) that persists across tool calls, avoiding redundant API calls for identical screenshot + instruction pairs
+- **Coordinate Handling**: In `normalized` mode (default), the model returns 0–1000 range coordinates that are automatically scaled to absolute pixel coordinates using the original image dimensions — independent of any image compression. In `absolute` mode, image resizing is disabled so the model's returned pixel coordinates always map directly to the original screen dimensions.
 ### Performance Optimization
 #### NO_UI Mode
@@ -263,7 +313,7 @@ The default regex pattern allows any URL that starts with `http://` or `https://
 | Tool                  | Description                                                                                  |
 | --------------------- | -------------------------------------------------------------------------------------------- |
-| `appium_find_element` | Find a specific element using various locator strategies (xpath, id, accessibility id, etc.) |
+| `appium_find_element` | Find a specific element using traditional locator strategies (xpath, id, accessibility id, etc.) **OR** AI-powered natural language descriptions (e.g., "yellow search button at bottom"). Supports both traditional and AI modes. |
 | `appium_tap_by_coordinates` | Tap at specific screen coordinates (x, y). On iOS, coordinates are in points. On Android, coordinates are in device pixels. Use `appium_get_page_source` for accurate coordinates. |
 | `appium_click`        | Click on an element                                                                          |
 | `appium_double_tap`   | Perform double tap on an element                                                             |
@@ -290,6 +340,7 @@ The default regex pattern allows any URL that starts with `http://` or `https://
 | `appium_get_geolocation`   | Get the current GPS coordinates (latitude, longitude, altitude) of the device. |
 | `appium_reset_geolocation` | Reset the simulated/mocked geolocation back to the system default. On iOS, clears the simulated location. On Android real devices, removes the mock location provider. Not supported on Android emulators. |
 | `appium_mobile_get_device_info` | Get device information (model, OS version, locale, timezone, screen density, etc.). On iOS real devices, includes detailed lockdown info (hardware model, product type, CPU architecture, etc.). |
+| `appium_mobile_get_battery_info` | Get the current battery level (as a percentage) and charging state of the device. Works on both iOS and Android. |
 ### App Management
@@ -326,6 +377,43 @@ Open Amazon mobile app, search for "iPhone 15 Pro", select the first search resu
 This example demonstrates a complete e-commerce checkout flow that can be automated using MCP Appium's intelligent locator generation and test creation capabilities.
+### AI-Powered Element Finding Examples
+**Traditional Mode (XPath/ID):**
+```json
+{
+  "tool": "appium_find_element",
+  "arguments": {
+    "strategy": "xpath",
+    "selector": "//android.widget.Button[@text='Search']"
+  }
+}
+```
+**AI Mode (Natural Language):**
+```json
+{
+  "tool": "appium_find_element",
+  "arguments": {
+    "strategy": "ai_instruction",
+    "ai_instruction": "yellow search button at the bottom of the screen"
+  }
+}
+```
+**More AI Mode Examples:**
+- `"username input field at top"`
+- `"settings icon in top-right corner"`
+- `"red delete button next to the item"`
+- `"blue submit button at bottom"`
+- `"profile picture in navigation bar"`
+**Benefits of AI Mode:**
+- **No Complex Selectors**: Describe elements in plain language
+- **Resilient to UI Changes**: Semantic understanding adapts to layout changes
+- **Faster Development**: No need to inspect element hierarchies
+- **Works Across Languages**: Describe in any language you're comfortable with
 ### Working in Your Native Language
 **MCP Appium works seamlessly in any language** - you don't need to know English! The AI assistant understands and responds in your native language. Simply describe what you want to do in your preferred language:

package/dist/ai-finder/types.d.ts ADDED Viewed

@@ -0,0 +1,41 @@
+/**
+ * Type definitions for AI Vision Finder
+ */
+/**
+ * AI Vision configuration interface
+ */
+export interface AIVisionConfig {
+    model: string;
+    apiBaseUrl: string;
+    apiToken: string;
+    coordType: 'normalized' | 'absolute';
+    imageMaxWidth: number;
+    imageQuality: number;
+}
+/**
+ * Bounding box type: [x1, y1, x2, y2]
+ * - x1, y1: top-left corner coordinates
+ * - x2, y2: bottom-right corner coordinates
+ */
+export type BBox = [x1: number, y1: number, x2: number, y2: number];
+/**
+ * Bounding box coordinates interface
+ * Matches the format returned by vision models
+ */
+export interface BBoxCoordinates {
+    target: string;
+    bbox_2d: BBox;
+}
+/**
+ * AI element finding result interface
+ */
+export interface AIFindResult {
+    bbox: BBox;
+    center: {
+        x: number;
+        y: number;
+    };
+    target: string;
+    annotatedImagePath?: string;
+}
+//# sourceMappingURL=types.d.ts.map

package/dist/ai-finder/types.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/ai-finder/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,YAAY,GAAG,UAAU,CAAC;IACrC,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,MAAM,MAAM,IAAI,GAAG,CAAC,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;AAEpE;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,IAAI,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,IAAI,CAAC;IACX,MAAM,EAAE;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACjC,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B"}

package/dist/ai-finder/types.js ADDED Viewed

@@ -0,0 +1,5 @@
+/**
+ * Type definitions for AI Vision Finder
+ */
+export {};
+//# sourceMappingURL=types.js.map

package/dist/ai-finder/types.js.map ADDED Viewed

@@ -0,0 +1 @@
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/ai-finder/types.ts"],"names":[],"mappings":"AAAA;;GAEG"}

package/dist/ai-finder/vision-finder.d.ts ADDED Viewed

@@ -0,0 +1,101 @@
+/**
+ * AI Vision Finder Module
+ *
+ * Core module for AI-powered element finding using vision models.
+ * Implementation aligns with benchmark_model.ts standards.
+ */
+import type { AIFindResult } from './types.js';
+/**
+ * AI Vision Finder class
+ * Based on benchmark results: Qwen3-VL-235B-A22B-Instruct (100% accuracy, 8417ms)
+ */
+export declare class AIVisionFinder {
+    private config;
+    private readonly cache;
+    private readonly CACHE_TTL_MS;
+    constructor();
+    /**
+     * Find element using AI vision model
+     * @param screenshotBase64 - Base64 encoded screenshot
+     * @param instruction - Natural language instruction
+     * @param imageWidth - Original image width
+     * @param imageHeight - Original image height
+     * @returns AI find result with bbox and center coordinates
+     */
+    findElement(screenshotBase64: string, instruction: string, imageWidth: number, imageHeight: number): Promise<AIFindResult>;
+    /**
+     * Compress image using @appium/support sharp utilities
+     * Reduces API latency and token consumption
+     *
+     * Returns both the base64-encoded image and its MIME type so that the caller
+     * can construct a correct data URL. On compression failure the original bytes
+     * are returned with mimeType 'image/png' (Appium screenshots are always PNG).
+     *
+     * **Resizing policy**: Resizing is intentionally skipped when
+     * `coordType === 'absolute'`. In absolute mode the vision model returns pixel
+     * coordinates relative to the image it received. If the image were resized,
+     * those coordinates would map to the compressed dimensions rather than the
+     * original screen dimensions, causing incorrect tap positions. Only JPEG
+     * quality compression is applied in that case.
+     */
+    private compressImage;
+    /**
+     * Build prompt for vision model
+     * Matches benchmark_model.ts prompt format for consistency
+     */
+    private buildPrompt;
+    /**
+     * Call vision model API
+     * Matches benchmark_model.ts implementation
+     */
+    private callVisionAPI;
+    /**
+     * Parse bbox coordinates from model response
+     * Matches benchmark_model.ts parsing logic
+     */
+    private parseBBox;
+    /**
+     * Convert coordinates based on model's coordinate type
+     * Matches benchmark_model.ts coordinate conversion logic
+     *
+     * Coordinate type modes:
+     * - **normalized** (default, AI_VISION_COORD_TYPE=normalized):
+     *   The vision model returns coordinates in the range 0–1000, where
+     *   (0,0) is the top-left corner and (1000,1000) is the bottom-right corner.
+     *   This method scales them to absolute pixel coordinates using the original
+     *   image dimensions. This mode is independent of image compression.
+     *
+     * - **absolute** (AI_VISION_COORD_TYPE=absolute):
+     *   The vision model returns pixel coordinates for the image it received,
+     *   which is never resized in this mode (JPEG quality compression only), so
+     *   they already match the original resolution and are used as-is, unscaled.
+     *   Use this mode only if the model explicitly outputs absolute pixel values.
+     */
+    private convertCoordinates;
+    /**
+     * Draw bounding box on image and save to file
+     * Based on benchmark_model.ts drawBBoxOnImage implementation
+     * @param screenshotBase64 - Base64 encoded screenshot
+     * @param bbox - Bounding box coordinates [x1, y1, x2, y2]
+     * @param imageWidth - Image width
+     * @param imageHeight - Image height
+     * @param targetName - Target element name for label
+     * @returns Absolute path to the annotated image file
+     */
+    private drawBBoxOnImage;
+    /**
+     * Generate cache key from instruction and image
+     */
+    private generateCacheKey;
+    /**
+     * Get result from cache if valid
+     * TTL expiry and LRU eviction are handled automatically by LRUCache
+     */
+    private getFromCache;
+    /**
+     * Save result to cache
+     * TTL expiry and LRU eviction are handled automatically by LRUCache
+     */
+    private saveToCache;
+}
+//# sourceMappingURL=vision-finder.d.ts.map

package/dist/ai-finder/vision-finder.d.ts.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"vision-finder.d.ts","sourceRoot":"","sources":["../../src/ai-finder/vision-finder.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AASH,OAAO,KAAK,EAIV,YAAY,EACb,MAAM,YAAY,CAAC;AAGpB;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAiC;IACvD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAiB;;IAyC9C;;;;;;;OAOG;IACG,WAAW,CACf,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,UAAU,EAAE,MAAM,EAClB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,YAAY,CAAC;IAkFxB;;;;;;;;;;;;;;OAcG;YACW,aAAa;IA8D3B;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmDnB;;;OAGG;YACW,aAAa;IAyD3B;;;OAGG;IACH,OAAO,CAAC,SAAS;IAyCjB;;;;;;;;;;;;;;;;OAgBG;IACH,OAAO,CAAC,kBAAkB;IA+C1B;;;;;;;;;OASG;YACW,eAAe;IA+D7B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IASxB;;;OAGG;IACH,OAAO,CAAC,YAAY;IAIpB;;;OAGG;IACH,OAAO,CAAC,WAAW;CAGpB"}