appium-mcp 1.29.0 → 1.31.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. package/CHANGELOG.md +12 -0
  2. package/README.md +89 -1
  3. package/dist/ai-finder/types.d.ts +41 -0
  4. package/dist/ai-finder/types.d.ts.map +1 -0
  5. package/dist/ai-finder/types.js +5 -0
  6. package/dist/ai-finder/types.js.map +1 -0
  7. package/dist/ai-finder/vision-finder.d.ts +101 -0
  8. package/dist/ai-finder/vision-finder.d.ts.map +1 -0
  9. package/dist/ai-finder/vision-finder.js +434 -0
  10. package/dist/ai-finder/vision-finder.js.map +1 -0
  11. package/dist/tests/__mocks__/@appium/support.d.ts +23 -0
  12. package/dist/tests/__mocks__/@appium/support.d.ts.map +1 -1
  13. package/dist/tests/__mocks__/@appium/support.js +23 -0
  14. package/dist/tests/__mocks__/@appium/support.js.map +1 -1
  15. package/dist/tests/tools/session/battery-info.test.d.ts +2 -0
  16. package/dist/tests/tools/session/battery-info.test.d.ts.map +1 -0
  17. package/dist/tests/tools/session/battery-info.test.js +69 -0
  18. package/dist/tests/tools/session/battery-info.test.js.map +1 -0
  19. package/dist/tests/vision-finder.test.d.ts +10 -0
  20. package/dist/tests/vision-finder.test.d.ts.map +1 -0
  21. package/dist/tests/vision-finder.test.js +398 -0
  22. package/dist/tests/vision-finder.test.js.map +1 -0
  23. package/dist/tools/index.d.ts.map +1 -1
  24. package/dist/tools/index.js +2 -0
  25. package/dist/tools/index.js.map +1 -1
  26. package/dist/tools/interactions/click.d.ts.map +1 -1
  27. package/dist/tools/interactions/click.js +49 -3
  28. package/dist/tools/interactions/click.js.map +1 -1
  29. package/dist/tools/interactions/find.d.ts +3 -1
  30. package/dist/tools/interactions/find.d.ts.map +1 -1
  31. package/dist/tools/interactions/find.js +87 -5
  32. package/dist/tools/interactions/find.js.map +1 -1
  33. package/dist/tools/interactions/screenshot.d.ts +2 -7
  34. package/dist/tools/interactions/screenshot.d.ts.map +1 -1
  35. package/dist/tools/interactions/screenshot.js +3 -18
  36. package/dist/tools/interactions/screenshot.js.map +1 -1
  37. package/dist/tools/session/battery-info.d.ts +3 -0
  38. package/dist/tools/session/battery-info.d.ts.map +1 -0
  39. package/dist/tools/session/battery-info.js +70 -0
  40. package/dist/tools/session/battery-info.js.map +1 -0
  41. package/dist/utils/paths.d.ts +8 -0
  42. package/dist/utils/paths.d.ts.map +1 -0
  43. package/dist/utils/paths.js +19 -0
  44. package/dist/utils/paths.js.map +1 -0
  45. package/package.json +3 -1
  46. package/server.json +2 -2
  47. package/src/ai-finder/types.ts +41 -0
  48. package/src/ai-finder/vision-finder.ts +568 -0
  49. package/src/tests/__mocks__/@appium/support.ts +43 -0
  50. package/src/tests/tools/session/battery-info.test.ts +102 -0
  51. package/src/tests/vision-finder.test.ts +728 -0
  52. package/src/tools/index.ts +2 -0
  53. package/src/tools/interactions/click.ts +61 -3
  54. package/src/tools/interactions/find.ts +117 -6
  55. package/src/tools/interactions/screenshot.ts +3 -21
  56. package/src/tools/session/battery-info.ts +83 -0
  57. package/src/utils/paths.ts +22 -0
package/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## [1.31.0](https://github.com/appium/appium-mcp/compare/v1.30.0...v1.31.0) (2026-03-21)
2
+
3
+ ### Features
4
+
5
+ * **tools:** implement battery info ([#219](https://github.com/appium/appium-mcp/issues/219)) ([f67ea39](https://github.com/appium/appium-mcp/commit/f67ea3989a34dc13ca36bccb915fb87ae8f7304a))
6
+
7
+ ## [1.30.0](https://github.com/appium/appium-mcp/compare/v1.29.0...v1.30.0) (2026-03-20)
8
+
9
+ ### Features
10
+
11
+ * **ai-finder:** add natural language element finding using vision models ([#200](https://github.com/appium/appium-mcp/issues/200)) ([2b43267](https://github.com/appium/appium-mcp/commit/2b43267b63c936c2e982d36b8e8fed658bd9a51c))
12
+
1
13
  ## [1.29.0](https://github.com/appium/appium-mcp/compare/v1.28.0...v1.29.0) (2026-03-20)
2
14
 
3
15
  ### Features
package/README.md CHANGED
@@ -19,6 +19,7 @@ MCP Appium is an intelligent MCP (Model Context Protocol) server designed to emp
19
19
  ## 🚀 Features
20
20
 
21
21
  - **Cross-Platform Support**: Automate tests for both Android (UiAutomator2) and iOS (XCUITest).
22
+ - **AI-Powered Element Finding**: Locate UI elements using natural language descriptions powered by vision models - no need for complex XPath or selectors.
22
23
  - **Intelligent Locator Generation**: AI-powered element identification using priority-based strategies.
23
24
  - **Interactive Session Management**: Easily create and manage sessions on local mobile devices.
24
25
  - **Smart Element Interactions**: Perform actions like clicks, text input, screenshots, and element finding.
@@ -176,6 +177,55 @@ Set the `CAPABILITIES_CONFIG` environment variable to point to your configuratio
176
177
 
177
178
  Set the `SCREENSHOTS_DIR` environment variable to specify where screenshots are saved. If not set, screenshots are saved to the current working directory. Supports both absolute and relative paths (relative paths are resolved from the current working directory). The directory is created automatically if it doesn't exist.
178
179
 
180
+ ### AI Vision Element Finding
181
+
182
+ Configure AI-powered element finding using vision models. This feature allows you to locate UI elements using natural language descriptions instead of traditional XPath or ID selectors.
183
+
184
+ **Required Environment Variables:**
185
+
186
+ ```json
187
+ {
188
+ "appium-mcp": {
189
+ "env": {
190
+ "ANDROID_HOME": "/path/to/android/sdk",
191
+ "AI_VISION_API_BASE_URL": "https://dashscope.aliyuncs.com/compatible-mode/v1",
192
+ "AI_VISION_API_TOKEN": "your_api_key_here"
193
+ }
194
+ }
195
+ }
196
+ ```
197
+
198
+ **Optional Environment Variables:**
199
+
200
+ - `AI_VISION_MODEL`: Model name (default: `Qwen3-VL-235B-A22B-Instruct`)
201
+ - `AI_VISION_COORD_TYPE`: Coordinate type - `normalized` or `absolute` (default: `normalized`)
202
+ - `AI_VISION_IMAGE_MAX_WIDTH`: Max image width for compression in pixels (default: `1080`)
203
+ - `AI_VISION_IMAGE_QUALITY`: JPEG quality 1-100 (default: `80`)
204
+
205
+ **Supported Vision Model Providers:**
206
+
207
+ Based on benchmark testing, the following models are recommended:
208
+
209
+ 1. **Qwen3-VL-235B-A22B-Instruct**
210
+ - Provider: Alibaba Cloud DashScope
211
+ - Accuracy: 100%
212
+ - Speed: 12649ms
213
+ - API: `https://dashscope.aliyuncs.com/compatible-mode/v1`
214
+
215
+ 2. **gemini-3-flash-preview**
216
+ - Provider: Google AI
217
+ - Accuracy: 100%
218
+ - Speed: 17353ms
219
+ - API: `https://generativelanguage.googleapis.com/v1beta`
220
+
221
+ Results for additional benchmarked models can be found [here](src/tests/benchmark_model/TEST_REPORT.md).
222
+
223
+ **Performance Features:**
224
+
225
+ - **Image Compression**: Automatically compresses screenshots to reduce API latency and token costs (50-80% size reduction)
226
+ - **Result Caching**: Caches results for 5 minutes using a module-level LRU cache (max 50 entries) that persists across tool calls, avoiding redundant API calls for identical screenshot + instruction pairs
227
+ - **Coordinate Handling**: In `normalized` mode (default), the model returns 0–1000 range coordinates that are automatically scaled to absolute pixel coordinates using the original image dimensions — independent of any image compression. In `absolute` mode, image resizing is disabled so the model's returned pixel coordinates always map directly to the original screen dimensions.
228
+
179
229
  ### Performance Optimization
180
230
 
181
231
  #### NO_UI Mode
@@ -263,7 +313,7 @@ The default regex pattern allows any URL that starts with `http://` or `https://
263
313
 
264
314
  | Tool | Description |
265
315
  | --------------------- | -------------------------------------------------------------------------------------------- |
266
- | `appium_find_element` | Find a specific element using various locator strategies (xpath, id, accessibility id, etc.) |
316
+ | `appium_find_element` | Find a specific element using traditional locator strategies (xpath, id, accessibility id, etc.) **OR** AI-powered natural language descriptions (e.g., "yellow search button at bottom"). Supports both traditional and AI modes. |
267
317
  | `appium_tap_by_coordinates` | Tap at specific screen coordinates (x, y). On iOS, coordinates are in points. On Android, coordinates are in device pixels. Use `appium_get_page_source` for accurate coordinates. |
268
318
  | `appium_click` | Click on an element |
269
319
  | `appium_double_tap` | Perform double tap on an element |
@@ -290,6 +340,7 @@ The default regex pattern allows any URL that starts with `http://` or `https://
290
340
  | `appium_get_geolocation` | Get the current GPS coordinates (latitude, longitude, altitude) of the device. |
291
341
  | `appium_reset_geolocation` | Reset the simulated/mocked geolocation back to the system default. On iOS, clears the simulated location. On Android real devices, removes the mock location provider. Not supported on Android emulators. |
292
342
  | `appium_mobile_get_device_info` | Get device information (model, OS version, locale, timezone, screen density, etc.). On iOS real devices, includes detailed lockdown info (hardware model, product type, CPU architecture, etc.). |
343
+ | `appium_mobile_get_battery_info` | Get the current battery level (as a percentage) and charging state of the device. Works on both iOS and Android. |
293
344
 
294
345
  ### App Management
295
346
 
@@ -326,6 +377,43 @@ Open Amazon mobile app, search for "iPhone 15 Pro", select the first search resu
326
377
 
327
378
  This example demonstrates a complete e-commerce checkout flow that can be automated using MCP Appium's intelligent locator generation and test creation capabilities.
328
379
 
380
+ ### AI-Powered Element Finding Examples
381
+
382
+ **Traditional Mode (XPath/ID):**
383
+ ```json
384
+ {
385
+ "tool": "appium_find_element",
386
+ "arguments": {
387
+ "strategy": "xpath",
388
+ "selector": "//android.widget.Button[@text='Search']"
389
+ }
390
+ }
391
+ ```
392
+
393
+ **AI Mode (Natural Language):**
394
+ ```json
395
+ {
396
+ "tool": "appium_find_element",
397
+ "arguments": {
398
+ "strategy": "ai_instruction",
399
+ "ai_instruction": "yellow search button at the bottom of the screen"
400
+ }
401
+ }
402
+ ```
403
+
404
+ **More AI Mode Examples:**
405
+ - `"username input field at top"`
406
+ - `"settings icon in top-right corner"`
407
+ - `"red delete button next to the item"`
408
+ - `"blue submit button at bottom"`
409
+ - `"profile picture in navigation bar"`
410
+
411
+ **Benefits of AI Mode:**
412
+ - **No Complex Selectors**: Describe elements in plain language
413
+ - **Resilient to UI Changes**: Semantic understanding adapts to layout changes
414
+ - **Faster Development**: No need to inspect element hierarchies
415
+ - **Works Across Languages**: Describe elements in any language you're comfortable with
416
+
329
417
  ### Working in Your Native Language
330
418
 
331
419
  **MCP Appium works seamlessly in any language** - you don't need to know English! The AI assistant understands and responds in your native language. Simply describe what you want to do in your preferred language:
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Type definitions for AI Vision Finder
3
+ */
4
+ /**
5
+ * AI Vision configuration interface
6
+ */
7
+ export interface AIVisionConfig {
8
+ model: string;
9
+ apiBaseUrl: string;
10
+ apiToken: string;
11
+ coordType: 'normalized' | 'absolute';
12
+ imageMaxWidth: number;
13
+ imageQuality: number;
14
+ }
15
+ /**
16
+ * Bounding box type: [x1, y1, x2, y2]
17
+ * - x1, y1: top-left corner coordinates
18
+ * - x2, y2: bottom-right corner coordinates
19
+ */
20
+ export type BBox = [x1: number, y1: number, x2: number, y2: number];
21
+ /**
22
+ * Bounding box coordinates interface
23
+ * Matches the format returned by vision models
24
+ */
25
+ export interface BBoxCoordinates {
26
+ target: string;
27
+ bbox_2d: BBox;
28
+ }
29
+ /**
30
+ * AI element finding result interface
31
+ */
32
+ export interface AIFindResult {
33
+ bbox: BBox;
34
+ center: {
35
+ x: number;
36
+ y: number;
37
+ };
38
+ target: string;
39
+ annotatedImagePath?: string;
40
+ }
41
+ //# sourceMappingURL=types.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../src/ai-finder/types.ts"],"names":[],"mappings":"AAAA;;GAEG;AAEH;;GAEG;AACH,MAAM,WAAW,cAAc;IAC7B,KAAK,EAAE,MAAM,CAAC;IACd,UAAU,EAAE,MAAM,CAAC;IACnB,QAAQ,EAAE,MAAM,CAAC;IACjB,SAAS,EAAE,YAAY,GAAG,UAAU,CAAC;IACrC,aAAa,EAAE,MAAM,CAAC;IACtB,YAAY,EAAE,MAAM,CAAC;CACtB;AAED;;;;GAIG;AACH,MAAM,MAAM,IAAI,GAAG,CAAC,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,EAAE,EAAE,MAAM,CAAC,CAAC;AAEpE;;;GAGG;AACH,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,IAAI,CAAC;CACf;AAED;;GAEG;AACH,MAAM,WAAW,YAAY;IAC3B,IAAI,EAAE,IAAI,CAAC;IACX,MAAM,EAAE;QAAE,CAAC,EAAE,MAAM,CAAC;QAAC,CAAC,EAAE,MAAM,CAAA;KAAE,CAAC;IACjC,MAAM,EAAE,MAAM,CAAC;IACf,kBAAkB,CAAC,EAAE,MAAM,CAAC;CAC7B"}
@@ -0,0 +1,5 @@
1
+ /**
2
+ * Type definitions for AI Vision Finder
3
+ */
4
+ export {};
5
+ //# sourceMappingURL=types.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"types.js","sourceRoot":"","sources":["../../src/ai-finder/types.ts"],"names":[],"mappings":"AAAA;;GAEG"}
@@ -0,0 +1,101 @@
1
+ /**
2
+ * AI Vision Finder Module
3
+ *
4
+ * Core module for AI-powered element finding using vision models.
5
+ * Implementation aligns with benchmark_model.ts standards.
6
+ */
7
+ import type { AIFindResult } from './types.js';
8
+ /**
9
+ * AI Vision Finder class
10
+ * Based on benchmark results: Qwen3-VL-235B-A22B-Instruct (100% accuracy, 8417ms)
11
+ */
12
+ export declare class AIVisionFinder {
13
+ private config;
14
+ private readonly cache;
15
+ private readonly CACHE_TTL_MS;
16
+ constructor();
17
+ /**
18
+ * Find element using AI vision model
19
+ * @param screenshotBase64 - Base64 encoded screenshot
20
+ * @param instruction - Natural language instruction
21
+ * @param imageWidth - Original image width
22
+ * @param imageHeight - Original image height
23
+ * @returns AI find result with bbox and center coordinates
24
+ */
25
+ findElement(screenshotBase64: string, instruction: string, imageWidth: number, imageHeight: number): Promise<AIFindResult>;
26
+ /**
27
+ * Compress image using @appium/support sharp utilities
28
+ * Reduces API latency and token consumption
29
+ *
30
+ * Returns both the base64-encoded image and its MIME type so that the caller
31
+ * can construct a correct data URL. On compression failure the original bytes
32
+ * are returned with mimeType 'image/png' (Appium screenshots are always PNG).
33
+ *
34
+ * **Resizing policy**: Resizing is intentionally skipped when
35
+ * `coordType === 'absolute'`. In absolute mode the vision model returns pixel
36
+ * coordinates relative to the image it received. If the image were resized,
37
+ * those coordinates would map to the compressed dimensions rather than the
38
+ * original screen dimensions, causing incorrect tap positions. Only JPEG
39
+ * quality compression is applied in that case.
40
+ */
41
+ private compressImage;
42
+ /**
43
+ * Build prompt for vision model
44
+ * Matches benchmark_model.ts prompt format for consistency
45
+ */
46
+ private buildPrompt;
47
+ /**
48
+ * Call vision model API
49
+ * Matches benchmark_model.ts implementation
50
+ */
51
+ private callVisionAPI;
52
+ /**
53
+ * Parse bbox coordinates from model response
54
+ * Matches benchmark_model.ts parsing logic
55
+ */
56
+ private parseBBox;
57
+ /**
58
+ * Convert coordinates based on model's coordinate type
59
+ * Matches benchmark_model.ts coordinate conversion logic
60
+ *
61
+ * Coordinate type modes:
62
+ * - **normalized** (default, AI_VISION_COORD_TYPE=normalized):
63
+ * The vision model returns coordinates in the range 0–1000, where
64
+ * (0,0) is the top-left corner and (1000,1000) is the bottom-right corner.
65
+ * This method scales them to absolute pixel coordinates using the original
66
+ * image dimensions. This mode is independent of image compression.
67
+ *
68
+ * - **absolute** (AI_VISION_COORD_TYPE=absolute):
69
+ * The vision model returns pixel coordinates directly based on the image
70
+ * it received (quality-compressed, but never resized in this mode). Coordinates are used
71
+ * as-is and are NOT automatically scaled back to the original resolution.
72
+ * Use this mode only if the model explicitly outputs absolute pixel values.
73
+ */
74
+ private convertCoordinates;
75
+ /**
76
+ * Draw bounding box on image and save to file
77
+ * Based on benchmark_model.ts drawBBoxOnImage implementation
78
+ * @param screenshotBase64 - Base64 encoded screenshot
79
+ * @param bbox - Bounding box coordinates [x1, y1, x2, y2]
80
+ * @param imageWidth - Image width
81
+ * @param imageHeight - Image height
82
+ * @param targetName - Target element name for label
83
+ * @returns Absolute path to the annotated image file
84
+ */
85
+ private drawBBoxOnImage;
86
+ /**
87
+ * Generate cache key from instruction and image
88
+ */
89
+ private generateCacheKey;
90
+ /**
91
+ * Get result from cache if valid
92
+ * TTL expiry and LRU eviction are handled automatically by LRUCache
93
+ */
94
+ private getFromCache;
95
+ /**
96
+ * Save result to cache
97
+ * TTL expiry and LRU eviction are handled automatically by LRUCache
98
+ */
99
+ private saveToCache;
100
+ }
101
+ //# sourceMappingURL=vision-finder.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"vision-finder.d.ts","sourceRoot":"","sources":["../../src/ai-finder/vision-finder.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AASH,OAAO,KAAK,EAIV,YAAY,EACb,MAAM,YAAY,CAAC;AAGpB;;;GAGG;AACH,qBAAa,cAAc;IACzB,OAAO,CAAC,MAAM,CAAiB;IAC/B,OAAO,CAAC,QAAQ,CAAC,KAAK,CAAiC;IACvD,OAAO,CAAC,QAAQ,CAAC,YAAY,CAAiB;;IAyC9C;;;;;;;OAOG;IACG,WAAW,CACf,gBAAgB,EAAE,MAAM,EACxB,WAAW,EAAE,MAAM,EACnB,UAAU,EAAE,MAAM,EAClB,WAAW,EAAE,MAAM,GAClB,OAAO,CAAC,YAAY,CAAC;IAkFxB;;;;;;;;;;;;;;OAcG;YACW,aAAa;IA8D3B;;;OAGG;IACH,OAAO,CAAC,WAAW;IAmDnB;;;OAGG;YACW,aAAa;IAyD3B;;;OAGG;IACH,OAAO,CAAC,SAAS;IAyCjB;;;;;;;;;;;;;;;;OAgBG;IACH,OAAO,CAAC,kBAAkB;IA+C1B;;;;;;;;;OASG;YACW,eAAe;IA+D7B;;OAEG;IACH,OAAO,CAAC,gBAAgB;IASxB;;;OAGG;IACH,OAAO,CAAC,YAAY;IAIpB;;;OAGG;IACH,OAAO,CAAC,WAAW;CAGpB"}