native-devtools-mcp 0.1.7 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/cli.js +12 -5
- package/package.json +5 -3
- package/README.md +0 -297
package/bin/cli.js
CHANGED
|
@@ -6,6 +6,7 @@ const fs = require("fs");
|
|
|
6
6
|
|
|
7
7
|
const PLATFORMS = {
|
|
8
8
|
"darwin-arm64": "@sh3ll3x3c/native-devtools-mcp-darwin-arm64",
|
|
9
|
+
"win32-x64": "@sh3ll3x3c/native-devtools-mcp-win32-x64",
|
|
9
10
|
};
|
|
10
11
|
|
|
11
12
|
function getPlatformPackage() {
|
|
@@ -16,7 +17,9 @@ function getPlatformPackage() {
|
|
|
16
17
|
const pkg = PLATFORMS[key];
|
|
17
18
|
if (!pkg) {
|
|
18
19
|
console.error(`Unsupported platform: ${platform}-${arch}`);
|
|
19
|
-
console.error(
|
|
20
|
+
console.error(
|
|
21
|
+
"native-devtools-mcp supports: darwin-arm64 (Apple Silicon), win32-x64 (Windows x64)"
|
|
22
|
+
);
|
|
20
23
|
process.exit(1);
|
|
21
24
|
}
|
|
22
25
|
|
|
@@ -29,16 +32,20 @@ function findBinary() {
|
|
|
29
32
|
const platformDir = `${platform}-${arch}`;
|
|
30
33
|
const pkg = getPlatformPackage();
|
|
31
34
|
|
|
35
|
+
// Binary name differs by platform
|
|
36
|
+
const binaryName =
|
|
37
|
+
platform === "win32" ? "native-devtools-mcp.exe" : "native-devtools-mcp";
|
|
38
|
+
|
|
32
39
|
// Try to find the platform-specific package
|
|
33
40
|
const possiblePaths = [
|
|
34
41
|
// Local development (binary in sibling directory)
|
|
35
|
-
path.join(__dirname, "..", platformDir, "bin", "native-devtools-mcp"),
|
|
42
|
+
path.join(__dirname, "..", platformDir, "bin", binaryName),
|
|
36
43
|
// When installed as a dependency
|
|
37
|
-
path.join(__dirname, "..", "node_modules", pkg, "bin", "native-devtools-mcp"),
|
|
44
|
+
path.join(__dirname, "..", "node_modules", pkg, "bin", binaryName),
|
|
38
45
|
// When installed globally or via npx
|
|
39
|
-
path.join(__dirname, "..", "..", pkg, "bin", "native-devtools-mcp"),
|
|
46
|
+
path.join(__dirname, "..", "..", pkg, "bin", binaryName),
|
|
40
47
|
// Hoisted in node_modules
|
|
41
|
-
path.join(__dirname, "..", "..", "..", pkg, "bin", "native-devtools-mcp"),
|
|
48
|
+
path.join(__dirname, "..", "..", "..", pkg, "bin", binaryName),
|
|
42
49
|
];
|
|
43
50
|
|
|
44
51
|
for (const binPath of possiblePaths) {
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "native-devtools-mcp",
|
|
3
|
-
"version": "0.1.7",
|
|
3
|
+
"version": "0.2.1",
|
|
4
4
|
"description": "MCP server for testing native desktop applications",
|
|
5
5
|
"license": "MIT",
|
|
6
6
|
"repository": {
|
|
@@ -15,7 +15,8 @@
|
|
|
15
15
|
"desktop",
|
|
16
16
|
"testing",
|
|
17
17
|
"automation",
|
|
18
|
-
"macos"
|
|
18
|
+
"macos",
|
|
19
|
+
"windows"
|
|
19
20
|
],
|
|
20
21
|
"bin": {
|
|
21
22
|
"native-devtools-mcp": "bin/cli.js"
|
|
@@ -24,7 +25,8 @@
|
|
|
24
25
|
"bin"
|
|
25
26
|
],
|
|
26
27
|
"optionalDependencies": {
|
|
27
|
-
"@sh3ll3x3c/native-devtools-mcp-darwin-arm64": "0.1.7"
|
|
28
|
+
"@sh3ll3x3c/native-devtools-mcp-darwin-arm64": "0.2.1",
|
|
29
|
+
"@sh3ll3x3c/native-devtools-mcp-win32-x64": "0.2.1"
|
|
28
30
|
},
|
|
29
31
|
"engines": {
|
|
30
32
|
"node": ">=18"
|
package/README.md
DELETED
|
@@ -1,297 +0,0 @@
|
|
|
1
|
-
# native-devtools-mcp
|
|
2
|
-
|
|
3
|
-
A Model Context Protocol (MCP) server for testing native desktop applications, similar to how Chrome DevTools enables web UI testing.
|
|
4
|
-
|
|
5
|
-
> **100% Local & Private** - All processing happens on your machine. No data is sent to external servers. Screenshots, UI interactions, and app data never leave your device.
|
|
6
|
-
|
|
7
|
-
## Platform Support
|
|
8
|
-
|
|
9
|
-
| Platform | Status |
|
|
10
|
-
|----------|--------|
|
|
11
|
-
| **macOS** | Supported |
|
|
12
|
-
| **Windows** | Planned |
|
|
13
|
-
| **Linux** | Planned |
|
|
14
|
-
|
|
15
|
-
> Windows and Linux support will be added in future releases with platform-specific backends.
|
|
16
|
-
|
|
17
|
-
## Overview
|
|
18
|
-
|
|
19
|
-
This MCP server enables LLM-driven testing of native desktop apps by providing:
|
|
20
|
-
- **Screenshots** - Capture full screen, windows, or regions
|
|
21
|
-
- **Input simulation** - Click, type, scroll, drag via platform-native events
|
|
22
|
-
- **Window/app enumeration** - List and focus windows and applications
|
|
23
|
-
|
|
24
|
-
## Two Approaches to UI Interaction
|
|
25
|
-
|
|
26
|
-
This server provides **two distinct approaches** for interacting with application UIs. They are kept separate intentionally to give the LLM full control over which approach to use based on context.
|
|
27
|
-
|
|
28
|
-
### 1. AppDebugKit (`app_*` tools) - Element-Level Precision
|
|
29
|
-
|
|
30
|
-
For applications that embed [AppDebugKit](./AppDebugKit/), you get:
|
|
31
|
-
- **Element targeting by ID** - Click buttons, fill text fields by element reference
|
|
32
|
-
- **CSS-like selectors** - Query elements with `#id`, `.ClassName`, `[title=Save]`
|
|
33
|
-
- **View hierarchy inspection** - Traverse the UI tree programmatically
|
|
34
|
-
- **Framework-aware** - Works with AppKit and SwiftUI controls
|
|
35
|
-
|
|
36
|
-
**Best for:** Apps you control, AppKit/SwiftUI apps, when you need reliable element targeting.
|
|
37
|
-
|
|
38
|
-
```
|
|
39
|
-
app_connect → app_get_tree → app_query(".NSButton") → app_click(element_id)
|
|
40
|
-
```
|
|
41
|
-
|
|
42
|
-
### 2. CGEvent (`click`, `type_text`, etc.) - Universal Compatibility
|
|
43
|
-
|
|
44
|
-
For **any application** regardless of framework:
|
|
45
|
-
- **Screen coordinate targeting** - Click at (x, y) positions
|
|
46
|
-
- **Works with any UI framework** - egui, Electron, Qt, games, anything
|
|
47
|
-
- **No app modification required** - Just needs Accessibility permission
|
|
48
|
-
|
|
49
|
-
**Best for:** Third-party apps, egui/Electron/Qt apps, when AppDebugKit isn't available.
|
|
50
|
-
|
|
51
|
-
```
|
|
52
|
-
take_screenshot → (analyze visually) → click(x=500, y=300)
|
|
53
|
-
```
|
|
54
|
-
|
|
55
|
-
### Why Two Approaches?
|
|
56
|
-
|
|
57
|
-
The LLM needs to choose the right approach based on what it observes:
|
|
58
|
-
|
|
59
|
-
| Scenario | Recommended Approach |
|
|
60
|
-
|----------|---------------------|
|
|
61
|
-
| App with AppDebugKit embedded | `app_*` tools - reliable element IDs |
|
|
62
|
-
| egui/Electron/Qt app | CGEvent tools - coordinate-based |
|
|
63
|
-
| Unknown app | Try `app_connect`, fall back to CGEvent |
|
|
64
|
-
| App with poor view hierarchy | CGEvent even if AppDebugKit connected |
|
|
65
|
-
|
|
66
|
-
Merging them into auto-fallback would hide important context from the LLM and reduce its ability to make informed decisions.
|
|
67
|
-
|
|
68
|
-
## Installation
|
|
69
|
-
|
|
70
|
-
### Option 1: npm (Recommended)
|
|
71
|
-
|
|
72
|
-
```bash
|
|
73
|
-
# Install globally
|
|
74
|
-
npm install -g native-devtools-mcp
|
|
75
|
-
|
|
76
|
-
# Or run directly with npx
|
|
77
|
-
npx native-devtools-mcp
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
### Option 2: Build from source
|
|
81
|
-
|
|
82
|
-
```bash
|
|
83
|
-
# Clone the repository
|
|
84
|
-
git clone https://github.com/sh3ll3x3c/native-devtools-mcp
|
|
85
|
-
cd native-devtools-mcp
|
|
86
|
-
|
|
87
|
-
# Build
|
|
88
|
-
cargo build --release
|
|
89
|
-
|
|
90
|
-
# Binary location
|
|
91
|
-
./target/release/native-devtools-mcp
|
|
92
|
-
```
|
|
93
|
-
|
|
94
|
-
## Required Permissions (macOS)
|
|
95
|
-
|
|
96
|
-
This MCP server requires macOS privacy permissions to capture screenshots and simulate input. **These permissions are required for the tools to function.**
|
|
97
|
-
|
|
98
|
-
### Step-by-Step Setup
|
|
99
|
-
|
|
100
|
-
#### 1. Screen Recording Permission (required for screenshots)
|
|
101
|
-
|
|
102
|
-
1. Open **System Settings** → **Privacy & Security** → **Screen Recording**
|
|
103
|
-
2. Click the **+** button (you may need to unlock with your password)
|
|
104
|
-
3. Add the app that runs Claude Code:
|
|
105
|
-
- **VS Code**: `/Applications/Visual Studio Code.app`
|
|
106
|
-
- **Terminal**: `/Applications/Utilities/Terminal.app`
|
|
107
|
-
- **iTerm**: `/Applications/iTerm.app`
|
|
108
|
-
4. **Quit and restart the app completely** (not just reload)
|
|
109
|
-
|
|
110
|
-
#### 2. Accessibility Permission (required for click, type, scroll)
|
|
111
|
-
|
|
112
|
-
1. Open **System Settings** → **Privacy & Security** → **Accessibility**
|
|
113
|
-
2. Click the **+** button
|
|
114
|
-
3. Add the same app as above (VS Code, Terminal, etc.)
|
|
115
|
-
4. **Quit and restart the app completely**
|
|
116
|
-
|
|
117
|
-
#### 3. Tesseract OCR (required for `find_text`, and for `take_screenshot` OCR)
|
|
118
|
-
|
|
119
|
-
The `find_text` tool uses OCR to locate text on screen and return clickable coordinates. The `take_screenshot` tool also runs OCR by default (`include_ocr: true`) to return text annotations. This is the **recommended way** to interact with apps when AppDebugKit is not available.
|
|
120
|
-
|
|
121
|
-
```bash
|
|
122
|
-
brew install tesseract
|
|
123
|
-
```
|
|
124
|
-
|
|
125
|
-
### Important Notes
|
|
126
|
-
|
|
127
|
-
- **Grant permissions to the host app** (VS Code, Terminal), not to the MCP server binary itself
|
|
128
|
-
- **Restart is required** - Permissions don't take effect until you fully quit and reopen the app
|
|
129
|
-
- **No popup appears** - macOS won't prompt you; it silently fails if permissions are missing
|
|
130
|
-
- If you see `could not create image from display`, you need Screen Recording permission
|
|
131
|
-
- If clicks don't work, you need Accessibility permission
|
|
132
|
-
|
|
133
|
-
### Privacy & Security
|
|
134
|
-
|
|
135
|
-
All data stays on your machine:
|
|
136
|
-
- Screenshots are captured locally and sent directly to Claude via the MCP protocol
|
|
137
|
-
- No data is uploaded to external servers
|
|
138
|
-
- The MCP server runs entirely offline
|
|
139
|
-
- Source code is open for audit
|
|
140
|
-
|
|
141
|
-
## MCP Configuration
|
|
142
|
-
|
|
143
|
-
Add to your Claude Code MCP config (`~/.claude/claude_desktop_config.json`):
|
|
144
|
-
|
|
145
|
-
```json
|
|
146
|
-
{
|
|
147
|
-
"mcpServers": {
|
|
148
|
-
"native-devtools": {
|
|
149
|
-
"command": "npx",
|
|
150
|
-
"args": ["-y", "native-devtools-mcp"]
|
|
151
|
-
}
|
|
152
|
-
}
|
|
153
|
-
}
|
|
154
|
-
```
|
|
155
|
-
|
|
156
|
-
Or if you built from source:
|
|
157
|
-
|
|
158
|
-
```json
|
|
159
|
-
{
|
|
160
|
-
"mcpServers": {
|
|
161
|
-
"native-devtools": {
|
|
162
|
-
"command": "/path/to/native-devtools-mcp"
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
}
|
|
166
|
-
```
|
|
167
|
-
|
|
168
|
-
## Tools
|
|
169
|
-
|
|
170
|
-
### System Tools (work with any app)
|
|
171
|
-
|
|
172
|
-
| Tool | Description |
|
|
173
|
-
|------|-------------|
|
|
174
|
-
| `take_screenshot` | Capture screen, window, or region (base64 PNG). Includes OCR text annotations by default (`include_ocr: true`). |
|
|
175
|
-
| `list_windows` | List visible windows with IDs, titles, bounds |
|
|
176
|
-
| `list_apps` | List running applications |
|
|
177
|
-
| `focus_window` | Bring window/app to front |
|
|
178
|
-
| `get_displays` | Get display info (bounds, scale factors) for coordinate conversion |
|
|
179
|
-
| `find_text` | Find text on screen using OCR; returns screen coordinates for clicking |
|
|
180
|
-
|
|
181
|
-
### CGEvent Input Tools (work with any app, require Accessibility permission)
|
|
182
|
-
|
|
183
|
-
| Tool | Description |
|
|
184
|
-
|------|-------------|
|
|
185
|
-
| `click` | Click at screen/window/screenshot coordinates |
|
|
186
|
-
| `type_text` | Type text at cursor position |
|
|
187
|
-
| `press_key` | Press key combo (e.g., "return", modifiers: ["command"]) |
|
|
188
|
-
| `scroll` | Scroll at position |
|
|
189
|
-
| `drag` | Drag from point to point |
|
|
190
|
-
| `move_mouse` | Move cursor to position |
|
|
191
|
-
|
|
192
|
-
### AppDebugKit Tools (require app to embed AppDebugKit)
|
|
193
|
-
|
|
194
|
-
| Tool | Description |
|
|
195
|
-
|------|-------------|
|
|
196
|
-
| `app_connect` | Connect to app's debug server (ws://127.0.0.1:9222). Supports `expected_bundle_id` and `expected_app_name` validation. |
|
|
197
|
-
| `app_disconnect` | Disconnect from app |
|
|
198
|
-
| `app_get_info` | Get app metadata (name, bundle ID, version) |
|
|
199
|
-
| `app_get_tree` | Get view hierarchy |
|
|
200
|
-
| `app_query` | Find elements by CSS-like selector |
|
|
201
|
-
| `app_get_element` | Get element details by ID |
|
|
202
|
-
| `app_click` | Click element by ID |
|
|
203
|
-
| `app_type` | Type text into element |
|
|
204
|
-
| `app_press_key` | Press key in app context |
|
|
205
|
-
| `app_focus` | Focus element (make first responder) |
|
|
206
|
-
| `app_screenshot` | Screenshot element or window |
|
|
207
|
-
| `app_list_windows` | List app's windows |
|
|
208
|
-
| `app_focus_window` | Focus specific window |
|
|
209
|
-
|
|
210
|
-
Note: app_* tools (except `app_connect`) are only listed after a successful connection. The server emits a tools list change on connect/disconnect, so some clients may need to refresh/re-list tools to see the app_* set.
|
|
211
|
-
|
|
212
|
-
## How Screenshots and Clicking Work (macOS)
|
|
213
|
-
|
|
214
|
-
- **Screenshots** are captured via the system `screencapture` utility (`-x` silent, `-C` include cursor, `-R` region, `-l` window with `-o` to exclude shadow), written to a temp PNG, and returned as base64. The backing scale factor is tracked for coordinate conversion. Window screenshots exclude shadows so that pixel coordinates align exactly with `CGWindowBounds`, and OCR coordinates are automatically offset into screen space.
|
|
215
|
-
- **Clicks/inputs** use CoreGraphics CGEvent injection (HID event tap). This requires Accessibility permission and works across AppKit, SwiftUI, Electron, egui, etc. Window-relative or screenshot-pixel coordinates are converted to screen coordinates using window bounds and display scale.
|
|
216
|
-
|
|
217
|
-
## Coordinate Systems and Display Scaling
|
|
218
|
-
|
|
219
|
-
The `click` tool supports three coordinate input methods to handle macOS display scaling:
|
|
220
|
-
|
|
221
|
-
```json
|
|
222
|
-
// Direct screen coordinates
|
|
223
|
-
{ "x": 500, "y": 300 }
|
|
224
|
-
|
|
225
|
-
// Window-relative (converted using window bounds)
|
|
226
|
-
{ "window_x": 100, "window_y": 50, "window_id": 1234 }
|
|
227
|
-
|
|
228
|
-
// Screenshot pixels (converted using backing scale factor)
|
|
229
|
-
{ "screenshot_x": 200, "screenshot_y": 100, "screenshot_window_id": 1234 }
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
Use `get_displays` to understand the display configuration:
|
|
233
|
-
```json
|
|
234
|
-
{
|
|
235
|
-
"displays": [{
|
|
236
|
-
"id": 1,
|
|
237
|
-
"is_main": true,
|
|
238
|
-
"bounds": { "x": 0, "y": 0, "width": 3008, "height": 1692 },
|
|
239
|
-
"backing_scale_factor": 2.0,
|
|
240
|
-
"pixel_width": 6016,
|
|
241
|
-
"pixel_height": 3384
|
|
242
|
-
}]
|
|
243
|
-
}
|
|
244
|
-
```
|
|
245
|
-
|
|
246
|
-
## Example Usage
|
|
247
|
-
|
|
248
|
-
### With CGEvent (any app)
|
|
249
|
-
|
|
250
|
-
```
|
|
251
|
-
User: Click the Submit button in the app
|
|
252
|
-
|
|
253
|
-
Claude: [calls take_screenshot]
|
|
254
|
-
[analyzes screenshot, finds Submit button at approximately x=450, y=320]
|
|
255
|
-
[calls click with x=450, y=320]
|
|
256
|
-
```
|
|
257
|
-
|
|
258
|
-
### With AppDebugKit (embedded app)
|
|
259
|
-
|
|
260
|
-
```
|
|
261
|
-
User: Click the Submit button in the app
|
|
262
|
-
|
|
263
|
-
Claude: [calls app_connect with url="ws://127.0.0.1:9222"]
|
|
264
|
-
[calls app_query with selector="[title=Submit]"]
|
|
265
|
-
[receives element_id="view-42"]
|
|
266
|
-
[calls app_click with element_id="view-42"]
|
|
267
|
-
```
|
|
268
|
-
|
|
269
|
-
## Architecture
|
|
270
|
-
|
|
271
|
-
```
|
|
272
|
-
┌─────────────────┐ JSON-RPC 2.0 ┌──────────────────┐
|
|
273
|
-
│ Claude/Client │ ◄──────────────────► │ native-devtools │
|
|
274
|
-
│ (with vision) │ stdio │ MCP Server │
|
|
275
|
-
└─────────────────┘ └────────┬─────────┘
|
|
276
|
-
│
|
|
277
|
-
┌────────────────────────┼────────────────────────┐
|
|
278
|
-
│ │ │
|
|
279
|
-
▼ ▼ ▼
|
|
280
|
-
┌─────────────┐ ┌─────────────┐ ┌─────────────┐
|
|
281
|
-
│ AppDebugKit │ │ CGEvent │ │ System │
|
|
282
|
-
│ (WebSocket) │ │ Input │ │ APIs │
|
|
283
|
-
│ │ │ │ │ │
|
|
284
|
-
│ Element-level│ │ Coordinate │ │ Screenshots │
|
|
285
|
-
│ interaction │ │ based input │ │ Window enum │
|
|
286
|
-
└─────────────┘ └─────────────┘ └─────────────┘
|
|
287
|
-
│ │
|
|
288
|
-
▼ ▼
|
|
289
|
-
┌─────────────┐ ┌─────────────┐
|
|
290
|
-
│ Apps with │ │ Any app │
|
|
291
|
-
│ AppDebugKit │ │ (egui, etc) │
|
|
292
|
-
└─────────────┘ └─────────────┘
|
|
293
|
-
```
|
|
294
|
-
|
|
295
|
-
## License
|
|
296
|
-
|
|
297
|
-
MIT
|