screenhand 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/commands/automate.md +28 -0
- package/.claude/commands/debug-ui.md +19 -0
- package/.claude/commands/screenshot.md +15 -0
- package/.github/FUNDING.yml +1 -0
- package/.github/ISSUE_TEMPLATE/bug_report.md +27 -0
- package/.github/ISSUE_TEMPLATE/feature_request.md +20 -0
- package/.mcp.json +8 -0
- package/DESKTOP_MCP_GUIDE.md +92 -0
- package/LICENSE +661 -21
- package/README.md +97 -292
- package/SECURITY.md +44 -0
- package/docs/architecture.md +47 -0
- package/install-skills.sh +19 -0
- package/mcp-bridge.ts +271 -0
- package/mcp-desktop.ts +1221 -0
- package/native/macos-bridge/Package.swift +21 -0
- package/native/macos-bridge/Sources/AccessibilityBridge.swift +261 -0
- package/native/macos-bridge/Sources/AppManagement.swift +129 -0
- package/native/macos-bridge/Sources/CoreGraphicsBridge.swift +242 -0
- package/native/macos-bridge/Sources/ObserverBridge.swift +120 -0
- package/native/macos-bridge/Sources/VisionBridge.swift +80 -0
- package/native/macos-bridge/Sources/main.swift +345 -0
- package/native/windows-bridge/AppManagement.cs +234 -0
- package/native/windows-bridge/InputBridge.cs +436 -0
- package/native/windows-bridge/Program.cs +265 -0
- package/native/windows-bridge/ScreenCapture.cs +329 -0
- package/native/windows-bridge/UIAutomationBridge.cs +571 -0
- package/native/windows-bridge/WindowsBridge.csproj +17 -0
- package/package.json +3 -14
- package/playbooks/devpost.json +186 -0
- package/playbooks/instagram.json +41 -0
- package/playbooks/instagram_v2.json +201 -0
- package/playbooks/x_v1.json +211 -0
- package/scripts/devpost-live-loop.mjs +421 -0
- package/src/config.ts +30 -0
- package/src/index.ts +92 -0
- package/src/logging/timeline-logger.ts +55 -0
- package/src/mcp/server.ts +449 -0
- package/src/memory/recall.ts +191 -0
- package/src/memory/research.ts +146 -0
- package/src/memory/seeds.ts +123 -0
- package/src/memory/session.ts +201 -0
- package/src/memory/store.ts +434 -0
- package/src/memory/types.ts +69 -0
- package/src/native/bridge-client.ts +239 -0
- package/src/native/macos-bridge-client.ts +22 -0
- package/src/runtime/accessibility-adapter.ts +487 -0
- package/src/runtime/app-adapter.ts +169 -0
- package/src/runtime/applescript-adapter.ts +376 -0
- package/src/runtime/ax-role-map.ts +102 -0
- package/src/runtime/browser-adapter.ts +129 -0
- package/src/runtime/cdp-chrome-adapter.ts +676 -0
- package/src/runtime/composite-adapter.ts +274 -0
- package/src/runtime/executor.ts +396 -0
- package/src/runtime/locator-cache.ts +33 -0
- package/src/runtime/planning-loop.ts +81 -0
- package/src/runtime/service.ts +448 -0
- package/src/runtime/session-manager.ts +50 -0
- package/src/runtime/state-observer.ts +136 -0
- package/src/runtime/vision-adapter.ts +297 -0
- package/src/types.ts +297 -0
- package/tests/bridge-client.test.ts +176 -0
- package/tests/browser-stealth.test.ts +210 -0
- package/tests/composite-adapter.test.ts +64 -0
- package/tests/mcp-server.test.ts +151 -0
- package/tests/memory-recall.test.ts +339 -0
- package/tests/memory-research.test.ts +159 -0
- package/tests/memory-seeds.test.ts +120 -0
- package/tests/memory-store.test.ts +392 -0
- package/tests/types.test.ts +92 -0
- package/tsconfig.check.json +17 -0
- package/tsconfig.json +19 -0
- package/vitest.config.ts +8 -0
- package/dist/config.js +0 -9
- package/dist/index.js +0 -55
- package/dist/logging/timeline-logger.js +0 -29
- package/dist/mcp/mcp-stdio-server.js +0 -284
- package/dist/mcp/server.js +0 -347
- package/dist/mcp-entry.js +0 -62
- package/dist/memory/recall.js +0 -160
- package/dist/memory/research.js +0 -98
- package/dist/memory/seeds.js +0 -89
- package/dist/memory/session.js +0 -161
- package/dist/memory/store.js +0 -391
- package/dist/memory/types.js +0 -4
- package/dist/native/bridge-client.js +0 -173
- package/dist/native/macos-bridge-client.js +0 -5
- package/dist/runtime/accessibility-adapter.js +0 -377
- package/dist/runtime/app-adapter.js +0 -48
- package/dist/runtime/applescript-adapter.js +0 -283
- package/dist/runtime/ax-role-map.js +0 -80
- package/dist/runtime/browser-adapter.js +0 -36
- package/dist/runtime/cdp-chrome-adapter.js +0 -505
- package/dist/runtime/composite-adapter.js +0 -205
- package/dist/runtime/executor.js +0 -250
- package/dist/runtime/locator-cache.js +0 -12
- package/dist/runtime/planning-loop.js +0 -47
- package/dist/runtime/service.js +0 -372
- package/dist/runtime/session-manager.js +0 -28
- package/dist/runtime/state-observer.js +0 -105
- package/dist/runtime/vision-adapter.js +0 -208
- package/dist/test-mcp-protocol.js +0 -138
- package/dist/types.js +0 -1
package/README.md
CHANGED
|
@@ -4,41 +4,43 @@
|
|
|
4
4
|
|
|
5
5
|
**Give AI eyes and hands on your desktop.**
|
|
6
6
|
|
|
7
|
-
|
|
7
|
+
ScreenHand is an [MCP server](https://modelcontextprotocol.io/) that lets AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.
|
|
8
8
|
|
|
9
|
-
[](LICENSE)
|
|
10
10
|
[](https://www.npmjs.com/package/screenhand)
|
|
11
11
|
[]()
|
|
12
12
|
[]()
|
|
13
13
|
|
|
14
|
-
[Website](https://screenhand.com) | [Quick Start](#quick-start) | [
|
|
14
|
+
[Website](https://screenhand.com) | [Quick Start](#quick-start) | [Use Cases](#use-cases) | [FAQ](#faq)
|
|
15
15
|
|
|
16
16
|
</div>
|
|
17
17
|
|
|
18
18
|
---
|
|
19
19
|
|
|
20
|
-
##
|
|
20
|
+
## The Problem
|
|
21
21
|
|
|
22
|
-
|
|
22
|
+
AI assistants are powerful — but they're blind. They can't see what's on your screen, click a button, or type into an app. If you want Claude to help you automate a workflow, debug a UI, or fill out a form, you're stuck copy-pasting screenshots and describing what you see.
|
|
23
|
+
|
|
24
|
+
**ScreenHand fixes that.** It gives any AI agent direct access to your desktop through native OS APIs — not slow screenshot-and-guess loops.
|
|
25
|
+
|
|
26
|
+
## How It Works
|
|
27
|
+
|
|
28
|
+
You connect ScreenHand to your AI client (Claude, Cursor, Codex CLI, etc.) via the [Model Context Protocol](https://modelcontextprotocol.io/). Once connected, your AI can:
|
|
23
29
|
|
|
24
30
|
- **See** your screen via screenshots and OCR
|
|
25
|
-
- **Read** UI elements via Accessibility APIs
|
|
31
|
+
- **Read** UI elements directly via native Accessibility APIs
|
|
26
32
|
- **Click** buttons, menus, and links
|
|
27
33
|
- **Type** text into any input field
|
|
28
34
|
- **Control** Chrome tabs via DevTools Protocol
|
|
29
|
-
- **
|
|
30
|
-
|
|
31
|
-
It works as an [MCP (Model Context Protocol)](https://modelcontextprotocol.io/) server, meaning any MCP-compatible AI client can use it out of the box.
|
|
35
|
+
- **Automate** cross-app workflows
|
|
32
36
|
|
|
33
|
-
|
|
34
|
-
|
|
35
|
-
|
|
|
36
|
-
|
|
37
|
-
|
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
| AI can't automate workflows | 25+ tools for cross-app automation |
|
|
41
|
-
| Only works on one OS | Native bridges for both macOS and Windows |
|
|
37
|
+
```
|
|
38
|
+
Your AI Client (Claude, Cursor, etc.)
|
|
39
|
+
| MCP protocol (stdio)
|
|
40
|
+
ScreenHand
|
|
41
|
+
| Native OS APIs
|
|
42
|
+
Your Desktop (any app, any browser)
|
|
43
|
+
```
|
|
42
44
|
|
|
43
45
|
## Quick Start
|
|
44
46
|
|
|
@@ -50,9 +52,10 @@ npm run build:native # macOS — builds Swift bridge
|
|
|
50
52
|
# npm run build:native:windows # Windows — builds .NET bridge
|
|
51
53
|
```
|
|
52
54
|
|
|
53
|
-
|
|
55
|
+
### Connect to Your AI Client
|
|
54
56
|
|
|
55
|
-
|
|
57
|
+
<details>
|
|
58
|
+
<summary><strong>Claude Desktop</strong></summary>
|
|
56
59
|
|
|
57
60
|
Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
58
61
|
|
|
@@ -61,13 +64,15 @@ Add to `~/Library/Application Support/Claude/claude_desktop_config.json`:
|
|
|
61
64
|
"mcpServers": {
|
|
62
65
|
"screenhand": {
|
|
63
66
|
"command": "npx",
|
|
64
|
-
"args": ["tsx", "/path/to/screenhand/
|
|
67
|
+
"args": ["tsx", "/path/to/screenhand/mcp-desktop.ts"]
|
|
65
68
|
}
|
|
66
69
|
}
|
|
67
70
|
}
|
|
68
71
|
```
|
|
72
|
+
</details>
|
|
69
73
|
|
|
70
|
-
|
|
74
|
+
<details>
|
|
75
|
+
<summary><strong>Claude Code</strong></summary>
|
|
71
76
|
|
|
72
77
|
Add to your project `.mcp.json` or `~/.claude/settings.json`:
|
|
73
78
|
|
|
@@ -76,352 +81,152 @@ Add to your project `.mcp.json` or `~/.claude/settings.json`:
|
|
|
76
81
|
"mcpServers": {
|
|
77
82
|
"screenhand": {
|
|
78
83
|
"command": "npx",
|
|
79
|
-
"args": ["tsx", "/path/to/screenhand/
|
|
84
|
+
"args": ["tsx", "/path/to/screenhand/mcp-desktop.ts"]
|
|
80
85
|
}
|
|
81
86
|
}
|
|
82
87
|
}
|
|
83
88
|
```
|
|
89
|
+
</details>
|
|
84
90
|
|
|
85
|
-
|
|
91
|
+
<details>
|
|
92
|
+
<summary><strong>Cursor</strong></summary>
|
|
86
93
|
|
|
87
|
-
Add to `.cursor/mcp.json` in your project (or `~/.cursor/mcp.json`
|
|
94
|
+
Add to `.cursor/mcp.json` in your project (or `~/.cursor/mcp.json` globally):
|
|
88
95
|
|
|
89
96
|
```json
|
|
90
97
|
{
|
|
91
98
|
"mcpServers": {
|
|
92
99
|
"screenhand": {
|
|
93
100
|
"command": "npx",
|
|
94
|
-
"args": ["tsx", "/path/to/screenhand/
|
|
101
|
+
"args": ["tsx", "/path/to/screenhand/mcp-desktop.ts"]
|
|
95
102
|
}
|
|
96
103
|
}
|
|
97
104
|
}
|
|
98
105
|
```
|
|
106
|
+
</details>
|
|
99
107
|
|
|
100
|
-
|
|
108
|
+
<details>
|
|
109
|
+
<summary><strong>OpenAI Codex CLI</strong></summary>
|
|
101
110
|
|
|
102
111
|
Add to `~/.codex/config.toml`:
|
|
103
112
|
|
|
104
113
|
```toml
|
|
105
114
|
[mcp.screenhand]
|
|
106
115
|
command = "npx"
|
|
107
|
-
args = ["tsx", "/path/to/screenhand/
|
|
116
|
+
args = ["tsx", "/path/to/screenhand/mcp-desktop.ts"]
|
|
108
117
|
transport = "stdio"
|
|
109
118
|
```
|
|
119
|
+
</details>
|
|
110
120
|
|
|
111
|
-
|
|
121
|
+
<details>
|
|
122
|
+
<summary><strong>Any MCP Client</strong></summary>
|
|
112
123
|
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
```json
|
|
116
|
-
{
|
|
117
|
-
"mcpServers": {
|
|
118
|
-
"screenhand": {
|
|
119
|
-
"command": "npx",
|
|
120
|
-
"args": ["tsx", "/path/to/screenhand/src/mcp-entry.ts"]
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
```
|
|
125
|
-
|
|
126
|
-
> **Why?** OpenClaw's built-in desktop control sends a screenshot to an LLM for every click (~3-5s, costs an API call). ScreenHand uses native Accessibility APIs — `press('Send')` runs in ~50ms with zero AI calls. See the full [integration guide](docs/openclaw-integration.md).
|
|
127
|
-
|
|
128
|
-
### Any MCP Client
|
|
129
|
-
|
|
130
|
-
ScreenHand is a standard MCP server over stdio. It works with any MCP-compatible client — just point it at `src/mcp-entry.ts`.
|
|
124
|
+
ScreenHand is a standard MCP server over stdio. Point any MCP-compatible client at `mcp-desktop.ts`.
|
|
125
|
+
</details>
|
|
131
126
|
|
|
132
127
|
Replace `/path/to/screenhand` with the actual path where you cloned the repo.
|
|
133
128
|
|
|
134
|
-
## Tools
|
|
135
|
-
|
|
136
|
-
ScreenHand exposes 25+ tools organized by category.
|
|
137
|
-
|
|
138
|
-
### See the Screen
|
|
139
|
-
|
|
140
|
-
| Tool | What it does | Speed |
|
|
141
|
-
|------|-------------|-------|
|
|
142
|
-
| `screenshot` | Full screenshot + OCR — returns all visible text | ~600ms |
|
|
143
|
-
| `screenshot_file` | Screenshot saved to file (for viewing the image) | ~400ms |
|
|
144
|
-
| `ocr` | OCR with element positions and bounding boxes | ~600ms |
|
|
145
|
-
|
|
146
|
-
### Control Any App (Accessibility / UI Automation)
|
|
147
|
-
|
|
148
|
-
| Tool | What it does | Speed |
|
|
149
|
-
|------|-------------|-------|
|
|
150
|
-
| `apps` | List running apps with bundle IDs and PIDs | ~10ms |
|
|
151
|
-
| `windows` | List visible windows with positions and sizes | ~10ms |
|
|
152
|
-
| `focus` | Bring an app to the front | ~10ms |
|
|
153
|
-
| `launch` | Launch an app by bundle ID or name | ~1s |
|
|
154
|
-
| `ui_tree` | Full UI element tree — instant, no OCR needed | ~50ms |
|
|
155
|
-
| `ui_find` | Find a UI element by text or title | ~50ms |
|
|
156
|
-
| `ui_press` | Click a UI element by its title | ~50ms |
|
|
157
|
-
| `ui_set_value` | Set value of a text field, slider, etc. | ~50ms |
|
|
158
|
-
| `menu_click` | Click a menu bar item by path | ~100ms |
|
|
159
|
-
|
|
160
|
-
### Keyboard and Mouse
|
|
161
|
-
|
|
162
|
-
| Tool | What it does |
|
|
163
|
-
|------|-------------|
|
|
164
|
-
| `click` | Click at screen coordinates |
|
|
165
|
-
| `click_text` | Find text via OCR and click it (fallback) |
|
|
166
|
-
| `type_text` | Type text via keyboard |
|
|
167
|
-
| `key` | Key combo (e.g. `cmd+s`, `ctrl+shift+n`) |
|
|
168
|
-
| `drag` | Drag from point A to B |
|
|
169
|
-
| `scroll` | Scroll at a position |
|
|
170
|
-
|
|
171
|
-
### Chrome Browser (CDP)
|
|
172
|
-
|
|
173
|
-
| Tool | What it does |
|
|
174
|
-
|------|-------------|
|
|
175
|
-
| `browser_tabs` | List all open Chrome tabs |
|
|
176
|
-
| `browser_open` | Open URL in new tab |
|
|
177
|
-
| `browser_navigate` | Navigate active tab to URL |
|
|
178
|
-
| `browser_js` | Run JavaScript in a tab |
|
|
179
|
-
| `browser_dom` | Query DOM with CSS selectors |
|
|
180
|
-
| `browser_click` | Click element by CSS selector (uses CDP mouse events) |
|
|
181
|
-
| `browser_type` | Type into an input field (uses CDP keyboard events, React-compatible) |
|
|
182
|
-
| `browser_wait` | Wait for a page condition |
|
|
183
|
-
| `browser_page_info` | Get page title, URL, and content |
|
|
184
|
-
|
|
185
|
-
### Anti-Detection & Stealth (CDP)
|
|
186
|
-
|
|
187
|
-
Tools for interacting with sites that have bot detection (Instagram, LinkedIn, etc.):
|
|
188
|
-
|
|
189
|
-
| Tool | What it does |
|
|
190
|
-
|------|-------------|
|
|
191
|
-
| `browser_stealth` | Inject anti-detection patches (hides webdriver flag, fakes plugins/languages) |
|
|
192
|
-
| `browser_fill_form` | Human-like typing with random delays via CDP keyboard events |
|
|
193
|
-
| `browser_human_click` | Realistic mouse event sequence (mouseMoved → mousePressed → mouseReleased) |
|
|
194
|
-
|
|
195
|
-
> **Tip:** Call `browser_stealth` once after navigating to a protected site. Then use `browser_fill_form` and `browser_human_click` for interactions. The regular `browser_type` and `browser_click` also use CDP Input events now.
|
|
196
|
-
|
|
197
|
-
### Platform Playbooks (lazy-loaded)
|
|
198
|
-
|
|
199
|
-
Pre-built automation knowledge for specific platforms — selectors, URLs, flows, and **error solutions**.
|
|
200
|
-
|
|
201
|
-
| Tool | What it does |
|
|
202
|
-
|------|-------------|
|
|
203
|
-
| `platform_guide` | Get automation guide for a platform (selectors, URLs, flows, errors+solutions) |
|
|
204
|
-
| `export_playbook` | Auto-generate a playbook from your session. Share it to help others. |
|
|
205
|
-
|
|
206
|
-
```
|
|
207
|
-
platform_guide({ platform: "devpost", section: "errors" }) # Just errors + solutions
|
|
208
|
-
platform_guide({ platform: "devpost", section: "selectors" }) # All CSS selectors
|
|
209
|
-
platform_guide({ platform: "devpost", section: "flows" }) # Step-by-step workflows
|
|
210
|
-
platform_guide({ platform: "devpost" }) # Full playbook
|
|
211
|
-
```
|
|
212
|
-
|
|
213
|
-
**Contributing playbooks:** After automating any site, run:
|
|
214
|
-
```
|
|
215
|
-
export_playbook({ platform: "twitter", domain: "twitter.com" })
|
|
216
|
-
```
|
|
217
|
-
This auto-extracts URLs, selectors, errors+solutions from your session and saves a ready-to-share `playbooks/twitter.json`.
|
|
218
|
-
|
|
219
|
-
Available platforms: `devpost`. Add more by running `export_playbook` or creating JSON files in `playbooks/`.
|
|
220
|
-
|
|
221
|
-
Zero performance cost — files only read when `platform_guide` is called.
|
|
222
|
-
|
|
223
|
-
### AppleScript (macOS only)
|
|
224
|
-
|
|
225
|
-
| Tool | What it does |
|
|
226
|
-
|------|-------------|
|
|
227
|
-
| `applescript` | Run any AppleScript command |
|
|
228
|
-
|
|
229
|
-
### Memory (Learning) — zero-config, zero-latency
|
|
230
|
-
|
|
231
|
-
ScreenHand gets smarter every time you use it — **no manual setup needed**.
|
|
232
|
-
|
|
233
|
-
**What happens automatically:**
|
|
234
|
-
- Every tool call is logged (async, non-blocking — adds ~0ms to response time)
|
|
235
|
-
- After 3+ consecutive successes, the winning sequence is saved as a reusable strategy
|
|
236
|
-
- Known error patterns are tracked with resolutions (e.g. "launch times out → use focus() instead")
|
|
237
|
-
- On every tool call, the response includes **auto-recall hints**:
|
|
238
|
-
- Error warnings if the tool has failed before
|
|
239
|
-
- Next-step suggestions if you're mid-way through a known strategy
|
|
240
|
-
|
|
241
|
-
**Predefined seed strategies:**
|
|
242
|
-
- Ships with 12 common macOS workflows (Photo Booth, Chrome navigation, copy/paste, Finder, export PDF, etc.)
|
|
243
|
-
- Loaded automatically on first boot — the system has knowledge from day one
|
|
244
|
-
- Seeds are searchable via `memory_recall` and provide next-step hints like any learned strategy
|
|
245
|
-
|
|
246
|
-
**Background web research:**
|
|
247
|
-
- When a tool fails and no resolution exists, ScreenHand searches for a fix in the background (non-blocking)
|
|
248
|
-
- Uses Claude API (haiku, if `ANTHROPIC_API_KEY` is set) or DuckDuckGo instant answers as fallback
|
|
249
|
-
- Resolutions are saved to both error cache and strategy store — zero-latency recall next time
|
|
250
|
-
- Completely silent and fire-and-forget — never blocks tool responses or throws errors
|
|
251
|
-
|
|
252
|
-
**Fingerprint matching & feedback loop:**
|
|
253
|
-
- Each strategy is fingerprinted by its tool sequence (e.g. `apps→focus→ui_press`)
|
|
254
|
-
- O(1) exact-match lookup when the agent follows a known sequence
|
|
255
|
-
- Success/failure outcomes are tracked per strategy — unreliable strategies are auto-penalized and eventually skipped
|
|
256
|
-
- Keyword-based fuzzy search with reliability scoring for `memory_recall`
|
|
257
|
-
|
|
258
|
-
**Production-grade under the hood:**
|
|
259
|
-
- All data cached in RAM at startup — lookups are ~0ms, disk is only for persistence
|
|
260
|
-
- Disk writes are async and buffered (100ms debounce) — never block tool calls
|
|
261
|
-
- Sync flush on process exit (SIGINT/SIGTERM) — no lost writes
|
|
262
|
-
- Per-line JSONL parsing — corrupted lines are skipped, not fatal
|
|
263
|
-
- LRU eviction: 500 strategies, 200 error patterns max (oldest evicted automatically)
|
|
264
|
-
- File locking (`.lock` + PID) prevents corruption from concurrent instances
|
|
265
|
-
- Action log auto-rotates at 10 MB
|
|
266
|
-
- Data lives in `.screenhand/memory/` as JSONL (grep-friendly, no database)
|
|
267
|
-
|
|
268
|
-
| Tool | What it does |
|
|
269
|
-
|------|-------------|
|
|
270
|
-
| `memory_recall` | Explicitly search past strategies by task description |
|
|
271
|
-
| `memory_save` | Manually save the current session (auto-save handles most cases) |
|
|
272
|
-
| `memory_errors` | View all known error patterns and their resolutions |
|
|
273
|
-
| `memory_stats` | Action counts, success rates, top tools, disk usage |
|
|
274
|
-
| `memory_clear` | Clear actions, strategies, errors, or all data |
|
|
275
|
-
|
|
276
|
-
## How It Works
|
|
277
|
-
|
|
278
|
-
ScreenHand has three layers:
|
|
279
|
-
|
|
280
|
-
```
|
|
281
|
-
AI Client (Claude, Cursor, etc.)
|
|
282
|
-
↓ MCP protocol (stdio)
|
|
283
|
-
ScreenHand MCP Server (TypeScript)
|
|
284
|
-
↓ JSON-RPC (stdio)
|
|
285
|
-
Native Bridge (Swift on macOS / C# on Windows)
|
|
286
|
-
↓ Platform APIs
|
|
287
|
-
Operating System (Accessibility, CoreGraphics, UI Automation, SendInput)
|
|
288
|
-
```
|
|
289
|
-
|
|
290
|
-
1. **Native bridge** — talks directly to OS-level APIs:
|
|
291
|
-
- **macOS**: Swift binary using Accessibility APIs, CoreGraphics, and Vision framework (OCR)
|
|
292
|
-
- **Windows**: C# (.NET 8) binary using UI Automation, SendInput, GDI+, and Windows.Media.Ocr
|
|
293
|
-
2. **TypeScript MCP server** — routes tools to the correct bridge, handles Chrome CDP, manages sessions
|
|
294
|
-
3. **MCP protocol** — standard Model Context Protocol so any AI client can connect
|
|
295
|
-
|
|
296
|
-
The native bridge is auto-selected based on your OS. Both bridges speak the same JSON-RPC protocol, so all tools work identically on both platforms.
|
|
297
|
-
|
|
298
129
|
## Use Cases
|
|
299
130
|
|
|
300
|
-
###
|
|
301
|
-
|
|
131
|
+
### Automate Repetitive Workflows
|
|
132
|
+
Tell your AI "submit this form on 10 websites" or "export all these reports as PDFs" — and it does it. ScreenHand handles the clicking, typing, and navigating across any app.
|
|
302
133
|
|
|
303
|
-
###
|
|
304
|
-
|
|
134
|
+
### Debug UIs Faster
|
|
135
|
+
Instead of clicking through your app manually, let Claude inspect the full UI element tree, check states, and walk through flows — all from your terminal.
|
|
305
136
|
|
|
306
|
-
### Browser Automation
|
|
307
|
-
Fill forms, scrape data, run JavaScript, navigate pages
|
|
137
|
+
### Browser Automation Without Selenium
|
|
138
|
+
Fill forms, scrape data, run JavaScript, and navigate pages through Chrome DevTools Protocol. Works with sites that block traditional automation.
|
|
308
139
|
|
|
309
140
|
### Cross-App Workflows
|
|
310
|
-
Read
|
|
311
|
-
|
|
312
|
-
### UI Testing
|
|
313
|
-
Click buttons, verify text appears, catch visual regressions — all driven by AI.
|
|
314
|
-
|
|
315
|
-
## Requirements
|
|
316
|
-
|
|
317
|
-
### macOS
|
|
141
|
+
Read data from a spreadsheet, look it up in Chrome, paste results into Notes — chain actions across your entire desktop.
|
|
318
142
|
|
|
319
|
-
-
|
|
320
|
-
|
|
321
|
-
- Accessibility permissions: System Settings > Privacy & Security > Accessibility > enable your terminal
|
|
322
|
-
- Chrome with `--remote-debugging-port=9222` (only for browser tools)
|
|
143
|
+
### AI-Powered UI Testing
|
|
144
|
+
Click buttons, verify text appears, check element states, and catch regressions — all driven by your AI agent.
|
|
323
145
|
|
|
324
|
-
|
|
146
|
+
## What's Included
|
|
325
147
|
|
|
326
|
-
|
|
327
|
-
- Node.js 18+
|
|
328
|
-
- [.NET 8 SDK](https://dotnet.microsoft.com/download/dotnet/8.0)
|
|
329
|
-
- No special permissions needed — UI Automation works without admin
|
|
330
|
-
- Chrome with `--remote-debugging-port=9222` (only for browser tools)
|
|
331
|
-
- Build: `npm run build:native:windows`
|
|
148
|
+
ScreenHand exposes **70+ tools** organized by what you need to do:
|
|
332
149
|
|
|
333
|
-
|
|
150
|
+
| Category | Examples | What For |
|
|
151
|
+
|----------|----------|----------|
|
|
152
|
+
| **Screen** | `screenshot`, `ocr` | See what's on screen, read all visible text |
|
|
153
|
+
| **App Control** | `ui_tree`, `ui_press`, `menu_click` | Read and interact with any native app |
|
|
154
|
+
| **Keyboard & Mouse** | `click`, `type_text`, `key`, `drag` | Direct input control |
|
|
155
|
+
| **Chrome Browser** | `browser_navigate`, `browser_js`, `browser_dom` | Full browser automation via CDP |
|
|
156
|
+
| **Memory** | `memory_recall`, `memory_save` | ScreenHand learns from past sessions |
|
|
157
|
+
| **AppleScript** | `applescript` | Run AppleScript on macOS |
|
|
334
158
|
|
|
335
|
-
|
|
159
|
+
For the full tool reference, see the [tool documentation](DESKTOP_MCP_GUIDE.md).
|
|
336
160
|
|
|
337
|
-
|
|
338
|
-
- `/debug-ui` — inspect the UI tree of any app
|
|
339
|
-
- `/automate` — describe a task and Claude does it
|
|
340
|
-
|
|
341
|
-
**Install globally** so they work in any project:
|
|
161
|
+
## Requirements
|
|
342
162
|
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
163
|
+
| | macOS | Windows |
|
|
164
|
+
|---|---|---|
|
|
165
|
+
| **OS** | macOS 12+ | Windows 10 (1809+) |
|
|
166
|
+
| **Runtime** | Node.js 18+ | Node.js 18+ |
|
|
167
|
+
| **Permissions** | Accessibility (System Settings) | None (no admin needed) |
|
|
168
|
+
| **Browser tools** | Chrome with `--remote-debugging-port=9222` | Same |
|
|
169
|
+
| **Build** | `npm run build:native` | `npm run build:native:windows` |
|
|
346
170
|
|
|
347
171
|
## Development
|
|
348
172
|
|
|
349
173
|
```bash
|
|
350
|
-
npm run check # type-check
|
|
351
|
-
npm test # run test suite
|
|
174
|
+
npm run check # type-check
|
|
175
|
+
npm test # run test suite
|
|
352
176
|
npm run build # compile TypeScript
|
|
353
|
-
npm run build:native # build
|
|
354
|
-
npm run build:native:windows # build .NET bridge (Windows)
|
|
177
|
+
npm run build:native # build native bridge
|
|
355
178
|
```
|
|
356
179
|
|
|
357
180
|
## FAQ
|
|
358
181
|
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
### How does ScreenHand differ from Anthropic's Computer Use?
|
|
363
|
-
Anthropic's Computer Use is a cloud-based feature built into Claude. ScreenHand is an open-source, local-first tool that runs entirely on your machine with no cloud dependency. It uses native OS APIs (Accessibility on macOS, UI Automation on Windows) which are faster and more reliable than screenshot-based approaches.
|
|
182
|
+
<details>
|
|
183
|
+
<summary><strong>What is ScreenHand?</strong></summary>
|
|
364
184
|
|
|
365
|
-
|
|
366
|
-
|
|
185
|
+
An MCP server that gives AI agents the ability to see and control your desktop. It uses native OS APIs (Accessibility on macOS, UI Automation on Windows) for fast, reliable automation — not slow screenshot-based guessing.
|
|
186
|
+
</details>
|
|
367
187
|
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
| **How it sees the UI** | Native Accessibility/UI Automation APIs — reads the actual element tree | Screenshots + LLM vision — interprets pixels |
|
|
371
|
-
| **Speed** | ~50ms per UI action | Seconds per action (screenshot → LLM → click) |
|
|
372
|
-
| **Accuracy** | Exact element targeting by role/title | Coordinate-based — can misclick if layout shifts |
|
|
373
|
-
| **Architecture** | MCP server — works with any MCP client (Claude, Cursor, Codex CLI) | Standalone agent — tied to its own runtime |
|
|
374
|
-
| **Model lock-in** | None — any MCP-compatible AI decides what to do | Supports multiple LLMs but runs its own agent loop |
|
|
375
|
-
| **Learning memory** | Built-in: auto-learns strategies, tracks errors, O(1) fingerprint recall | Skill-based: 5,000+ community skills, but no automatic learning from usage |
|
|
376
|
-
| **Security** | Scoped MCP tools, audit logging, no browser cookie access | Full computer access, uses browser cookies, significant security surface |
|
|
377
|
-
| **Setup** | `npm install` + grant accessibility permission | Requires careful sandboxing, not recommended on personal machines |
|
|
378
|
-
|
|
379
|
-
**TL;DR**: OpenClaw is a powerful autonomous agent for tinkerers who want maximum flexibility. ScreenHand is a focused, fast, secure automation layer designed to be embedded into any AI workflow via MCP — with native API speed instead of screenshot-based guessing.
|
|
188
|
+
<details>
|
|
189
|
+
<summary><strong>How is this different from Anthropic's Computer Use?</strong></summary>
|
|
380
190
|
|
|
381
|
-
|
|
382
|
-
|
|
191
|
+
Computer Use is cloud-based and built into Claude. ScreenHand is open-source, runs locally on your machine, and uses native OS APIs, which are faster and more reliable than screenshot-based approaches. It also works with any MCP-compatible client, not just Claude.
|
|
192
|
+
</details>
|
|
383
193
|
|
|
384
|
-
|
|
385
|
-
|
|
194
|
+
<details>
|
|
195
|
+
<summary><strong>Is it safe?</strong></summary>
|
|
386
196
|
|
|
387
|
-
|
|
388
|
-
|
|
197
|
+
ScreenHand runs entirely on your machine — no screen data is sent to external servers. All tool calls are audit-logged. See our [Security Policy](SECURITY.md) for details on permissions and boundaries.
|
|
198
|
+
</details>
|
|
389
199
|
|
|
390
|
-
|
|
391
|
-
|
|
200
|
+
<details>
|
|
201
|
+
<summary><strong>What AI clients work with it?</strong></summary>
|
|
392
202
|
|
|
393
|
-
|
|
394
|
-
|
|
203
|
+
Any MCP-compatible client: Claude Desktop, Claude Code, Cursor, Windsurf, OpenAI Codex CLI, and more.
|
|
204
|
+
</details>
|
|
395
205
|
|
|
396
|
-
|
|
397
|
-
|
|
206
|
+
<details>
|
|
207
|
+
<summary><strong>Can it control any app?</strong></summary>
|
|
398
208
|
|
|
399
|
-
|
|
400
|
-
|
|
401
|
-
|
|
402
|
-
### Is the memory data safe from corruption?
|
|
403
|
-
Yes. JSONL files are parsed line-by-line — a single corrupted line is skipped without affecting other entries. File locking prevents concurrent write corruption. Pending writes are flushed synchronously on exit (SIGINT/SIGTERM). Cache sizes are capped with LRU eviction to prevent unbounded growth.
|
|
209
|
+
On macOS, any app that exposes Accessibility elements (most do). On Windows, any app supporting UI Automation. For apps with custom rendering (games, some Electron apps), OCR is available as a fallback.
|
|
210
|
+
</details>
|
|
404
211
|
|
|
405
212
|
## Contributing
|
|
406
213
|
|
|
407
|
-
Contributions
|
|
214
|
+
Contributions welcome! Please open an issue first to discuss what you'd like to change.
|
|
408
215
|
|
|
409
216
|
```bash
|
|
410
217
|
git clone https://github.com/manushi4/screenhand.git
|
|
411
218
|
cd screenhand
|
|
412
|
-
npm install
|
|
413
|
-
npm run build:native
|
|
414
|
-
npm test
|
|
219
|
+
npm install && npm run build:native && npm test
|
|
415
220
|
```
|
|
416
221
|
|
|
417
222
|
## License
|
|
418
223
|
|
|
419
|
-
|
|
224
|
+
[AGPL-3.0](LICENSE) — Copyright (C) 2025 Clazro Technology Private Limited
|
|
420
225
|
|
|
421
226
|
---
|
|
422
227
|
|
|
423
228
|
<div align="center">
|
|
424
229
|
|
|
425
|
-
**[screenhand.com](https://screenhand.com)** | Built by [
|
|
230
|
+
**[screenhand.com](https://screenhand.com)** | Built by **[Clazro Technology Private Limited](https://github.com/manushi4)**
|
|
426
231
|
|
|
427
232
|
</div>
|
package/SECURITY.md
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
# Security Policy
|
|
2
|
+
|
|
3
|
+
## Overview
|
|
4
|
+
|
|
5
|
+
ScreenHand is a desktop automation tool with significant system access. We take security seriously.
|
|
6
|
+
|
|
7
|
+
## What ScreenHand Can Access
|
|
8
|
+
|
|
9
|
+
- **Screen content** via screenshots and OCR
|
|
10
|
+
- **UI elements** via native Accessibility APIs (macOS) / UI Automation (Windows)
|
|
11
|
+
- **Keyboard and mouse** input simulation
|
|
12
|
+
- **Chrome browser** tabs via DevTools Protocol (requires Chrome launched with debug port)
|
|
13
|
+
- **AppleScript** execution (macOS only)
|
|
14
|
+
|
|
15
|
+
## What ScreenHand Cannot Do
|
|
16
|
+
|
|
17
|
+
- ScreenHand does **not** send screen data or any information to external servers
|
|
18
|
+
- It does **not** access browser cookies, passwords, or stored credentials
|
|
19
|
+
- It does **not** run with elevated/admin privileges
|
|
20
|
+
- It does **not** modify system settings or install background services
|
|
21
|
+
- It does **not** communicate with any remote server (all operations are local)
|
|
22
|
+
|
|
23
|
+
## Permissions Required
|
|
24
|
+
|
|
25
|
+
### macOS
|
|
26
|
+
- **Accessibility permission**: System Settings > Privacy & Security > Accessibility > enable your terminal app
|
|
27
|
+
- This is a standard macOS requirement for any app that reads UI elements or simulates input
|
|
28
|
+
|
|
29
|
+
### Windows
|
|
30
|
+
- No special permissions needed — UI Automation works without admin for most applications
|
|
31
|
+
|
|
32
|
+
## Audit Logging
|
|
33
|
+
|
|
34
|
+
All tool calls are logged to `.audit-log.jsonl` with timestamps. This file is gitignored by default and stays on your machine.
|
|
35
|
+
|
|
36
|
+
## Reporting a Vulnerability
|
|
37
|
+
|
|
38
|
+
If you discover a security vulnerability, please email **security@screenhand.com** instead of opening a public issue.
|
|
39
|
+
|
|
40
|
+
We will acknowledge receipt within 48 hours and aim to provide a fix within 7 days for critical issues.
|
|
41
|
+
|
|
42
|
+
## Responsible Use
|
|
43
|
+
|
|
44
|
+
ScreenHand is designed for legitimate automation, testing, and productivity use cases. Users are responsible for ensuring their use complies with applicable laws and the terms of service of any applications they automate.
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
# MVP Architecture
|
|
2
|
+
|
|
3
|
+
## Design Goals
|
|
4
|
+
- Fast execution by keeping session and context persistent.
|
|
5
|
+
- Predictable completion via hard action budgets.
|
|
6
|
+
- No infinite loops: each tool call returns success or a structured failure.
|
|
7
|
+
- LLM plans high-level intent; runtime handles micro-logic.
|
|
8
|
+
|
|
9
|
+
## Layers
|
|
10
|
+
1. `MCP Server Layer`
|
|
11
|
+
- Accepts tool requests (`session_start`, `navigate`, `press`, `type_into`, `wait_for`, `extract`, `screenshot`).
|
|
12
|
+
- Validates args and forwards to runtime service.
|
|
13
|
+
|
|
14
|
+
2. `Runtime Service Layer`
|
|
15
|
+
- Orchestrates session manager, executor, adapter, logging, and cache.
|
|
16
|
+
- Converts low-level errors into structured failure payloads.
|
|
17
|
+
|
|
18
|
+
3. `Executor Layer`
|
|
19
|
+
- Runs a bounded state machine for action tools:
|
|
20
|
+
- locate (cached first, fallback strategy)
|
|
21
|
+
- act
|
|
22
|
+
- verify
|
|
23
|
+
- optional retry
|
|
24
|
+
- Enforces per-step time budgets.
|
|
25
|
+
|
|
26
|
+
4. `Browser Adapter Layer`
|
|
27
|
+
- Thin contract for browser operations.
|
|
28
|
+
- The current scaffold uses a placeholder adapter; it will later be replaced with a CDP or Playwright robot-mode adapter.
|
|
29
|
+
|
|
30
|
+
## Core Runtime Flow
|
|
31
|
+
1. `session_start(profile)` ensures a persistent session ID.
|
|
32
|
+
2. `navigate(url)` completes within timeout and returns url/title.
|
|
33
|
+
3. `press` / `type_into` run a bounded loop with a maximum number of retries.
|
|
34
|
+
4. `wait_for(condition)` waits only for explicit UI conditions.
|
|
35
|
+
5. `extract(target, format)` returns structured data.
|
|
36
|
+
6. On failure, return structured diagnostics + timings.
|
|
37
|
+
|
|
38
|
+
## Key Data Contracts
|
|
39
|
+
- `ActionBudget`: `locateMs`, `actMs`, `verifyMs`, `maxRetries`.
|
|
40
|
+
- `ActionTelemetry`: per-action timing + retry count + status.
|
|
41
|
+
- `RuntimeError`: error code, attempts, page meta, and cause.
|
|
42
|
+
|
|
43
|
+
## Next Implementation Phase
|
|
44
|
+
- Harden the current CDP adapter with richer locator heuristics and cleanup hooks.
|
|
45
|
+
- Add locator strategy expansion (role/text/selector priority + fuzzy fallback).
|
|
46
|
+
- Persist locator cache per site/action.
|
|
47
|
+
- Wire up the transport for the actual MCP protocol endpoint.
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
#!/bin/bash
|
|
2
|
+
# Install ScreenHand skills globally for Claude Code
|
|
3
|
+
# Usage: ./install-skills.sh
|
|
4
|
+
|
|
5
|
+
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
|
|
6
|
+
TARGET="$HOME/.claude/commands"
|
|
7
|
+
|
|
8
|
+
mkdir -p "$TARGET"
|
|
9
|
+
|
|
10
|
+
cp "$SCRIPT_DIR/.claude/commands/screenshot.md" "$TARGET/desktop-screenshot.md"
|
|
11
|
+
cp "$SCRIPT_DIR/.claude/commands/debug-ui.md" "$TARGET/desktop-debug-ui.md"
|
|
12
|
+
cp "$SCRIPT_DIR/.claude/commands/automate.md" "$TARGET/desktop-automate.md"
|
|
13
|
+
|
|
14
|
+
echo "Installed skills to $TARGET:"
|
|
15
|
+
echo " /desktop-screenshot — capture and describe your screen"
|
|
16
|
+
echo " /desktop-debug-ui — inspect any app's UI tree"
|
|
17
|
+
echo " /desktop-automate — automate a multi-step workflow"
|
|
18
|
+
echo ""
|
|
19
|
+
echo "These are now available globally in any Claude Code session."
|