screenhand 0.3.4 → 0.3.6
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- package/dist/mcp-desktop.js +67 -75
- package/package.json +1 -1
package/dist/mcp-desktop.js
CHANGED
|
@@ -267,100 +267,92 @@ async function ensureCDP(overridePort) {
|
|
|
267
267
|
throw new Error("Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug");
|
|
268
268
|
}
|
|
269
269
|
const server = new McpServer({ name: "screenhand", version: "3.0.0" }, {
|
|
270
|
-
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools.
|
|
270
|
+
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools.
|
|
271
271
|
|
|
272
|
-
##
|
|
272
|
+
## Quick Actions (just do it)
|
|
273
|
+
For simple tasks, go direct — no setup needed:
|
|
273
274
|
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
|
|
277
|
-
|
|
275
|
+
focus("com.apple.Notes") → ui_press("New Note") → type_text("hello") → key("cmd+s")
|
|
276
|
+
browser_navigate("https://...") → browser_click("#btn") → browser_js("return ...")
|
|
277
|
+
|
|
278
|
+
## Tool Speed (fastest first)
|
|
279
|
+
1. **ui_press / key / type_text** — native AX, ~50ms
|
|
280
|
+
2. **browser_* tools** — CDP, ~10ms (background, no focus needed)
|
|
281
|
+
3. ***_with_fallback** — auto-tries AX → CDP → OCR (~100-500ms)
|
|
282
|
+
4. **screenshot + ocr** — visual, ~600ms (canvas apps only)
|
|
283
|
+
5. **applescript** — macOS scripting (Finder, Mail, Safari)
|
|
278
284
|
|
|
279
|
-
|
|
285
|
+
## The Golden Sequence (for multi-step workflows)
|
|
286
|
+
For complex tasks with 3+ steps, follow this order:
|
|
287
|
+
|
|
288
|
+
### 1. KNOW (before touching anything)
|
|
289
|
+
platform_guide("figma") → get selectors, flows, known errors
|
|
290
|
+
memory_recall("figma export") → reuse past strategies
|
|
291
|
+
If unknown app: platform_explore("bundleId") or platform_learn("domain")
|
|
280
292
|
|
|
281
293
|
### 2. SEE (understand current state)
|
|
282
294
|
apps() → what's running?
|
|
283
|
-
perception_start() →
|
|
284
|
-
world_state() → current app, windows, controls
|
|
285
|
-
screenshot() → visual confirmation if needed
|
|
295
|
+
perception_start() → continuous monitoring (for multi-step only)
|
|
296
|
+
world_state() → current app, windows, controls
|
|
286
297
|
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
### 3. NAVIGATE (get to the right place)
|
|
298
|
+
### 3. NAVIGATE
|
|
290
299
|
focus("com.figma.Desktop") → bring app to front
|
|
291
|
-
ui_tree() → see all clickable elements
|
|
292
|
-
ui_find("Export") → check if
|
|
300
|
+
ui_tree() → see all clickable elements
|
|
301
|
+
ui_find("Export") → check if target exists
|
|
293
302
|
|
|
294
|
-
### 4. ACT
|
|
295
|
-
click_with_fallback("Export") → click
|
|
296
|
-
type_with_fallback("filename") → type
|
|
303
|
+
### 4. ACT
|
|
304
|
+
click_with_fallback("Export") → click (auto-tries multiple methods)
|
|
305
|
+
type_with_fallback("filename") → type with fallback
|
|
297
306
|
key("cmd+shift+e") → keyboard shortcuts
|
|
298
|
-
drag(fromX, fromY, toX, toY) → drag and drop
|
|
299
|
-
scroll(direction) → scroll up/down/left/right
|
|
300
|
-
|
|
301
|
-
Always prefer *_with_fallback tools over bare click/type — they auto-recover when one method fails.
|
|
302
|
-
|
|
303
|
-
### 5. VERIFY (confirm it worked)
|
|
304
|
-
world_state() → did UI change as expected?
|
|
305
|
-
world_state_diff() → what exactly changed since last check?
|
|
306
|
-
screenshot() → visual proof
|
|
307
|
-
|
|
308
|
-
### 6. STOP (clean up)
|
|
309
|
-
perception_stop() → stop monitoring (save resources)
|
|
310
|
-
memory_save("figma_export", ...) → save successful strategy for next time
|
|
311
307
|
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
browser_dom() → read page structure (CSS selectors)
|
|
316
|
-
browser_click("#submit") → click element by CSS selector
|
|
317
|
-
browser_type("input", "text") → type into form field
|
|
318
|
-
browser_fill_form({...}) → fill multiple fields at once (human-like timing)
|
|
319
|
-
browser_js("return ...") → run JavaScript for complex extraction/actions
|
|
320
|
-
browser_wait("selector") → wait for element to appear
|
|
321
|
-
browser_human_click(x, y) → human-like click with randomized timing
|
|
308
|
+
### 5. VERIFY
|
|
309
|
+
world_state() → did UI change?
|
|
310
|
+
world_state_diff() → what changed?
|
|
322
311
|
|
|
323
|
-
|
|
312
|
+
### 6. STOP
|
|
313
|
+
perception_stop() → stop monitoring
|
|
314
|
+
memory_save("task", ...) → save strategy for next time
|
|
324
315
|
|
|
325
|
-
##
|
|
326
|
-
|
|
327
|
-
plan_execute(goalId) → auto-run deterministic steps, pauses at LLM steps for your judgment
|
|
328
|
-
plan_step_resolve(goalId, tool, params) → you provide the tool+params for LLM steps
|
|
329
|
-
plan_status(goalId) → check progress
|
|
330
|
-
plan_cancel(goalId) → abort if needed
|
|
316
|
+
## Strategy Selection (optional — for when you want to be smart about it)
|
|
317
|
+
Use these tools to pick the best approach. Skip for quick one-off actions.
|
|
331
318
|
|
|
332
|
-
|
|
319
|
+
**coverage_report(bundleId)** — what does ScreenHand know about this app?
|
|
320
|
+
- Empty (0 selectors/flows) → learn first: scan_menu_bar() + platform_explore()
|
|
321
|
+
- Has data + high stability → go fast: direct tools (ui_press, key)
|
|
322
|
+
- Has error patterns → be careful: use *_with_fallback tools
|
|
333
323
|
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
324
|
+
**learning_status(bundleId)** — how experienced is ScreenHand with this app?
|
|
325
|
+
- 100+ samples → app is well-known, direct tools are safe
|
|
326
|
+
- 0 samples → unknown app, use *_with_fallback
|
|
327
|
+
- AX score high → use ui_tree + ui_press
|
|
328
|
+
- CDP score high → it's a web app, use browser_* tools
|
|
329
|
+
- Vision score high → canvas app, use screenshot + ocr
|
|
340
330
|
|
|
341
|
-
|
|
331
|
+
## Browser Automation
|
|
332
|
+
browser_navigate/browser_click/browser_type/browser_js — all work in background (~10ms)
|
|
333
|
+
browser_stealth() — activate before sites with bot detection
|
|
334
|
+
browser_fill_form({...}) — human-like multi-field form filling
|
|
335
|
+
browser_human_click(x, y) — randomized timing to avoid detection
|
|
342
336
|
|
|
343
|
-
##
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
337
|
+
## Planning (let ScreenHand figure out the steps)
|
|
338
|
+
plan_goal("Export video as H.264") → generates step-by-step plan from playbooks/strategies/references
|
|
339
|
+
plan_execute(goalId) → auto-runs known steps, pauses at LLM steps for your judgment
|
|
340
|
+
plan_step_resolve(goalId, tool, params) → you resolve paused steps
|
|
341
|
+
plan_status(goalId) / plan_list() / plan_cancel(goalId)
|
|
348
342
|
|
|
349
|
-
##
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
- recovery_status() → see active cooldowns and cached strategies
|
|
353
|
-
- recovery_configure() → tune recovery budget (max time, max retries)
|
|
343
|
+
## Repeatable Workflows
|
|
344
|
+
playbook_record() → do work → export_playbook() → job_create("name", steps) → worker_start()
|
|
345
|
+
Jobs survive restarts. Worker daemon runs independently.
|
|
354
346
|
|
|
355
|
-
##
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
3. ***_with_fallback** — auto-tries multiple methods (~100-500ms)
|
|
359
|
-
4. **screenshot + ocr** — visual capture, ~600ms (only for canvas apps)
|
|
360
|
-
5. **applescript** — macOS scripting (Finder, Mail, Safari, etc.)
|
|
347
|
+
## Multi-Agent
|
|
348
|
+
session_claim() → work → session_heartbeat() → session_release()
|
|
349
|
+
supervisor_start() — auto-detects stalled agents and recovers
|
|
361
350
|
|
|
362
|
-
##
|
|
363
|
-
|
|
351
|
+
## Self-Healing (automatic)
|
|
352
|
+
Tool failures auto-retry with alternative strategies. Learning is automatic — every call improves selectors, timing, and recovery per app.
|
|
353
|
+
- learning_status() — inspect learned knowledge
|
|
354
|
+
- recovery_status() — check recovery state
|
|
355
|
+
- recovery_configure() — tune recovery budget
|
|
364
356
|
`,
|
|
365
357
|
});
|
|
366
358
|
// ═══════════════════════════════════════════════
|
|
@@ -6248,7 +6240,7 @@ server.tool("ingest_tutorial", "Extract structured playbook steps from a video t
|
|
|
6248
6240
|
}],
|
|
6249
6241
|
};
|
|
6250
6242
|
});
|
|
6251
|
-
server.tool("coverage_report", "
|
|
6243
|
+
server.tool("coverage_report", "Check what ScreenHand knows about an app: shortcuts, selectors, flows, playbooks, error patterns, and stability %. Useful before complex workflows to decide strategy: learn first (if empty), go fast (if high coverage), or use fallback tools (if error patterns exist). Optional for quick actions.", {
|
|
6252
6244
|
bundleId: z.string().describe("macOS bundle ID (e.g. com.blackmagic-design.DaVinciResolveLite)"),
|
|
6253
6245
|
appName: z.string().describe("Human-readable app name"),
|
|
6254
6246
|
includeLiveMenuScan: z.boolean().optional().describe("Also scan the live menu bar for comparison (requires app to be running, needs pid)"),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "screenhand",
|
|
3
|
-
"version": "0.3.
|
|
3
|
+
"version": "0.3.6",
|
|
4
4
|
"mcpName": "io.github.manushi4/screenhand",
|
|
5
5
|
"description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
|
|
6
6
|
"homepage": "https://screenhand.com",
|