screenhand 0.3.7 → 0.3.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/mcp-desktop.js +94 -57
- package/package.json +1 -1
package/dist/mcp-desktop.js
CHANGED
|
@@ -267,82 +267,86 @@ async function ensureCDP(overridePort) {
|
|
|
267
267
|
throw new Error("Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug");
|
|
268
268
|
}
|
|
269
269
|
const server = new McpServer({ name: "screenhand", version: "3.0.0" }, {
|
|
270
|
-
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools.
|
|
271
|
-
|
|
272
|
-
## Quick Actions (just do it)
|
|
273
|
-
For simple tasks, go direct — no setup needed:
|
|
270
|
+
instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools across 7 layers.
|
|
274
271
|
|
|
272
|
+
## Quick Actions (1-2 steps, no setup)
|
|
275
273
|
focus("com.apple.Notes") → ui_press("New Note") → type_text("hello") → key("cmd+s")
|
|
276
274
|
browser_navigate("https://...") → browser_click("#btn") → browser_js("return ...")
|
|
277
275
|
|
|
278
|
-
##
|
|
279
|
-
1. **ui_press / key / type_text** — native AX, ~50ms
|
|
280
|
-
2. **browser_* tools** — CDP, ~10ms (background, no focus needed)
|
|
281
|
-
3. ***_with_fallback** — auto-tries AX → CDP → OCR (~100-500ms)
|
|
282
|
-
4. **screenshot + ocr** — visual, ~600ms (canvas apps only)
|
|
283
|
-
5. **applescript** — macOS scripting (Finder, Mail, Safari)
|
|
284
|
-
|
|
285
|
-
## The Golden Sequence (for multi-step workflows)
|
|
286
|
-
For complex tasks with 3+ steps, follow this order:
|
|
276
|
+
## Smart Decision Flow (3+ steps)
|
|
287
277
|
|
|
288
|
-
###
|
|
289
|
-
|
|
290
|
-
|
|
291
|
-
|
|
278
|
+
### Step 0: DECIDE — learn or go?
|
|
279
|
+
coverage_report(bundleId, appName) → tells you exactly what ScreenHand knows
|
|
280
|
+
- "0 selectors, 0 flows" → LEARN FIRST (Step 0a)
|
|
281
|
+
- "Has selectors + flows" → GO (skip to Step 1)
|
|
282
|
+
- "Has error patterns for your tool" → use *_with_fallback tools
|
|
292
283
|
|
|
293
|
-
|
|
294
|
-
|
|
295
|
-
|
|
296
|
-
|
|
284
|
+
learning_status(bundleId) → tells you WHICH tools to use
|
|
285
|
+
- AX score > 0.9 → use ui_press/ui_tree (fastest, ~50ms)
|
|
286
|
+
- CDP score high → it's a web app → use browser_* tools (~10ms)
|
|
287
|
+
- Vision score high → canvas app → use screenshot + ocr (~600ms)
|
|
288
|
+
- 0 samples → unknown app → always use *_with_fallback
|
|
297
289
|
|
|
298
|
-
###
|
|
299
|
-
|
|
300
|
-
|
|
301
|
-
|
|
290
|
+
### Step 0a: LEARN (only if coverage_report says gaps)
|
|
291
|
+
scan_menu_bar() → discover shortcuts + menu structure
|
|
292
|
+
platform_explore("bundleId") → map all interactive elements
|
|
293
|
+
platform_guide("platform") → load curated selectors/flows/errors
|
|
294
|
+
memory_recall("task description") → reuse past strategies
|
|
295
|
+
Then go to Step 1.
|
|
302
296
|
|
|
303
|
-
###
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
|
|
297
|
+
### Step 1: SEE
|
|
298
|
+
perception_start() → turns on continuous monitoring (3 rates: AX 100ms, CDP 300ms, Vision 1s)
|
|
299
|
+
world_state() → verify windows + controls are tracked
|
|
300
|
+
If world_state shows 0 controls → wait 1-2s for perception to populate, then retry.
|
|
307
301
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
302
|
+
While perception runs, you get automatic features:
|
|
303
|
+
- Auto world_state_diff after every action tool (Δ line in response)
|
|
304
|
+
- Auto dialog dismissal (learning-ranked: Cancel/OK/Escape)
|
|
305
|
+
- Auto context switch when apps change (loads new reference)
|
|
311
306
|
|
|
312
|
-
###
|
|
313
|
-
|
|
314
|
-
|
|
307
|
+
### Step 2: ACT + VERIFY (loop)
|
|
308
|
+
Each action tool response includes: world summary + Δ changes + perception freshness + learning hints.
|
|
309
|
+
No need to manually call world_state() or world_state_diff() — it's automatic.
|
|
315
310
|
|
|
316
|
-
|
|
317
|
-
|
|
311
|
+
**Tool priority:**
|
|
312
|
+
1. ui_press / key / type_text — native AX, ~50ms (when AX score high)
|
|
313
|
+
2. browser_* tools — CDP, ~10ms, background (web content)
|
|
314
|
+
3. *_with_fallback — auto-tries AX → CDP → OCR (~100-500ms, when unsure)
|
|
315
|
+
4. screenshot + ocr — visual (~600ms, canvas apps / visual verification)
|
|
316
|
+
5. applescript — macOS scripting (Finder, Mail, bulk ops)
|
|
318
317
|
|
|
319
|
-
**
|
|
320
|
-
-
|
|
321
|
-
-
|
|
322
|
-
-
|
|
318
|
+
**Read the Δ line after each action:**
|
|
319
|
+
- "Δ controls: 690→728" → UI changed, action worked
|
|
320
|
+
- "Δ dialogs: 0→1" → dialog appeared, auto-dismiss will handle it
|
|
321
|
+
- No Δ line → nothing changed, action may have failed
|
|
323
322
|
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
-
|
|
328
|
-
|
|
329
|
-
|
|
323
|
+
### Step 3: RECORD (optional — make it repeatable)
|
|
324
|
+
playbook_record(action="start", platform="notes") → start capturing
|
|
325
|
+
... do the workflow ...
|
|
326
|
+
playbook_record(action="clean") → auto-remove failed steps + retries
|
|
327
|
+
playbook_record(action="status") → review steps (shows ⚠️FAILED markers)
|
|
328
|
+
playbook_record(action="trim", removeSteps=[2,5]) → remove specific bad steps
|
|
329
|
+
playbook_record(action="stop", name="my workflow") → save as reusable playbook
|
|
330
330
|
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
browser_fill_form({...}) — human-like multi-field form filling
|
|
335
|
-
browser_human_click(x, y) — randomized timing to avoid detection
|
|
331
|
+
### Step 4: STOP
|
|
332
|
+
perception_stop() → stop monitoring, save resources
|
|
333
|
+
memory_save("key", "strategy") → save what worked for next time
|
|
336
334
|
|
|
337
335
|
## Planning (let ScreenHand figure out the steps)
|
|
338
|
-
plan_goal("Export video as H.264") → generates
|
|
339
|
-
plan_execute(goalId) → auto-runs known steps, pauses at LLM steps
|
|
336
|
+
plan_goal("Export video as H.264") → generates plan from playbooks/strategies/references
|
|
337
|
+
plan_execute(goalId) → auto-runs known steps, pauses at LLM steps
|
|
340
338
|
plan_step_resolve(goalId, tool, params) → you resolve paused steps
|
|
341
339
|
plan_status(goalId) / plan_list() / plan_cancel(goalId)
|
|
342
340
|
|
|
343
|
-
##
|
|
344
|
-
|
|
345
|
-
|
|
341
|
+
## Browser
|
|
342
|
+
browser_navigate/click/type/js — background via CDP (~10ms)
|
|
343
|
+
browser_stealth() — before sites with bot detection
|
|
344
|
+
browser_fill_form({...}) — human-like form filling
|
|
345
|
+
browser_human_click(x, y) — randomized timing
|
|
346
|
+
All browser tools accept cdpPort param for Electron apps (e.g. 9333)
|
|
347
|
+
|
|
348
|
+
## Jobs (survive restarts)
|
|
349
|
+
playbook → job_create("name", steps) → job_run(id) or worker_start() for background
|
|
346
350
|
|
|
347
351
|
## Multi-Agent
|
|
348
352
|
session_claim() → work → session_heartbeat() → session_release()
|
|
@@ -449,6 +453,39 @@ recoveryEngine.setAppMap(appMap);
|
|
|
449
453
|
planner.setToolRegistry(toolRegistry);
|
|
450
454
|
planner.setAppMap(appMap);
|
|
451
455
|
perceptionManager.setLearningEngine(learningEngine);
|
|
456
|
+
// ── Reactive event loop: wire perception events to automatic responses ──
|
|
457
|
+
// These fire at perception speed (100-300ms), not LLM speed (~2-3s).
|
|
458
|
+
perceptionManager.on("dialog_detected", (event) => {
|
|
459
|
+
// Auto-dismiss unexpected dialogs using the best strategy from learning
|
|
460
|
+
const bundleId = worldModel.getState().focusedApp?.bundleId;
|
|
461
|
+
const ranked = bundleId
|
|
462
|
+
? learningEngine.rankRecoveryStrategies("unexpected_dialog", bundleId)
|
|
463
|
+
: [];
|
|
464
|
+
// Pick the top-ranked strategy, or default to Escape
|
|
465
|
+
const bestStrategy = ranked.length > 0 && ranked[0].score > 0.3
|
|
466
|
+
? ranked[0].strategyId
|
|
467
|
+
: "dismiss_dialog_escape";
|
|
468
|
+
// Map strategy to tool call
|
|
469
|
+
const strategyActions = {
|
|
470
|
+
dismiss_dialog_cancel: { tool: "click_text", params: { text: "Cancel" } },
|
|
471
|
+
dismiss_dialog_ok: { tool: "click_text", params: { text: "OK" } },
|
|
472
|
+
dismiss_dialog_escape: { tool: "key", params: { combo: "Escape" } },
|
|
473
|
+
grant_permission_allow: { tool: "click_text", params: { text: "Allow" } },
|
|
474
|
+
grant_permission_ok: { tool: "click_text", params: { text: "OK" } },
|
|
475
|
+
};
|
|
476
|
+
const action = strategyActions[bestStrategy] ?? strategyActions["dismiss_dialog_escape"];
|
|
477
|
+
console.error(`[reactive] Dialog detected: "${event.title}" (pid=${event.pid}) → auto-${bestStrategy}`);
|
|
478
|
+
// Execute non-blocking — fire and forget, don't block perception loop
|
|
479
|
+
toolRegistry.toExecutor()(action.tool, action.params).catch((err) => {
|
|
480
|
+
console.error(`[reactive] Auto-dismiss failed: ${err instanceof Error ? err.message : err}`);
|
|
481
|
+
});
|
|
482
|
+
});
|
|
483
|
+
perceptionManager.on("app_switched", (event) => {
|
|
484
|
+
// Auto-update context tracker when app switches (loads new reference/playbook)
|
|
485
|
+
contextTracker.updateContext("focus", { bundleId: event.bundleId });
|
|
486
|
+
// Log for observability
|
|
487
|
+
console.error(`[reactive] App switched to ${event.bundleId} (pid=${event.pid})`);
|
|
488
|
+
});
|
|
452
489
|
const mcpRecorder = new McpPlaybookRecorder(playbooksDir);
|
|
453
490
|
const referenceMerger = new ReferenceMerger(referencesDir);
|
|
454
491
|
const communityPublisher = new PlaybookPublisher();
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "screenhand",
|
|
3
|
-
"version": "0.3.7",
|
|
3
|
+
"version": "0.3.8",
|
|
4
4
|
"mcpName": "io.github.manushi4/screenhand",
|
|
5
5
|
"description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
|
|
6
6
|
"homepage": "https://screenhand.com",
|