screenhand 0.3.7 → 0.3.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -267,82 +267,86 @@ async function ensureCDP(overridePort) {
  throw new Error("Chrome not running with --remote-debugging-port. Launch with: /Applications/Google\\ Chrome.app/Contents/MacOS/Google\\ Chrome --remote-debugging-port=9222 --user-data-dir=/tmp/chrome-debug");
  }
  const server = new McpServer({ name: "screenhand", version: "3.0.0" }, {
- instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools.
-
- ## Quick Actions (just do it)
- For simple tasks, go direct — no setup needed:
+ instructions: `ScreenHand gives you native desktop control on macOS/Windows. 111 tools across 7 layers.

+ ## Quick Actions (1-2 steps, no setup)
  focus("com.apple.Notes") → ui_press("New Note") → type_text("hello") → key("cmd+s")
  browser_navigate("https://...") → browser_click("#btn") → browser_js("return ...")

- ## Tool Speed (fastest first)
- 1. **ui_press / key / type_text** — native AX, ~50ms
- 2. **browser_* tools** — CDP, ~10ms (background, no focus needed)
- 3. ***_with_fallback** — auto-tries AX → CDP → OCR (~100-500ms)
- 4. **screenshot + ocr** — visual, ~600ms (canvas apps only)
- 5. **applescript** — macOS scripting (Finder, Mail, Safari)
-
- ## The Golden Sequence (for multi-step workflows)
- For complex tasks with 3+ steps, follow this order:
+ ## Smart Decision Flow (3+ steps)

- ### 1. KNOW (before touching anything)
- platform_guide("figma") → get selectors, flows, known errors
- memory_recall("figma export") → reuse past strategies
- If unknown app: platform_explore("bundleId") or platform_learn("domain")
+ ### Step 0: DECIDE — learn or go?
+ coverage_report(bundleId, appName) tells you exactly what ScreenHand knows
+ - "0 selectors, 0 flows" → LEARN FIRST (Step 0a)
+ - "Has selectors + flows" → GO (skip to Step 1)
+ - "Has error patterns for your tool" → use *_with_fallback tools

- ### 2. SEE (understand current state)
- apps() → what's running?
- perception_start() → continuous monitoring (for multi-step only)
- world_state() → current app, windows, controls
+ learning_status(bundleId) tells you WHICH tools to use
+ - AX score > 0.9 → use ui_press/ui_tree (fastest, ~50ms)
+ - CDP score high → it's a web app → use browser_* tools (~10ms)
+ - Vision score high → canvas app → use screenshot + ocr (~600ms)
+ - 0 samples → unknown app → always use *_with_fallback

- ### 3. NAVIGATE
- focus("com.figma.Desktop") → bring app to front
- ui_tree() → see all clickable elements
- ui_find("Export") → check if target exists
+ ### Step 0a: LEARN (only if coverage_report says gaps)
+ scan_menu_bar() → discover shortcuts + menu structure
+ platform_explore("bundleId") → map all interactive elements
+ platform_guide("platform") → load curated selectors/flows/errors
+ memory_recall("task description") → reuse past strategies
+ Then go to Step 1.

- ### 4. ACT
- click_with_fallback("Export") → click (auto-tries multiple methods)
- type_with_fallback("filename") → type with fallback
- key("cmd+shift+e") → keyboard shortcuts
+ ### Step 1: SEE
+ perception_start() turns on continuous monitoring (3 rates: AX 100ms, CDP 300ms, Vision 1s)
+ world_state() → verify windows + controls are tracked
+ If world_state shows 0 controls → wait 1-2s for perception to populate, then retry.

- ### 5. VERIFY
- world_state() → did UI change?
- world_state_diff() → what changed?
+ While perception runs, you get automatic features:
+ - Auto world_state_diff after every action tool (Δ line in response)
+ - Auto dialog dismissal (learning-ranked: Cancel/OK/Escape)
+ - Auto context switch when apps change (loads new reference)

- ### 6. STOP
- perception_stop() → stop monitoring
- memory_save("task", ...) → save strategy for next time
+ ### Step 2: ACT + VERIFY (loop)
+ Each action tool response includes: world summary + Δ changes + perception freshness + learning hints.
+ No need to manually call world_state() or world_state_diff() — it's automatic.

- ## Strategy Selection (optional — for when you want to be smart about it)
- Use these tools to pick the best approach. Skip for quick one-off actions.
+ **Tool priority:**
+ 1. ui_press / key / type_text — native AX, ~50ms (when AX score high)
+ 2. browser_* tools — CDP, ~10ms, background (web content)
+ 3. *_with_fallback — auto-tries AX → CDP → OCR (~100-500ms, when unsure)
+ 4. screenshot + ocr — visual (~600ms, canvas apps / visual verification)
+ 5. applescript — macOS scripting (Finder, Mail, bulk ops)

- **coverage_report(bundleId)** → what does ScreenHand know about this app?
- - Empty (0 selectors/flows) → learn first: scan_menu_bar() + platform_explore()
- - Has data + high stability → go fast: direct tools (ui_press, key)
- - Has error patterns → be careful: use *_with_fallback tools
+ **Read the Δ line after each action:**
+ - controls: 690→728 → UI changed, action worked
+ - dialogs: 0→1 → dialog appeared, auto-dismiss will handle it
+ - No Δ line → nothing changed, action may have failed

- **learning_status(bundleId)** → how experienced is ScreenHand with this app?
- - 100+ samples → app is well-known, direct tools are safe
- - 0 samples → unknown app, use *_with_fallback
- - AX score high → use ui_tree + ui_press
- - CDP score high → it's a web app, use browser_* tools
- - Vision score high → canvas app, use screenshot + ocr
+ ### Step 3: RECORD (optional — make it repeatable)
+ playbook_record(action="start", platform="notes") → start capturing
+ ... do the workflow ...
+ playbook_record(action="clean") → auto-remove failed steps + retries
+ playbook_record(action="status") → review steps (shows ⚠️ FAILED markers)
+ playbook_record(action="trim", removeSteps=[2,5]) → remove specific bad steps
+ playbook_record(action="stop", name="my workflow") → save as reusable playbook

- ## Browser Automation
- browser_navigate/browser_click/browser_type/browser_js all work in background (~10ms)
- browser_stealth() → activate before sites with bot detection
- browser_fill_form({...}) — human-like multi-field form filling
- browser_human_click(x, y) — randomized timing to avoid detection
+ ### Step 4: STOP
+ perception_stop() → stop monitoring, save resources
+ memory_save("key", "strategy") → save what worked for next time

  ## Planning (let ScreenHand figure out the steps)
- plan_goal("Export video as H.264") → generates step-by-step plan from playbooks/strategies/references
- plan_execute(goalId) → auto-runs known steps, pauses at LLM steps for your judgment
+ plan_goal("Export video as H.264") → generates plan from playbooks/strategies/references
+ plan_execute(goalId) → auto-runs known steps, pauses at LLM steps
  plan_step_resolve(goalId, tool, params) → you resolve paused steps
  plan_status(goalId) / plan_list() / plan_cancel(goalId)

- ## Repeatable Workflows
- playbook_record() → do work → export_playbook() → job_create("name", steps) → worker_start()
- Jobs survive restarts. Worker daemon runs independently.
+ ## Browser
+ browser_navigate/click/type/js — background via CDP (~10ms)
+ browser_stealth() before sites with bot detection
+ browser_fill_form({...}) — human-like form filling
+ browser_human_click(x, y) — randomized timing
+ All browser tools accept cdpPort param for Electron apps (e.g. 9333)
+
+ ## Jobs (survive restarts)
+ playbook → job_create("name", steps) → job_run(id) or worker_start() for background

  ## Multi-Agent
  session_claim() → work → session_heartbeat() → session_release()
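For concreteness, the rewritten decision flow maps onto ordinary MCP tool calls. A minimal client-side sketch, assuming the standard MCP TypeScript SDK; the tool names come from the instructions string above, while the launch command, bundle id, and argument shapes are illustrative:

```js
// Minimal sketch of the Step 0 → Step 4 flow from an MCP client.
// Assumes @modelcontextprotocol/sdk; bundle id and arguments are examples.
import { Client } from "@modelcontextprotocol/sdk/client/index.js";
import { StdioClientTransport } from "@modelcontextprotocol/sdk/client/stdio.js";

const client = new Client({ name: "example-agent", version: "0.0.1" });
await client.connect(new StdioClientTransport({ command: "npx", args: ["screenhand"] }));

// Step 0: DECIDE — what does ScreenHand already know about this app?
const coverage = await client.callTool({
  name: "coverage_report",
  arguments: { bundleId: "com.apple.Notes", appName: "Notes" },
});
console.log(coverage.content); // "0 selectors, 0 flows" would mean: learn first (Step 0a)

// Step 1/2: SEE, then ACT — each action response carries the Δ line described above
await client.callTool({ name: "perception_start", arguments: {} });
await client.callTool({ name: "click_with_fallback", arguments: { target: "New Note" } });

// Step 4: STOP
await client.callTool({ name: "perception_stop", arguments: {} });
```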
@@ -449,6 +453,39 @@ recoveryEngine.setAppMap(appMap);
  planner.setToolRegistry(toolRegistry);
  planner.setAppMap(appMap);
  perceptionManager.setLearningEngine(learningEngine);
+ // ── Reactive event loop: wire perception events to automatic responses ──
+ // These fire at perception speed (100-300ms), not LLM speed (~2-3s).
+ perceptionManager.on("dialog_detected", (event) => {
+ // Auto-dismiss unexpected dialogs using the best strategy from learning
+ const bundleId = worldModel.getState().focusedApp?.bundleId;
+ const ranked = bundleId
+ ? learningEngine.rankRecoveryStrategies("unexpected_dialog", bundleId)
+ : [];
+ // Pick the top-ranked strategy, or default to Escape
+ const bestStrategy = ranked.length > 0 && ranked[0].score > 0.3
+ ? ranked[0].strategyId
+ : "dismiss_dialog_escape";
+ // Map strategy to tool call
+ const strategyActions = {
+ dismiss_dialog_cancel: { tool: "click_text", params: { text: "Cancel" } },
+ dismiss_dialog_ok: { tool: "click_text", params: { text: "OK" } },
+ dismiss_dialog_escape: { tool: "key", params: { combo: "Escape" } },
+ grant_permission_allow: { tool: "click_text", params: { text: "Allow" } },
+ grant_permission_ok: { tool: "click_text", params: { text: "OK" } },
+ };
+ const action = strategyActions[bestStrategy] ?? strategyActions["dismiss_dialog_escape"];
+ console.error(`[reactive] Dialog detected: "${event.title}" (pid=${event.pid}) → auto-${bestStrategy}`);
+ // Execute non-blocking — fire and forget, don't block perception loop
+ toolRegistry.toExecutor()(action.tool, action.params).catch((err) => {
+ console.error(`[reactive] Auto-dismiss failed: ${err instanceof Error ? err.message : err}`);
+ });
+ });
+ perceptionManager.on("app_switched", (event) => {
+ // Auto-update context tracker when app switches (loads new reference/playbook)
+ contextTracker.updateContext("focus", { bundleId: event.bundleId });
+ // Log for observability
+ console.error(`[reactive] App switched to ${event.bundleId} (pid=${event.pid})`);
+ });
  const mcpRecorder = new McpPlaybookRecorder(playbooksDir);
  const referenceMerger = new ReferenceMerger(referencesDir);
  const communityPublisher = new PlaybookPublisher();
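The perception manager is subscribed to with .on(...), so the new dialog handler can be exercised with a synthetic event. A sketch; the payload shape { title, pid } is inferred from the console.error call above, and with no learned strategies the handler falls back to Escape:

```js
// Sketch: drive the new handler without a real dialog. With an empty
// strategy ranking, bestStrategy defaults to "dismiss_dialog_escape",
// so the handler fires the key tool with combo "Escape".
perceptionManager.emit("dialog_detected", { title: "Save changes?", pid: 4242 });
// stderr: [reactive] Dialog detected: "Save changes?" (pid=4242) → auto-dismiss_dialog_escape
```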
@@ -4182,6 +4219,27 @@ server.tool("click_with_fallback", "Click a target by text using the canonical f
  }
  throw new Error("Target not found via OCR");
  }
+ case "window_buffer": {
+ // Last resort: capture GPU window buffer (works even when window is hidden),
+ // OCR it, find target text, translate window-relative to screen-absolute coords
+ const wbWindowId = await resolveWindowId(targetPid);
+ if (!wbWindowId)
+ throw new Error("No window found for window_buffer capture");
+ const wbShot = await bridge.call("cg.captureWindow", { windowId: wbWindowId });
+ const wbMatches = await bridge.call("vision.findText", { imagePath: wbShot.path, searchText: target });
+ const wbMatch = Array.isArray(wbMatches) ? wbMatches[0] : null;
+ if (!wbMatch?.bounds)
+ throw new Error("Target not found via window buffer OCR");
+ // Translate window-relative coords to screen-absolute
+ const allWins = await bridge.call("app.windows");
+ const winInfo = allWins.find((w) => w.windowId === wbWindowId);
+ const winX = winInfo?.bounds?.x ?? 0;
+ const winY = winInfo?.bounds?.y ?? 0;
+ const absX = winX + wbMatch.bounds.x + wbMatch.bounds.width / 2;
+ const absY = winY + wbMatch.bounds.y + wbMatch.bounds.height / 2;
+ await bridge.call("cg.mouseClick", { x: absX, y: absY });
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${Math.round(absX)},${Math.round(absY)}) [window_buffer]` };
+ }
  }
  throw new Error(`Unknown method: ${method}`);
  }
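The window-relative to screen-absolute translation in the new case is easy to check by hand. A standalone sketch of the same arithmetic; the helper name is illustrative, not a ScreenHand API:

```js
// A window at (100, 200) with an OCR match at (40, 60), size 80x20,
// should be clicked at the match's center in screen coordinates.
function toScreenCenter(winBounds, matchBounds) {
  return {
    x: winBounds.x + matchBounds.x + matchBounds.width / 2,
    y: winBounds.y + matchBounds.y + matchBounds.height / 2,
  };
}
console.log(toScreenCenter({ x: 100, y: 200 }, { x: 40, y: 60, width: 80, height: 20 }));
// → { x: 180, y: 270 }
```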
@@ -4516,6 +4574,22 @@ server.tool("read_with_fallback", "Read text content from the screen or a specif
  const ocr = await bridge.call("vision.ocr", { imagePath: shot.path });
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: ocr.text?.slice(0, 4000) ?? "" };
  }
+ case "window_buffer": {
+ // GPU window buffer capture — reads content even when window is behind other apps
+ const rbWindowId = await resolveWindowId(targetPid);
+ if (!rbWindowId)
+ throw new Error("No window found for window_buffer read");
+ const rbShot = await bridge.call("cg.captureWindow", { windowId: rbWindowId });
+ if (target) {
+ const rbMatches = await bridge.call("vision.findText", { imagePath: rbShot.path, searchText: target });
+ const rbMatch = Array.isArray(rbMatches) ? rbMatches[0] : null;
+ if (!rbMatch)
+ throw new Error("Text not found via window buffer OCR");
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: rbMatch.text };
+ }
+ const rbOcr = await bridge.call("vision.ocr", { imagePath: rbShot.path });
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: rbOcr.text?.slice(0, 4000) ?? "" };
+ }
  }
  throw new Error(`Method ${method} does not support read`);
  }
@@ -4631,6 +4705,29 @@ server.tool("locate_with_fallback", "Find an element's position on screen using
  const b = match.bounds;
  return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${b.x},${b.y} ${b.width}x${b.height})` };
  }
+ case "window_buffer": {
+ // GPU window buffer capture + OCR — works even when window is hidden
+ const lbWindowId = await resolveWindowId(targetPid);
+ if (!lbWindowId)
+ throw new Error("No window found for window_buffer locate");
+ const lbShot = await bridge.call("cg.captureWindow", { windowId: lbWindowId });
+ const lbMatches = await bridge.call("vision.findText", { imagePath: lbShot.path, searchText: target });
+ const lbMatch = Array.isArray(lbMatches) ? lbMatches[0] : null;
+ if (!lbMatch?.bounds)
+ throw new Error("Target not found via window buffer OCR");
+ // Translate window-relative to screen-absolute bounds
+ const lbWins = await bridge.call("app.windows");
+ const lbWinInfo = lbWins.find((w) => w.windowId === lbWindowId);
+ const lbOffX = lbWinInfo?.bounds?.x ?? 0;
+ const lbOffY = lbWinInfo?.bounds?.y ?? 0;
+ const lbBounds = {
+ x: lbOffX + lbMatch.bounds.x,
+ y: lbOffY + lbMatch.bounds.y,
+ width: lbMatch.bounds.width,
+ height: lbMatch.bounds.height,
+ };
+ return { ok: true, method, durationMs: Date.now() - start, fallbackFrom: null, retries: attempt, error: null, target: `${target} at (${lbBounds.x},${lbBounds.y} ${lbBounds.width}x${lbBounds.height}) [window_buffer]` };
+ }
  }
  throw new Error(`Method ${method} does not support locate`);
  }
@@ -126,6 +126,26 @@ export class LearningEngine {
  rankSensors(bundleId) {
  return this.sensors.rank(bundleId);
  }
+ /**
+ * Detect whether an app is "vision-only" — AX can't see its content,
+ * so window buffer capture + OCR is the only viable perception source.
+ * Returns true when AX has failed enough times with a low score and
+ * at least one other source (vision/ocr) has succeeded.
+ */
+ isVisionOnlyApp(bundleId) {
+ const ranked = this.sensors.rank(bundleId);
+ if (ranked.length < 2)
+ return false;
+ const ax = ranked.find(r => r.sourceType === "ax");
+ const vision = ranked.find(r => r.sourceType === "vision" || r.sourceType === "ocr");
+ // AX score near zero + vision/ocr has some success
+ if (ax && ax.score < 0.15 && vision && vision.score > 0.3)
+ return true;
+ // No AX entry at all but vision works
+ if (!ax && vision && vision.score > 0.3)
+ return true;
+ return false;
+ }
  /**
  * Query verified UI patterns for a given app, optionally filtered by tool.
  */
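The heuristic reads most easily against concrete rank tables. An illustrative sketch — the scores are invented, and sensors.rank() is assumed to return { sourceType, score } entries, as the code above implies:

```js
// Invented rank tables and where the thresholds bite:
//   [{ sourceType: "ax", score: 0.05 }, { sourceType: "vision", score: 0.6 }]
//     → true  (AX below 0.15, vision above 0.3: vision-only app)
//   [{ sourceType: "ax", score: 0.8 }, { sourceType: "vision", score: 0.6 }]
//     → false (AX works; no need to force vision)
//   [{ sourceType: "vision", score: 0.6 }, { sourceType: "ocr", score: 0.4 }]
//     → true  (no AX entry at all, but vision succeeds)
//   [{ sourceType: "vision", score: 0.6 }]
//     → false (fewer than 2 ranked sources — not enough evidence)
const visionOnly = learningEngine.isVisionOnlyApp("com.example.CanvasApp");
```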
@@ -768,13 +768,18 @@ export class PerceptionCoordinator extends EventEmitter {
  // Safe CLI mode is already enabled via setSafeCLI() in start().
  // This allows vision/OCR for canvas-heavy apps like Canva in Chrome.
  // Skip vision if learning engine shows it consistently fails for this app,
- // but retry every 20th cycle to re-evaluate (apps may gain windows later)
+ // but retry every 20th cycle to re-evaluate (apps may gain windows later).
+ // Exception: vision-only apps (AX blind) — vision/OCR is their ONLY perception
+ // source, so never skip it. Window buffer capture works even when window is hidden.
  if (this.learningEngine && this.activeAppContext) {
- const ranked = this.learningEngine.rankSensors(this.activeAppContext.bundleId);
- const visionRank = ranked.find(r => r.sourceType === "vision");
- if (visionRank && visionRank.score < 0.1 && ranked.length >= 2 && this.stats.slowCycles % 20 !== 0) {
- this.stats.slowCycles++;
- return; // Vision consistently fails for this app — skip (retry every 20th cycle)
+ const isVisionOnly = this.learningEngine.isVisionOnlyApp(this.activeAppContext.bundleId);
+ if (!isVisionOnly) {
+ const ranked = this.learningEngine.rankSensors(this.activeAppContext.bundleId);
+ const visionRank = ranked.find(r => r.sourceType === "vision");
+ if (visionRank && visionRank.score < 0.1 && ranked.length >= 2 && this.stats.slowCycles % 20 !== 0) {
+ this.stats.slowCycles++;
+ return; // Vision consistently fails for this app — skip (retry every 20th cycle)
+ }
  }
  }
  const timestamp = new Date().toISOString();
@@ -860,14 +865,24 @@ export class PerceptionCoordinator extends EventEmitter {
  },
  });
  }
- // Record vision sensor outcome
+ // Record vision sensor outcome — also record as window_buffer for vision-only apps
+ // so the fallback chain knows this source works for element location
  if (this.learningEngine && this.activeAppContext) {
+ const latencyMs = Date.now() - new Date(timestamp).getTime();
  this.learningEngine.recordSensorOutcome({
  bundleId: this.activeAppContext.bundleId,
  sourceType: "vision",
  success: !!diffEvent,
- latencyMs: Date.now() - new Date(timestamp).getTime(),
+ latencyMs,
  });
+ if (this.learningEngine.isVisionOnlyApp(this.activeAppContext.bundleId) && ocrEvent) {
+ this.learningEngine.recordSensorOutcome({
+ bundleId: this.activeAppContext.bundleId,
+ sourceType: "window_buffer",
+ success: true,
+ latencyMs,
+ });
+ }
  }
  }
  catch {
@@ -21,6 +21,7 @@ export const DEFAULT_PERCEPTION_CONFIG = {
21
21
  enableAX: true,
22
22
  enableCDP: true,
23
23
  enableVision: true,
24
+ enableWindowBuffer: true,
24
25
  maxROIsPerCycle: 3,
25
26
  skipCaptureLock: false,
26
27
  };
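Callers can presumably override the new flag by spreading over the defaults. A one-line sketch; the actual merge site is not shown in this diff:

```js
// Assumption: the coordinator accepts a plain-merged config object.
const config = { ...DEFAULT_PERCEPTION_CONFIG, enableWindowBuffer: false };
```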
@@ -23,7 +23,7 @@
  */
  // ── 1. Fallback Chain ──────────────────────────────────────────────────
  /** Ordered list of execution methods, from fastest/most reliable to slowest/least reliable */
- const EXECUTION_METHODS = ["ax", "cdp", "ocr", "coordinates"];
+ const EXECUTION_METHODS = ["ax", "cdp", "ocr", "window_buffer", "coordinates"];
  const METHOD_CAPABILITIES = {
  ax: {
  method: "ax",
@@ -61,6 +61,18 @@ const METHOD_CAPABILITIES = {
  requiresBridge: true,
  requiresCDP: false,
  },
+ window_buffer: {
+ method: "window_buffer",
+ canClick: true,
+ canType: false,
+ canRead: true,
+ canLocate: true,
+ canSelect: false,
+ canScroll: false,
+ avgLatencyMs: 350,
+ requiresBridge: true,
+ requiresCDP: false,
+ },
  coordinates: {
  method: "coordinates",
  canClick: true,
@@ -90,6 +102,7 @@ const SENSOR_TO_METHOD = {
  chrome: "cdp",
  ocr: "ocr",
  vision: "ocr",
+ window_buffer: "window_buffer",
  coordinates: "coordinates",
  };
  /**
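Together these three edits make window_buffer a first-class method in the chain. A sketch of how the ordered list and the capability table compose; the pickMethods helper is illustrative, not part of the package:

```js
// Filter the ordered chain through the capability table for one operation.
function pickMethods(op) {
  const flag = "can" + op[0].toUpperCase() + op.slice(1); // "read" → "canRead"
  return EXECUTION_METHODS.filter((m) => METHOD_CAPABILITIES[m][flag]);
}
// Per the window_buffer entry above, it is offered for click/read/locate
// but never for type/select/scroll:
pickMethods("read").includes("window_buffer");   // true  (canRead: true)
pickMethods("scroll").includes("window_buffer"); // false (canScroll: false)
```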
package/package.json CHANGED
@@ -1,6 +1,6 @@
  {
  "name": "screenhand",
- "version": "0.3.7",
+ "version": "0.3.9",
  "mcpName": "io.github.manushi4/screenhand",
  "description": "Give AI eyes and hands on your desktop. ScreenHand is an open-source MCP server that lets Claude and other AI agents see your screen, click buttons, type text, and control any app on macOS and Windows.",
  "homepage": "https://screenhand.com",